Auditor/theauditor/docker_analyzer.py

"""Docker container security analyzer module."""

import json
import logging
import re
import sqlite3
from pathlib import Path
from typing import Any, Dict, List

# Set up logger
logger = logging.getLogger(__name__)


def analyze_docker_images(db_path: str, check_vulnerabilities: bool = True) -> List[Dict[str, Any]]:
    """
    Analyze indexed Docker images for security misconfigurations.

    Runs the root-user and exposed-secret checks against the database and,
    when requested, hands the distinct base images to the vulnerability
    scanner and converts its results into Docker-specific findings.

    Args:
        db_path: Path to the repo_index.db database
        check_vulnerabilities: Whether to scan base images for vulnerabilities

    Returns:
        List of security findings with severity levels
    """
    findings: List[Dict[str, Any]] = []
    base_images: List[Dict[str, Any]] = []

    # sqlite3's connection context manager only commits/rolls back the
    # transaction; it does NOT close the connection. Close it explicitly so
    # repeated calls do not leak file handles.
    conn = sqlite3.connect(db_path)
    try:
        # The check helpers read columns by name
        conn.row_factory = sqlite3.Row

        # Run each security check
        findings.extend(_find_root_containers(conn))
        findings.extend(_find_exposed_secrets(conn))

        # Collect base images while the connection is still open
        if check_vulnerabilities:
            base_images = _prepare_base_image_scan(conn)
    finally:
        conn.close()

    # Base image vulnerability check (needs no database access, so it runs
    # after the connection has been released)
    if base_images:
        # Import here to avoid circular dependency
        from .vulnerability_scanner import scan_dependencies

        # Run vulnerability scan on Docker base images
        vuln_findings = scan_dependencies(base_images, offline=False)

        # Convert vulnerability findings to Docker-specific format
        for vuln in vuln_findings:
            findings.append({
                'type': 'docker_base_image_vulnerability',
                'severity': vuln.get('severity', 'medium'),
                'file': 'Dockerfile',
                'message': f"Base image {vuln.get('package', 'unknown')} has vulnerability: {vuln.get('title', 'Unknown vulnerability')}",
                'recommendation': vuln.get('recommendation', 'Update to latest secure version'),
                'details': vuln
            })

    return findings


def _find_root_containers(conn: sqlite3.Connection) -> List[Dict[str, Any]]:
    """
    Detect containers running as root user (default or explicit).

    CIS Docker Benchmark: Running containers as root is a major security risk.
    A container breakout would grant attacker root privileges on the host.

    A container is reported when no USER value was recorded, or when the
    recorded value names root either by name or by numeric UID 0, with or
    without a ":<group>" suffix (e.g. "root", "0", "root:root", "0:0").

    Args:
        conn: SQLite database connection whose rows support access by column
            name (e.g. row_factory = sqlite3.Row)

    Returns:
        List of findings for containers running as root
    """
    findings: List[Dict[str, Any]] = []
    cursor = conn.cursor()

    # Query all Docker images
    cursor.execute("SELECT file_path, env_vars FROM docker_images")

    for row in cursor:
        file_path = row['file_path']
        env_vars_json = row['env_vars']

        # Parse the JSON column
        try:
            env_vars = json.loads(env_vars_json) if env_vars_json else {}
        except json.JSONDecodeError as e:
            logger.debug("Non-critical error parsing Docker env vars JSON: %s", e)
            continue

        # Valid JSON that is not an object (list, string, ...) has no keys to inspect
        if not isinstance(env_vars, dict):
            logger.debug("Skipping %s: env_vars JSON is not an object", file_path)
            continue

        # Check for _DOCKER_USER key (set by USER instruction)
        docker_user = env_vars.get('_DOCKER_USER')

        if docker_user is None:
            user_state = 'not set'
        else:
            # USER accepts "<user>[:<group>]" with names or numeric IDs; only
            # the user part decides whether the process runs as root.
            # str() guards against a numeric UID stored as a JSON number.
            user_part = str(docker_user).split(':', 1)[0].strip().lower()
            if user_part not in ('root', '0'):
                continue
            user_state = 'set to root'

        findings.append({
            'type': 'docker_root_user',
            'severity': 'High',
            'file': file_path,
            'message': f"Container runs as root user (USER instruction {user_state})",
            'recommendation': "Add 'USER <non-root-user>' instruction to Dockerfile after installing dependencies"
        })

    return findings


def _find_exposed_secrets(conn: sqlite3.Connection) -> List[Dict[str, Any]]:
    """
    Detect hardcoded secrets in ENV and ARG instructions.

    ENV and ARG values are stored in image layers and can be inspected
    by anyone with access to the image, making them unsuitable for secrets.

    Each ENV/ARG key yields at most one finding, chosen in this order:
    a sensitive-looking key name, a value matching a known token format,
    and (ENV only) a high-entropy value.

    Args:
        conn: SQLite database connection whose rows support access by column
            name (e.g. row_factory = sqlite3.Row)

    Returns:
        List of findings for exposed secrets
    """
    findings: List[Dict[str, Any]] = []
    cursor = conn.cursor()

    # Patterns for detecting sensitive keys (searched anywhere in the key name)
    sensitive_key_patterns = [
        re.compile(pattern, re.IGNORECASE)
        for pattern in (
            r'password',
            r'secret',
            r'api[_-]?key',
            r'token',
            r'auth',
            r'credential',
            r'private[_-]?key',
            r'access[_-]?key',
        )
    ]

    # Common secret value patterns (must match the whole value)
    secret_value_patterns = [
        re.compile(pattern)
        for pattern in (
            r'^ghp_[A-Za-z0-9]{36}$',  # GitHub personal access token
            r'^ghs_[A-Za-z0-9]{36}$',  # GitHub secret
            r'^sk-[A-Za-z0-9]{48}$',   # OpenAI API key
            r'^xox[baprs]-[A-Za-z0-9-]+$',  # Slack token
            r'^AKIA[A-Z0-9]{16}$',     # AWS access key ID
        )
    ]

    def key_looks_sensitive(key: str) -> bool:
        return any(pattern.search(key) for pattern in sensitive_key_patterns)

    def value_is_known_secret(text: str) -> bool:
        return any(pattern.match(text) for pattern in secret_value_patterns)

    # Query all Docker images
    cursor.execute("SELECT file_path, env_vars, build_args FROM docker_images")

    for row in cursor:
        file_path = row['file_path']
        env_vars_json = row['env_vars']
        build_args_json = row['build_args']

        # Parse JSON columns
        try:
            env_vars = json.loads(env_vars_json) if env_vars_json else {}
            build_args = json.loads(build_args_json) if build_args_json else {}
        except json.JSONDecodeError as e:
            logger.debug("Non-critical error parsing Docker JSON columns: %s", e)
            continue

        # Valid JSON that is not an object has no key/value pairs to inspect;
        # treat each column independently so one bad column does not hide the other.
        if not isinstance(env_vars, dict):
            logger.debug("Ignoring non-object env_vars JSON for %s", file_path)
            env_vars = {}
        if not isinstance(build_args, dict):
            logger.debug("Ignoring non-object build_args JSON for %s", file_path)
            build_args = {}

        # Check ENV variables
        for key, value in env_vars.items():
            # Skip internal tracking keys
            if key.startswith('_DOCKER_'):
                continue

            # Check if key name indicates sensitive data
            if key_looks_sensitive(key):
                findings.append({
                    'type': 'docker_exposed_secret',
                    'severity': 'Critical',
                    'file': file_path,
                    'message': f"Potential secret exposed in ENV instruction: {key}",
                    'recommendation': "Use Docker secrets or mount secrets at runtime instead of ENV"
                })
                continue

            if not value:
                continue
            text = str(value)

            if value_is_known_secret(text):
                # Check if value matches known secret patterns
                findings.append({
                    'type': 'docker_exposed_secret',
                    'severity': 'Critical',
                    'file': file_path,
                    'message': f"Detected secret pattern in ENV value for key: {key}",
                    'recommendation': "Remove hardcoded secrets and use runtime secret injection"
                })
            elif _is_high_entropy(text):
                # Only fall back to the entropy heuristic when no known token
                # format matched, so a value is never reported twice.
                findings.append({
                    'type': 'docker_possible_secret',
                    'severity': 'Medium',
                    'file': file_path,
                    'message': f"High entropy value in ENV {key} - possible secret",
                    'recommendation': "Review if this is a secret and move to secure storage if so"
                })

        # Check BUILD ARGs
        for key, value in build_args.items():
            if key_looks_sensitive(key):
                # Check if key name indicates sensitive data
                findings.append({
                    'type': 'docker_exposed_secret',
                    'severity': 'High',  # Slightly lower than ENV as ARGs are build-time only
                    'file': file_path,
                    'message': f"Potential secret exposed in ARG instruction: {key}",
                    'recommendation': "Use --secret mount or BuildKit secrets instead of ARG for sensitive data"
                })
            elif value and value_is_known_secret(str(value)):
                # A default value in a known token format is a hardcoded
                # secret regardless of how the ARG is named.
                findings.append({
                    'type': 'docker_exposed_secret',
                    'severity': 'High',
                    'file': file_path,
                    'message': f"Detected secret pattern in ARG value for key: {key}",
                    'recommendation': "Use --secret mount or BuildKit secrets instead of ARG for sensitive data"
                })

    return findings


def _split_image_tag(reference: str) -> tuple:
    """Split "name[:tag]" into (name, tag); tag is '' when none is present."""
    head, sep, tail = reference.rpartition(':')
    # A ':' followed by a path segment belongs to a registry port
    # (e.g. "localhost:5000/app"), not to a tag.
    if sep and '/' not in tail:
        return head, tail
    return reference, ''


def _prepare_base_image_scan(conn: sqlite3.Connection) -> List[Dict[str, Any]]:
    """
    Prepare base image data for vulnerability scanning.

    This function extracts and parses base image information from the database,
    preparing it in the format expected by vulnerability_scanner.scan_dependencies().

    Supported reference forms:
        - python:3.11-slim                -> name "python", version "3.11-slim"
        - ubuntu                          -> name "ubuntu", version "latest"
        - gcr.io/project/image:tag        -> name "gcr.io/project/image", version "tag"
        - localhost:5000/app              -> name "localhost:5000/app", version "latest"
        - image@sha256:hash               -> name "image", version "sha256:hash"
        - image:tag@sha256:hash           -> name "image", version "sha256:hash"

    Args:
        conn: SQLite database connection

    Returns:
        List of dependency dicts with manager='docker', name, and version
    """
    dependencies: List[Dict[str, Any]] = []
    cursor = conn.cursor()

    # Get all unique base images
    cursor.execute("SELECT DISTINCT base_image FROM docker_images WHERE base_image IS NOT NULL")

    for row in cursor:
        base_image = row[0].strip()

        # An empty reference cannot be scanned
        if not base_image:
            continue

        tagged_name, has_digest, digest = base_image.partition('@')
        if has_digest:
            # Digest format: the digest pins the image, so it is the version;
            # any tag that precedes it is dropped from the name.
            name = _split_image_tag(tagged_name)[0]
            version = digest
        else:
            # Tag format, or no tag at all (defaults to 'latest')
            name, tag = _split_image_tag(base_image)
            version = tag or 'latest'

        # Create dependency dict in vulnerability scanner format
        dependencies.append({
            'manager': 'docker',
            'name': name,
            'version': version,
            'source_file': 'Dockerfile'  # Could be enhanced to track actual file
        })

    return dependencies


def _is_high_entropy(value: str, threshold: float = 4.0) -> bool:
    """
    Decide whether a string looks random enough to be a secret.

    Computes the Shannon entropy (bits per character) of the string's
    character distribution and compares it against the threshold.
    Strings shorter than 10 characters, or containing a space, are
    never flagged.

    Args:
        value: String to check
        threshold: Entropy threshold (default 4.0)

    Returns:
        True if entropy exceeds threshold
    """
    import math

    # Too short, or contains a space: not treated as a secret
    length = len(value)
    if length < 10 or ' ' in value:
        return False

    # Occurrence count per distinct character, in first-seen order
    tally = dict.fromkeys(value, 0)
    for ch in value:
        tally[ch] += 1

    # Shannon entropy: -sum(p * log2(p)) over the character shares
    bits = 0.0
    for count in tally.values():
        share = count / length
        if share > 0:
            bits -= share * math.log2(share)

    return bits > threshold