Auditor/theauditor/linters/parsers.py
TheAuditorTool 4f6729d64c Fix: Handle null line/column values in ESLint and Bandit parsers

- Fix TypeError when linters return null instead of missing fields
- Use 'or' operator to safely convert null to 0
- Affects only JSON-based parsers (ESLint, Bandit)
- Line 0 indicates file-level or configuration issues

Fixes GitHub issue: "TypeError: '<' not supported between instances of 'str' and 'NoneType'"
2025-09-09 17:46:15 +07:00


"""Linter output parsers - converts various linter outputs to normalized format.
COURIER PHILOSOPHY:
- We translate tool output keys to standard keys
- We preserve exact messages and severities
- We perform direct data access without interpretation
- We validate translation, not content
"""
import json
import re
from pathlib import Path
from typing import Any
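
# All parsers below emit findings in one normalized shape. The values here are
# illustrative only, not taken from any real run:
#
#     {
#         "tool": "eslint",          # name of the emitting tool
#         "file": "src/app.ts",      # workset-relative path, forward slashes
#         "line": 10,                # 0 = file-level or configuration issue
#         "column": 4,               # 0 when the tool reports no column
#         "rule": "no-unused-vars",  # tool-specific rule/code ("" if absent)
#         "message": "...",          # preserved verbatim from the tool
#         "severity": "warning",     # usually "error" or "warning" ("note" passes through from mypy)
#         "category": "lint",        # "lint" | "type" | "style" | "security"
#     }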


def parse_eslint_output(output: str, workset_files: set[str]) -> tuple[list[dict[str, Any]], dict[str, Any]]:
    """Parse ESLint JSON output.

    Returns:
        Tuple of (findings, ast_data) where ast_data maps file paths to AST objects.
    """
    findings = []
    ast_data = {}
    try:
        results = json.loads(output)
        for file_result in results:
            file = Path(file_result["filePath"])
            # Normalize path to forward slashes for cross-platform compatibility
            file_str = str(file).replace("\\", "/")
            # Try to match against the workset in various forms
            matched = False
            for workset_file in workset_files:
                # Check if the absolute path ends with the relative workset path.
                # This handles both Windows absolute paths and Unix paths.
                if file_str.endswith(workset_file):
                    matched = True
                    file_str = workset_file
                    break
                # Also check if the workset file is contained in the path (with proper separators)
                elif "/" + workset_file in file_str or file_str.startswith(workset_file):
                    matched = True
                    file_str = workset_file
                    break
            if not matched:
                continue
            # Extract AST if present
            if "ast" in file_result:
                ast_data[file_str] = file_result["ast"]
            for message in file_result.get("messages", []):
                # ESLint severity is numeric (2=error, 1=warning) - translate to standard strings
                eslint_severity = message.get("severity", 1)
                if eslint_severity == 2:
                    standard_severity = "error"
                elif eslint_severity == 1:
                    standard_severity = "warning"
                else:
                    standard_severity = "warning"  # Default for unknown values
                # Create the translated finding using standard keys
                translated = {
                    "tool": "eslint",
                    "file": file_str,
                    "line": int(message.get("line") or 0),  # 'or' safely converts null to 0
                    "column": int(message.get("column") or 0),
                    "rule": message.get("ruleId", ""),  # Empty, not "unknown"
                    "message": message.get("message", ""),
                    "severity": standard_severity,  # Use standardized severity
                    "category": "lint",
                }
                findings.append(translated)
    except json.JSONDecodeError:
        # Fall back to regex parsing of the plain-text format
        pattern = r"([^:]+):(\d+):(\d+):\s+(error|warning)\s+(.+?)\s+([a-z0-9\-\/]+)\s*$"
        for line in output.strip().split("\n"):
            match = re.match(pattern, line.strip())
            if match:
                file = match.group(1).strip()
                if file in workset_files:
                    findings.append(
                        {
                            "tool": "eslint",
                            "file": file,
                            "line": int(match.group(2)),
                            "column": int(match.group(3)),
                            "rule": match.group(6),
                            "message": match.group(5),
                            "severity": match.group(4),  # Keep original for /raw/
                            "category": "lint",
                        }
                    )
    return findings, ast_data
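
# Illustrative example (hypothetical paths and message): given the ESLint JSON
#     [{"filePath": "/repo/src/app.js",
#       "messages": [{"ruleId": "no-undef", "severity": 2,
#                     "message": "'foo' is not defined", "line": 3, "column": 7}]}]
# with "src/app.js" in the workset, the parser emits one finding with severity
# "error"; the `or 0` guards mean a null line/column becomes 0 instead of
# raising the TypeError this commit fixes.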


def parse_ruff_output(output: str, workset_files: set[str]) -> list[dict[str, Any]]:
    """Parse ruff output - translate to standard keys."""
    findings = []
    # Format: path:line:col: code message
    # Codes may have multi-letter prefixes (e.g. UP035, SIM102), so match [A-Z]+.
    pattern = r"([^:]+):(\d+):(\d+):\s+([A-Z]+\d+)\s+(.+)$"
    for line in output.strip().split("\n"):
        match = re.match(pattern, line.strip())
        if match:
            file = match.group(1).strip()
            # Normalize Windows backslashes to forward slashes for comparison
            normalized_file = file.replace("\\", "/")
            # Check if file is in workset (comparing normalized paths)
            if normalized_file in workset_files or file in workset_files:
                # COURIER: translate to standard keys
                translated = {
                    "tool": "ruff",
                    "file": normalized_file,  # Use normalized path
                    "line": int(match.group(2)),
                    "column": int(match.group(3)),
                    "rule": match.group(4),  # Preserve the original code in the rule field
                    "message": match.group(5),  # Preserve exactly
                    "severity": "warning",  # Ruff's concise format does not report severity
                    "category": "lint",
                }
                findings.append(translated)
    return findings
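
# Illustrative example (hypothetical): the ruff line
#     "theauditor/cli.py:12:5: F401 'os' imported but unused"
# yields rule "F401", line 12, column 5, severity "warning".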


def parse_mypy_output(output: str, workset_files: set[str]) -> list[dict[str, Any]]:
    """Parse mypy output."""
    findings = []
    # Format: path:line: severity: message [error-code]
    pattern = r"([^:]+):(\d+):\s+(error|warning|note):\s+(.+?)(?:\s+\[([^\]]+)\])?$"
    for line in output.strip().split("\n"):
        match = re.match(pattern, line.strip())
        if match:
            file = match.group(1).strip()
            # Normalize Windows backslashes to forward slashes for comparison
            normalized_file = file.replace("\\", "/")
            # Check if file is in workset (comparing normalized paths)
            if normalized_file in workset_files or file in workset_files:
                translated = {
                    "tool": "mypy",
                    "file": normalized_file,  # Use normalized path
                    "line": int(match.group(2)),
                    "column": 0,  # Mypy omits columns in this format
                    "rule": match.group(5) or "type-error",  # Fall back when no [code] suffix
                    "message": match.group(4),
                    "severity": match.group(3),  # Keep original mypy severity for /raw/
                    "category": "type",
                }
                findings.append(translated)
    return findings
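
# Illustrative example (hypothetical): the mypy line
#     "app/models.py:42: error: Incompatible return value type  [return-value]"
# yields rule "return-value" and severity "error"; a line without a trailing
# [code] falls back to rule "type-error".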


def parse_tsc_output(output: str, workset_files: set[str]) -> list[dict[str, Any]]:
    """Parse TypeScript compiler output."""
    findings = []
    # Format: path(line,col): error TS1234: message
    pattern = r"([^(]+)\((\d+),(\d+)\):\s+(error|warning)\s+(TS\d+):\s+(.+)$"
    for line in output.strip().split("\n"):
        match = re.match(pattern, line.strip())
        if match:
            file = match.group(1).strip()
            if file in workset_files:
                translated = {
                    "tool": "tsc",
                    "file": file,
                    "line": int(match.group(2)),
                    "column": int(match.group(3)),
                    "rule": match.group(5),
                    "message": match.group(6),
                    "severity": match.group(4),  # Keep original tsc severity for /raw/
                    "category": "type",
                }
                findings.append(translated)
    return findings
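
# Illustrative example (hypothetical): the tsc line
#     "src/index.ts(7,13): error TS2322: Type 'string' is not assignable to type 'number'."
# yields rule "TS2322", line 7, column 13, severity "error", category "type".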


def parse_prettier_output(
    stdout: str, stderr: str, workset_files: set[str]
) -> list[dict[str, Any]]:
    """Parse Prettier output."""
    findings = []
    # When run with --check, Prettier lists unformatted files on stderr with a [warn] prefix.
    # Example: "[warn] backend/src/app.ts" or, with ANSI codes,
    # "\x1b[33m[warn]\x1b[39m backend/src/app.ts".
    # Pattern to extract the file path after the [warn] prefix
    pattern = r"\[warn\]\s+(.+?)$"
    for line in stderr.strip().split("\n"):
        if line and not line.startswith("Checking"):
            # Remove ANSI color codes first
            clean_line = re.sub(r"\x1b\[[0-9;]*m", "", line)
            clean_line = re.sub(r"\[\d+m", "", clean_line)  # Also handle bare [33m sequences
            # Extract the file path after [warn]
            match = re.search(pattern, clean_line)
            if match:
                file = match.group(1).strip()
            else:
                # Fallback: if there is no [warn] prefix, use the whole line
                file = clean_line.strip()
            # Normalize path for comparison
            normalized_file = file.replace("\\", "/")
            # Check if file is in workset
            if normalized_file in workset_files or file in workset_files:
                translated = {
                    "tool": "prettier",
                    "file": normalized_file,  # Use normalized path
                    "line": 0,  # File-level issue
                    "column": 0,
                    "rule": "format",
                    "message": "File needs formatting",
                    "severity": "warning",  # Prettier reports no severity; default to warning
                    "category": "style",
                }
                findings.append(translated)
    return findings
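
# Illustrative example (hypothetical): the `prettier --check` stderr line
#     "\x1b[33m[warn]\x1b[39m backend/src/app.ts"
# is stripped of ANSI codes to "[warn] backend/src/app.ts" and reported as a
# line-0 (file-level) style finding for backend/src/app.ts.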


def parse_black_output(stdout: str, stderr: str, workset_files: set[str]) -> list[dict[str, Any]]:
    """Parse Black output."""
    findings = []
    # Black lists files that would be reformatted on stderr
    pattern = r"would reformat (.+)$"
    for match in re.finditer(pattern, stderr, re.MULTILINE):
        file = match.group(1)
        # Normalize Windows backslashes to forward slashes for comparison
        normalized_file = file.replace("\\", "/")
        # Check if file is in workset (comparing normalized paths)
        if normalized_file in workset_files or file in workset_files:
            translated = {
                "tool": "black",
                "file": normalized_file,  # Use normalized path
                "line": 0,  # File-level issue
                "column": 0,
                "rule": "format",
                "message": "File needs formatting",
                "severity": "warning",  # Black reports no severity; default to warning
                "category": "style",
            }
            findings.append(translated)
    # Also check for --diff output in stdout.
    # When --diff is used, Black writes unified diffs to stdout.
    if stdout and stdout.startswith("---"):
        # Extract filenames from diff headers
        diff_pattern = r"^---\s+(.+?)\s+\d{4}-\d{2}-\d{2}"
        for match in re.finditer(diff_pattern, stdout, re.MULTILINE):
            file = match.group(1)
            # Normalize Windows backslashes to forward slashes for comparison
            normalized_file = file.replace("\\", "/")
            # Check if file is in workset (comparing normalized paths)
            if normalized_file in workset_files or file in workset_files:
                # Skip files already reported from stderr
                if not any(f["file"] == normalized_file for f in findings):
                    translated = {
                        "tool": "black",
                        "file": normalized_file,  # Use normalized path
                        "line": 0,
                        "column": 0,
                        "rule": "format",
                        "message": "File needs formatting",
                        "severity": "warning",
                        "category": "style",
                    }
                    findings.append(translated)
    return findings
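
# Illustrative example (hypothetical): the stderr line
#     "would reformat scrapers/fetch.py"
# and the --diff header
#     "--- scrapers/fetch.py  2025-09-09 10:00:00.000000 +0000"
# both point at the same file; the stdout pass only adds a finding when
# stderr has not already reported it.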


def parse_golangci_output(output: str, workset_files: set[str]) -> list[dict[str, Any]]:
    """Parse golangci-lint output."""
    findings = []
    # Format: path:line:col: message (linter)
    pattern = r"([^:]+):(\d+):(\d+):\s+(.+?)\s+\(([^)]+)\)$"
    for match in re.finditer(pattern, output, re.MULTILINE):
        file = match.group(1)
        if file in workset_files:
            translated = {
                "tool": "golangci-lint",
                "file": file,
                "line": int(match.group(2)),
                "column": int(match.group(3)),
                "rule": match.group(5),  # The linter name in parentheses
                "message": match.group(4),
                "severity": "warning",  # Text output carries no severity; default to warning
                "category": "lint",
            }
            findings.append(translated)
    return findings
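
# Illustrative example (hypothetical): the golangci-lint line
#     "pkg/server.go:88:2: ineffectual assignment to err (ineffassign)"
# yields rule "ineffassign" (the linter name in parentheses) and severity "warning".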


def parse_go_vet_output(output: str, workset_files: set[str]) -> list[dict[str, Any]]:
    """Parse go vet output."""
    findings = []
    # Format: path:line:col: message
    pattern = r"([^:]+):(\d+):(\d+):\s+(.+)$"
    for match in re.finditer(pattern, output, re.MULTILINE):
        file = match.group(1)
        if file in workset_files:
            translated = {
                "tool": "go-vet",
                "file": file,
                "line": int(match.group(2)),
                "column": int(match.group(3)),
                "rule": "vet",
                "message": match.group(4),
                "severity": "warning",  # go vet reports no severity; default to warning
                "category": "lint",
            }
            findings.append(translated)
    return findings
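
# Illustrative example (hypothetical): the go vet line
#     "cmd/main.go:15:2: unreachable code"
# yields rule "vet", line 15, column 2, severity "warning".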


def parse_maven_output(tool: str, output: str, workset_files: set[str]) -> list[dict[str, Any]]:
    """Parse Maven-based tool output (SpotBugs/Checkstyle)."""
    findings = []
    # Simple pattern matching for Maven output: [ERROR|WARNING] path:line: message
    pattern = r"\[(?:ERROR|WARNING)\]\s+([^:]+):(\d+):\s+(.+)$"
    for match in re.finditer(pattern, output, re.MULTILINE):
        file = match.group(1)
        if file in workset_files:
            translated = {
                "tool": tool,
                "file": file,
                "line": int(match.group(2)),
                "column": 0,  # Maven output has no column information
                "rule": tool,
                "message": match.group(3),
                "severity": "warning",  # The [ERROR]/[WARNING] level is not captured; default to warning
                "category": "lint",
            }
            findings.append(translated)
    return findings
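
# Illustrative example (hypothetical): the Checkstyle-via-Maven line
#     "[WARNING] src/main/java/App.java:30: Missing a Javadoc comment."
# yields tool "checkstyle" (whatever the caller passes as `tool`), line 30,
# and the hard-coded severity "warning" regardless of the [ERROR]/[WARNING] level.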


def parse_bandit_output(output: str, workset_files: set[str]) -> list[dict[str, Any]]:
    """Parse Bandit JSON output for Python security issues."""
    findings = []
    try:
        results = json.loads(output)
        # Bandit's JSON structure keeps findings under the "results" key
        for result in results.get("results", []):
            file = result.get("filename", "")
            # Normalize Windows backslashes to forward slashes for comparison
            normalized_file = file.replace("\\", "/")
            # Check if the absolute path from Bandit matches any relative workset path.
            # Bandit returns absolute paths like C:/Users/.../file.py while the
            # workset holds relative paths like scrapers/file.py.
            matched = False
            matched_file = normalized_file  # Default to the normalized absolute path
            for workset_file in workset_files:
                # Normalize the workset file for comparison
                normalized_workset = workset_file.replace("\\", "/")
                # Check if the absolute path ends with the relative path
                if normalized_file.endswith(normalized_workset):
                    matched = True
                    matched_file = normalized_workset  # Use the workset's relative path
                    break
                # Also check with a leading slash
                elif normalized_file.endswith("/" + normalized_workset):
                    matched = True
                    matched_file = normalized_workset
                    break
            if matched:
                # Map Bandit severity to the standard strings
                severity_map = {
                    "HIGH": "error",
                    "MEDIUM": "warning",
                    "LOW": "warning",
                }
                translated = {
                    "tool": "bandit",
                    "file": matched_file,  # Use the matched relative path from the workset
                    "line": int(result.get("line_number") or 0),  # 'or' safely converts null to 0
                    "column": int(result.get("col_offset") or 0),
                    "rule": result.get("test_id", ""),
                    "message": result.get("issue_text", ""),
                    "severity": severity_map.get(result.get("issue_severity", "MEDIUM"), "warning"),
                    "category": "security",
                }
                findings.append(translated)
    except json.JSONDecodeError:
        # Malformed JSON: no text fallback, return whatever was collected
        pass
    return findings
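
# Illustrative example (hypothetical): given the Bandit JSON
#     {"results": [{"filename": "C:/Users/dev/repo/scrapers/fetch.py",
#                   "line_number": 12, "col_offset": 0, "test_id": "B602",
#                   "issue_severity": "HIGH",
#                   "issue_text": "subprocess call with shell=True identified"}]}
# with "scrapers/fetch.py" in the workset, the absolute path is matched by
# suffix and the finding is reported with severity "error" (HIGH maps to error).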