Auditor/theauditor/linters/runner.py

"""Linter runner module - executes linter subprocesses."""

import json
import os
import platform
import subprocess
from pathlib import Path
from typing import Any

# Import our custom temp manager to avoid WSL2/Windows issues
from theauditor.utils.temp_manager import TempManager

# Detect if running on Windows for subprocess shell handling
IS_WINDOWS = platform.system() == "Windows"

# Note: Path quoting is NOT needed when using shell=False (which we now use everywhere).
# subprocess.run() with shell=False passes arguments directly to the OS without
# shell interpretation, so paths with spaces work correctly without quotes.


def run_linter(
    tool: str,
    command: list[str],
    root_path: str,
    workset_files: set[str],
    timeout: int,
) -> tuple[list[dict[str, Any]], dict[str, Any]]:
    """
    Run a linter and parse its output.

    Returns:
        Tuple of (findings, ast_data) where ast_data maps file paths to AST objects
    """
    findings = []
    ast_data = {}

    try:
        # Add workset files to command if tool supports it
        if tool in ["eslint", "ruff", "mypy", "prettier", "black", "bandit"]:
            # Filter files by appropriate extension for each tool
            if tool in ["ruff", "mypy", "black", "bandit"]:
                # Python linters - only process .py files
                file_args = [f for f in workset_files if f.endswith('.py')]
                # Skip Python linters entirely if no Python files in workset
                if not file_args:
                    # Silent skip - no Python files to lint
                    return [], {}
            elif tool == "eslint":
                # JavaScript/TypeScript linter
                file_args = []
                has_standard_structure = False

                for f in workset_files:
                    if f.endswith(('.js', '.jsx', '.ts', '.tsx', '.mjs')):
                        normalized = f.replace('\\', '/')

                        # Professional structures: /src/ anywhere (monorepo + traditional)
                        if '/src/' in normalized:
                            has_standard_structure = True
                            # Exclude obvious non-source files with more robust path-based exclusions
                            if not any(excluded_path in normalized for excluded_path in [
                                '/config/', '/scripts/', '/migrations/', '/seeders/',
                                '.config.', '.test.', '.spec.',
                                '/node_modules/', '/dist/', '/build/', '/.next/', '/.nuxt/'
                            ]):
                                file_args.append(f)

                # Fallback for non-standard projects
                if not file_args and not has_standard_structure:
                    print("\n" + "="*60)
                    print("WARNING: NON-STANDARD PROJECT STRUCTURE DETECTED")
                    print("="*60)
                    print("This project does not follow conventional src/ directory structure.")
                    print("TheAuditor will attempt to lint ALL JavaScript files.")
                    print("This is HIGH RISK and may produce incorrect results.")
                    print("Consider restructuring your project to use:")
                    print("  - frontend/src/ and backend/src/ (traditional)")
                    print("  - packages/*/src/ or apps/*/src/ (monorepo)")
                    print("="*60 + "\n")

                    # Just grab everything and pray
                    for f in workset_files:
                        if f.endswith(('.js', '.jsx', '.ts', '.tsx', '.mjs')):
                            normalized = f.replace('\\', '/')
                            # At least skip the absolute garbage
                            if not any(x in normalized.lower() for x in [
                                '/node_modules/', '/dist/', '/build/', '/.git/'
                            ]):
                                file_args.append(f)
            elif tool == "prettier":
                # Prettier can handle many file types - focus on source code only
                file_args = [f for f in workset_files if f.endswith(('.js', '.jsx', '.ts', '.tsx', '.json', '.css', '.scss', '.html'))]
            else:
                # Default: use all files
                file_args = list(workset_files)

            if not file_args:
                return [], {}

            # Check if we need to chunk for Windows command line limit
            CHUNK_SIZE = 50  # Safe for 8KB Windows limit
            # Enable chunking for all tools that accept file lists
            all_chunking_tools = ["eslint", "prettier", "ruff", "mypy", "black", "bandit"]
            needs_chunking = tool in all_chunking_tools and len(file_args) > CHUNK_SIZE

            if needs_chunking:
                # We'll process in chunks - set up aggregation
                all_findings = []
                all_ast_data = {}
                total_chunks = (len(file_args) + CHUNK_SIZE - 1) // CHUNK_SIZE
                print(f"  Processing {len(file_args)} files in {total_chunks} chunks...")

                # Process each chunk
                for chunk_num, i in enumerate(range(0, len(file_args), CHUNK_SIZE), 1):
                    chunk_files = file_args[i:i + CHUNK_SIZE]
                    print(f"    Chunk {chunk_num}/{total_chunks}: {len(chunk_files)} files")

                    # Normalize paths for JavaScript tools only
                    if tool in ["eslint", "prettier"]:
                        chunk_files = [f.replace('\\', '/') for f in chunk_files]
                    # Python tools use native path format
                    chunk_command = command + chunk_files

                    # Execute this chunk
                    chunk_findings, chunk_ast_data = _execute_linter_command(
                        tool, chunk_command, root_path, workset_files, timeout
                    )

                    # Aggregate results
                    all_findings.extend(chunk_findings)
                    all_ast_data.update(chunk_ast_data)

                return all_findings, all_ast_data
            else:
                # Single command execution (no chunking needed)
                if tool in ["eslint", "prettier"]:  # JS tools need normalized paths
                    file_args = [f.replace('\\', '/') for f in file_args]
                # Python tools use native path format
                command = command + file_args

                # Execute single command
                return _execute_linter_command(
                    tool, command, root_path, workset_files, timeout
                )

        elif tool in ["golangci-lint", "go-vet"]:
            # Go linters - filter to .go files if needed
            go_files = [f for f in workset_files if f.endswith('.go')]
            if not go_files:
                return [], {}
            # Note: These tools typically operate on packages/directories, not individual files
        elif tool == "tsc":
            # TypeScript compiler - check if we have any TS/TSX files
            ts_files = [f for f in workset_files if f.endswith(('.ts', '.tsx'))]
            if not ts_files:
                return [], {}
            # Note: tsc doesn't take file arguments - it uses tsconfig.json
        elif tool in ["spotbugs", "checkstyle"]:
            # Java linters - check if we have any Java files
            java_files = [f for f in workset_files if f.endswith('.java')]
            if not java_files:
                return [], {}
            # Note: Maven tools operate on the whole project

        # For non-chunked tools, execute directly
        return _execute_linter_command(tool, command, root_path, workset_files, timeout)

    except subprocess.TimeoutExpired:
        print(f"Warning: {tool} timed out after {timeout}s")
    except FileNotFoundError:
        print(f"Warning: {tool} not found, skipping")
    except Exception as e:
        print(f"Warning: Error running {tool}: {e}")

    return findings, ast_data


def _execute_linter_command(
    tool: str,
    command: list[str],
    root_path: str,
    workset_files: set[str],
    timeout: int,
) -> tuple[list[dict[str, Any]], dict[str, Any]]:
    """
    Execute a single linter command and parse its output.
    This is called once for non-chunked tools, multiple times for chunked tools.
    """
    findings = []
    ast_data = {}

    try:
        # Create debug log file when debug flag is set
        debug_log_path = None
        if os.environ.get("THEAUDITOR_DEBUG"):
            debug_log_path = Path(".pf") / "linter_debug.log"
            debug_log_path.parent.mkdir(exist_ok=True)

            # Log ground truth before execution
            debug_info = {
                "tool": tool,
                "command": command,
                "root_path": root_path,
                "cwd": os.getcwd(),
                "PATH": os.environ.get('PATH', ''),
                "NODE_PATH": os.environ.get('NODE_PATH', ''),
                "platform": platform.system(),
                "IS_WINDOWS": IS_WINDOWS,
                "workset_files_count": len(workset_files)
            }

            with open(debug_log_path, 'a', encoding='utf-8') as f:
                f.write(f"\n{'='*60}\n")
                f.write(f"[{tool}] Pre-execution debug at {os.path.basename(__file__)}:{_execute_linter_command.__name__}\n")
                f.write(json.dumps(debug_info, indent=2))
                f.write("\n")

        # Run the linter using our custom temp files to avoid buffer overflow and WSL2 issues
        stdout_path, stderr_path = TempManager.create_temp_files_for_subprocess(root_path, tool)

        with open(stdout_path, 'w+', encoding='utf-8') as stdout_fp, \
             open(stderr_path, 'w+', encoding='utf-8') as stderr_fp:

            # Use bundled Node.js for JavaScript tools on ALL platforms
            js_tools = ["eslint", "tsc", "prettier"]
            if tool in js_tools:
                # Find bundled Node.js runtime
                sandbox_base = Path(root_path) / ".auditor_venv" / ".theauditor_tools"
                node_runtime = sandbox_base / "node-runtime"

                # Platform-specific Node.js executable
                if IS_WINDOWS:
                    node_exe = node_runtime / "node.exe"
                else:
                    node_exe = node_runtime / "bin" / "node"

                if node_exe.exists():
                    # The command[0] is the .cmd or shell wrapper path
                    # We need to find the actual JavaScript entry point
                    # npm installs in node_modules/<package>/<entry>

                    # Map tool to its JavaScript entry point
                    # These are the ACTUAL paths where npm installs them
                    node_modules = sandbox_base / "node_modules"

                    if tool == "eslint":
                        # ESLint main entry is at node_modules/eslint/bin/eslint.js
                        js_script = node_modules / "eslint" / "bin" / "eslint.js"
                        # Fallback to lib/cli.js if bin doesn't exist (older versions)
                        if not js_script.exists():
                            js_script = node_modules / "eslint" / "lib" / "cli.js"
                    elif tool == "tsc":
                        # TypeScript compiler is at node_modules/typescript/lib/tsc.js
                        js_script = node_modules / "typescript" / "lib" / "tsc.js"
                    elif tool == "prettier":
                        # Prettier can be at different locations
                        # Try node_modules/prettier/bin/prettier.cjs first
                        js_script = node_modules / "prettier" / "bin" / "prettier.cjs"
                        if not js_script.exists():
                            # Try prettier.js
                            js_script = node_modules / "prettier" / "bin" / "prettier.js"
                        if not js_script.exists():
                            # Try the main entry
                            js_script = node_modules / "prettier" / "index.js"

                    if js_script.exists():
                        # Build new command using bundled Node.js
                        # Direct execution: node script.js [args...]
                        command_to_run = [str(node_exe), str(js_script)] + command[1:]
                        use_shell = False  # No shell needed with direct execution

                        if debug_log_path:
                            with open(debug_log_path, 'a', encoding='utf-8') as f:
                                f.write(f"[{tool}] Using bundled Node.js runtime\n")
                                f.write(f"  Node: {node_exe}\n")
                                f.write(f"  Script: {js_script}\n")
                                f.write(f"  Command: {command_to_run}\n")
                    else:
                        # Script not found - try to help debug
                        if debug_log_path:
                            with open(debug_log_path, 'a', encoding='utf-8') as f:
                                f.write(f"[{tool}] Script not found: {js_script}\n")
                                # List what actually exists to help debug
                                tool_dir = node_modules / tool.replace("tsc", "typescript")
                                if tool_dir.exists():
                                    f.write(f"[{tool}] Directory exists: {tool_dir}\n")
                                    try:
                                        files = list(tool_dir.rglob("*.js"))[:5]
                                        f.write(f"[{tool}] Found JS files: {files}\n")
                                    except:
                                        pass
                        print(f"ERROR: JavaScript entry point not found: {js_script}")
                        print(f"       Expected location: {js_script}")
                        print(f"       Run 'aud setup-claude --target .' to reinstall")
                        return [], {}
                else:
                    # No bundled Node.js - fail with clear error
                    if debug_log_path:
                        with open(debug_log_path, 'a', encoding='utf-8') as f:
                            f.write(f"[{tool}] Bundled Node.js not found at: {node_exe}\n")
                    print(f"WARNING: {tool} requires bundled Node.js runtime")
                    print(f"         Expected at: {node_exe}")
                    print(f"         Run 'aud setup-claude --target .' to install")
                    return [], {}
            else:
                # Non-JS tools: always use list-based execution
                command_to_run = command
                use_shell = False  # Never use shell

            # Log the actual command that will be executed
            if debug_log_path:
                with open(debug_log_path, 'a', encoding='utf-8') as f:
                    f.write(f"[{tool}] Actual command to execute:\n")
                    f.write(f"  Type: {type(command_to_run)}\n")
                    f.write(f"  Value: {command_to_run}\n")
                    f.write(f"  Shell: {use_shell}\n")

            result = subprocess.run(
                command_to_run,
                cwd=root_path,
                stdout=stdout_fp,
                stderr=stderr_fp,
                text=True,
                encoding='utf-8',
                errors='replace',  # Handle encoding errors gracefully
                timeout=timeout,
                shell=use_shell,  # Determined above based on tool and platform
            )

        with open(stdout_path, 'r', encoding='utf-8', errors='replace') as f:
            result.stdout = f.read()
        with open(stderr_path, 'r', encoding='utf-8', errors='replace') as f:
            result.stderr = f.read()

        # Log the result after execution
        if debug_log_path:
            with open(debug_log_path, 'a', encoding='utf-8') as f:
                f.write(f"[{tool}] Post-execution results:\n")
                f.write(f"  Return code: {result.returncode}\n")
                f.write(f"  Stdout length: {len(result.stdout)} bytes\n")
                f.write(f"  Stderr length: {len(result.stderr)} bytes\n")
                if result.stdout:
                    f.write(f"  Stdout first 500 chars: {result.stdout[:500]}\n")
                if result.stderr:
                    f.write(f"  Stderr first 500 chars: {result.stderr[:500]}\n")
                f.write(f"{'='*60}\n")

        # Clean up temp files - best effort, don't fail if can't delete
        try:
            Path(stdout_path).unlink()
            Path(stderr_path).unlink()
        except (OSError, PermissionError):
            pass  # WSL2/Windows may hold locks

        # Import parsers dynamically to avoid circular imports
        from . import parsers

        # Parse output based on tool
        if tool == "eslint":
            findings, ast_data = parsers.parse_eslint_output(result.stdout, workset_files)
        elif tool == "ruff":
            findings = parsers.parse_ruff_output(result.stdout, workset_files)
        elif tool == "mypy":
            findings = parsers.parse_mypy_output(result.stdout, workset_files)
        elif tool == "tsc":
            findings = parsers.parse_tsc_output(result.stdout, workset_files)
        elif tool == "prettier":
            findings = parsers.parse_prettier_output(result.stdout, result.stderr, workset_files)
        elif tool == "black":
            findings = parsers.parse_black_output(result.stdout, result.stderr, workset_files)
        elif tool == "bandit":
            findings = parsers.parse_bandit_output(result.stdout, workset_files)
        elif tool == "golangci-lint":
            findings = parsers.parse_golangci_output(result.stdout, workset_files)
        elif tool == "go-vet":
            findings = parsers.parse_go_vet_output(result.stderr, workset_files)
        elif tool in ["spotbugs", "checkstyle"]:
            findings = parsers.parse_maven_output(tool, result.stdout, workset_files)

    except subprocess.TimeoutExpired:
        print(f"Warning: {tool} timed out after {timeout}s")
    except FileNotFoundError:
        print(f"Warning: {tool} not found, skipping")
    except Exception as e:
        print(f"Warning: Error running {tool}: {e}")

    return findings, ast_data