Auditor/theauditor/linters/parsers.py
TheAuditorTool 4f6729d64c Fix: Handle null line/column values in ESLint and Bandit parsers

- Fix TypeError when linters return null instead of missing fields
- Use 'or' operator to safely convert null to 0
- Affects only JSON-based parsers (ESLint, Bandit)
- Line 0 indicates file-level or configuration issues

Fixes GitHub issue: "TypeError: '<' not supported between instances of 'str' and 'NoneType'"
2025-09-09 17:46:15 +07:00


"""Linter output parsers - converts various linter outputs to normalized format.
COURIER PHILOSOPHY:
- We translate tool output keys to standard keys
- We preserve exact messages and severities
- We perform direct data access without interpretation
- We validate translation, not content
"""
import json
import re
from pathlib import Path
from typing import Any
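
# All parsers below emit findings in one normalized shape. The values here are
# illustrative only, not taken from any real run:
#
#     {
#         "tool": "eslint",          # name of the emitting tool
#         "file": "src/app.ts",      # workset-relative path, forward slashes
#         "line": 10,                # 0 = file-level or configuration issue
#         "column": 4,               # 0 when the tool reports no column
#         "rule": "no-unused-vars",  # tool-specific rule/code ("" if absent)
#         "message": "...",          # preserved verbatim from the tool
#         "severity": "warning",     # usually "error" or "warning" ("note" passes through from mypy)
#         "category": "lint",        # "lint" | "type" | "style" | "security"
#     }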


def parse_eslint_output(output: str, workset_files: set[str]) -> tuple[list[dict[str, Any]], dict[str, Any]]:
    """Parse ESLint JSON output.

    Returns:
        Tuple of (findings, ast_data) where ast_data maps file paths to AST objects.
    """
    findings = []
    ast_data = {}
    try:
        results = json.loads(output)
        for file_result in results:
            file = Path(file_result["filePath"])
            # Normalize path to forward slashes for cross-platform compatibility
            file_str = str(file).replace("\\", "/")
            # Try to match against the workset in various forms
            matched = False
            for workset_file in workset_files:
                # Check if the absolute path ends with the relative workset path.
                # This handles both Windows absolute paths and Unix paths.
                if file_str.endswith(workset_file):
                    matched = True
                    file_str = workset_file
                    break
                # Also check if the workset file is contained in the path (with proper separators)
                elif "/" + workset_file in file_str or file_str.startswith(workset_file):
                    matched = True
                    file_str = workset_file
                    break
            if not matched:
                continue
            # Extract AST if present
            if "ast" in file_result:
                ast_data[file_str] = file_result["ast"]
            for message in file_result.get("messages", []):
                # ESLint severity is numeric (2=error, 1=warning) - translate to standard strings
                eslint_severity = message.get("severity", 1)
                if eslint_severity == 2:
                    standard_severity = "error"
                elif eslint_severity == 1:
                    standard_severity = "warning"
                else:
                    standard_severity = "warning"  # Default for unknown values
                # Create the translated finding using standard keys
                translated = {
                    "tool": "eslint",
                    "file": file_str,
                    "line": int(message.get("line") or 0),  # 'or' safely converts null to 0
                    "column": int(message.get("column") or 0),
                    "rule": message.get("ruleId", ""),  # Empty, not "unknown"
                    "message": message.get("message", ""),
                    "severity": standard_severity,  # Use standardized severity
                    "category": "lint",
                }
                findings.append(translated)
    except json.JSONDecodeError:
        # Fall back to regex parsing of the plain-text format
        pattern = r"([^:]+):(\d+):(\d+):\s+(error|warning)\s+(.+?)\s+([a-z0-9\-\/]+)\s*$"
        for line in output.strip().split("\n"):
            match = re.match(pattern, line.strip())
            if match:
                file = match.group(1).strip()
                if file in workset_files:
                    findings.append(
                        {
                            "tool": "eslint",
                            "file": file,
                            "line": int(match.group(2)),
                            "column": int(match.group(3)),
                            "rule": match.group(6),
                            "message": match.group(5),
                            "severity": match.group(4),  # Keep original for /raw/
                            "category": "lint",
                        }
                    )
    return findings, ast_data
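
# Illustrative example (hypothetical paths and message): given the ESLint JSON
#     [{"filePath": "/repo/src/app.js",
#       "messages": [{"ruleId": "no-undef", "severity": 2,
#                     "message": "'foo' is not defined", "line": 3, "column": 7}]}]
# with "src/app.js" in the workset, the parser emits one finding with severity
# "error"; the `or 0` guards mean a null line/column becomes 0 instead of
# raising the TypeError this commit fixes.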


def parse_ruff_output(output: str, workset_files: set[str]) -> list[dict[str, Any]]:
    """Parse ruff output - translate to standard keys."""
    findings = []
    # Format: path:line:col: code message
    # Codes may have multi-letter prefixes (e.g. UP035, SIM102), so match [A-Z]+.
    pattern = r"([^:]+):(\d+):(\d+):\s+([A-Z]+\d+)\s+(.+)$"
    for line in output.strip().split("\n"):
        match = re.match(pattern, line.strip())
        if match:
            file = match.group(1).strip()
            # Normalize Windows backslashes to forward slashes for comparison
            normalized_file = file.replace("\\", "/")
            # Check if file is in workset (comparing normalized paths)
            if normalized_file in workset_files or file in workset_files:
                # COURIER: translate to standard keys
                translated = {
                    "tool": "ruff",
                    "file": normalized_file,  # Use normalized path
                    "line": int(match.group(2)),
                    "column": int(match.group(3)),
                    "rule": match.group(4),  # Preserve the original code in the rule field
                    "message": match.group(5),  # Preserve exactly
                    "severity": "warning",  # Ruff's concise format does not report severity
                    "category": "lint",
                }
                findings.append(translated)
    return findings
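
# Illustrative example (hypothetical): the ruff line
#     "theauditor/cli.py:12:5: F401 'os' imported but unused"
# yields rule "F401", line 12, column 5, severity "warning".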


def parse_mypy_output(output: str, workset_files: set[str]) -> list[dict[str, Any]]:
    """Parse mypy output."""
    findings = []
    # Format: path:line: severity: message [error-code]
    pattern = r"([^:]+):(\d+):\s+(error|warning|note):\s+(.+?)(?:\s+\[([^\]]+)\])?$"
    for line in output.strip().split("\n"):
        match = re.match(pattern, line.strip())
        if match:
            file = match.group(1).strip()
            # Normalize Windows backslashes to forward slashes for comparison
            normalized_file = file.replace("\\", "/")
            # Check if file is in workset (comparing normalized paths)
            if normalized_file in workset_files or file in workset_files:
                translated = {
                    "tool": "mypy",
                    "file": normalized_file,  # Use normalized path
                    "line": int(match.group(2)),
                    "column": 0,  # Mypy omits columns in this format
                    "rule": match.group(5) or "type-error",  # Fall back when no [code] suffix
                    "message": match.group(4),
                    "severity": match.group(3),  # Keep original mypy severity for /raw/
                    "category": "type",
                }
                findings.append(translated)
    return findings
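
# Illustrative example (hypothetical): the mypy line
#     "app/models.py:42: error: Incompatible return value type  [return-value]"
# yields rule "return-value" and severity "error"; a line without a trailing
# [code] falls back to rule "type-error".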


def parse_tsc_output(output: str, workset_files: set[str]) -> list[dict[str, Any]]:
    """Parse TypeScript compiler output."""
    findings = []
    # Format: path(line,col): error TS1234: message
    pattern = r"([^(]+)\((\d+),(\d+)\):\s+(error|warning)\s+(TS\d+):\s+(.+)$"
    for line in output.strip().split("\n"):
        match = re.match(pattern, line.strip())
        if match:
            file = match.group(1).strip()
            if file in workset_files:
                translated = {
                    "tool": "tsc",
                    "file": file,
                    "line": int(match.group(2)),
                    "column": int(match.group(3)),
                    "rule": match.group(5),
                    "message": match.group(6),
                    "severity": match.group(4),  # Keep original tsc severity for /raw/
                    "category": "type",
                }
                findings.append(translated)
    return findings
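
# Illustrative example (hypothetical): the tsc line
#     "src/index.ts(7,13): error TS2322: Type 'string' is not assignable to type 'number'."
# yields rule "TS2322", line 7, column 13, severity "error", category "type".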


def parse_prettier_output(
    stdout: str, stderr: str, workset_files: set[str]
) -> list[dict[str, Any]]:
    """Parse Prettier output."""
    findings = []
    # When run with --check, Prettier lists unformatted files on stderr with a [warn] prefix.
    # Example: "[warn] backend/src/app.ts" or, with ANSI codes,
    # "\x1b[33m[warn]\x1b[39m backend/src/app.ts".
    # Pattern to extract the file path after the [warn] prefix
    pattern = r"\[warn\]\s+(.+?)$"
    for line in stderr.strip().split("\n"):
        if line and not line.startswith("Checking"):
            # Remove ANSI color codes first
            clean_line = re.sub(r"\x1b\[[0-9;]*m", "", line)
            clean_line = re.sub(r"\[\d+m", "", clean_line)  # Also handle bare [33m sequences
            # Extract the file path after [warn]
            match = re.search(pattern, clean_line)
            if match:
                file = match.group(1).strip()
            else:
                # Fallback: if there is no [warn] prefix, use the whole line
                file = clean_line.strip()
            # Normalize path for comparison
            normalized_file = file.replace("\\", "/")
            # Check if file is in workset
            if normalized_file in workset_files or file in workset_files:
                translated = {
                    "tool": "prettier",
                    "file": normalized_file,  # Use normalized path
                    "line": 0,  # File-level issue
                    "column": 0,
                    "rule": "format",
                    "message": "File needs formatting",
                    "severity": "warning",  # Prettier reports no severity; default to warning
                    "category": "style",
                }
                findings.append(translated)
    return findings
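
# Illustrative example (hypothetical): the `prettier --check` stderr line
#     "\x1b[33m[warn]\x1b[39m backend/src/app.ts"
# is stripped of ANSI codes to "[warn] backend/src/app.ts" and reported as a
# line-0 (file-level) style finding for backend/src/app.ts.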


def parse_black_output(stdout: str, stderr: str, workset_files: set[str]) -> list[dict[str, Any]]:
    """Parse Black output."""
    findings = []
    # Black lists files that would be reformatted on stderr
    pattern = r"would reformat (.+)$"
    for match in re.finditer(pattern, stderr, re.MULTILINE):
        file = match.group(1)
        # Normalize Windows backslashes to forward slashes for comparison
        normalized_file = file.replace("\\", "/")
        # Check if file is in workset (comparing normalized paths)
        if normalized_file in workset_files or file in workset_files:
            translated = {
                "tool": "black",
                "file": normalized_file,  # Use normalized path
                "line": 0,  # File-level issue
                "column": 0,
                "rule": "format",
                "message": "File needs formatting",
                "severity": "warning",  # Black reports no severity; default to warning
                "category": "style",
            }
            findings.append(translated)
    # Also check for --diff output in stdout.
    # When --diff is used, Black writes unified diffs to stdout.
    if stdout and stdout.startswith("---"):
        # Extract filenames from diff headers
        diff_pattern = r"^---\s+(.+?)\s+\d{4}-\d{2}-\d{2}"
        for match in re.finditer(diff_pattern, stdout, re.MULTILINE):
            file = match.group(1)
            # Normalize Windows backslashes to forward slashes for comparison
            normalized_file = file.replace("\\", "/")
            # Check if file is in workset (comparing normalized paths)
            if normalized_file in workset_files or file in workset_files:
                # Skip files already reported from stderr
                if not any(f["file"] == normalized_file for f in findings):
                    translated = {
                        "tool": "black",
                        "file": normalized_file,  # Use normalized path
                        "line": 0,
                        "column": 0,
                        "rule": "format",
                        "message": "File needs formatting",
                        "severity": "warning",
                        "category": "style",
                    }
                    findings.append(translated)
    return findings
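
# Illustrative example (hypothetical): the stderr line
#     "would reformat scrapers/fetch.py"
# and the --diff header
#     "--- scrapers/fetch.py  2025-09-09 10:00:00.000000 +0000"
# both point at the same file; the stdout pass only adds a finding when
# stderr has not already reported it.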


def parse_golangci_output(output: str, workset_files: set[str]) -> list[dict[str, Any]]:
    """Parse golangci-lint output."""
    findings = []
    # Format: path:line:col: message (linter)
    pattern = r"([^:]+):(\d+):(\d+):\s+(.+?)\s+\(([^)]+)\)$"
    for match in re.finditer(pattern, output, re.MULTILINE):
        file = match.group(1)
        if file in workset_files:
            translated = {
                "tool": "golangci-lint",
                "file": file,
                "line": int(match.group(2)),
                "column": int(match.group(3)),
                "rule": match.group(5),  # The linter name in parentheses
                "message": match.group(4),
                "severity": "warning",  # Text output carries no severity; default to warning
                "category": "lint",
            }
            findings.append(translated)
    return findings
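
# Illustrative example (hypothetical): the golangci-lint line
#     "pkg/server.go:88:2: ineffectual assignment to err (ineffassign)"
# yields rule "ineffassign" (the linter name in parentheses) and severity "warning".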


def parse_go_vet_output(output: str, workset_files: set[str]) -> list[dict[str, Any]]:
    """Parse go vet output."""
    findings = []
    # Format: path:line:col: message
    pattern = r"([^:]+):(\d+):(\d+):\s+(.+)$"
    for match in re.finditer(pattern, output, re.MULTILINE):
        file = match.group(1)
        if file in workset_files:
            translated = {
                "tool": "go-vet",
                "file": file,
                "line": int(match.group(2)),
                "column": int(match.group(3)),
                "rule": "vet",
                "message": match.group(4),
                "severity": "warning",  # go vet reports no severity; default to warning
                "category": "lint",
            }
            findings.append(translated)
    return findings
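
# Illustrative example (hypothetical): the go vet line
#     "cmd/main.go:15:2: unreachable code"
# yields rule "vet", line 15, column 2, severity "warning".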


def parse_maven_output(tool: str, output: str, workset_files: set[str]) -> list[dict[str, Any]]:
    """Parse Maven-based tool output (SpotBugs/Checkstyle)."""
    findings = []
    # Simple pattern matching for Maven output: [ERROR|WARNING] path:line: message
    pattern = r"\[(?:ERROR|WARNING)\]\s+([^:]+):(\d+):\s+(.+)$"
    for match in re.finditer(pattern, output, re.MULTILINE):
        file = match.group(1)
        if file in workset_files:
            translated = {
                "tool": tool,
                "file": file,
                "line": int(match.group(2)),
                "column": 0,  # Maven output has no column information
                "rule": tool,
                "message": match.group(3),
                "severity": "warning",  # The [ERROR]/[WARNING] level is not captured; default to warning
                "category": "lint",
            }
            findings.append(translated)
    return findings
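
# Illustrative example (hypothetical): the Checkstyle-via-Maven line
#     "[WARNING] src/main/java/App.java:30: Missing a Javadoc comment."
# yields tool "checkstyle" (whatever the caller passes as `tool`), line 30,
# and the hard-coded severity "warning" regardless of the [ERROR]/[WARNING] level.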


def parse_bandit_output(output: str, workset_files: set[str]) -> list[dict[str, Any]]:
    """Parse Bandit JSON output for Python security issues."""
    findings = []
    try:
        results = json.loads(output)
        # Bandit's JSON structure keeps findings under the "results" key
        for result in results.get("results", []):
            file = result.get("filename", "")
            # Normalize Windows backslashes to forward slashes for comparison
            normalized_file = file.replace("\\", "/")
            # Check if the absolute path from Bandit matches any relative workset path.
            # Bandit returns absolute paths like C:/Users/.../file.py while the
            # workset holds relative paths like scrapers/file.py.
            matched = False
            matched_file = normalized_file  # Default to the normalized absolute path
            for workset_file in workset_files:
                # Normalize the workset file for comparison
                normalized_workset = workset_file.replace("\\", "/")
                # Check if the absolute path ends with the relative path
                if normalized_file.endswith(normalized_workset):
                    matched = True
                    matched_file = normalized_workset  # Use the workset's relative path
                    break
                # Also check with a leading slash
                elif normalized_file.endswith("/" + normalized_workset):
                    matched = True
                    matched_file = normalized_workset
                    break
            if matched:
                # Map Bandit severity to the standard strings
                severity_map = {
                    "HIGH": "error",
                    "MEDIUM": "warning",
                    "LOW": "warning",
                }
                translated = {
                    "tool": "bandit",
                    "file": matched_file,  # Use the matched relative path from the workset
                    "line": int(result.get("line_number") or 0),  # 'or' safely converts null to 0
                    "column": int(result.get("col_offset") or 0),
                    "rule": result.get("test_id", ""),
                    "message": result.get("issue_text", ""),
                    "severity": severity_map.get(result.get("issue_severity", "MEDIUM"), "warning"),
                    "category": "security",
                }
                findings.append(translated)
    except json.JSONDecodeError:
        # Malformed JSON: no text fallback, return whatever was collected
        pass
    return findings
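
# Illustrative example (hypothetical): given the Bandit JSON
#     {"results": [{"filename": "C:/Users/dev/repo/scrapers/fetch.py",
#                   "line_number": 12, "col_offset": 0, "test_id": "B602",
#                   "issue_severity": "HIGH",
#                   "issue_text": "subprocess call with shell=True identified"}]}
# with "scrapers/fetch.py" in the workset, the absolute path is matched by
# suffix and the finding is reported with severity "error" (HIGH maps to error).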