"""Factual Correlation Engine - aggregates and correlates findings from all analysis tools."""
|
||
|
||
import json
|
||
import os
|
||
import re
|
||
import shlex
|
||
import sqlite3
|
||
import subprocess
|
||
from collections import defaultdict, deque
|
||
from datetime import UTC, datetime
|
||
from pathlib import Path
|
||
from typing import Any
|
||
|
||
from theauditor.test_frameworks import detect_test_framework
|
||
from theauditor.correlations import CorrelationLoader
|
||
|
||
|
||
|
||
|
||
def scan_all_findings(raw_dir: Path) -> list[dict[str, Any]]:
    """
    Scan ALL raw outputs for structured findings with line-level detail.

    Extract findings from JSON outputs with file, line, rule, and tool information.
    """
    all_findings = []

    for output_file in raw_dir.glob('*.json'):
        if not output_file.is_file():
            continue

        tool_name = output_file.stem
        try:
            with open(output_file, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # Handle different JSON structures based on tool
            findings = []

            # Standard findings structure (lint.json, patterns.json, etc.)
            if isinstance(data, dict) and 'findings' in data:
                findings = data['findings']
            # Vulnerabilities structure
            elif isinstance(data, dict) and 'vulnerabilities' in data:
                findings = data['vulnerabilities']
            # Taint analysis structure
            elif isinstance(data, dict) and 'taint_paths' in data:
                for path in data['taint_paths']:
                    # Create a finding for each taint path
                    if 'file' in path and 'line' in path:
                        findings.append({
                            'file': path['file'],
                            'line': path['line'],
                            'rule': f"taint-{path.get('sink_type', 'unknown')}",
                            'message': path.get('message', 'Taint path detected')
                        })
            # Direct list of findings
            elif isinstance(data, list):
                findings = data
            # RCA/test results structure
            elif isinstance(data, dict) and 'failures' in data:
                findings = data['failures']

            # Process each finding, keeping only dicts with the required 'file' field
            for finding in findings:
                if isinstance(finding, dict) and 'file' in finding:
                    # Create standardized finding
                    standardized = {
                        'file': finding.get('file', ''),
                        'line': int(finding.get('line', 0)),
                        'rule': finding.get('rule', finding.get('code', finding.get('pattern', 'unknown'))),
                        'tool': finding.get('tool', tool_name),
                        'message': finding.get('message', ''),
                        'severity': finding.get('severity', 'warning')
                    }
                    all_findings.append(standardized)

        except Exception:
            # Skip files that can't be parsed as JSON or lack the expected structure
            continue

    return all_findings

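
# Illustrative example of the standardization above: a raw file `.pf/raw/lint.json`
# containing
#   {"findings": [{"file": "app.py", "line": 12, "rule": "E501", "severity": "warning"}]}
# yields the standardized finding
#   {"file": "app.py", "line": 12, "rule": "E501", "tool": "lint",
#    "message": "", "severity": "warning"}
# where "tool" falls back to the file stem when the finding carries no tool field.
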
def run_tool(command: str, root_path: str, timeout: int = 600) -> tuple[int, str, str]:
    """Run build/test tool with timeout and capture output."""
    try:
        # Use deque as ring buffer to limit memory usage
        max_lines = 10000
        stdout_buffer = deque(maxlen=max_lines)
        stderr_buffer = deque(maxlen=max_lines)

        # Safely split the command string into arguments
        cmd_args = shlex.split(command)

        # Write directly to temp files to avoid pipe buffer overflow
        with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='_stdout.txt') as out_tmp, \
             tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='_stderr.txt') as err_tmp:

            process = subprocess.Popen(
                cmd_args,
                cwd=root_path,
                stdout=out_tmp,
                stderr=err_tmp,
                text=True,
                env={**os.environ, "CI": "true"},  # Set CI env for tools
            )

            stdout_file = out_tmp.name
            stderr_file = err_tmp.name

            # Wait for completion with timeout
            try:
                process.communicate(timeout=timeout)

                # Read back the outputs
                with open(stdout_file, 'r', encoding='utf-8', errors='replace') as f:
                    stdout = f.read()
                with open(stderr_file, 'r', encoding='utf-8', errors='replace') as f:
                    stderr = f.read()

                # Clean up temp files
                os.unlink(stdout_file)
                os.unlink(stderr_file)

                # Append any errors to the global error.log
                if stderr:
                    error_log = Path(root_path) / ".pf" / "error.log"
                    error_log.parent.mkdir(parents=True, exist_ok=True)
                    with open(error_log, 'a') as f:
                        f.write(f"\n=== RCA Subprocess Error ({command[:50]}) ===\n")
                        f.write(f"Timestamp: {datetime.now(UTC).isoformat()}\n")
                        f.write(stderr)
                        f.write("\n")

                # Store in ring buffers (very long output is truncated)
                stdout_buffer.extend(stdout.splitlines())
                stderr_buffer.extend(stderr.splitlines())
            except subprocess.TimeoutExpired:
                process.kill()
                # Don't leak the temp files on timeout
                for tmp_path in (stdout_file, stderr_file):
                    try:
                        os.unlink(tmp_path)
                    except OSError:
                        pass
                return 124, "Process timed out", f"Command exceeded {timeout}s timeout"

        # Join lines
        stdout_text = "\n".join(stdout_buffer)
        stderr_text = "\n".join(stderr_buffer)

        return process.returncode, stdout_text, stderr_text

    except Exception as e:
        return 1, "", str(e)

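
# Usage sketch (illustrative paths): run a test command with a 60s cap.
#   exit_code, out, err = run_tool("pytest -q", "/path/to/repo", timeout=60)
# exit_code == 124 is the sentinel this helper returns on timeout.
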
def parse_typescript_errors(output: str) -> list[dict[str, Any]]:
    """Parse TypeScript/TSNode compiler errors."""
    errors = []

    # TypeScript error format: file:line:col - error CODE: message
    pattern = (
        r"(?P<file>[^:\n]+):(?P<line>\d+):(?P<col>\d+) - error (?P<code>[A-Z]+\d+): (?P<msg>.+)"
    )

    for match in re.finditer(pattern, output):
        errors.append(
            {
                "tool": "tsc",
                "file": match.group("file"),
                "line": int(match.group("line")),
                "column": int(match.group("col")),
                "message": match.group("msg"),
                "code": match.group("code"),
                "category": "type_error",
            }
        )

    return errors

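
# Example: given the tsc diagnostic
#   src/app.ts:10:5 - error TS2322: Type 'string' is not assignable to type 'number'.
# the pattern above yields {"tool": "tsc", "file": "src/app.ts", "line": 10,
# "column": 5, "code": "TS2322", "category": "type_error", ...}.
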
def parse_jest_errors(output: str) -> list[dict[str, Any]]:
    """Parse Jest/Vitest test failures."""
    errors = []

    # Jest failed test: ● Test Suite Name › test name
    # Followed by stack trace: at Object.<anonymous> (file:line:col)
    test_pattern = r"● (?P<testname>[^\n]+)"
    stack_pattern = r"at .*? \((?P<file>[^:]+):(?P<line>\d+):(?P<col>\d+)\)"

    lines = output.splitlines()
    for i, line in enumerate(lines):
        test_match = re.match(test_pattern, line)
        if test_match:
            # Look for a stack frame in the next lines
            for j in range(i + 1, min(i + 20, len(lines))):
                stack_match = re.search(stack_pattern, lines[j])
                if stack_match:
                    errors.append(
                        {
                            "tool": "jest",
                            "file": stack_match.group("file"),
                            "line": int(stack_match.group("line")),
                            "column": int(stack_match.group("col")),
                            "message": f"Test failed: {test_match.group('testname')}",
                            "category": "test_failure",
                        }
                    )
                    break

    return errors

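
# Example: a failing test reported as (● must start the line, per re.match above)
#   ● Checkout › totals the cart
#       at Object.<anonymous> (src/cart.test.ts:33:12)
# becomes {"tool": "jest", "file": "src/cart.test.ts", "line": 33, "column": 12,
# "message": "Test failed: Checkout › totals the cart", "category": "test_failure"}.
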
def parse_pytest_errors(output: str) -> list[dict[str, Any]]:
    """Parse pytest failures."""
    errors = []

    # Pytest error format varies, but typically:
    # FAILED path/to/test.py::TestClass::test_method - AssertionError: message
    # Or: E AssertionError: message
    #     path/to/file.py:42: AssertionError

    failed_pattern = r"FAILED (?P<file>[^:]+)(?:::(?P<test>[^\s]+))? - (?P<msg>.+)"
    error_pattern = r"^E\s+(?P<msg>.+)\n.*?(?P<file>[^:]+):(?P<line>\d+):"

    for match in re.finditer(failed_pattern, output):
        errors.append(
            {
                "tool": "pytest",
                "file": match.group("file"),
                "line": 0,  # Line number is not part of the FAILED format
                "message": match.group("msg"),
                "category": "test_failure",
            }
        )

    for match in re.finditer(error_pattern, output, re.MULTILINE):
        errors.append(
            {
                "tool": "pytest",
                "file": match.group("file"),
                "line": int(match.group("line")),
                "message": match.group("msg"),
                "category": "test_failure",
            }
        )

    return errors

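
# Example: the pytest summary line
#   FAILED tests/test_cart.py::test_total - AssertionError: 3 != 4
# matches failed_pattern and yields {"tool": "pytest", "file": "tests/test_cart.py",
# "line": 0, "message": "AssertionError: 3 != 4", "category": "test_failure"}.
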
def parse_python_compile_errors(output: str) -> list[dict[str, Any]]:
    """Parse Python compilation errors from py_compile output."""
    errors = []

    # Python compile error format:
    # Traceback (most recent call last):
    #   File "path/to/file.py", line X, in <module>
    # SyntaxError: invalid syntax
    # Or: ModuleNotFoundError: No module named 'xxx'

    # Parse traceback format
    lines = output.splitlines()
    for i, line in enumerate(lines):
        # Look for File references in tracebacks
        if 'File "' in line and '", line ' in line:
            # Extract file and line number
            match = re.match(r'.*File "([^"]+)", line (\d+)', line)
            if match and i + 1 < len(lines):
                file_path = match.group(1)
                line_num = int(match.group(2))

                # Look for the error type in the following lines
                for j in range(i + 1, min(i + 5, len(lines))):
                    if 'Error:' in lines[j]:
                        error_msg = lines[j].strip()
                        errors.append({
                            "tool": "py_compile",
                            "file": file_path,
                            "line": line_num,
                            "message": error_msg,
                            "category": "compile_error",
                        })
                        break

        # Also catch simple error messages
        if 'SyntaxError:' in line or 'ModuleNotFoundError:' in line or 'ImportError:' in line:
            # Try to extract file info from previous lines
            file_info = None
            for j in range(max(0, i - 3), i):
                if '***' in lines[j] and '.py' in lines[j]:
                    # py_compile format: *** path/to/file.py
                    file_match = re.match(r'\*\*\* (.+\.py)', lines[j])
                    if file_match:
                        file_info = file_match.group(1)
                        break

            if file_info:
                errors.append({
                    "tool": "py_compile",
                    "file": file_info,
                    "line": 0,
                    "message": line.strip(),
                    "category": "compile_error",
                })

    return errors

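
# Example: a traceback containing
#   File "pkg/mod.py", line 7, in <module>
#   SyntaxError: invalid syntax
# produces {"tool": "py_compile", "file": "pkg/mod.py", "line": 7,
# "message": "SyntaxError: invalid syntax", "category": "compile_error"}.
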
def parse_errors(output: str, tool_name: str) -> list[dict[str, Any]]:
    """Parse errors from tool output.

    Note: tool_name is currently unused; all parsers are applied to the output.
    """
    all_errors = []

    # Try all parsers
    all_errors.extend(parse_typescript_errors(output))
    all_errors.extend(parse_jest_errors(output))
    all_errors.extend(parse_pytest_errors(output))
    all_errors.extend(parse_python_compile_errors(output))

    return all_errors

def load_capsule(capsules_dir: str | Path, file_hash: str) -> dict | None:
    """Load capsule by file hash."""
    capsule_path = Path(capsules_dir) / f"{file_hash}.json"
    if not capsule_path.exists():
        return None

    try:
        with open(capsule_path) as f:
            return json.load(f)
    except json.JSONDecodeError:
        return None

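
# Example (hypothetical hash): load_capsule(".pf/capsules", "3f2a...") reads
# .pf/capsules/3f2a....json and returns the parsed dict, or None if the file
# is missing or malformed.
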
def correlate_failures(
    errors: list[dict[str, Any]],
    manifest_path: str | Path,
    workset_path: str | Path,
    capsules_dir: str | Path,
    db_path: str | Path,
) -> list[dict[str, Any]]:
    """Correlate failures with capsules for factual enrichment."""
    # Load manifest for hash lookup
    file_hashes = {}
    try:
        with open(manifest_path) as f:
            manifest = json.load(f)
            for entry in manifest:
                file_hashes[entry["path"]] = entry.get("sha256")
    except (FileNotFoundError, json.JSONDecodeError):
        pass

    # Load workset (note: workset_files and db_path are currently unused)
    workset_files = set()
    try:
        with open(workset_path) as f:
            workset = json.load(f)
            workset_files = {p["path"] for p in workset.get("paths", [])}
    except (FileNotFoundError, json.JSONDecodeError):
        pass

    # Correlate each error
    for error in errors:
        file = error.get("file", "")

        # Attach the capsule when the file is known to the manifest
        if file in file_hashes:
            file_hash = file_hashes[file]
            capsule = load_capsule(capsules_dir, file_hash)
            if capsule:
                error["capsule"] = {
                    "path": capsule.get("path"),
                    "hash": capsule.get("sha256"),
                    "interfaces": capsule.get("interfaces", {}),
                }

    return errors

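
# Example: a failure {"file": "src/api.py", ...} whose path appears in the
# manifest gains error["capsule"] = {"path": ..., "hash": ..., "interfaces": {...}},
# giving downstream consumers factual context without interpretation.
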
def generate_rca_json(failures: list[dict[str, Any]]) -> dict[str, Any]:
    """Generate RCA JSON output."""
    return {
        "completed_at": datetime.now(UTC).isoformat(),
        "failures": failures,
    }

def run_fce(
    root_path: str = ".",
    capsules_dir: str = "./.pf/capsules",
    manifest_path: str = "manifest.json",
    workset_path: str = "./.pf/workset.json",
    db_path: str = "repo_index.db",
    timeout: int = 600,
    print_plan: bool = False,
) -> dict[str, Any]:
    """Run factual correlation engine - NO interpretation, just facts."""
    try:
        # Step A: Initialization
        raw_dir = Path(root_path) / ".pf" / "raw"
        results = {
            "timestamp": datetime.now(UTC).isoformat(),
            "all_findings": [],
            "test_results": {},
            "correlations": {}
        }
        # Step B: Phase 1 - Gather All Findings
        if raw_dir.exists():
            results["all_findings"] = scan_all_findings(raw_dir)

        # Step B2: Load Optional Insights (ML predictions, etc.)
        insights_dir = Path(root_path) / ".pf" / "insights"
        if insights_dir.exists():
            # Load ML suggestions if available
            ml_path = insights_dir / "ml_suggestions.json"
            if ml_path.exists():
                try:
                    with open(ml_path) as f:
                        ml_data = json.load(f)

                    # Convert ML predictions to correlatable findings.
                    # ML has separate lists for root causes, risk scores, etc.
                    for root_cause in ml_data.get("likely_root_causes", [])[:5]:  # Top 5 root causes
                        if root_cause.get("score", 0) > 0.7:
                            results["all_findings"].append({
                                "file": root_cause["path"],
                                "line": 0,  # ML doesn't provide line-level predictions
                                "rule": "ML_ROOT_CAUSE",
                                "tool": "ml",
                                "message": f"ML predicts {root_cause['score']:.1%} probability as root cause",
                                "severity": "high"
                            })

                    for risk_item in ml_data.get("risk", [])[:5]:  # Top 5 risky files
                        if risk_item.get("score", 0) > 0.7:
                            results["all_findings"].append({
                                "file": risk_item["path"],
                                "line": 0,
                                "rule": f"ML_RISK_{int(risk_item['score']*100)}",
                                "tool": "ml",
                                "message": f"ML predicts {risk_item['score']:.1%} risk score",
                                "severity": "high" if risk_item.get("score", 0) > 0.85 else "medium"
                            })
                except (json.JSONDecodeError, KeyError):
                    pass  # ML insights are optional, continue if they fail
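
            # For reference, the block above assumes ml_suggestions.json shaped
            # like this (inferred from the keys it reads; illustrative, not a spec):
            #   {"likely_root_causes": [{"path": "src/app.py", "score": 0.91}],
            #    "risk": [{"path": "src/db.py", "score": 0.88}]}
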
            # Load taint severity insights if available
            taint_severity_path = insights_dir / "taint_severity.json"
            if taint_severity_path.exists():
                try:
                    with open(taint_severity_path) as f:
                        taint_data = json.load(f)

                    # Add severity-enhanced taint findings
                    for item in taint_data.get("severity_analysis", []):
                        if item.get("severity") in ["critical", "high"]:
                            results["all_findings"].append({
                                "file": item.get("file", ""),
                                "line": item.get("line", 0),
                                "rule": f"TAINT_{item.get('vulnerability_type', 'UNKNOWN').upper().replace(' ', '_')}",
                                "tool": "taint-insights",
                                "message": f"{item.get('vulnerability_type')} with {item.get('severity')} severity",
                                "severity": item.get("severity")
                            })
                except (json.JSONDecodeError, KeyError):
                    pass  # Insights are optional
        # Step C: Phase 2 - Execute Tests
        # Detect test framework
        framework_info = detect_test_framework(root_path)

        tools = []
        if framework_info["name"] != "unknown" and framework_info["cmd"]:
            command = framework_info["cmd"]

            # Add quiet flags
            if "pytest" in command:
                command = "pytest -q -p no:cacheprovider"
            elif "npm test" in command:
                command = "npm test --silent"
            elif "unittest" in command:
                command = "python -m unittest discover -q"

            tools.append({
                "name": framework_info["name"],
                "command": command,
                "type": "test"
            })

        # Check for build scripts
        package_json = Path(root_path) / "package.json"
        if package_json.exists():
            try:
                with open(package_json) as f:
                    package = json.load(f)
                    scripts = package.get("scripts", {})
                    if "build" in scripts:
                        tools.append({
                            "name": "npm build",
                            "command": "npm run build --silent",
                            "type": "build"
                        })
            except json.JSONDecodeError:
                pass

        if print_plan:
            print("Detected tools:")
            for tool in tools:
                print(f"  - {tool['name']}: {tool['command']}")
            return {"success": True, "printed_plan": True}

        # An empty tools list is fine: the loop below is a no-op and
        # processing continues with the findings gathered above
        # Run tools and collect failures
        all_failures = []

        for tool in tools:
            print(f"Running {tool['name']}...")
            exit_code, stdout, stderr = run_tool(tool["command"], root_path, timeout)

            if exit_code != 0:
                output = stdout + "\n" + stderr
                errors = parse_errors(output, tool["name"])

                # Special handling for pytest collection failures
                if tool["name"] == "pytest" and exit_code == 2 and "ERROR collecting" in output:
                    print("Pytest collection failed. Falling back to Python compilation check...")

                    py_files = []
                    for py_file in Path(root_path).rglob("*.py"):
                        if "__pycache__" not in str(py_file) and not any(part.startswith('.') for part in py_file.parts):
                            py_files.append(str(py_file.relative_to(root_path)))

                    if py_files:
                        print(f"Checking {len(py_files)} Python files for compilation errors...")
                        compile_errors = []

                        for py_file in py_files[:50]:
                            # Derive the dotted module path from the relative file path;
                            # strip only the .py suffix, not ".py" substrings elsewhere
                            module_path = Path(py_file).with_suffix('').as_posix().replace('/', '.')
                            import_cmd = f'python3 -c "import {module_path}"'
                            comp_exit, comp_out, comp_err = run_tool(import_cmd, root_path, 10)

                            if comp_exit != 0:
                                comp_output = comp_out + "\n" + comp_err
                                if comp_output.strip():
                                    error_lines = comp_output.strip().split('\n')
                                    error_msg = "Import failed"

                                    # Surface the first recognizable error line
                                    error_markers = ('ModuleNotFoundError:', 'ImportError:', 'SyntaxError:', 'AttributeError:')
                                    for line in error_lines:
                                        if any(marker in line for marker in error_markers):
                                            error_msg = line.strip()
                                            break

                                    compile_errors.append({
                                        "tool": "py_import",
                                        "file": py_file,
                                        "line": 0,
                                        "message": error_msg,
                                        "category": "compile_error",
                                    })

                        if compile_errors:
                            print(f"Found {len(compile_errors)} compilation errors")
                            errors.extend(compile_errors)

                # If no errors were parsed, record a generic failure
                if not errors and exit_code != 0:
                    errors.append({
                        "tool": tool["name"],
                        "file": "unknown",
                        "line": 0,
                        "message": f"Tool failed with exit code {exit_code}",
                        "category": "runtime",
                    })

                all_failures.extend(errors)

        # Correlate with capsules
        all_failures = correlate_failures(
            all_failures,
            Path(root_path) / manifest_path,
            Path(root_path) / workset_path,
            Path(root_path) / capsules_dir,
            Path(root_path) / db_path,
        )
        # Store test results
        results["test_results"] = {
            "completed_at": datetime.now(UTC).isoformat(),
            "failures": all_failures,
            "tools_run": len(tools)
        }

        # Step D: Consolidate Evidence
        consolidated_findings = results["all_findings"].copy()

        # Add test failures to the consolidated list
        for failure in all_failures:
            if 'file' in failure and 'line' in failure:
                consolidated_findings.append({
                    'file': failure['file'],
                    'line': int(failure.get('line', 0)),
                    'rule': failure.get('code', failure.get('category', 'test-failure')),
                    'tool': failure.get('tool', 'test'),
                    'message': failure.get('message', ''),
                    'severity': failure.get('severity', 'error')
                })
        # Step E: Phase 3 - Line-Level Correlation (Hotspots)
        # Group findings by file:line
        line_groups = defaultdict(list)
        for finding in consolidated_findings:
            if finding['line'] > 0:
                key = f"{finding['file']}:{finding['line']}"
                line_groups[key].append(finding)

        # A hotspot is a line flagged by more than one tool
        hotspots = {}
        for line_key, findings in line_groups.items():
            tools_on_line = {f['tool'] for f in findings}
            if len(tools_on_line) > 1:
                hotspots[line_key] = findings

        # Enrich hotspots with symbol context
        full_db_path = Path(root_path) / db_path
        if hotspots and full_db_path.exists():
            try:
                conn = sqlite3.connect(str(full_db_path))
                cursor = conn.cursor()

                enriched_hotspots = {}
                for line_key, findings in hotspots.items():
                    if ':' in line_key:
                        file_path, line_str = line_key.rsplit(':', 1)
                        try:
                            line_num = int(line_str)

                            # Nearest enclosing function/class defined at or before this line
                            query = """
                                SELECT name, type, line
                                FROM symbols
                                WHERE file = ?
                                AND line <= ?
                                AND type IN ('function', 'class')
                                ORDER BY line DESC
                                LIMIT 1
                            """
                            cursor.execute(query, (file_path, line_num))
                            result = cursor.fetchone()

                            hotspot_data = {"findings": findings}

                            if result:
                                symbol_name, symbol_type, symbol_line = result
                                hotspot_data["in_symbol"] = f"{symbol_type}: {symbol_name}"

                            enriched_hotspots[line_key] = hotspot_data
                        except (ValueError, TypeError):
                            enriched_hotspots[line_key] = {"findings": findings}
                    else:
                        enriched_hotspots[line_key] = {"findings": findings}

                conn.close()
                hotspots = enriched_hotspots
            except Exception:
                # Enrichment is best-effort; fall back to the bare findings
                hotspots = {k: {"findings": v} for k, v in hotspots.items()}
        else:
            hotspots = {k: {"findings": v} for k, v in hotspots.items()}
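
        # Example hotspot entry after enrichment (illustrative):
        #   "src/app.py:42": {"findings": [<lint finding>, <taint finding>],
        #                     "in_symbol": "function: handle_request"}
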
        # Store hotspots in correlations
        results["correlations"]["hotspots"] = hotspots
        results["correlations"]["total_findings"] = len(consolidated_findings)
        results["correlations"]["total_lines_with_findings"] = len(line_groups)
        results["correlations"]["total_hotspots"] = len(hotspots)

        # Step F: Phase 4 - Factual Cluster Detection
        factual_clusters = []

        # Load correlation rules
        correlation_loader = CorrelationLoader()
        correlation_rules = correlation_loader.load_rules()

        if correlation_rules and consolidated_findings:
            # Group findings by file
            findings_by_file = defaultdict(list)
            for finding in consolidated_findings:
                if 'file' in finding:
                    findings_by_file[finding['file']].append(finding)

            # A rule matches a file only if every co-occurring fact is present
            for file_path, file_findings in findings_by_file.items():
                for rule in correlation_rules:
                    all_facts_matched = True

                    for fact_index, _fact in enumerate(rule.co_occurring_facts):
                        fact_matched = any(
                            rule.matches_finding(finding, fact_index)
                            for finding in file_findings
                        )
                        if not fact_matched:
                            all_facts_matched = False
                            break

                    if all_facts_matched:
                        factual_clusters.append({
                            "name": rule.name,
                            "file": file_path,
                            "description": rule.description,
                            "confidence": rule.confidence
                        })

        # Store factual clusters
        results["correlations"]["factual_clusters"] = factual_clusters
        # Step G: Finalization - Apply intelligent organization sorting
        from theauditor.utils.finding_priority import sort_findings, normalize_severity

        # CRITICAL: Normalize all severities BEFORE sorting.
        # This handles Docker's integer severity and ESLint's "error" strings.
        if results.get("all_findings"):
            # First pass: normalize severity in-place
            for finding in results["all_findings"]:
                original_severity = finding.get("severity")
                finding["severity"] = normalize_severity(original_severity)

                # Debug log for unusual severities (helps catch new formats);
                # integer severities are expected from Docker, so don't log those
                if (
                    original_severity
                    and str(original_severity) != finding["severity"]
                    and not isinstance(original_severity, int)
                ):
                    print(f"[FCE] Normalized severity: {original_severity} -> {finding['severity']}")

            # Second pass: sort using centralized logic
            results["all_findings"] = sort_findings(results["all_findings"])

            # Log sorting results for verification
            if results["all_findings"]:
                print(f"[FCE] Sorted {len(results['all_findings'])} findings")
                first = results["all_findings"][0]
                last = results["all_findings"][-1] if len(results["all_findings"]) > 1 else first
                print(f"[FCE] First: {first.get('severity')} from {first.get('tool')}")
                print(f"[FCE] Last: {last.get('severity')} from {last.get('tool')}")

        # Write results to JSON
        raw_dir.mkdir(parents=True, exist_ok=True)
        fce_path = raw_dir / "fce.json"
        fce_path.write_text(json.dumps(results, indent=2), encoding="utf-8")

        # Count total failures/findings
        failures_found = len(results.get("all_findings", []))

        # Return success structure
        return {
            "success": True,
            "failures_found": failures_found,
            "output_files": [str(fce_path)],
            "results": results
        }

    except Exception as e:
        # Step H: Error Handling
        return {
            "success": False,
            "failures_found": 0,
            "error": str(e)
        }
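
# Usage sketch (illustrative): run the engine against the current repo.
#   result = run_fce(root_path=".")
#   if result["success"]:
#       hotspots = result["results"]["correlations"]["hotspots"]
#       print(f"{result['failures_found']} findings, {len(hotspots)} hotspots")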