# theauditor/commands/insights.py
"""Run optional insights analysis on existing audit data.
This command runs interpretive analysis modules (ML, graph health, taint severity)
on top of existing raw audit data, generating insights and predictions.
"""
import json
import sys
from pathlib import Path
from typing import Dict, Any, List
import click
@click.command()
@click.option("--mode", "-m",
              type=click.Choice(["ml", "graph", "taint", "impact", "all"]),
              default="all",
              help="Which insights modules to run")
@click.option("--ml-train", is_flag=True,
              help="Train ML models before generating suggestions")
@click.option("--topk", default=10, type=int,
              help="Top K files for ML suggestions")
@click.option("--output-dir", "-o", type=click.Path(),
              default="./.pf/insights",
              help="Directory for insights output")
@click.option("--print-summary", is_flag=True,
              help="Print summary to console")
def insights(mode: str, ml_train: bool, topk: int, output_dir: str, print_summary: bool) -> None:
    """Run optional insights analysis on existing audit data.

    This command generates interpretive analysis and predictions based on
    the raw facts collected by the audit pipeline. All insights are optional
    and separate from the core truth data.

    Available insights modules:
    - ml: Machine learning risk predictions and root cause analysis
    - graph: Graph health metrics and architectural scoring
    - taint: Severity scoring for taint analysis paths
    - impact: Impact radius and blast zone analysis
    - all: Run all available insights

    Examples:
    # Run all insights
    aud insights
    # Only ML predictions
    aud insights --mode ml
    # Train ML first, then predict
    aud insights --mode ml --ml-train
    # Graph health only with summary
    aud insights --mode graph --print-summary
    """
    # Insights are derived from raw audit facts; refuse to run without them.
    pf_dir = Path(".pf")
    raw_dir = pf_dir / "raw"
    if not raw_dir.exists():
        click.echo("[ERROR] No raw audit data found. Run 'aud full' first.", err=True)
        sys.exit(1)

    # Create insights directory.
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    click.echo(f"\n{'='*60}")
    click.echo(f"INSIGHTS ANALYSIS - {mode.upper()} Mode")
    click.echo(f"{'='*60}")
    click.echo(f"Output directory: {output_path}")

    results: Dict[str, Any] = {}
    errors: List[str] = []

    # One entry per module: (key, banner, error label, success message, runner).
    # A dispatch table keeps the four modules in lockstep instead of four
    # copy-pasted if-blocks that could drift apart.
    modules = [
        ("ml", "\n[ML] Running machine learning insights...", "ML",
         f" ✓ ML predictions saved to {output_path}/ml_suggestions.json",
         lambda: run_ml_insights(ml_train, topk, output_path)),
        ("graph", "\n[GRAPH] Running graph health analysis...", "Graph",
         f" ✓ Graph health saved to {output_path}/graph_health.json",
         lambda: run_graph_insights(output_path)),
        ("taint", "\n[TAINT] Running taint severity scoring...", "Taint",
         f" ✓ Taint severity saved to {output_path}/taint_severity.json",
         lambda: run_taint_insights(output_path)),
        ("impact", "\n[IMPACT] Running impact analysis...", "Impact",
         f" ✓ Impact analysis saved to {output_path}/impact_analysis.json",
         lambda: run_impact_insights(output_path)),
    ]
    for key, banner, label, success_msg, runner in modules:
        if mode not in (key, "all"):
            continue
        click.echo(banner)
        result = runner()
        results[key] = result
        if result.get("error"):
            errors.append(f"{label}: {result['error']}")
        else:
            click.echo(success_msg)

    # Aggregate all insights into a unified summary file.
    click.echo("\n[AGGREGATE] Creating unified insights summary...")
    summary = aggregate_insights(results, output_path)
    summary_path = output_path / "unified_insights.json"
    # default=str keeps non-JSON-native values (e.g. Path) serializable.
    with open(summary_path, 'w') as f:
        json.dump(summary, f, indent=2, default=str)
    click.echo(f" ✓ Unified summary saved to {summary_path}")

    if print_summary:
        print_insights_summary(summary)

    # Final status: exit non-zero when any module failed so CI can detect it.
    click.echo(f"\n{'='*60}")
    if errors:
        click.echo(f"[WARN] Insights completed with {len(errors)} errors:", err=True)
        for error in errors:
            click.echo(error, err=True)
    else:
        click.echo("[OK] All insights generated successfully")
    click.echo("\n[TIP] Insights are interpretive and optional.")
    click.echo(" Raw facts remain in .pf/raw/ unchanged.")
    sys.exit(1 if errors else 0)
def run_ml_insights(train: bool, topk: int, output_dir: Path) -> Dict[str, Any]:
    """Generate ML risk suggestions, optionally training the models first.

    Returns the suggestion result dict on success, or a {"error": ...} dict
    when the ML extra is missing or any step fails.
    """
    db = "./.pf/repo_index.db"
    manifest = "./.pf/manifest.json"
    try:
        from theauditor.ml import check_ml_available, learn, suggest

        if not check_ml_available():
            return {"error": "ML module not installed. Run: pip install -e .[ml]"}

        # Optional training pass before prediction.
        if train:
            training = learn(
                db_path=db,
                manifest_path=manifest,
                print_stats=False,
            )
            if not training.get("success"):
                return {"error": f"ML training failed: {training.get('error')}"}

        # Generate and persist the top-K suggestions.
        return suggest(
            db_path=db,
            manifest_path=manifest,
            workset_path="./.pf/workset.json",
            topk=topk,
            out_path=str(output_dir / "ml_suggestions.json"),
        )
    except ImportError:
        return {"error": "ML module not available"}
    except Exception as e:
        return {"error": str(e)}
def run_graph_insights(output_dir: Path) -> Dict[str, Any]:
    """Score import-graph health and produce architectural recommendations.

    Loads the graph from .pf/graphs.db and, when present, reuses the
    pre-computed cycles/hotspots in .pf/raw/graph_analysis.json.
    """
    try:
        from theauditor.graph.insights import GraphInsights
        from theauditor.graph.analyzer import XGraphAnalyzer
        from theauditor.graph.store import XGraphStore

        # The SQLite database is the single source of truth for the graph.
        graph = XGraphStore(db_path="./.pf/graphs.db").load_import_graph()
        if not graph or not graph.get("nodes"):
            return {"error": "No import graph found. Run 'aud graph build' first."}

        # Reuse prior analysis output when the raw file exists.
        precomputed: Dict[str, Any] = {}
        analysis_file = Path(".pf/raw/graph_analysis.json")
        if analysis_file.exists():
            with open(analysis_file) as f:
                precomputed = json.load(f)

        engine = GraphInsights()
        analyzer = XGraphAnalyzer()

        # Fall back to fresh computation only when nothing was precomputed.
        cycles = precomputed['cycles'] if 'cycles' in precomputed else analyzer.detect_cycles(graph)
        hotspots = precomputed['hotspots'] if 'hotspots' in precomputed else engine.rank_hotspots(graph)

        health = engine.calculate_health_metrics(graph, cycles=cycles, hotspots=hotspots)
        recommendations = engine.generate_recommendations(graph, cycles=cycles, hotspots=hotspots)

        report = {
            "health_metrics": health,
            "top_hotspots": hotspots[:10],
            "recommendations": recommendations,
            "cycles_found": len(cycles),
            "total_nodes": len(graph.get("nodes", [])),
            "total_edges": len(graph.get("edges", [])),
        }
        with open(output_dir / "graph_health.json", 'w') as f:
            json.dump(report, f, indent=2)

        return {"success": True, "health_score": health.get("health_score")}
    except ImportError:
        return {"error": "Graph insights module not available"}
    except Exception as e:
        return {"error": str(e)}
def run_taint_insights(output_dir: Path) -> Dict[str, Any]:
    """Attach severity and vulnerability classifications to raw taint paths.

    Reads .pf/raw/taint_analysis.json, scores each path, and writes a
    taint_severity.json report into output_dir.
    """
    try:
        from datetime import datetime, UTC
        from theauditor.taint.insights import calculate_severity, classify_vulnerability, generate_summary
        from theauditor.taint_analyzer import SECURITY_SINKS

        raw_file = Path(".pf/raw/taint_analysis.json")
        if not raw_file.exists():
            return {"error": "No taint data found. Run 'aud taint-analyze' first."}
        with open(raw_file) as f:
            raw = json.load(f)
        if not raw.get("success"):
            return {"error": "Taint analysis was not successful"}

        # Two views per path: a compact record for reporting, and an
        # enriched copy (severity attached) for summary generation.
        severity_rows = []
        enriched = []
        score_by_severity = {"critical": 1.0, "high": 0.7}
        for flow in raw.get("taint_paths", []):
            sink = flow.get("sink", {})
            severity = calculate_severity(flow)
            vuln_type = classify_vulnerability(sink, SECURITY_SINKS)
            severity_rows.append({
                "file": sink.get("file"),
                "line": sink.get("line"),
                "severity": severity,
                "vulnerability_type": vuln_type,
                "path_length": len(flow.get("path", [])),
                "risk_score": score_by_severity.get(severity, 0.4),
            })
            enriched_flow = dict(flow)
            enriched_flow["severity"] = severity
            enriched_flow["vulnerability_type"] = vuln_type
            enriched.append(enriched_flow)

        summary = generate_summary(enriched)

        report = {
            "generated_at": datetime.now(UTC).isoformat(),
            "severity_analysis": severity_rows,
            "summary": summary,
            "total_vulnerabilities": len(severity_rows),
            "sources_analyzed": raw.get("sources_found", 0),
            "sinks_analyzed": raw.get("sinks_found", 0),
        }
        with open(output_dir / "taint_severity.json", 'w') as f:
            json.dump(report, f, indent=2)

        return {"success": True, "risk_level": summary.get("risk_level")}
    except ImportError:
        return {"error": "Taint insights module not available"}
    except Exception as e:
        return {"error": str(e)}
def run_impact_insights(output_dir: Path) -> Dict[str, Any]:
    """Summarize the current workset as a placeholder impact report.

    Writes impact_analysis.json into output_dir; detailed per-file impact
    analysis is deferred to the 'aud impact' command.
    """
    try:
        workset_file = Path(".pf/workset.json")
        if not workset_file.exists():
            return {"error": "No workset found. Run 'aud workset' first."}
        with open(workset_file) as f:
            workset = json.load(f)

        changed = len(workset.get("files", []))
        # Placeholder report only; real impact analysis on changed files
        # is a future enhancement.
        report = {
            "files_changed": changed,
            "potential_impact": "Analysis pending",
            "recommendation": "Run 'aud impact --file <file> --line <line>' for detailed analysis",
        }
        with open(output_dir / "impact_analysis.json", 'w') as f:
            json.dump(report, f, indent=2)

        return {"success": True, "files_analyzed": changed}
    except Exception as e:
        return {"error": str(e)}
def aggregate_insights(results: Dict[str, Any], output_dir: Path) -> Dict[str, Any]:
"""Aggregate all insights into unified summary."""
summary = {
"insights_generated": list(results.keys()),
"timestamp": __import__('datetime').datetime.now().isoformat(),
"output_directory": str(output_dir)
}
# ML insights
if "ml" in results and results["ml"].get("success"):
summary["ml"] = {
"status": "success",
"workset_size": results["ml"].get("workset_size", 0),
"predictions_generated": True
}
elif "ml" in results:
summary["ml"] = {"status": "error", "error": results["ml"].get("error")}
# Graph insights
if "graph" in results and results["graph"].get("success"):
summary["graph"] = {
"status": "success",
"health_score": results["graph"].get("health_score", 0)
}
elif "graph" in results:
summary["graph"] = {"status": "error", "error": results["graph"].get("error")}
# Taint insights
if "taint" in results and results["taint"].get("success"):
summary["taint"] = {
"status": "success",
"risk_level": results["taint"].get("risk_level", "unknown")
}
elif "taint" in results:
summary["taint"] = {"status": "error", "error": results["taint"].get("error")}
# Impact insights
if "impact" in results and results["impact"].get("success"):
summary["impact"] = {
"status": "success",
"files_analyzed": results["impact"].get("files_analyzed", 0)
}
elif "impact" in results:
summary["impact"] = {"status": "error", "error": results["impact"].get("error")}
return summary
def print_insights_summary(summary: Dict[str, Any]) -> None:
    """Print a human-readable rendering of the unified insights summary."""
    bar = '=' * 60
    click.echo(f"\n{bar}")
    click.echo("INSIGHTS SUMMARY")
    click.echo(bar)

    # ML section.
    if "ml" in summary:
        ml = summary["ml"]
        if ml["status"] == "success":
            click.echo("\n[ML] Machine Learning Insights:")
            click.echo(f" • Workset size: {ml.get('workset_size', 0)} files")
            click.echo(" • Predictions: Generated successfully")
        else:
            click.echo(f"\n[ML] Machine Learning Insights: {ml.get('error')}")

    # Graph section: map the health score onto a letter grade.
    if "graph" in summary:
        graph = summary["graph"]
        if graph["status"] == "success":
            score = graph.get("health_score", 0)
            grade = "F"
            for cutoff, letter in ((90, "A"), (80, "B"), (70, "C"), (60, "D")):
                if score >= cutoff:
                    grade = letter
                    break
            click.echo("\n[GRAPH] Architecture Health:")
            click.echo(f" • Health score: {score}/100 (Grade: {grade})")
        else:
            click.echo(f"\n[GRAPH] Architecture Health: {graph.get('error')}")

    # Taint section.
    if "taint" in summary:
        taint = summary["taint"]
        if taint["status"] == "success":
            risk = taint.get("risk_level", "unknown")
            click.echo("\n[TAINT] Security Risk:")
            click.echo(f" • Risk level: {risk.upper()}")
        else:
            click.echo(f"\n[TAINT] Security Risk: {taint.get('error')}")

    # Impact section.
    if "impact" in summary:
        impact = summary["impact"]
        if impact["status"] == "success":
            click.echo("\n[IMPACT] Change Impact:")
            click.echo(f" • Files analyzed: {impact.get('files_analyzed', 0)}")
        else:
            click.echo(f"\n[IMPACT] Change Impact: {impact.get('error')}")

    click.echo(f"\n{bar}")
# Public alias used by the CLI registry when wiring up commands.
insights_command = insights