Mirror of https://github.com/aljazceru/Auditor.git

Initial commit: TheAuditor v1.0.1 - AI-centric SAST and Code Intelligence Platform

theauditor/commands/insights.py (new file, +443 lines)
"""Run optional insights analysis on existing audit data.
|
||||
|
||||
This command runs interpretive analysis modules (ML, graph health, taint severity)
|
||||
on top of existing raw audit data, generating insights and predictions.
|
||||
"""

import json
import sys
from datetime import datetime
from pathlib import Path
from typing import Dict, Any

import click

@click.command()
@click.option("--mode", "-m",
              type=click.Choice(["ml", "graph", "taint", "impact", "all"]),
              default="all",
              help="Which insights modules to run")
@click.option("--ml-train", is_flag=True,
              help="Train ML models before generating suggestions")
@click.option("--topk", default=10, type=int,
              help="Top K files for ML suggestions")
@click.option("--output-dir", "-o", type=click.Path(),
              default="./.pf/insights",
              help="Directory for insights output")
@click.option("--print-summary", is_flag=True,
              help="Print summary to console")
def insights(mode: str, ml_train: bool, topk: int, output_dir: str, print_summary: bool) -> None:
    """Run optional insights analysis on existing audit data.

    This command generates interpretive analysis and predictions based on
    the raw facts collected by the audit pipeline. All insights are optional
    and separate from the core truth data.

    Available insights modules:
    - ml: Machine learning risk predictions and root cause analysis
    - graph: Graph health metrics and architectural scoring
    - taint: Severity scoring for taint analysis paths
    - impact: Impact radius and blast zone analysis
    - all: Run all available insights

    Examples:
        # Run all insights
        aud insights

        # Only ML predictions
        aud insights --mode ml

        # Train ML first, then predict
        aud insights --mode ml --ml-train

        # Graph health only with summary
        aud insights --mode graph --print-summary
    """
    # Ensure we have raw data to analyze
    pf_dir = Path(".pf")
    raw_dir = pf_dir / "raw"

    if not raw_dir.exists():
        click.echo("[ERROR] No raw audit data found. Run 'aud full' first.", err=True)
        sys.exit(1)

    # Create insights directory
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    click.echo(f"\n{'='*60}")
    click.echo(f"INSIGHTS ANALYSIS - {mode.upper()} Mode")
    click.echo(f"{'='*60}")
    click.echo(f"Output directory: {output_path}")

    results = {}
    errors = []

    # ML Insights
    if mode in ["ml", "all"]:
        click.echo("\n[ML] Running machine learning insights...")
        ml_result = run_ml_insights(ml_train, topk, output_path)
        results["ml"] = ml_result
        if ml_result.get("error"):
            errors.append(f"ML: {ml_result['error']}")
        else:
            click.echo(f"  ✓ ML predictions saved to {output_path}/ml_suggestions.json")

    # Graph Health Insights
    if mode in ["graph", "all"]:
        click.echo("\n[GRAPH] Running graph health analysis...")
        graph_result = run_graph_insights(output_path)
        results["graph"] = graph_result
        if graph_result.get("error"):
            errors.append(f"Graph: {graph_result['error']}")
        else:
            click.echo(f"  ✓ Graph health saved to {output_path}/graph_health.json")

    # Taint Severity Insights
    if mode in ["taint", "all"]:
        click.echo("\n[TAINT] Running taint severity scoring...")
        taint_result = run_taint_insights(output_path)
        results["taint"] = taint_result
        if taint_result.get("error"):
            errors.append(f"Taint: {taint_result['error']}")
        else:
            click.echo(f"  ✓ Taint severity saved to {output_path}/taint_severity.json")

    # Impact Analysis Insights
    if mode in ["impact", "all"]:
        click.echo("\n[IMPACT] Running impact analysis...")
        impact_result = run_impact_insights(output_path)
        results["impact"] = impact_result
        if impact_result.get("error"):
            errors.append(f"Impact: {impact_result['error']}")
        else:
            click.echo(f"  ✓ Impact analysis saved to {output_path}/impact_analysis.json")

    # Aggregate all insights into unified summary
    click.echo("\n[AGGREGATE] Creating unified insights summary...")
    summary = aggregate_insights(results, output_path)

    # Save unified summary
    summary_path = output_path / "unified_insights.json"
    with open(summary_path, 'w') as f:
        json.dump(summary, f, indent=2, default=str)
    click.echo(f"  ✓ Unified summary saved to {summary_path}")

    # Print summary if requested
    if print_summary:
        print_insights_summary(summary)

    # Final status
    click.echo(f"\n{'='*60}")
    if errors:
        click.echo(f"[WARN] Insights completed with {len(errors)} errors:", err=True)
        for error in errors:
            click.echo(f"  • {error}", err=True)
    else:
        click.echo("[OK] All insights generated successfully")

    click.echo("\n[TIP] Insights are interpretive and optional.")
    click.echo("      Raw facts remain in .pf/raw/ unchanged.")

    sys.exit(1 if errors else 0)

def run_ml_insights(train: bool, topk: int, output_dir: Path) -> Dict[str, Any]:
    """Run ML insights generation."""
    try:
        from theauditor.ml import check_ml_available, learn, suggest

        if not check_ml_available():
            return {"error": "ML module not installed. Run: pip install -e .[ml]"}

        # Train if requested
        if train:
            learn_result = learn(
                db_path="./.pf/repo_index.db",
                manifest_path="./.pf/manifest.json",
                print_stats=False
            )
            if not learn_result.get("success"):
                return {"error": f"ML training failed: {learn_result.get('error')}"}

        # Generate suggestions
        suggest_result = suggest(
            db_path="./.pf/repo_index.db",
            manifest_path="./.pf/manifest.json",
            workset_path="./.pf/workset.json",
            topk=topk,
            out_path=str(output_dir / "ml_suggestions.json")
        )

        return suggest_result

    except ImportError:
        return {"error": "ML module not available"}
    except Exception as e:
        return {"error": str(e)}

def run_graph_insights(output_dir: Path) -> Dict[str, Any]:
    """Run graph health insights."""
    try:
        from theauditor.graph.insights import GraphInsights
        from theauditor.graph.analyzer import XGraphAnalyzer
        from theauditor.graph.store import XGraphStore

        # Load graph from SQLite database (SINGLE SOURCE OF TRUTH)
        store = XGraphStore(db_path="./.pf/graphs.db")
        import_graph = store.load_import_graph()

        if not import_graph or not import_graph.get("nodes"):
            return {"error": "No import graph found. Run 'aud graph build' first."}

        # Load analysis data if it exists
        analysis_path = Path(".pf/raw/graph_analysis.json")
        analysis_data = {}
        if analysis_path.exists():
            with open(analysis_path) as f:
                analysis_data = json.load(f)

        # Run insights analysis
        insights = GraphInsights()
        analyzer = XGraphAnalyzer()

        # Use pre-calculated cycles and hotspots if available, otherwise calculate
        if 'cycles' in analysis_data:
            cycles = analysis_data['cycles']
        else:
            cycles = analyzer.detect_cycles(import_graph)

        # Use pre-calculated hotspots if available, otherwise calculate
        if 'hotspots' in analysis_data:
            hotspots = analysis_data['hotspots']
        else:
            hotspots = insights.rank_hotspots(import_graph)

        # Calculate health metrics
        health = insights.calculate_health_metrics(
            import_graph,
            cycles=cycles,
            hotspots=hotspots
        )

        # Generate recommendations
        recommendations = insights.generate_recommendations(
            import_graph,
            cycles=cycles,
            hotspots=hotspots
        )

        # Save results
        output = {
            "health_metrics": health,
            "top_hotspots": hotspots[:10],
            "recommendations": recommendations,
            "cycles_found": len(cycles),
            "total_nodes": len(import_graph.get("nodes", [])),
            "total_edges": len(import_graph.get("edges", []))
        }

        output_path = output_dir / "graph_health.json"
        with open(output_path, 'w') as f:
            json.dump(output, f, indent=2)

        return {"success": True, "health_score": health.get("health_score")}

    except ImportError:
        return {"error": "Graph insights module not available"}
    except Exception as e:
        return {"error": str(e)}

def run_taint_insights(output_dir: Path) -> Dict[str, Any]:
    """Run taint severity insights."""
    try:
        from datetime import datetime, UTC
        from theauditor.taint.insights import calculate_severity, classify_vulnerability, generate_summary
        from theauditor.taint_analyzer import SECURITY_SINKS

        # Load raw taint data
        taint_path = Path(".pf/raw/taint_analysis.json")
        if not taint_path.exists():
            return {"error": "No taint data found. Run 'aud taint-analyze' first."}

        with open(taint_path) as f:
            taint_data = json.load(f)

        if not taint_data.get("success"):
            return {"error": "Taint analysis was not successful"}

        # Calculate severity for each path and create enriched versions
        severity_analysis = []
        enriched_paths = []
        for path in taint_data.get("taint_paths", []):
            severity = calculate_severity(path)
            vuln_type = classify_vulnerability(path.get("sink", {}), SECURITY_SINKS)

            severity_analysis.append({
                "file": path.get("sink", {}).get("file"),
                "line": path.get("sink", {}).get("line"),
                "severity": severity,
                "vulnerability_type": vuln_type,
                "path_length": len(path.get("path", [])),
                "risk_score": 1.0 if severity == "critical" else 0.7 if severity == "high" else 0.4
            })

            # Create enriched path with severity for summary generation
            enriched_path = dict(path)
            enriched_path["severity"] = severity
            enriched_path["vulnerability_type"] = vuln_type
            enriched_paths.append(enriched_path)

        # Generate summary using enriched paths with severity
        summary = generate_summary(enriched_paths)

        # Save results
        output = {
            "generated_at": datetime.now(UTC).isoformat(),
            "severity_analysis": severity_analysis,
            "summary": summary,
            "total_vulnerabilities": len(severity_analysis),
            "sources_analyzed": taint_data.get("sources_found", 0),
            "sinks_analyzed": taint_data.get("sinks_found", 0)
        }

        output_path = output_dir / "taint_severity.json"
        with open(output_path, 'w') as f:
            json.dump(output, f, indent=2)

        return {"success": True, "risk_level": summary.get("risk_level")}

    except ImportError:
        return {"error": "Taint insights module not available"}
    except Exception as e:
        return {"error": str(e)}

def run_impact_insights(output_dir: Path) -> Dict[str, Any]:
    """Run impact analysis insights."""
    try:
        # Check if workset exists
        workset_path = Path(".pf/workset.json")
        if not workset_path.exists():
            return {"error": "No workset found. Run 'aud workset' first."}

        with open(workset_path) as f:
            workset_data = json.load(f)

        # For now, create a simple impact summary
        # In future, this could run actual impact analysis on changed files
        output = {
            "files_changed": len(workset_data.get("files", [])),
            "potential_impact": "Analysis pending",
            "recommendation": "Run 'aud impact --file <file> --line <line>' for detailed analysis"
        }

        output_path = output_dir / "impact_analysis.json"
        with open(output_path, 'w') as f:
            json.dump(output, f, indent=2)

        return {"success": True, "files_analyzed": len(workset_data.get("files", []))}

    except Exception as e:
        return {"error": str(e)}

def aggregate_insights(results: Dict[str, Any], output_dir: Path) -> Dict[str, Any]:
    """Aggregate all insights into unified summary."""
    summary = {
        "insights_generated": list(results.keys()),
        "timestamp": datetime.now().isoformat(),
        "output_directory": str(output_dir)
    }

    # ML insights
    if "ml" in results and results["ml"].get("success"):
        summary["ml"] = {
            "status": "success",
            "workset_size": results["ml"].get("workset_size", 0),
            "predictions_generated": True
        }
    elif "ml" in results:
        summary["ml"] = {"status": "error", "error": results["ml"].get("error")}

    # Graph insights
    if "graph" in results and results["graph"].get("success"):
        summary["graph"] = {
            "status": "success",
            "health_score": results["graph"].get("health_score", 0)
        }
    elif "graph" in results:
        summary["graph"] = {"status": "error", "error": results["graph"].get("error")}

    # Taint insights
    if "taint" in results and results["taint"].get("success"):
        summary["taint"] = {
            "status": "success",
            "risk_level": results["taint"].get("risk_level", "unknown")
        }
    elif "taint" in results:
        summary["taint"] = {"status": "error", "error": results["taint"].get("error")}

    # Impact insights
    if "impact" in results and results["impact"].get("success"):
        summary["impact"] = {
            "status": "success",
            "files_analyzed": results["impact"].get("files_analyzed", 0)
        }
    elif "impact" in results:
        summary["impact"] = {"status": "error", "error": results["impact"].get("error")}

    return summary

def print_insights_summary(summary: Dict[str, Any]) -> None:
    """Print insights summary to console."""
    click.echo(f"\n{'='*60}")
    click.echo("INSIGHTS SUMMARY")
    click.echo(f"{'='*60}")

    # ML Summary
    if "ml" in summary:
        if summary["ml"]["status"] == "success":
            click.echo("\n[ML] Machine Learning Insights:")
            click.echo(f"  • Workset size: {summary['ml'].get('workset_size', 0)} files")
            click.echo("  • Predictions: Generated successfully")
        else:
            click.echo(f"\n[ML] Machine Learning Insights: {summary['ml'].get('error')}")

    # Graph Summary
    if "graph" in summary:
        if summary["graph"]["status"] == "success":
            health = summary["graph"].get("health_score", 0)
            grade = "A" if health >= 90 else "B" if health >= 80 else "C" if health >= 70 else "D" if health >= 60 else "F"
            click.echo("\n[GRAPH] Architecture Health:")
            click.echo(f"  • Health score: {health}/100 (Grade: {grade})")
        else:
            click.echo(f"\n[GRAPH] Architecture Health: {summary['graph'].get('error')}")

    # Taint Summary
    if "taint" in summary:
        if summary["taint"]["status"] == "success":
            risk = summary["taint"].get("risk_level", "unknown")
            color = "red" if risk == "critical" else "yellow" if risk == "high" else "green"
            click.echo("\n[TAINT] Security Risk:")
            click.secho(f"  • Risk level: {risk.upper()}", fg=color)
        else:
            click.echo(f"\n[TAINT] Security Risk: {summary['taint'].get('error')}")

    # Impact Summary
    if "impact" in summary:
        if summary["impact"]["status"] == "success":
            click.echo("\n[IMPACT] Change Impact:")
            click.echo(f"  • Files analyzed: {summary['impact'].get('files_analyzed', 0)}")
        else:
            click.echo(f"\n[IMPACT] Change Impact: {summary['impact'].get('error')}")

    click.echo(f"\n{'='*60}")

# Register command
insights_command = insights
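
# A minimal sketch of how the alias above could be wired into the Click CLI
# elsewhere in the package (module and group names are assumptions, not
# confirmed by this commit):
#
#   from theauditor.commands.insights import insights_command
#   cli.add_command(insights_command)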