# theauditor/commands/insights.py
"""Run optional insights analysis on existing audit data.
This command runs interpretive analysis modules (ML, graph health, taint severity)
on top of existing raw audit data, generating insights and predictions.
"""
import json
import sys
from pathlib import Path
from typing import Dict, Any, List
import click
@click.command()
@click.option("--mode", "-m",
              type=click.Choice(["ml", "graph", "taint", "impact", "all"]),
              default="all",
              help="Which insights modules to run")
@click.option("--ml-train", is_flag=True,
              help="Train ML models before generating suggestions")
@click.option("--topk", default=10, type=int,
              help="Top K files for ML suggestions")
@click.option("--output-dir", "-o", type=click.Path(),
              default="./.pf/insights",
              help="Directory for insights output")
@click.option("--print-summary", is_flag=True,
              help="Print summary to console")
def insights(mode: str, ml_train: bool, topk: int, output_dir: str, print_summary: bool) -> None:
    """Run optional insights analysis on existing audit data.

    This command generates interpretive analysis and predictions based on
    the raw facts collected by the audit pipeline. All insights are optional
    and separate from the core truth data.

    Available insights modules:
    - ml: Machine learning risk predictions and root cause analysis
    - graph: Graph health metrics and architectural scoring
    - taint: Severity scoring for taint analysis paths
    - impact: Impact radius and blast zone analysis
    - all: Run all available insights

    Examples:
    # Run all insights
    aud insights
    # Only ML predictions
    aud insights --mode ml
    # Train ML first, then predict
    aud insights --mode ml --ml-train
    # Graph health only with summary
    aud insights --mode graph --print-summary
    """
    # Insights are derived from raw audit facts; refuse to run without them.
    pf_dir = Path(".pf")
    raw_dir = pf_dir / "raw"
    if not raw_dir.exists():
        click.echo("[ERROR] No raw audit data found. Run 'aud full' first.", err=True)
        sys.exit(1)

    # Create insights directory.
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    click.echo(f"\n{'='*60}")
    click.echo(f"INSIGHTS ANALYSIS - {mode.upper()} Mode")
    click.echo(f"{'='*60}")
    click.echo(f"Output directory: {output_path}")

    results: Dict[str, Any] = {}
    errors: List[str] = []

    # One entry per module: (key, banner, error label, success message, runner).
    # A dispatch table keeps the four modules in lockstep instead of four
    # copy-pasted if-blocks that could drift apart.
    modules = [
        ("ml", "\n[ML] Running machine learning insights...", "ML",
         f" ✓ ML predictions saved to {output_path}/ml_suggestions.json",
         lambda: run_ml_insights(ml_train, topk, output_path)),
        ("graph", "\n[GRAPH] Running graph health analysis...", "Graph",
         f" ✓ Graph health saved to {output_path}/graph_health.json",
         lambda: run_graph_insights(output_path)),
        ("taint", "\n[TAINT] Running taint severity scoring...", "Taint",
         f" ✓ Taint severity saved to {output_path}/taint_severity.json",
         lambda: run_taint_insights(output_path)),
        ("impact", "\n[IMPACT] Running impact analysis...", "Impact",
         f" ✓ Impact analysis saved to {output_path}/impact_analysis.json",
         lambda: run_impact_insights(output_path)),
    ]
    for key, banner, label, success_msg, runner in modules:
        if mode not in (key, "all"):
            continue
        click.echo(banner)
        result = runner()
        results[key] = result
        if result.get("error"):
            errors.append(f"{label}: {result['error']}")
        else:
            click.echo(success_msg)

    # Aggregate all insights into a unified summary file.
    click.echo("\n[AGGREGATE] Creating unified insights summary...")
    summary = aggregate_insights(results, output_path)
    summary_path = output_path / "unified_insights.json"
    # default=str keeps non-JSON-native values (e.g. Path) serializable.
    with open(summary_path, 'w') as f:
        json.dump(summary, f, indent=2, default=str)
    click.echo(f" ✓ Unified summary saved to {summary_path}")

    if print_summary:
        print_insights_summary(summary)

    # Final status: exit non-zero when any module failed so CI can detect it.
    click.echo(f"\n{'='*60}")
    if errors:
        click.echo(f"[WARN] Insights completed with {len(errors)} errors:", err=True)
        for error in errors:
            click.echo(error, err=True)
    else:
        click.echo("[OK] All insights generated successfully")
    click.echo("\n[TIP] Insights are interpretive and optional.")
    click.echo(" Raw facts remain in .pf/raw/ unchanged.")
    sys.exit(1 if errors else 0)
def run_ml_insights(train: bool, topk: int, output_dir: Path) -> Dict[str, Any]:
    """Generate ML risk suggestions, optionally training the models first.

    Returns the suggestion result dict on success, or a {"error": ...} dict
    when the ML extra is missing or any step fails.
    """
    db = "./.pf/repo_index.db"
    manifest = "./.pf/manifest.json"
    try:
        from theauditor.ml import check_ml_available, learn, suggest

        if not check_ml_available():
            return {"error": "ML module not installed. Run: pip install -e .[ml]"}

        # Optional training pass before prediction.
        if train:
            training = learn(
                db_path=db,
                manifest_path=manifest,
                print_stats=False,
            )
            if not training.get("success"):
                return {"error": f"ML training failed: {training.get('error')}"}

        # Generate and persist the top-K suggestions.
        return suggest(
            db_path=db,
            manifest_path=manifest,
            workset_path="./.pf/workset.json",
            topk=topk,
            out_path=str(output_dir / "ml_suggestions.json"),
        )
    except ImportError:
        return {"error": "ML module not available"}
    except Exception as e:
        return {"error": str(e)}
def run_graph_insights(output_dir: Path) -> Dict[str, Any]:
    """Score import-graph health and produce architectural recommendations.

    Loads the graph from .pf/graphs.db and, when present, reuses the
    pre-computed cycles/hotspots in .pf/raw/graph_analysis.json.
    """
    try:
        from theauditor.graph.insights import GraphInsights
        from theauditor.graph.analyzer import XGraphAnalyzer
        from theauditor.graph.store import XGraphStore

        # The SQLite database is the single source of truth for the graph.
        graph = XGraphStore(db_path="./.pf/graphs.db").load_import_graph()
        if not graph or not graph.get("nodes"):
            return {"error": "No import graph found. Run 'aud graph build' first."}

        # Reuse prior analysis output when the raw file exists.
        precomputed: Dict[str, Any] = {}
        analysis_file = Path(".pf/raw/graph_analysis.json")
        if analysis_file.exists():
            with open(analysis_file) as f:
                precomputed = json.load(f)

        engine = GraphInsights()
        analyzer = XGraphAnalyzer()

        # Fall back to fresh computation only when nothing was precomputed.
        cycles = precomputed['cycles'] if 'cycles' in precomputed else analyzer.detect_cycles(graph)
        hotspots = precomputed['hotspots'] if 'hotspots' in precomputed else engine.rank_hotspots(graph)

        health = engine.calculate_health_metrics(graph, cycles=cycles, hotspots=hotspots)
        recommendations = engine.generate_recommendations(graph, cycles=cycles, hotspots=hotspots)

        report = {
            "health_metrics": health,
            "top_hotspots": hotspots[:10],
            "recommendations": recommendations,
            "cycles_found": len(cycles),
            "total_nodes": len(graph.get("nodes", [])),
            "total_edges": len(graph.get("edges", [])),
        }
        with open(output_dir / "graph_health.json", 'w') as f:
            json.dump(report, f, indent=2)

        return {"success": True, "health_score": health.get("health_score")}
    except ImportError:
        return {"error": "Graph insights module not available"}
    except Exception as e:
        return {"error": str(e)}
def run_taint_insights(output_dir: Path) -> Dict[str, Any]:
    """Attach severity and vulnerability classifications to raw taint paths.

    Reads .pf/raw/taint_analysis.json, scores each path, and writes a
    taint_severity.json report into output_dir.
    """
    try:
        from datetime import datetime, UTC
        from theauditor.taint.insights import calculate_severity, classify_vulnerability, generate_summary
        from theauditor.taint_analyzer import SECURITY_SINKS

        raw_file = Path(".pf/raw/taint_analysis.json")
        if not raw_file.exists():
            return {"error": "No taint data found. Run 'aud taint-analyze' first."}
        with open(raw_file) as f:
            raw = json.load(f)
        if not raw.get("success"):
            return {"error": "Taint analysis was not successful"}

        # Two views per path: a compact record for reporting, and an
        # enriched copy (severity attached) for summary generation.
        severity_rows = []
        enriched = []
        score_by_severity = {"critical": 1.0, "high": 0.7}
        for flow in raw.get("taint_paths", []):
            sink = flow.get("sink", {})
            severity = calculate_severity(flow)
            vuln_type = classify_vulnerability(sink, SECURITY_SINKS)
            severity_rows.append({
                "file": sink.get("file"),
                "line": sink.get("line"),
                "severity": severity,
                "vulnerability_type": vuln_type,
                "path_length": len(flow.get("path", [])),
                "risk_score": score_by_severity.get(severity, 0.4),
            })
            enriched_flow = dict(flow)
            enriched_flow["severity"] = severity
            enriched_flow["vulnerability_type"] = vuln_type
            enriched.append(enriched_flow)

        summary = generate_summary(enriched)

        report = {
            "generated_at": datetime.now(UTC).isoformat(),
            "severity_analysis": severity_rows,
            "summary": summary,
            "total_vulnerabilities": len(severity_rows),
            "sources_analyzed": raw.get("sources_found", 0),
            "sinks_analyzed": raw.get("sinks_found", 0),
        }
        with open(output_dir / "taint_severity.json", 'w') as f:
            json.dump(report, f, indent=2)

        return {"success": True, "risk_level": summary.get("risk_level")}
    except ImportError:
        return {"error": "Taint insights module not available"}
    except Exception as e:
        return {"error": str(e)}
def run_impact_insights(output_dir: Path) -> Dict[str, Any]:
    """Summarize the current workset as a placeholder impact report.

    Writes impact_analysis.json into output_dir; detailed per-file impact
    analysis is deferred to the 'aud impact' command.
    """
    try:
        workset_file = Path(".pf/workset.json")
        if not workset_file.exists():
            return {"error": "No workset found. Run 'aud workset' first."}
        with open(workset_file) as f:
            workset = json.load(f)

        changed = len(workset.get("files", []))
        # Placeholder report only; real impact analysis on changed files
        # is a future enhancement.
        report = {
            "files_changed": changed,
            "potential_impact": "Analysis pending",
            "recommendation": "Run 'aud impact --file <file> --line <line>' for detailed analysis",
        }
        with open(output_dir / "impact_analysis.json", 'w') as f:
            json.dump(report, f, indent=2)

        return {"success": True, "files_analyzed": changed}
    except Exception as e:
        return {"error": str(e)}
def aggregate_insights(results: Dict[str, Any], output_dir: Path) -> Dict[str, Any]:
"""Aggregate all insights into unified summary."""
summary = {
"insights_generated": list(results.keys()),
"timestamp": __import__('datetime').datetime.now().isoformat(),
"output_directory": str(output_dir)
}
# ML insights
if "ml" in results and results["ml"].get("success"):
summary["ml"] = {
"status": "success",
"workset_size": results["ml"].get("workset_size", 0),
"predictions_generated": True
}
elif "ml" in results:
summary["ml"] = {"status": "error", "error": results["ml"].get("error")}
# Graph insights
if "graph" in results and results["graph"].get("success"):
summary["graph"] = {
"status": "success",
"health_score": results["graph"].get("health_score", 0)
}
elif "graph" in results:
summary["graph"] = {"status": "error", "error": results["graph"].get("error")}
# Taint insights
if "taint" in results and results["taint"].get("success"):
summary["taint"] = {
"status": "success",
"risk_level": results["taint"].get("risk_level", "unknown")
}
elif "taint" in results:
summary["taint"] = {"status": "error", "error": results["taint"].get("error")}
# Impact insights
if "impact" in results and results["impact"].get("success"):
summary["impact"] = {
"status": "success",
"files_analyzed": results["impact"].get("files_analyzed", 0)
}
elif "impact" in results:
summary["impact"] = {"status": "error", "error": results["impact"].get("error")}
return summary
def print_insights_summary(summary: Dict[str, Any]) -> None:
    """Print a human-readable rendering of the unified insights summary."""
    bar = '=' * 60
    click.echo(f"\n{bar}")
    click.echo("INSIGHTS SUMMARY")
    click.echo(bar)

    # ML section.
    if "ml" in summary:
        ml = summary["ml"]
        if ml["status"] == "success":
            click.echo("\n[ML] Machine Learning Insights:")
            click.echo(f" • Workset size: {ml.get('workset_size', 0)} files")
            click.echo(" • Predictions: Generated successfully")
        else:
            click.echo(f"\n[ML] Machine Learning Insights: {ml.get('error')}")

    # Graph section: map the health score onto a letter grade.
    if "graph" in summary:
        graph = summary["graph"]
        if graph["status"] == "success":
            score = graph.get("health_score", 0)
            grade = "F"
            for cutoff, letter in ((90, "A"), (80, "B"), (70, "C"), (60, "D")):
                if score >= cutoff:
                    grade = letter
                    break
            click.echo("\n[GRAPH] Architecture Health:")
            click.echo(f" • Health score: {score}/100 (Grade: {grade})")
        else:
            click.echo(f"\n[GRAPH] Architecture Health: {graph.get('error')}")

    # Taint section.
    if "taint" in summary:
        taint = summary["taint"]
        if taint["status"] == "success":
            risk = taint.get("risk_level", "unknown")
            click.echo("\n[TAINT] Security Risk:")
            click.echo(f" • Risk level: {risk.upper()}")
        else:
            click.echo(f"\n[TAINT] Security Risk: {taint.get('error')}")

    # Impact section.
    if "impact" in summary:
        impact = summary["impact"]
        if impact["status"] == "success":
            click.echo("\n[IMPACT] Change Impact:")
            click.echo(f" • Files analyzed: {impact.get('files_analyzed', 0)}")
        else:
            click.echo(f"\n[IMPACT] Change Impact: {impact.get('error')}")

    click.echo(f"\n{bar}")
# Public alias used by the CLI registry when wiring up commands.
insights_command = insights