"""Documentation generator from index and capsules (optional feature).""" import hashlib import json import platform import sqlite3 import sys from collections import defaultdict from datetime import UTC, datetime from pathlib import Path from typing import Any from theauditor import __version__ def is_source_file(file_path: str) -> bool: """Check if a file is a source code file (not test, config, or docs).""" path = Path(file_path) # Skip test files and directories if any(part in ['test', 'tests', '__tests__', 'spec', 'fixtures', 'fixture_repo', 'test_scaffold'] for part in path.parts): return False if path.name.startswith('test_') or path.name.endswith('_test.py') or '.test.' in path.name or '.spec.' in path.name: return False if 'test' in str(path).lower() and any(ext in str(path).lower() for ext in ['.spec.', '_test.', 'test_']): return False # Skip documentation if path.suffix.lower() in ['.md', '.rst', '.txt']: return False # Skip configuration files config_files = { '.gitignore', '.gitattributes', '.editorconfig', 'pyproject.toml', 'setup.py', 'setup.cfg', 'package.json', 'package-lock.json', 'yarn.lock', 'package-template.json', 'tsconfig.json', 'Makefile', 'makefile', 'requirements.txt', 'Dockerfile', 'docker-compose.yml', '.dockerignore', 'manifest.json', 'repo_index.db' } if path.name.lower() in config_files: return False # Skip build artifacts and caches skip_dirs = {'docs', 'documentation', 'examples', 'samples', 'schemas', 'agent_templates'} if any(part.lower() in skip_dirs for part in path.parts): return False return True def load_manifest(manifest_path: str) -> tuple[list[dict], str]: """Load manifest and compute its hash.""" with open(manifest_path, "rb") as f: content = f.read() manifest_hash = hashlib.sha256(content).hexdigest() manifest = json.loads(content) return manifest, manifest_hash def load_workset(workset_path: str) -> set[str]: """Load workset file paths.""" if not Path(workset_path).exists(): return set() with open(workset_path) as f: workset = json.load(f) return {p["path"] for p in workset.get("paths", [])} def load_capsules(capsules_dir: str, workset_paths: set[str] | None = None) -> list[dict]: """Load capsules, optionally filtered by workset.""" capsules = [] capsules_path = Path(capsules_dir) if not capsules_path.exists(): raise RuntimeError(f"Capsules directory not found: {capsules_dir}") for capsule_file in sorted(capsules_path.glob("*.json")): with open(capsule_file) as f: capsule = json.load(f) # Filter by workset if provided if workset_paths is None or capsule.get("path") in workset_paths: # Filter out non-source files if is_source_file(capsule.get("path", "")): capsules.append(capsule) return capsules def get_routes(db_path: str, workset_paths: set[str] | None = None) -> list[dict]: """Get routes from database, excluding test files.""" if not Path(db_path).exists(): return [] conn = sqlite3.connect(db_path) cursor = conn.cursor() if workset_paths: placeholders = ",".join("?" * len(workset_paths)) query = f""" SELECT method, pattern, file FROM api_endpoints WHERE file IN ({placeholders}) ORDER BY file, pattern """ cursor.execute(query, tuple(workset_paths)) else: cursor.execute( """ SELECT method, pattern, file FROM api_endpoints ORDER BY file, pattern """ ) routes = [] for row in cursor.fetchall(): # Filter out test files if is_source_file(row[2]): routes.append({"method": row[0], "pattern": row[1], "file": row[2]}) conn.close() return routes def get_sql_objects(db_path: str, workset_paths: set[str] | None = None) -> list[dict]: """Get SQL objects from database, excluding test files.""" if not Path(db_path).exists(): return [] conn = sqlite3.connect(db_path) cursor = conn.cursor() if workset_paths: placeholders = ",".join("?" * len(workset_paths)) query = f""" SELECT kind, name, file FROM sql_objects WHERE file IN ({placeholders}) ORDER BY kind, name """ cursor.execute(query, tuple(workset_paths)) else: cursor.execute( """ SELECT kind, name, file FROM sql_objects ORDER BY kind, name """ ) objects = [] for row in cursor.fetchall(): # Filter out test files if is_source_file(row[2]): objects.append({"kind": row[0], "name": row[1], "file": row[2]}) conn.close() return objects def group_files_by_folder(capsules: list[dict]) -> dict[str, list[dict]]: """Group files by their first directory segment.""" groups = defaultdict(list) for capsule in capsules: path = capsule.get("path", "") if "/" in path: folder = path.split("/")[0] else: folder = "." groups[folder].append(capsule) # Sort by folder name return dict(sorted(groups.items())) def generate_architecture_md( routes: list[dict], sql_objects: list[dict], capsules: list[dict], scope: str, ) -> str: """Generate ARCHITECTURE.md content.""" now = datetime.now(UTC).isoformat() content = [ "# Architecture", f"Generated at: {now}", "", "## Scope", f"Mode: {scope}", "", ] # Routes table if routes: content.extend( [ "## Routes", "", "| Method | Pattern | File |", "|--------|---------|------|", ] ) for route in routes: content.append(f"| {route['method']} | {route['pattern']} | {route['file']} |") content.append("") # SQL Objects table if sql_objects: content.extend( [ "## SQL Objects", "", "| Kind | Name | File |", "|------|------|------|", ] ) for obj in sql_objects: content.append(f"| {obj['kind']} | {obj['name']} | {obj['file']} |") content.append("") # Core Modules (group by actual functionality) groups = group_files_by_folder(capsules) if groups: content.extend( [ "## Core Modules", "", ] ) # Filter and organize by purpose module_categories = { "Core CLI": {}, "Analysis & Detection": {}, "Code Generation": {}, "Reporting": {}, "Utilities": {}, } for folder, folder_capsules in groups.items(): if folder == "theauditor": for capsule in folder_capsules: path = Path(capsule.get("path", "")) name = path.stem # Skip duplicates and internal modules if name in ['__init__', 'parsers'] or name.endswith('.py.tpl'): continue exports = capsule.get("interfaces", {}).get("exports", []) functions = capsule.get("interfaces", {}).get("functions", []) classes = capsule.get("interfaces", {}).get("classes", []) # Categorize based on filename if name in ['cli', 'orchestrator', 'config', 'config_runtime']: category = "Core CLI" elif name in ['lint', 'ast_verify', 'universal_detector', 'pattern_loader', 'flow_analyzer', 'risk_scorer', 'pattern_rca', 'xgraph_analyzer']: category = "Analysis & Detection" elif name in ['scaffolder', 'test_generator', 'claude_setup', 'claude_autogen', 'venv_install']: category = "Code Generation" elif name in ['report', 'capsules', 'docgen', 'journal_view']: category = "Reporting" else: # Skip certain utility files from main display if name in ['utils', 'evidence', 'runner', 'contracts', 'tools']: continue category = "Utilities" # Build summary (only add if not already present) if name not in module_categories[category]: summary_parts = [] if classes: summary_parts.append(f"Classes: {', '.join(classes[:3])}") elif functions: summary_parts.append(f"Functions: {', '.join(functions[:3])}") elif exports: summary_parts.append(f"Exports: {', '.join(exports[:3])}") summary = " | ".join(summary_parts) if summary_parts else "Utility module" module_categories[category][name] = f"- **{name}**: {summary}" # Output categorized modules for category, modules_dict in module_categories.items(): if modules_dict: content.append(f"### {category}") # Sort modules by name and get their descriptions for name in sorted(modules_dict.keys()): content.append(modules_dict[name]) content.append("") return "\n".join(content) def generate_features_md(capsules: list[dict]) -> str: """Generate FEATURES.md content with meaningful feature descriptions.""" content = [ "# Features & Capabilities", "", "## Core Functionality", "", ] # Analyze capsules to extract features features = { "Code Analysis": [], "Test Generation": [], "Documentation": [], "CI/CD Integration": [], "ML Capabilities": [], } cli_commands = set() for capsule in capsules: path = Path(capsule.get("path", "")) if path.parent.name != "theauditor": continue name = path.stem exports = capsule.get("interfaces", {}).get("exports", []) functions = capsule.get("interfaces", {}).get("functions", []) # Extract features based on module if name == "cli": # Try to extract CLI commands from functions for func in functions: if func not in ['main', 'cli']: cli_commands.add(func) elif name == "lint": features["Code Analysis"].append("- **Linting**: Custom security and quality rules") elif name == "ast_verify": features["Code Analysis"].append("- **AST Verification**: Contract-based code verification") elif name == "universal_detector": features["Code Analysis"].append("- **Pattern Detection**: Security and performance anti-patterns") elif name == "flow_analyzer": features["Code Analysis"].append("- **Flow Analysis**: Deadlock and race condition detection") elif name == "risk_scorer": features["Code Analysis"].append("- **Risk Scoring**: Automated risk assessment for files") elif name == "test_generator": features["Test Generation"].append("- **Test Scaffolding**: Generate test stubs from code") elif name == "scaffolder": features["Test Generation"].append("- **Contract Tests**: Generate DB/API contract tests") elif name == "docgen": features["Documentation"].append("- **Architecture Docs**: Auto-generate architecture documentation") elif name == "capsules": features["Documentation"].append("- **Code Capsules**: Compressed code summaries") elif name == "report": features["Documentation"].append("- **Audit Reports**: Comprehensive audit report generation") elif name == "claude_setup": features["CI/CD Integration"].append("- **Claude Code Integration**: Automated hooks for Claude AI") elif name == "orchestrator": features["CI/CD Integration"].append("- **Event-Driven Automation**: Git hooks and CI pipeline support") elif name == "ml": features["ML Capabilities"].append("- **ML-Based Suggestions**: Learn from codebase patterns") features["ML Capabilities"].append("- **Root Cause Prediction**: Predict likely failure points") # Output features by category for category, feature_list in features.items(): if feature_list: content.append(f"### {category}") # Deduplicate seen = set() for feature in feature_list: if feature not in seen: content.append(feature) seen.add(feature) content.append("") # Add CLI commands summary if cli_commands: content.append("## Available Commands") content.append("") content.append("The following commands are available through the CLI:") content.append("") # Group commands by purpose cmd_groups = { "Analysis": ['lint', 'ast_verify', 'detect_patterns', 'flow_analyze', 'risk_score'], "Generation": ['gen_tests', 'scaffold', 'suggest_fixes'], "Reporting": ['report', 'journal', 'capsules'], "Setup": ['init', 'setup_claude', 'deps'], } for group, cmds in cmd_groups.items(): group_cmds = [c for c in cli_commands if any(cmd in c for cmd in cmds)] if group_cmds: content.append(f"**{group}**: {', '.join(sorted(group_cmds)[:5])}") content.append("") # Add configuration info content.append("## Configuration") content.append("") content.append("- **Zero Dependencies**: Core functionality uses only Python stdlib") content.append("- **Offline Mode**: All operations work without network access") content.append("- **Per-Project**: No global state, everything is project-local") content.append("") return "\n".join(content) def generate_trace_md( manifest_hash: str, manifest: list[dict], capsules: list[dict], db_path: str, workset_paths: set[str] | None, ) -> str: """Generate TRACE.md content with meaningful metrics.""" # Count database entries routes_count = 0 sql_objects_count = 0 refs_count = 0 imports_count = 0 if Path(db_path).exists(): conn = sqlite3.connect(db_path) cursor = conn.cursor() cursor.execute("SELECT COUNT(*) FROM api_endpoints") routes_count = cursor.fetchone()[0] cursor.execute("SELECT COUNT(*) FROM sql_objects") sql_objects_count = cursor.fetchone()[0] # Count refs (files table) cursor.execute("SELECT COUNT(*) FROM files") refs_count = cursor.fetchone()[0] # Count imports try: cursor.execute("SELECT COUNT(*) FROM imports") imports_count = cursor.fetchone()[0] except sqlite3.OperationalError: imports_count = 0 conn.close() # Separate source files from all files source_files = [f for f in manifest if is_source_file(f.get("path", ""))] test_files = [f for f in manifest if 'test' in f.get("path", "").lower()] doc_files = [f for f in manifest if f.get("path", "").endswith(('.md', '.rst', '.txt'))] # Calculate coverage if workset_paths: coverage = len(capsules) / len(workset_paths) * 100 if workset_paths else 0 else: coverage = len(capsules) / len(source_files) * 100 if source_files else 0 content = [ "# Audit Trace", "", "## Repository Snapshot", f"**Manifest Hash**: `{manifest_hash}`", f"**Timestamp**: {datetime.now(UTC).strftime('%Y-%m-%d %H:%M:%S UTC')}", "", "## File Statistics", f"- **Total Files**: {len(manifest)}", f" - Source Files: {len(source_files)}", f" - Test Files: {len(test_files)}", f" - Documentation: {len(doc_files)}", f" - Other: {len(manifest) - len(source_files) - len(test_files) - len(doc_files)}", "", "## Code Metrics", f"- **Cross-References**: {refs_count}", f"- **Import Statements**: {imports_count}", f"- **HTTP Routes**: {routes_count}", f"- **SQL Objects**: {sql_objects_count}", "", "## Analysis Coverage", f"- **Coverage**: {coverage:.1f}% of source files", f"- **Capsules Generated**: {len(capsules)}", f"- **Scope**: {'Workset' if workset_paths else 'Full repository'}", "", "## Language Distribution", ] # Count languages lang_counts = defaultdict(int) for capsule in capsules: lang = capsule.get("language", "") # Empty not unknown lang_counts[lang] += 1 for lang, count in sorted(lang_counts.items(), key=lambda x: x[1], reverse=True): content.append(f"- {lang}: {count} files") content.extend([ "", "## Environment", f"- **TheAuditor Version**: {__version__}", f"- **Python**: {sys.version.split()[0]}", f"- **Platform**: {platform.platform()}", f"- **Processor**: {platform.processor() or 'Unknown'}", "", "## Audit Trail", "This document provides cryptographic proof of the codebase state at audit time.", "The manifest hash can be used to verify no files have been modified since analysis.", "", ]) return "\n".join(content) # This function was moved above generate_trace_md def generate_docs( manifest_path: str = "manifest.json", db_path: str = "repo_index.db", capsules_dir: str = "./.pf/capsules", workset_path: str = "./.pf/workset.json", out_dir: str = "./.pf/docs", full: bool = False, print_stats: bool = False, ) -> dict[str, Any]: """Generate documentation from index and capsules.""" # Load data manifest, manifest_hash = load_manifest(manifest_path) workset_paths = None if full else load_workset(workset_path) try: capsules = load_capsules(capsules_dir, workset_paths) except RuntimeError as e: raise RuntimeError(f"Cannot generate docs: {e}. Run 'aud capsules' first.") from e # Get database data routes = get_routes(db_path, workset_paths) sql_objects = get_sql_objects(db_path, workset_paths) # Generate content scope = "full" if full else "workset" architecture_content = generate_architecture_md(routes, sql_objects, capsules, scope) trace_content = generate_trace_md(manifest_hash, manifest, capsules, db_path, workset_paths) features_content = generate_features_md(capsules) # Write files out_path = Path(out_dir) out_path.mkdir(parents=True, exist_ok=True) (out_path / "ARCHITECTURE.md").write_text(architecture_content) (out_path / "TRACE.md").write_text(trace_content) (out_path / "FEATURES.md").write_text(features_content) result = { "files_written": 3, "scope": scope, "capsules_used": len(capsules), "routes": len(routes), "sql_objects": len(sql_objects), } if print_stats: print(f"Generated {result['files_written']} docs in {out_dir}") print(f" Scope: {result['scope']}") print(f" Capsules: {result['capsules_used']}") print(f" Routes: {result['routes']}") print(f" SQL Objects: {result['sql_objects']}") return result