"""Generate project structure and intelligence reports for AI consumption.""" import json import os import sqlite3 from pathlib import Path from typing import Dict, List, Tuple, Any from .indexer.config import SKIP_DIRS def generate_directory_tree(root_path: str = ".", max_depth: int = 4) -> str: """ Generate a text-based directory tree representation. Args: root_path: Root directory to analyze max_depth: Maximum depth to traverse Returns: String representation of directory tree """ root = Path(root_path).resolve() tree_lines = [] # Critical files to always show explicitly critical_files = { # Python 'main.py', 'app.py', '__main__.py', 'config.py', 'settings.py', 'models.py', 'schemas.py', 'auth.py', 'authentication.py', 'middleware.py', 'routes.py', 'urls.py', 'api.py', # Node/JS/TS 'index.js', 'index.ts', 'app.js', 'app.ts', 'server.js', 'server.ts', 'package.json', 'tsconfig.json', 'types.ts', # General 'requirements.txt', 'setup.py', 'pyproject.toml', 'Dockerfile', 'docker-compose.yml', 'Makefile', '.env.example' } def should_skip(path: Path) -> bool: """Check if directory should be skipped.""" return path.name in SKIP_DIRS or path.name.startswith('.') def add_directory(dir_path: Path, prefix: str = "", depth: int = 0): """Recursively add directory contents to tree.""" if depth > max_depth: return try: items = sorted(dir_path.iterdir(), key=lambda x: (not x.is_dir(), x.name.lower())) except PermissionError: return dirs = [item for item in items if item.is_dir() and not should_skip(item)] files = [item for item in items if item.is_file()] # Group files by extension file_groups = {} critical_in_dir = [] for file in files: if file.name in critical_files: critical_in_dir.append(file) else: ext = file.suffix or 'no-ext' if ext not in file_groups: file_groups[ext] = 0 file_groups[ext] += 1 # Show critical files explicitly for file in critical_in_dir: is_last = (file == critical_in_dir[-1]) and not dirs and not file_groups tree_lines.append(f"{prefix}{'└── ' if is_last else '├── '}{file.name}") # Show file count summary by type if file_groups: summary_parts = [] for ext, count in sorted(file_groups.items()): if count > 1: summary_parts.append(f"{count} {ext} files") elif count == 1: summary_parts.append(f"1 {ext} file") if summary_parts: is_last = not dirs summary = f"[{', '.join(summary_parts)}]" tree_lines.append(f"{prefix}{'└── ' if is_last else '├── '}{summary}") # Process subdirectories for i, subdir in enumerate(dirs): is_last_dir = (i == len(dirs) - 1) tree_lines.append(f"{prefix}{'└── ' if is_last_dir else '├── '}{subdir.name}/") extension = " " if is_last_dir else "│ " add_directory(subdir, prefix + extension, depth + 1) tree_lines.append(f"{root.name}/") add_directory(root, "", 0) return "\n".join(tree_lines) def aggregate_statistics(manifest_path: str, db_path: str) -> Dict[str, Any]: """ Aggregate project-wide statistics from manifest and database. 
    Args:
        manifest_path: Path to manifest.json
        db_path: Path to repo_index.db

    Returns:
        Dictionary containing project statistics
    """
    stats = {
        'total_files': 0,
        'total_loc': 0,
        'total_bytes': 0,
        'total_tokens': 0,  # Estimated as chars/4
        'languages': {},
        'total_functions': 0,
        'total_classes': 0,
        'total_imports': 0,
        'total_calls': 0,
        'top_10_largest': [],
        'top_15_critical': [],
    }

    # Map extensions to languages (built once, outside the per-file loop)
    lang_map = {
        '.py': 'Python', '.js': 'JavaScript', '.ts': 'TypeScript',
        '.jsx': 'JSX', '.tsx': 'TSX', '.java': 'Java', '.go': 'Go',
        '.rs': 'Rust', '.cpp': 'C++', '.cc': 'C++', '.c': 'C',
        '.rb': 'Ruby', '.php': 'PHP', '.cs': 'C#', '.swift': 'Swift',
        '.kt': 'Kotlin', '.r': 'R', '.m': 'MATLAB', '.jl': 'Julia',
        '.sh': 'Shell', '.yml': 'YAML', '.yaml': 'YAML',
        '.json': 'JSON', '.xml': 'XML', '.html': 'HTML', '.css': 'CSS',
        '.scss': 'SCSS', '.sql': 'SQL', '.md': 'Markdown',
    }

    # Read manifest.json if it exists
    if Path(manifest_path).exists():
        with open(manifest_path, 'r') as f:
            manifest = json.load(f)

        stats['total_files'] = len(manifest)

        # Language distribution and totals
        for file_info in manifest:
            stats['total_loc'] += file_info.get('loc', 0)
            stats['total_bytes'] += file_info.get('bytes', 0)

            ext = file_info.get('ext', '').lower()
            if ext:
                lang = lang_map.get(ext, 'Other')
                stats['languages'][lang] = stats['languages'].get(lang, 0) + 1

        # Estimate tokens (rough approximation: 1 token ≈ 4 characters)
        stats['total_tokens'] = stats['total_bytes'] // 4

        # Find top 10 largest files by LOC
        sorted_by_size = sorted(manifest, key=lambda x: x.get('loc', 0), reverse=True)
        for file_info in sorted_by_size[:10]:
            stats['top_10_largest'].append({
                'path': file_info['path'],
                'loc': file_info['loc'],
                'bytes': file_info['bytes'],
                'tokens': file_info['bytes'] // 4,
                'percent': (
                    round((file_info['bytes'] / stats['total_bytes']) * 100, 2)
                    if stats['total_bytes'] > 0 else 0
                ),
            })

        # Find critical files based on naming patterns
        critical_patterns = {
            # Python patterns
            'main.py': 'Entry point', 'app.py': 'Application entry',
            '__main__.py': 'Module entry', 'config.py': 'Configuration',
            'settings.py': 'Settings', 'models.py': 'Data models',
            'schemas.py': 'Data schemas', 'auth.py': 'Authentication',
            'authentication.py': 'Authentication', 'middleware.py': 'Middleware',
            'routes.py': 'Routes', 'urls.py': 'URL patterns',
            'api.py': 'API endpoints', 'views.py': 'Views',
            'database.py': 'Database', 'db.py': 'Database',
            # Node/JS/TS patterns
            'index.js': 'Entry point', 'index.ts': 'Entry point',
            'app.js': 'Application', 'app.ts': 'Application',
            'server.js': 'Server', 'server.ts': 'Server',
            'package.json': 'Dependencies', 'tsconfig.json': 'TypeScript config',
            'types.ts': 'Type definitions', 'types.d.ts': 'Type definitions',
            'middleware.js': 'Middleware', 'middleware.ts': 'Middleware',
            'routes.js': 'Routes', 'routes.ts': 'Routes',
            'config.js': 'Configuration', 'config.ts': 'Configuration',
            # General
            'Dockerfile': 'Container definition',
            'docker-compose.yml': 'Container orchestration',
            'requirements.txt': 'Python dependencies',
            'setup.py': 'Python package', 'pyproject.toml': 'Python project',
            'Makefile': 'Build automation',
        }

        for file_info in manifest:
            filename = Path(file_info['path']).name
            if filename in critical_patterns:
                stats['top_15_critical'].append({
                    'path': file_info['path'],
                    'filename': filename,
                    'purpose': critical_patterns[filename],
                    'loc': file_info['loc'],
                    'bytes': file_info['bytes'],
                })

        # Limit to top 15 critical files
        stats['top_15_critical'] = stats['top_15_critical'][:15]

    # Query database for symbol counts if it exists
    if Path(db_path).exists():
        try:
            conn = sqlite3.connect(db_path)
            cursor = conn.cursor()

            # Count functions
            cursor.execute("SELECT COUNT(*) FROM symbols WHERE type = 'function'")
            stats['total_functions'] = cursor.fetchone()[0]

            # Count classes
            cursor.execute("SELECT COUNT(*) FROM symbols WHERE type = 'class'")
            stats['total_classes'] = cursor.fetchone()[0]

            # Count calls (can represent imports/dependencies)
            cursor.execute("SELECT COUNT(*) FROM symbols WHERE type = 'call'")
            stats['total_calls'] = cursor.fetchone()[0]

            # Count imports from refs table if it exists
            try:
                cursor.execute(
                    "SELECT COUNT(*) FROM refs WHERE kind IN ('import', 'from', 'require')"
                )
                stats['total_imports'] = cursor.fetchone()[0]
            except sqlite3.OperationalError:
                # refs table might not exist
                pass

            conn.close()
        except Exception:
            # Database might be empty or malformed
            pass

    return stats


def generate_project_summary(
    root_path: str = ".",
    manifest_path: str = "./.pf/manifest.json",
    db_path: str = "./.pf/repo_index.db",
    max_depth: int = 4,
) -> str:
    """
    Generate a comprehensive project summary markdown report.

    Args:
        root_path: Root directory of project
        manifest_path: Path to manifest.json
        db_path: Path to repo_index.db
        max_depth: Maximum depth for the directory tree

    Returns:
        Markdown formatted project summary report
    """
    lines = []

    # Header
    lines.append("# Project Structure & Intelligence Report")
    lines.append("")
    lines.append("*This AI-optimized report provides immediate project comprehension.*")
    lines.append("")

    # Get statistics
    stats = aggregate_statistics(manifest_path, db_path)

    # Project Summary Section
    lines.append("## Project Summary")
    lines.append("")
    lines.append(f"- **Total Files**: {stats['total_files']:,} (analyzable)")

    # Calculate token percentage of Claude's context
    claude_context = 400000  # Approximate context window
    token_percent = (stats['total_tokens'] / claude_context * 100) if stats['total_tokens'] > 0 else 0
    lines.append(f"- **Total Tokens**: ~{stats['total_tokens']:,} ({token_percent:.1f}% of Claude's context)")
    lines.append(f"- **Total LOC**: {stats['total_loc']:,}")

    # Language breakdown
    if stats['languages']:
        # Sort languages by file count
        sorted_langs = sorted(stats['languages'].items(), key=lambda x: x[1], reverse=True)
        total_files = sum(stats['languages'].values())
        lang_parts = []
        for lang, count in sorted_langs[:5]:  # Top 5 languages
            percent = (count / total_files * 100) if total_files > 0 else 0
            lang_parts.append(f"{lang} ({percent:.0f}%)")
        lines.append(f"- **Languages**: {', '.join(lang_parts)}")

    # Key metrics
    lines.append("")
    lines.append("### Key Metrics")
    lines.append("")
    if stats['total_classes'] > 0:
        lines.append(f"- **Classes**: {stats['total_classes']:,}")
    if stats['total_functions'] > 0:
        lines.append(f"- **Functions**: {stats['total_functions']:,}")
    if stats['total_imports'] > 0:
        lines.append(f"- **Imports**: {stats['total_imports']:,}")
    if stats['total_calls'] > 0:
        lines.append(f"- **Function Calls**: {stats['total_calls']:,}")
    lines.append("")

    # Top 10 Largest Files
    if stats['top_10_largest']:
        lines.append("## Largest Files (by tokens)")
        lines.append("")
        lines.append("| # | File | LOC | Tokens | % of Codebase |")
        lines.append("|---|------|-----|--------|---------------|")
        for i, file_info in enumerate(stats['top_10_largest'], 1):
            path = file_info['path']
            if len(path) > 50:  # Truncate long paths
                path = "..." + path[-47:]
            lines.append(
                f"| {i} | `{path}` | {file_info['loc']:,} | {file_info['tokens']:,} | {file_info['percent']:.1f}% |"
            )
        lines.append("")

    # Top Critical Files
    if stats['top_15_critical']:
        lines.append("## Critical Files (by convention)")
        lines.append("")
        lines.append("*Files identified as architecturally significant based on naming patterns:*")
        lines.append("")
        lines.append("| File | Purpose | LOC |")
        lines.append("|------|---------|-----|")
        for file_info in stats['top_15_critical']:
            path = file_info['path']
            if len(path) > 40:
                # Show just filename and parent dir
                parts = Path(path).parts
                if len(parts) > 2:
                    path = f".../{parts[-2]}/{parts[-1]}"
                else:
                    path = "/".join(parts)
            lines.append(f"| `{path}` | {file_info['purpose']} | {file_info['loc']:,} |")
        lines.append("")

    # Directory Tree
    lines.append("## Directory Structure")
    lines.append("")
    lines.append("```")
    tree = generate_directory_tree(root_path, max_depth)
    lines.append(tree)
    lines.append("```")
    lines.append("")

    # Token Tracking for AI Context
    lines.append("## AI Context Optimization")
    lines.append("")
    lines.append("### Reading Order for Maximum Comprehension")
    lines.append("")
    lines.append("1. **Start here**: This file (STRUCTURE.md) - ~2,000 tokens")
    lines.append("2. **Core understanding**: Critical files listed above - ~10,000 tokens")
    lines.append("3. **Issues & findings**: AUDIT.md - ~15,000 tokens")
    lines.append("4. **Detailed analysis**: Other reports as needed")
    lines.append("")
    lines.append("### Token Budget Recommendations")
    lines.append("")
    if stats['total_tokens'] < 50000:
        lines.append("- **Small project**: Can load entire codebase if needed")
    elif stats['total_tokens'] < 150000:
        lines.append("- **Medium project**: Focus on critical files and problem areas")
    else:
        lines.append("- **Large project**: Use worksets and targeted analysis")
    lines.append("")
    lines.append("---")
    lines.append("*Generated by TheAuditor - Truth through systematic observation*")

    return "\n".join(lines)
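

# A minimal usage sketch, assuming the indexer has already produced
# .pf/manifest.json and .pf/repo_index.db in the current directory.
# Because this module uses a relative import, it must be invoked as a
# module (e.g. `python -m <package>.<this_module>`); the STRUCTURE.md
# output filename echoes the reading-order text above but is an
# illustrative assumption, not part of the module's public contract.
if __name__ == "__main__":
    report = generate_project_summary(
        root_path=".",
        manifest_path="./.pf/manifest.json",
        db_path="./.pf/repo_index.db",
        max_depth=4,
    )
    # Hypothetical output location; adjust as needed.
    Path("STRUCTURE.md").write_text(report, encoding="utf-8")
    print(f"Wrote STRUCTURE.md ({len(report.splitlines())} lines)")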