"""Generate project structure and intelligence reports for AI consumption."""

import json
import os
import sqlite3
from pathlib import Path
from typing import Dict, List, Tuple, Any

from .indexer.config import SKIP_DIRS


def generate_directory_tree(root_path: str = ".", max_depth: int = 4) -> str:
    """
    Generate a text-based directory tree representation.

    Args:
        root_path: Root directory to analyze
        max_depth: Maximum depth to traverse

    Returns:
        String representation of directory tree
    """
    root = Path(root_path).resolve()
    tree_lines = []

    # Critical files to always show explicitly
    critical_files = {
        # Python
        'main.py', 'app.py', '__main__.py', 'config.py', 'settings.py',
        'models.py', 'schemas.py', 'auth.py', 'authentication.py',
        'middleware.py', 'routes.py', 'urls.py', 'api.py',
        # Node/JS/TS
        'index.js', 'index.ts', 'app.js', 'app.ts', 'server.js', 'server.ts',
        'package.json', 'tsconfig.json', 'types.ts',
        # General
        'requirements.txt', 'setup.py', 'pyproject.toml', 'Dockerfile',
        'docker-compose.yml', 'Makefile', '.env.example'
    }

    def should_skip(path: Path) -> bool:
        """Check if directory should be skipped."""
        return path.name in SKIP_DIRS or path.name.startswith('.')

    def add_directory(dir_path: Path, prefix: str = "", depth: int = 0):
        """Recursively add directory contents to tree."""
        if depth > max_depth:
            return
        try:
            items = sorted(dir_path.iterdir(), key=lambda x: (not x.is_dir(), x.name.lower()))
        except PermissionError:
            return

        dirs = [item for item in items if item.is_dir() and not should_skip(item)]
        files = [item for item in items if item.is_file()]

        # Group non-critical files by extension
        file_groups = {}
        critical_in_dir = []
        for file in files:
            if file.name in critical_files:
                critical_in_dir.append(file)
            else:
                ext = file.suffix or 'no-ext'
                if ext not in file_groups:
                    file_groups[ext] = 0
                file_groups[ext] += 1

        # Show critical files explicitly
        for file in critical_in_dir:
            is_last = (file == critical_in_dir[-1]) and not dirs and not file_groups
            tree_lines.append(f"{prefix}{'└── ' if is_last else '├── '}{file.name}")

        # Show a file-count summary by type
        if file_groups:
            summary_parts = []
            for ext, count in sorted(file_groups.items()):
                if count > 1:
                    summary_parts.append(f"{count} {ext} files")
                elif count == 1:
                    summary_parts.append(f"1 {ext} file")
            if summary_parts:
                is_last = not dirs
                summary = f"[{', '.join(summary_parts)}]"
                tree_lines.append(f"{prefix}{'└── ' if is_last else '├── '}{summary}")

        # Process subdirectories; continue the vertical rule for all but the last
        for i, subdir in enumerate(dirs):
            is_last_dir = (i == len(dirs) - 1)
            tree_lines.append(f"{prefix}{'└── ' if is_last_dir else '├── '}{subdir.name}/")
            extension = "    " if is_last_dir else "│   "
            add_directory(subdir, prefix + extension, depth + 1)

    tree_lines.append(f"{root.name}/")
    add_directory(root, "", 0)
    return "\n".join(tree_lines)
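
# A minimal sketch of the kind of tree this produces (illustrative only; the
# actual entries depend on the repository being scanned):
#
#     myproject/
#     ├── main.py
#     ├── [2 .md files, 1 .toml file]
#     ├── src/
#     │   ├── app.py
#     │   └── [6 .py files]
#     └── tests/
#         └── [4 .py files]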


def aggregate_statistics(manifest_path: str, db_path: str) -> Dict[str, Any]:
    """
    Aggregate project-wide statistics from manifest and database.

    Args:
        manifest_path: Path to manifest.json
        db_path: Path to repo_index.db

    Returns:
        Dictionary containing project statistics
    """
    stats = {
        'total_files': 0,
        'total_loc': 0,
        'total_bytes': 0,
        'total_tokens': 0,  # Estimated as chars/4
        'languages': {},
        'total_functions': 0,
        'total_classes': 0,
        'total_imports': 0,
        'total_calls': 0,
        'top_10_largest': [],
        'top_15_critical': []
    }

    # Map extensions to languages (constant, so built once rather than per file)
    lang_map = {
        '.py': 'Python',
        '.js': 'JavaScript',
        '.ts': 'TypeScript',
        '.jsx': 'JSX',
        '.tsx': 'TSX',
        '.java': 'Java',
        '.go': 'Go',
        '.rs': 'Rust',
        '.cpp': 'C++', '.cc': 'C++',
        '.c': 'C',
        '.rb': 'Ruby',
        '.php': 'PHP',
        '.cs': 'C#',
        '.swift': 'Swift',
        '.kt': 'Kotlin',
        '.r': 'R',
        '.m': 'MATLAB',
        '.jl': 'Julia',
        '.sh': 'Shell',
        '.yml': 'YAML', '.yaml': 'YAML',
        '.json': 'JSON',
        '.xml': 'XML',
        '.html': 'HTML',
        '.css': 'CSS',
        '.scss': 'SCSS',
        '.sql': 'SQL',
        '.md': 'Markdown'
    }

    # Read manifest.json if it exists
    if Path(manifest_path).exists():
        with open(manifest_path, 'r') as f:
            manifest = json.load(f)
        stats['total_files'] = len(manifest)

        # Language distribution and totals
        for file_info in manifest:
            stats['total_loc'] += file_info.get('loc', 0)
            stats['total_bytes'] += file_info.get('bytes', 0)
            ext = file_info.get('ext', '').lower()
            if ext:
                lang = lang_map.get(ext, 'Other')
                stats['languages'][lang] = stats['languages'].get(lang, 0) + 1

        # Estimate tokens (rough approximation: 1 token ≈ 4 characters)
        stats['total_tokens'] = stats['total_bytes'] // 4

        # Find top 10 largest files by LOC
        sorted_by_size = sorted(manifest, key=lambda x: x.get('loc', 0), reverse=True)
        for file_info in sorted_by_size[:10]:
            stats['top_10_largest'].append({
                'path': file_info['path'],
                'loc': file_info.get('loc', 0),
                'bytes': file_info.get('bytes', 0),
                'tokens': file_info.get('bytes', 0) // 4,
                'percent': round((file_info.get('bytes', 0) / stats['total_bytes']) * 100, 2) if stats['total_bytes'] > 0 else 0
            })

        # Find critical files based on naming patterns
        critical_patterns = {
            # Python patterns
            'main.py': 'Entry point',
            'app.py': 'Application entry',
            '__main__.py': 'Module entry',
            'config.py': 'Configuration',
            'settings.py': 'Settings',
            'models.py': 'Data models',
            'schemas.py': 'Data schemas',
            'auth.py': 'Authentication',
            'authentication.py': 'Authentication',
            'middleware.py': 'Middleware',
            'routes.py': 'Routes',
            'urls.py': 'URL patterns',
            'api.py': 'API endpoints',
            'views.py': 'Views',
            'database.py': 'Database',
            'db.py': 'Database',
            # Node/JS/TS patterns
            'index.js': 'Entry point',
            'index.ts': 'Entry point',
            'app.js': 'Application',
            'app.ts': 'Application',
            'server.js': 'Server',
            'server.ts': 'Server',
            'package.json': 'Dependencies',
            'tsconfig.json': 'TypeScript config',
            'types.ts': 'Type definitions',
            'types.d.ts': 'Type definitions',
            'middleware.js': 'Middleware',
            'middleware.ts': 'Middleware',
            'routes.js': 'Routes',
            'routes.ts': 'Routes',
            'config.js': 'Configuration',
            'config.ts': 'Configuration',
            # General
            'Dockerfile': 'Container definition',
            'docker-compose.yml': 'Container orchestration',
            'requirements.txt': 'Python dependencies',
            'setup.py': 'Python package',
            'pyproject.toml': 'Python project',
            'Makefile': 'Build automation'
        }
        for file_info in manifest:
            filename = Path(file_info['path']).name
            if filename in critical_patterns:
                stats['top_15_critical'].append({
                    'path': file_info['path'],
                    'filename': filename,
                    'purpose': critical_patterns[filename],
                    'loc': file_info.get('loc', 0),
                    'bytes': file_info.get('bytes', 0)
                })
        # Limit to top 15 critical files
        stats['top_15_critical'] = stats['top_15_critical'][:15]

    # Query database for symbol counts if it exists
    if Path(db_path).exists():
        try:
            conn = sqlite3.connect(db_path)
            try:
                cursor = conn.cursor()
                # Count functions
                cursor.execute("SELECT COUNT(*) FROM symbols WHERE type = 'function'")
                stats['total_functions'] = cursor.fetchone()[0]
                # Count classes
                cursor.execute("SELECT COUNT(*) FROM symbols WHERE type = 'class'")
                stats['total_classes'] = cursor.fetchone()[0]
                # Count call sites (a proxy for inter-module dependencies)
                cursor.execute("SELECT COUNT(*) FROM symbols WHERE type = 'call'")
                stats['total_calls'] = cursor.fetchone()[0]
                # Count imports from the refs table if it exists
                try:
                    cursor.execute("SELECT COUNT(*) FROM refs WHERE kind IN ('import', 'from', 'require')")
                    stats['total_imports'] = cursor.fetchone()[0]
                except sqlite3.OperationalError:
                    # refs table might not exist
                    pass
            finally:
                conn.close()
        except Exception:
            # Database might be empty or malformed; fall back to zero counts
            pass

    return stats
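
# A hedged sketch of the returned shape (field names match the stats dict
# built above; the numbers are invented for illustration). Note the token
# heuristic: bytes // 4, so a 15 KiB file (~15,360 bytes) counts as roughly
# 3,840 estimated tokens.
#
#     >>> stats = aggregate_statistics("./.pf/manifest.json", "./.pf/repo_index.db")
#     >>> stats["total_files"], stats["total_loc"], stats["total_tokens"]
#     (212, 48310, 61550)
#     >>> stats["languages"]
#     {'Python': 140, 'Markdown': 40, 'JSON': 20, 'YAML': 12}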


def generate_project_summary(
    root_path: str = ".",
    manifest_path: str = "./.pf/manifest.json",
    db_path: str = "./.pf/repo_index.db",
    max_depth: int = 4
) -> str:
    """
    Generate a comprehensive project summary markdown report.

    Args:
        root_path: Root directory of project
        manifest_path: Path to manifest.json
        db_path: Path to repo_index.db
        max_depth: Maximum directory depth to include in the tree

    Returns:
        Markdown formatted project summary report
    """
    lines = []

    # Header
    lines.append("# Project Structure & Intelligence Report")
    lines.append("")
    lines.append("*This AI-optimized report provides immediate project comprehension.*")
    lines.append("")

    # Get statistics
    stats = aggregate_statistics(manifest_path, db_path)

    # Project Summary section
    lines.append("## Project Summary")
    lines.append("")
    lines.append(f"- **Total Files**: {stats['total_files']:,} (analyzable)")

    # Express the token total as a share of Claude's context window
    claude_context = 400000  # Approximate context window
    token_percent = (stats['total_tokens'] / claude_context * 100) if stats['total_tokens'] > 0 else 0
    lines.append(f"- **Total Tokens**: ~{stats['total_tokens']:,} ({token_percent:.1f}% of Claude's context)")
    lines.append(f"- **Total LOC**: {stats['total_loc']:,}")

    # Language breakdown
    if stats['languages']:
        # Sort languages by file count
        sorted_langs = sorted(stats['languages'].items(), key=lambda x: x[1], reverse=True)
        total_files = sum(stats['languages'].values())
        lang_parts = []
        for lang, count in sorted_langs[:5]:  # Top 5 languages
            percent = (count / total_files * 100) if total_files > 0 else 0
            lang_parts.append(f"{lang} ({percent:.0f}%)")
        lines.append(f"- **Languages**: {', '.join(lang_parts)}")

    # Key metrics
    lines.append("")
    lines.append("### Key Metrics")
    lines.append("")
    if stats['total_classes'] > 0:
        lines.append(f"- **Classes**: {stats['total_classes']:,}")
    if stats['total_functions'] > 0:
        lines.append(f"- **Functions**: {stats['total_functions']:,}")
    if stats['total_imports'] > 0:
        lines.append(f"- **Imports**: {stats['total_imports']:,}")
    if stats['total_calls'] > 0:
        lines.append(f"- **Function Calls**: {stats['total_calls']:,}")
    lines.append("")

    # Top 10 largest files
    if stats['top_10_largest']:
        lines.append("## Largest Files (by tokens)")
        lines.append("")
        lines.append("| # | File | LOC | Tokens | % of Codebase |")
        lines.append("|---|------|-----|--------|---------------|")
        for i, file_info in enumerate(stats['top_10_largest'], 1):
            path = file_info['path']
            if len(path) > 50:
                # Truncate long paths
                path = "..." + path[-47:]
            lines.append(f"| {i} | `{path}` | {file_info['loc']:,} | {file_info['tokens']:,} | {file_info['percent']:.1f}% |")
        lines.append("")

    # Top critical files
    if stats['top_15_critical']:
        lines.append("## Critical Files (by convention)")
        lines.append("")
        lines.append("*Files identified as architecturally significant based on naming patterns:*")
        lines.append("")
        lines.append("| File | Purpose | LOC |")
        lines.append("|------|---------|-----|")
        for file_info in stats['top_15_critical']:
            path = file_info['path']
            if len(path) > 40:
                # Show just the filename and its parent directory
                parts = Path(path).parts
                if len(parts) > 2:
                    path = f".../{parts[-2]}/{parts[-1]}"
                else:
                    path = "/".join(parts)
            lines.append(f"| `{path}` | {file_info['purpose']} | {file_info['loc']:,} |")
        lines.append("")

    # Directory tree
    lines.append("## Directory Structure")
    lines.append("")
    lines.append("```")
    tree = generate_directory_tree(root_path, max_depth)
    lines.append(tree)
    lines.append("```")
    lines.append("")

    # Token tracking for AI context
    lines.append("## AI Context Optimization")
    lines.append("")
    lines.append("### Reading Order for Maximum Comprehension")
    lines.append("")
    lines.append("1. **Start here**: This file (STRUCTURE.md) - ~2,000 tokens")
    lines.append("2. **Core understanding**: Critical files listed above - ~10,000 tokens")
    lines.append("3. **Issues & findings**: AUDIT.md - ~15,000 tokens")
    lines.append("4. **Detailed analysis**: Other reports as needed")
    lines.append("")
    lines.append("### Token Budget Recommendations")
    lines.append("")
    if stats['total_tokens'] < 50000:
        lines.append("- **Small project**: Can load entire codebase if needed")
    elif stats['total_tokens'] < 150000:
        lines.append("- **Medium project**: Focus on critical files and problem areas")
    else:
        lines.append("- **Large project**: Use worksets and targeted analysis")
    lines.append("")
    lines.append("---")
    lines.append("*Generated by TheAuditor - Truth through systematic observation*")

    return "\n".join(lines)
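

# A minimal usage sketch, assuming the module is invoked from the project
# root. The .pf/STRUCTURE.md output path is an assumption inferred from the
# reading-order text above; TheAuditor's real CLI wiring may differ.
if __name__ == "__main__":
    report = generate_project_summary()
    out_path = Path("./.pf/STRUCTURE.md")
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(report, encoding="utf-8")
    print(f"Wrote {out_path} ({len(report):,} characters)")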