"""Generate project structure and intelligence reports for AI consumption."""

import json
import sqlite3
from pathlib import Path
from typing import Any, Dict

from .indexer.config import SKIP_DIRS
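
# SKIP_DIRS is assumed to be a collection of directory names to exclude from
# traversal (e.g. node_modules, __pycache__); it lives in indexer/config.py.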


def generate_directory_tree(root_path: str = ".", max_depth: int = 4) -> str:
    """
    Generate a text-based directory tree representation.

    Args:
        root_path: Root directory to analyze
        max_depth: Maximum depth to traverse

    Returns:
        String representation of directory tree
    """
    root = Path(root_path).resolve()
    tree_lines = []

    # Critical files to always show explicitly
    critical_files = {
        # Python
        'main.py', 'app.py', '__main__.py', 'config.py', 'settings.py',
        'models.py', 'schemas.py', 'auth.py', 'authentication.py',
        'middleware.py', 'routes.py', 'urls.py', 'api.py',
        # Node/JS/TS
        'index.js', 'index.ts', 'app.js', 'app.ts', 'server.js', 'server.ts',
        'package.json', 'tsconfig.json', 'types.ts',
        # General
        'requirements.txt', 'setup.py', 'pyproject.toml', 'Dockerfile',
        'docker-compose.yml', 'Makefile', '.env.example'
    }

    def should_skip(path: Path) -> bool:
        """Check if directory should be skipped."""
        return path.name in SKIP_DIRS or path.name.startswith('.')

    def add_directory(dir_path: Path, prefix: str = "", depth: int = 0):
        """Recursively add directory contents to tree."""
        if depth > max_depth:
            return

        try:
            items = sorted(dir_path.iterdir(), key=lambda x: (not x.is_dir(), x.name.lower()))
        except PermissionError:
            return

        dirs = [item for item in items if item.is_dir() and not should_skip(item)]
        files = [item for item in items if item.is_file()]

        # Group files by extension
        file_groups = {}
        critical_in_dir = []

        for file in files:
            if file.name in critical_files:
                critical_in_dir.append(file)
            else:
                ext = file.suffix or 'no-ext'
                file_groups[ext] = file_groups.get(ext, 0) + 1

        # Show critical files explicitly
        for file in critical_in_dir:
            is_last = (file == critical_in_dir[-1]) and not dirs and not file_groups
            tree_lines.append(f"{prefix}{'└── ' if is_last else '├── '}{file.name}")

        # Show file count summary by type
        if file_groups:
            summary_parts = []
            for ext, count in sorted(file_groups.items()):
                if count > 1:
                    summary_parts.append(f"{count} {ext} files")
                else:
                    summary_parts.append(f"1 {ext} file")

            if summary_parts:
                is_last = not dirs
                summary = f"[{', '.join(summary_parts)}]"
                tree_lines.append(f"{prefix}{'└── ' if is_last else '├── '}{summary}")

        # Process subdirectories
        for i, subdir in enumerate(dirs):
            is_last_dir = (i == len(dirs) - 1)
            tree_lines.append(f"{prefix}{'└── ' if is_last_dir else '├── '}{subdir.name}/")

            extension = "    " if is_last_dir else "│   "
            add_directory(subdir, prefix + extension, depth + 1)

    tree_lines.append(f"{root.name}/")
    add_directory(root, "", 0)

    return "\n".join(tree_lines)


def aggregate_statistics(manifest_path: str, db_path: str) -> Dict[str, Any]:
    """
    Aggregate project-wide statistics from manifest and database.

    Args:
        manifest_path: Path to manifest.json
        db_path: Path to repo_index.db

    Returns:
        Dictionary containing project statistics
    """
    stats = {
        'total_files': 0,
        'total_loc': 0,
        'total_bytes': 0,
        'total_tokens': 0,  # Estimated as bytes / 4
        'languages': {},
        'total_functions': 0,
        'total_classes': 0,
        'total_imports': 0,
        'total_calls': 0,
        'top_10_largest': [],
        'top_15_critical': []
    }

    # Read manifest.json if it exists
    if Path(manifest_path).exists():
        with open(manifest_path, 'r') as f:
            manifest = json.load(f)

        stats['total_files'] = len(manifest)

        # Map extensions to languages (built once, outside the per-file loop)
        lang_map = {
            '.py': 'Python',
            '.js': 'JavaScript',
            '.ts': 'TypeScript',
            '.jsx': 'JSX',
            '.tsx': 'TSX',
            '.java': 'Java',
            '.go': 'Go',
            '.rs': 'Rust',
            '.cpp': 'C++', '.cc': 'C++',
            '.c': 'C',
            '.rb': 'Ruby',
            '.php': 'PHP',
            '.cs': 'C#',
            '.swift': 'Swift',
            '.kt': 'Kotlin',
            '.r': 'R',
            '.m': 'MATLAB',
            '.jl': 'Julia',
            '.sh': 'Shell',
            '.yml': 'YAML', '.yaml': 'YAML',
            '.json': 'JSON',
            '.xml': 'XML',
            '.html': 'HTML',
            '.css': 'CSS',
            '.scss': 'SCSS',
            '.sql': 'SQL',
            '.md': 'Markdown'
        }

        # Language distribution and totals
        for file_info in manifest:
            stats['total_loc'] += file_info.get('loc', 0)
            stats['total_bytes'] += file_info.get('bytes', 0)

            ext = file_info.get('ext', '').lower()
            if ext:
                lang = lang_map.get(ext, 'Other')
                stats['languages'][lang] = stats['languages'].get(lang, 0) + 1
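
        # Illustrative example: a manifest with 10 ".py" and 2 ".yml" entries
        # yields stats['languages'] == {'Python': 10, 'YAML': 2}.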

        # Estimate tokens (rough approximation: 1 token ≈ 4 characters)
        stats['total_tokens'] = stats['total_bytes'] // 4

        # Find top 10 largest files by LOC
        sorted_by_size = sorted(manifest, key=lambda x: x.get('loc', 0), reverse=True)
        for file_info in sorted_by_size[:10]:
            stats['top_10_largest'].append({
                'path': file_info['path'],
                'loc': file_info['loc'],
                'bytes': file_info['bytes'],
                'tokens': file_info['bytes'] // 4,
                'percent': round((file_info['bytes'] / stats['total_bytes']) * 100, 2) if stats['total_bytes'] > 0 else 0
            })

        # Find critical files based on naming patterns
        critical_patterns = {
            # Python patterns
            'main.py': 'Entry point',
            'app.py': 'Application entry',
            '__main__.py': 'Module entry',
            'config.py': 'Configuration',
            'settings.py': 'Settings',
            'models.py': 'Data models',
            'schemas.py': 'Data schemas',
            'auth.py': 'Authentication',
            'authentication.py': 'Authentication',
            'middleware.py': 'Middleware',
            'routes.py': 'Routes',
            'urls.py': 'URL patterns',
            'api.py': 'API endpoints',
            'views.py': 'Views',
            'database.py': 'Database',
            'db.py': 'Database',
            # Node/JS/TS patterns
            'index.js': 'Entry point',
            'index.ts': 'Entry point',
            'app.js': 'Application',
            'app.ts': 'Application',
            'server.js': 'Server',
            'server.ts': 'Server',
            'package.json': 'Dependencies',
            'tsconfig.json': 'TypeScript config',
            'types.ts': 'Type definitions',
            'types.d.ts': 'Type definitions',
            'middleware.js': 'Middleware',
            'middleware.ts': 'Middleware',
            'routes.js': 'Routes',
            'routes.ts': 'Routes',
            'config.js': 'Configuration',
            'config.ts': 'Configuration',
            # General
            'Dockerfile': 'Container definition',
            'docker-compose.yml': 'Container orchestration',
            'requirements.txt': 'Python dependencies',
            'setup.py': 'Python package',
            'pyproject.toml': 'Python project',
            'Makefile': 'Build automation'
        }

        for file_info in manifest:
            filename = Path(file_info['path']).name
            if filename in critical_patterns:
                stats['top_15_critical'].append({
                    'path': file_info['path'],
                    'filename': filename,
                    'purpose': critical_patterns[filename],
                    'loc': file_info['loc'],
                    'bytes': file_info['bytes']
                })

        # Limit to top 15 critical files
        stats['top_15_critical'] = stats['top_15_critical'][:15]

    # Query database for symbol counts if it exists
    if Path(db_path).exists():
        try:
            conn = sqlite3.connect(db_path)
            cursor = conn.cursor()

            # Count functions
            cursor.execute("SELECT COUNT(*) FROM symbols WHERE type = 'function'")
            stats['total_functions'] = cursor.fetchone()[0]

            # Count classes
            cursor.execute("SELECT COUNT(*) FROM symbols WHERE type = 'class'")
            stats['total_classes'] = cursor.fetchone()[0]

            # Count calls (can represent imports/dependencies)
            cursor.execute("SELECT COUNT(*) FROM symbols WHERE type = 'call'")
            stats['total_calls'] = cursor.fetchone()[0]

            # Count imports from refs table if it exists
            try:
                cursor.execute("SELECT COUNT(*) FROM refs WHERE kind IN ('import', 'from', 'require')")
                stats['total_imports'] = cursor.fetchone()[0]
            except sqlite3.OperationalError:
                # refs table might not exist
                pass

            conn.close()
        except Exception:
            # Database might be empty or malformed
            pass

    return stats
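
# Illustrative shape of the returned statistics (a sketch; every value below is
# hypothetical and depends on the repository being analyzed):
#
#   {'total_files': 312, 'total_loc': 48210, 'total_tokens': 380102,
#    'languages': {'Python': 201, 'Markdown': 40},
#    'top_10_largest': [...], 'top_15_critical': [...]}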


def generate_project_summary(
    root_path: str = ".",
    manifest_path: str = "./.pf/manifest.json",
    db_path: str = "./.pf/repo_index.db",
    max_depth: int = 4
) -> str:
    """
    Generate a comprehensive project summary Markdown report.

    Args:
        root_path: Root directory of the project
        manifest_path: Path to manifest.json
        db_path: Path to repo_index.db
        max_depth: Maximum directory depth for the structure tree

    Returns:
        Markdown-formatted project summary report
    """
    lines = []

    # Header
    lines.append("# Project Structure & Intelligence Report")
    lines.append("")
    lines.append("*This AI-optimized report provides immediate project comprehension.*")
    lines.append("")

    # Get statistics
    stats = aggregate_statistics(manifest_path, db_path)

    # Project Summary section
    lines.append("## Project Summary")
    lines.append("")
    lines.append(f"- **Total Files**: {stats['total_files']:,} (analyzable)")

    # Calculate token percentage of Claude's context
    claude_context = 400000  # Approximate context window
    token_percent = (stats['total_tokens'] / claude_context * 100) if stats['total_tokens'] > 0 else 0
    lines.append(f"- **Total Tokens**: ~{stats['total_tokens']:,} ({token_percent:.1f}% of Claude's context)")
    lines.append(f"- **Total LOC**: {stats['total_loc']:,}")

    # Language breakdown
    if stats['languages']:
        # Sort languages by file count
        sorted_langs = sorted(stats['languages'].items(), key=lambda x: x[1], reverse=True)
        total_files = sum(stats['languages'].values())

        lang_parts = []
        for lang, count in sorted_langs[:5]:  # Top 5 languages
            percent = (count / total_files * 100) if total_files > 0 else 0
            lang_parts.append(f"{lang} ({percent:.0f}%)")

        lines.append(f"- **Languages**: {', '.join(lang_parts)}")

    # Key metrics
    lines.append("")
    lines.append("### Key Metrics")
    lines.append("")
    if stats['total_classes'] > 0:
        lines.append(f"- **Classes**: {stats['total_classes']:,}")
    if stats['total_functions'] > 0:
        lines.append(f"- **Functions**: {stats['total_functions']:,}")
    if stats['total_imports'] > 0:
        lines.append(f"- **Imports**: {stats['total_imports']:,}")
    if stats['total_calls'] > 0:
        lines.append(f"- **Function Calls**: {stats['total_calls']:,}")

    lines.append("")

    # Top 10 largest files
    if stats['top_10_largest']:
        lines.append("## Largest Files (by tokens)")
        lines.append("")
        lines.append("| # | File | LOC | Tokens | % of Codebase |")
        lines.append("|---|------|-----|--------|---------------|")

        for i, file_info in enumerate(stats['top_10_largest'], 1):
            path = file_info['path']
            if len(path) > 50:
                # Truncate long paths
                path = "..." + path[-47:]
            lines.append(f"| {i} | `{path}` | {file_info['loc']:,} | {file_info['tokens']:,} | {file_info['percent']:.1f}% |")

        lines.append("")

    # Top critical files
    if stats['top_15_critical']:
        lines.append("## Critical Files (by convention)")
        lines.append("")
        lines.append("*Files identified as architecturally significant based on naming patterns:*")
        lines.append("")
        lines.append("| File | Purpose | LOC |")
        lines.append("|------|---------|-----|")

        for file_info in stats['top_15_critical']:
            path = file_info['path']
            if len(path) > 40:
                # Show just the filename and its parent directory
                parts = Path(path).parts
                if len(parts) > 2:
                    path = f".../{parts[-2]}/{parts[-1]}"
                else:
                    path = "/".join(parts)
            lines.append(f"| `{path}` | {file_info['purpose']} | {file_info['loc']:,} |")

        lines.append("")

    # Directory tree
    lines.append("## Directory Structure")
    lines.append("")
    lines.append("```")
    tree = generate_directory_tree(root_path, max_depth)
    lines.append(tree)
    lines.append("```")
    lines.append("")

    # Token tracking for AI context
    lines.append("## AI Context Optimization")
    lines.append("")
    lines.append("### Reading Order for Maximum Comprehension")
    lines.append("")
    lines.append("1. **Start here**: This file (STRUCTURE.md) - ~2,000 tokens")
    lines.append("2. **Core understanding**: Critical files listed above - ~10,000 tokens")
    lines.append("3. **Issues & findings**: AUDIT.md - ~15,000 tokens")
    lines.append("4. **Detailed analysis**: Other reports as needed")
    lines.append("")

    lines.append("### Token Budget Recommendations")
    lines.append("")
    if stats['total_tokens'] < 50000:
        lines.append("- **Small project**: Can load entire codebase if needed")
    elif stats['total_tokens'] < 150000:
        lines.append("- **Medium project**: Focus on critical files and problem areas")
    else:
        lines.append("- **Large project**: Use worksets and targeted analysis")

    lines.append("")
    lines.append("---")
    lines.append("*Generated by TheAuditor - Truth through systematic observation*")

    return "\n".join(lines)