Initial commit: TheAuditor v1.0.1 - AI-centric SAST and Code Intelligence Platform

This commit is contained in:
TheAuditorTool
2025-09-07 20:39:47 +07:00
commit ba5c287b02
215 changed files with 50911 additions and 0 deletions

View File

@@ -0,0 +1,421 @@
"""Generate project structure and intelligence reports for AI consumption."""
import json
import os
import sqlite3
from pathlib import Path
from typing import Dict, List, Tuple, Any
from .indexer.config import SKIP_DIRS
def generate_directory_tree(root_path: str = ".", max_depth: int = 4) -> str:
    """Generate a text-based directory tree representation.

    Args:
        root_path: Root directory to analyze.
        max_depth: Maximum depth to traverse.

    Returns:
        String representation of the directory tree. Critical files are
        listed individually; other files are summarized per extension.
    """
    root = Path(root_path).resolve()
    tree_lines: List[str] = []

    # Critical files to always show explicitly
    critical_files = {
        # Python
        'main.py', 'app.py', '__main__.py', 'config.py', 'settings.py',
        'models.py', 'schemas.py', 'auth.py', 'authentication.py',
        'middleware.py', 'routes.py', 'urls.py', 'api.py',
        # Node/JS/TS
        'index.js', 'index.ts', 'app.js', 'app.ts', 'server.js', 'server.ts',
        'package.json', 'tsconfig.json', 'types.ts',
        # General
        'requirements.txt', 'setup.py', 'pyproject.toml', 'Dockerfile',
        'docker-compose.yml', 'Makefile', '.env.example'
    }

    def should_skip(path: Path) -> bool:
        """Check if directory should be skipped."""
        return path.name in SKIP_DIRS or path.name.startswith('.')

    def add_directory(dir_path: Path, prefix: str = "", depth: int = 0) -> None:
        """Recursively add directory contents to tree."""
        if depth > max_depth:
            return
        try:
            # Directories first, then files, each group case-insensitively sorted.
            items = sorted(dir_path.iterdir(), key=lambda x: (not x.is_dir(), x.name.lower()))
        except PermissionError:
            return

        dirs = [item for item in items if item.is_dir() and not should_skip(item)]
        files = [item for item in items if item.is_file()]

        # Group non-critical files by extension for a compact summary line.
        file_groups: Dict[str, int] = {}
        critical_in_dir = []
        for file in files:
            if file.name in critical_files:
                critical_in_dir.append(file)
            else:
                ext = file.suffix or 'no-ext'
                file_groups[ext] = file_groups.get(ext, 0) + 1

        # Show critical files explicitly
        for file in critical_in_dir:
            is_last = (file == critical_in_dir[-1]) and not dirs and not file_groups
            tree_lines.append(f"{prefix}{'└── ' if is_last else '├── '}{file.name}")

        # Show file count summary by type
        if file_groups:
            summary_parts = []
            for ext, count in sorted(file_groups.items()):
                if count > 1:
                    summary_parts.append(f"{count} {ext} files")
                elif count == 1:
                    summary_parts.append(f"1 {ext} file")
            if summary_parts:
                is_last = not dirs
                summary = f"[{', '.join(summary_parts)}]"
                tree_lines.append(f"{prefix}{'└── ' if is_last else '├── '}{summary}")

        # Process subdirectories
        for i, subdir in enumerate(dirs):
            is_last_dir = (i == len(dirs) - 1)
            tree_lines.append(f"{prefix}{'└── ' if is_last_dir else '├── '}{subdir.name}/")
            # BUG FIX: the continuation prefix must be four columns wide so
            # children nest one level deeper than their parent; non-last
            # directories carry the vertical guide. The previous " " / ""
            # values left children of non-last dirs at the parent's column.
            extension = "    " if is_last_dir else "│   "
            add_directory(subdir, prefix + extension, depth + 1)

    tree_lines.append(f"{root.name}/")
    add_directory(root, "", 0)
    return "\n".join(tree_lines)
def aggregate_statistics(manifest_path: str, db_path: str) -> Dict[str, Any]:
    """Aggregate project-wide statistics from manifest and database.

    Args:
        manifest_path: Path to manifest.json.
        db_path: Path to repo_index.db.

    Returns:
        Dictionary containing project statistics. Missing or unreadable
        inputs leave the corresponding fields at their zero/empty defaults
        rather than raising (best-effort aggregation).
    """
    stats: Dict[str, Any] = {
        'total_files': 0,
        'total_loc': 0,
        'total_bytes': 0,
        'total_tokens': 0,  # Estimated as chars/4
        'languages': {},
        'total_functions': 0,
        'total_classes': 0,
        'total_imports': 0,
        'total_calls': 0,
        'top_10_largest': [],
        'top_15_critical': []
    }

    # Extension -> language mapping, hoisted out of the per-file loop
    # (previously this dict literal was rebuilt once per manifest entry).
    lang_map = {
        '.py': 'Python',
        '.js': 'JavaScript',
        '.ts': 'TypeScript',
        '.jsx': 'JSX',
        '.tsx': 'TSX',
        '.java': 'Java',
        '.go': 'Go',
        '.rs': 'Rust',
        '.cpp': 'C++', '.cc': 'C++',
        '.c': 'C',
        '.rb': 'Ruby',
        '.php': 'PHP',
        '.cs': 'C#',
        '.swift': 'Swift',
        '.kt': 'Kotlin',
        '.r': 'R',
        '.m': 'MATLAB',
        '.jl': 'Julia',
        '.sh': 'Shell',
        '.yml': 'YAML', '.yaml': 'YAML',
        '.json': 'JSON',
        '.xml': 'XML',
        '.html': 'HTML',
        '.css': 'CSS',
        '.scss': 'SCSS',
        '.sql': 'SQL',
        '.md': 'Markdown'
    }

    # Filename -> architectural purpose, for the "critical files" listing.
    critical_patterns = {
        # Python patterns
        'main.py': 'Entry point',
        'app.py': 'Application entry',
        '__main__.py': 'Module entry',
        'config.py': 'Configuration',
        'settings.py': 'Settings',
        'models.py': 'Data models',
        'schemas.py': 'Data schemas',
        'auth.py': 'Authentication',
        'authentication.py': 'Authentication',
        'middleware.py': 'Middleware',
        'routes.py': 'Routes',
        'urls.py': 'URL patterns',
        'api.py': 'API endpoints',
        'views.py': 'Views',
        'database.py': 'Database',
        'db.py': 'Database',
        # Node/JS/TS patterns
        'index.js': 'Entry point',
        'index.ts': 'Entry point',
        'app.js': 'Application',
        'app.ts': 'Application',
        'server.js': 'Server',
        'server.ts': 'Server',
        'package.json': 'Dependencies',
        'tsconfig.json': 'TypeScript config',
        'types.ts': 'Type definitions',
        'types.d.ts': 'Type definitions',
        'middleware.js': 'Middleware',
        'middleware.ts': 'Middleware',
        'routes.js': 'Routes',
        'routes.ts': 'Routes',
        'config.js': 'Configuration',
        'config.ts': 'Configuration',
        # General
        'Dockerfile': 'Container definition',
        'docker-compose.yml': 'Container orchestration',
        'requirements.txt': 'Python dependencies',
        'setup.py': 'Python package',
        'pyproject.toml': 'Python project',
        'Makefile': 'Build automation'
    }

    # Read manifest.json if it exists
    if Path(manifest_path).exists():
        with open(manifest_path, 'r', encoding='utf-8') as f:
            manifest = json.load(f)
        stats['total_files'] = len(manifest)

        # Language distribution and totals
        for file_info in manifest:
            stats['total_loc'] += file_info.get('loc', 0)
            stats['total_bytes'] += file_info.get('bytes', 0)
            ext = file_info.get('ext', '').lower()
            if ext:
                lang = lang_map.get(ext, 'Other')
                stats['languages'][lang] = stats['languages'].get(lang, 0) + 1

        # Estimate tokens (rough approximation: 1 token ≈ 4 characters)
        stats['total_tokens'] = stats['total_bytes'] // 4

        # Find top 10 largest files by LOC. Use .get() throughout so a
        # sparse manifest entry cannot raise KeyError (previously mixed
        # direct indexing with .get()).
        sorted_by_size = sorted(manifest, key=lambda x: x.get('loc', 0), reverse=True)
        for file_info in sorted_by_size[:10]:
            size = file_info.get('bytes', 0)
            stats['top_10_largest'].append({
                'path': file_info.get('path', ''),
                'loc': file_info.get('loc', 0),
                'bytes': size,
                'tokens': size // 4,
                'percent': round((size / stats['total_bytes']) * 100, 2) if stats['total_bytes'] > 0 else 0
            })

        # Find critical files based on naming patterns
        for file_info in manifest:
            filename = Path(file_info.get('path', '')).name
            if filename in critical_patterns:
                stats['top_15_critical'].append({
                    'path': file_info.get('path', ''),
                    'filename': filename,
                    'purpose': critical_patterns[filename],
                    'loc': file_info.get('loc', 0),
                    'bytes': file_info.get('bytes', 0)
                })
        # Limit to top 15 critical files
        stats['top_15_critical'] = stats['top_15_critical'][:15]

    # Query database for symbol counts if it exists
    if Path(db_path).exists():
        conn = None
        try:
            conn = sqlite3.connect(db_path)
            cursor = conn.cursor()
            # Count functions
            cursor.execute("SELECT COUNT(*) FROM symbols WHERE type = 'function'")
            stats['total_functions'] = cursor.fetchone()[0]
            # Count classes
            cursor.execute("SELECT COUNT(*) FROM symbols WHERE type = 'class'")
            stats['total_classes'] = cursor.fetchone()[0]
            # Count calls (can represent imports/dependencies)
            cursor.execute("SELECT COUNT(*) FROM symbols WHERE type = 'call'")
            stats['total_calls'] = cursor.fetchone()[0]
            # Count imports from refs table if it exists
            try:
                cursor.execute("SELECT COUNT(*) FROM refs WHERE kind IN ('import', 'from', 'require')")
                stats['total_imports'] = cursor.fetchone()[0]
            except sqlite3.OperationalError:
                # refs table might not exist
                pass
        except sqlite3.Error:
            # Database might be empty or malformed; keep zeroed counts
            # (deliberate best-effort, narrowed from bare Exception).
            pass
        finally:
            # BUG FIX: close the connection even when a query raises;
            # previously conn.close() only ran on the success path.
            if conn is not None:
                conn.close()

    return stats
def generate_project_summary(
    root_path: str = ".",
    manifest_path: str = "./.pf/manifest.json",
    db_path: str = "./.pf/repo_index.db",
    max_depth: int = 4
) -> str:
    """Generate comprehensive project summary markdown report.

    Args:
        root_path: Root directory of project.
        manifest_path: Path to manifest.json.
        db_path: Path to repo_index.db.
        max_depth: Maximum depth rendered in the directory tree section.

    Returns:
        Markdown formatted project summary report.
    """
    out: List[str] = []
    emit = out.append

    # Report header
    emit("# Project Structure & Intelligence Report")
    emit("")
    emit("*This AI-optimized report provides immediate project comprehension.*")
    emit("")

    # All numeric sections below are driven by the aggregated stats.
    stats = aggregate_statistics(manifest_path, db_path)

    # --- Project summary -------------------------------------------------
    emit("## Project Summary")
    emit("")
    emit(f"- **Total Files**: {stats['total_files']:,} (analyzable)")

    # Express the token estimate as a share of Claude's (approximate)
    # context window.
    claude_context = 400000
    token_percent = (stats['total_tokens'] / claude_context * 100) if stats['total_tokens'] > 0 else 0
    emit(f"- **Total Tokens**: ~{stats['total_tokens']:,} ({token_percent:.1f}% of Claude's context)")
    emit(f"- **Total LOC**: {stats['total_loc']:,}")

    # Top-5 language breakdown by file count.
    if stats['languages']:
        ranked = sorted(stats['languages'].items(), key=lambda kv: kv[1], reverse=True)
        file_total = sum(stats['languages'].values())
        fragments = []
        for lang, count in ranked[:5]:
            share = (count / file_total * 100) if file_total > 0 else 0
            fragments.append(f"{lang} ({share:.0f}%)")
        emit(f"- **Languages**: {', '.join(fragments)}")

    # --- Key metrics ------------------------------------------------------
    emit("")
    emit("### Key Metrics")
    emit("")
    for label, key in (
        ("Classes", 'total_classes'),
        ("Functions", 'total_functions'),
        ("Imports", 'total_imports'),
        ("Function Calls", 'total_calls'),
    ):
        if stats[key] > 0:
            emit(f"- **{label}**: {stats[key]:,}")
    emit("")

    # --- Largest files table ---------------------------------------------
    if stats['top_10_largest']:
        emit("## Largest Files (by tokens)")
        emit("")
        emit("| # | File | LOC | Tokens | % of Codebase |")
        emit("|---|------|-----|--------|---------------|")
        for rank, entry in enumerate(stats['top_10_largest'], 1):
            shown = entry['path']
            if len(shown) > 50:
                # Keep the table narrow: elide long path prefixes.
                shown = "..." + shown[-47:]
            emit(f"| {rank} | `{shown}` | {entry['loc']:,} | {entry['tokens']:,} | {entry['percent']:.1f}% |")
        emit("")

    # --- Critical files table --------------------------------------------
    if stats['top_15_critical']:
        emit("## Critical Files (by convention)")
        emit("")
        emit("*Files identified as architecturally significant based on naming patterns:*")
        emit("")
        emit("| File | Purpose | LOC |")
        emit("|------|---------|-----|")
        for entry in stats['top_15_critical']:
            shown = entry['path']
            if len(shown) > 40:
                # Collapse long paths to parent-dir/filename.
                pieces = Path(shown).parts
                if len(pieces) > 2:
                    shown = f".../{pieces[-2]}/{pieces[-1]}"
                else:
                    shown = "/".join(pieces)
            emit(f"| `{shown}` | {entry['purpose']} | {entry['loc']:,} |")
        emit("")

    # --- Directory tree ---------------------------------------------------
    emit("## Directory Structure")
    emit("")
    emit("```")
    emit(generate_directory_tree(root_path, max_depth))
    emit("```")
    emit("")

    # --- Guidance for AI consumers ---------------------------------------
    emit("## AI Context Optimization")
    emit("")
    emit("### Reading Order for Maximum Comprehension")
    emit("")
    emit("1. **Start here**: This file (STRUCTURE.md) - ~2,000 tokens")
    emit("2. **Core understanding**: Critical files listed above - ~10,000 tokens")
    emit("3. **Issues & findings**: AUDIT.md - ~15,000 tokens")
    emit("4. **Detailed analysis**: Other reports as needed")
    emit("")
    emit("### Token Budget Recommendations")
    emit("")
    if stats['total_tokens'] < 50000:
        emit("- **Small project**: Can load entire codebase if needed")
    elif stats['total_tokens'] < 150000:
        emit("- **Medium project**: Focus on critical files and problem areas")
    else:
        emit("- **Large project**: Use worksets and targeted analysis")
    emit("")
    emit("---")
    emit("*Generated by TheAuditor - Truth through systematic observation*")
    return "\n".join(out)