"""Graph builder module - constructs dependency and call graphs."""

import os
import platform
import re
import subprocess
import tempfile
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Any

import click

from theauditor.ast_parser import ASTParser
from theauditor.indexer.config import SKIP_DIRS
from theauditor.module_resolver import ModuleResolver

# Windows compatibility: subprocess calls below pass shell=True on Windows.
IS_WINDOWS = platform.system() == "Windows"


@dataclass
class GraphNode:
    """Represents a node in the dependency or call graph."""

    id: str
    file: str
    lang: str | None = None
    loc: int = 0
    churn: int | None = None  # Git commit count, if available
    type: str = "module"  # module, function, class


@dataclass
class GraphEdge:
    """Represents an edge in the graph."""

    source: str
    target: str
    type: str = "import"  # import, call, extends, implements
    file: str | None = None
    line: int | None = None


@dataclass
class Cycle:
    """Represents a cycle in the dependency graph."""

    nodes: list[str]
    size: int = 0

    def __post_init__(self):
        # Derive size from the node list so Cycle(nodes) stays a one-argument call.
        self.size = len(self.nodes)


@dataclass
class Hotspot:
    """Represents a hotspot node with high connectivity."""

    id: str
    in_degree: int
    out_degree: int
    centrality: float
    score: float  # Computed from the configured weights


@dataclass
class ImpactAnalysis:
    """Results of change impact analysis."""

    targets: list[str]
    upstream: list[str]  # What depends on the targets
    downstream: list[str]  # What the targets depend on
    total_impacted: int


class XGraphBuilder:
    """Build cross-project dependency and call graphs."""

    # Import regex patterns for different languages
    IMPORT_PATTERNS = {
        "python": [
            r"^import\s+(\S+)",
            r"^from\s+(\S+)\s+import",
        ],
        "javascript": [
            # Standard ES6 imports with 'from'
            r"import\s+.*?\s+from\s+['\"]([^'\"]+)['\"]",
            # Side-effect imports (no 'from')
            r"import\s+['\"]([^'\"]+)['\"]",
            # CommonJS require
            r"require\(['\"]([^'\"]+)['\"]\)",
            # Dynamic imports
            r"import\(['\"]([^'\"]+)['\"]\)",
            # Re-exports
            r"export\s+.*?\s+from\s+['\"]([^'\"]+)['\"]",
        ],
        "typescript": [
            # Standard ES6 imports with 'from'
            r"import\s+.*?\s+from\s+['\"]([^'\"]+)['\"]",
            # Side-effect imports (no 'from')
            r"import\s+['\"]([^'\"]+)['\"]",
            # Type-only imports
            r"import\s+type\s+.*?\s+from\s+['\"]([^'\"]+)['\"]",
            # CommonJS require
            r"require\(['\"]([^'\"]+)['\"]\)",
            # Dynamic imports
            r"import\(['\"]([^'\"]+)['\"]\)",
            # Re-exports
            r"export\s+.*?\s+from\s+['\"]([^'\"]+)['\"]",
        ],
        "java": [
            r"^import\s+(\S+);",
            r"^import\s+static\s+(\S+);",
        ],
        "go": [
            r'^import\s+"([^"]+)"',
            r'^import\s+\(\s*"([^"]+)"',
        ],
        "c#": [
            r"^using\s+(\S+);",
            r"^using\s+static\s+(\S+);",
        ],
        "php": [
            r"^use\s+(\S+);",
            r"require_once\s*\(['\"]([^'\"]+)['\"]\)",
            r"include_once\s*\(['\"]([^'\"]+)['\"]\)",
        ],
        "ruby": [
            r"^require\s+['\"]([^'\"]+)['\"]",
            r"^require_relative\s+['\"]([^'\"]+)['\"]",
        ],
    }

    # Export patterns for different languages
    EXPORT_PATTERNS = {
        "python": [
            r"^def\s+(\w+)\s*\(",
            r"^class\s+(\w+)",
            r"^(\w+)\s*=",  # Module-level variables
        ],
        "javascript": [
            r"export\s+(?:default\s+)?(?:function|class|const|let|var)\s+(\w+)",
            r"exports\.(\w+)\s*=",
            r"module\.exports\.(\w+)\s*=",
        ],
        "typescript": [
            r"export\s+(?:default\s+)?(?:function|class|const|let|var|interface|type)\s+(\w+)",
            r"exports\.(\w+)\s*=",
        ],
        "java": [
            r"public\s+(?:static\s+)?(?:class|interface|enum)\s+(\w+)",
            r"public\s+(?:static\s+)?(?:\w+\s+)?(\w+)\s*\(",  # Public methods
        ],
        "go": [
            r"^func\s+(\w+)\s*\(",  # Exported only if capitalized
            r"^type\s+(\w+)\s+",
            r"^var\s+(\w+)\s+",
        ],
    }

    # Call patterns for different languages
    CALL_PATTERNS = {
        "python": [
            r"(\w+)\s*\(",  # Function calls
            r"(\w+)\.(\w+)\s*\(",  # Method calls
        ],
        "javascript": [
            r"(\w+)\s*\(",
            r"(\w+)\.(\w+)\s*\(",
            r"new\s+(\w+)\s*\(",
        ],
        "typescript": [
            r"(\w+)\s*\(",
            r"(\w+)\.(\w+)\s*\(",
            r"new\s+(\w+)\s*\(",
        ],
        "java": [
            r"(\w+)\s*\(",
            r"(\w+)\.(\w+)\s*\(",
            r"new\s+(\w+)\s*\(",
        ],
        "go": [
            r"(\w+)\s*\(",
            r"(\w+)\.(\w+)\s*\(",
        ],
    }
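
    # Illustrative notes on the fallback regexes above (assumption: compiled as in
    # the _extract_*_regex helpers at the end of this class):
    #   - IMPORT_PATTERNS["python"][1] captures "os.path" from "from os.path import join".
    #   - CALL_PATTERNS regexes are deliberately loose; r"(\w+)\s*\(" also matches
    #     definitions such as "func ParseFile(" in Go, which is why AST extraction
    #     is preferred and regex is only a fallback.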

    def __init__(self, batch_size: int = 200, exclude_patterns: list[str] | None = None, project_root: str = "."):
        """Initialize the builder with its configuration."""
        self.batch_size = batch_size
        self.exclude_patterns = exclude_patterns or []
        self.checkpoint_file = Path(".pf/xgraph_checkpoint.json")
        self.project_root = Path(project_root).resolve()
        self.module_resolver = ModuleResolver()  # No project_root needed; it reads the database
        self.ast_parser = ASTParser()  # AST parser for structural analysis

    def detect_language(self, file_path: Path) -> str | None:
        """Detect the language from the file extension."""
        ext_map = {
            ".py": "python",
            ".js": "javascript",
            ".jsx": "javascript",
            ".ts": "typescript",
            ".tsx": "typescript",
            ".java": "java",
            ".go": "go",
            ".cs": "c#",
            ".php": "php",
            ".rb": "ruby",
            ".c": "c",
            ".cpp": "c++",
            ".h": "c",
            ".hpp": "c++",
            ".rs": "rust",
            ".swift": "swift",
            ".kt": "kotlin",
            ".scala": "scala",
            ".r": "r",  # suffix is lowercased below, so ".R" also maps here
            ".m": "objective-c",
            ".mm": "objective-c++",
        }
        return ext_map.get(file_path.suffix.lower())
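
    # Example (illustrative):
    #     XGraphBuilder().detect_language(Path("src/App.tsx"))  # -> "typescript"
    #     XGraphBuilder().detect_language(Path("Cargo.toml"))   # -> None (unmapped)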

    def should_skip(self, file_path: Path) -> bool:
        """Check whether a file should be skipped, by SKIP_DIRS or exclude patterns."""
        # First, skip if any component of the path is in SKIP_DIRS.
        for part in file_path.parts:
            if part in SKIP_DIRS:
                return True

        # Second, check against the configured exclude_patterns (substring match).
        path_str = str(file_path)
        for pattern in self.exclude_patterns:
            if pattern in path_str:
                return True
        return False
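
    # Example (illustrative; assumes "node_modules" is in SKIP_DIRS):
    #     builder.should_skip(Path("web/node_modules/react/index.js"))  # -> True
    #     builder.should_skip(Path("web/src/index.ts"))                 # -> False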

    def extract_imports_from_db(self, rel_path: str) -> list[str]:
        """Extract import statements from the database, where the indexer already stored them.

        Args:
            rel_path: Relative path as stored in the database (e.g., "backend/src/app.ts")

        Returns:
            List of import targets.
        """
        import sqlite3

        # Query the refs table for imports.
        db_file = self.project_root / ".pf" / "repo_index.db"
        if not db_file.exists():
            print(f"Warning: Database not found at {db_file}")
            return []

        try:
            conn = sqlite3.connect(db_file)
            cursor = conn.cursor()

            # The indexer stores imports in refs with kinds like 'import', 'require', etc.
            cursor.execute(
                "SELECT value FROM refs WHERE src = ? AND kind IN ('import', 'require', 'from', 'import_type', 'export')",
                (rel_path,),
            )

            imports = [row[0] for row in cursor.fetchall()]
            conn.close()

            return imports

        except sqlite3.Error as e:
            print(f"Warning: Failed to read imports from database: {e}")
            return []
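
    # Example (illustrative; assumes the indexer has populated .pf/repo_index.db,
    # and the returned targets are hypothetical):
    #     builder = XGraphBuilder(project_root=".")
    #     builder.extract_imports_from_db("backend/src/app.ts")
    #     # -> ["express", "./routes/users", "@config/env"]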

    def extract_imports(self, file_path: Path, lang: str) -> list[str]:
        """Extract import statements from the database, where the indexer already stored them.

        The indexer has already extracted all imports into the refs table, so this
        normalizes the path and delegates to extract_imports_from_db instead of
        re-parsing the file. The `lang` argument is kept for interface compatibility
        but is not needed for the lookup.
        """
        # Get the relative path used as the database key.
        try:
            rel_path = file_path.relative_to(self.project_root)
        except ValueError:
            # file_path is already relative, or comes from a different root.
            rel_path = file_path

        # Normalize path separators for the database lookup.
        db_path = str(rel_path).replace("\\", "/")
        return self.extract_imports_from_db(db_path)

    def extract_exports_from_db(self, rel_path: str) -> list[str]:
        """Extract exported symbols from the database, where the indexer already stored them.

        Args:
            rel_path: Relative path as stored in the database.

        Returns:
            List of exported symbol names.
        """
        import sqlite3

        db_file = self.project_root / ".pf" / "repo_index.db"
        if not db_file.exists():
            return []

        try:
            conn = sqlite3.connect(db_file)
            cursor = conn.cursor()

            # The indexer stores exported functions/classes in the symbols table
            # with types 'function' and 'class'.
            cursor.execute(
                "SELECT name FROM symbols WHERE path = ? AND type IN ('function', 'class')",
                (rel_path,),
            )

            exports = [row[0] for row in cursor.fetchall()]
            conn.close()

            return exports

        except sqlite3.Error:
            return []
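
    # Example (illustrative; symbol names are hypothetical):
    #     builder.extract_exports_from_db("theauditor/graph/builder.py")
    #     # -> ["XGraphBuilder", "GraphNode", ...]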

    def _load_cached_ast(self, file_path: Path) -> Any | None:
        """Load a cached AST for a file from .pf/ast_cache, keyed by content hash.

        Returns the parsed tree, or None on any cache miss or read failure so the
        caller can parse fresh.
        """
        import hashlib
        import json

        try:
            with open(file_path, "rb") as f:
                file_hash = hashlib.sha256(f.read()).hexdigest()
        except OSError:
            return None

        cache_file = self.project_root / ".pf" / "ast_cache" / f"{file_hash}.json"
        if not cache_file.exists():
            return None
        try:
            with open(cache_file, "r", encoding="utf-8") as f:
                return json.load(f)
        except (json.JSONDecodeError, OSError):
            return None  # Cache read failed; parse fresh

    def extract_exports(self, file_path: Path, lang: str) -> list[str]:
        """Extract exported symbols from a file using the AST parser, with a regex fallback."""
        # Try the AST parser first for supported languages.
        if self.ast_parser.supports_language(lang):
            try:
                # Check the persistent cache first for JS/TS files.
                tree = None
                if lang in ["javascript", "typescript"]:
                    tree = self._load_cached_ast(file_path)

                # Parse the file if it was not in the cache.
                # Note: cache writes are intentionally omitted; only indexer.py writes the cache.
                if not tree:
                    tree = self.ast_parser.parse_file(file_path, lang)

                if tree and tree.get("type") != "regex_fallback":
                    # Extract export names from the AST result.
                    export_dicts = self.ast_parser.extract_exports(tree, lang)
                    exports = []
                    for exp in export_dicts:
                        name = exp.get("name")
                        if name and name != "unknown":
                            exports.append(name)
                    if exports:  # Only trust a non-empty AST result
                        return exports
            except Exception:
                pass  # Fall through to the regex fallback

        # Fall back to regex-based extraction.
        return self._extract_exports_regex(file_path, lang)

    def extract_calls_from_db(self, rel_path: str) -> list[tuple[str, str | None]]:
        """Extract function calls from the database, where the indexer already stored them.

        Args:
            rel_path: Relative path as stored in the database.

        Returns:
            List of (function_name, None) tuples for calls.
        """
        import sqlite3

        db_file = self.project_root / ".pf" / "repo_index.db"
        if not db_file.exists():
            return []

        try:
            conn = sqlite3.connect(db_file)
            cursor = conn.cursor()

            # The indexer stores function calls in the symbols table with type 'call'.
            cursor.execute(
                "SELECT name FROM symbols WHERE path = ? AND type = 'call'",
                (rel_path,),
            )

            # Return tuples with None as the second element (no parent info in the DB).
            calls = [(row[0], None) for row in cursor.fetchall()]
            conn.close()

            return calls

        except sqlite3.Error:
            return []
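
    # Example (illustrative; names are hypothetical):
    #     builder.extract_calls_from_db("backend/src/app.ts")
    #     # -> [("express", None), ("app.listen", None)]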

    def extract_calls(self, file_path: Path, lang: str) -> list[tuple[str, str | None]]:
        """Extract function/method calls from a file using the AST parser, with a regex fallback."""
        # Try the AST parser first for supported languages.
        if self.ast_parser.supports_language(lang):
            try:
                # Check the persistent cache first for JS/TS files.
                tree = None
                if lang in ["javascript", "typescript"]:
                    tree = self._load_cached_ast(file_path)

                # Parse the file if it was not in the cache.
                # Note: cache writes are intentionally omitted; only indexer.py writes the cache.
                if not tree:
                    tree = self.ast_parser.parse_file(file_path, lang)

                if tree and tree.get("type") != "regex_fallback":
                    # Convert AST call records to (object_or_function, method) tuples.
                    call_dicts = self.ast_parser.extract_calls(tree, lang)
                    calls = []
                    for call in call_dicts:
                        name = call.get("name", "")
                        if "." in name:
                            # Method call: split "obj.method" into ("obj", "method").
                            obj, method = name.rsplit(".", 1)
                            calls.append((obj, method))
                        else:
                            # Plain function call.
                            calls.append((name, None))
                    if calls:  # Only trust a non-empty AST result
                        return calls
            except Exception:
                pass  # Fall through to the regex fallback

        # Fall back to regex-based extraction.
        return self._extract_calls_regex(file_path, lang)

    def _first_indexed_path(self, candidates: list[str]) -> str | None:
        """Return the first candidate path that exists in the files table, else None."""
        import sqlite3

        db_file = self.project_root / ".pf" / "repo_index.db"
        if not db_file.exists():
            return None
        try:
            conn = sqlite3.connect(db_file)
            cursor = conn.cursor()
            for candidate in candidates:
                cursor.execute("SELECT 1 FROM files WHERE path = ? LIMIT 1", (candidate,))
                if cursor.fetchone():
                    conn.close()
                    return candidate
            conn.close()
        except sqlite3.Error:
            pass
        return None

    def resolve_import_path(self, import_str: str, source_file: Path, lang: str) -> str:
        """Resolve an import string to a normalized module path that matches files in the graph."""
        # Clean up the import string (quotes, backticks, semicolons, whitespace).
        import_str = import_str.strip().strip('"\'`;')

        # Language-specific resolution.
        if lang == "python":
            # Convert a dotted Python module path to a file-style path.
            return "/".join(import_str.split("."))

        elif lang in ["javascript", "typescript"]:
            # 1. TypeScript path aliases, resolved via ModuleResolver (database-driven).
            if import_str.startswith("@"):
                # Determine the tsconfig context from the source file location.
                try:
                    source_rel = str(source_file.relative_to(self.project_root)).replace("\\", "/")
                except ValueError:
                    # source_file is already relative, or comes from a different root.
                    source_rel = str(source_file).replace("\\", "/")

                if "backend/" in source_rel:
                    context = "backend"
                elif "frontend/" in source_rel:
                    context = "frontend"
                else:
                    context = "root"

                # Use ModuleResolver's context-aware resolution.
                resolved = self.module_resolver.resolve_with_context(import_str, str(source_file), context)

                if resolved != import_str:
                    # Resolution worked; verify the file exists in the index, trying
                    # common extensions and index files when no extension is given.
                    candidates = [resolved]
                    if not Path(resolved).suffix:
                        candidates += [resolved + ext for ext in [".ts", ".tsx", ".js", ".jsx"]]
                        candidates += [resolved + "/index.ts", resolved + "/index.js"]
                    found = self._first_indexed_path(candidates)
                    if found:
                        return found

                # Return the resolved path even if the file check failed.
                return resolved

            # 2. Relative imports (./foo, ../bar/baz), resolved against the source file.
            elif import_str.startswith("."):
                try:
                    # Strip leading './' segments.
                    rel_import = import_str.lstrip("./")

                    # Walk up one directory per '../'.
                    up_count = import_str.count("../")
                    current_dir = source_file.parent
                    for _ in range(up_count):
                        current_dir = current_dir.parent
                    if up_count > 0:
                        rel_import = import_str.replace("../", "")

                    # Build the target path relative to the project root.
                    target_path = current_dir / rel_import
                    rel_target = str(target_path.relative_to(self.project_root)).replace("\\", "/")

                    # Check whether the file exists in the index, trying common extensions.
                    candidates = [
                        rel_target + ext
                        for ext in ["", ".ts", ".tsx", ".js", ".jsx", "/index.ts", "/index.tsx", "/index.js"]
                    ]
                    found = self._first_indexed_path(candidates)
                    if found:
                        return found

                    return rel_target

                except (ValueError, OSError):
                    pass

            # 3. Bare specifiers are npm packages; return them as-is (external).
            else:
                return import_str

            # If nothing worked, return the original string.
            return import_str

        else:
            # Default: return as-is.
            return import_str
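
    # Example (illustrative; paths are hypothetical and assume an indexed project):
    #     builder.resolve_import_path("./routes/users", Path("backend/src/app.ts"), "typescript")
    #     # -> "backend/src/routes/users.ts" if that file is in the index
    #     builder.resolve_import_path("express", Path("backend/src/app.ts"), "typescript")
    #     # -> "express" (bare specifier, treated as an external npm package)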

    def get_file_metrics(self, file_path: Path) -> dict[str, Any]:
        """Get basic metrics (lines of code, git churn) for a file."""
        metrics = {"loc": 0, "churn": None}

        # When working with manifest data the file may not exist on disk; the
        # manifest already carries loc, so return defaults and let the caller
        # use the manifest instead.
        if not file_path.exists():
            return metrics

        # Count lines of code.
        try:
            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                metrics["loc"] = sum(1 for _ in f)
        except (IOError, UnicodeDecodeError, OSError) as e:
            print(f"Warning: Failed to read {file_path} for metrics: {e}")
            # Still return default metrics, but log the failure.

        # Get git churn (commit count).
        try:
            # Route stdout/stderr through temp files to avoid pipe-buffer overflow.
            with tempfile.NamedTemporaryFile(mode="w+", delete=False, suffix="_stdout.txt", encoding="utf-8") as stdout_fp, \
                 tempfile.NamedTemporaryFile(mode="w+", delete=False, suffix="_stderr.txt", encoding="utf-8") as stderr_fp:

                stdout_path = stdout_fp.name
                stderr_path = stderr_fp.name

                result = subprocess.run(
                    ["git", "log", "--oneline", str(file_path)],
                    stdout=stdout_fp,
                    stderr=stderr_fp,
                    text=True,
                    timeout=5,
                    cwd=Path.cwd(),
                    shell=IS_WINDOWS,  # Windows compatibility fix
                )

            with open(stdout_path, "r", encoding="utf-8") as f:
                result.stdout = f.read()
            with open(stderr_path, "r", encoding="utf-8") as f:
                result.stderr = f.read()

            os.unlink(stdout_path)
            os.unlink(stderr_path)

            if result.returncode == 0:
                # splitlines() avoids counting an empty stdout as one commit.
                commits = result.stdout.strip().splitlines()
                metrics["churn"] = len(commits) if commits else 0
        except (subprocess.TimeoutExpired, OSError, IOError) as e:
            print(f"Warning: Failed to get git churn for {file_path}: {e}")
            # Still return default metrics, but log the failure.

        return metrics
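
    # Example (illustrative; numbers are hypothetical):
    #     builder.get_file_metrics(Path("backend/src/app.ts"))
    #     # -> {"loc": 240, "churn": 17}   (churn stays None when git is unavailable)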

    def _collect_source_files(
        self,
        root_path: Path,
        langs: list[str] | None,
        file_list: list[dict[str, Any]] | None,
    ) -> tuple[list[tuple[Path, str]], dict[str, dict[str, Any]]]:
        """Collect (path, language) pairs, preferring the manifest over walking the tree.

        Returns the file list plus a lookup from absolute path string to the
        manifest item, so callers can reuse manifest metrics.
        """
        files: list[tuple[Path, str]] = []
        manifest_lookup: dict[str, dict[str, Any]] = {}

        if file_list is not None:
            # Use the provided manifest; it already has everything we need
            # (path, ext, loc, ...), so the files need not exist on disk.
            for item in file_list:
                manifest_path = Path(item["path"])
                file = root_path / manifest_path  # Only for consistent path handling
                manifest_lookup[str(file)] = item
                lang = self.detect_language(manifest_path)
                if lang and (not langs or lang in langs):
                    files.append((file, lang))
            return files, manifest_lookup

        # Fall back to os.walk for backward compatibility.
        for dirpath, dirnames, filenames in os.walk(root_path):
            # CRITICAL: prune excluded directories before os.walk descends into
            # them; this prevents traversal into .venv and other SKIP_DIRS.
            dirnames[:] = [d for d in dirnames if d not in SKIP_DIRS]

            # Also prune based on exclude_patterns.
            if self.exclude_patterns:
                dirnames[:] = [
                    d for d in dirnames
                    if not any(pattern in d for pattern in self.exclude_patterns)
                ]

            for filename in filenames:
                file = Path(dirpath) / filename
                if not self.should_skip(file):
                    lang = self.detect_language(file)
                    if lang and (not langs or lang in langs):
                        files.append((file, lang))
        return files, manifest_lookup

    def build_import_graph(
        self,
        root: str = ".",
        langs: list[str] | None = None,
        file_filter: str | None = None,
        file_list: list[dict[str, Any]] | None = None,
    ) -> dict[str, Any]:
        """Build the import/dependency graph for the project."""
        root_path = Path(root).resolve()
        nodes = {}
        edges = []

        # Collect all source files (manifest-driven when a file_list is given).
        files, manifest_lookup = self._collect_source_files(root_path, langs, file_list)

        # Process files with a progress bar.
        with click.progressbar(
            files,
            label="Building import graph",
            show_pos=True,
            show_percent=True,
            show_eta=True,
            item_show_func=lambda x: str(x[0].name) if x else None,
        ) as bar:
            for file_path, lang in bar:
                # Create a node for this file (path separators normalized).
                rel_path = str(file_path.relative_to(root_path)).replace("\\", "/")
                node_id = rel_path

                # Prefer metrics from the manifest; otherwise read the file.
                if str(file_path) in manifest_lookup:
                    manifest_item = manifest_lookup[str(file_path)]
                    loc = manifest_item.get("loc", 0)
                    churn = None  # The manifest has no churn data
                else:
                    metrics = self.get_file_metrics(file_path)
                    loc = metrics["loc"]
                    churn = metrics["churn"]

                node = GraphNode(
                    id=node_id,
                    file=rel_path,
                    lang=lang,
                    loc=loc,
                    churn=churn,
                    type="module",
                )
                nodes[node_id] = asdict(node)

                # Create edges from the imports the indexer already stored;
                # pass the relative path that matches the database key.
                imports = self.extract_imports_from_db(rel_path)
                for imp in imports:
                    target = self.resolve_import_path(imp, file_path, lang)
                    edge = GraphEdge(
                        source=node_id,
                        target=target,
                        type="import",
                        file=rel_path,
                    )
                    edges.append(asdict(edge))

        return {
            "nodes": list(nodes.values()),
            "edges": edges,
            "metadata": {
                "root": str(root_path),
                "languages": list(set(n["lang"] for n in nodes.values())),
                "total_files": len(nodes),
                "total_imports": len(edges),
            },
        }
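
    # Example (illustrative; assumes an indexed project):
    #     graph = XGraphBuilder(project_root=".").build_import_graph(root=".", langs=["python"])
    #     print(graph["metadata"]["total_files"], graph["metadata"]["total_imports"])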

    def build_call_graph(
        self,
        root: str = ".",
        langs: list[str] | None = None,
        file_filter: str | None = None,
        file_list: list[dict[str, Any]] | None = None,
    ) -> dict[str, Any]:
        """Build the call graph for the project."""
        root_path = Path(root).resolve()
        nodes = {}
        edges = []

        # Collect all source files (manifest-driven when a file_list is given).
        files, _ = self._collect_source_files(root_path, langs, file_list)

        # Process files with a progress bar, extracting functions and calls.
        with click.progressbar(
            files,
            label="Building call graph",
            show_pos=True,
            show_percent=True,
            show_eta=True,
            item_show_func=lambda x: str(x[0].name) if x else None,
        ) as bar:
            for file_path, lang in bar:
                rel_path = str(file_path.relative_to(root_path)).replace("\\", "/")
                module_id = rel_path

                # Create a node for each exported function/class from the database.
                exports = self.extract_exports_from_db(rel_path)
                for export in exports:
                    func_id = f"{module_id}::{export}"
                    node = GraphNode(
                        id=func_id,
                        file=rel_path,
                        lang=lang,
                        type="function",
                    )
                    nodes[func_id] = asdict(node)

                # Create an edge from the module to each called function.
                calls = self.extract_calls_from_db(rel_path)
                for call, method in calls:
                    # "obj.method" for method calls, bare name for function calls.
                    target_id = f"{call}.{method}" if method else call

                    edge = GraphEdge(
                        source=module_id,
                        target=target_id,
                        type="call",
                        file=rel_path,
                    )
                    edges.append(asdict(edge))

        return {
            "nodes": list(nodes.values()),
            "edges": edges,
            "metadata": {
                "root": str(root_path),
                "languages": langs or [],
                "total_functions": len(nodes),
                "total_calls": len(edges),
            },
        }
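
    # Example (illustrative; node ids follow the "<module>::<symbol>" convention above):
    #     calls = builder.build_call_graph(root=".", langs=["typescript"])
    #     # A node id might look like "backend/src/app.ts::createServer".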

    def merge_graphs(self, import_graph: dict, call_graph: dict) -> dict[str, Any]:
        """Merge the import and call graphs into a unified graph."""
        # Combine nodes, deduplicating by id (call-graph nodes win on collision).
        nodes = {}
        for node in import_graph["nodes"]:
            nodes[node["id"]] = node
        for node in call_graph["nodes"]:
            nodes[node["id"]] = node

        # Combine edges.
        edges = import_graph["edges"] + call_graph["edges"]

        return {
            "nodes": list(nodes.values()),
            "edges": edges,
            "metadata": {
                "root": import_graph["metadata"]["root"],
                "languages": list(
                    set(
                        import_graph["metadata"]["languages"]
                        + call_graph["metadata"].get("languages", [])
                    )
                ),
                "total_nodes": len(nodes),
                "total_edges": len(edges),
            },
        }
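
    # Example (illustrative end-to-end sketch):
    #     builder = XGraphBuilder(project_root=".")
    #     merged = builder.merge_graphs(
    #         builder.build_import_graph(root="."),
    #         builder.build_call_graph(root="."),
    #     )
    #     print(merged["metadata"]["total_nodes"], merged["metadata"]["total_edges"])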

    def _extract_imports_regex(self, file_path: Path, lang: str) -> list[str]:
        """Regex-based fallback for extracting imports.

        Used when AST parsing fails or is unavailable.
        """
        if lang not in self.IMPORT_PATTERNS:
            return []

        imports = []
        patterns = [re.compile(p, re.MULTILINE) for p in self.IMPORT_PATTERNS[lang]]

        try:
            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                content = f.read()

            for pattern in patterns:
                imports.extend(pattern.findall(content))

        except (IOError, UnicodeDecodeError, OSError) as e:
            print(f"Warning: Failed to extract imports from {file_path}: {e}")
            # Return an empty list, but log the failure.

        return imports

    def _extract_exports_regex(self, file_path: Path, lang: str) -> list[str]:
        """Regex-based fallback for extracting exports.

        Used when AST parsing fails or is unavailable.
        """
        if lang not in self.EXPORT_PATTERNS:
            return []

        exports = []
        patterns = [re.compile(p, re.MULTILINE) for p in self.EXPORT_PATTERNS[lang]]

        try:
            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                content = f.read()

            for pattern in patterns:
                matches = pattern.findall(content)
                # Flatten tuples when a regex has multiple groups.
                for match in matches:
                    if isinstance(match, tuple):
                        exports.extend([m for m in match if m])
                    else:
                        exports.append(match)

        except (IOError, UnicodeDecodeError, OSError) as e:
            print(f"Warning: Failed to extract exports from {file_path}: {e}")
            # Return an empty list, but log the failure.

        # In Go, only capitalized identifiers are exported.
        if lang == "go":
            exports = [e for e in exports if e and e[0].isupper()]

        return exports

    def _extract_calls_regex(self, file_path: Path, lang: str) -> list[tuple[str, str | None]]:
        """Regex-based fallback for extracting function calls.

        Used when AST parsing fails or is unavailable.
        """
        if lang not in self.CALL_PATTERNS:
            return []

        calls = []
        patterns = [re.compile(p) for p in self.CALL_PATTERNS[lang]]

        try:
            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                content = f.read()

            for pattern in patterns:
                for match in pattern.findall(content):
                    if isinstance(match, tuple):
                        # Method call: (object, method).
                        calls.append(match)
                    else:
                        # Function call.
                        calls.append((match, None))

        except (IOError, UnicodeDecodeError, OSError) as e:
            print(f"Warning: Failed to extract calls from {file_path}: {e}")
            # Return an empty list, but log the failure.

        return calls
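

# Minimal smoke-test sketch (assumption: run from a project root whose indexer
# output already exists at .pf/repo_index.db; otherwise the graphs come back
# with empty edge data and a warning is printed).
if __name__ == "__main__":
    builder = XGraphBuilder(project_root=".")
    merged = builder.merge_graphs(
        builder.build_import_graph(root="."),
        builder.build_call_graph(root="."),
    )
    print(
        f"nodes={merged['metadata']['total_nodes']} "
        f"edges={merged['metadata']['total_edges']}"
    )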