mirror of
https://github.com/aljazceru/Auditor.git
synced 2025-12-17 03:24:18 +01:00
608 lines
29 KiB
Python
608 lines
29 KiB
Python
"""Framework detection for various languages and ecosystems."""
|
|
|
|
import json
|
|
import re
|
|
import glob
|
|
from pathlib import Path
|
|
from typing import Any
|
|
from theauditor.manifest_parser import ManifestParser
|
|
from theauditor.framework_registry import FRAMEWORK_REGISTRY
|
|
|
|
|
|
class FrameworkDetector:
|
|
"""Detects frameworks and libraries used in a project."""
|
|
|
|
# Note: Framework detection now uses the centralized FRAMEWORK_REGISTRY
|
|
# from framework_registry.py instead of the old FRAMEWORK_SIGNATURES
|
|
|
|
def __init__(self, project_path: Path, exclude_patterns: list[str] = None):
|
|
"""Initialize detector with project path.
|
|
|
|
Args:
|
|
project_path: Root directory of the project.
|
|
exclude_patterns: List of patterns to exclude from scanning.
|
|
"""
|
|
self.project_path = Path(project_path)
|
|
self.detected_frameworks = []
|
|
self.deps_cache = None
|
|
self.exclude_patterns = exclude_patterns or []
|
|
|
|
    def detect_all(self) -> list[dict[str, Any]]:
        """Detect all frameworks in the project.

        Runs the detection passes in order (manifests, workspaces, framework
        marker files), back-fills missing versions from manifests or the deps
        cache, then deduplicates by (framework, language, path).

        Returns:
            List of detected framework info dictionaries, each with keys
            "framework", "version", "language", "path", and "source".
        """
        # Start from a clean slate so repeated calls do not accumulate.
        self.detected_frameworks = []

        # Load TheAuditor's deps.json if available for better version info
        self._load_deps_cache()

        # Use unified manifest detection
        self._detect_from_manifests()

        # Also detect from monorepo workspaces (keep existing logic)
        self._detect_from_workspaces()

        # Store frameworks found in manifests for version lookup
        # (anything whose source is not "imports" counts as manifest-derived).
        manifest_frameworks = {}
        for fw in self.detected_frameworks:
            if fw["source"] != "imports":
                key = (fw["framework"], fw["language"])
                manifest_frameworks[key] = fw["version"]

        # DISABLED: Import scanning causes too many false positives
        # It detects framework names in strings, comments, and detection code itself
        # Real dependencies should be in manifest files (package.json, requirements.txt, etc.)
        # self._scan_source_imports()

        # Check for framework-specific files
        self._check_framework_files()

        # Update versions for frameworks detected from framework files only (imports disabled)
        for fw in self.detected_frameworks:
            if fw["version"] == "unknown" and fw["source"] == "framework_files":
                key = (fw["framework"], fw["language"])
                # First try manifest frameworks
                if key in manifest_frameworks:
                    fw["version"] = manifest_frameworks[key]
                    fw["source"] = f"{fw['source']} (version from manifest)"
                # Then try deps cache
                elif self.deps_cache and fw["framework"] in self.deps_cache:
                    cached_dep = self.deps_cache[fw["framework"]]
                    manager = cached_dep.get("manager", "")
                    # Match language to manager (py -> python, npm -> javascript)
                    if (fw["language"] == "python" and manager == "py") or \
                       (fw["language"] in ["javascript", "typescript"] and manager == "npm"):
                        fw["version"] = cached_dep.get("version", "")  # Empty not unknown
                        if fw["version"] != "unknown":
                            fw["source"] = f"{fw['source']} (version from deps cache)"

        # Deduplicate results, preferring entries with known versions
        # Now we keep framework+language+path as unique key to support monorepos
        seen = {}
        for fw in self.detected_frameworks:
            key = (fw["framework"], fw["language"], fw.get("path", "."))
            if key not in seen:
                seen[key] = fw
            elif fw["version"] != "unknown" and seen[key]["version"] == "unknown":
                # Replace with entry that has a known version
                seen[key] = fw

        return list(seen.values())
|
|
|
|
def _detect_from_manifests(self):
|
|
"""Unified manifest detection using registry and ManifestParser - now directory-aware."""
|
|
parser = ManifestParser()
|
|
|
|
# Manifest file names to search for
|
|
manifest_names = [
|
|
"pyproject.toml",
|
|
"package.json",
|
|
"requirements.txt",
|
|
"requirements-dev.txt",
|
|
"requirements-test.txt",
|
|
"setup.py",
|
|
"setup.cfg",
|
|
"Gemfile",
|
|
"Gemfile.lock",
|
|
"go.mod",
|
|
"pom.xml",
|
|
"build.gradle",
|
|
"build.gradle.kts",
|
|
"composer.json",
|
|
]
|
|
|
|
# Recursively find all manifest files in the project
|
|
manifests = {}
|
|
for manifest_name in manifest_names:
|
|
# Use rglob to find all instances of this manifest file
|
|
for manifest_path in self.project_path.rglob(manifest_name):
|
|
# Skip excluded directories
|
|
try:
|
|
relative_path = manifest_path.relative_to(self.project_path)
|
|
should_skip = False
|
|
|
|
# Check common skip directories
|
|
for part in relative_path.parts[:-1]: # Don't check the filename itself
|
|
if part in ["node_modules", "venv", ".venv", ".auditor_venv", "vendor",
|
|
"dist", "build", "__pycache__", ".git", ".tox", ".pytest_cache"]:
|
|
should_skip = True
|
|
break
|
|
|
|
if should_skip:
|
|
continue
|
|
|
|
# Calculate the directory path relative to project root
|
|
dir_path = manifest_path.parent.relative_to(self.project_path)
|
|
dir_str = str(dir_path) if dir_path != Path('.') else '.'
|
|
|
|
# Create a unique key for this manifest
|
|
manifest_key = f"{dir_str}/{manifest_name}" if dir_str != '.' else manifest_name
|
|
manifests[manifest_key] = manifest_path
|
|
|
|
except ValueError:
|
|
# File is outside project path somehow, skip it
|
|
continue
|
|
|
|
# Parse each manifest that exists
|
|
parsed_data = {}
|
|
for manifest_key, path in manifests.items():
|
|
if path.exists():
|
|
try:
|
|
# Extract just the filename for parsing logic
|
|
filename = path.name
|
|
|
|
if filename.endswith('.toml'):
|
|
parsed_data[manifest_key] = parser.parse_toml(path)
|
|
elif filename.endswith('.json'):
|
|
parsed_data[manifest_key] = parser.parse_json(path)
|
|
elif filename.endswith(('.yml', '.yaml')):
|
|
parsed_data[manifest_key] = parser.parse_yaml(path)
|
|
elif filename.endswith('.cfg'):
|
|
parsed_data[manifest_key] = parser.parse_ini(path)
|
|
elif filename.endswith('.txt'):
|
|
parsed_data[manifest_key] = parser.parse_requirements_txt(path)
|
|
elif filename == 'Gemfile' or filename == 'Gemfile.lock':
|
|
# Parse Gemfile as text for now
|
|
with open(path, 'r', encoding='utf-8') as f:
|
|
parsed_data[manifest_key] = f.read()
|
|
elif filename.endswith('.xml') or filename.endswith('.gradle') or filename.endswith('.kts') or filename.endswith('.mod'):
|
|
# Parse as text content for now
|
|
with open(path, 'r', encoding='utf-8') as f:
|
|
parsed_data[manifest_key] = f.read()
|
|
elif filename == 'setup.py':
|
|
with open(path, 'r', encoding='utf-8') as f:
|
|
parsed_data[manifest_key] = f.read()
|
|
except Exception as e:
|
|
print(f"Warning: Failed to parse {manifest_key}: {e}")
|
|
|
|
# Check each framework against all manifests
|
|
for fw_name, fw_config in FRAMEWORK_REGISTRY.items():
|
|
for required_manifest_name, search_configs in fw_config.get("detection_sources", {}).items():
|
|
# Check all parsed manifests that match this manifest type
|
|
for manifest_key, manifest_data in parsed_data.items():
|
|
# Check if this manifest matches the required type
|
|
if not manifest_key.endswith(required_manifest_name):
|
|
continue
|
|
|
|
# Extract the directory path from the manifest key
|
|
if '/' in manifest_key:
|
|
dir_path = '/'.join(manifest_key.split('/')[:-1])
|
|
else:
|
|
dir_path = '.'
|
|
|
|
if search_configs == "line_search":
|
|
# Simple text search for requirements.txt style or Gemfile
|
|
if isinstance(manifest_data, list):
|
|
# Requirements.txt parsed as list
|
|
for line in manifest_data:
|
|
version = parser.check_package_in_deps([line], fw_name)
|
|
if version:
|
|
self.detected_frameworks.append({
|
|
"framework": fw_name,
|
|
"version": version or "unknown",
|
|
"language": fw_config["language"],
|
|
"path": dir_path,
|
|
"source": manifest_key
|
|
})
|
|
break
|
|
elif isinstance(manifest_data, str):
|
|
# Text file content
|
|
if fw_name in manifest_data or (fw_config.get("package_pattern") and fw_config["package_pattern"] in manifest_data):
|
|
# Try to extract version
|
|
version = "unknown"
|
|
import re
|
|
if fw_config.get("package_pattern"):
|
|
pattern = fw_config["package_pattern"]
|
|
else:
|
|
pattern = fw_name
|
|
|
|
# Try different version patterns
|
|
version_match = re.search(rf'{re.escape(pattern)}["\']?\s*[,:]?\s*["\']?([\d.]+)', manifest_data)
|
|
if not version_match:
|
|
version_match = re.search(rf'{re.escape(pattern)}\s+v([\d.]+)', manifest_data)
|
|
if not version_match:
|
|
version_match = re.search(rf'gem\s+["\']?{re.escape(pattern)}["\']?\s*,\s*["\']([\d.]+)["\']', manifest_data)
|
|
|
|
if version_match:
|
|
version = version_match.group(1)
|
|
|
|
self.detected_frameworks.append({
|
|
"framework": fw_name,
|
|
"version": version,
|
|
"language": fw_config["language"],
|
|
"path": dir_path,
|
|
"source": manifest_key
|
|
})
|
|
|
|
elif search_configs == "content_search":
|
|
# Content search for text-based files
|
|
if isinstance(manifest_data, str):
|
|
found = False
|
|
# Check package pattern first
|
|
if fw_config.get("package_pattern") and fw_config["package_pattern"] in manifest_data:
|
|
found = True
|
|
# Check content patterns
|
|
elif fw_config.get("content_patterns"):
|
|
for pattern in fw_config["content_patterns"]:
|
|
if pattern in manifest_data:
|
|
found = True
|
|
break
|
|
# Fallback to framework name
|
|
elif fw_name in manifest_data:
|
|
found = True
|
|
|
|
if found:
|
|
# Try to extract version
|
|
version = "unknown"
|
|
import re
|
|
pattern = fw_config.get("package_pattern", fw_name)
|
|
version_match = re.search(rf'{re.escape(pattern)}.*?[>v]([\d.]+)', manifest_data, re.DOTALL)
|
|
if version_match:
|
|
version = version_match.group(1)
|
|
|
|
self.detected_frameworks.append({
|
|
"framework": fw_name,
|
|
"version": version,
|
|
"language": fw_config["language"],
|
|
"path": dir_path,
|
|
"source": manifest_key
|
|
})
|
|
|
|
elif search_configs == "exists":
|
|
# Just check if file exists (for go.mod with go test framework)
|
|
self.detected_frameworks.append({
|
|
"framework": fw_name,
|
|
"version": "unknown",
|
|
"language": fw_config["language"],
|
|
"path": dir_path,
|
|
"source": manifest_key
|
|
})
|
|
|
|
else:
|
|
# Structured search for JSON/TOML/YAML
|
|
for key_path in search_configs:
|
|
deps = parser.extract_nested_value(manifest_data, key_path)
|
|
if deps:
|
|
# Check if framework is in dependencies
|
|
package_name = fw_config.get("package_pattern", fw_name)
|
|
version = parser.check_package_in_deps(deps, package_name)
|
|
if version:
|
|
self.detected_frameworks.append({
|
|
"framework": fw_name,
|
|
"version": version,
|
|
"language": fw_config["language"],
|
|
"path": dir_path,
|
|
"source": manifest_key
|
|
})
|
|
break
|
|
|
|
def _detect_from_workspaces(self):
|
|
"""Detect frameworks from monorepo workspace packages."""
|
|
# This preserves the existing monorepo detection logic
|
|
package_json = self.project_path / "package.json"
|
|
if not package_json.exists():
|
|
return
|
|
|
|
parser = ManifestParser()
|
|
try:
|
|
data = parser.parse_json(package_json)
|
|
|
|
# Check for workspaces field (Yarn/npm workspaces)
|
|
workspaces = data.get("workspaces", [])
|
|
|
|
# Handle different workspace formats
|
|
if isinstance(workspaces, dict):
|
|
# npm 7+ format: {"packages": ["packages/*"]}
|
|
workspaces = workspaces.get("packages", [])
|
|
|
|
if workspaces and isinstance(workspaces, list):
|
|
# This is a monorepo - check workspace packages
|
|
for pattern in workspaces:
|
|
# Convert workspace pattern to absolute path pattern
|
|
abs_pattern = str(self.project_path / pattern)
|
|
|
|
# Handle glob patterns
|
|
if "*" in abs_pattern:
|
|
matched_paths = glob.glob(abs_pattern)
|
|
for matched_path in matched_paths:
|
|
matched_dir = Path(matched_path)
|
|
if matched_dir.is_dir():
|
|
workspace_pkg = matched_dir / "package.json"
|
|
if workspace_pkg.exists():
|
|
# Parse and check this workspace package
|
|
self._check_workspace_package(workspace_pkg, parser)
|
|
else:
|
|
# Direct path without glob
|
|
workspace_dir = self.project_path / pattern
|
|
if workspace_dir.is_dir():
|
|
workspace_pkg = workspace_dir / "package.json"
|
|
if workspace_pkg.exists():
|
|
self._check_workspace_package(workspace_pkg, parser)
|
|
except Exception as e:
|
|
print(f"Warning: Failed to check workspaces: {e}")
|
|
|
|
def _check_workspace_package(self, pkg_path: Path, parser: ManifestParser):
|
|
"""Check a single workspace package.json for frameworks."""
|
|
try:
|
|
data = parser.parse_json(pkg_path)
|
|
|
|
# Check dependencies
|
|
all_deps = {}
|
|
if "dependencies" in data:
|
|
all_deps.update(data["dependencies"])
|
|
if "devDependencies" in data:
|
|
all_deps.update(data["devDependencies"])
|
|
|
|
# Check each JavaScript framework
|
|
for fw_name, fw_config in FRAMEWORK_REGISTRY.items():
|
|
if fw_config["language"] != "javascript":
|
|
continue
|
|
|
|
package_name = fw_config.get("package_pattern", fw_name)
|
|
if package_name in all_deps:
|
|
version = all_deps[package_name]
|
|
# Clean version
|
|
version = re.sub(r'^[~^>=<]+', '', str(version)).strip()
|
|
|
|
# Calculate relative path for path field
|
|
try:
|
|
rel_path = pkg_path.parent.relative_to(self.project_path)
|
|
path = str(rel_path).replace("\\", "/") if rel_path != Path('.') else '.'
|
|
source = str(pkg_path.relative_to(self.project_path)).replace("\\", "/")
|
|
except ValueError:
|
|
path = '.'
|
|
source = str(pkg_path)
|
|
|
|
self.detected_frameworks.append({
|
|
"framework": fw_name,
|
|
"version": version,
|
|
"language": "javascript",
|
|
"path": path,
|
|
"source": source
|
|
})
|
|
except Exception as e:
|
|
print(f"Warning: Failed to parse workspace package {pkg_path}: {e}")
|
|
|
|
# Stub method kept for backward compatibility - actual logic moved to _detect_from_manifests
|
|
pass
|
|
|
|
def _scan_source_imports(self):
|
|
"""Scan source files for framework imports."""
|
|
# Limit scanning to avoid performance issues
|
|
max_files = 100
|
|
files_scanned = 0
|
|
|
|
# Language file extensions
|
|
lang_extensions = {
|
|
".py": "python",
|
|
".js": "javascript",
|
|
".jsx": "javascript",
|
|
".ts": "javascript",
|
|
".tsx": "javascript",
|
|
".go": "go",
|
|
".java": "java",
|
|
".rb": "ruby",
|
|
".php": "php",
|
|
}
|
|
|
|
for ext, language in lang_extensions.items():
|
|
if files_scanned >= max_files:
|
|
break
|
|
|
|
for file_path in self.project_path.rglob(f"*{ext}"):
|
|
if files_scanned >= max_files:
|
|
break
|
|
|
|
# Skip node_modules, venv, etc.
|
|
if any(
|
|
part in file_path.parts
|
|
for part in ["node_modules", "venv", ".venv", ".auditor_venv", "vendor", "dist", "build", "__pycache__", ".git"]
|
|
):
|
|
continue
|
|
|
|
# Check exclude patterns
|
|
relative_path = file_path.relative_to(self.project_path)
|
|
should_skip = False
|
|
for pattern in self.exclude_patterns:
|
|
# Handle directory patterns
|
|
if pattern.endswith('/'):
|
|
dir_pattern = pattern.rstrip('/')
|
|
if str(relative_path).startswith(dir_pattern + '/') or str(relative_path).startswith(dir_pattern + '\\'):
|
|
should_skip = True
|
|
break
|
|
# Handle glob patterns
|
|
elif '*' in pattern:
|
|
from fnmatch import fnmatch
|
|
if fnmatch(str(relative_path), pattern):
|
|
should_skip = True
|
|
break
|
|
# Handle exact matches
|
|
elif str(relative_path) == pattern:
|
|
should_skip = True
|
|
break
|
|
|
|
if should_skip:
|
|
continue
|
|
|
|
files_scanned += 1
|
|
|
|
try:
|
|
with open(file_path, encoding="utf-8", errors="ignore") as f:
|
|
content = f.read()
|
|
|
|
# Check frameworks from registry
|
|
for fw_name, fw_config in FRAMEWORK_REGISTRY.items():
|
|
# Only check frameworks for this language
|
|
if fw_config["language"] != language:
|
|
continue
|
|
|
|
if "import_patterns" in fw_config:
|
|
for import_pattern in fw_config["import_patterns"]:
|
|
if import_pattern in content:
|
|
# Check if not already detected in this directory
|
|
file_dir = file_path.parent.relative_to(self.project_path)
|
|
dir_str = str(file_dir).replace("\\", "/") if file_dir != Path('.') else '.'
|
|
|
|
if not any(
|
|
fw["framework"] == fw_name and fw["language"] == language and fw.get("path", ".") == dir_str
|
|
for fw in self.detected_frameworks
|
|
):
|
|
self.detected_frameworks.append(
|
|
{
|
|
"framework": fw_name,
|
|
"version": "unknown",
|
|
"language": language,
|
|
"path": dir_str,
|
|
"source": "imports",
|
|
}
|
|
)
|
|
break
|
|
|
|
except Exception:
|
|
# Skip files that can't be read
|
|
continue
|
|
|
|
def _check_framework_files(self):
|
|
"""Check for framework-specific files."""
|
|
# Check all frameworks in registry for file markers
|
|
for fw_name, fw_config in FRAMEWORK_REGISTRY.items():
|
|
if "file_markers" in fw_config:
|
|
for file_marker in fw_config["file_markers"]:
|
|
# Handle wildcard patterns
|
|
if "*" in file_marker:
|
|
# Use glob for wildcard patterns
|
|
import glob
|
|
pattern = str(self.project_path / file_marker)
|
|
if glob.glob(pattern):
|
|
# Check if not already detected
|
|
if not any(
|
|
fw["framework"] == fw_name and fw["language"] == fw_config["language"]
|
|
for fw in self.detected_frameworks
|
|
):
|
|
self.detected_frameworks.append(
|
|
{
|
|
"framework": fw_name,
|
|
"version": "unknown",
|
|
"language": fw_config["language"],
|
|
"path": ".", # Framework files typically at root
|
|
"source": "framework_files",
|
|
}
|
|
)
|
|
break
|
|
else:
|
|
# Direct file path
|
|
if (self.project_path / file_marker).exists():
|
|
# Check if not already detected
|
|
if not any(
|
|
fw["framework"] == fw_name and fw["language"] == fw_config["language"]
|
|
for fw in self.detected_frameworks
|
|
):
|
|
self.detected_frameworks.append(
|
|
{
|
|
"framework": fw_name,
|
|
"version": "unknown",
|
|
"language": fw_config["language"],
|
|
"path": ".", # Framework files typically at root
|
|
"source": "framework_files",
|
|
}
|
|
)
|
|
break
|
|
|
|
def _load_deps_cache(self):
|
|
"""Load TheAuditor's deps.json if available for version info."""
|
|
deps_file = self.project_path / ".pf" / "deps.json"
|
|
if deps_file.exists():
|
|
try:
|
|
with open(deps_file) as f:
|
|
data = json.load(f)
|
|
self.deps_cache = {}
|
|
# Handle both old format (list) and new format (dict with "dependencies" key)
|
|
if isinstance(data, list):
|
|
deps_list = data
|
|
else:
|
|
deps_list = data.get("dependencies", [])
|
|
|
|
for dep in deps_list:
|
|
# Store by name for quick lookup
|
|
self.deps_cache[dep["name"]] = dep
|
|
except Exception as e:
|
|
# Log the error but continue
|
|
print(f"Warning: Could not load deps cache: {e}")
|
|
pass
|
|
|
|
def format_table(self) -> str:
|
|
"""Format detected frameworks as a table.
|
|
|
|
Returns:
|
|
Formatted table string.
|
|
"""
|
|
if not self.detected_frameworks:
|
|
return "No frameworks detected."
|
|
|
|
lines = []
|
|
lines.append("FRAMEWORK LANGUAGE PATH VERSION SOURCE")
|
|
lines.append("-" * 80)
|
|
|
|
imports_only = []
|
|
for fw in self.detected_frameworks:
|
|
framework = fw["framework"][:18].ljust(18)
|
|
language = fw["language"][:12].ljust(12)
|
|
path = fw.get("path", ".")[:15].ljust(15)
|
|
version = fw["version"][:15].ljust(15)
|
|
source = fw["source"]
|
|
|
|
lines.append(f"{framework} {language} {path} {version} {source}")
|
|
|
|
# Track if any are from imports only
|
|
if fw["source"] == "imports" and fw["version"] == "unknown":
|
|
imports_only.append(fw["framework"])
|
|
|
|
# Add note if frameworks detected from imports without versions
|
|
if imports_only:
|
|
lines.append("\n" + "="*60)
|
|
lines.append("NOTE: Frameworks marked with 'imports' source were detected from")
|
|
lines.append("import statements in the codebase (possibly test files) but are")
|
|
lines.append("not listed as dependencies. Version shown as 'unknown' because")
|
|
lines.append("they are not in package.json, pyproject.toml, or requirements.txt.")
|
|
|
|
return "\n".join(lines)
|
|
|
|
def to_json(self) -> str:
|
|
"""Export detected frameworks to JSON.
|
|
|
|
Returns:
|
|
JSON string.
|
|
"""
|
|
return json.dumps(self.detected_frameworks, indent=2, sort_keys=True)
|
|
|
|
def save_to_file(self, output_path: Path) -> None:
|
|
"""Save detected frameworks to a JSON file.
|
|
|
|
Args:
|
|
output_path: Path where the JSON file should be saved.
|
|
"""
|
|
output_path = Path(output_path)
|
|
output_path.parent.mkdir(parents=True, exist_ok=True)
|
|
output_path.write_text(self.to_json()) |