"""Framework detection for various languages and ecosystems.""" import json import re import glob from pathlib import Path from typing import Any from theauditor.manifest_parser import ManifestParser from theauditor.framework_registry import FRAMEWORK_REGISTRY class FrameworkDetector: """Detects frameworks and libraries used in a project.""" # Note: Framework detection now uses the centralized FRAMEWORK_REGISTRY # from framework_registry.py instead of the old FRAMEWORK_SIGNATURES def __init__(self, project_path: Path, exclude_patterns: list[str] = None): """Initialize detector with project path. Args: project_path: Root directory of the project. exclude_patterns: List of patterns to exclude from scanning. """ self.project_path = Path(project_path) self.detected_frameworks = [] self.deps_cache = None self.exclude_patterns = exclude_patterns or [] def detect_all(self) -> list[dict[str, Any]]: """Detect all frameworks in the project. Returns: List of detected framework info dictionaries. """ self.detected_frameworks = [] # Load TheAuditor's deps.json if available for better version info self._load_deps_cache() # Use unified manifest detection self._detect_from_manifests() # Also detect from monorepo workspaces (keep existing logic) self._detect_from_workspaces() # Store frameworks found in manifests for version lookup manifest_frameworks = {} for fw in self.detected_frameworks: if fw["source"] != "imports": key = (fw["framework"], fw["language"]) manifest_frameworks[key] = fw["version"] # DISABLED: Import scanning causes too many false positives # It detects framework names in strings, comments, and detection code itself # Real dependencies should be in manifest files (package.json, requirements.txt, etc.) # self._scan_source_imports() # Check for framework-specific files self._check_framework_files() # Update versions for frameworks detected from framework files only (imports disabled) for fw in self.detected_frameworks: if fw["version"] == "unknown" and fw["source"] == "framework_files": key = (fw["framework"], fw["language"]) # First try manifest frameworks if key in manifest_frameworks: fw["version"] = manifest_frameworks[key] fw["source"] = f"{fw['source']} (version from manifest)" # Then try deps cache elif self.deps_cache and fw["framework"] in self.deps_cache: cached_dep = self.deps_cache[fw["framework"]] manager = cached_dep.get("manager", "") # Match language to manager (py -> python, npm -> javascript) if (fw["language"] == "python" and manager == "py") or \ (fw["language"] in ["javascript", "typescript"] and manager == "npm"): fw["version"] = cached_dep.get("version", "") # Empty not unknown if fw["version"] != "unknown": fw["source"] = f"{fw['source']} (version from deps cache)" # Deduplicate results, preferring entries with known versions # Now we keep framework+language+path as unique key to support monorepos seen = {} for fw in self.detected_frameworks: key = (fw["framework"], fw["language"], fw.get("path", ".")) if key not in seen: seen[key] = fw elif fw["version"] != "unknown" and seen[key]["version"] == "unknown": # Replace with version that has a known version seen[key] = fw return list(seen.values()) def _detect_from_manifests(self): """Unified manifest detection using registry and ManifestParser - now directory-aware.""" parser = ManifestParser() # Manifest file names to search for manifest_names = [ "pyproject.toml", "package.json", "requirements.txt", "requirements-dev.txt", "requirements-test.txt", "setup.py", "setup.cfg", "Gemfile", "Gemfile.lock", "go.mod", "pom.xml", "build.gradle", "build.gradle.kts", "composer.json", ] # Recursively find all manifest files in the project manifests = {} for manifest_name in manifest_names: # Use rglob to find all instances of this manifest file for manifest_path in self.project_path.rglob(manifest_name): # Skip excluded directories try: relative_path = manifest_path.relative_to(self.project_path) should_skip = False # Check common skip directories for part in relative_path.parts[:-1]: # Don't check the filename itself if part in ["node_modules", "venv", ".venv", ".auditor_venv", "vendor", "dist", "build", "__pycache__", ".git", ".tox", ".pytest_cache"]: should_skip = True break if should_skip: continue # Calculate the directory path relative to project root dir_path = manifest_path.parent.relative_to(self.project_path) dir_str = str(dir_path) if dir_path != Path('.') else '.' # Create a unique key for this manifest manifest_key = f"{dir_str}/{manifest_name}" if dir_str != '.' else manifest_name manifests[manifest_key] = manifest_path except ValueError: # File is outside project path somehow, skip it continue # Parse each manifest that exists parsed_data = {} for manifest_key, path in manifests.items(): if path.exists(): try: # Extract just the filename for parsing logic filename = path.name if filename.endswith('.toml'): parsed_data[manifest_key] = parser.parse_toml(path) elif filename.endswith('.json'): parsed_data[manifest_key] = parser.parse_json(path) elif filename.endswith(('.yml', '.yaml')): parsed_data[manifest_key] = parser.parse_yaml(path) elif filename.endswith('.cfg'): parsed_data[manifest_key] = parser.parse_ini(path) elif filename.endswith('.txt'): parsed_data[manifest_key] = parser.parse_requirements_txt(path) elif filename == 'Gemfile' or filename == 'Gemfile.lock': # Parse Gemfile as text for now with open(path, 'r', encoding='utf-8') as f: parsed_data[manifest_key] = f.read() elif filename.endswith('.xml') or filename.endswith('.gradle') or filename.endswith('.kts') or filename.endswith('.mod'): # Parse as text content for now with open(path, 'r', encoding='utf-8') as f: parsed_data[manifest_key] = f.read() elif filename == 'setup.py': with open(path, 'r', encoding='utf-8') as f: parsed_data[manifest_key] = f.read() except Exception as e: print(f"Warning: Failed to parse {manifest_key}: {e}") # Check each framework against all manifests for fw_name, fw_config in FRAMEWORK_REGISTRY.items(): for required_manifest_name, search_configs in fw_config.get("detection_sources", {}).items(): # Check all parsed manifests that match this manifest type for manifest_key, manifest_data in parsed_data.items(): # Check if this manifest matches the required type if not manifest_key.endswith(required_manifest_name): continue # Extract the directory path from the manifest key if '/' in manifest_key: dir_path = '/'.join(manifest_key.split('/')[:-1]) else: dir_path = '.' if search_configs == "line_search": # Simple text search for requirements.txt style or Gemfile if isinstance(manifest_data, list): # Requirements.txt parsed as list for line in manifest_data: version = parser.check_package_in_deps([line], fw_name) if version: self.detected_frameworks.append({ "framework": fw_name, "version": version or "unknown", "language": fw_config["language"], "path": dir_path, "source": manifest_key }) break elif isinstance(manifest_data, str): # Text file content if fw_name in manifest_data or (fw_config.get("package_pattern") and fw_config["package_pattern"] in manifest_data): # Try to extract version version = "unknown" import re if fw_config.get("package_pattern"): pattern = fw_config["package_pattern"] else: pattern = fw_name # Try different version patterns version_match = re.search(rf'{re.escape(pattern)}["\']?\s*[,:]?\s*["\']?([\d.]+)', manifest_data) if not version_match: version_match = re.search(rf'{re.escape(pattern)}\s+v([\d.]+)', manifest_data) if not version_match: version_match = re.search(rf'gem\s+["\']?{re.escape(pattern)}["\']?\s*,\s*["\']([\d.]+)["\']', manifest_data) if version_match: version = version_match.group(1) self.detected_frameworks.append({ "framework": fw_name, "version": version, "language": fw_config["language"], "path": dir_path, "source": manifest_key }) elif search_configs == "content_search": # Content search for text-based files if isinstance(manifest_data, str): found = False # Check package pattern first if fw_config.get("package_pattern") and fw_config["package_pattern"] in manifest_data: found = True # Check content patterns elif fw_config.get("content_patterns"): for pattern in fw_config["content_patterns"]: if pattern in manifest_data: found = True break # Fallback to framework name elif fw_name in manifest_data: found = True if found: # Try to extract version version = "unknown" import re pattern = fw_config.get("package_pattern", fw_name) version_match = re.search(rf'{re.escape(pattern)}.*?[>v]([\d.]+)', manifest_data, re.DOTALL) if version_match: version = version_match.group(1) self.detected_frameworks.append({ "framework": fw_name, "version": version, "language": fw_config["language"], "path": dir_path, "source": manifest_key }) elif search_configs == "exists": # Just check if file exists (for go.mod with go test framework) self.detected_frameworks.append({ "framework": fw_name, "version": "unknown", "language": fw_config["language"], "path": dir_path, "source": manifest_key }) else: # Structured search for JSON/TOML/YAML for key_path in search_configs: deps = parser.extract_nested_value(manifest_data, key_path) if deps: # Check if framework is in dependencies package_name = fw_config.get("package_pattern", fw_name) version = parser.check_package_in_deps(deps, package_name) if version: self.detected_frameworks.append({ "framework": fw_name, "version": version, "language": fw_config["language"], "path": dir_path, "source": manifest_key }) break def _detect_from_workspaces(self): """Detect frameworks from monorepo workspace packages.""" # This preserves the existing monorepo detection logic package_json = self.project_path / "package.json" if not package_json.exists(): return parser = ManifestParser() try: data = parser.parse_json(package_json) # Check for workspaces field (Yarn/npm workspaces) workspaces = data.get("workspaces", []) # Handle different workspace formats if isinstance(workspaces, dict): # npm 7+ format: {"packages": ["packages/*"]} workspaces = workspaces.get("packages", []) if workspaces and isinstance(workspaces, list): # This is a monorepo - check workspace packages for pattern in workspaces: # Convert workspace pattern to absolute path pattern abs_pattern = str(self.project_path / pattern) # Handle glob patterns if "*" in abs_pattern: matched_paths = glob.glob(abs_pattern) for matched_path in matched_paths: matched_dir = Path(matched_path) if matched_dir.is_dir(): workspace_pkg = matched_dir / "package.json" if workspace_pkg.exists(): # Parse and check this workspace package self._check_workspace_package(workspace_pkg, parser) else: # Direct path without glob workspace_dir = self.project_path / pattern if workspace_dir.is_dir(): workspace_pkg = workspace_dir / "package.json" if workspace_pkg.exists(): self._check_workspace_package(workspace_pkg, parser) except Exception as e: print(f"Warning: Failed to check workspaces: {e}") def _check_workspace_package(self, pkg_path: Path, parser: ManifestParser): """Check a single workspace package.json for frameworks.""" try: data = parser.parse_json(pkg_path) # Check dependencies all_deps = {} if "dependencies" in data: all_deps.update(data["dependencies"]) if "devDependencies" in data: all_deps.update(data["devDependencies"]) # Check each JavaScript framework for fw_name, fw_config in FRAMEWORK_REGISTRY.items(): if fw_config["language"] != "javascript": continue package_name = fw_config.get("package_pattern", fw_name) if package_name in all_deps: version = all_deps[package_name] # Clean version version = re.sub(r'^[~^>=<]+', '', str(version)).strip() # Calculate relative path for path field try: rel_path = pkg_path.parent.relative_to(self.project_path) path = str(rel_path).replace("\\", "/") if rel_path != Path('.') else '.' source = str(pkg_path.relative_to(self.project_path)).replace("\\", "/") except ValueError: path = '.' source = str(pkg_path) self.detected_frameworks.append({ "framework": fw_name, "version": version, "language": "javascript", "path": path, "source": source }) except Exception as e: print(f"Warning: Failed to parse workspace package {pkg_path}: {e}") # Stub method kept for backward compatibility - actual logic moved to _detect_from_manifests pass def _scan_source_imports(self): """Scan source files for framework imports.""" # Limit scanning to avoid performance issues max_files = 100 files_scanned = 0 # Language file extensions lang_extensions = { ".py": "python", ".js": "javascript", ".jsx": "javascript", ".ts": "javascript", ".tsx": "javascript", ".go": "go", ".java": "java", ".rb": "ruby", ".php": "php", } for ext, language in lang_extensions.items(): if files_scanned >= max_files: break for file_path in self.project_path.rglob(f"*{ext}"): if files_scanned >= max_files: break # Skip node_modules, venv, etc. if any( part in file_path.parts for part in ["node_modules", "venv", ".venv", ".auditor_venv", "vendor", "dist", "build", "__pycache__", ".git"] ): continue # Check exclude patterns relative_path = file_path.relative_to(self.project_path) should_skip = False for pattern in self.exclude_patterns: # Handle directory patterns if pattern.endswith('/'): dir_pattern = pattern.rstrip('/') if str(relative_path).startswith(dir_pattern + '/') or str(relative_path).startswith(dir_pattern + '\\'): should_skip = True break # Handle glob patterns elif '*' in pattern: from fnmatch import fnmatch if fnmatch(str(relative_path), pattern): should_skip = True break # Handle exact matches elif str(relative_path) == pattern: should_skip = True break if should_skip: continue files_scanned += 1 try: with open(file_path, encoding="utf-8", errors="ignore") as f: content = f.read() # Check frameworks from registry for fw_name, fw_config in FRAMEWORK_REGISTRY.items(): # Only check frameworks for this language if fw_config["language"] != language: continue if "import_patterns" in fw_config: for import_pattern in fw_config["import_patterns"]: if import_pattern in content: # Check if not already detected in this directory file_dir = file_path.parent.relative_to(self.project_path) dir_str = str(file_dir).replace("\\", "/") if file_dir != Path('.') else '.' if not any( fw["framework"] == fw_name and fw["language"] == language and fw.get("path", ".") == dir_str for fw in self.detected_frameworks ): self.detected_frameworks.append( { "framework": fw_name, "version": "unknown", "language": language, "path": dir_str, "source": "imports", } ) break except Exception: # Skip files that can't be read continue def _check_framework_files(self): """Check for framework-specific files.""" # Check all frameworks in registry for file markers for fw_name, fw_config in FRAMEWORK_REGISTRY.items(): if "file_markers" in fw_config: for file_marker in fw_config["file_markers"]: # Handle wildcard patterns if "*" in file_marker: # Use glob for wildcard patterns import glob pattern = str(self.project_path / file_marker) if glob.glob(pattern): # Check if not already detected if not any( fw["framework"] == fw_name and fw["language"] == fw_config["language"] for fw in self.detected_frameworks ): self.detected_frameworks.append( { "framework": fw_name, "version": "unknown", "language": fw_config["language"], "path": ".", # Framework files typically at root "source": "framework_files", } ) break else: # Direct file path if (self.project_path / file_marker).exists(): # Check if not already detected if not any( fw["framework"] == fw_name and fw["language"] == fw_config["language"] for fw in self.detected_frameworks ): self.detected_frameworks.append( { "framework": fw_name, "version": "unknown", "language": fw_config["language"], "path": ".", # Framework files typically at root "source": "framework_files", } ) break def _load_deps_cache(self): """Load TheAuditor's deps.json if available for version info.""" deps_file = self.project_path / ".pf" / "deps.json" if deps_file.exists(): try: with open(deps_file) as f: data = json.load(f) self.deps_cache = {} # Handle both old format (list) and new format (dict with "dependencies" key) if isinstance(data, list): deps_list = data else: deps_list = data.get("dependencies", []) for dep in deps_list: # Store by name for quick lookup self.deps_cache[dep["name"]] = dep except Exception as e: # Log the error but continue print(f"Warning: Could not load deps cache: {e}") pass def format_table(self) -> str: """Format detected frameworks as a table. Returns: Formatted table string. """ if not self.detected_frameworks: return "No frameworks detected." lines = [] lines.append("FRAMEWORK LANGUAGE PATH VERSION SOURCE") lines.append("-" * 80) imports_only = [] for fw in self.detected_frameworks: framework = fw["framework"][:18].ljust(18) language = fw["language"][:12].ljust(12) path = fw.get("path", ".")[:15].ljust(15) version = fw["version"][:15].ljust(15) source = fw["source"] lines.append(f"{framework} {language} {path} {version} {source}") # Track if any are from imports only if fw["source"] == "imports" and fw["version"] == "unknown": imports_only.append(fw["framework"]) # Add note if frameworks detected from imports without versions if imports_only: lines.append("\n" + "="*60) lines.append("NOTE: Frameworks marked with 'imports' source were detected from") lines.append("import statements in the codebase (possibly test files) but are") lines.append("not listed as dependencies. Version shown as 'unknown' because") lines.append("they are not in package.json, pyproject.toml, or requirements.txt.") return "\n".join(lines) def to_json(self) -> str: """Export detected frameworks to JSON. Returns: JSON string. """ return json.dumps(self.detected_frameworks, indent=2, sort_keys=True) def save_to_file(self, output_path: Path) -> None: """Save detected frameworks to a JSON file. Args: output_path: Path where the JSON file should be saved. """ output_path = Path(output_path) output_path.parent.mkdir(parents=True, exist_ok=True) output_path.write_text(self.to_json())