"""Dependency parser for multiple ecosystems."""

import glob
import http.client
import json
import platform
import re
import shutil
import time
import urllib.error
import yaml
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional
from theauditor.security import sanitize_path, sanitize_url_component, validate_package_name, SecurityError

# Detect if running on Windows for character encoding
IS_WINDOWS = platform.system() == "Windows"

# Rate limiting configuration - optimized for minimal runtime
# Based on actual API rate limits and industry standards
RATE_LIMIT_NPM = 0.1  # npm registry: 600 req/min (well under any limit)
RATE_LIMIT_PYPI = 0.2  # PyPI: 300 req/min (safe margin)
RATE_LIMIT_DOCKER = 0.2  # Docker Hub: 300 req/min for tag checks
RATE_LIMIT_BACKOFF = 15  # Backoff on 429/disconnect (15s gives APIs time to reset)


def parse_dependencies(root_path: str = ".") -> List[Dict[str, Any]]:
    """
    Parse dependencies from various package managers.

    Returns list of dependency objects with structure:
    {
        "name": str,
        "version": str,
        "manager": "npm"|"py"|"docker",
        "files": [paths that import it],
        "source": "package.json|pyproject.toml|requirements.txt|docker-compose.yml|Dockerfile"
    }
    """
    import os
    root = Path(root_path)
    deps = []

    # Debug mode
    debug = os.environ.get("THEAUDITOR_DEBUG")

    # Parse Node dependencies
    try:
        package_json = sanitize_path("package.json", root_path)
        if package_json.exists():
            if debug:
                print(f"Debug: Found {package_json}")
            deps.extend(_parse_package_json(package_json))
    except SecurityError as e:
        if debug:
            print(f"Debug: Security error checking package.json: {e}")

    # Parse Python dependencies
    try:
        pyproject = sanitize_path("pyproject.toml", root_path)
        if pyproject.exists():
            if debug:
                print(f"Debug: Found {pyproject}")
            deps.extend(_parse_pyproject_toml(pyproject))
    except SecurityError as e:
        if debug:
            print(f"Debug: Security error checking pyproject.toml: {e}")

    # Parse requirements files
    req_files = list(root.glob("requirements*.txt"))
    if debug and req_files:
        print(f"Debug: Found requirements files: {req_files}")
    for req_file in req_files:
        try:
            # Validate the path is within project root
            safe_req_file = sanitize_path(str(req_file), root_path)
            deps.extend(_parse_requirements_txt(safe_req_file))
        except SecurityError as e:
            if debug:
                print(f"Debug: Security error with {req_file}: {e}")

    # Parse Docker Compose files
    docker_compose_files = list(root.glob("docker-compose*.yml")) + list(root.glob("docker-compose*.yaml"))
    if debug and docker_compose_files:
        print(f"Debug: Found Docker Compose files: {docker_compose_files}")
    for compose_file in docker_compose_files:
        try:
            safe_compose_file = sanitize_path(str(compose_file), root_path)
            deps.extend(_parse_docker_compose(safe_compose_file))
        except SecurityError as e:
            if debug:
                print(f"Debug: Security error with {compose_file}: {e}")

    # Parse Dockerfiles
    dockerfiles = list(root.glob("**/Dockerfile"))
    if debug and dockerfiles:
        print(f"Debug: Found Dockerfiles: {dockerfiles}")
    for dockerfile in dockerfiles:
        try:
            safe_dockerfile = sanitize_path(str(dockerfile), root_path)
            deps.extend(_parse_dockerfile(safe_dockerfile))
        except SecurityError as e:
            if debug:
                print(f"Debug: Security error with {dockerfile}: {e}")

    if debug:
        print(f"Debug: Total dependencies found: {len(deps)}")

    return deps


def _parse_package_json(path: Path) -> List[Dict[str, Any]]:
    """Parse dependencies from package.json, with monorepo support."""
    deps = []
    processed_packages = set()  # Track processed packages to avoid duplicates

    def parse_single_package(pkg_path: Path, workspace_path: str = "package.json") -> List[Dict[str, Any]]:
        """Parse a single package.json file."""
        local_deps = []
        try:
            with open(pkg_path, encoding="utf-8") as f:
                data = json.load(f)

            # Combine dependencies and devDependencies
            all_deps = {}
            if "dependencies" in data:
                all_deps.update(data["dependencies"])
            if "devDependencies" in data:
                all_deps.update(data["devDependencies"])

            for name, version_spec in all_deps.items():
                # Clean version spec (remove ^, ~, >=, etc.)
                version = _clean_version(version_spec)
                local_deps.append({
                    "name": name,
                    "version": version,
                    "manager": "npm",
                    "files": [],  # Will be populated by workset scan
                    "source": "package.json",
                    "workspace_package": workspace_path  # Track which package.json this came from
                })
        except (json.JSONDecodeError, KeyError) as e:
            # Log but don't fail - package.json might be malformed
            print(f"Warning: Could not parse {pkg_path}: {e}")

        return local_deps

    # Parse the root package.json first
    root_dir = path.parent
    deps.extend(parse_single_package(path, "package.json"))
    processed_packages.add(str(path.resolve()))

    # Check for monorepo workspaces
    try:
        with open(path, encoding="utf-8") as f:
            data = json.load(f)

        # Check for workspaces field (Yarn/npm workspaces)
        workspaces = data.get("workspaces", [])

        # Handle different workspace formats
        if isinstance(workspaces, dict):
            # npm 7+ format: {"packages": ["packages/*"]}
            workspaces = workspaces.get("packages", [])

        if workspaces and isinstance(workspaces, list):
            # This is a monorepo - expand workspace patterns
            for pattern in workspaces:
                # Convert workspace pattern to absolute path pattern
                abs_pattern = str(root_dir / pattern)

                # Handle glob patterns like "packages/*" or "apps/**"
                if "*" in abs_pattern:
                    # Use glob to find matching directories
                    matched_paths = glob.glob(abs_pattern)

                    for matched_path in matched_paths:
                        matched_dir = Path(matched_path)
                        if matched_dir.is_dir():
                            # Look for package.json in this directory
                            workspace_pkg = matched_dir / "package.json"
                            if workspace_pkg.exists():
                                # Skip if already processed
                                if str(workspace_pkg.resolve()) in processed_packages:
                                    continue

                                # Calculate relative path for workspace_package field
                                try:
                                    rel_path = workspace_pkg.relative_to(root_dir)
                                    workspace_path = str(rel_path).replace("\\", "/")
                                except ValueError:
                                    # If relative path fails, use absolute path
                                    workspace_path = str(workspace_pkg)

                                # Parse this workspace package
                                workspace_deps = parse_single_package(workspace_pkg, workspace_path)
                                deps.extend(workspace_deps)
                                processed_packages.add(str(workspace_pkg.resolve()))
                else:
                    # Direct path without glob
                    workspace_dir = root_dir / pattern
                    if workspace_dir.is_dir():
                        workspace_pkg = workspace_dir / "package.json"
                        if workspace_pkg.exists():
                            # Skip if already processed
                            if str(workspace_pkg.resolve()) in processed_packages:
                                continue

                            # Calculate relative path for workspace_package field
                            try:
                                rel_path = workspace_pkg.relative_to(root_dir)
                                workspace_path = str(rel_path).replace("\\", "/")
                            except ValueError:
                                workspace_path = str(workspace_pkg)

                            # Parse this workspace package
                            workspace_deps = parse_single_package(workspace_pkg, workspace_path)
                            deps.extend(workspace_deps)
                            processed_packages.add(str(workspace_pkg.resolve()))

        # Also check for Lerna configuration (lerna.json)
        lerna_json = root_dir / "lerna.json"
        if lerna_json.exists():
            try:
                with open(lerna_json, encoding="utf-8") as f:
                    lerna_data = json.load(f)

                lerna_packages = lerna_data.get("packages", [])
                for pattern in lerna_packages:
                    abs_pattern = str(root_dir / pattern)
                    if "*" in abs_pattern:
                        matched_paths = glob.glob(abs_pattern)
                        for matched_path in matched_paths:
                            matched_dir = Path(matched_path)
                            if matched_dir.is_dir():
                                workspace_pkg = matched_dir / "package.json"
                                if workspace_pkg.exists() and str(workspace_pkg.resolve()) not in processed_packages:
                                    try:
                                        rel_path = workspace_pkg.relative_to(root_dir)
                                        workspace_path = str(rel_path).replace("\\", "/")
                                    except ValueError:
                                        workspace_path = str(workspace_pkg)

                                    workspace_deps = parse_single_package(workspace_pkg, workspace_path)
                                    deps.extend(workspace_deps)
                                    processed_packages.add(str(workspace_pkg.resolve()))
            except (json.JSONDecodeError, KeyError):
                # Lerna.json parsing failed, continue without it
                pass

        # Check for pnpm-workspace.yaml
        pnpm_workspace = root_dir / "pnpm-workspace.yaml"
        if pnpm_workspace.exists():
            try:
                with open(pnpm_workspace, encoding="utf-8") as f:
                    pnpm_data = yaml.safe_load(f)

                pnpm_packages = pnpm_data.get("packages", [])
                for pattern in pnpm_packages:
                    abs_pattern = str(root_dir / pattern)
                    if "*" in abs_pattern:
                        matched_paths = glob.glob(abs_pattern)
                        for matched_path in matched_paths:
                            matched_dir = Path(matched_path)
                            if matched_dir.is_dir():
                                workspace_pkg = matched_dir / "package.json"
                                if workspace_pkg.exists() and str(workspace_pkg.resolve()) not in processed_packages:
                                    try:
                                        rel_path = workspace_pkg.relative_to(root_dir)
                                        workspace_path = str(rel_path).replace("\\", "/")
                                    except ValueError:
                                        workspace_path = str(workspace_pkg)

                                    workspace_deps = parse_single_package(workspace_pkg, workspace_path)
                                    deps.extend(workspace_deps)
                                    processed_packages.add(str(workspace_pkg.resolve()))
            except (yaml.YAMLError, KeyError):
                # pnpm-workspace.yaml parsing failed, continue without it
                pass

    except (json.JSONDecodeError, KeyError) as e:
        # Root package.json parsing for workspaces failed, but we already have root deps
        pass

    return deps


def _parse_pyproject_toml(path: Path) -> List[Dict[str, Any]]:
    """Parse dependencies from pyproject.toml."""
    deps = []
    try:
        import tomllib
    except ImportError:
        # Python < 3.11
        try:
            import tomli as tomllib
        except ImportError:
            # Can't parse TOML without library
            print(f"Warning: Cannot parse {path} - tomllib not available")
            return deps

    try:
        with open(path, "rb") as f:
            data = tomllib.load(f)

        # Get project dependencies
        project_deps = data.get("project", {}).get("dependencies", [])
        for dep_spec in project_deps:
            name, version = _parse_python_dep_spec(dep_spec)
            if name:
                deps.append({
                    "name": name,
                    "version": version or "latest",
                    "manager": "py",
                    "files": [],
                    "source": "pyproject.toml"
                })

        # Also check optional dependencies
        optional = data.get("project", {}).get("optional-dependencies", {})
        for group_deps in optional.values():
            for dep_spec in group_deps:
                name, version = _parse_python_dep_spec(dep_spec)
                if name:
                    deps.append({
                        "name": name,
                        "version": version or "latest",
                        "manager": "py",
                        "files": [],
                        "source": "pyproject.toml"
                    })
    except Exception as e:
        print(f"Warning: Could not parse {path}: {e}")

    return deps


def _parse_requirements_txt(path: Path) -> List[Dict[str, Any]]:
    """Parse dependencies from requirements.txt."""
    deps = []
    try:
        with open(path, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                # Skip comments and empty lines
                if not line or line.startswith("#"):
                    continue
                # Skip special directives
                if line.startswith("-"):
                    continue

                # Strip inline comments and trailing whitespace
                if "#" in line:
                    line = line.split("#")[0].strip()

                name, version = _parse_python_dep_spec(line)
                if name:
                    deps.append({
                        "name": name,
                        "version": version or "latest",
                        "manager": "py",
                        "files": [],
                        "source": path.name
                    })
    except Exception as e:
        print(f"Warning: Could not parse {path}: {e}")

    return deps


def _parse_python_dep_spec(spec: str) -> tuple[str, Optional[str]]:
    """
    Parse a Python dependency specification.
    Returns (name, version) tuple.
    """
    # Handle various formats:
    # package==1.2.3
    # package>=1.2.3
    # package~=1.2.3
    # package[extra]==1.2.3
    # package @ git+https://...
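    #
    # Illustrative results (hypothetical inputs, not from the source):
    #   "requests>=2.31"   -> ("requests", "2.31")
    #   "pkg[extra]==1.0"  -> ("pkg", "1.0")
    #   "mylib @ git+https://example.com/repo.git" -> ("mylib", "git")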

    # Remove extras
    spec = re.sub(r'\[.*?\]', '', spec)

    # Handle git URLs
    if "@" in spec and ("git+" in spec or "https://" in spec):
        name = spec.split("@")[0].strip()
        return (name, "git")

    # Parse version specs (allow dots, underscores, hyphens in package names)
    match = re.match(r'^([a-zA-Z0-9._-]+)\s*([><=~!]+)\s*(.+)$', spec)
    if match:
        name, op, version = match.groups()
        # For pinned versions, use exact version
        if op == "==":
            return (name, version)
        # For other operators, use the specified version as hint
        return (name, version)

    # No version specified
    return (spec.strip(), None)


def _clean_version(version_spec: str) -> str:
    """
    Clean version specification to get actual version.
    ^1.2.3 -> 1.2.3
    ~1.2.3 -> 1.2.3
    >=1.2.3 -> 1.2.3
    """
    # Remove common prefixes
    version = re.sub(r'^[~^>=<]+', '', version_spec)
    # Handle ranges (use first version)
    if " " in version:
        version = version.split()[0]
    return version.strip()


def _parse_docker_compose(path: Path) -> List[Dict[str, Any]]:
    """Parse Docker base images from docker-compose.yml files."""
    deps = []
    try:
        with open(path, encoding="utf-8") as f:
            data = yaml.safe_load(f)

        # Check if services key exists
        if not data or "services" not in data:
            return deps

        # Iterate through services
        for service_name, service_config in data["services"].items():
            if not isinstance(service_config, dict):
                continue

            # Extract image if present
            if "image" in service_config:
                image_spec = service_config["image"]
                # Parse image:tag format
                if ":" in image_spec:
                    name, tag = image_spec.rsplit(":", 1)
                else:
                    name = image_spec
                    tag = "latest"

                # Handle registry prefixes (e.g., docker.io/library/postgres)
                if "/" in name:
                    # Take the last part as the image name
                    name_parts = name.split("/")
                    if len(name_parts) >= 2:
                        # If it's library/image, use just image
                        if name_parts[-2] == "library":
                            name = name_parts[-1]
                        else:
                            # Keep org/image format
                            name = "/".join(name_parts[-2:])

                deps.append({
                    "name": name,
                    "version": tag,
                    "manager": "docker",
                    "files": [],
                    "source": path.name
                })
    except (yaml.YAMLError, KeyError, AttributeError) as e:
        print(f"Warning: Could not parse {path}: {e}")

    return deps


def _parse_dockerfile(path: Path) -> List[Dict[str, Any]]:
    """Parse Docker base images from Dockerfile."""
    deps = []
    try:
        with open(path, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                # Look for FROM instructions
                if line.upper().startswith("FROM "):
                    # Extract image spec after FROM
                    image_spec = line[5:].strip()

                    # Handle multi-stage builds (FROM image AS stage);
                    # check the literal casing so the matching split actually applies
                    if " AS " in image_spec:
                        image_spec = image_spec.split(" AS ")[0].strip()
                    elif " as " in image_spec:
                        image_spec = image_spec.split(" as ")[0].strip()

                    # Skip scratch and build stages
                    if image_spec.lower() in ["scratch", "builder"]:
                        continue

                    # Parse image:tag format
                    if ":" in image_spec:
                        name, tag = image_spec.rsplit(":", 1)
                    else:
                        name = image_spec
                        tag = "latest"

                    # Handle registry prefixes
                    if "/" in name:
                        name_parts = name.split("/")
                        if len(name_parts) >= 2:
                            if name_parts[-2] == "library":
                                name = name_parts[-1]
                            else:
                                name = "/".join(name_parts[-2:])

                    deps.append({
                        "name": name,
                        "version": tag,
                        "manager": "docker",
                        "files": [],
                        "source": str(path.relative_to(Path.cwd()))
                    })
    except Exception as e:
        print(f"Warning: Could not parse {path}: {e}")

    return deps


def write_deps_json(deps: List[Dict[str, Any]], output_path: str = "./.pf/deps.json") -> None:
    """Write dependencies to JSON file."""
    try:
        output = sanitize_path(output_path, ".")
        output.parent.mkdir(parents=True, exist_ok=True)

        with open(output, "w", encoding="utf-8") as f:
            json.dump(deps, f, indent=2, sort_keys=True)
    except SecurityError as e:
        raise SecurityError(f"Invalid output path: {e}")


def check_latest_versions(
    deps: List[Dict[str, Any]],
    allow_net: bool = True,
    offline: bool = False,
    cache_file: str = "./.pf/deps_cache.json"
) -> Dict[str, Dict[str, Any]]:
    """
    Check latest versions from registries with caching.

    Returns dict keyed by "manager:name" with:
    {
        "locked": str,
        "latest": str,
        "delta": str,
        "is_outdated": bool,
        "last_checked": str (ISO timestamp)
    }
    """
    if offline or not allow_net:
        # Try to load from cache in offline mode
        cached_data = _load_deps_cache(cache_file)
        if cached_data:
            # Update locked versions from current deps
            for dep in deps:
                key = f"{dep['manager']}:{dep['name']}"
                if key in cached_data:
                    cached_data[key]["locked"] = dep["version"]
                    cached_data[key]["is_outdated"] = cached_data[key]["latest"] != dep["version"]
                    cached_data[key]["delta"] = _calculate_version_delta(dep["version"], cached_data[key]["latest"])
        return cached_data or {}

    # Load existing cache
    cache = _load_deps_cache(cache_file)
    latest_info = {}
    needs_check = []

    # FIRST PASS: Check what's in cache and still valid
    for dep in deps:
        key = f"{dep['manager']}:{dep['name']}"
        if key in latest_info:
            continue  # Already processed

        # Check if we have valid cached data (24 hours for deps)
        if key in cache and _is_cache_valid(cache[key], hours=24):
            # Update locked version from current deps
            cache[key]["locked"] = dep["version"]
            cache[key]["is_outdated"] = cache[key]["latest"] != dep["version"]
            cache[key]["delta"] = _calculate_version_delta(dep["version"], cache[key]["latest"])
            latest_info[key] = cache[key]
        else:
            needs_check.append(dep)

    # Early exit if everything is cached
    if not needs_check:
        return latest_info

    # SECOND PASS: Check only what needs updating, with per-service rate limiting
    npm_rate_limited_until = 0
    pypi_rate_limited_until = 0
    docker_rate_limited_until = 0

    for dep in needs_check:
        key = f"{dep['manager']}:{dep['name']}"
        current_time = time.time()

        # Skip if this service is rate limited
        if dep["manager"] == "npm" and current_time < npm_rate_limited_until:
            # Use cached data if available, even if expired
            if key in cache:
                latest_info[key] = cache[key]
            continue
        elif dep["manager"] == "py" and current_time < pypi_rate_limited_until:
            if key in cache:
                latest_info[key] = cache[key]
            continue
        elif dep["manager"] == "docker" and current_time < docker_rate_limited_until:
            if key in cache:
                latest_info[key] = cache[key]
            continue

        try:
            if dep["manager"] == "npm":
                latest = _check_npm_latest(dep["name"])
            elif dep["manager"] == "py":
                latest = _check_pypi_latest(dep["name"])
            elif dep["manager"] == "docker":
                latest = _check_dockerhub_latest(dep["name"])
            else:
                continue

            if latest:
                locked = dep["version"]
                delta = _calculate_version_delta(locked, latest)
                latest_info[key] = {
                    "locked": locked,
                    "latest": latest,
                    "delta": delta,
                    "is_outdated": locked != latest,
                    "last_checked": datetime.now().isoformat()
                }
            # Rate limiting: service-specific delays for optimal performance
            if dep["manager"] == "npm":
                time.sleep(RATE_LIMIT_NPM)  # 0.1s for npm
            elif dep["manager"] == "py":
                time.sleep(RATE_LIMIT_PYPI)  # 0.2s for PyPI
            elif dep["manager"] == "docker":
                time.sleep(RATE_LIMIT_DOCKER)  # 0.2s for Docker Hub
        except (urllib.error.URLError, urllib.error.HTTPError, http.client.RemoteDisconnected,
                TimeoutError, json.JSONDecodeError, KeyError, ValueError) as e:
            error_msg = f"{type(e).__name__}: {str(e)[:50]}"

            # Handle rate limiting and connection errors specifically
            if ("429" in str(e) or "rate" in str(e).lower() or
                    "RemoteDisconnected" in str(e) or "closed connection" in str(e).lower()):
                # Set rate limit expiry for this service
                if dep["manager"] == "npm":
                    npm_rate_limited_until = current_time + RATE_LIMIT_BACKOFF
                elif dep["manager"] == "py":
                    pypi_rate_limited_until = current_time + RATE_LIMIT_BACKOFF
                elif dep["manager"] == "docker":
                    docker_rate_limited_until = current_time + RATE_LIMIT_BACKOFF

            # Use cached data if available, even if expired
            if key in cache:
                latest_info[key] = cache[key]
                latest_info[key]["error"] = error_msg
            else:
                latest_info[key] = {
                    "locked": dep["version"],
                    "latest": None,
                    "delta": None,
                    "is_outdated": False,
                    "error": error_msg,
                    "last_checked": datetime.now().isoformat()
                }
            continue

    # Save updated cache
    _save_deps_cache(latest_info, cache_file)

    return latest_info


def _load_deps_cache(cache_file: str) -> Dict[str, Dict[str, Any]]:
    """
    Load the dependency cache from disk.
    Returns empty dict if cache doesn't exist or is invalid.
    """
    try:
        cache_path = Path(cache_file)
        if cache_path.exists():
            with open(cache_path, 'r', encoding='utf-8') as f:
                return json.load(f)
    except (json.JSONDecodeError, OSError):
        pass
    return {}


def _save_deps_cache(latest_info: Dict[str, Dict[str, Any]], cache_file: str) -> None:
    """
    Save the dependency cache to disk.
    Merges with existing cache to preserve data for packages not in current check.
    """
    try:
        cache_path = Path(cache_file)
        cache_path.parent.mkdir(parents=True, exist_ok=True)

        # Load existing cache to merge
        existing = _load_deps_cache(cache_file)

        # Merge new data into existing (new data takes precedence)
        existing.update(latest_info)

        # Write merged cache
        with open(cache_path, 'w', encoding='utf-8') as f:
            json.dump(existing, f, indent=2, sort_keys=True)
    except OSError:
        pass  # Fail silently if can't write cache


def _is_cache_valid(cached_item: Dict[str, Any], hours: int = 24) -> bool:
    """
    Check if a cached item is still valid based on age.
    Default is 24 hours for dependency version checks.
    """
    try:
        if "last_checked" not in cached_item:
            return False
        last_checked = datetime.fromisoformat(cached_item["last_checked"])
        age = datetime.now() - last_checked
        return age.total_seconds() < (hours * 3600)
    except (ValueError, KeyError):
        return False


def _check_npm_latest(package_name: str) -> Optional[str]:
    """Fetch latest version from npm registry."""
    import urllib.request
    import urllib.error

    # Validate and sanitize package name
    if not validate_package_name(package_name, "npm"):
        return None

    # URL-encode the package name for safety
    safe_package_name = sanitize_url_component(package_name)
    url = f"https://registry.npmjs.org/{safe_package_name}"
    try:
        with urllib.request.urlopen(url, timeout=10) as response:
            data = json.loads(response.read())
            return data.get("dist-tags", {}).get("latest")
    except (urllib.error.URLError, http.client.RemoteDisconnected, json.JSONDecodeError, KeyError):
        return None


def _check_pypi_latest(package_name: str) -> Optional[str]:
    """Fetch latest version from PyPI."""
    import urllib.request
    import urllib.error

    # Validate package name
    if not validate_package_name(package_name, "py"):
        return None

    # Normalize package name for PyPI (replace underscores with hyphens)
    normalized_name = package_name.replace('_', '-')
    # Sanitize for URL
    safe_package_name = sanitize_url_component(normalized_name)
    url = f"https://pypi.org/pypi/{safe_package_name}/json"
    try:
        with urllib.request.urlopen(url, timeout=10) as response:
            data = json.loads(response.read())
            return data.get("info", {}).get("version")
    except (urllib.error.URLError, http.client.RemoteDisconnected, json.JSONDecodeError, KeyError):
        return None


def _check_dockerhub_latest(image_name: str) -> Optional[str]:
    """Fetch latest version from Docker Hub."""
    import urllib.request
    import urllib.error

    # Validate image name
    if not validate_package_name(image_name, "docker"):
        return None

    # For official images, use library/ prefix
    if "/" not in image_name:
        image_name = f"library/{image_name}"

    # Sanitize image name for URL
    safe_image_name = sanitize_url_component(image_name)

    # Docker Hub API endpoint for tags
    url = f"https://hub.docker.com/v2/repositories/{safe_image_name}/tags"

    try:
        # Create request with proper headers
        req = urllib.request.Request(url)
        req.add_header('User-Agent', 'TheAuditor/0.1.0')

        with urllib.request.urlopen(req, timeout=10) as response:
            data = json.loads(response.read())

        # Parse the results to find latest stable version
        tags = data.get("results", [])
        if not tags:
            return None

        # Filter and sort tags to find the best "latest" version
        version_tags = []
        for tag in tags:
            tag_name = tag.get("name", "")
            # Skip non-version tags
            if tag_name in ["latest", "alpine", "slim", "bullseye", "bookworm"]:
                continue
            # Look for semantic version-like tags
            if re.match(r'^\d+(\.\d+)*', tag_name):
                version_tags.append(tag_name)

        if version_tags:
            # Sort versions (simple string sort for now)
            # More sophisticated version comparison could be added
            version_tags.sort(reverse=True)
            return version_tags[0]

        # Fallback to "latest" if no version tags found
        for tag in tags:
            if tag.get("name") == "latest":
                return "latest"

        return None

    except (urllib.error.URLError, http.client.RemoteDisconnected, json.JSONDecodeError, KeyError) as e:
        # Docker Hub API might require auth or have rate limits
        return None


def _calculate_version_delta(locked: str, latest: str) -> str:
    """
    Calculate semantic version delta.
    Returns: "major", "minor", "patch", "equal", or "unknown"
    """
    try:
        locked_parts = [int(x) for x in locked.split(".")[:3]]
        latest_parts = [int(x) for x in latest.split(".")[:3]]

        # Pad with zeros if needed
        while len(locked_parts) < 3:
            locked_parts.append(0)
        while len(latest_parts) < 3:
            latest_parts.append(0)

        if locked_parts == latest_parts:
            return "equal"
        elif latest_parts[0] > locked_parts[0]:
            return "major"
        elif latest_parts[1] > locked_parts[1]:
            return "minor"
        elif latest_parts[2] > locked_parts[2]:
            return "patch"
        else:
            return "unknown"  # locked is newer than latest?
    except (ValueError, IndexError):
        return "unknown"


def write_deps_latest_json(
    latest_info: Dict[str, Dict[str, Any]],
    output_path: str = "./.pf/deps_latest.json"
) -> None:
    """Write latest version info to JSON file."""
    try:
        output = sanitize_path(output_path, ".")
        output.parent.mkdir(parents=True, exist_ok=True)

        with open(output, "w", encoding="utf-8") as f:
            json.dump(latest_info, f, indent=2, sort_keys=True)
    except SecurityError as e:
        raise SecurityError(f"Invalid output path: {e}")


def upgrade_all_deps(
    root_path: str,
    latest_info: Dict[str, Dict[str, Any]],
    deps_list: List[Dict[str, Any]]
) -> Dict[str, int]:
    """
    YOLO MODE: Upgrade all dependencies to latest versions.
    Rewrites requirements.txt, package.json, and pyproject.toml with latest versions.

    Returns dict with counts of upgraded packages per file type.
    """
    import shutil
    from datetime import datetime

    root = Path(root_path)
    upgraded = {
        "requirements.txt": 0,
        "package.json": 0,
        "pyproject.toml": 0
    }

    # Group deps by source file
    deps_by_source = {}
    for dep in deps_list:
        source = dep.get("source", "")
        if source not in deps_by_source:
            deps_by_source[source] = []
        deps_by_source[source].append(dep)

    # Upgrade requirements*.txt files
    for req_file in root.glob("requirements*.txt"):
        if req_file.name in deps_by_source:
            count = _upgrade_requirements_txt(req_file, latest_info, deps_by_source[req_file.name])
            upgraded["requirements.txt"] += count

    # Upgrade package.json
    package_json = root / "package.json"
    if package_json.exists() and "package.json" in deps_by_source:
        count = _upgrade_package_json(package_json, latest_info, deps_by_source["package.json"])
        upgraded["package.json"] = count

    # Upgrade pyproject.toml
    pyproject = root / "pyproject.toml"
    if pyproject.exists() and "pyproject.toml" in deps_by_source:
        count = _upgrade_pyproject_toml(pyproject, latest_info, deps_by_source["pyproject.toml"])
        upgraded["pyproject.toml"] = count

    return upgraded


def _upgrade_requirements_txt(
    path: Path,
    latest_info: Dict[str, Dict[str, Any]],
    deps: List[Dict[str, Any]]
) -> int:
    """Upgrade a requirements.txt file to latest versions."""
    # Sanitize path
    try:
        safe_path = sanitize_path(str(path), ".")
    except SecurityError:
        return 0  # Skip files outside project root

    # Create backup
    backup_path = safe_path.with_suffix(safe_path.suffix + ".bak")
    shutil.copy2(safe_path, backup_path)

    # Read current file
    with open(safe_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    # Build package name to latest version map
    latest_versions = {}
    for dep in deps:
        key = f"py:{dep['name']}"
        if key in latest_info:
            latest_versions[dep['name']] = latest_info[key]['latest']

    # Rewrite lines with latest versions
    updated_lines = []
    count = 0

    for line in lines:
        original_line = line
        line = line.strip()

        # Skip comments and empty lines
        if not line or line.startswith("#") or line.startswith("-"):
            updated_lines.append(original_line)
            continue

        # Parse package name
        name, _ = _parse_python_dep_spec(line)

        if name and name in latest_versions:
            # Replace with latest version
            updated_lines.append(f"{name}=={latest_versions[name]}\n")
            count += 1
        else:
            updated_lines.append(original_line)

    # Write updated file
    with open(safe_path, "w", encoding="utf-8") as f:
        f.writelines(updated_lines)

    return count


def _upgrade_package_json(
    path: Path,
    latest_info: Dict[str, Dict[str, Any]],
    deps: List[Dict[str, Any]]
) -> int:
    """Upgrade package.json to latest versions."""
    import shutil

    # Sanitize path
    try:
        safe_path = sanitize_path(str(path), ".")
    except SecurityError:
        return 0  # Skip files outside project root

    # Create backup
    backup_path = safe_path.with_suffix(safe_path.suffix + ".bak")
    shutil.copy2(safe_path, backup_path)

    # Read current file
    with open(safe_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    count = 0

    # Update dependencies
    if "dependencies" in data:
        for name in data["dependencies"]:
            key = f"npm:{name}"
            if key in latest_info:
                data["dependencies"][name] = latest_info[key]["latest"]
                count += 1

    # Update devDependencies
    if "devDependencies" in data:
        for name in data["devDependencies"]:
            key = f"npm:{name}"
            if key in latest_info:
                data["devDependencies"][name] = latest_info[key]["latest"]
                count += 1

    # Write updated file
    with open(safe_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)
        f.write("\n")  # Add trailing newline

    return count


def _upgrade_pyproject_toml(
    path: Path,
    latest_info: Dict[str, Dict[str, Any]],
    deps: List[Dict[str, Any]]
) -> int:
    """Upgrade pyproject.toml to latest versions - handles ALL sections."""
    import shutil
    import re

    # Sanitize path
    try:
        safe_path = sanitize_path(str(path), ".")
    except SecurityError:
        return 0  # Skip files outside project root

    # Create backup
    backup_path = safe_path.with_suffix(safe_path.suffix + ".bak")
    shutil.copy2(safe_path, backup_path)

    # Read entire file as string for regex replacement
    with open(safe_path, "r", encoding="utf-8") as f:
        content = f.read()

    count = 0
    updated_packages = {}  # Track all updates: package -> [(old, new)]

    # For each package in latest_info
    for key, info in latest_info.items():
        if not key.startswith("py:"):
            continue

        package_name = key[3:]  # Remove "py:" prefix
        latest_version = info.get("latest")

        if not latest_version:
            continue

        # Pattern to match this package anywhere in the file
        # Matches: "package==X.Y.Z" with any version number
        pattern = rf'"{package_name}==([^"]+)"'

        # Replace ALL occurrences at once using re.sub with a function
        def replacer(match):
            old_version = match.group(1)
            if old_version != latest_version:
                # Track the update
                if package_name not in updated_packages:
                    updated_packages[package_name] = []
                updated_packages[package_name].append((old_version, latest_version))
                return f'"{package_name}=={latest_version}"'
            return match.group(0)  # No change

        # Replace all occurrences in one pass
        new_content = re.sub(pattern, replacer, content)

        # Update count only if package was actually updated
        if package_name in updated_packages and content != new_content:
            count += 1
            content = new_content

    # Write updated content
    with open(safe_path, "w", encoding="utf-8") as f:
        f.write(content)

    # Report what was updated
    total_occurrences = 0
    # Use ASCII characters on Windows
    check_mark = "[OK]" if IS_WINDOWS else "✓"
    arrow = "->" if IS_WINDOWS else "→"
    for package, updates in updated_packages.items():
        total_occurrences += len(updates)
        if len(updates) == 1:
            print(f" {check_mark} {package}: {updates[0][0]} {arrow} {updates[0][1]}")
        else:
            print(f" {check_mark} {package}: {updates[0][0]} {arrow} {updates[0][1]} ({len(updates)} occurrences)")

    # Return total occurrences updated, not just unique packages
    return total_occurrences