Auditor/theauditor/docs_fetch.py

"""Documentation fetcher for version-correct package docs."""
import hashlib
import json
import re
import time
import urllib.error
import urllib.request
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional
from theauditor.security import sanitize_path, sanitize_url_component, validate_package_name, SecurityError
# Default allowlist for registries
DEFAULT_ALLOWLIST = [
"https://registry.npmjs.org/",
"https://pypi.org/", # Allow both API and web scraping
"https://raw.githubusercontent.com/",
"https://readthedocs.io/",
"https://readthedocs.org/",
]
# Rate limiting configuration - optimized for minimal runtime
RATE_LIMIT_DELAY = 0.15 # Delay after each successful fetch (balanced for npm/PyPI)
RATE_LIMIT_BACKOFF = 15 # Backoff on 429/disconnect (15s gives APIs time to reset)
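# Rough pacing (a sketch, not a guarantee): sleeping 0.15 s per successful fetch
# caps sustained throughput at roughly 6-7 requests per second; in practice the
# 7-day cache check and the 429 backoff below are what keep a full run inside
# the registries' per-minute quotas.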
def fetch_docs(
deps: List[Dict[str, Any]],
allow_net: bool = True,
allowlist: Optional[List[str]] = None,
offline: bool = False,
output_dir: str = "./.pf/context/docs"
) -> Dict[str, Any]:
"""
Fetch version-correct documentation for dependencies.
Args:
deps: List of dependency objects from deps.py
allow_net: Whether network access is allowed
allowlist: List of allowed URL prefixes (uses DEFAULT_ALLOWLIST if None)
offline: Force offline mode
output_dir: Base directory for cached docs
Returns:
Summary of fetch operations
"""
if offline or not allow_net:
return {
"mode": "offline",
"fetched": 0,
"cached": 0,
"skipped": len(deps),
"errors": []
}
if allowlist is None:
allowlist = DEFAULT_ALLOWLIST
try:
output_path = sanitize_path(output_dir, ".")
output_path.mkdir(parents=True, exist_ok=True)
except SecurityError as e:
return {
"mode": "error",
"error": f"Invalid output directory: {e}",
"fetched": 0,
"cached": 0,
"skipped": len(deps)
}
stats = {
"mode": "online",
"fetched": 0,
"cached": 0,
"skipped": 0,
"errors": []
}
# FIRST PASS: Check what's cached
needs_fetch = []
for dep in deps:
# Quick cache check without network
cache_result = _check_cache_for_dep(dep, output_path)
if cache_result["cached"]:
stats["cached"] += 1
else:
needs_fetch.append(dep)
# Early exit if everything is cached
if not needs_fetch:
return stats
# SECOND PASS: Fetch only what we need, with per-service rate limiting
npm_rate_limited_until = 0
pypi_rate_limited_until = 0
for i, dep in enumerate(needs_fetch):
try:
current_time = time.time()
# Check if this service is rate limited
if dep["manager"] == "npm" and current_time < npm_rate_limited_until:
stats["skipped"] += 1
stats["errors"].append(f"{dep['name']}: Skipped (npm rate limited)")
continue
elif dep["manager"] == "py" and current_time < pypi_rate_limited_until:
stats["skipped"] += 1
stats["errors"].append(f"{dep['name']}: Skipped (PyPI rate limited)")
continue
# Fetch the documentation
if dep["manager"] == "npm":
result = _fetch_npm_docs(dep, output_path, allowlist)
elif dep["manager"] == "py":
result = _fetch_pypi_docs(dep, output_path, allowlist)
else:
stats["skipped"] += 1
continue
if result["status"] == "fetched":
stats["fetched"] += 1
# Rate limiting: delay after each successful fetch to be server-friendly
# (npm and PyPI both have rate limits: npm ~100/min, PyPI ~60/min)
time.sleep(RATE_LIMIT_DELAY)
elif result["status"] == "cached":
stats["cached"] += 1 # Shouldn't happen here but handle it
elif result.get("reason") == "rate_limited":
stats["errors"].append(f"{dep['name']}: Rate limited - backing off {RATE_LIMIT_BACKOFF}s")
stats["skipped"] += 1
# Set rate limit expiry for this service
if dep["manager"] == "npm":
npm_rate_limited_until = time.time() + RATE_LIMIT_BACKOFF
elif dep["manager"] == "py":
pypi_rate_limited_until = time.time() + RATE_LIMIT_BACKOFF
else:
stats["skipped"] += 1
except Exception as e:
error_msg = str(e)
if "429" in error_msg or "rate" in error_msg.lower():
stats["errors"].append(f"{dep['name']}: Rate limited - backing off {RATE_LIMIT_BACKOFF}s")
# Set rate limit expiry for this service
if dep["manager"] == "npm":
npm_rate_limited_until = time.time() + RATE_LIMIT_BACKOFF
elif dep["manager"] == "py":
pypi_rate_limited_until = time.time() + RATE_LIMIT_BACKOFF
else:
stats["errors"].append(f"{dep['name']}: {error_msg}")
return stats
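# Example (illustrative sketch): how fetch_docs is typically driven. The dict
# shape ({"name", "version", "manager"}) mirrors what the loop above reads from
# each dep; the package names below are assumptions, not part of this module.
#
#     deps = [
#         {"name": "express", "version": "4.18.2", "manager": "npm"},
#         {"name": "requests", "version": "2.31.0", "manager": "py"},
#     ]
#     summary = fetch_docs(deps, allow_net=True, output_dir="./.pf/context/docs")
#     print(summary["fetched"], summary["cached"], summary["skipped"], summary["errors"])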
def _check_cache_for_dep(dep: Dict[str, Any], output_dir: Path) -> Dict[str, bool]:
"""
Quick cache check for a dependency without making network calls.
Returns {"cached": True/False}
"""
name = dep["name"]
version = dep["version"]
manager = dep["manager"]
# Build the cache file path
if manager == "npm":
# Handle git versions
if version.startswith("git") or "://" in version:
version_hash = hashlib.md5(version.encode()).hexdigest()[:8]
safe_version = f"git-{version_hash}"
else:
safe_version = version.replace(":", "_").replace("/", "_").replace("\\", "_")
safe_name = name.replace("@", "_at_").replace("/", "_")
pkg_dir = output_dir / "npm" / f"{safe_name}@{safe_version}"
elif manager == "py":
safe_version = version.replace(":", "_").replace("/", "_").replace("\\", "_")
safe_name = name.replace("/", "_").replace("\\", "_")
pkg_dir = output_dir / "py" / f"{safe_name}@{safe_version}"
else:
return {"cached": False}
doc_file = pkg_dir / "doc.md"
meta_file = pkg_dir / "meta.json"
# Check cache validity
if doc_file.exists() and meta_file.exists():
try:
with open(meta_file, encoding="utf-8") as f:
meta = json.load(f)
# Cache for 7 days
last_checked = datetime.fromisoformat(meta["last_checked"])
if (datetime.now() - last_checked).days < 7:
return {"cached": True}
except (json.JSONDecodeError, KeyError, ValueError, OSError):
pass
return {"cached": False}
def _fetch_npm_docs(
dep: Dict[str, Any],
output_dir: Path,
allowlist: List[str]
) -> Dict[str, Any]:
"""Fetch documentation for an npm package."""
name = dep["name"]
version = dep["version"]
# Validate package name
if not validate_package_name(name, "npm"):
return {"status": "skipped", "reason": "Invalid package name"}
# Sanitize version for filesystem (handle git URLs)
if version.startswith("git") or "://" in version:
# For git dependencies, use a hash of the URL as version
version_hash = hashlib.md5(version.encode()).hexdigest()[:8]
safe_version = f"git-{version_hash}"
else:
# For normal versions, just replace problematic characters
safe_version = version.replace(":", "_").replace("/", "_").replace("\\", "_")
# Create package-specific directory with sanitized name
# Replace @ and / in scoped packages for filesystem safety
safe_name = name.replace("@", "_at_").replace("/", "_")
try:
pkg_dir = output_dir / "npm" / f"{safe_name}@{safe_version}"
pkg_dir.mkdir(parents=True, exist_ok=True)
except (OSError, SecurityError) as e:
return {"status": "error", "error": f"Cannot create package directory: {e}"}
doc_file = pkg_dir / "doc.md"
meta_file = pkg_dir / "meta.json"
# Check cache
if doc_file.exists() and meta_file.exists():
# Check if cache is still valid (simple time-based for now)
try:
with open(meta_file, encoding="utf-8") as f:
meta = json.load(f)
# Cache for 7 days
last_checked = datetime.fromisoformat(meta["last_checked"])
if (datetime.now() - last_checked).days < 7:
return {"status": "cached"}
except (json.JSONDecodeError, KeyError, ValueError, OSError):
pass # Invalid cache, refetch
# Fetch from registry with sanitized package name
safe_url_name = sanitize_url_component(name)
safe_url_version = sanitize_url_component(version)
url = f"https://registry.npmjs.org/{safe_url_name}/{safe_url_version}"
if not _is_url_allowed(url, allowlist):
return {"status": "skipped", "reason": "URL not in allowlist"}
try:
with urllib.request.urlopen(url, timeout=10) as response:
data = json.loads(response.read())
readme = data.get("readme", "")
repository = data.get("repository", {})
homepage = data.get("homepage", "")
# Priority 1: Try to get README from GitHub if available
github_fetched = False
if isinstance(repository, dict):
repo_url = repository.get("url", "")
github_readme = _fetch_github_readme(repo_url, allowlist)
if github_readme and len(github_readme) > 500: # Only use if substantial
readme = github_readme
github_fetched = True
# Priority 2: If no good GitHub README, try homepage if it's GitHub
if not github_fetched and homepage and "github.com" in homepage:
github_readme = _fetch_github_readme(homepage, allowlist)
if github_readme and len(github_readme) > 500:
readme = github_readme
github_fetched = True
# Priority 3: Fall back to the npm README, enhancing it if it's too short
if not github_fetched and len(readme) < 500:
# The npm README is too short, try to enhance it
readme = _enhance_npm_readme(data, readme)
# Write documentation
with open(doc_file, "w", encoding="utf-8") as f:
f.write(f"# {name}@{version}\n\n")
f.write(f"**Package**: [{name}](https://www.npmjs.com/package/{name})\n")
f.write(f"**Version**: {version}\n")
if homepage:
f.write(f"**Homepage**: {homepage}\n")
f.write("\n---\n\n")
f.write(readme)
# Add an install snippet when the README has no usage/example section of its own
if "## Usage" not in readme and "## Example" not in readme:
f.write(f"\n\n## Installation\n\n```bash\nnpm install {name}\n```\n")
# Write metadata
meta = {
"source_url": url,
"last_checked": datetime.now().isoformat(),
"etag": response.headers.get("ETag"),
"repository": repository,
"from_github": github_fetched
}
with open(meta_file, "w", encoding="utf-8") as f:
json.dump(meta, f, indent=2)
return {"status": "fetched"}
except urllib.error.HTTPError as e:
if e.code == 429:
return {"status": "error", "reason": "rate_limited", "error": "HTTP 429: Rate limited"}
return {"status": "error", "error": f"HTTP {e.code}: {str(e)}"}
except (urllib.error.URLError, json.JSONDecodeError) as e:
return {"status": "error", "error": str(e)}
def _fetch_pypi_docs(
dep: Dict[str, Any],
output_dir: Path,
allowlist: List[str]
) -> Dict[str, Any]:
"""Fetch documentation for a PyPI package."""
name = dep["name"].strip() # Strip any whitespace from name
version = dep["version"]
# Validate package name
if not validate_package_name(name, "py"):
return {"status": "skipped", "reason": "Invalid package name"}
# Sanitize package name for URL
safe_url_name = sanitize_url_component(name)
# Handle special versions
if version in ["latest", "git"]:
# For latest, fetch current version first
if version == "latest":
url = f"https://pypi.org/pypi/{safe_url_name}/json"
else:
return {"status": "skipped", "reason": "git dependency"}
else:
safe_url_version = sanitize_url_component(version)
url = f"https://pypi.org/pypi/{safe_url_name}/{safe_url_version}/json"
if not _is_url_allowed(url, allowlist):
return {"status": "skipped", "reason": "URL not in allowlist"}
# Sanitize version for filesystem
safe_version = version.replace(":", "_").replace("/", "_").replace("\\", "_")
# Create package-specific directory with sanitized name
safe_name = name.replace("/", "_").replace("\\", "_")
try:
pkg_dir = output_dir / "py" / f"{safe_name}@{safe_version}"
pkg_dir.mkdir(parents=True, exist_ok=True)
except (OSError, SecurityError) as e:
return {"status": "error", "error": f"Cannot create package directory: {e}"}
doc_file = pkg_dir / "doc.md"
meta_file = pkg_dir / "meta.json"
# Check cache
if doc_file.exists() and meta_file.exists():
try:
with open(meta_file, encoding="utf-8") as f:
meta = json.load(f)
last_checked = datetime.fromisoformat(meta["last_checked"])
if (datetime.now() - last_checked).days < 7:
return {"status": "cached"}
except (json.JSONDecodeError, KeyError, ValueError, OSError):
pass
try:
with urllib.request.urlopen(url, timeout=10) as response:
data = json.loads(response.read())
info = data.get("info", {})
description = info.get("description", "")
summary = info.get("summary", "")
# Priority 1: Try to get README from project URLs (GitHub, GitLab, etc.)
github_fetched = False
project_urls = info.get("project_urls") or {}  # project_urls can be null in the PyPI JSON
# Check all possible URL sources for GitHub
all_urls = [proj_url for proj_url in project_urls.values() if proj_url]
# Also check home_page and download_url
home_page = info.get("home_page", "")
if home_page:
all_urls.append(home_page)
download_url = info.get("download_url", "")
if download_url:
all_urls.append(download_url)
# Try GitHub first (use a distinct loop variable so the registry `url` used in
# meta.json below isn't clobbered)
for candidate_url in all_urls:
if "github.com" in candidate_url.lower():
github_readme = _fetch_github_readme(candidate_url, allowlist)
if github_readme and len(github_readme) > 500:
description = github_readme
github_fetched = True
break
# Priority 2: Try ReadTheDocs if available
if not github_fetched:
for candidate_url in all_urls:
if "readthedocs" in candidate_url.lower():
rtd_content = _fetch_readthedocs(candidate_url, allowlist)
if rtd_content and len(rtd_content) > 500:
description = rtd_content
github_fetched = True # Mark as fetched from external source
break
# Priority 3: Try to scrape PyPI web page (not API) for full README
if not github_fetched and len(description) < 1000:
pypi_readme = _fetch_pypi_web_readme(name, version, allowlist)
if pypi_readme and len(pypi_readme) > len(description):
description = pypi_readme
github_fetched = True # Mark as fetched from external source
# Priority 4: Use PyPI description (often contains full README)
# PyPI descriptions can be quite good if properly uploaded
if not github_fetched and len(description) < 500 and summary:
# If description is too short, enhance it
description = _enhance_pypi_description(info, description, summary)
# Write documentation
with open(doc_file, "w", encoding="utf-8") as f:
f.write(f"# {name}@{version}\n\n")
f.write(f"**Package**: [{name}](https://pypi.org/project/{name}/)\n")
f.write(f"**Version**: {version}\n")
# Add project URLs if available
if project_urls:
f.write("\n**Links**:\n")
for key, link_url in list(project_urls.items())[:5]: # Limit to 5
if link_url:
f.write(f"- {key}: {link_url}\n")
f.write("\n---\n\n")
# Add summary if different from description
if summary and summary not in description:
f.write(f"**Summary**: {summary}\n\n")
f.write(description)
# Add installation instructions if not in description
if "pip install" not in description.lower():
f.write(f"\n\n## Installation\n\n```bash\npip install {name}\n```\n")
# Add basic usage if really minimal docs
if len(description) < 200:
f.write(f"\n\n## Basic Usage\n\n```python\nimport {name.replace('-', '_')}\n```\n")
# Write metadata
meta = {
"source_url": url,
"last_checked": datetime.now().isoformat(),
"etag": response.headers.get("ETag"),
"project_urls": project_urls,
"from_github": github_fetched
}
with open(meta_file, "w", encoding="utf-8") as f:
json.dump(meta, f, indent=2)
return {"status": "fetched"}
except urllib.error.HTTPError as e:
if e.code == 429:
return {"status": "error", "reason": "rate_limited", "error": "HTTP 429: Rate limited"}
return {"status": "error", "error": f"HTTP {e.code}: {str(e)}"}
except (urllib.error.URLError, json.JSONDecodeError) as e:
return {"status": "error", "error": str(e)}
def _fetch_github_readme(repo_url: str, allowlist: List[str]) -> Optional[str]:
"""
Fetch the README from a GitHub repository.
Converts the repository URL into a raw.githubusercontent.com URL for the README.
"""
if not repo_url:
return None
# Extract owner/repo from various GitHub URL formats
patterns = [
r'github\.com[:/]([^/]+)/([^/\s]+)',
r'git\+https://github\.com/([^/]+)/([^/\s]+)',
]
for pattern in patterns:
match = re.search(pattern, repo_url)
if match:
owner, repo = match.groups()
# Clean repo name (strip only a trailing .git)
repo = re.sub(r"\.git$", "", repo)
# Try common README filenames
readme_files = ["README.md", "readme.md", "README.rst", "README.txt"]
# Sanitize owner and repo for URL
safe_owner = sanitize_url_component(owner)
safe_repo = sanitize_url_component(repo)
for readme_name in readme_files:
safe_readme = sanitize_url_component(readme_name)
raw_url = f"https://raw.githubusercontent.com/{safe_owner}/{safe_repo}/main/{safe_readme}"
if not _is_url_allowed(raw_url, allowlist):
continue
try:
with urllib.request.urlopen(raw_url, timeout=5) as response:
return response.read().decode("utf-8")
except urllib.error.HTTPError:
# Try master branch
raw_url = f"https://raw.githubusercontent.com/{safe_owner}/{safe_repo}/master/{safe_readme}"
try:
with urllib.request.urlopen(raw_url, timeout=5) as response:
return response.read().decode("utf-8")
except urllib.error.URLError:
continue
except urllib.error.URLError:
continue
return None
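# Example of the URL rewriting above (owner/repo are illustrative):
#
#     "git+https://github.com/expressjs/express.git"
#         -> https://raw.githubusercontent.com/expressjs/express/main/README.md
#         -> https://raw.githubusercontent.com/expressjs/express/master/README.md  (on HTTP error)
#
# Only the main and master branches are tried, so repos whose default branch has
# a different name fall through and return None.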
def _is_url_allowed(url: str, allowlist: List[str]) -> bool:
"""Check if URL is in the allowlist."""
for allowed in allowlist:
if url.startswith(allowed):
return True
return False
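# Prefix-matching examples against DEFAULT_ALLOWLIST:
#
#     _is_url_allowed("https://pypi.org/pypi/requests/json", DEFAULT_ALLOWLIST)         -> True
#     _is_url_allowed("https://requests.readthedocs.io/en/latest/", DEFAULT_ALLOWLIST)  -> False
#
# Note the second case: per-project ReadTheDocs subdomains do not start with the
# bare readthedocs.io / readthedocs.org prefixes, so _fetch_readthedocs only runs
# for URLs covered by whatever allowlist the caller passes in.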
def _enhance_npm_readme(data: Dict[str, Any], readme: str) -> str:
"""Enhance minimal npm README with package metadata."""
enhanced = readme if readme else ""
# Add description if not in README
description = data.get("description", "")
if description and description not in enhanced:
enhanced = f"{description}\n\n{enhanced}"
# Add keywords
keywords = data.get("keywords", [])
if keywords and "keywords" not in enhanced.lower():
enhanced += f"\n\n## Keywords\n\n{', '.join(keywords)}"
# Add main entry point info
main = data.get("main", "")
if main:
enhanced += f"\n\n## Entry Point\n\nMain file: `{main}`"
# Add dependencies info if substantial
deps = data.get("dependencies", {})
if len(deps) > 0 and len(deps) <= 10: # Only if reasonable number
enhanced += "\n\n## Dependencies\n\n"
for dep, ver in deps.items():
enhanced += f"- {dep}: {ver}\n"
return enhanced
def _fetch_readthedocs(url: str, allowlist: List[str]) -> Optional[str]:
"""
Fetch documentation from ReadTheDocs.
Tries to get the main index page content.
"""
if not url or not _is_url_allowed(url, allowlist):
return None
# Normalize to a trailing slash before appending the version path
if not url.endswith("/"):
url += "/"
# Try to fetch the main page
try:
# Add en/latest if not already in URL
if "/en/latest" not in url and "/en/stable" not in url:
url = url.rstrip("/") + "/en/latest/"
with urllib.request.urlopen(url, timeout=10) as response:
html_content = response.read().decode("utf-8")
# Basic HTML to markdown conversion (very simplified)
# Remove script and style tags
html_content = re.sub(r'<script[^>]*>.*?</script>', '', html_content, flags=re.DOTALL)
html_content = re.sub(r'<style[^>]*>.*?</style>', '', html_content, flags=re.DOTALL)
# Extract main content (look for common RTD content divs)
content_match = re.search(r'<div[^>]*class="[^"]*document[^"]*"[^>]*>(.*?)</div>', html_content, re.DOTALL)
if content_match:
html_content = content_match.group(1)
# Convert basic HTML tags to markdown
html_content = re.sub(r'<h1[^>]*>(.*?)</h1>', r'# \1\n', html_content)
html_content = re.sub(r'<h2[^>]*>(.*?)</h2>', r'## \1\n', html_content)
html_content = re.sub(r'<h3[^>]*>(.*?)</h3>', r'### \1\n', html_content)
html_content = re.sub(r'<code[^>]*>(.*?)</code>', r'`\1`', html_content)
html_content = re.sub(r'<pre[^>]*>(.*?)</pre>', r'```\n\1\n```', html_content, flags=re.DOTALL)
html_content = re.sub(r'<p[^>]*>(.*?)</p>', r'\1\n\n', html_content)
html_content = re.sub(r'<a[^>]*href="([^"]*)"[^>]*>(.*?)</a>', r'[\2](\1)', html_content)
html_content = re.sub(r'<[^>]+>', '', html_content) # Remove remaining HTML tags
# Clean up whitespace
html_content = re.sub(r'\n{3,}', '\n\n', html_content)
return html_content.strip()
except Exception:
return None
def _fetch_pypi_web_readme(name: str, version: str, allowlist: List[str]) -> Optional[str]:
"""
Fetch the rendered README from PyPI's web interface.
The web interface shows the full README that's often missing from the API.
"""
# Validate package name
if not validate_package_name(name, "py"):
return None
# Sanitize for URL
safe_name = sanitize_url_component(name)
safe_version = sanitize_url_component(version)
# PyPI web URLs
urls_to_try = [
f"https://pypi.org/project/{safe_name}/{safe_version}/",
f"https://pypi.org/project/{safe_name}/"
]
for url in urls_to_try:
if not _is_url_allowed(url, allowlist):
continue
try:
req = urllib.request.Request(url, headers={
'User-Agent': 'Mozilla/5.0 (compatible; TheAuditor/1.0)'
})
with urllib.request.urlopen(req, timeout=10) as response:
html_content = response.read().decode("utf-8")
# Look for the project description div
# PyPI uses a specific class for the README content
readme_match = re.search(
r'<div[^>]*class="[^"]*project-description[^"]*"[^>]*>(.*?)</div>',
html_content,
re.DOTALL | re.IGNORECASE
)
if not readme_match:
# Try alternative patterns
readme_match = re.search(
r'<div[^>]*class="[^"]*description[^"]*"[^>]*>(.*?)</div>',
html_content,
re.DOTALL | re.IGNORECASE
)
if readme_match:
readme_html = readme_match.group(1)
# Convert HTML to markdown (simplified)
# Headers
readme_html = re.sub(r'<h1[^>]*>(.*?)</h1>', r'# \1\n', readme_html, flags=re.IGNORECASE)
readme_html = re.sub(r'<h2[^>]*>(.*?)</h2>', r'## \1\n', readme_html, flags=re.IGNORECASE)
readme_html = re.sub(r'<h3[^>]*>(.*?)</h3>', r'### \1\n', readme_html, flags=re.IGNORECASE)
# Code blocks
readme_html = re.sub(r'<pre[^>]*><code[^>]*>(.*?)</code></pre>', r'```\n\1\n```', readme_html, flags=re.DOTALL | re.IGNORECASE)
readme_html = re.sub(r'<code[^>]*>(.*?)</code>', r'`\1`', readme_html, flags=re.IGNORECASE)
# Lists
readme_html = re.sub(r'<li[^>]*>(.*?)</li>', r'- \1\n', readme_html, flags=re.IGNORECASE)
# Links
readme_html = re.sub(r'<a[^>]*href="([^"]*)"[^>]*>(.*?)</a>', r'[\2](\1)', readme_html, flags=re.IGNORECASE)
# Paragraphs and line breaks
readme_html = re.sub(r'<p[^>]*>(.*?)</p>', r'\1\n\n', readme_html, flags=re.DOTALL | re.IGNORECASE)
readme_html = re.sub(r'<br[^>]*>', '\n', readme_html, flags=re.IGNORECASE)
# Remove remaining HTML tags
readme_html = re.sub(r'<[^>]+>', '', readme_html)
# Decode HTML entities
readme_html = readme_html.replace('&lt;', '<')
readme_html = readme_html.replace('&gt;', '>')
readme_html = readme_html.replace('&amp;', '&')
readme_html = readme_html.replace('&quot;', '"')
readme_html = readme_html.replace('&#39;', "'")
# Clean up whitespace
readme_html = re.sub(r'\n{3,}', '\n\n', readme_html)
readme_html = readme_html.strip()
if len(readme_html) > 100: # Only return if we got substantial content
return readme_html
except Exception:
continue
return None
def _enhance_pypi_description(info: Dict[str, Any], description: str, summary: str) -> str:
"""Enhance minimal PyPI description with package metadata."""
enhanced = description if description else ""
# Start with summary if description is empty
if not enhanced and summary:
enhanced = f"{summary}\n\n"
# Add author info
author = info.get("author", "")
author_email = info.get("author_email", "")
if author and "author" not in enhanced.lower():
author_info = f"\n\n## Author\n\n{author}"
if author_email:
author_info += f" ({author_email})"
enhanced += author_info
# Add license
license_info = info.get("license", "")
if license_info and "license" not in enhanced.lower():
enhanced += f"\n\n## License\n\n{license_info}"
# Add classifiers (limited)
classifiers = info.get("classifiers", [])
relevant_classifiers = [
c for c in classifiers
if "Programming Language" in c or "Framework" in c or "Topic" in c
][:5] # Limit to 5
if relevant_classifiers:
enhanced += "\n\n## Classifiers\n\n"
for classifier in relevant_classifiers:
enhanced += f"- {classifier}\n"
# Add requires_python if specified
requires_python = info.get("requires_python", "")
if requires_python:
enhanced += f"\n\n## Python Version\n\nRequires Python {requires_python}"
return enhanced
def check_latest(
deps: List[Dict[str, Any]],
allow_net: bool = True,
offline: bool = False,
output_path: str = "./.pf/deps_latest.json"
) -> Dict[str, Any]:
"""
Check latest versions and compare to locked versions.
This is a wrapper around deps.check_latest_versions for consistency.
"""
from .deps import check_latest_versions, write_deps_latest_json
if offline or not allow_net:
return {
"mode": "offline",
"checked": 0,
"outdated": 0
}
latest_info = check_latest_versions(deps, allow_net=allow_net, offline=offline)
if latest_info:
# Sanitize output path before writing
try:
safe_output_path = str(sanitize_path(output_path, "."))
write_deps_latest_json(latest_info, safe_output_path)
except SecurityError as e:
return {
"mode": "error",
"error": f"Invalid output path: {e}",
"checked": 0,
"outdated": 0
}
outdated = sum(1 for info in latest_info.values() if info["is_outdated"])
return {
"mode": "online",
"checked": len(latest_info),
"outdated": outdated,
"output": output_path
}
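# Example (illustrative sketch): checking for outdated pins with the same dep
# dict shape used by fetch_docs above.
#
#     result = check_latest(deps, allow_net=True)
#     if result["mode"] == "online":
#         print(f"{result['outdated']} of {result['checked']} packages are outdated")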