Initial commit: TheAuditor v1.0.1 - AI-centric SAST and Code Intelligence Platform
theauditor/docs_fetch.py (new file, 793 lines)
"""Documentation fetcher for version-correct package docs."""
|
||||
|
||||
import json
|
||||
import re
|
||||
import time
|
||||
import urllib.error
|
||||
import urllib.request
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
from theauditor.security import sanitize_path, sanitize_url_component, validate_package_name, SecurityError
|
||||
|
||||
|
||||
# Default allowlist for registries
|
||||
DEFAULT_ALLOWLIST = [
|
||||
"https://registry.npmjs.org/",
|
||||
"https://pypi.org/", # Allow both API and web scraping
|
||||
"https://raw.githubusercontent.com/",
|
||||
"https://readthedocs.io/",
|
||||
"https://readthedocs.org/",
|
||||
]
|
||||
|
||||
# Rate limiting configuration - optimized for minimal runtime
|
||||
RATE_LIMIT_DELAY = 0.15 # Average delay between requests (balanced for npm/PyPI)
|
||||
RATE_LIMIT_BACKOFF = 15 # Backoff on 429/disconnect (15s gives APIs time to reset)
|
||||
|
||||
|
||||
def fetch_docs(
    deps: List[Dict[str, Any]],
    allow_net: bool = True,
    allowlist: Optional[List[str]] = None,
    offline: bool = False,
    output_dir: str = "./.pf/context/docs"
) -> Dict[str, Any]:
    """
    Fetch version-correct documentation for dependencies.

    Args:
        deps: List of dependency objects from deps.py
        allow_net: Whether network access is allowed
        allowlist: List of allowed URL prefixes (uses DEFAULT_ALLOWLIST if None)
        offline: Force offline mode
        output_dir: Base directory for cached docs

    Returns:
        Summary of fetch operations
    """
    if offline or not allow_net:
        return {
            "mode": "offline",
            "fetched": 0,
            "cached": 0,
            "skipped": len(deps),
            "errors": []
        }

    if allowlist is None:
        allowlist = DEFAULT_ALLOWLIST

    try:
        output_path = sanitize_path(output_dir, ".")
        output_path.mkdir(parents=True, exist_ok=True)
    except SecurityError as e:
        return {
            "mode": "error",
            "error": f"Invalid output directory: {e}",
            "fetched": 0,
            "cached": 0,
            "skipped": len(deps)
        }

    stats = {
        "mode": "online",
        "fetched": 0,
        "cached": 0,
        "skipped": 0,
        "errors": []
    }

    # FIRST PASS: Check what's cached
    needs_fetch = []
    for dep in deps:
        # Quick cache check without network
        cache_result = _check_cache_for_dep(dep, output_path)
        if cache_result["cached"]:
            stats["cached"] += 1
        else:
            needs_fetch.append(dep)

    # Early exit if everything is cached
    if not needs_fetch:
        return stats

    # SECOND PASS: Fetch only what we need, with per-service rate limiting
    npm_rate_limited_until = 0
    pypi_rate_limited_until = 0

    for i, dep in enumerate(needs_fetch):
        try:
            current_time = time.time()

            # Check if this service is rate limited
            if dep["manager"] == "npm" and current_time < npm_rate_limited_until:
                stats["skipped"] += 1
                stats["errors"].append(f"{dep['name']}: Skipped (npm rate limited)")
                continue
            elif dep["manager"] == "py" and current_time < pypi_rate_limited_until:
                stats["skipped"] += 1
                stats["errors"].append(f"{dep['name']}: Skipped (PyPI rate limited)")
                continue

            # Fetch the documentation
            if dep["manager"] == "npm":
                result = _fetch_npm_docs(dep, output_path, allowlist)
            elif dep["manager"] == "py":
                result = _fetch_pypi_docs(dep, output_path, allowlist)
            else:
                stats["skipped"] += 1
                continue

            if result["status"] == "fetched":
                stats["fetched"] += 1
                # Rate limiting: delay after successful fetch to be server-friendly
                # npm and PyPI both have rate limits (npm: 100/min, PyPI: 60/min)
                time.sleep(RATE_LIMIT_DELAY)  # Be server-friendly
            elif result["status"] == "cached":
                stats["cached"] += 1  # Shouldn't happen here but handle it
            elif result.get("reason") == "rate_limited":
                stats["errors"].append(f"{dep['name']}: Rate limited - backing off {RATE_LIMIT_BACKOFF}s")
                stats["skipped"] += 1
                # Set rate limit expiry for this service
                if dep["manager"] == "npm":
                    npm_rate_limited_until = time.time() + RATE_LIMIT_BACKOFF
                elif dep["manager"] == "py":
                    pypi_rate_limited_until = time.time() + RATE_LIMIT_BACKOFF
            else:
                stats["skipped"] += 1

        except Exception as e:
            error_msg = str(e)
            if "429" in error_msg or "rate" in error_msg.lower():
                stats["errors"].append(f"{dep['name']}: Rate limited - backing off {RATE_LIMIT_BACKOFF}s")
                # Set rate limit expiry for this service
                if dep["manager"] == "npm":
                    npm_rate_limited_until = time.time() + RATE_LIMIT_BACKOFF
                elif dep["manager"] == "py":
                    pypi_rate_limited_until = time.time() + RATE_LIMIT_BACKOFF
            else:
                stats["errors"].append(f"{dep['name']}: {error_msg}")

    return stats

def _check_cache_for_dep(dep: Dict[str, Any], output_dir: Path) -> Dict[str, bool]:
    """
    Quick cache check for a dependency without making network calls.
    Returns {"cached": True/False}
    """
    name = dep["name"]
    version = dep["version"]
    manager = dep["manager"]

    # Build the cache file path
    if manager == "npm":
        # Handle git versions
        if version.startswith("git") or "://" in version:
            import hashlib
            version_hash = hashlib.md5(version.encode()).hexdigest()[:8]
            safe_version = f"git-{version_hash}"
        else:
            safe_version = version.replace(":", "_").replace("/", "_").replace("\\", "_")
        safe_name = name.replace("@", "_at_").replace("/", "_")
        pkg_dir = output_dir / "npm" / f"{safe_name}@{safe_version}"
    elif manager == "py":
        safe_version = version.replace(":", "_").replace("/", "_").replace("\\", "_")
        safe_name = name.replace("/", "_").replace("\\", "_")
        pkg_dir = output_dir / "py" / f"{safe_name}@{safe_version}"
    else:
        return {"cached": False}

    doc_file = pkg_dir / "doc.md"
    meta_file = pkg_dir / "meta.json"

    # Check cache validity
    if doc_file.exists() and meta_file.exists():
        try:
            with open(meta_file, encoding="utf-8") as f:
                meta = json.load(f)
            # Cache for 7 days
            last_checked = datetime.fromisoformat(meta["last_checked"])
            if (datetime.now() - last_checked).days < 7:
                return {"cached": True}
        except (json.JSONDecodeError, KeyError):
            pass

    return {"cached": False}

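# Illustrative note (an assumption derived from the path-building code above,
# not part of the original file): cached docs live under
# <output_dir>/<manager>/<safe_name>@<safe_version>/, for example
#   .pf/context/docs/npm/_at_types_node@20.1.0/doc.md   (scoped npm package)
#   .pf/context/docs/py/requests@2.31.0/doc.md
# alongside a meta.json whose "last_checked" timestamp drives the 7-day TTL.
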
def _fetch_npm_docs(
    dep: Dict[str, Any],
    output_dir: Path,
    allowlist: List[str]
) -> Dict[str, Any]:
    """Fetch documentation for an npm package."""
    name = dep["name"]
    version = dep["version"]

    # Validate package name
    if not validate_package_name(name, "npm"):
        return {"status": "skipped", "reason": "Invalid package name"}

    # Sanitize version for filesystem (handle git URLs)
    if version.startswith("git") or "://" in version:
        # For git dependencies, use a hash of the URL as version
        import hashlib
        version_hash = hashlib.md5(version.encode()).hexdigest()[:8]
        safe_version = f"git-{version_hash}"
    else:
        # For normal versions, just replace problematic characters
        safe_version = version.replace(":", "_").replace("/", "_").replace("\\", "_")

    # Create package-specific directory with sanitized name
    # Replace @ and / in scoped packages for filesystem safety
    safe_name = name.replace("@", "_at_").replace("/", "_")
    try:
        pkg_dir = output_dir / "npm" / f"{safe_name}@{safe_version}"
        pkg_dir.mkdir(parents=True, exist_ok=True)
    except (OSError, SecurityError) as e:
        return {"status": "error", "error": f"Cannot create package directory: {e}"}

    doc_file = pkg_dir / "doc.md"
    meta_file = pkg_dir / "meta.json"

    # Check cache
    if doc_file.exists() and meta_file.exists():
        # Check if cache is still valid (simple time-based for now)
        try:
            with open(meta_file, encoding="utf-8") as f:
                meta = json.load(f)
            # Cache for 7 days
            last_checked = datetime.fromisoformat(meta["last_checked"])
            if (datetime.now() - last_checked).days < 7:
                return {"status": "cached"}
        except (json.JSONDecodeError, KeyError):
            pass  # Invalid cache, refetch

    # Fetch from registry with sanitized package name
    safe_url_name = sanitize_url_component(name)
    safe_url_version = sanitize_url_component(version)
    url = f"https://registry.npmjs.org/{safe_url_name}/{safe_url_version}"
    if not _is_url_allowed(url, allowlist):
        return {"status": "skipped", "reason": "URL not in allowlist"}

    try:
        with urllib.request.urlopen(url, timeout=10) as response:
            data = json.loads(response.read())

            readme = data.get("readme", "")
            repository = data.get("repository", {})
            homepage = data.get("homepage", "")

            # Priority 1: Try to get README from GitHub if available
            github_fetched = False
            if isinstance(repository, dict):
                repo_url = repository.get("url", "")
                github_readme = _fetch_github_readme(repo_url, allowlist)
                if github_readme and len(github_readme) > 500:  # Only use if substantial
                    readme = github_readme
                    github_fetched = True

            # Priority 2: If no good GitHub README, try homepage if it's GitHub
            if not github_fetched and homepage and "github.com" in homepage:
                github_readme = _fetch_github_readme(homepage, allowlist)
                if github_readme and len(github_readme) > 500:
                    readme = github_readme
                    github_fetched = True

            # Priority 3: Use npm README if it's substantial
            if not github_fetched and len(readme) < 500:
                # The npm README is too short, try to enhance it
                readme = _enhance_npm_readme(data, readme)

            # Write documentation
            with open(doc_file, "w", encoding="utf-8") as f:
                f.write(f"# {name}@{version}\n\n")
                f.write(f"**Package**: [{name}](https://www.npmjs.com/package/{name})\n")
                f.write(f"**Version**: {version}\n")
                if homepage:
                    f.write(f"**Homepage**: {homepage}\n")
                f.write("\n---\n\n")
                f.write(readme)

                # Add usage examples if not in README
                if "## Usage" not in readme and "## Example" not in readme:
                    f.write("\n\n## Installation\n\n```bash\nnpm install {name}\n```\n".format(name=name))

            # Write metadata
            meta = {
                "source_url": url,
                "last_checked": datetime.now().isoformat(),
                "etag": response.headers.get("ETag"),
                "repository": repository,
                "from_github": github_fetched
            }
            with open(meta_file, "w", encoding="utf-8") as f:
                json.dump(meta, f, indent=2)

            return {"status": "fetched"}

    except urllib.error.HTTPError as e:
        if e.code == 429:
            return {"status": "error", "reason": "rate_limited", "error": "HTTP 429: Rate limited"}
        return {"status": "error", "error": f"HTTP {e.code}: {str(e)}"}
    except (urllib.error.URLError, json.JSONDecodeError) as e:
        return {"status": "error", "error": str(e)}

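# Illustrative shape of the doc.md produced above (derived from the f.write
# calls; the package name, version, and homepage are hypothetical):
#
#   # left-pad@1.3.0
#
#   **Package**: [left-pad](https://www.npmjs.com/package/left-pad)
#   **Version**: 1.3.0
#   **Homepage**: https://example.com   (only written when the registry reports one)
#
#   ---
#
#   <README content, preferring a substantial GitHub README when one was found>
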
def _fetch_pypi_docs(
    dep: Dict[str, Any],
    output_dir: Path,
    allowlist: List[str]
) -> Dict[str, Any]:
    """Fetch documentation for a PyPI package."""
    name = dep["name"].strip()  # Strip any whitespace from name
    version = dep["version"]

    # Validate package name
    if not validate_package_name(name, "py"):
        return {"status": "skipped", "reason": "Invalid package name"}

    # Sanitize package name for URL
    safe_url_name = sanitize_url_component(name)

    # Handle special versions
    if version in ["latest", "git"]:
        # For latest, fetch current version first
        if version == "latest":
            url = f"https://pypi.org/pypi/{safe_url_name}/json"
        else:
            return {"status": "skipped", "reason": "git dependency"}
    else:
        safe_url_version = sanitize_url_component(version)
        url = f"https://pypi.org/pypi/{safe_url_name}/{safe_url_version}/json"

    if not _is_url_allowed(url, allowlist):
        return {"status": "skipped", "reason": "URL not in allowlist"}

    # Sanitize version for filesystem
    safe_version = version.replace(":", "_").replace("/", "_").replace("\\", "_")

    # Create package-specific directory with sanitized name
    safe_name = name.replace("/", "_").replace("\\", "_")
    try:
        pkg_dir = output_dir / "py" / f"{safe_name}@{safe_version}"
        pkg_dir.mkdir(parents=True, exist_ok=True)
    except (OSError, SecurityError) as e:
        return {"status": "error", "error": f"Cannot create package directory: {e}"}

    doc_file = pkg_dir / "doc.md"
    meta_file = pkg_dir / "meta.json"

    # Check cache
    if doc_file.exists() and meta_file.exists():
        try:
            with open(meta_file, encoding="utf-8") as f:
                meta = json.load(f)
            last_checked = datetime.fromisoformat(meta["last_checked"])
            if (datetime.now() - last_checked).days < 7:
                return {"status": "cached"}
        except (json.JSONDecodeError, KeyError):
            pass

    try:
        with urllib.request.urlopen(url, timeout=10) as response:
            data = json.loads(response.read())

            info = data.get("info", {})
            description = info.get("description", "")
            summary = info.get("summary", "")

            # Priority 1: Try to get README from project URLs (GitHub, GitLab, etc.)
            github_fetched = False
            project_urls = info.get("project_urls", {})

            # Check all possible URL sources for GitHub
            all_urls = []
            for key, proj_url in project_urls.items():
                if proj_url:
                    all_urls.append(proj_url)

            # Also check home_page and download_url
            home_page = info.get("home_page", "")
            if home_page:
                all_urls.append(home_page)
            download_url = info.get("download_url", "")
            if download_url:
                all_urls.append(download_url)

            # Try GitHub first (use a distinct loop variable so the registry
            # `url` recorded in the metadata below is not clobbered)
            for candidate_url in all_urls:
                if "github.com" in candidate_url.lower():
                    github_readme = _fetch_github_readme(candidate_url, allowlist)
                    if github_readme and len(github_readme) > 500:
                        description = github_readme
                        github_fetched = True
                        break

            # Priority 2: Try ReadTheDocs if available
            if not github_fetched:
                for candidate_url in all_urls:
                    if "readthedocs" in candidate_url.lower():
                        rtd_content = _fetch_readthedocs(candidate_url, allowlist)
                        if rtd_content and len(rtd_content) > 500:
                            description = rtd_content
                            github_fetched = True  # Mark as fetched from external source
                            break

            # Priority 3: Try to scrape PyPI web page (not API) for full README
            if not github_fetched and len(description) < 1000:
                pypi_readme = _fetch_pypi_web_readme(name, version, allowlist)
                if pypi_readme and len(pypi_readme) > len(description):
                    description = pypi_readme
                    github_fetched = True  # Mark as fetched from external source

            # Priority 4: Use PyPI description (often contains full README)
            # PyPI descriptions can be quite good if properly uploaded
            if not github_fetched and len(description) < 500 and summary:
                # If description is too short, enhance it
                description = _enhance_pypi_description(info, description, summary)

            # Write documentation
            with open(doc_file, "w", encoding="utf-8") as f:
                f.write(f"# {name}@{version}\n\n")
                f.write(f"**Package**: [{name}](https://pypi.org/project/{name}/)\n")
                f.write(f"**Version**: {version}\n")

                # Add project URLs if available
                if project_urls:
                    f.write("\n**Links**:\n")
                    for key, link in list(project_urls.items())[:5]:  # Limit to 5
                        if link:
                            f.write(f"- {key}: {link}\n")

                f.write("\n---\n\n")

                # Add summary if different from description
                if summary and summary not in description:
                    f.write(f"**Summary**: {summary}\n\n")

                f.write(description)

                # Add installation instructions if not in description
                if "pip install" not in description.lower():
                    f.write(f"\n\n## Installation\n\n```bash\npip install {name}\n```\n")

                # Add basic usage if really minimal docs
                if len(description) < 200:
                    f.write(f"\n\n## Basic Usage\n\n```python\nimport {name.replace('-', '_')}\n```\n")

            # Write metadata
            meta = {
                "source_url": url,
                "last_checked": datetime.now().isoformat(),
                "etag": response.headers.get("ETag"),
                "project_urls": project_urls,
                "from_github": github_fetched
            }
            with open(meta_file, "w", encoding="utf-8") as f:
                json.dump(meta, f, indent=2)

            return {"status": "fetched"}

    except urllib.error.HTTPError as e:
        if e.code == 429:
            return {"status": "error", "reason": "rate_limited", "error": "HTTP 429: Rate limited"}
        return {"status": "error", "error": f"HTTP {e.code}: {str(e)}"}
    except (urllib.error.URLError, json.JSONDecodeError) as e:
        return {"status": "error", "error": str(e)}

def _fetch_github_readme(repo_url: str, allowlist: List[str]) -> Optional[str]:
    """
    Fetch README from GitHub repository.
    Converts repository URL to raw GitHub URL for README.
    """
    if not repo_url:
        return None

    # Extract owner/repo from various GitHub URL formats
    patterns = [
        r'github\.com[:/]([^/]+)/([^/\s]+)',
        r'git\+https://github\.com/([^/]+)/([^/\s]+)',
    ]

    for pattern in patterns:
        match = re.search(pattern, repo_url)
        if match:
            owner, repo = match.groups()
            # Clean repo name
            repo = repo.replace(".git", "")

            # Try common README filenames
            readme_files = ["README.md", "readme.md", "README.rst", "README.txt"]

            # Sanitize owner and repo for URL
            safe_owner = sanitize_url_component(owner)
            safe_repo = sanitize_url_component(repo)

            for readme_name in readme_files:
                safe_readme = sanitize_url_component(readme_name)
                raw_url = f"https://raw.githubusercontent.com/{safe_owner}/{safe_repo}/main/{safe_readme}"

                if not _is_url_allowed(raw_url, allowlist):
                    continue

                try:
                    with urllib.request.urlopen(raw_url, timeout=5) as response:
                        return response.read().decode("utf-8")
                except urllib.error.HTTPError:
                    # Try master branch
                    raw_url = f"https://raw.githubusercontent.com/{safe_owner}/{safe_repo}/master/{safe_readme}"
                    try:
                        with urllib.request.urlopen(raw_url, timeout=5) as response:
                            return response.read().decode("utf-8")
                    except urllib.error.URLError:
                        continue
                except urllib.error.URLError:
                    continue

    return None

def _is_url_allowed(url: str, allowlist: List[str]) -> bool:
    """Check if URL is in the allowlist."""
    for allowed in allowlist:
        if url.startswith(allowed):
            return True
    return False

def _enhance_npm_readme(data: Dict[str, Any], readme: str) -> str:
    """Enhance minimal npm README with package metadata."""
    enhanced = readme if readme else ""

    # Add description if not in README
    description = data.get("description", "")
    if description and description not in enhanced:
        enhanced = f"{description}\n\n{enhanced}"

    # Add keywords
    keywords = data.get("keywords", [])
    if keywords and "keywords" not in enhanced.lower():
        enhanced += f"\n\n## Keywords\n\n{', '.join(keywords)}"

    # Add main entry point info
    main = data.get("main", "")
    if main:
        enhanced += f"\n\n## Entry Point\n\nMain file: `{main}`"

    # Add dependencies info if substantial
    deps = data.get("dependencies", {})
    if len(deps) > 0 and len(deps) <= 10:  # Only if reasonable number
        enhanced += "\n\n## Dependencies\n\n"
        for dep, ver in deps.items():
            enhanced += f"- {dep}: {ver}\n"

    return enhanced

def _fetch_readthedocs(url: str, allowlist: List[str]) -> Optional[str]:
    """
    Fetch documentation from ReadTheDocs.
    Tries to get the main index page content.
    """
    if not url or not _is_url_allowed(url, allowlist):
        return None

    # Ensure we're getting the latest version
    if not url.endswith("/"):
        url += "/"

    # Try to fetch the main page
    try:
        # Add en/latest if not already in URL
        if "/en/latest" not in url and "/en/stable" not in url:
            url = url.rstrip("/") + "/en/latest/"

        with urllib.request.urlopen(url, timeout=10) as response:
            html_content = response.read().decode("utf-8")

        # Basic HTML to markdown conversion (very simplified)
        # Remove script and style tags
        html_content = re.sub(r'<script[^>]*>.*?</script>', '', html_content, flags=re.DOTALL)
        html_content = re.sub(r'<style[^>]*>.*?</style>', '', html_content, flags=re.DOTALL)

        # Extract main content (look for common RTD content divs)
        content_match = re.search(r'<div[^>]*class="[^"]*document[^"]*"[^>]*>(.*?)</div>', html_content, re.DOTALL)
        if content_match:
            html_content = content_match.group(1)

        # Convert basic HTML tags to markdown
        html_content = re.sub(r'<h1[^>]*>(.*?)</h1>', r'# \1\n', html_content)
        html_content = re.sub(r'<h2[^>]*>(.*?)</h2>', r'## \1\n', html_content)
        html_content = re.sub(r'<h3[^>]*>(.*?)</h3>', r'### \1\n', html_content)
        html_content = re.sub(r'<code[^>]*>(.*?)</code>', r'`\1`', html_content)
        html_content = re.sub(r'<pre[^>]*>(.*?)</pre>', r'```\n\1\n```', html_content, flags=re.DOTALL)
        html_content = re.sub(r'<p[^>]*>(.*?)</p>', r'\1\n\n', html_content)
        html_content = re.sub(r'<a[^>]*href="([^"]*)"[^>]*>(.*?)</a>', r'[\2](\1)', html_content)
        html_content = re.sub(r'<[^>]+>', '', html_content)  # Remove remaining HTML tags

        # Clean up whitespace
        html_content = re.sub(r'\n{3,}', '\n\n', html_content)

        return html_content.strip()
    except Exception:
        return None

def _fetch_pypi_web_readme(name: str, version: str, allowlist: List[str]) -> Optional[str]:
    """
    Fetch the rendered README from PyPI's web interface.
    The web interface shows the full README that's often missing from the API.
    """
    # Validate package name
    if not validate_package_name(name, "py"):
        return None

    # Sanitize for URL
    safe_name = sanitize_url_component(name)
    safe_version = sanitize_url_component(version)

    # PyPI web URLs
    urls_to_try = [
        f"https://pypi.org/project/{safe_name}/{safe_version}/",
        f"https://pypi.org/project/{safe_name}/"
    ]

    for url in urls_to_try:
        if not _is_url_allowed(url, allowlist):
            continue

        try:
            req = urllib.request.Request(url, headers={
                'User-Agent': 'Mozilla/5.0 (compatible; TheAuditor/1.0)'
            })
            with urllib.request.urlopen(req, timeout=10) as response:
                html_content = response.read().decode("utf-8")

            # Look for the project description div
            # PyPI uses a specific class for the README content
            readme_match = re.search(
                r'<div[^>]*class="[^"]*project-description[^"]*"[^>]*>(.*?)</div>',
                html_content,
                re.DOTALL | re.IGNORECASE
            )

            if not readme_match:
                # Try alternative patterns
                readme_match = re.search(
                    r'<div[^>]*class="[^"]*description[^"]*"[^>]*>(.*?)</div>',
                    html_content,
                    re.DOTALL | re.IGNORECASE
                )

            if readme_match:
                readme_html = readme_match.group(1)

                # Convert HTML to markdown (simplified)
                # Headers
                readme_html = re.sub(r'<h1[^>]*>(.*?)</h1>', r'# \1\n', readme_html, flags=re.IGNORECASE)
                readme_html = re.sub(r'<h2[^>]*>(.*?)</h2>', r'## \1\n', readme_html, flags=re.IGNORECASE)
                readme_html = re.sub(r'<h3[^>]*>(.*?)</h3>', r'### \1\n', readme_html, flags=re.IGNORECASE)

                # Code blocks
                readme_html = re.sub(r'<pre[^>]*><code[^>]*>(.*?)</code></pre>', r'```\n\1\n```', readme_html, flags=re.DOTALL | re.IGNORECASE)
                readme_html = re.sub(r'<code[^>]*>(.*?)</code>', r'`\1`', readme_html, flags=re.IGNORECASE)

                # Lists
                readme_html = re.sub(r'<li[^>]*>(.*?)</li>', r'- \1\n', readme_html, flags=re.IGNORECASE)

                # Links
                readme_html = re.sub(r'<a[^>]*href="([^"]*)"[^>]*>(.*?)</a>', r'[\2](\1)', readme_html, flags=re.IGNORECASE)

                # Paragraphs and line breaks
                readme_html = re.sub(r'<p[^>]*>(.*?)</p>', r'\1\n\n', readme_html, flags=re.DOTALL | re.IGNORECASE)
                readme_html = re.sub(r'<br[^>]*>', '\n', readme_html, flags=re.IGNORECASE)

                # Remove remaining HTML tags
                readme_html = re.sub(r'<[^>]+>', '', readme_html)

                # Decode common HTML entities
                readme_html = readme_html.replace("&lt;", "<")
                readme_html = readme_html.replace("&gt;", ">")
                readme_html = readme_html.replace("&amp;", "&")
                readme_html = readme_html.replace("&quot;", '"')
                readme_html = readme_html.replace("&#39;", "'")

                # Clean up whitespace
                readme_html = re.sub(r'\n{3,}', '\n\n', readme_html)
                readme_html = readme_html.strip()

                if len(readme_html) > 100:  # Only return if we got substantial content
                    return readme_html
        except Exception:
            continue

    return None

def _enhance_pypi_description(info: Dict[str, Any], description: str, summary: str) -> str:
    """Enhance minimal PyPI description with package metadata."""
    enhanced = description if description else ""

    # Start with summary if description is empty
    if not enhanced and summary:
        enhanced = f"{summary}\n\n"

    # Add author info
    author = info.get("author", "")
    author_email = info.get("author_email", "")
    if author and "author" not in enhanced.lower():
        author_info = f"\n\n## Author\n\n{author}"
        if author_email:
            author_info += f" ({author_email})"
        enhanced += author_info

    # Add license
    license_info = info.get("license", "")
    if license_info and "license" not in enhanced.lower():
        enhanced += f"\n\n## License\n\n{license_info}"

    # Add classifiers (limited)
    classifiers = info.get("classifiers", [])
    relevant_classifiers = [
        c for c in classifiers
        if "Programming Language" in c or "Framework" in c or "Topic" in c
    ][:5]  # Limit to 5
    if relevant_classifiers:
        enhanced += "\n\n## Classifiers\n\n"
        for classifier in relevant_classifiers:
            enhanced += f"- {classifier}\n"

    # Add requires_python if specified
    requires_python = info.get("requires_python", "")
    if requires_python:
        enhanced += f"\n\n## Python Version\n\nRequires Python {requires_python}"

    return enhanced

def check_latest(
    deps: List[Dict[str, Any]],
    allow_net: bool = True,
    offline: bool = False,
    output_path: str = "./.pf/deps_latest.json"
) -> Dict[str, Any]:
    """
    Check latest versions and compare to locked versions.

    This is a wrapper around deps.check_latest_versions for consistency.
    """
    from .deps import check_latest_versions, write_deps_latest_json

    if offline or not allow_net:
        return {
            "mode": "offline",
            "checked": 0,
            "outdated": 0
        }

    latest_info = check_latest_versions(deps, allow_net=allow_net, offline=offline)

    if latest_info:
        # Sanitize output path before writing
        try:
            safe_output_path = str(sanitize_path(output_path, "."))
            write_deps_latest_json(latest_info, safe_output_path)
        except SecurityError as e:
            return {
                "mode": "error",
                "error": f"Invalid output path: {e}",
                "checked": 0,
                "outdated": 0
            }

    outdated = sum(1 for info in latest_info.values() if info["is_outdated"])

    return {
        "mode": "online",
        "checked": len(latest_info),
        "outdated": outdated,
        "output": output_path
    }
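

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module). fetch_docs()
# expects dependency dicts shaped like those produced by deps.py, i.e. with
# "name", "version", and "manager" ("npm" or "py") keys; the sample entries
# below are hypothetical.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    sample_deps = [
        {"name": "requests", "version": "2.31.0", "manager": "py"},
        {"name": "left-pad", "version": "1.3.0", "manager": "npm"},
    ]
    # Run in offline mode so this sketch has no network side effects;
    # pass allow_net=True (the default) to actually fetch and cache docs.
    summary = fetch_docs(sample_deps, offline=True)
    print(json.dumps(summary, indent=2))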