mirror of https://github.com/aljazceru/Auditor.git (synced 2025-12-17 03:24:18 +01:00)
794 lines, 29 KiB, Python
"""Documentation fetcher for version-correct package docs."""
|
|
|
|
import json
|
|
import re
|
|
import time
|
|
import urllib.error
|
|
import urllib.request
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List, Optional
|
|
from theauditor.security import sanitize_path, sanitize_url_component, validate_package_name, SecurityError
|
|
|
|
|
|
# Default allowlist for registries
|
|
DEFAULT_ALLOWLIST = [
|
|
"https://registry.npmjs.org/",
|
|
"https://pypi.org/", # Allow both API and web scraping
|
|
"https://raw.githubusercontent.com/",
|
|
"https://readthedocs.io/",
|
|
"https://readthedocs.org/",
|
|
]
|
|
|
|
# Rate limiting configuration - optimized for minimal runtime
|
|
RATE_LIMIT_DELAY = 0.15 # Average delay between requests (balanced for npm/PyPI)
|
|
RATE_LIMIT_BACKOFF = 15 # Backoff on 429/disconnect (15s gives APIs time to reset)
|
|
|
|
|
|
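
# Illustrative arithmetic only, not part of the fetcher: sleeping RATE_LIMIT_DELAY
# seconds after each successful fetch caps the pace at roughly 1 / 0.15, i.e. about
# 6-7 requests per second, and a single 429 pushes the affected registry back by
# RATE_LIMIT_BACKOFF seconds. The helper name below is hypothetical; the numbers
# come only from the constants above.
def _approx_requests_per_minute(delay_s: float = RATE_LIMIT_DELAY) -> float:
    """Rough upper bound on the sustained request rate implied by the delay."""
    return 60.0 / delay_s if delay_s > 0 else float("inf")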

def fetch_docs(
    deps: List[Dict[str, Any]],
    allow_net: bool = True,
    allowlist: Optional[List[str]] = None,
    offline: bool = False,
    output_dir: str = "./.pf/context/docs"
) -> Dict[str, Any]:
    """
    Fetch version-correct documentation for dependencies.

    Args:
        deps: List of dependency objects from deps.py
        allow_net: Whether network access is allowed
        allowlist: List of allowed URL prefixes (uses DEFAULT_ALLOWLIST if None)
        offline: Force offline mode
        output_dir: Base directory for cached docs

    Returns:
        Summary of fetch operations
    """
    if offline or not allow_net:
        return {
            "mode": "offline",
            "fetched": 0,
            "cached": 0,
            "skipped": len(deps),
            "errors": []
        }

    if allowlist is None:
        allowlist = DEFAULT_ALLOWLIST

    try:
        output_path = sanitize_path(output_dir, ".")
        output_path.mkdir(parents=True, exist_ok=True)
    except SecurityError as e:
        return {
            "mode": "error",
            "error": f"Invalid output directory: {e}",
            "fetched": 0,
            "cached": 0,
            "skipped": len(deps)
        }

    stats = {
        "mode": "online",
        "fetched": 0,
        "cached": 0,
        "skipped": 0,
        "errors": []
    }

    # FIRST PASS: Check what's cached
    needs_fetch = []
    for dep in deps:
        # Quick cache check without network
        cache_result = _check_cache_for_dep(dep, output_path)
        if cache_result["cached"]:
            stats["cached"] += 1
        else:
            needs_fetch.append(dep)

    # Early exit if everything is cached
    if not needs_fetch:
        return stats

    # SECOND PASS: Fetch only what we need, with per-service rate limiting
    npm_rate_limited_until = 0
    pypi_rate_limited_until = 0

    for dep in needs_fetch:
        try:
            current_time = time.time()

            # Check if this service is rate limited
            if dep["manager"] == "npm" and current_time < npm_rate_limited_until:
                stats["skipped"] += 1
                stats["errors"].append(f"{dep['name']}: Skipped (npm rate limited)")
                continue
            elif dep["manager"] == "py" and current_time < pypi_rate_limited_until:
                stats["skipped"] += 1
                stats["errors"].append(f"{dep['name']}: Skipped (PyPI rate limited)")
                continue

            # Fetch the documentation
            if dep["manager"] == "npm":
                result = _fetch_npm_docs(dep, output_path, allowlist)
            elif dep["manager"] == "py":
                result = _fetch_pypi_docs(dep, output_path, allowlist)
            else:
                stats["skipped"] += 1
                continue

            if result["status"] == "fetched":
                stats["fetched"] += 1
                # Rate limiting: delay after successful fetch to be server-friendly
                # npm and PyPI both have rate limits (npm: 100/min, PyPI: 60/min)
                time.sleep(RATE_LIMIT_DELAY)
            elif result["status"] == "cached":
                stats["cached"] += 1  # Shouldn't happen here but handle it
            elif result.get("reason") == "rate_limited":
                stats["errors"].append(f"{dep['name']}: Rate limited - backing off {RATE_LIMIT_BACKOFF}s")
                stats["skipped"] += 1
                # Set rate limit expiry for this service
                if dep["manager"] == "npm":
                    npm_rate_limited_until = time.time() + RATE_LIMIT_BACKOFF
                elif dep["manager"] == "py":
                    pypi_rate_limited_until = time.time() + RATE_LIMIT_BACKOFF
            else:
                stats["skipped"] += 1

        except Exception as e:
            error_msg = str(e)
            if "429" in error_msg or "rate" in error_msg.lower():
                stats["errors"].append(f"{dep['name']}: Rate limited - backing off {RATE_LIMIT_BACKOFF}s")
                # Set rate limit expiry for this service
                if dep["manager"] == "npm":
                    npm_rate_limited_until = time.time() + RATE_LIMIT_BACKOFF
                elif dep["manager"] == "py":
                    pypi_rate_limited_until = time.time() + RATE_LIMIT_BACKOFF
            else:
                stats["errors"].append(f"{dep['name']}: {error_msg}")

    return stats
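
# Hedged usage sketch (not called anywhere in this module): fetch_docs() expects
# dependency dicts carrying at least "name", "version", and "manager" ("npm" or
# "py"), which is how the loops above read them. The package entries below are
# placeholders for illustration; a real deps list comes from deps.py. With
# offline=True the call returns the zeroed summary without touching the network;
# dropping it would fetch into ./.pf/context/docs subject to the backoff above.
def _example_fetch_docs_call() -> Dict[str, Any]:
    example_deps = [
        {"name": "express", "version": "4.18.2", "manager": "npm"},
        {"name": "requests", "version": "2.31.0", "manager": "py"},
    ]
    return fetch_docs(example_deps, offline=True)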

def _check_cache_for_dep(dep: Dict[str, Any], output_dir: Path) -> Dict[str, bool]:
    """
    Quick cache check for a dependency without making network calls.
    Returns {"cached": True/False}
    """
    name = dep["name"]
    version = dep["version"]
    manager = dep["manager"]

    # Build the cache file path
    if manager == "npm":
        # Handle git versions
        if version.startswith("git") or "://" in version:
            import hashlib
            version_hash = hashlib.md5(version.encode()).hexdigest()[:8]
            safe_version = f"git-{version_hash}"
        else:
            safe_version = version.replace(":", "_").replace("/", "_").replace("\\", "_")
        safe_name = name.replace("@", "_at_").replace("/", "_")
        pkg_dir = output_dir / "npm" / f"{safe_name}@{safe_version}"
    elif manager == "py":
        safe_version = version.replace(":", "_").replace("/", "_").replace("\\", "_")
        safe_name = name.replace("/", "_").replace("\\", "_")
        pkg_dir = output_dir / "py" / f"{safe_name}@{safe_version}"
    else:
        return {"cached": False}

    doc_file = pkg_dir / "doc.md"
    meta_file = pkg_dir / "meta.json"

    # Check cache validity
    if doc_file.exists() and meta_file.exists():
        try:
            with open(meta_file, encoding="utf-8") as f:
                meta = json.load(f)
            # Cache for 7 days
            last_checked = datetime.fromisoformat(meta["last_checked"])
            if (datetime.now() - last_checked).days < 7:
                return {"cached": True}
        except (json.JSONDecodeError, KeyError):
            pass

    return {"cached": False}
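
# Hedged illustration of the cache layout implied by _check_cache_for_dep():
# docs land under <output_dir>/<manager>/<safe_name>@<safe_version>/doc.md with a
# sibling meta.json. The scoped package below is a made-up name used only to show
# how "@" and "/" are rewritten for the filesystem.
def _example_cache_path(output_dir: Path) -> Path:
    dep = {"name": "@types/node", "version": "20.4.1", "manager": "npm"}
    safe_name = dep["name"].replace("@", "_at_").replace("/", "_")  # "_at_types_node"
    safe_version = dep["version"]  # nothing to rewrite in this version string
    return output_dir / "npm" / f"{safe_name}@{safe_version}" / "doc.md"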

def _fetch_npm_docs(
    dep: Dict[str, Any],
    output_dir: Path,
    allowlist: List[str]
) -> Dict[str, Any]:
    """Fetch documentation for an npm package."""
    name = dep["name"]
    version = dep["version"]

    # Validate package name
    if not validate_package_name(name, "npm"):
        return {"status": "skipped", "reason": "Invalid package name"}

    # Sanitize version for filesystem (handle git URLs)
    if version.startswith("git") or "://" in version:
        # For git dependencies, use a hash of the URL as version
        import hashlib
        version_hash = hashlib.md5(version.encode()).hexdigest()[:8]
        safe_version = f"git-{version_hash}"
    else:
        # For normal versions, just replace problematic characters
        safe_version = version.replace(":", "_").replace("/", "_").replace("\\", "_")

    # Create package-specific directory with sanitized name
    # Replace @ and / in scoped packages for filesystem safety
    safe_name = name.replace("@", "_at_").replace("/", "_")
    try:
        pkg_dir = output_dir / "npm" / f"{safe_name}@{safe_version}"
        pkg_dir.mkdir(parents=True, exist_ok=True)
    except (OSError, SecurityError) as e:
        return {"status": "error", "error": f"Cannot create package directory: {e}"}

    doc_file = pkg_dir / "doc.md"
    meta_file = pkg_dir / "meta.json"

    # Check cache
    if doc_file.exists() and meta_file.exists():
        # Check if cache is still valid (simple time-based for now)
        try:
            with open(meta_file, encoding="utf-8") as f:
                meta = json.load(f)
            # Cache for 7 days
            last_checked = datetime.fromisoformat(meta["last_checked"])
            if (datetime.now() - last_checked).days < 7:
                return {"status": "cached"}
        except (json.JSONDecodeError, KeyError):
            pass  # Invalid cache, refetch

    # Fetch from registry with sanitized package name
    safe_url_name = sanitize_url_component(name)
    safe_url_version = sanitize_url_component(version)
    url = f"https://registry.npmjs.org/{safe_url_name}/{safe_url_version}"
    if not _is_url_allowed(url, allowlist):
        return {"status": "skipped", "reason": "URL not in allowlist"}

    try:
        with urllib.request.urlopen(url, timeout=10) as response:
            data = json.loads(response.read())

        readme = data.get("readme", "")
        repository = data.get("repository", {})
        homepage = data.get("homepage", "")

        # Priority 1: Try to get README from GitHub if available
        github_fetched = False
        if isinstance(repository, dict):
            repo_url = repository.get("url", "")
            github_readme = _fetch_github_readme(repo_url, allowlist)
            if github_readme and len(github_readme) > 500:  # Only use if substantial
                readme = github_readme
                github_fetched = True

        # Priority 2: If no good GitHub README, try homepage if it's GitHub
        if not github_fetched and homepage and "github.com" in homepage:
            github_readme = _fetch_github_readme(homepage, allowlist)
            if github_readme and len(github_readme) > 500:
                readme = github_readme
                github_fetched = True

        # Priority 3: Use npm README if it's substantial
        if not github_fetched and len(readme) < 500:
            # The npm README is too short, try to enhance it
            readme = _enhance_npm_readme(data, readme)

        # Write documentation
        with open(doc_file, "w", encoding="utf-8") as f:
            f.write(f"# {name}@{version}\n\n")
            f.write(f"**Package**: [{name}](https://www.npmjs.com/package/{name})\n")
            f.write(f"**Version**: {version}\n")
            if homepage:
                f.write(f"**Homepage**: {homepage}\n")
            f.write("\n---\n\n")
            f.write(readme)

            # Add an installation snippet if the README has no usage/example section
            if "## Usage" not in readme and "## Example" not in readme:
                f.write(f"\n\n## Installation\n\n```bash\nnpm install {name}\n```\n")

        # Write metadata
        meta = {
            "source_url": url,
            "last_checked": datetime.now().isoformat(),
            "etag": response.headers.get("ETag"),
            "repository": repository,
            "from_github": github_fetched
        }
        with open(meta_file, "w", encoding="utf-8") as f:
            json.dump(meta, f, indent=2)

        return {"status": "fetched"}

    except urllib.error.HTTPError as e:
        if e.code == 429:
            return {"status": "error", "reason": "rate_limited", "error": "HTTP 429: Rate limited"}
        return {"status": "error", "error": f"HTTP {e.code}: {str(e)}"}
    except (urllib.error.URLError, json.JSONDecodeError) as e:
        return {"status": "error", "error": str(e)}

def _fetch_pypi_docs(
    dep: Dict[str, Any],
    output_dir: Path,
    allowlist: List[str]
) -> Dict[str, Any]:
    """Fetch documentation for a PyPI package."""
    name = dep["name"].strip()  # Strip any whitespace from name
    version = dep["version"]

    # Validate package name
    if not validate_package_name(name, "py"):
        return {"status": "skipped", "reason": "Invalid package name"}

    # Sanitize package name for URL
    safe_url_name = sanitize_url_component(name)

    # Handle special versions
    if version in ["latest", "git"]:
        # For latest, fetch current version first
        if version == "latest":
            url = f"https://pypi.org/pypi/{safe_url_name}/json"
        else:
            return {"status": "skipped", "reason": "git dependency"}
    else:
        safe_url_version = sanitize_url_component(version)
        url = f"https://pypi.org/pypi/{safe_url_name}/{safe_url_version}/json"

    if not _is_url_allowed(url, allowlist):
        return {"status": "skipped", "reason": "URL not in allowlist"}

    # Sanitize version for filesystem
    safe_version = version.replace(":", "_").replace("/", "_").replace("\\", "_")

    # Create package-specific directory with sanitized name
    safe_name = name.replace("/", "_").replace("\\", "_")
    try:
        pkg_dir = output_dir / "py" / f"{safe_name}@{safe_version}"
        pkg_dir.mkdir(parents=True, exist_ok=True)
    except (OSError, SecurityError) as e:
        return {"status": "error", "error": f"Cannot create package directory: {e}"}

    doc_file = pkg_dir / "doc.md"
    meta_file = pkg_dir / "meta.json"

    # Check cache
    if doc_file.exists() and meta_file.exists():
        try:
            with open(meta_file, encoding="utf-8") as f:
                meta = json.load(f)
            last_checked = datetime.fromisoformat(meta["last_checked"])
            if (datetime.now() - last_checked).days < 7:
                return {"status": "cached"}
        except (json.JSONDecodeError, KeyError):
            pass

    try:
        with urllib.request.urlopen(url, timeout=10) as response:
            data = json.loads(response.read())

        info = data.get("info", {})
        description = info.get("description", "")
        summary = info.get("summary", "")

        # Priority 1: Try to get README from project URLs (GitHub, GitLab, etc.)
        github_fetched = False
        project_urls = info.get("project_urls", {})

        # Check all possible URL sources for GitHub
        all_urls = []
        for key, proj_url in project_urls.items():
            if proj_url:
                all_urls.append(proj_url)

        # Also check home_page and download_url
        home_page = info.get("home_page", "")
        if home_page:
            all_urls.append(home_page)
        download_url = info.get("download_url", "")
        if download_url:
            all_urls.append(download_url)

        # Try GitHub first. The loop variable is deliberately not named `url`,
        # so the registry URL recorded in meta.json below is not clobbered.
        for candidate_url in all_urls:
            if "github.com" in candidate_url.lower():
                github_readme = _fetch_github_readme(candidate_url, allowlist)
                if github_readme and len(github_readme) > 500:
                    description = github_readme
                    github_fetched = True
                    break

        # Priority 2: Try ReadTheDocs if available
        if not github_fetched:
            for candidate_url in all_urls:
                if "readthedocs" in candidate_url.lower():
                    rtd_content = _fetch_readthedocs(candidate_url, allowlist)
                    if rtd_content and len(rtd_content) > 500:
                        description = rtd_content
                        github_fetched = True  # Mark as fetched from external source
                        break

        # Priority 3: Try to scrape PyPI web page (not API) for full README
        if not github_fetched and len(description) < 1000:
            pypi_readme = _fetch_pypi_web_readme(name, version, allowlist)
            if pypi_readme and len(pypi_readme) > len(description):
                description = pypi_readme
                github_fetched = True  # Mark as fetched from external source

        # Priority 4: Use PyPI description (often contains full README)
        # PyPI descriptions can be quite good if properly uploaded
        if not github_fetched and len(description) < 500 and summary:
            # If description is too short, enhance it
            description = _enhance_pypi_description(info, description, summary)

        # Write documentation
        with open(doc_file, "w", encoding="utf-8") as f:
            f.write(f"# {name}@{version}\n\n")
            f.write(f"**Package**: [{name}](https://pypi.org/project/{name}/)\n")
            f.write(f"**Version**: {version}\n")

            # Add project URLs if available
            if project_urls:
                f.write("\n**Links**:\n")
                for key, link_url in list(project_urls.items())[:5]:  # Limit to 5
                    if link_url:
                        f.write(f"- {key}: {link_url}\n")

            f.write("\n---\n\n")

            # Add summary if different from description
            if summary and summary not in description:
                f.write(f"**Summary**: {summary}\n\n")

            f.write(description)

            # Add installation instructions if not in description
            if "pip install" not in description.lower():
                f.write(f"\n\n## Installation\n\n```bash\npip install {name}\n```\n")

            # Add basic usage if really minimal docs
            if len(description) < 200:
                f.write(f"\n\n## Basic Usage\n\n```python\nimport {name.replace('-', '_')}\n```\n")

        # Write metadata
        meta = {
            "source_url": url,
            "last_checked": datetime.now().isoformat(),
            "etag": response.headers.get("ETag"),
            "project_urls": project_urls,
            "from_github": github_fetched
        }
        with open(meta_file, "w", encoding="utf-8") as f:
            json.dump(meta, f, indent=2)

        return {"status": "fetched"}

    except urllib.error.HTTPError as e:
        if e.code == 429:
            return {"status": "error", "reason": "rate_limited", "error": "HTTP 429: Rate limited"}
        return {"status": "error", "error": f"HTTP {e.code}: {str(e)}"}
    except (urllib.error.URLError, json.JSONDecodeError) as e:
        return {"status": "error", "error": str(e)}

def _fetch_github_readme(repo_url: str, allowlist: List[str]) -> Optional[str]:
    """
    Fetch README from GitHub repository.
    Converts repository URL to raw GitHub URL for README.
    """
    if not repo_url:
        return None

    # Extract owner/repo from various GitHub URL formats
    patterns = [
        r'github\.com[:/]([^/]+)/([^/\s]+)',
        r'git\+https://github\.com/([^/]+)/([^/\s]+)',
    ]

    for pattern in patterns:
        match = re.search(pattern, repo_url)
        if match:
            owner, repo = match.groups()
            # Clean repo name
            repo = repo.replace(".git", "")

            # Try common README filenames
            readme_files = ["README.md", "readme.md", "README.rst", "README.txt"]

            # Sanitize owner and repo for URL
            safe_owner = sanitize_url_component(owner)
            safe_repo = sanitize_url_component(repo)

            for readme_name in readme_files:
                safe_readme = sanitize_url_component(readme_name)
                raw_url = f"https://raw.githubusercontent.com/{safe_owner}/{safe_repo}/main/{safe_readme}"

                if not _is_url_allowed(raw_url, allowlist):
                    continue

                try:
                    with urllib.request.urlopen(raw_url, timeout=5) as response:
                        return response.read().decode("utf-8")
                except urllib.error.HTTPError:
                    # Try master branch
                    raw_url = f"https://raw.githubusercontent.com/{safe_owner}/{safe_repo}/master/{safe_readme}"
                    try:
                        with urllib.request.urlopen(raw_url, timeout=5) as response:
                            return response.read().decode("utf-8")
                    except urllib.error.URLError:
                        continue
                except urllib.error.URLError:
                    continue

    return None
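
# Hedged illustration of the URL rewriting performed by _fetch_github_readme():
# a package-metadata repository string such as "git+https://github.com/owner/repo.git"
# is reduced to owner/repo and probed as raw.githubusercontent.com README URLs.
# "owner" and "repo" below are placeholder names, not a real repository.
def _example_raw_readme_url() -> str:
    repo_url = "git+https://github.com/owner/repo.git"
    match = re.search(r'github\.com[:/]([^/]+)/([^/\s]+)', repo_url)
    owner, repo = match.groups()
    repo = repo.replace(".git", "")
    return f"https://raw.githubusercontent.com/{owner}/{repo}/main/README.md"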

def _is_url_allowed(url: str, allowlist: List[str]) -> bool:
    """Check if URL is in the allowlist."""
    for allowed in allowlist:
        if url.startswith(allowed):
            return True
    return False
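
# Hedged examples of the prefix check above, using only entries from
# DEFAULT_ALLOWLIST; the URL paths are illustrative.
def _example_allowlist_checks() -> None:
    assert _is_url_allowed("https://pypi.org/pypi/requests/json", DEFAULT_ALLOWLIST)
    assert not _is_url_allowed("https://example.com/pypi/requests/json", DEFAULT_ALLOWLIST)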

def _enhance_npm_readme(data: Dict[str, Any], readme: str) -> str:
    """Enhance minimal npm README with package metadata."""
    enhanced = readme if readme else ""

    # Add description if not in README
    description = data.get("description", "")
    if description and description not in enhanced:
        enhanced = f"{description}\n\n{enhanced}"

    # Add keywords
    keywords = data.get("keywords", [])
    if keywords and "keywords" not in enhanced.lower():
        enhanced += f"\n\n## Keywords\n\n{', '.join(keywords)}"

    # Add main entry point info
    main = data.get("main", "")
    if main:
        enhanced += f"\n\n## Entry Point\n\nMain file: `{main}`"

    # Add dependencies info if substantial
    deps = data.get("dependencies", {})
    if 0 < len(deps) <= 10:  # Only if reasonable number
        enhanced += "\n\n## Dependencies\n\n"
        for dep, ver in deps.items():
            enhanced += f"- {dep}: {ver}\n"

    return enhanced

def _fetch_readthedocs(url: str, allowlist: List[str]) -> Optional[str]:
    """
    Fetch documentation from ReadTheDocs.
    Tries to get the main index page content.
    """
    if not url or not _is_url_allowed(url, allowlist):
        return None

    # Ensure we're getting the latest version
    if not url.endswith("/"):
        url += "/"

    # Try to fetch the main page
    try:
        # Add en/latest if not already in URL
        if "/en/latest" not in url and "/en/stable" not in url:
            url = url.rstrip("/") + "/en/latest/"

        with urllib.request.urlopen(url, timeout=10) as response:
            html_content = response.read().decode("utf-8")

        # Basic HTML to markdown conversion (very simplified)
        # Remove script and style tags
        html_content = re.sub(r'<script[^>]*>.*?</script>', '', html_content, flags=re.DOTALL)
        html_content = re.sub(r'<style[^>]*>.*?</style>', '', html_content, flags=re.DOTALL)

        # Extract main content (look for common RTD content divs)
        content_match = re.search(r'<div[^>]*class="[^"]*document[^"]*"[^>]*>(.*?)</div>', html_content, re.DOTALL)
        if content_match:
            html_content = content_match.group(1)

        # Convert basic HTML tags to markdown
        html_content = re.sub(r'<h1[^>]*>(.*?)</h1>', r'# \1\n', html_content)
        html_content = re.sub(r'<h2[^>]*>(.*?)</h2>', r'## \1\n', html_content)
        html_content = re.sub(r'<h3[^>]*>(.*?)</h3>', r'### \1\n', html_content)
        html_content = re.sub(r'<code[^>]*>(.*?)</code>', r'`\1`', html_content)
        html_content = re.sub(r'<pre[^>]*>(.*?)</pre>', r'```\n\1\n```', html_content, flags=re.DOTALL)
        html_content = re.sub(r'<p[^>]*>(.*?)</p>', r'\1\n\n', html_content)
        html_content = re.sub(r'<a[^>]*href="([^"]*)"[^>]*>(.*?)</a>', r'[\2](\1)', html_content)
        html_content = re.sub(r'<[^>]+>', '', html_content)  # Remove remaining HTML tags

        # Clean up whitespace
        html_content = re.sub(r'\n{3,}', '\n\n', html_content)

        return html_content.strip()
    except Exception:
        return None

def _fetch_pypi_web_readme(name: str, version: str, allowlist: List[str]) -> Optional[str]:
    """
    Fetch the rendered README from PyPI's web interface.
    The web interface shows the full README that's often missing from the API.
    """
    # Validate package name
    if not validate_package_name(name, "py"):
        return None

    # Sanitize for URL
    safe_name = sanitize_url_component(name)
    safe_version = sanitize_url_component(version)

    # PyPI web URLs
    urls_to_try = [
        f"https://pypi.org/project/{safe_name}/{safe_version}/",
        f"https://pypi.org/project/{safe_name}/"
    ]

    for url in urls_to_try:
        if not _is_url_allowed(url, allowlist):
            continue

        try:
            req = urllib.request.Request(url, headers={
                'User-Agent': 'Mozilla/5.0 (compatible; TheAuditor/1.0)'
            })
            with urllib.request.urlopen(req, timeout=10) as response:
                html_content = response.read().decode("utf-8")

            # Look for the project description div
            # PyPI uses a specific class for the README content
            readme_match = re.search(
                r'<div[^>]*class="[^"]*project-description[^"]*"[^>]*>(.*?)</div>',
                html_content,
                re.DOTALL | re.IGNORECASE
            )

            if not readme_match:
                # Try alternative patterns
                readme_match = re.search(
                    r'<div[^>]*class="[^"]*description[^"]*"[^>]*>(.*?)</div>',
                    html_content,
                    re.DOTALL | re.IGNORECASE
                )

            if readme_match:
                readme_html = readme_match.group(1)

                # Convert HTML to markdown (simplified)
                # Headers
                readme_html = re.sub(r'<h1[^>]*>(.*?)</h1>', r'# \1\n', readme_html, flags=re.IGNORECASE)
                readme_html = re.sub(r'<h2[^>]*>(.*?)</h2>', r'## \1\n', readme_html, flags=re.IGNORECASE)
                readme_html = re.sub(r'<h3[^>]*>(.*?)</h3>', r'### \1\n', readme_html, flags=re.IGNORECASE)

                # Code blocks
                readme_html = re.sub(r'<pre[^>]*><code[^>]*>(.*?)</code></pre>', r'```\n\1\n```', readme_html, flags=re.DOTALL | re.IGNORECASE)
                readme_html = re.sub(r'<code[^>]*>(.*?)</code>', r'`\1`', readme_html, flags=re.IGNORECASE)

                # Lists
                readme_html = re.sub(r'<li[^>]*>(.*?)</li>', r'- \1\n', readme_html, flags=re.IGNORECASE)

                # Links
                readme_html = re.sub(r'<a[^>]*href="([^"]*)"[^>]*>(.*?)</a>', r'[\2](\1)', readme_html, flags=re.IGNORECASE)

                # Paragraphs and line breaks
                readme_html = re.sub(r'<p[^>]*>(.*?)</p>', r'\1\n\n', readme_html, flags=re.DOTALL | re.IGNORECASE)
                readme_html = re.sub(r'<br[^>]*>', '\n', readme_html, flags=re.IGNORECASE)

                # Remove remaining HTML tags
                readme_html = re.sub(r'<[^>]+>', '', readme_html)

                # Decode HTML entities
                readme_html = readme_html.replace('&lt;', '<')
                readme_html = readme_html.replace('&gt;', '>')
                readme_html = readme_html.replace('&amp;', '&')
                readme_html = readme_html.replace('&quot;', '"')
                readme_html = readme_html.replace('&#39;', "'")

                # Clean up whitespace
                readme_html = re.sub(r'\n{3,}', '\n\n', readme_html)
                readme_html = readme_html.strip()

                if len(readme_html) > 100:  # Only return if we got substantial content
                    return readme_html
        except Exception:
            continue

    return None
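
# Hedged, self-contained illustration of the simplified HTML-to-markdown pass
# used in _fetch_pypi_web_readme() and _fetch_readthedocs(); the HTML snippet
# below is invented for demonstration.
def _example_html_to_markdown() -> str:
    html = '<h2>Install</h2><p>Run <code>pip install pkg</code> now.</p>'
    text = re.sub(r'<h2[^>]*>(.*?)</h2>', r'## \1\n', html, flags=re.IGNORECASE)
    text = re.sub(r'<code[^>]*>(.*?)</code>', r'`\1`', text, flags=re.IGNORECASE)
    text = re.sub(r'<p[^>]*>(.*?)</p>', r'\1\n\n', text, flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r'<[^>]+>', '', text)
    return text.strip()  # "## Install\nRun `pip install pkg` now."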

def _enhance_pypi_description(info: Dict[str, Any], description: str, summary: str) -> str:
    """Enhance minimal PyPI description with package metadata."""
    enhanced = description if description else ""

    # Start with summary if description is empty
    if not enhanced and summary:
        enhanced = f"{summary}\n\n"

    # Add author info
    author = info.get("author", "")
    author_email = info.get("author_email", "")
    if author and "author" not in enhanced.lower():
        author_info = f"\n\n## Author\n\n{author}"
        if author_email:
            author_info += f" ({author_email})"
        enhanced += author_info

    # Add license
    license_info = info.get("license", "")
    if license_info and "license" not in enhanced.lower():
        enhanced += f"\n\n## License\n\n{license_info}"

    # Add classifiers (limited)
    classifiers = info.get("classifiers", [])
    relevant_classifiers = [
        c for c in classifiers
        if "Programming Language" in c or "Framework" in c or "Topic" in c
    ][:5]  # Limit to 5
    if relevant_classifiers:
        enhanced += "\n\n## Classifiers\n\n"
        for classifier in relevant_classifiers:
            enhanced += f"- {classifier}\n"

    # Add requires_python if specified
    requires_python = info.get("requires_python", "")
    if requires_python:
        enhanced += f"\n\n## Python Version\n\nRequires Python {requires_python}"

    return enhanced

def check_latest(
    deps: List[Dict[str, Any]],
    allow_net: bool = True,
    offline: bool = False,
    output_path: str = "./.pf/deps_latest.json"
) -> Dict[str, Any]:
    """
    Check latest versions and compare to locked versions.

    This is a wrapper around deps.check_latest_versions for consistency.
    """
    from .deps import check_latest_versions, write_deps_latest_json

    if offline or not allow_net:
        return {
            "mode": "offline",
            "checked": 0,
            "outdated": 0
        }

    latest_info = check_latest_versions(deps, allow_net=allow_net, offline=offline)

    if latest_info:
        # Sanitize output path before writing
        try:
            safe_output_path = str(sanitize_path(output_path, "."))
            write_deps_latest_json(latest_info, safe_output_path)
        except SecurityError as e:
            return {
                "mode": "error",
                "error": f"Invalid output path: {e}",
                "checked": 0,
                "outdated": 0
            }

    outdated = sum(1 for info in latest_info.values() if info["is_outdated"])

    return {
        "mode": "online",
        "checked": len(latest_info),
        "outdated": outdated,
        "output": output_path
    }
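
# Hedged usage sketch for check_latest(): with offline=True it only returns the
# zeroed summary dict, so nothing is fetched or written. The deps entry below is
# a placeholder; real input comes from deps.py.
def _example_check_latest_offline() -> Dict[str, Any]:
    deps = [{"name": "requests", "version": "2.31.0", "manager": "py"}]
    return check_latest(deps, offline=True)  # {"mode": "offline", "checked": 0, "outdated": 0}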