Initial commit: TheAuditor v1.0.1 - AI-centric SAST and Code Intelligence Platform
theauditor/docs_fetch.py (new file, 793 lines)
"""Documentation fetcher for version-correct package docs."""
|
||||
|
||||
import json
|
||||
import re
|
||||
import time
|
||||
import urllib.error
|
||||
import urllib.request
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
from theauditor.security import sanitize_path, sanitize_url_component, validate_package_name, SecurityError
|
||||
|
||||
|
||||
# Default allowlist for registries
|
||||
DEFAULT_ALLOWLIST = [
|
||||
"https://registry.npmjs.org/",
|
||||
"https://pypi.org/", # Allow both API and web scraping
|
||||
"https://raw.githubusercontent.com/",
|
||||
"https://readthedocs.io/",
|
||||
"https://readthedocs.org/",
|
||||
]
|
||||
|
||||
# Rate limiting configuration - optimized for minimal runtime
|
||||
RATE_LIMIT_DELAY = 0.15 # Average delay between requests (balanced for npm/PyPI)
|
||||
RATE_LIMIT_BACKOFF = 15 # Backoff on 429/disconnect (15s gives APIs time to reset)
|
||||
|
||||
|
||||
def fetch_docs(
    deps: List[Dict[str, Any]],
    allow_net: bool = True,
    allowlist: Optional[List[str]] = None,
    offline: bool = False,
    output_dir: str = "./.pf/context/docs"
) -> Dict[str, Any]:
    """
    Fetch version-correct documentation for dependencies.

    Args:
        deps: List of dependency objects from deps.py
        allow_net: Whether network access is allowed
        allowlist: List of allowed URL prefixes (uses DEFAULT_ALLOWLIST if None)
        offline: Force offline mode
        output_dir: Base directory for cached docs

    Returns:
        Summary of fetch operations
    """
    if offline or not allow_net:
        return {
            "mode": "offline",
            "fetched": 0,
            "cached": 0,
            "skipped": len(deps),
            "errors": []
        }

    if allowlist is None:
        allowlist = DEFAULT_ALLOWLIST

    try:
        output_path = sanitize_path(output_dir, ".")
        output_path.mkdir(parents=True, exist_ok=True)
    except SecurityError as e:
        return {
            "mode": "error",
            "error": f"Invalid output directory: {e}",
            "fetched": 0,
            "cached": 0,
            "skipped": len(deps)
        }

    stats = {
        "mode": "online",
        "fetched": 0,
        "cached": 0,
        "skipped": 0,
        "errors": []
    }

    # FIRST PASS: Check what's cached
    needs_fetch = []
    for dep in deps:
        # Quick cache check without network
        cache_result = _check_cache_for_dep(dep, output_path)
        if cache_result["cached"]:
            stats["cached"] += 1
        else:
            needs_fetch.append(dep)

    # Early exit if everything is cached
    if not needs_fetch:
        return stats

    # SECOND PASS: Fetch only what we need, with per-service rate limiting
    npm_rate_limited_until = 0
    pypi_rate_limited_until = 0

    for i, dep in enumerate(needs_fetch):
        try:
            current_time = time.time()

            # Check if this service is rate limited
            if dep["manager"] == "npm" and current_time < npm_rate_limited_until:
                stats["skipped"] += 1
                stats["errors"].append(f"{dep['name']}: Skipped (npm rate limited)")
                continue
            elif dep["manager"] == "py" and current_time < pypi_rate_limited_until:
                stats["skipped"] += 1
                stats["errors"].append(f"{dep['name']}: Skipped (PyPI rate limited)")
                continue

            # Fetch the documentation
            if dep["manager"] == "npm":
                result = _fetch_npm_docs(dep, output_path, allowlist)
            elif dep["manager"] == "py":
                result = _fetch_pypi_docs(dep, output_path, allowlist)
            else:
                stats["skipped"] += 1
                continue

            if result["status"] == "fetched":
                stats["fetched"] += 1
                # Rate limiting: delay after successful fetch to be server-friendly
                # npm and PyPI both have rate limits (npm: 100/min, PyPI: 60/min)
                time.sleep(RATE_LIMIT_DELAY)  # Be server-friendly
            elif result["status"] == "cached":
                stats["cached"] += 1  # Shouldn't happen here but handle it
            elif result.get("reason") == "rate_limited":
                stats["errors"].append(f"{dep['name']}: Rate limited - backing off {RATE_LIMIT_BACKOFF}s")
                stats["skipped"] += 1
                # Set rate limit expiry for this service
                if dep["manager"] == "npm":
                    npm_rate_limited_until = time.time() + RATE_LIMIT_BACKOFF
                elif dep["manager"] == "py":
                    pypi_rate_limited_until = time.time() + RATE_LIMIT_BACKOFF
            else:
                stats["skipped"] += 1

        except Exception as e:
            error_msg = str(e)
            if "429" in error_msg or "rate" in error_msg.lower():
                stats["errors"].append(f"{dep['name']}: Rate limited - backing off {RATE_LIMIT_BACKOFF}s")
                # Set rate limit expiry for this service
                if dep["manager"] == "npm":
                    npm_rate_limited_until = time.time() + RATE_LIMIT_BACKOFF
                elif dep["manager"] == "py":
                    pypi_rate_limited_until = time.time() + RATE_LIMIT_BACKOFF
            else:
                stats["errors"].append(f"{dep['name']}: {error_msg}")

    return stats

def _check_cache_for_dep(dep: Dict[str, Any], output_dir: Path) -> Dict[str, bool]:
    """
    Quick cache check for a dependency without making network calls.
    Returns {"cached": True/False}
    """
    name = dep["name"]
    version = dep["version"]
    manager = dep["manager"]

    # Build the cache file path
    if manager == "npm":
        # Handle git versions
        if version.startswith("git") or "://" in version:
            import hashlib
            version_hash = hashlib.md5(version.encode()).hexdigest()[:8]
            safe_version = f"git-{version_hash}"
        else:
            safe_version = version.replace(":", "_").replace("/", "_").replace("\\", "_")
        safe_name = name.replace("@", "_at_").replace("/", "_")
        pkg_dir = output_dir / "npm" / f"{safe_name}@{safe_version}"
    elif manager == "py":
        safe_version = version.replace(":", "_").replace("/", "_").replace("\\", "_")
        safe_name = name.replace("/", "_").replace("\\", "_")
        pkg_dir = output_dir / "py" / f"{safe_name}@{safe_version}"
    else:
        return {"cached": False}

    doc_file = pkg_dir / "doc.md"
    meta_file = pkg_dir / "meta.json"

    # Check cache validity
    if doc_file.exists() and meta_file.exists():
        try:
            with open(meta_file, encoding="utf-8") as f:
                meta = json.load(f)
            # Cache for 7 days
            last_checked = datetime.fromisoformat(meta["last_checked"])
            if (datetime.now() - last_checked).days < 7:
                return {"cached": True}
        except (json.JSONDecodeError, KeyError):
            pass

    return {"cached": False}

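# Illustrative note (an assumption derived from the path-building code above,
# not part of the original file): cached docs live under
# <output_dir>/<manager>/<safe_name>@<safe_version>/, for example
#   .pf/context/docs/npm/_at_types_node@20.1.0/doc.md   (scoped npm package)
#   .pf/context/docs/py/requests@2.31.0/doc.md
# alongside a meta.json whose "last_checked" timestamp drives the 7-day TTL.
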
def _fetch_npm_docs(
    dep: Dict[str, Any],
    output_dir: Path,
    allowlist: List[str]
) -> Dict[str, Any]:
    """Fetch documentation for an npm package."""
    name = dep["name"]
    version = dep["version"]

    # Validate package name
    if not validate_package_name(name, "npm"):
        return {"status": "skipped", "reason": "Invalid package name"}

    # Sanitize version for filesystem (handle git URLs)
    if version.startswith("git") or "://" in version:
        # For git dependencies, use a hash of the URL as version
        import hashlib
        version_hash = hashlib.md5(version.encode()).hexdigest()[:8]
        safe_version = f"git-{version_hash}"
    else:
        # For normal versions, just replace problematic characters
        safe_version = version.replace(":", "_").replace("/", "_").replace("\\", "_")

    # Create package-specific directory with sanitized name
    # Replace @ and / in scoped packages for filesystem safety
    safe_name = name.replace("@", "_at_").replace("/", "_")
    try:
        pkg_dir = output_dir / "npm" / f"{safe_name}@{safe_version}"
        pkg_dir.mkdir(parents=True, exist_ok=True)
    except (OSError, SecurityError) as e:
        return {"status": "error", "error": f"Cannot create package directory: {e}"}

    doc_file = pkg_dir / "doc.md"
    meta_file = pkg_dir / "meta.json"

    # Check cache
    if doc_file.exists() and meta_file.exists():
        # Check if cache is still valid (simple time-based for now)
        try:
            with open(meta_file, encoding="utf-8") as f:
                meta = json.load(f)
            # Cache for 7 days
            last_checked = datetime.fromisoformat(meta["last_checked"])
            if (datetime.now() - last_checked).days < 7:
                return {"status": "cached"}
        except (json.JSONDecodeError, KeyError):
            pass  # Invalid cache, refetch

    # Fetch from registry with sanitized package name
    safe_url_name = sanitize_url_component(name)
    safe_url_version = sanitize_url_component(version)
    url = f"https://registry.npmjs.org/{safe_url_name}/{safe_url_version}"
    if not _is_url_allowed(url, allowlist):
        return {"status": "skipped", "reason": "URL not in allowlist"}

    try:
        with urllib.request.urlopen(url, timeout=10) as response:
            data = json.loads(response.read())

            readme = data.get("readme", "")
            repository = data.get("repository", {})
            homepage = data.get("homepage", "")

            # Priority 1: Try to get README from GitHub if available
            github_fetched = False
            if isinstance(repository, dict):
                repo_url = repository.get("url", "")
                github_readme = _fetch_github_readme(repo_url, allowlist)
                if github_readme and len(github_readme) > 500:  # Only use if substantial
                    readme = github_readme
                    github_fetched = True

            # Priority 2: If no good GitHub README, try homepage if it's GitHub
            if not github_fetched and homepage and "github.com" in homepage:
                github_readme = _fetch_github_readme(homepage, allowlist)
                if github_readme and len(github_readme) > 500:
                    readme = github_readme
                    github_fetched = True

            # Priority 3: Use npm README if it's substantial
            if not github_fetched and len(readme) < 500:
                # The npm README is too short, try to enhance it
                readme = _enhance_npm_readme(data, readme)

            # Write documentation
            with open(doc_file, "w", encoding="utf-8") as f:
                f.write(f"# {name}@{version}\n\n")
                f.write(f"**Package**: [{name}](https://www.npmjs.com/package/{name})\n")
                f.write(f"**Version**: {version}\n")
                if homepage:
                    f.write(f"**Homepage**: {homepage}\n")
                f.write("\n---\n\n")
                f.write(readme)

                # Add usage examples if not in README
                if "## Usage" not in readme and "## Example" not in readme:
                    f.write("\n\n## Installation\n\n```bash\nnpm install {name}\n```\n".format(name=name))

            # Write metadata
            meta = {
                "source_url": url,
                "last_checked": datetime.now().isoformat(),
                "etag": response.headers.get("ETag"),
                "repository": repository,
                "from_github": github_fetched
            }
            with open(meta_file, "w", encoding="utf-8") as f:
                json.dump(meta, f, indent=2)

            return {"status": "fetched"}

    except urllib.error.HTTPError as e:
        if e.code == 429:
            return {"status": "error", "reason": "rate_limited", "error": "HTTP 429: Rate limited"}
        return {"status": "error", "error": f"HTTP {e.code}: {str(e)}"}
    except (urllib.error.URLError, json.JSONDecodeError) as e:
        return {"status": "error", "error": str(e)}

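# Illustrative shape of the doc.md produced above (derived from the f.write
# calls; the package name, version, and homepage are hypothetical):
#
#   # left-pad@1.3.0
#
#   **Package**: [left-pad](https://www.npmjs.com/package/left-pad)
#   **Version**: 1.3.0
#   **Homepage**: https://example.com   (only written when the registry reports one)
#
#   ---
#
#   <README content, preferring a substantial GitHub README when one was found>
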
def _fetch_pypi_docs(
    dep: Dict[str, Any],
    output_dir: Path,
    allowlist: List[str]
) -> Dict[str, Any]:
    """Fetch documentation for a PyPI package."""
    name = dep["name"].strip()  # Strip any whitespace from name
    version = dep["version"]

    # Validate package name
    if not validate_package_name(name, "py"):
        return {"status": "skipped", "reason": "Invalid package name"}

    # Sanitize package name for URL
    safe_url_name = sanitize_url_component(name)

    # Handle special versions
    if version in ["latest", "git"]:
        # For latest, fetch current version first
        if version == "latest":
            url = f"https://pypi.org/pypi/{safe_url_name}/json"
        else:
            return {"status": "skipped", "reason": "git dependency"}
    else:
        safe_url_version = sanitize_url_component(version)
        url = f"https://pypi.org/pypi/{safe_url_name}/{safe_url_version}/json"

    if not _is_url_allowed(url, allowlist):
        return {"status": "skipped", "reason": "URL not in allowlist"}

    # Sanitize version for filesystem
    safe_version = version.replace(":", "_").replace("/", "_").replace("\\", "_")

    # Create package-specific directory with sanitized name
    safe_name = name.replace("/", "_").replace("\\", "_")
    try:
        pkg_dir = output_dir / "py" / f"{safe_name}@{safe_version}"
        pkg_dir.mkdir(parents=True, exist_ok=True)
    except (OSError, SecurityError) as e:
        return {"status": "error", "error": f"Cannot create package directory: {e}"}

    doc_file = pkg_dir / "doc.md"
    meta_file = pkg_dir / "meta.json"

    # Check cache
    if doc_file.exists() and meta_file.exists():
        try:
            with open(meta_file, encoding="utf-8") as f:
                meta = json.load(f)
            last_checked = datetime.fromisoformat(meta["last_checked"])
            if (datetime.now() - last_checked).days < 7:
                return {"status": "cached"}
        except (json.JSONDecodeError, KeyError):
            pass

    try:
        with urllib.request.urlopen(url, timeout=10) as response:
            data = json.loads(response.read())

            info = data.get("info", {})
            description = info.get("description", "")
            summary = info.get("summary", "")

            # Priority 1: Try to get README from project URLs (GitHub, GitLab, etc.)
            github_fetched = False
            project_urls = info.get("project_urls", {})

            # Check all possible URL sources for GitHub
            all_urls = []
            for key, proj_url in project_urls.items():
                if proj_url:
                    all_urls.append(proj_url)

            # Also check home_page and download_url
            home_page = info.get("home_page", "")
            if home_page:
                all_urls.append(home_page)
            download_url = info.get("download_url", "")
            if download_url:
                all_urls.append(download_url)

            # Try GitHub first (use a distinct loop variable so the registry
            # `url` recorded in the metadata below is not clobbered)
            for candidate_url in all_urls:
                if "github.com" in candidate_url.lower():
                    github_readme = _fetch_github_readme(candidate_url, allowlist)
                    if github_readme and len(github_readme) > 500:
                        description = github_readme
                        github_fetched = True
                        break

            # Priority 2: Try ReadTheDocs if available
            if not github_fetched:
                for candidate_url in all_urls:
                    if "readthedocs" in candidate_url.lower():
                        rtd_content = _fetch_readthedocs(candidate_url, allowlist)
                        if rtd_content and len(rtd_content) > 500:
                            description = rtd_content
                            github_fetched = True  # Mark as fetched from external source
                            break

            # Priority 3: Try to scrape PyPI web page (not API) for full README
            if not github_fetched and len(description) < 1000:
                pypi_readme = _fetch_pypi_web_readme(name, version, allowlist)
                if pypi_readme and len(pypi_readme) > len(description):
                    description = pypi_readme
                    github_fetched = True  # Mark as fetched from external source

            # Priority 4: Use PyPI description (often contains full README)
            # PyPI descriptions can be quite good if properly uploaded
            if not github_fetched and len(description) < 500 and summary:
                # If description is too short, enhance it
                description = _enhance_pypi_description(info, description, summary)

            # Write documentation
            with open(doc_file, "w", encoding="utf-8") as f:
                f.write(f"# {name}@{version}\n\n")
                f.write(f"**Package**: [{name}](https://pypi.org/project/{name}/)\n")
                f.write(f"**Version**: {version}\n")

                # Add project URLs if available
                if project_urls:
                    f.write("\n**Links**:\n")
                    for key, link in list(project_urls.items())[:5]:  # Limit to 5
                        if link:
                            f.write(f"- {key}: {link}\n")

                f.write("\n---\n\n")

                # Add summary if different from description
                if summary and summary not in description:
                    f.write(f"**Summary**: {summary}\n\n")

                f.write(description)

                # Add installation instructions if not in description
                if "pip install" not in description.lower():
                    f.write(f"\n\n## Installation\n\n```bash\npip install {name}\n```\n")

                # Add basic usage if really minimal docs
                if len(description) < 200:
                    f.write(f"\n\n## Basic Usage\n\n```python\nimport {name.replace('-', '_')}\n```\n")

            # Write metadata
            meta = {
                "source_url": url,
                "last_checked": datetime.now().isoformat(),
                "etag": response.headers.get("ETag"),
                "project_urls": project_urls,
                "from_github": github_fetched
            }
            with open(meta_file, "w", encoding="utf-8") as f:
                json.dump(meta, f, indent=2)

            return {"status": "fetched"}

    except urllib.error.HTTPError as e:
        if e.code == 429:
            return {"status": "error", "reason": "rate_limited", "error": "HTTP 429: Rate limited"}
        return {"status": "error", "error": f"HTTP {e.code}: {str(e)}"}
    except (urllib.error.URLError, json.JSONDecodeError) as e:
        return {"status": "error", "error": str(e)}

def _fetch_github_readme(repo_url: str, allowlist: List[str]) -> Optional[str]:
    """
    Fetch README from GitHub repository.
    Converts repository URL to raw GitHub URL for README.
    """
    if not repo_url:
        return None

    # Extract owner/repo from various GitHub URL formats
    patterns = [
        r'github\.com[:/]([^/]+)/([^/\s]+)',
        r'git\+https://github\.com/([^/]+)/([^/\s]+)',
    ]

    for pattern in patterns:
        match = re.search(pattern, repo_url)
        if match:
            owner, repo = match.groups()
            # Clean repo name
            repo = repo.replace(".git", "")

            # Try common README filenames
            readme_files = ["README.md", "readme.md", "README.rst", "README.txt"]

            # Sanitize owner and repo for URL
            safe_owner = sanitize_url_component(owner)
            safe_repo = sanitize_url_component(repo)

            for readme_name in readme_files:
                safe_readme = sanitize_url_component(readme_name)
                raw_url = f"https://raw.githubusercontent.com/{safe_owner}/{safe_repo}/main/{safe_readme}"

                if not _is_url_allowed(raw_url, allowlist):
                    continue

                try:
                    with urllib.request.urlopen(raw_url, timeout=5) as response:
                        return response.read().decode("utf-8")
                except urllib.error.HTTPError:
                    # Try master branch
                    raw_url = f"https://raw.githubusercontent.com/{safe_owner}/{safe_repo}/master/{safe_readme}"
                    try:
                        with urllib.request.urlopen(raw_url, timeout=5) as response:
                            return response.read().decode("utf-8")
                    except urllib.error.URLError:
                        continue
                except urllib.error.URLError:
                    continue

    return None

def _is_url_allowed(url: str, allowlist: List[str]) -> bool:
    """Check if URL is in the allowlist."""
    for allowed in allowlist:
        if url.startswith(allowed):
            return True
    return False

def _enhance_npm_readme(data: Dict[str, Any], readme: str) -> str:
    """Enhance minimal npm README with package metadata."""
    enhanced = readme if readme else ""

    # Add description if not in README
    description = data.get("description", "")
    if description and description not in enhanced:
        enhanced = f"{description}\n\n{enhanced}"

    # Add keywords
    keywords = data.get("keywords", [])
    if keywords and "keywords" not in enhanced.lower():
        enhanced += f"\n\n## Keywords\n\n{', '.join(keywords)}"

    # Add main entry point info
    main = data.get("main", "")
    if main:
        enhanced += f"\n\n## Entry Point\n\nMain file: `{main}`"

    # Add dependencies info if substantial
    deps = data.get("dependencies", {})
    if len(deps) > 0 and len(deps) <= 10:  # Only if reasonable number
        enhanced += "\n\n## Dependencies\n\n"
        for dep, ver in deps.items():
            enhanced += f"- {dep}: {ver}\n"

    return enhanced

def _fetch_readthedocs(url: str, allowlist: List[str]) -> Optional[str]:
    """
    Fetch documentation from ReadTheDocs.
    Tries to get the main index page content.
    """
    if not url or not _is_url_allowed(url, allowlist):
        return None

    # Ensure we're getting the latest version
    if not url.endswith("/"):
        url += "/"

    # Try to fetch the main page
    try:
        # Add en/latest if not already in URL
        if "/en/latest" not in url and "/en/stable" not in url:
            url = url.rstrip("/") + "/en/latest/"

        with urllib.request.urlopen(url, timeout=10) as response:
            html_content = response.read().decode("utf-8")

        # Basic HTML to markdown conversion (very simplified)
        # Remove script and style tags
        html_content = re.sub(r'<script[^>]*>.*?</script>', '', html_content, flags=re.DOTALL)
        html_content = re.sub(r'<style[^>]*>.*?</style>', '', html_content, flags=re.DOTALL)

        # Extract main content (look for common RTD content divs)
        content_match = re.search(r'<div[^>]*class="[^"]*document[^"]*"[^>]*>(.*?)</div>', html_content, re.DOTALL)
        if content_match:
            html_content = content_match.group(1)

        # Convert basic HTML tags to markdown
        html_content = re.sub(r'<h1[^>]*>(.*?)</h1>', r'# \1\n', html_content)
        html_content = re.sub(r'<h2[^>]*>(.*?)</h2>', r'## \1\n', html_content)
        html_content = re.sub(r'<h3[^>]*>(.*?)</h3>', r'### \1\n', html_content)
        html_content = re.sub(r'<code[^>]*>(.*?)</code>', r'`\1`', html_content)
        html_content = re.sub(r'<pre[^>]*>(.*?)</pre>', r'```\n\1\n```', html_content, flags=re.DOTALL)
        html_content = re.sub(r'<p[^>]*>(.*?)</p>', r'\1\n\n', html_content)
        html_content = re.sub(r'<a[^>]*href="([^"]*)"[^>]*>(.*?)</a>', r'[\2](\1)', html_content)
        html_content = re.sub(r'<[^>]+>', '', html_content)  # Remove remaining HTML tags

        # Clean up whitespace
        html_content = re.sub(r'\n{3,}', '\n\n', html_content)

        return html_content.strip()
    except Exception:
        return None

def _fetch_pypi_web_readme(name: str, version: str, allowlist: List[str]) -> Optional[str]:
    """
    Fetch the rendered README from PyPI's web interface.
    The web interface shows the full README that's often missing from the API.
    """
    # Validate package name
    if not validate_package_name(name, "py"):
        return None

    # Sanitize for URL
    safe_name = sanitize_url_component(name)
    safe_version = sanitize_url_component(version)

    # PyPI web URLs
    urls_to_try = [
        f"https://pypi.org/project/{safe_name}/{safe_version}/",
        f"https://pypi.org/project/{safe_name}/"
    ]

    for url in urls_to_try:
        if not _is_url_allowed(url, allowlist):
            continue

        try:
            req = urllib.request.Request(url, headers={
                'User-Agent': 'Mozilla/5.0 (compatible; TheAuditor/1.0)'
            })
            with urllib.request.urlopen(req, timeout=10) as response:
                html_content = response.read().decode("utf-8")

            # Look for the project description div
            # PyPI uses a specific class for the README content
            readme_match = re.search(
                r'<div[^>]*class="[^"]*project-description[^"]*"[^>]*>(.*?)</div>',
                html_content,
                re.DOTALL | re.IGNORECASE
            )

            if not readme_match:
                # Try alternative patterns
                readme_match = re.search(
                    r'<div[^>]*class="[^"]*description[^"]*"[^>]*>(.*?)</div>',
                    html_content,
                    re.DOTALL | re.IGNORECASE
                )

            if readme_match:
                readme_html = readme_match.group(1)

                # Convert HTML to markdown (simplified)
                # Headers
                readme_html = re.sub(r'<h1[^>]*>(.*?)</h1>', r'# \1\n', readme_html, flags=re.IGNORECASE)
                readme_html = re.sub(r'<h2[^>]*>(.*?)</h2>', r'## \1\n', readme_html, flags=re.IGNORECASE)
                readme_html = re.sub(r'<h3[^>]*>(.*?)</h3>', r'### \1\n', readme_html, flags=re.IGNORECASE)

                # Code blocks
                readme_html = re.sub(r'<pre[^>]*><code[^>]*>(.*?)</code></pre>', r'```\n\1\n```', readme_html, flags=re.DOTALL | re.IGNORECASE)
                readme_html = re.sub(r'<code[^>]*>(.*?)</code>', r'`\1`', readme_html, flags=re.IGNORECASE)

                # Lists
                readme_html = re.sub(r'<li[^>]*>(.*?)</li>', r'- \1\n', readme_html, flags=re.IGNORECASE)

                # Links
                readme_html = re.sub(r'<a[^>]*href="([^"]*)"[^>]*>(.*?)</a>', r'[\2](\1)', readme_html, flags=re.IGNORECASE)

                # Paragraphs and line breaks
                readme_html = re.sub(r'<p[^>]*>(.*?)</p>', r'\1\n\n', readme_html, flags=re.DOTALL | re.IGNORECASE)
                readme_html = re.sub(r'<br[^>]*>', '\n', readme_html, flags=re.IGNORECASE)

                # Remove remaining HTML tags
                readme_html = re.sub(r'<[^>]+>', '', readme_html)

                # Decode common HTML entities
                readme_html = readme_html.replace("&lt;", "<")
                readme_html = readme_html.replace("&gt;", ">")
                readme_html = readme_html.replace("&amp;", "&")
                readme_html = readme_html.replace("&quot;", '"')
                readme_html = readme_html.replace("&#39;", "'")

                # Clean up whitespace
                readme_html = re.sub(r'\n{3,}', '\n\n', readme_html)
                readme_html = readme_html.strip()

                if len(readme_html) > 100:  # Only return if we got substantial content
                    return readme_html
        except Exception:
            continue

    return None

def _enhance_pypi_description(info: Dict[str, Any], description: str, summary: str) -> str:
    """Enhance minimal PyPI description with package metadata."""
    enhanced = description if description else ""

    # Start with summary if description is empty
    if not enhanced and summary:
        enhanced = f"{summary}\n\n"

    # Add author info
    author = info.get("author", "")
    author_email = info.get("author_email", "")
    if author and "author" not in enhanced.lower():
        author_info = f"\n\n## Author\n\n{author}"
        if author_email:
            author_info += f" ({author_email})"
        enhanced += author_info

    # Add license
    license_info = info.get("license", "")
    if license_info and "license" not in enhanced.lower():
        enhanced += f"\n\n## License\n\n{license_info}"

    # Add classifiers (limited)
    classifiers = info.get("classifiers", [])
    relevant_classifiers = [
        c for c in classifiers
        if "Programming Language" in c or "Framework" in c or "Topic" in c
    ][:5]  # Limit to 5
    if relevant_classifiers:
        enhanced += "\n\n## Classifiers\n\n"
        for classifier in relevant_classifiers:
            enhanced += f"- {classifier}\n"

    # Add requires_python if specified
    requires_python = info.get("requires_python", "")
    if requires_python:
        enhanced += f"\n\n## Python Version\n\nRequires Python {requires_python}"

    return enhanced

def check_latest(
    deps: List[Dict[str, Any]],
    allow_net: bool = True,
    offline: bool = False,
    output_path: str = "./.pf/deps_latest.json"
) -> Dict[str, Any]:
    """
    Check latest versions and compare to locked versions.

    This is a wrapper around deps.check_latest_versions for consistency.
    """
    from .deps import check_latest_versions, write_deps_latest_json

    if offline or not allow_net:
        return {
            "mode": "offline",
            "checked": 0,
            "outdated": 0
        }

    latest_info = check_latest_versions(deps, allow_net=allow_net, offline=offline)

    if latest_info:
        # Sanitize output path before writing
        try:
            safe_output_path = str(sanitize_path(output_path, "."))
            write_deps_latest_json(latest_info, safe_output_path)
        except SecurityError as e:
            return {
                "mode": "error",
                "error": f"Invalid output path: {e}",
                "checked": 0,
                "outdated": 0
            }

    outdated = sum(1 for info in latest_info.values() if info["is_outdated"])

    return {
        "mode": "online",
        "checked": len(latest_info),
        "outdated": outdated,
        "output": output_path
    }
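

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module). fetch_docs()
# expects dependency dicts shaped like those produced by deps.py, i.e. with
# "name", "version", and "manager" ("npm" or "py") keys; the sample entries
# below are hypothetical.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    sample_deps = [
        {"name": "requests", "version": "2.31.0", "manager": "py"},
        {"name": "left-pad", "version": "1.3.0", "manager": "npm"},
    ]
    # Run in offline mode so this sketch has no network side effects;
    # pass allow_net=True (the default) to actually fetch and cache docs.
    summary = fetch_docs(sample_deps, offline=True)
    print(json.dumps(summary, indent=2))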