Mirror of https://github.com/aljazceru/Auditor.git (synced 2025-12-17 11:24:19 +01:00)
Initial commit: TheAuditor v1.0.1 - AI-centric SAST and Code Intelligence Platform
theauditor/docs_summarize.py | 408 | Normal file
@@ -0,0 +1,408 @@
"""Documentation summarizer for creating concise doc capsules."""

import json
import re
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Set


def summarize_docs(
    docs_dir: str = "./.pf/context/docs",
    output_dir: str = "./.pf/context/doc_capsules",
    workset_path: Optional[str] = None,
    max_capsule_lines: int = 50
) -> Dict[str, Any]:
    """
    Generate concise doc capsules from fetched documentation.

    Args:
        docs_dir: Directory containing fetched docs
        output_dir: Directory for output capsules
        workset_path: Optional workset to filter relevant deps
        max_capsule_lines: Maximum lines per capsule

    Returns:
        Summary statistics
    """
    docs_path = Path(docs_dir)
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Load workset if provided
    relevant_deps = None
    if workset_path and Path(workset_path).exists():
        relevant_deps = _load_workset_deps(workset_path)

    stats = {
        "total_docs": 0,
        "capsules_created": 0,
        "skipped": 0,
        "errors": []
    }

    capsules_index = []

    # Process npm docs
    npm_dir = docs_path / "npm"
    if npm_dir.exists():
        for pkg_dir in npm_dir.iterdir():
            if not pkg_dir.is_dir():
                continue

            # Extract package name and version
            pkg_info = pkg_dir.name  # format: name@version
            if "@" not in pkg_info:
                stats["skipped"] += 1
                continue

            name_version = pkg_info.rsplit("@", 1)
            if len(name_version) != 2:
                stats["skipped"] += 1
                continue

            name, version = name_version

            # Check if in workset
            if relevant_deps and f"npm:{name}" not in relevant_deps:
                stats["skipped"] += 1
                continue

            stats["total_docs"] += 1

            # Create capsule
            doc_file = pkg_dir / "doc.md"
            meta_file = pkg_dir / "meta.json"

            if doc_file.exists():
                try:
                    capsule = _create_capsule(
                        doc_file, meta_file, name, version, "npm", max_capsule_lines
                    )

                    # Write capsule
                    capsule_file = output_path / f"npm__{name}@{version}.md"
                    with open(capsule_file, "w", encoding="utf-8") as f:
                        f.write(capsule)

                    capsules_index.append({
                        "name": name,
                        "version": version,
                        "ecosystem": "npm",
                        "path": str(capsule_file.relative_to(output_path))
                    })

                    stats["capsules_created"] += 1

                except Exception as e:
                    stats["errors"].append(f"{name}@{version}: {str(e)}")

    # Process Python docs (same flow as the npm branch above)
    py_dir = docs_path / "py"
    if py_dir.exists():
        for pkg_dir in py_dir.iterdir():
            if not pkg_dir.is_dir():
                continue

            # Extract package name and version
            pkg_info = pkg_dir.name  # format: name@version
            if "@" not in pkg_info:
                stats["skipped"] += 1
                continue

            name_version = pkg_info.rsplit("@", 1)
            if len(name_version) != 2:
                stats["skipped"] += 1
                continue

            name, version = name_version

            # Check if in workset
            if relevant_deps and f"py:{name}" not in relevant_deps:
                stats["skipped"] += 1
                continue

            stats["total_docs"] += 1

            # Create capsule
            doc_file = pkg_dir / "doc.md"
            meta_file = pkg_dir / "meta.json"

            if doc_file.exists():
                try:
                    capsule = _create_capsule(
                        doc_file, meta_file, name, version, "py", max_capsule_lines
                    )

                    # Write capsule
                    capsule_file = output_path / f"py__{name}@{version}.md"
                    with open(capsule_file, "w", encoding="utf-8") as f:
                        f.write(capsule)

                    capsules_index.append({
                        "name": name,
                        "version": version,
                        "ecosystem": "py",
                        "path": str(capsule_file.relative_to(output_path))
                    })

                    stats["capsules_created"] += 1

                except Exception as e:
                    stats["errors"].append(f"{name}@{version}: {str(e)}")

    # Write index
    index_file = output_path.parent / "doc_index.json"
    with open(index_file, "w", encoding="utf-8") as f:
        json.dump({
            "created_at": datetime.now().isoformat(),
            "capsules": capsules_index,
            "stats": stats
        }, f, indent=2)

    return stats
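

# A minimal usage sketch (illustrative): the import path follows this file's
# location in the package, and the workset path shown is hypothetical.
#
#     from theauditor.docs_summarize import summarize_docs
#
#     stats = summarize_docs(workset_path="./.pf/workset.json")
#     print(f"created={stats['capsules_created']}, skipped={stats['skipped']}")
#     for err in stats["errors"]:
#         print(f"failed: {err}")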


def _load_workset_deps(workset_path: str) -> Set[str]:
    """
    Load relevant dependencies from workset.
    Returns a set of "manager:name" keys.
    """
    relevant = set()

    try:
        with open(workset_path, encoding="utf-8") as f:
            workset = json.load(f)

        # Extract imported packages from workset files.
        # Simplified: a full implementation would parse each file's imports.
        for file_info in workset.get("files", []):
            path = file_info.get("path", "")

            # Simple heuristic: look at the file extension
            if path.endswith((".js", ".ts", ".jsx", ".tsx")):
                # Would parse imports/requires; for now, include all npm deps
                relevant.add("npm:*")
            elif path.endswith(".py"):
                # Would parse imports; for now, include all py deps
                relevant.add("py:*")

    except (json.JSONDecodeError, KeyError):
        pass

    # If we couldn't determine specific deps, include all
    if not relevant or "npm:*" in relevant or "py:*" in relevant:
        return set()  # Empty set means include all

    return relevant
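

# The workset file read above is expected to look roughly like this
# (illustrative; only the "files" list and each entry's "path" are consulted):
#
#     {
#       "files": [
#         {"path": "src/app.ts"},
#         {"path": "scripts/build.py"}
#       ]
#     }
#
# Note that with the current heuristic, any .js/.ts or .py path adds only the
# wildcard "npm:*" / "py:*", so the function returns the empty set
# ("include all") in practice.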


def _create_capsule(
    doc_file: Path,
    meta_file: Path,
    name: str,
    version: str,
    ecosystem: str,
    max_lines: int
) -> str:
    """Create a concise capsule from documentation."""

    # Read documentation
    with open(doc_file, encoding="utf-8") as f:
        content = f.read()

    # Read metadata
    meta = {}
    if meta_file.exists():
        try:
            with open(meta_file, encoding="utf-8") as f:
                meta = json.load(f)
        except json.JSONDecodeError:
            pass

    # Extract key sections
    sections = {
        "init": _extract_initialization(content, ecosystem),
        "apis": _extract_top_apis(content),
        "examples": _extract_examples(content),
    }

    # Build capsule
    capsule_lines = [
        f"# {name}@{version} ({ecosystem})",
        "",
        "## Quick Start",
        ""
    ]

    if sections["init"]:
        capsule_lines.extend(sections["init"][:10])  # Limit lines
        capsule_lines.append("")
    elif content:  # No structured init found; fall back to the first raw lines
        content_lines = content.split("\n")[:10]
        capsule_lines.extend(content_lines)
        capsule_lines.append("")

    if sections["apis"]:
        capsule_lines.append("## Top APIs")
        capsule_lines.append("")
        capsule_lines.extend(sections["apis"][:15])  # Limit lines
        capsule_lines.append("")

    if sections["examples"]:
        capsule_lines.append("## Examples")
        capsule_lines.append("")
        capsule_lines.extend(sections["examples"][:15])  # Limit lines
        capsule_lines.append("")

    # Add reference to full documentation
    capsule_lines.append("## 📄 Full Documentation Available")
    capsule_lines.append("")
    # Path is relative to the project root
    full_doc_path = f"./.pf/context/docs/{ecosystem}/{name}@{version}/doc.md"
    capsule_lines.append(f"**Full content**: `{full_doc_path}`")

    # Count lines in the full doc if it exists
    if doc_file.exists():
        try:
            with open(doc_file, encoding="utf-8") as f:
                line_count = len(f.readlines())
            capsule_lines.append(f"**Size**: {line_count} lines")
        except Exception:
            pass

    capsule_lines.append("")

    # Add source info
    capsule_lines.append("## Source")
    capsule_lines.append("")
    capsule_lines.append(f"- URL: {meta.get('source_url', '')}")
    capsule_lines.append(f"- Fetched: {meta.get('last_checked', '')}")

    # Truncate if too long
    if len(capsule_lines) > max_lines:
        # Keep the full doc reference even when truncating
        keep_lines = capsule_lines[:max_lines - 7]  # Leave room for reference and marker
        ref_lines = [
            line for line in capsule_lines
            if "Full Documentation Available" in line
            or "Full content" in line
            or "Size" in line
        ]
        capsule_lines = keep_lines + ["", "...", "(truncated)", ""] + ref_lines

    return "\n".join(capsule_lines)
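

# A capsule produced above looks roughly like this (illustrative package and
# values):
#
#     # left-pad@1.3.0 (npm)
#
#     ## Quick Start
#     ```javascript
#     const leftPad = require('left-pad')
#     ```
#
#     ## 📄 Full Documentation Available
#     **Full content**: `./.pf/context/docs/npm/left-pad@1.3.0/doc.md`
#     **Size**: 120 lines
#
#     ## Source
#     - URL: https://www.npmjs.com/package/left-pad
#     - Fetched: 2025-01-01T00:00:00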


def _extract_initialization(content: str, ecosystem: str) -> List[str]:
    """Extract initialization/installation snippets."""
    lines = []

    # Look for an installation section
    install_patterns = [
        r"## Install\w*",
        r"## Getting Started",
        r"## Quick Start",
        r"### Install\w*",
    ]

    for pattern in install_patterns:
        match = re.search(pattern, content, re.IGNORECASE | re.MULTILINE)
        if match:
            # Extract the next code block
            start = match.end()
            code_match = re.search(r"```(\w*)\n(.*?)```", content[start:], re.DOTALL)
            if code_match:
                lines.append(f"```{code_match.group(1)}")
                lines.extend(code_match.group(2).strip().split("\n")[:5])
                lines.append("```")
                break

    # Fallback: look for common import/require patterns
    if not lines:
        if ecosystem == "npm":
            if "require(" in content:
                match = re.search(r"(const|var|let)\s+\w+\s*=\s*require\([^)]+\)", content)
                if match:
                    lines = ["```javascript", match.group(0), "```"]
            elif "import " in content:
                match = re.search(r"import\s+.*?from\s+['\"][^'\"]+['\"]", content)
                if match:
                    lines = ["```javascript", match.group(0), "```"]
        elif ecosystem == "py":
            if "import " in content:
                match = re.search(r"import\s+\w+", content)
                if match:
                    lines = ["```python", match.group(0), "```"]
            elif "from " in content:
                match = re.search(r"from\s+\w+\s+import\s+\w+", content)
                if match:
                    lines = ["```python", match.group(0), "```"]

    return lines
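

# For example, a README fragment like the following (illustrative) matches the
# "## Install\w*" pattern, and the first fenced block after it is captured:
#
#     ## Installation
#
#     ```bash
#     npm install some-package
#     ```
#
# yielding ["```bash", "npm install some-package", "```"].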


def _extract_top_apis(content: str) -> List[str]:
    """Extract top API methods."""
    lines = []

    # Look for an API section
    api_patterns = [
        r"## API",
        r"## Methods",
        r"## Functions",
        r"### API",
    ]

    for pattern in api_patterns:
        match = re.search(pattern, content, re.IGNORECASE | re.MULTILINE)
        if match:
            start = match.end()
            # Extract the next few method signatures from bullet items
            method_matches = re.findall(
                r"^[\*\-]\s*`([^`]+)`",
                content[start:start + 2000],
                re.MULTILINE
            )
            for method in method_matches[:5]:  # Top 5 methods
                lines.append(f"- `{method}`")
            break

    # Fallback: look for function definitions in code blocks
    if not lines:
        code_blocks = re.findall(r"```\w*\n(.*?)```", content, re.DOTALL)
        for block in code_blocks[:2]:  # Check the first 2 code blocks
            # Look for function signatures
            funcs = re.findall(r"(?:function|def|const|let|var)\s+(\w+)\s*\(([^)]*)\)", block)
            for func_name, params in funcs[:5]:
                lines.append(f"- `{func_name}({params})`")
            if lines:
                break

    return lines
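

# Given an API section written as bulleted, backticked signatures, e.g.
# (illustrative):
#
#     ## API
#     - `connect(url, options)`
#     - `disconnect()`
#
# the bullet regex above yields up to five lines such as
# "- `connect(url, options)`".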


def _extract_examples(content: str) -> List[str]:
    """Extract usage examples."""
    lines = []

    # Look for an examples section
    example_patterns = [
        r"## Example",
        r"## Usage",
        r"### Example",
        r"### Usage",
    ]

    for pattern in example_patterns:
        match = re.search(pattern, content, re.IGNORECASE | re.MULTILINE)
        if match:
            start = match.end()
            # Extract the next code block
            code_match = re.search(r"```(\w*)\n(.*?)```", content[start:], re.DOTALL)
            if code_match:
                lang = code_match.group(1) or "javascript"
                code_lines = code_match.group(2).strip().split("\n")[:10]  # Max 10 lines
                lines.append(f"```{lang}")
                lines.extend(code_lines)
                lines.append("```")
                break

    return lines
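

# For reference, the doc_index.json written by summarize_docs has this shape
# (illustrative values):
#
#     {
#       "created_at": "2025-01-01T00:00:00",
#       "capsules": [
#         {
#           "name": "left-pad",
#           "version": "1.3.0",
#           "ecosystem": "npm",
#           "path": "npm__left-pad@1.3.0.md"
#         }
#       ],
#       "stats": {"total_docs": 1, "capsules_created": 1, "skipped": 0, "errors": []}
#     }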