Initial commit: TheAuditor v1.0.1 - AI-centric SAST and Code Intelligence Platform

TheAuditorTool
2025-09-07 20:39:47 +07:00
commit ba5c287b02
215 changed files with 50911 additions and 0 deletions


@@ -0,0 +1,408 @@
"""Documentation summarizer for creating concise doc capsules."""
import json
import re
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Set


def summarize_docs(
    docs_dir: str = "./.pf/context/docs",
    output_dir: str = "./.pf/context/doc_capsules",
    workset_path: Optional[str] = None,
    max_capsule_lines: int = 50,
) -> Dict[str, Any]:
    """
    Generate concise doc capsules from fetched documentation.

    Args:
        docs_dir: Directory containing fetched docs
        output_dir: Directory for output capsules
        workset_path: Optional workset to filter relevant deps
        max_capsule_lines: Maximum lines per capsule

    Returns:
        Summary statistics
    """
    docs_path = Path(docs_dir)
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Load workset if provided
    relevant_deps = None
    if workset_path and Path(workset_path).exists():
        relevant_deps = _load_workset_deps(workset_path)

    stats = {
        "total_docs": 0,
        "capsules_created": 0,
        "skipped": 0,
        "errors": [],
    }
    capsules_index = []

    # Process npm docs
    npm_dir = docs_path / "npm"
    if npm_dir.exists():
        for pkg_dir in npm_dir.iterdir():
            if not pkg_dir.is_dir():
                continue

            # Extract package name and version
            pkg_info = pkg_dir.name  # format: name@version
            if "@" not in pkg_info:
                stats["skipped"] += 1
                continue
            name_version = pkg_info.rsplit("@", 1)
            if len(name_version) != 2:
                stats["skipped"] += 1
                continue
            name, version = name_version

            # Check if in workset
            if relevant_deps and f"npm:{name}" not in relevant_deps:
                stats["skipped"] += 1
                continue

            stats["total_docs"] += 1

            # Create capsule
            doc_file = pkg_dir / "doc.md"
            meta_file = pkg_dir / "meta.json"
            if doc_file.exists():
                try:
                    capsule = _create_capsule(
                        doc_file, meta_file, name, version, "npm", max_capsule_lines
                    )

                    # Write capsule
                    capsule_file = output_path / f"npm__{name}@{version}.md"
                    with open(capsule_file, "w", encoding="utf-8") as f:
                        f.write(capsule)

                    capsules_index.append({
                        "name": name,
                        "version": version,
                        "ecosystem": "npm",
                        "path": str(capsule_file.relative_to(output_path)),
                    })
                    stats["capsules_created"] += 1
                except Exception as e:
                    stats["errors"].append(f"{name}@{version}: {str(e)}")

    # Process Python docs
    py_dir = docs_path / "py"
    if py_dir.exists():
        for pkg_dir in py_dir.iterdir():
            if not pkg_dir.is_dir():
                continue

            # Extract package name and version
            pkg_info = pkg_dir.name  # format: name@version
            if "@" not in pkg_info:
                stats["skipped"] += 1
                continue
            name_version = pkg_info.rsplit("@", 1)
            if len(name_version) != 2:
                stats["skipped"] += 1
                continue
            name, version = name_version

            # Check if in workset
            if relevant_deps and f"py:{name}" not in relevant_deps:
                stats["skipped"] += 1
                continue

            stats["total_docs"] += 1

            # Create capsule
            doc_file = pkg_dir / "doc.md"
            meta_file = pkg_dir / "meta.json"
            if doc_file.exists():
                try:
                    capsule = _create_capsule(
                        doc_file, meta_file, name, version, "py", max_capsule_lines
                    )

                    # Write capsule
                    capsule_file = output_path / f"py__{name}@{version}.md"
                    with open(capsule_file, "w", encoding="utf-8") as f:
                        f.write(capsule)

                    capsules_index.append({
                        "name": name,
                        "version": version,
                        "ecosystem": "py",
                        "path": str(capsule_file.relative_to(output_path)),
                    })
                    stats["capsules_created"] += 1
                except Exception as e:
                    stats["errors"].append(f"{name}@{version}: {str(e)}")

    # Write index
    index_file = output_path.parent / "doc_index.json"
    with open(index_file, "w", encoding="utf-8") as f:
        json.dump({
            "created_at": datetime.now().isoformat(),
            "capsules": capsules_index,
            "stats": stats,
        }, f, indent=2)

    return stats
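
# For reference, the resulting doc_index.json has roughly this shape
# (illustrative values; the structure is inferred from the json.dump call above):
#
#   {
#     "created_at": "2025-09-07T20:39:47",
#     "capsules": [
#       {"name": "express", "version": "4.18.2", "ecosystem": "npm",
#        "path": "npm__express@4.18.2.md"}
#     ],
#     "stats": {"total_docs": 1, "capsules_created": 1, "skipped": 0, "errors": []}
#   }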


def _load_workset_deps(workset_path: str) -> Set[str]:
    """
    Load relevant dependencies from workset.

    Returns set of "manager:name" keys.
    """
    relevant = set()
    try:
        with open(workset_path, encoding="utf-8") as f:
            workset = json.load(f)

        # Extract imported packages from workset files.
        # This is a simplified version - would need more sophisticated parsing.
        for file_info in workset.get("files", []):
            path = file_info.get("path", "")
            # Simple heuristic: look at file extension
            if path.endswith((".js", ".ts", ".jsx", ".tsx")):
                # Would parse imports/requires; for now, include all npm deps
                relevant.add("npm:*")
            elif path.endswith(".py"):
                # Would parse imports; for now, include all py deps
                relevant.add("py:*")
    except (json.JSONDecodeError, KeyError):
        pass

    # If we couldn't determine specific deps, include all
    if not relevant or "npm:*" in relevant or "py:*" in relevant:
        return set()  # Empty set means include all

    return relevant


def _create_capsule(
    doc_file: Path,
    meta_file: Path,
    name: str,
    version: str,
    ecosystem: str,
    max_lines: int,
) -> str:
    """Create a concise capsule from documentation."""
    # Read documentation
    with open(doc_file, encoding="utf-8") as f:
        content = f.read()

    # Read metadata
    meta = {}
    if meta_file.exists():
        try:
            with open(meta_file, encoding="utf-8") as f:
                meta = json.load(f)
        except json.JSONDecodeError:
            pass

    # Extract key sections
    sections = {
        "init": _extract_initialization(content, ecosystem),
        "apis": _extract_top_apis(content),
        "examples": _extract_examples(content),
    }

    # Build capsule
    capsule_lines = [
        f"# {name}@{version} ({ecosystem})",
        "",
        "## Quick Start",
        "",
    ]

    if sections["init"]:
        capsule_lines.extend(sections["init"][:10])  # Limit lines
        capsule_lines.append("")
    elif content:  # No structured init section; fall back to raw content
        content_lines = content.split("\n")[:10]
        capsule_lines.extend(content_lines)
        capsule_lines.append("")

    if sections["apis"]:
        capsule_lines.append("## Top APIs")
        capsule_lines.append("")
        capsule_lines.extend(sections["apis"][:15])  # Limit lines
        capsule_lines.append("")

    if sections["examples"]:
        capsule_lines.append("## Examples")
        capsule_lines.append("")
        capsule_lines.extend(sections["examples"][:15])  # Limit lines
        capsule_lines.append("")

    # Add reference to full documentation
    capsule_lines.append("## 📄 Full Documentation Available")
    capsule_lines.append("")
    # Path relative to the project root
    full_doc_path = f"./.pf/context/docs/{ecosystem}/{name}@{version}/doc.md"
    capsule_lines.append(f"**Full content**: `{full_doc_path}`")

    # Count lines in full doc if it exists
    if doc_file.exists():
        try:
            with open(doc_file, encoding="utf-8") as f:
                line_count = len(f.readlines())
            capsule_lines.append(f"**Size**: {line_count} lines")
        except Exception:
            pass
    capsule_lines.append("")

    # Add source info
    capsule_lines.append("## Source")
    capsule_lines.append("")
    capsule_lines.append(f"- URL: {meta.get('source_url', '')}")
    capsule_lines.append(f"- Fetched: {meta.get('last_checked', '')}")

    # Truncate if too long
    if len(capsule_lines) > max_lines:
        # Keep the full-doc reference even when truncating
        keep_lines = capsule_lines[:max_lines - 7]  # Leave room for reference and truncation marker
        ref_lines = [
            line for line in capsule_lines
            if "Full Documentation Available" in line
            or "Full content" in line
            or "Size" in line
        ]
        capsule_lines = keep_lines + ["", "...", "(truncated)", ""] + ref_lines

    return "\n".join(capsule_lines)


def _extract_initialization(content: str, ecosystem: str) -> List[str]:
    """Extract initialization/installation snippets."""
    lines = []

    # Look for installation section
    install_patterns = [
        r"## Install\w*",
        r"## Getting Started",
        r"## Quick Start",
        r"### Install\w*",
    ]
    for pattern in install_patterns:
        match = re.search(pattern, content, re.IGNORECASE | re.MULTILINE)
        if match:
            # Extract next code block
            start = match.end()
            code_match = re.search(r"```(\w*)\n(.*?)```", content[start:], re.DOTALL)
            if code_match:
                lines.append(f"```{code_match.group(1)}")
                lines.extend(code_match.group(2).strip().split("\n")[:5])
                lines.append("```")
            break

    # Fallback: look for common patterns
    if not lines:
        if ecosystem == "npm":
            if "require(" in content:
                match = re.search(r"(const|var|let)\s+\w+\s*=\s*require\([^)]+\)", content)
                if match:
                    lines = ["```javascript", match.group(0), "```"]
            elif "import " in content:
                match = re.search(r"import\s+.*?from\s+['\"][^'\"]+['\"]", content)
                if match:
                    lines = ["```javascript", match.group(0), "```"]
        elif ecosystem == "py":
            if "import " in content:
                match = re.search(r"import\s+\w+", content)
                if match:
                    lines = ["```python", match.group(0), "```"]
            elif "from " in content:
                match = re.search(r"from\s+\w+\s+import\s+\w+", content)
                if match:
                    lines = ["```python", match.group(0), "```"]

    return lines


def _extract_top_apis(content: str) -> List[str]:
    """Extract top API methods."""
    lines = []

    # Look for API section
    api_patterns = [
        r"## API",
        r"## Methods",
        r"## Functions",
        r"### API",
    ]
    for pattern in api_patterns:
        match = re.search(pattern, content, re.IGNORECASE | re.MULTILINE)
        if match:
            start = match.end()
            # Extract next few method signatures from bullet lists
            method_matches = re.findall(
                r"^[\*\-]\s*`([^`]+)`",
                content[start:start + 2000],
                re.MULTILINE,
            )
            for method in method_matches[:5]:  # Top 5 methods
                lines.append(f"- `{method}`")
            break

    # Fallback: look for function definitions in code blocks
    if not lines:
        code_blocks = re.findall(r"```\w*\n(.*?)```", content, re.DOTALL)
        for block in code_blocks[:2]:  # Check first 2 code blocks
            # Look for function signatures
            funcs = re.findall(r"(?:function|def|const|let|var)\s+(\w+)\s*\(([^)]*)\)", block)
            for func_name, params in funcs[:5]:
                lines.append(f"- `{func_name}({params})`")
            if lines:
                break

    return lines


def _extract_examples(content: str) -> List[str]:
    """Extract usage examples."""
    lines = []

    # Look for examples section
    example_patterns = [
        r"## Example",
        r"## Usage",
        r"### Example",
        r"### Usage",
    ]
    for pattern in example_patterns:
        match = re.search(pattern, content, re.IGNORECASE | re.MULTILINE)
        if match:
            start = match.end()
            # Extract next code block
            code_match = re.search(r"```(\w*)\n(.*?)```", content[start:], re.DOTALL)
            if code_match:
                lang = code_match.group(1) or "javascript"
                code_lines = code_match.group(2).strip().split("\n")[:10]  # Max 10 lines
                lines.append(f"```{lang}")
                lines.extend(code_lines)
                lines.append("```")
            break

    return lines
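

if __name__ == "__main__":
    # Minimal usage sketch (not part of the pipeline proper): summarize any
    # fetched docs under the default directories and print the statistics.
    # The workset path is an assumption based on the ./.pf layout used above.
    results = summarize_docs(
        docs_dir="./.pf/context/docs",
        output_dir="./.pf/context/doc_capsules",
        workset_path="./.pf/workset.json",  # hypothetical location
        max_capsule_lines=50,
    )
    print(json.dumps(results, indent=2))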