Mirror of https://github.com/aljazceru/Auditor.git (synced 2025-12-17 11:24:19 +01:00)
Initial commit: TheAuditor v1.0.1 - AI-centric SAST and Code Intelligence Platform
theauditor/docs_summarize.py | 408 | Normal file
@@ -0,0 +1,408 @@
"""Documentation summarizer for creating concise doc capsules."""

import json
import re
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Set


def summarize_docs(
    docs_dir: str = "./.pf/context/docs",
    output_dir: str = "./.pf/context/doc_capsules",
    workset_path: Optional[str] = None,
    max_capsule_lines: int = 50
) -> Dict[str, Any]:
    """
    Generate concise doc capsules from fetched documentation.

    Args:
        docs_dir: Directory containing fetched docs
        output_dir: Directory for output capsules
        workset_path: Optional workset to filter relevant deps
        max_capsule_lines: Maximum lines per capsule

    Returns:
        Summary statistics
    """
    docs_path = Path(docs_dir)
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Load workset if provided
    relevant_deps = None
    if workset_path and Path(workset_path).exists():
        relevant_deps = _load_workset_deps(workset_path)

    stats = {
        "total_docs": 0,
        "capsules_created": 0,
        "skipped": 0,
        "errors": []
    }

    capsules_index = []

    # Process npm docs
    npm_dir = docs_path / "npm"
    if npm_dir.exists():
        for pkg_dir in npm_dir.iterdir():
            if not pkg_dir.is_dir():
                continue

            # Extract package name and version
            pkg_info = pkg_dir.name  # format: name@version
            if "@" not in pkg_info:
                stats["skipped"] += 1
                continue

            name_version = pkg_info.rsplit("@", 1)
            if len(name_version) != 2:
                stats["skipped"] += 1
                continue

            name, version = name_version

            # Check if in workset
            if relevant_deps and f"npm:{name}" not in relevant_deps:
                stats["skipped"] += 1
                continue

            stats["total_docs"] += 1

            # Create capsule
            doc_file = pkg_dir / "doc.md"
            meta_file = pkg_dir / "meta.json"

            if doc_file.exists():
                try:
                    capsule = _create_capsule(
                        doc_file, meta_file, name, version, "npm", max_capsule_lines
                    )

                    # Write capsule
                    capsule_file = output_path / f"npm__{name}@{version}.md"
                    with open(capsule_file, "w", encoding="utf-8") as f:
                        f.write(capsule)

                    capsules_index.append({
                        "name": name,
                        "version": version,
                        "ecosystem": "npm",
                        "path": str(capsule_file.relative_to(output_path))
                    })

                    stats["capsules_created"] += 1

                except Exception as e:
                    stats["errors"].append(f"{name}@{version}: {str(e)}")

    # Process Python docs (same flow as the npm branch above)
    py_dir = docs_path / "py"
    if py_dir.exists():
        for pkg_dir in py_dir.iterdir():
            if not pkg_dir.is_dir():
                continue

            # Extract package name and version
            pkg_info = pkg_dir.name  # format: name@version
            if "@" not in pkg_info:
                stats["skipped"] += 1
                continue

            name_version = pkg_info.rsplit("@", 1)
            if len(name_version) != 2:
                stats["skipped"] += 1
                continue

            name, version = name_version

            # Check if in workset
            if relevant_deps and f"py:{name}" not in relevant_deps:
                stats["skipped"] += 1
                continue

            stats["total_docs"] += 1

            # Create capsule
            doc_file = pkg_dir / "doc.md"
            meta_file = pkg_dir / "meta.json"

            if doc_file.exists():
                try:
                    capsule = _create_capsule(
                        doc_file, meta_file, name, version, "py", max_capsule_lines
                    )

                    # Write capsule
                    capsule_file = output_path / f"py__{name}@{version}.md"
                    with open(capsule_file, "w", encoding="utf-8") as f:
                        f.write(capsule)

                    capsules_index.append({
                        "name": name,
                        "version": version,
                        "ecosystem": "py",
                        "path": str(capsule_file.relative_to(output_path))
                    })

                    stats["capsules_created"] += 1

                except Exception as e:
                    stats["errors"].append(f"{name}@{version}: {str(e)}")

    # Write index
    index_file = output_path.parent / "doc_index.json"
    with open(index_file, "w", encoding="utf-8") as f:
        json.dump({
            "created_at": datetime.now().isoformat(),
            "capsules": capsules_index,
            "stats": stats
        }, f, indent=2)

    return stats
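

# A minimal usage sketch (illustrative): the import path follows this file's
# location in the package, and the workset path shown is hypothetical.
#
#     from theauditor.docs_summarize import summarize_docs
#
#     stats = summarize_docs(workset_path="./.pf/workset.json")
#     print(f"created={stats['capsules_created']}, skipped={stats['skipped']}")
#     for err in stats["errors"]:
#         print(f"failed: {err}")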


def _load_workset_deps(workset_path: str) -> Set[str]:
    """
    Load relevant dependencies from workset.
    Returns a set of "manager:name" keys.
    """
    relevant = set()

    try:
        with open(workset_path, encoding="utf-8") as f:
            workset = json.load(f)

        # Extract imported packages from workset files.
        # Simplified: a full implementation would parse each file's imports.
        for file_info in workset.get("files", []):
            path = file_info.get("path", "")

            # Simple heuristic: look at the file extension
            if path.endswith((".js", ".ts", ".jsx", ".tsx")):
                # Would parse imports/requires; for now, include all npm deps
                relevant.add("npm:*")
            elif path.endswith(".py"):
                # Would parse imports; for now, include all py deps
                relevant.add("py:*")

    except (json.JSONDecodeError, KeyError):
        pass

    # If we couldn't determine specific deps, include all
    if not relevant or "npm:*" in relevant or "py:*" in relevant:
        return set()  # Empty set means include all

    return relevant
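

# The workset file read above is expected to look roughly like this
# (illustrative; only the "files" list and each entry's "path" are consulted):
#
#     {
#       "files": [
#         {"path": "src/app.ts"},
#         {"path": "scripts/build.py"}
#       ]
#     }
#
# Note that with the current heuristic, any .js/.ts or .py path adds only the
# wildcard "npm:*" / "py:*", so the function returns the empty set
# ("include all") in practice.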


def _create_capsule(
    doc_file: Path,
    meta_file: Path,
    name: str,
    version: str,
    ecosystem: str,
    max_lines: int
) -> str:
    """Create a concise capsule from documentation."""

    # Read documentation
    with open(doc_file, encoding="utf-8") as f:
        content = f.read()

    # Read metadata
    meta = {}
    if meta_file.exists():
        try:
            with open(meta_file, encoding="utf-8") as f:
                meta = json.load(f)
        except json.JSONDecodeError:
            pass

    # Extract key sections
    sections = {
        "init": _extract_initialization(content, ecosystem),
        "apis": _extract_top_apis(content),
        "examples": _extract_examples(content),
    }

    # Build capsule
    capsule_lines = [
        f"# {name}@{version} ({ecosystem})",
        "",
        "## Quick Start",
        ""
    ]

    if sections["init"]:
        capsule_lines.extend(sections["init"][:10])  # Limit lines
        capsule_lines.append("")
    elif content:  # No structured init found; fall back to the first raw lines
        content_lines = content.split("\n")[:10]
        capsule_lines.extend(content_lines)
        capsule_lines.append("")

    if sections["apis"]:
        capsule_lines.append("## Top APIs")
        capsule_lines.append("")
        capsule_lines.extend(sections["apis"][:15])  # Limit lines
        capsule_lines.append("")

    if sections["examples"]:
        capsule_lines.append("## Examples")
        capsule_lines.append("")
        capsule_lines.extend(sections["examples"][:15])  # Limit lines
        capsule_lines.append("")

    # Add reference to full documentation
    capsule_lines.append("## 📄 Full Documentation Available")
    capsule_lines.append("")
    # Path is relative to the project root
    full_doc_path = f"./.pf/context/docs/{ecosystem}/{name}@{version}/doc.md"
    capsule_lines.append(f"**Full content**: `{full_doc_path}`")

    # Count lines in the full doc if it exists
    if doc_file.exists():
        try:
            with open(doc_file, encoding="utf-8") as f:
                line_count = len(f.readlines())
            capsule_lines.append(f"**Size**: {line_count} lines")
        except Exception:
            pass

    capsule_lines.append("")

    # Add source info
    capsule_lines.append("## Source")
    capsule_lines.append("")
    capsule_lines.append(f"- URL: {meta.get('source_url', '')}")
    capsule_lines.append(f"- Fetched: {meta.get('last_checked', '')}")

    # Truncate if too long
    if len(capsule_lines) > max_lines:
        # Keep the full doc reference even when truncating
        keep_lines = capsule_lines[:max_lines - 7]  # Leave room for reference and marker
        ref_lines = [
            line for line in capsule_lines
            if "Full Documentation Available" in line
            or "Full content" in line
            or "Size" in line
        ]
        capsule_lines = keep_lines + ["", "...", "(truncated)", ""] + ref_lines

    return "\n".join(capsule_lines)
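

# A capsule produced above looks roughly like this (illustrative package and
# values):
#
#     # left-pad@1.3.0 (npm)
#
#     ## Quick Start
#     ```javascript
#     const leftPad = require('left-pad')
#     ```
#
#     ## 📄 Full Documentation Available
#     **Full content**: `./.pf/context/docs/npm/left-pad@1.3.0/doc.md`
#     **Size**: 120 lines
#
#     ## Source
#     - URL: https://www.npmjs.com/package/left-pad
#     - Fetched: 2025-01-01T00:00:00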


def _extract_initialization(content: str, ecosystem: str) -> List[str]:
    """Extract initialization/installation snippets."""
    lines = []

    # Look for an installation section
    install_patterns = [
        r"## Install\w*",
        r"## Getting Started",
        r"## Quick Start",
        r"### Install\w*",
    ]

    for pattern in install_patterns:
        match = re.search(pattern, content, re.IGNORECASE | re.MULTILINE)
        if match:
            # Extract the next code block
            start = match.end()
            code_match = re.search(r"```(\w*)\n(.*?)```", content[start:], re.DOTALL)
            if code_match:
                lines.append(f"```{code_match.group(1)}")
                lines.extend(code_match.group(2).strip().split("\n")[:5])
                lines.append("```")
                break

    # Fallback: look for common import/require patterns
    if not lines:
        if ecosystem == "npm":
            if "require(" in content:
                match = re.search(r"(const|var|let)\s+\w+\s*=\s*require\([^)]+\)", content)
                if match:
                    lines = ["```javascript", match.group(0), "```"]
            elif "import " in content:
                match = re.search(r"import\s+.*?from\s+['\"][^'\"]+['\"]", content)
                if match:
                    lines = ["```javascript", match.group(0), "```"]
        elif ecosystem == "py":
            if "import " in content:
                match = re.search(r"import\s+\w+", content)
                if match:
                    lines = ["```python", match.group(0), "```"]
            elif "from " in content:
                match = re.search(r"from\s+\w+\s+import\s+\w+", content)
                if match:
                    lines = ["```python", match.group(0), "```"]

    return lines
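

# For example, a README fragment like the following (illustrative) matches the
# "## Install\w*" pattern, and the first fenced block after it is captured:
#
#     ## Installation
#
#     ```bash
#     npm install some-package
#     ```
#
# yielding ["```bash", "npm install some-package", "```"].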


def _extract_top_apis(content: str) -> List[str]:
    """Extract top API methods."""
    lines = []

    # Look for an API section
    api_patterns = [
        r"## API",
        r"## Methods",
        r"## Functions",
        r"### API",
    ]

    for pattern in api_patterns:
        match = re.search(pattern, content, re.IGNORECASE | re.MULTILINE)
        if match:
            start = match.end()
            # Extract the next few method signatures from bullet items
            method_matches = re.findall(
                r"^[\*\-]\s*`([^`]+)`",
                content[start:start + 2000],
                re.MULTILINE
            )
            for method in method_matches[:5]:  # Top 5 methods
                lines.append(f"- `{method}`")
            break

    # Fallback: look for function definitions in code blocks
    if not lines:
        code_blocks = re.findall(r"```\w*\n(.*?)```", content, re.DOTALL)
        for block in code_blocks[:2]:  # Check the first 2 code blocks
            # Look for function signatures
            funcs = re.findall(r"(?:function|def|const|let|var)\s+(\w+)\s*\(([^)]*)\)", block)
            for func_name, params in funcs[:5]:
                lines.append(f"- `{func_name}({params})`")
            if lines:
                break

    return lines
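

# Given an API section written as bulleted, backticked signatures, e.g.
# (illustrative):
#
#     ## API
#     - `connect(url, options)`
#     - `disconnect()`
#
# the bullet regex above yields up to five lines such as
# "- `connect(url, options)`".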


def _extract_examples(content: str) -> List[str]:
    """Extract usage examples."""
    lines = []

    # Look for an examples section
    example_patterns = [
        r"## Example",
        r"## Usage",
        r"### Example",
        r"### Usage",
    ]

    for pattern in example_patterns:
        match = re.search(pattern, content, re.IGNORECASE | re.MULTILINE)
        if match:
            start = match.end()
            # Extract the next code block
            code_match = re.search(r"```(\w*)\n(.*?)```", content[start:], re.DOTALL)
            if code_match:
                lang = code_match.group(1) or "javascript"
                code_lines = code_match.group(2).strip().split("\n")[:10]  # Max 10 lines
                lines.append(f"```{lang}")
                lines.extend(code_lines)
                lines.append("```")
                break

    return lines
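

# For reference, the doc_index.json written by summarize_docs has this shape
# (illustrative values):
#
#     {
#       "created_at": "2025-01-01T00:00:00",
#       "capsules": [
#         {
#           "name": "left-pad",
#           "version": "1.3.0",
#           "ecosystem": "npm",
#           "path": "npm__left-pad@1.3.0.md"
#         }
#       ],
#       "stats": {"total_docs": 1, "capsules_created": 1, "skipped": 0, "errors": []}
#     }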