# Auditor/theauditor/docs_summarize.py
"""Documentation summarizer for creating concise doc capsules."""
import json
import re
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Set

def summarize_docs(
    docs_dir: str = "./.pf/context/docs",
    output_dir: str = "./.pf/context/doc_capsules",
    workset_path: Optional[str] = None,
    max_capsule_lines: int = 50,
) -> Dict[str, Any]:
    """
    Generate concise doc capsules from fetched documentation.

    Args:
        docs_dir: Directory containing fetched docs.
        output_dir: Directory for output capsules.
        workset_path: Optional workset to filter relevant deps.
        max_capsule_lines: Maximum lines per capsule.

    Returns:
        Summary statistics.
    """
    docs_path = Path(docs_dir)
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Load workset if provided
    relevant_deps = None
    if workset_path and Path(workset_path).exists():
        relevant_deps = _load_workset_deps(workset_path)

    stats = {
        "total_docs": 0,
        "capsules_created": 0,
        "skipped": 0,
        "errors": [],
    }
    capsules_index = []

    # Process npm docs
    npm_dir = docs_path / "npm"
    if npm_dir.exists():
        for pkg_dir in npm_dir.iterdir():
            if not pkg_dir.is_dir():
                continue

            # Extract package name and version
            pkg_info = pkg_dir.name  # format: name@version
            if "@" not in pkg_info:
                stats["skipped"] += 1
                continue
            name_version = pkg_info.rsplit("@", 1)
            if len(name_version) != 2:
                stats["skipped"] += 1
                continue
            name, version = name_version

            # Check if in workset
            if relevant_deps and f"npm:{name}" not in relevant_deps:
                stats["skipped"] += 1
                continue

            stats["total_docs"] += 1

            # Create capsule
            doc_file = pkg_dir / "doc.md"
            meta_file = pkg_dir / "meta.json"
            if doc_file.exists():
                try:
                    capsule = _create_capsule(
                        doc_file, meta_file, name, version, "npm", max_capsule_lines
                    )
                    # Write capsule
                    capsule_file = output_path / f"npm__{name}@{version}.md"
                    with open(capsule_file, "w", encoding="utf-8") as f:
                        f.write(capsule)
                    capsules_index.append({
                        "name": name,
                        "version": version,
                        "ecosystem": "npm",
                        "path": str(capsule_file.relative_to(output_path)),
                    })
                    stats["capsules_created"] += 1
                except Exception as e:
                    stats["errors"].append(f"{name}@{version}: {str(e)}")

    # Process Python docs
    py_dir = docs_path / "py"
    if py_dir.exists():
        for pkg_dir in py_dir.iterdir():
            if not pkg_dir.is_dir():
                continue

            # Extract package name and version
            pkg_info = pkg_dir.name  # format: name@version
            if "@" not in pkg_info:
                stats["skipped"] += 1
                continue
            name_version = pkg_info.rsplit("@", 1)
            if len(name_version) != 2:
                stats["skipped"] += 1
                continue
            name, version = name_version

            # Check if in workset
            if relevant_deps and f"py:{name}" not in relevant_deps:
                stats["skipped"] += 1
                continue

            stats["total_docs"] += 1

            # Create capsule
            doc_file = pkg_dir / "doc.md"
            meta_file = pkg_dir / "meta.json"
            if doc_file.exists():
                try:
                    capsule = _create_capsule(
                        doc_file, meta_file, name, version, "py", max_capsule_lines
                    )
                    # Write capsule
                    capsule_file = output_path / f"py__{name}@{version}.md"
                    with open(capsule_file, "w", encoding="utf-8") as f:
                        f.write(capsule)
                    capsules_index.append({
                        "name": name,
                        "version": version,
                        "ecosystem": "py",
                        "path": str(capsule_file.relative_to(output_path)),
                    })
                    stats["capsules_created"] += 1
                except Exception as e:
                    stats["errors"].append(f"{name}@{version}: {str(e)}")

    # Write index
    index_file = output_path.parent / "doc_index.json"
    with open(index_file, "w", encoding="utf-8") as f:
        json.dump({
            "created_at": datetime.now().isoformat(),
            "capsules": capsules_index,
            "stats": stats,
        }, f, indent=2)
    return stats
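

# For reference, the index written by summarize_docs() has this shape; the
# keys come straight from the json.dump() call above, but the values shown
# are illustrative, not taken from a real run:
#
#   {
#     "created_at": "2025-01-01T00:00:00",
#     "capsules": [
#       {"name": "lodash", "version": "4.17.21", "ecosystem": "npm",
#        "path": "npm__lodash@4.17.21.md"}
#     ],
#     "stats": {"total_docs": 1, "capsules_created": 1, "skipped": 0, "errors": []}
#   }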


def _load_workset_deps(workset_path: str) -> Set[str]:
    """
    Load relevant dependencies from workset.

    Returns set of "manager:name" keys.
    """
    relevant = set()
    try:
        with open(workset_path, encoding="utf-8") as f:
            workset = json.load(f)
        # Extract imported packages from workset files.
        # This is a simplified version; real import parsing would be more
        # involved (see the illustrative sketch after this function).
        for file_info in workset.get("files", []):
            path = file_info.get("path", "")
            # Simple heuristic: look at the file extension
            if path.endswith((".js", ".ts", ".jsx", ".tsx")):
                # Would parse imports/requires; for now, include all npm deps
                relevant.add("npm:*")
            elif path.endswith(".py"):
                # Would parse imports; for now, include all py deps
                relevant.add("py:*")
    except (OSError, json.JSONDecodeError, KeyError):
        pass

    # If we couldn't determine specific deps, include all
    if not relevant or "npm:*" in relevant or "py:*" in relevant:
        return set()  # Empty set means include all
    return relevant
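

# The loop above only records wildcard keys; a minimal sketch of the "more
# sophisticated parsing" it alludes to might look like the helper below.
# The helper name, regexes, and file-reading behavior are illustrative
# assumptions, not part of the module's current behavior.
def _parse_imports_sketch(path: Path) -> Set[str]:
    """Illustrative only: derive "manager:name" keys from one source file."""
    keys: Set[str] = set()
    try:
        text = path.read_text(encoding="utf-8", errors="ignore")
    except OSError:
        return keys
    if path.suffix in {".js", ".ts", ".jsx", ".tsx"}:
        # Match `import ... from 'pkg'` and `require('pkg')`; skip relative
        # paths and keep only the package portion of the specifier.
        for m in re.finditer(r"(?:from\s+|require\()\s*['\"]([^'\"]+)['\"]", text):
            spec = m.group(1)
            if spec.startswith("."):
                continue
            parts = spec.split("/")
            pkg = "/".join(parts[:2]) if spec.startswith("@") else parts[0]
            keys.add(f"npm:{pkg}")
    elif path.suffix == ".py":
        # Match top-level `import pkg` / `from pkg import ...` statements.
        for m in re.finditer(r"^(?:import|from)\s+([A-Za-z_]\w*)", text, re.MULTILINE):
            keys.add(f"py:{m.group(1)}")
    return keys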


def _create_capsule(
    doc_file: Path,
    meta_file: Path,
    name: str,
    version: str,
    ecosystem: str,
    max_lines: int,
) -> str:
    """Create a concise capsule from documentation."""
    # Read documentation
    with open(doc_file, encoding="utf-8") as f:
        content = f.read()

    # Read metadata
    meta = {}
    if meta_file.exists():
        try:
            with open(meta_file, encoding="utf-8") as f:
                meta = json.load(f)
        except json.JSONDecodeError:
            pass

    # Extract key sections
    sections = {
        "init": _extract_initialization(content, ecosystem),
        "apis": _extract_top_apis(content),
        "examples": _extract_examples(content),
    }

    # Build capsule
    capsule_lines = [
        f"# {name}@{version} ({ecosystem})",
        "",
        "## Quick Start",
        "",
    ]
    if sections["init"]:
        capsule_lines.extend(sections["init"][:10])  # Limit lines
        capsule_lines.append("")
    elif content:
        # No structured init section found; fall back to the raw opening lines
        content_lines = content.split("\n")[:10]
        capsule_lines.extend(content_lines)
        capsule_lines.append("")

    if sections["apis"]:
        capsule_lines.append("## Top APIs")
        capsule_lines.append("")
        capsule_lines.extend(sections["apis"][:15])  # Limit lines
        capsule_lines.append("")

    if sections["examples"]:
        capsule_lines.append("## Examples")
        capsule_lines.append("")
        capsule_lines.extend(sections["examples"][:15])  # Limit lines
        capsule_lines.append("")

    # Add reference to full documentation
    capsule_lines.append("## 📄 Full Documentation Available")
    capsule_lines.append("")
    # Relative path from the project root to the full fetched doc
    full_doc_path = f"./.pf/context/docs/{ecosystem}/{name}@{version}/doc.md"
    capsule_lines.append(f"**Full content**: `{full_doc_path}`")
    # Report the size of the full doc; `content` was already read above,
    # so there is no need to reopen the file to count lines
    line_count = len(content.splitlines())
    capsule_lines.append(f"**Size**: {line_count} lines")
    capsule_lines.append("")

    # Add source info
    capsule_lines.append("## Source")
    capsule_lines.append("")
    capsule_lines.append(f"- URL: {meta.get('source_url', '')}")
    capsule_lines.append(f"- Fetched: {meta.get('last_checked', '')}")

    # Truncate if too long, keeping the full-doc reference lines
    if len(capsule_lines) > max_lines:
        keep_lines = capsule_lines[:max_lines - 7]  # Leave room for reference and truncation marker
        ref_lines = [
            line for line in capsule_lines
            if "Full Documentation Available" in line
            or "Full content" in line
            or "**Size**" in line
        ]
        capsule_lines = keep_lines + ["", "...", "(truncated)", ""] + ref_lines
    return "\n".join(capsule_lines)


def _extract_initialization(content: str, ecosystem: str) -> List[str]:
    """Extract initialization/installation snippets."""
    lines = []

    # Look for an installation heading (anchored to line start, hence MULTILINE)
    install_patterns = [
        r"^## Install\w*",
        r"^## Getting Started",
        r"^## Quick Start",
        r"^### Install\w*",
    ]
    for pattern in install_patterns:
        match = re.search(pattern, content, re.IGNORECASE | re.MULTILINE)
        if match:
            # Extract the next code block
            start = match.end()
            code_match = re.search(r"```(\w*)\n(.*?)```", content[start:], re.DOTALL)
            if code_match:
                lines.append(f"```{code_match.group(1)}")
                lines.extend(code_match.group(2).strip().split("\n")[:5])
                lines.append("```")
            break

    # Fallback: look for common import patterns
    if not lines:
        if ecosystem == "npm":
            if "require(" in content:
                match = re.search(r"(const|var|let)\s+\w+\s*=\s*require\([^)]+\)", content)
                if match:
                    lines = ["```javascript", match.group(0), "```"]
            elif "import " in content:
                match = re.search(r"import\s+.*?from\s+['\"][^'\"]+['\"]", content)
                if match:
                    lines = ["```javascript", match.group(0), "```"]
        elif ecosystem == "py":
            if "import " in content:
                match = re.search(r"import\s+\w+", content)
                if match:
                    lines = ["```python", match.group(0), "```"]
            elif "from " in content:
                match = re.search(r"from\s+\w+\s+import\s+\w+", content)
                if match:
                    lines = ["```python", match.group(0), "```"]
    return lines


def _extract_top_apis(content: str) -> List[str]:
    """Extract top API methods."""
    lines = []

    # Look for an API heading (anchored to line start, hence MULTILINE)
    api_patterns = [
        r"^## API",
        r"^## Methods",
        r"^## Functions",
        r"^### API",
    ]
    for pattern in api_patterns:
        match = re.search(pattern, content, re.IGNORECASE | re.MULTILINE)
        if match:
            start = match.end()
            # Extract the next few method signatures from bullet lists
            method_matches = re.findall(
                r"^[\*\-]\s*`([^`]+)`",
                content[start:start + 2000],
                re.MULTILINE,
            )
            for method in method_matches[:5]:  # Top 5 methods
                lines.append(f"- `{method}`")
            break

    # Fallback: look for function definitions in code blocks
    if not lines:
        code_blocks = re.findall(r"```\w*\n(.*?)```", content, re.DOTALL)
        for block in code_blocks[:2]:  # Check first 2 code blocks
            # Look for function signatures
            funcs = re.findall(r"(?:function|def|const|let|var)\s+(\w+)\s*\(([^)]*)\)", block)
            for func_name, params in funcs[:5]:
                lines.append(f"- `{func_name}({params})`")
            if lines:
                break
    return lines


def _extract_examples(content: str) -> List[str]:
    """Extract usage examples."""
    lines = []

    # Look for an examples/usage heading (anchored to line start, hence MULTILINE)
    example_patterns = [
        r"^## Example",
        r"^## Usage",
        r"^### Example",
        r"^### Usage",
    ]
    for pattern in example_patterns:
        match = re.search(pattern, content, re.IGNORECASE | re.MULTILINE)
        if match:
            start = match.end()
            # Extract the next code block
            code_match = re.search(r"```(\w*)\n(.*?)```", content[start:], re.DOTALL)
            if code_match:
                lang = code_match.group(1) or "javascript"
                code_lines = code_match.group(2).strip().split("\n")[:10]  # Max 10 lines
                lines.append(f"```{lang}")
                lines.extend(code_lines)
                lines.append("```")
            break
    return lines
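

# A minimal usage sketch, assuming the module's default paths and a run from
# the project root. This guard is illustrative, not part of the original file;
# the printed keys come from the stats dict that summarize_docs() returns.
if __name__ == "__main__":
    result = summarize_docs(
        docs_dir="./.pf/context/docs",
        output_dir="./.pf/context/doc_capsules",
    )
    print(f"Capsules created: {result['capsules_created']}")
    print(f"Skipped: {result['skipped']}")
    if result["errors"]:
        print(f"Errors: {len(result['errors'])}")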