"""Documentation summarizer for creating concise doc capsules."""

import json
import re
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Set


def summarize_docs(
    docs_dir: str = "./.pf/context/docs",
    output_dir: str = "./.pf/context/doc_capsules",
    workset_path: Optional[str] = None,
    max_capsule_lines: int = 50,
) -> Dict[str, Any]:
    """
    Generate concise doc capsules from fetched documentation.

    Args:
        docs_dir: Directory containing fetched docs
        output_dir: Directory for output capsules
        workset_path: Optional workset to filter relevant deps
        max_capsule_lines: Maximum lines per capsule

    Returns:
        Summary statistics
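
    Example (illustrative; assumes docs have already been fetched into the
    default directory layout, and that a workset file exists at this path):
        stats = summarize_docs(workset_path="./.pf/context/workset.json")
        print(stats["capsules_created"], "capsules created")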
    """

    docs_path = Path(docs_dir)
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Load workset if provided
    relevant_deps = None
    if workset_path and Path(workset_path).exists():
        relevant_deps = _load_workset_deps(workset_path)

    stats = {
        "total_docs": 0,
        "capsules_created": 0,
        "skipped": 0,
        "errors": [],
    }

    capsules_index = []

    # Process npm and Python docs; each package directory is named "name@version"
    for ecosystem in ("npm", "py"):
        eco_dir = docs_path / ecosystem
        if not eco_dir.exists():
            continue

        for pkg_dir in eco_dir.iterdir():
            if not pkg_dir.is_dir():
                continue

            # Extract package name and version; rsplit keeps scoped npm
            # names such as "@scope/pkg" intact
            pkg_info = pkg_dir.name
            if "@" not in pkg_info:
                stats["skipped"] += 1
                continue

            name_version = pkg_info.rsplit("@", 1)
            if len(name_version) != 2:
                stats["skipped"] += 1
                continue

            name, version = name_version

            # Skip packages outside the workset, when one was provided
            if relevant_deps and f"{ecosystem}:{name}" not in relevant_deps:
                stats["skipped"] += 1
                continue

            stats["total_docs"] += 1

            # Create capsule
            doc_file = pkg_dir / "doc.md"
            meta_file = pkg_dir / "meta.json"

            if not doc_file.exists():
                continue

            try:
                capsule = _create_capsule(
                    doc_file, meta_file, name, version, ecosystem, max_capsule_lines
                )

                # Write capsule
                capsule_file = output_path / f"{ecosystem}__{name}@{version}.md"
                with open(capsule_file, "w", encoding="utf-8") as f:
                    f.write(capsule)

                capsules_index.append({
                    "name": name,
                    "version": version,
                    "ecosystem": ecosystem,
                    "path": str(capsule_file.relative_to(output_path)),
                })

                stats["capsules_created"] += 1

            except Exception as e:
                stats["errors"].append(f"{name}@{version}: {e}")

    # Write index
    index_file = output_path.parent / "doc_index.json"
    with open(index_file, "w", encoding="utf-8") as f:
        json.dump({
            "created_at": datetime.now().isoformat(),
            "capsules": capsules_index,
            "stats": stats,
        }, f, indent=2)

    return stats


def _load_workset_deps(workset_path: str) -> Set[str]:
    """
    Load relevant dependencies from a workset file.

    Returns a set of "manager:name" keys (e.g. "npm:lodash", "py:requests");
    an empty set means "include all dependencies".
    """

    relevant = set()

    try:
        with open(workset_path, encoding="utf-8") as f:
            workset = json.load(f)

        # Extract imported packages from the workset files. This is a
        # simplified version; real import/require parsing would be needed
        # to narrow the set down to specific packages.
        for file_info in workset.get("files", []):
            path = file_info.get("path", "")

            # Simple heuristic: look at the file extension
            if path.endswith((".js", ".ts", ".jsx", ".tsx")):
                # Would parse imports/requires; for now, include all npm deps
                relevant.add("npm:*")
            elif path.endswith(".py"):
                # Would parse imports; for now, include all py deps
                relevant.add("py:*")

    except (json.JSONDecodeError, KeyError):
        pass

    # If we couldn't determine specific deps, include all
    if not relevant or "npm:*" in relevant or "py:*" in relevant:
        return set()  # Empty set means include all

    return relevant
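

# A fuller version of _load_workset_deps could parse Python imports rather
# than relying on file extensions alone. A minimal sketch (hypothetical
# helper, not part of this module), using only the stdlib ast API:
#
#     import ast
#
#     def _py_imports(source: str) -> Set[str]:
#         """Collect top-level module names imported by a Python file."""
#         names: Set[str] = set()
#         for node in ast.walk(ast.parse(source)):
#             if isinstance(node, ast.Import):
#                 names.update(alias.name.split(".")[0] for alias in node.names)
#             elif isinstance(node, ast.ImportFrom) and node.module:
#                 names.add(node.module.split(".")[0])
#         return names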


def _create_capsule(
    doc_file: Path,
    meta_file: Path,
    name: str,
    version: str,
    ecosystem: str,
    max_lines: int,
) -> str:
    """Create a concise capsule from documentation."""

    # Read documentation
    with open(doc_file, encoding="utf-8") as f:
        content = f.read()

    # Read metadata
    meta = {}
    if meta_file.exists():
        try:
            with open(meta_file, encoding="utf-8") as f:
                meta = json.load(f)
        except json.JSONDecodeError:
            pass

    # Extract key sections
    sections = {
        "init": _extract_initialization(content, ecosystem),
        "apis": _extract_top_apis(content),
        "examples": _extract_examples(content),
    }

    # Build capsule
    capsule_lines = [
        f"# {name}@{version} ({ecosystem})",
        "",
        "## Quick Start",
        "",
    ]

    if sections["init"]:
        capsule_lines.extend(sections["init"][:10])  # Limit lines
        capsule_lines.append("")
    elif content:
        # No structured init section found; fall back to the first raw lines
        content_lines = content.split("\n")[:10]
        capsule_lines.extend(content_lines)
        capsule_lines.append("")

    if sections["apis"]:
        capsule_lines.append("## Top APIs")
        capsule_lines.append("")
        capsule_lines.extend(sections["apis"][:15])  # Limit lines
        capsule_lines.append("")

    if sections["examples"]:
        capsule_lines.append("## Examples")
        capsule_lines.append("")
        capsule_lines.extend(sections["examples"][:15])  # Limit lines
        capsule_lines.append("")

    # Add reference to full documentation
    capsule_lines.append("## 📄 Full Documentation Available")
    capsule_lines.append("")
    # Path to the full doc, relative to the project root
    full_doc_path = f"./.pf/context/docs/{ecosystem}/{name}@{version}/doc.md"
    capsule_lines.append(f"**Full content**: `{full_doc_path}`")

    # Count the lines in the full doc if it exists
    if doc_file.exists():
        try:
            with open(doc_file, encoding="utf-8") as f:
                line_count = len(f.readlines())
            capsule_lines.append(f"**Size**: {line_count} lines")
        except Exception:
            pass

    capsule_lines.append("")

    # Add source info
    capsule_lines.append("## Source")
    capsule_lines.append("")
    capsule_lines.append(f"- URL: {meta.get('source_url', '')}")
    capsule_lines.append(f"- Fetched: {meta.get('last_checked', '')}")

    # Truncate if too long, but keep the full-doc reference lines
    if len(capsule_lines) > max_lines:
        keep_lines = capsule_lines[:max_lines - 7]  # Leave room for reference and truncation marker
        ref_lines = [
            line for line in capsule_lines
            if "Full Documentation Available" in line
            or "Full content" in line
            or "Size" in line
        ]
        capsule_lines = keep_lines + ["", "... (truncated)", ""] + ref_lines

    return "\n".join(capsule_lines)
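

# For reference, a generated capsule looks roughly like this (illustrative
# sketch; the package name and section contents are hypothetical):
#
#     # lodash@4.17.21 (npm)
#
#     ## Quick Start
#     ...
#
#     ## 📄 Full Documentation Available
#
#     **Full content**: `./.pf/context/docs/npm/lodash@4.17.21/doc.md`
#     **Size**: 1200 lines
#
#     ## Source
#
#     - URL: ...
#     - Fetched: ...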


def _extract_initialization(content: str, ecosystem: str) -> List[str]:
    """Extract initialization/installation snippets."""
    lines = []

    # Look for an installation section heading
    install_patterns = [
        r"## Install\w*",
        r"## Getting Started",
        r"## Quick Start",
        r"### Install\w*",
    ]

    for pattern in install_patterns:
        match = re.search(pattern, content, re.IGNORECASE | re.MULTILINE)
        if match:
            # Extract the next code block after the heading
            start = match.end()
            code_match = re.search(r"```(\w*)\n(.*?)```", content[start:], re.DOTALL)
            if code_match:
                lines.append(f"```{code_match.group(1)}")
                lines.extend(code_match.group(2).strip().split("\n")[:5])
                lines.append("```")
                break

    # Fallback: look for common import/require patterns
    if not lines:
        if ecosystem == "npm":
            if "require(" in content:
                match = re.search(r"(const|var|let)\s+\w+\s*=\s*require\([^)]+\)", content)
                if match:
                    lines = ["```javascript", match.group(0), "```"]
            elif "import " in content:
                match = re.search(r"import\s+.*?from\s+['\"][^'\"]+['\"]", content)
                if match:
                    lines = ["```javascript", match.group(0), "```"]
        elif ecosystem == "py":
            # Check the "from x import y" form first: a plain "import "
            # substring test would also match it and clip the statement
            match = re.search(r"from\s+\w+\s+import\s+\w+", content)
            if match:
                lines = ["```python", match.group(0), "```"]
            elif "import " in content:
                match = re.search(r"import\s+\w+", content)
                if match:
                    lines = ["```python", match.group(0), "```"]

    return lines
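
# Example behavior (illustrative): for a doc containing
#
#     ## Installation
#     ```bash
#     npm install left-pad
#     ```
#
# _extract_initialization returns ["```bash", "npm install left-pad", "```"]
# (opening fence with language, up to five snippet lines, closing fence).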


def _extract_top_apis(content: str) -> List[str]:
    """Extract top API methods."""
    lines = []

    # Look for an API section heading
    api_patterns = [
        r"## API",
        r"## Methods",
        r"## Functions",
        r"### API",
    ]

    for pattern in api_patterns:
        match = re.search(pattern, content, re.IGNORECASE | re.MULTILINE)
        if match:
            start = match.end()
            # Extract the next few method signatures
            method_matches = re.findall(
                r"^[\*\-]\s*`([^`]+)`",
                content[start:start + 2000],
                re.MULTILINE,
            )
            for method in method_matches[:5]:  # Top 5 methods
                lines.append(f"- `{method}`")
            break

    # Fallback: look for function definitions in code blocks
    if not lines:
        code_blocks = re.findall(r"```\w*\n(.*?)```", content, re.DOTALL)
        for block in code_blocks[:2]:  # Check first 2 code blocks
            # Look for function signatures
            funcs = re.findall(r"(?:function|def|const|let|var)\s+(\w+)\s*\(([^)]*)\)", block)
            for func_name, params in funcs[:5]:
                lines.append(f"- `{func_name}({params})`")
            if lines:
                break

    return lines
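
# Example behavior (illustrative): under an "## API" heading, a bullet line
# such as "- `chunk(array, size)`" is captured by the pattern above and
# re-emitted verbatim in the capsule's "Top APIs" list.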


def _extract_examples(content: str) -> List[str]:
    """Extract usage examples."""
    lines = []

    # Look for an examples/usage section heading
    example_patterns = [
        r"## Example",
        r"## Usage",
        r"### Example",
        r"### Usage",
    ]

    for pattern in example_patterns:
        match = re.search(pattern, content, re.IGNORECASE | re.MULTILINE)
        if match:
            start = match.end()
            # Extract the next code block after the heading
            code_match = re.search(r"```(\w*)\n(.*?)```", content[start:], re.DOTALL)
            if code_match:
                lang = code_match.group(1) or "javascript"
                code_lines = code_match.group(2).strip().split("\n")[:10]  # Max 10 lines
                lines.append(f"```{lang}")
                lines.extend(code_lines)
                lines.append("```")
                break

    return lines
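

if __name__ == "__main__":
    # Ad-hoc smoke test (a convenience addition, not part of the original
    # pipeline): summarize docs using the default paths and print the stats.
    print(json.dumps(summarize_docs(), indent=2))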