"""Documentation summarizer for creating concise doc capsules.""" import json import re from datetime import datetime from pathlib import Path from typing import Any, Dict, List, Optional, Set def summarize_docs( docs_dir: str = "./.pf/context/docs", output_dir: str = "./.pf/context/doc_capsules", workset_path: Optional[str] = None, max_capsule_lines: int = 50 ) -> Dict[str, Any]: """ Generate concise doc capsules from fetched documentation. Args: docs_dir: Directory containing fetched docs output_dir: Directory for output capsules workset_path: Optional workset to filter relevant deps max_capsule_lines: Maximum lines per capsule Returns: Summary statistics """ docs_path = Path(docs_dir) output_path = Path(output_dir) output_path.mkdir(parents=True, exist_ok=True) # Load workset if provided relevant_deps = None if workset_path and Path(workset_path).exists(): relevant_deps = _load_workset_deps(workset_path) stats = { "total_docs": 0, "capsules_created": 0, "skipped": 0, "errors": [] } capsules_index = [] # Process npm docs npm_dir = docs_path / "npm" if npm_dir.exists(): for pkg_dir in npm_dir.iterdir(): if not pkg_dir.is_dir(): continue # Extract package name and version pkg_info = pkg_dir.name # format: name@version if "@" not in pkg_info: stats["skipped"] += 1 continue name_version = pkg_info.rsplit("@", 1) if len(name_version) != 2: stats["skipped"] += 1 continue name, version = name_version # Check if in workset if relevant_deps and f"npm:{name}" not in relevant_deps: stats["skipped"] += 1 continue stats["total_docs"] += 1 # Create capsule doc_file = pkg_dir / "doc.md" meta_file = pkg_dir / "meta.json" if doc_file.exists(): try: capsule = _create_capsule( doc_file, meta_file, name, version, "npm", max_capsule_lines ) # Write capsule capsule_file = output_path / f"npm__{name}@{version}.md" with open(capsule_file, "w", encoding="utf-8") as f: f.write(capsule) capsules_index.append({ "name": name, "version": version, "ecosystem": "npm", "path": str(capsule_file.relative_to(output_path)) }) stats["capsules_created"] += 1 except Exception as e: stats["errors"].append(f"{name}@{version}: {str(e)}") # Process Python docs py_dir = docs_path / "py" if py_dir.exists(): for pkg_dir in py_dir.iterdir(): if not pkg_dir.is_dir(): continue # Extract package name and version pkg_info = pkg_dir.name # format: name@version if "@" not in pkg_info: stats["skipped"] += 1 continue name_version = pkg_info.rsplit("@", 1) if len(name_version) != 2: stats["skipped"] += 1 continue name, version = name_version # Check if in workset if relevant_deps and f"py:{name}" not in relevant_deps: stats["skipped"] += 1 continue stats["total_docs"] += 1 # Create capsule doc_file = pkg_dir / "doc.md" meta_file = pkg_dir / "meta.json" if doc_file.exists(): try: capsule = _create_capsule( doc_file, meta_file, name, version, "py", max_capsule_lines ) # Write capsule capsule_file = output_path / f"py__{name}@{version}.md" with open(capsule_file, "w", encoding="utf-8") as f: f.write(capsule) capsules_index.append({ "name": name, "version": version, "ecosystem": "py", "path": str(capsule_file.relative_to(output_path)) }) stats["capsules_created"] += 1 except Exception as e: stats["errors"].append(f"{name}@{version}: {str(e)}") # Write index index_file = output_path.parent / "doc_index.json" with open(index_file, "w", encoding="utf-8") as f: json.dump({ "created_at": datetime.now().isoformat(), "capsules": capsules_index, "stats": stats }, f, indent=2) return stats def _load_workset_deps(workset_path: str) -> Set[str]: """ Load 
def _load_workset_deps(workset_path: str) -> Set[str]:
    """
    Load relevant dependencies from a workset.

    Returns a set of "manager:name" keys; an empty set means "include all".
    """
    relevant: Set[str] = set()
    try:
        with open(workset_path, encoding="utf-8") as f:
            workset = json.load(f)

        # Extract imported packages from workset files. This is a simplified
        # version - real filtering would need to parse imports.
        for file_info in workset.get("files", []):
            path = file_info.get("path", "")
            # Simple heuristic based on file extension.
            if path.endswith((".js", ".ts", ".jsx", ".tsx")):
                # Would parse imports/requires; for now, include all npm deps.
                relevant.add("npm:*")
            elif path.endswith(".py"):
                # Would parse imports; for now, include all py deps.
                relevant.add("py:*")
    except (json.JSONDecodeError, KeyError, OSError):
        pass

    # If we couldn't determine specific deps, include all.
    if not relevant or "npm:*" in relevant or "py:*" in relevant:
        return set()  # Empty set means include all
    return relevant


def _create_capsule(
    doc_file: Path,
    meta_file: Path,
    name: str,
    version: str,
    ecosystem: str,
    max_lines: int,
) -> str:
    """Create a concise capsule from documentation."""
    # Read documentation
    with open(doc_file, encoding="utf-8") as f:
        content = f.read()

    # Read metadata
    meta: Dict[str, Any] = {}
    if meta_file.exists():
        try:
            with open(meta_file, encoding="utf-8") as f:
                meta = json.load(f)
        except json.JSONDecodeError:
            pass

    # Extract key sections
    sections = {
        "init": _extract_initialization(content, ecosystem),
        "apis": _extract_top_apis(content),
        "examples": _extract_examples(content),
    }

    # Build capsule
    capsule_lines = [
        f"# {name}@{version} ({ecosystem})",
        "",
        "## Quick Start",
        "",
    ]

    if sections["init"]:
        capsule_lines.extend(sections["init"][:10])  # Limit lines
        capsule_lines.append("")
    elif content:
        # No structured init section; fall back to the first raw lines.
        capsule_lines.extend(content.split("\n")[:10])
        capsule_lines.append("")

    if sections["apis"]:
        capsule_lines.append("## Top APIs")
        capsule_lines.append("")
        capsule_lines.extend(sections["apis"][:15])  # Limit lines
        capsule_lines.append("")

    if sections["examples"]:
        capsule_lines.append("## Examples")
        capsule_lines.append("")
        capsule_lines.extend(sections["examples"][:15])  # Limit lines
        capsule_lines.append("")

    # Add reference to full documentation
    capsule_lines.append("## 📄 Full Documentation Available")
    capsule_lines.append("")
    # Relative path from the project root
    full_doc_path = f"./.pf/context/docs/{ecosystem}/{name}@{version}/doc.md"
    capsule_lines.append(f"**Full content**: `{full_doc_path}`")
    # The doc was already read above, so count its lines from memory.
    capsule_lines.append(f"**Size**: {len(content.splitlines())} lines")
    capsule_lines.append("")

    # Add source info
    capsule_lines.append("## Source")
    capsule_lines.append("")
    capsule_lines.append(f"- URL: {meta.get('source_url', '')}")
    capsule_lines.append(f"- Fetched: {meta.get('last_checked', '')}")

    # Truncate if too long, keeping the full-doc reference lines.
    if len(capsule_lines) > max_lines:
        keep_lines = capsule_lines[: max_lines - 7]  # Leave room for reference and truncation
        ref_lines = [
            l for l in capsule_lines
            if "Full Documentation Available" in l
            or "Full content" in l
            or "Size" in l
        ]
        capsule_lines = keep_lines + ["", "... (truncated)", ""] + ref_lines

    return "\n".join(capsule_lines)
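
# For reference, a capsule produced by _create_capsule is laid out roughly as
# below. The package name and line count are illustrative; the headings come
# from the code above:
#
#     # left-pad@1.3.0 (npm)
#
#     ## Quick Start
#     ...
#     ## Top APIs
#     ...
#     ## 📄 Full Documentation Available
#
#     **Full content**: `./.pf/context/docs/npm/left-pad@1.3.0/doc.md`
#     **Size**: 120 lines
#
#     ## Source
#     ...
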
Quick Start", r"### Install\w*", ] for pattern in install_patterns: match = re.search(pattern, content, re.IGNORECASE | re.MULTILINE) if match: # Extract next code block start = match.end() code_match = re.search(r"```(\w*)\n(.*?)```", content[start:], re.DOTALL) if code_match: lines.append(f"```{code_match.group(1)}") lines.extend(code_match.group(2).strip().split("\n")[:5]) lines.append("```") break # Fallback: look for common patterns if not lines: if ecosystem == "npm": if "require(" in content: match = re.search(r"(const|var|let)\s+\w+\s*=\s*require\([^)]+\)", content) if match: lines = ["```javascript", match.group(0), "```"] elif "import " in content: match = re.search(r"import\s+.*?from\s+['\"][^'\"]+['\"]", content) if match: lines = ["```javascript", match.group(0), "```"] elif ecosystem == "py": if "import " in content: match = re.search(r"import\s+\w+", content) if match: lines = ["```python", match.group(0), "```"] elif "from " in content: match = re.search(r"from\s+\w+\s+import\s+\w+", content) if match: lines = ["```python", match.group(0), "```"] return lines def _extract_top_apis(content: str) -> List[str]: """Extract top API methods.""" lines = [] # Look for API section api_patterns = [ r"## API", r"## Methods", r"## Functions", r"### API", ] for pattern in api_patterns: match = re.search(pattern, content, re.IGNORECASE | re.MULTILINE) if match: start = match.end() # Extract next few method signatures method_matches = re.findall( r"^[\*\-]\s*`([^`]+)`", content[start:start+2000], re.MULTILINE ) for method in method_matches[:5]: # Top 5 methods lines.append(f"- `{method}`") break # Fallback: look for function definitions in code blocks if not lines: code_blocks = re.findall(r"```\w*\n(.*?)```", content, re.DOTALL) for block in code_blocks[:2]: # Check first 2 code blocks # Look for function signatures funcs = re.findall(r"(?:function|def|const|let|var)\s+(\w+)\s*\(([^)]*)\)", block) for func_name, params in funcs[:5]: lines.append(f"- `{func_name}({params})`") if lines: break return lines def _extract_examples(content: str) -> List[str]: """Extract usage examples.""" lines = [] # Look for examples section example_patterns = [ r"## Example", r"## Usage", r"### Example", r"### Usage", ] for pattern in example_patterns: match = re.search(pattern, content, re.IGNORECASE | re.MULTILINE) if match: start = match.end() # Extract next code block code_match = re.search(r"```(\w*)\n(.*?)```", content[start:], re.DOTALL) if code_match: lang = code_match.group(1) or "javascript" code_lines = code_match.group(2).strip().split("\n")[:10] # Max 10 lines lines.append(f"```{lang}") lines.extend(code_lines) lines.append("```") break return lines