Mirror of https://github.com/aljazceru/Tutorial-Codebase-Knowledge.git (synced 2025-12-18 15:04:20 +01:00)

Commit: update readme examples
README.md
@@ -16,7 +16,7 @@

 This is a tutorial project of [Pocket Flow](https://github.com/The-Pocket/PocketFlow), a 100-line LLM framework. It crawls GitHub repositories and builds a knowledge base from the code. It analyzes entire codebases to identify core abstractions and how they interact, and transforms complex code into beginner-friendly tutorials with clear visualizations.

-## Example Tutorials for Popular GitHub Repositories!
+## ⭐ Example Tutorials for Popular GitHub Repositories!

 - [AutoGen Core](https://the-pocket.github.io/Tutorial-Codebase-Knowledge/AutoGen%20Core) - Build AI teams that talk, think, and solve problems together like coworkers!
flow.py
@@ -1,14 +1,33 @@
-from pocketflow import Flow
-from nodes import GetQuestionNode, AnswerNode
-
-def create_qa_flow():
-    """Create and return a question-answering flow."""
-    # Create nodes
-    get_question_node = GetQuestionNode()
-    answer_node = AnswerNode()
-
-    # Connect nodes in sequence
-    get_question_node >> answer_node
-
-    # Create flow starting with input node
-    return Flow(start=get_question_node)
+from pocketflow import Flow
+# Import all node classes from nodes.py
+from nodes import (
+    FetchRepo,
+    IdentifyAbstractions,
+    AnalyzeRelationships,
+    OrderChapters,
+    WriteChapters,
+    CombineTutorial
+)
+
+def create_tutorial_flow():
+    """Creates and returns the codebase tutorial generation flow."""
+
+    # Instantiate nodes
+    fetch_repo = FetchRepo()
+    identify_abstractions = IdentifyAbstractions(max_retries=3, wait=10)
+    analyze_relationships = AnalyzeRelationships(max_retries=3, wait=10)
+    order_chapters = OrderChapters(max_retries=3, wait=10)
+    write_chapters = WriteChapters(max_retries=3, wait=10)  # This is a BatchNode
+    combine_tutorial = CombineTutorial()
+
+    # Connect nodes in sequence based on the design
+    fetch_repo >> identify_abstractions
+    identify_abstractions >> analyze_relationships
+    analyze_relationships >> order_chapters
+    order_chapters >> write_chapters
+    write_chapters >> combine_tutorial
+
+    # Create the flow starting with FetchRepo
+    tutorial_flow = Flow(start=fetch_repo)
+
+    return tutorial_flow
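The `>>` chaining above relies on PocketFlow's node lifecycle. As a rough illustration only — a minimal sketch assuming the `prep`/`exec`/`post` contract visible in `nodes.py`, not code from this commit — two chained toy nodes pass data through the shared store like this:

```python
from pocketflow import Flow, Node

# Minimal sketch: two hypothetical nodes chained with >>, run via Flow(start=...).
class Hello(Node):
    def exec(self, _):
        return "hello"
    def post(self, shared, prep_res, exec_res):
        shared["greeting"] = exec_res

class Shout(Node):
    def prep(self, shared):
        return shared["greeting"]
    def exec(self, greeting):
        return greeting.upper()
    def post(self, shared, prep_res, exec_res):
        shared["result"] = exec_res

hello, shout = Hello(), Shout()
hello >> shout                      # same chaining operator used above
shared = {}
Flow(start=hello).run(shared)
print(shared["result"])             # HELLO
```

The tutorial flow follows the same pattern, just with six nodes and a much richer shared dictionary (see main.py below).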
main.py
@@ -1,16 +1,67 @@
-from flow import qa_flow
-
-# Example main function
-# Please replace this with your own main function
-def main():
-    shared = {
-        "question": "In one sentence, what's the end of universe?",
-        "answer": None
-    }
-
-    qa_flow.run(shared)
-    print("Question:", shared["question"])
-    print("Answer:", shared["answer"])
-
-if __name__ == "__main__":
-    main()
+import os
+import argparse
+# Import the function that creates the flow
+from flow import create_tutorial_flow
+
+# Default file patterns
+DEFAULT_INCLUDE_PATTERNS = {
+    "*.py", "*.js", "*.ts", "*.go", "*.java", "*.pyi", "*.pyx",
+    "*.c", "*.cc", "*.cpp", "*.h", "*.md", "*.rst", "Dockerfile",
+    "Makefile", "*.yaml", "*.yml"
+}
+
+DEFAULT_EXCLUDE_PATTERNS = {
+    "*test*", "tests/*", "docs/*", "examples/*", "v1/*",
+    "dist/*", "build/*", "experimental/*", "deprecated/*",
+    "legacy/*", ".git/*", ".github/*"
+}
+
+# --- Main Function ---
+def main():
+    parser = argparse.ArgumentParser(description="Generate a tutorial for a GitHub codebase.")
+    parser.add_argument("repo_url", help="URL of the public GitHub repository.")
+    parser.add_argument("-n", "--name", help="Project name (optional, derived from URL if omitted).")
+    parser.add_argument("-t", "--token", help="GitHub personal access token (optional, reads from GITHUB_TOKEN env var if not provided).")
+    parser.add_argument("-o", "--output", default="output", help="Base directory for output (default: ./output).")
+    parser.add_argument("-i", "--include", nargs="+", help="Include file patterns (e.g. '*.py' '*.js'). Defaults to common code files if not specified.")
+    parser.add_argument("-e", "--exclude", nargs="+", help="Exclude file patterns (e.g. 'tests/*' 'docs/*'). Defaults to test/build directories if not specified.")
+    parser.add_argument("-s", "--max-size", type=int, default=100000, help="Maximum file size in bytes (default: 100000, about 100KB).")
+
+    args = parser.parse_args()
+
+    # Get GitHub token from argument or environment variable
+    github_token = args.token or os.environ.get('GITHUB_TOKEN')
+    if not github_token:
+        print("Warning: No GitHub token provided. You might hit rate limits for public repositories.")
+
+    # Initialize the shared dictionary with inputs
+    shared = {
+        "repo_url": args.repo_url,
+        "project_name": args.name,  # Can be None, FetchRepo will derive it
+        "github_token": github_token,
+        "output_dir": args.output,  # Base directory for CombineTutorial output
+
+        # Add include/exclude patterns and max file size
+        "include_patterns": set(args.include) if args.include else DEFAULT_INCLUDE_PATTERNS,
+        "exclude_patterns": set(args.exclude) if args.exclude else DEFAULT_EXCLUDE_PATTERNS,
+        "max_file_size": args.max_size,
+
+        # Outputs will be populated by the nodes
+        "files": [],
+        "abstractions": [],
+        "relationships": {},
+        "chapter_order": [],
+        "chapters": [],
+        "final_output_dir": None
+    }
+
+    print(f"Starting tutorial generation for: {args.repo_url}")
+
+    # Create the flow instance
+    tutorial_flow = create_tutorial_flow()
+
+    # Run the flow
+    tutorial_flow.run(shared)
+
+if __name__ == "__main__":
+    main()
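For illustration, a hedged sketch (hypothetical repository URL, not part of this commit) of how the argument handling above turns CLI flags into the values stored in `shared`; note that passing `-i` replaces the default include set entirely rather than extending it:

```python
import argparse

# Stand-alone sketch mirroring the parser defined in main.py above.
parser = argparse.ArgumentParser()
parser.add_argument("repo_url")
parser.add_argument("-i", "--include", nargs="+")
parser.add_argument("-s", "--max-size", type=int, default=100000)

args = parser.parse_args(
    ["https://github.com/The-Pocket/PocketFlow", "-i", "*.py", "*.md", "-s", "50000"]
)
# In main.py the fallback is DEFAULT_INCLUDE_PATTERNS; a placeholder is used here.
include_patterns = set(args.include) if args.include else {"*.py", "*.md"}
print(include_patterns, args.max_size)   # {'*.py', '*.md'} 50000
```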
nodes.py
@@ -1,26 +1,642 @@
-from pocketflow import Node
-from utils.call_llm import call_llm
-
-class GetQuestionNode(Node):
-    def exec(self, _):
-        # Get question directly from user input
-        user_question = input("Enter your question: ")
-        return user_question
-
-    def post(self, shared, prep_res, exec_res):
-        # Store the user's question
-        shared["question"] = exec_res
-        return "default"  # Go to the next node
-
-class AnswerNode(Node):
-    def prep(self, shared):
-        # Read question from shared
-        return shared["question"]
-
-    def exec(self, question):
-        # Call LLM to get the answer
-        return call_llm(question)
-
-    def post(self, shared, prep_res, exec_res):
-        # Store the answer in shared
-        shared["answer"] = exec_res
+import os
+import yaml
+from pocketflow import Node, BatchNode
+from utils.crawl_github_files import crawl_github_files
+from utils.call_llm import call_llm  # Assuming you have this utility
+
+# Helper to create context from files, respecting limits (basic example)
+def create_llm_context(files_data):
+    context = ""
+    file_info = []  # Store tuples of (index, path)
+    for i, (path, content) in enumerate(files_data):
+        entry = f"--- File Index {i}: {path} ---\n{content}\n\n"
+        context += entry
+        file_info.append((i, path))
+
+    return context, file_info  # file_info is list of (index, path)
+
+# Helper to get content for specific file indices
+def get_content_for_indices(files_data, indices):
+    content_map = {}
+    for i in indices:
+        if 0 <= i < len(files_data):
+            path, content = files_data[i]
+            content_map[f"{i} # {path}"] = content  # Use index + path as key for context
+    return content_map
+
+class FetchRepo(Node):
+    def prep(self, shared):
+        repo_url = shared["repo_url"]
+        project_name = shared.get("project_name")
+        if not project_name:
+            # Basic name derivation from URL
+            project_name = repo_url.split('/')[-1].replace('.git', '')
+            shared["project_name"] = project_name
+
+        # Get file patterns directly from shared (defaults are defined in main.py)
+        include_patterns = shared["include_patterns"]
+        exclude_patterns = shared["exclude_patterns"]
+        max_file_size = shared["max_file_size"]
+
+        return {
+            "repo_url": repo_url,
+            "token": shared.get("github_token"),
+            "include_patterns": include_patterns,
+            "exclude_patterns": exclude_patterns,
+            "max_file_size": max_file_size,
+            "use_relative_paths": True
+        }
+
+    def exec(self, prep_res):
+        print(f"Crawling repository: {prep_res['repo_url']}...")
+        result = crawl_github_files(
+            repo_url=prep_res["repo_url"],
+            token=prep_res["token"],
+            include_patterns=prep_res["include_patterns"],
+            exclude_patterns=prep_res["exclude_patterns"],
+            max_file_size=prep_res["max_file_size"],
+            use_relative_paths=prep_res["use_relative_paths"]
+        )
+        # Convert dict to list of tuples: [(path, content), ...]
+        files_list = list(result.get("files", {}).items())
+        print(f"Fetched {len(files_list)} files.")
+        return files_list
+
+    def post(self, shared, prep_res, exec_res):
+        shared["files"] = exec_res  # List of (path, content) tuples
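To make the helper contract concrete, a small sketch (made-up `files_data`, not part of the commit) of what `create_llm_context` and `get_content_for_indices` return:

```python
# Illustrative only: in the real flow files_data comes from FetchRepo.
from nodes import create_llm_context, get_content_for_indices  # assumes the file above is nodes.py

files_data = [
    ("flow.py", "from pocketflow import Flow"),
    ("main.py", "import argparse"),
]

context, file_info = create_llm_context(files_data)
print(file_info)                 # [(0, 'flow.py'), (1, 'main.py')]
print(context.splitlines()[0])   # --- File Index 0: flow.py ---

print(get_content_for_indices(files_data, [1]))
# {'1 # main.py': 'import argparse'}
```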
+class IdentifyAbstractions(Node):
+    def prep(self, shared):
+        files_data = shared["files"]
+        project_name = shared["project_name"]  # Get project name
+        context, file_info = create_llm_context(files_data)
+        # Format file info for the prompt (comment is just a hint for LLM)
+        file_listing_for_prompt = "\n".join([f"- {idx} # {path}" for idx, path in file_info])
+        return context, file_listing_for_prompt, len(files_data), project_name  # Return project name
+
+    def exec(self, prep_res):
+        context, file_listing_for_prompt, file_count, project_name = prep_res  # Unpack project name
+        print("Identifying abstractions using LLM...")
+        prompt = f"""
+For the project `{project_name}`:
+
+Codebase Context:
+{context}
+
+Analyze the codebase context.
+Identify the top 5-10 core most important abstractions to help those new to the codebase.
+
+For each abstraction, provide:
+1. A concise `name`.
+2. A beginner-friendly `description` explaining what it is with a simple analogy, in around 100 words.
+3. A list of relevant `file_indices` (integers) using the format `idx # path/comment`.
+
+List of file indices and paths present in the context:
+{file_listing_for_prompt}
+
+Format the output as a YAML list of dictionaries:
+
+```yaml
+- name: Query Processing
+  description: |
+    Explains what the abstraction does.
+    It's like a central dispatcher routing requests.
+  file_indices:
+    - 0 # path/to/file1.py
+    - 3 # path/to/related.py
+- name: Query Optimization
+  description: |
+    Another core concept, similar to a blueprint for objects.
+  file_indices:
+    - 5 # path/to/another.js
+# ... up to 10 abstractions
+```"""
+        response = call_llm(prompt)
+
+        # --- Validation ---
+        yaml_str = response.strip().split("```yaml")[1].split("```")[0].strip()
+        abstractions = yaml.safe_load(yaml_str)
+
+        if not isinstance(abstractions, list):
+            raise ValueError("LLM Output is not a list")
+
+        validated_abstractions = []
+        for item in abstractions:
+            if not isinstance(item, dict) or not all(k in item for k in ["name", "description", "file_indices"]):
+                raise ValueError(f"Missing keys in abstraction item: {item}")
+            if not isinstance(item["description"], str):
+                raise ValueError(f"description is not a string in item: {item}")
+            if not isinstance(item["file_indices"], list):
+                raise ValueError(f"file_indices is not a list in item: {item}")
+
+            # Validate indices
+            validated_indices = []
+            for idx_entry in item["file_indices"]:
+                try:
+                    if isinstance(idx_entry, int):
+                        idx = idx_entry
+                    elif isinstance(idx_entry, str) and '#' in idx_entry:
+                        idx = int(idx_entry.split('#')[0].strip())
+                    else:
+                        idx = int(str(idx_entry).strip())
+
+                    if not (0 <= idx < file_count):
+                        raise ValueError(f"Invalid file index {idx} found in item {item['name']}. Max index is {file_count - 1}.")
+                    validated_indices.append(idx)
+                except (ValueError, TypeError):
+                    raise ValueError(f"Could not parse index from entry: {idx_entry} in item {item['name']}")
+
+            item["files"] = sorted(list(set(validated_indices)))
+            # Store only the required fields
+            validated_abstractions.append({
+                "name": item["name"],
+                "description": item["description"],
+                "files": item["files"]
+            })
+
+        print(f"Identified {len(validated_abstractions)} abstractions.")
+        return validated_abstractions
+
+    def post(self, shared, prep_res, exec_res):
+        shared["abstractions"] = exec_res  # List of {"name": str, "description": str, "files": [int]}
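The `exec` method above assumes the LLM wraps its answer in a fenced yaml block, extracts that block, and validates the parsed structure. An illustrative, runnable sketch of that extraction step with a made-up reply (requires pyyaml, which this commit adds to requirements.txt):

```python
import yaml

fence = "`" * 3  # avoids nesting a literal backtick fence inside this example
response = (
    "Sure, here are the abstractions:\n"
    f"{fence}yaml\n"
    "- name: Flow Orchestration\n"
    "  description: |\n"
    "    Chains the nodes together, like an assembly line.\n"
    "  file_indices:\n"
    "    - 0 # flow.py\n"
    f"{fence}\n"
)

yaml_str = response.strip().split(f"{fence}yaml")[1].split(fence)[0].strip()
abstractions = yaml.safe_load(yaml_str)
print(abstractions[0]["name"])          # Flow Orchestration
print(abstractions[0]["file_indices"])  # [0] -- '# flow.py' is parsed as a YAML comment
```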
+class AnalyzeRelationships(Node):
+    def prep(self, shared):
+        abstractions = shared["abstractions"]  # Now contains 'files' list of indices
+        files_data = shared["files"]
+        project_name = shared["project_name"]  # Get project name
+
+        # Create context with abstraction names, indices, descriptions, and relevant file snippets
+        context = "Identified Abstractions:\n"
+        all_relevant_indices = set()
+        abstraction_info_for_prompt = []
+        for i, abstr in enumerate(abstractions):
+            # Use 'files' which contains indices directly
+            file_indices_str = ", ".join(map(str, abstr['files']))
+            info_line = f"- Index {i}: {abstr['name']} (Relevant file indices: [{file_indices_str}])\n  Description: {abstr['description']}"
+            context += info_line + "\n"
+            abstraction_info_for_prompt.append(f"{i} # {abstr['name']}")
+            all_relevant_indices.update(abstr['files'])
+
+        context += "\nRelevant File Snippets (Referenced by Index and Path):\n"
+        # Get content for relevant files using helper
+        relevant_files_content_map = get_content_for_indices(
+            files_data,
+            sorted(list(all_relevant_indices))
+        )
+        # Format file content for context
+        file_context_str = "\n\n".join(
+            f"--- File: {idx_path} ---\n{content}"
+            for idx_path, content in relevant_files_content_map.items()
+        )
+        context += file_context_str
+
+        return context, "\n".join(abstraction_info_for_prompt), project_name  # Return project name
+
+    def exec(self, prep_res):
+        context, abstraction_listing, project_name = prep_res  # Unpack project name
+        print("Analyzing relationships using LLM...")
+        prompt = f"""
+Based on the following abstractions and relevant code snippets from the project `{project_name}`:
+
+List of Abstraction Indices and Names:
+{abstraction_listing}
+
+Context (Abstractions, Descriptions, Code):
+{context}
+
+Please provide:
+1. A high-level `summary` of the project's main purpose and functionality in a few beginner-friendly sentences. Use markdown formatting with **bold** and *italic* text to highlight important concepts.
+2. A list (`relationships`) describing the key interactions between these abstractions. For each relationship, specify:
+    - `from_abstraction`: Index of the source abstraction (e.g., `0 # AbstractionName1`)
+    - `to_abstraction`: Index of the target abstraction (e.g., `1 # AbstractionName2`)
+    - `label`: A brief label for the interaction **in just a few words** (e.g., "Manages", "Inherits", "Uses").
+    Ideally the relationship should be backed by one abstraction calling or passing parameters to another.
+    Simplify the relationship and exclude those non-important ones.
+
+IMPORTANT: Make sure EVERY abstraction is involved in at least ONE relationship (either as source or target). Each abstraction index must appear at least once across all relationships.
+
+Format the output as YAML:
+
+```yaml
+summary: |
+  A brief, simple explanation of the project.
+  Can span multiple lines with **bold** and *italic* for emphasis.
+relationships:
+  - from_abstraction: 0 # AbstractionName1
+    to_abstraction: 1 # AbstractionName2
+    label: "Manages"
+  - from_abstraction: 2 # AbstractionName3
+    to_abstraction: 0 # AbstractionName1
+    label: "Provides config"
+  # ... other relationships
+```
+
+Now, provide the YAML output:
+"""
+        response = call_llm(prompt)
+
+        # --- Validation ---
+        yaml_str = response.strip().split("```yaml")[1].split("```")[0].strip()
+        relationships_data = yaml.safe_load(yaml_str)
+
+        if not isinstance(relationships_data, dict) or not all(k in relationships_data for k in ["summary", "relationships"]):
+            raise ValueError("LLM output is not a dict or missing keys ('summary', 'relationships')")
+        if not isinstance(relationships_data["summary"], str):
+            raise ValueError("summary is not a string")
+        if not isinstance(relationships_data["relationships"], list):
+            raise ValueError("relationships is not a list")
+
+        # Validate relationships structure
+        validated_relationships = []
+        num_abstractions = len(abstraction_listing.split('\n'))
+        for rel in relationships_data["relationships"]:
+            # Check for 'label' key
+            if not isinstance(rel, dict) or not all(k in rel for k in ["from_abstraction", "to_abstraction", "label"]):
+                raise ValueError(f"Missing keys (expected from_abstraction, to_abstraction, label) in relationship item: {rel}")
+            # Validate 'label' is a string
+            if not isinstance(rel["label"], str):
+                raise ValueError(f"Relationship label is not a string: {rel}")
+
+            # Validate indices
+            try:
+                from_idx = int(str(rel["from_abstraction"]).split('#')[0].strip())
+                to_idx = int(str(rel["to_abstraction"]).split('#')[0].strip())
+                if not (0 <= from_idx < num_abstractions and 0 <= to_idx < num_abstractions):
+                    raise ValueError(f"Invalid index in relationship: from={from_idx}, to={to_idx}. Max index is {num_abstractions-1}.")
+                validated_relationships.append({
+                    "from": from_idx,
+                    "to": to_idx,
+                    "label": rel["label"]
+                })
+            except (ValueError, TypeError):
+                raise ValueError(f"Could not parse indices from relationship: {rel}")
+
+        print("Generated project summary and relationship details.")
+        return {
+            "summary": relationships_data["summary"],
+            "details": validated_relationships  # Store validated, index-based relationships
+        }
+
+    def post(self, shared, prep_res, exec_res):
+        # Structure is now {"summary": str, "details": [{"from": int, "to": int, "label": str}]}
+        shared["relationships"] = exec_res
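A small sketch of the index convention validated above: the LLM may answer with a bare integer or an "idx # Name" string, and both normalize to the same integer (illustrative values only):

```python
# Mirrors int(str(...).split('#')[0].strip()) used in AnalyzeRelationships.exec.
for raw in (0, "2 # CombineTutorial", " 1 "):
    print(int(str(raw).split('#')[0].strip()))   # 0, then 2, then 1
```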
+class OrderChapters(Node):
+    def prep(self, shared):
+        abstractions = shared["abstractions"]
+        relationships = shared["relationships"]
+        project_name = shared["project_name"]  # Get project name
+
+        # Prepare context for the LLM
+        abstraction_info_for_prompt = []
+        for i, a in enumerate(abstractions):
+            abstraction_info_for_prompt.append(f"- {i} # {a['name']}")
+        abstraction_listing = "\n".join(abstraction_info_for_prompt)
+
+        context = f"Project Summary:\n{relationships['summary']}\n\n"
+        context += "Relationships (Indices refer to abstractions above):\n"
+        for rel in relationships['details']:
+            from_name = abstractions[rel['from']]['name']
+            to_name = abstractions[rel['to']]['name']
+            # Use 'label' instead of 'desc'
+            context += f"- From {rel['from']} ({from_name}) to {rel['to']} ({to_name}): {rel['label']}\n"
+
+        return abstraction_listing, context, len(abstractions), project_name
+
+    def exec(self, prep_res):
+        abstraction_listing, context, num_abstractions, project_name = prep_res
+        print("Determining chapter order using LLM...")
+        prompt = f"""
+Given the following project abstractions and their relationships for the project ```` {project_name} ````:
+
+Abstractions (Index # Name):
+{abstraction_listing}
+
+Context about relationships and project summary:
+{context}
+
+If you are going to make a tutorial for ```` {project_name} ````, what is the best order to explain these abstractions, from first to last?
+Ideally, first explain those that are the most important or foundational, perhaps user-facing concepts or entry points. Then move to more detailed, lower-level implementation details or supporting concepts.
+
+Output the ordered list of abstraction indices, including the name in a comment for clarity. Use the format `idx # AbstractionName`.
+
+```yaml
+- 2 # FoundationalConcept
+- 0 # CoreClassA
+- 1 # CoreClassB (uses CoreClassA)
+- ...
+```
+
+Now, provide the YAML output:
+"""
+        response = call_llm(prompt)
+
+        # --- Validation ---
+        # Rely on Node's built-in retry/fallback
+        yaml_str = response.strip().split("```yaml")[1].split("```")[0].strip()
+        ordered_indices_raw = yaml.safe_load(yaml_str)
+
+        if not isinstance(ordered_indices_raw, list):
+            raise ValueError("LLM output is not a list")
+
+        ordered_indices = []
+        seen_indices = set()
+        for entry in ordered_indices_raw:
+            try:
+                if isinstance(entry, int):
+                    idx = entry
+                elif isinstance(entry, str) and '#' in entry:
+                    idx = int(entry.split('#')[0].strip())
+                else:
+                    idx = int(str(entry).strip())
+
+                if not (0 <= idx < num_abstractions):
+                    raise ValueError(f"Invalid index {idx} in ordered list. Max index is {num_abstractions-1}.")
+                if idx in seen_indices:
+                    raise ValueError(f"Duplicate index {idx} found in ordered list.")
+                ordered_indices.append(idx)
+                seen_indices.add(idx)
+
+            except (ValueError, TypeError):
+                raise ValueError(f"Could not parse index from ordered list entry: {entry}")
+
+        # Check if all abstractions are included
+        if len(ordered_indices) != num_abstractions:
+            raise ValueError(f"Ordered list length ({len(ordered_indices)}) does not match number of abstractions ({num_abstractions}). Missing indices: {set(range(num_abstractions)) - seen_indices}")
+
+        print(f"Determined chapter order (indices): {ordered_indices}")
+        return ordered_indices  # Return the list of indices
+
+    def post(self, shared, prep_res, exec_res):
+        # exec_res is already the list of ordered indices
+        shared["chapter_order"] = exec_res  # List of indices
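A quick sketch of the completeness check above: the returned order must be a duplicate-free permutation of all abstraction indices (made-up numbers for illustration):

```python
num_abstractions = 4
ordered_indices = [2, 0, 1]
missing = set(range(num_abstractions)) - set(ordered_indices)
print(missing)   # {3} -> OrderChapters.exec would raise ValueError here
```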
+class WriteChapters(BatchNode):
+    def prep(self, shared):
+        chapter_order = shared["chapter_order"]  # List of indices
+        abstractions = shared["abstractions"]  # List of dicts, now using 'files' with indices
+        files_data = shared["files"]
+        # Get already written chapters to provide context
+        # We store them temporarily during the batch run, not in shared memory yet
+        # The 'previous_chapters_summary' will be built progressively in the exec context
+        self.chapters_written_so_far = []  # Use instance variable for temporary storage across exec calls
+
+        # Create a complete list of all chapters
+        all_chapters = []
+        chapter_filenames = {}  # Store chapter filename mapping for linking
+        for i, abstraction_index in enumerate(chapter_order):
+            if 0 <= abstraction_index < len(abstractions):
+                chapter_num = i + 1
+                chapter_name = abstractions[abstraction_index]["name"]
+                # Create safe filename
+                safe_name = "".join(c if c.isalnum() else '_' for c in chapter_name).lower()
+                filename = f"{i+1:02d}_{safe_name}.md"
+                # Format with link
+                all_chapters.append(f"{chapter_num}. [{chapter_name}]({filename})")
+                # Store mapping of chapter index to filename for linking
+                chapter_filenames[abstraction_index] = {"num": chapter_num, "name": chapter_name, "filename": filename}
+
+        # Create a formatted string with all chapters
+        full_chapter_listing = "\n".join(all_chapters)
+
+        items_to_process = []
+        for i, abstraction_index in enumerate(chapter_order):
+            if 0 <= abstraction_index < len(abstractions):
+                abstraction_details = abstractions[abstraction_index]
+                # Use 'files' (list of indices) directly
+                related_file_indices = abstraction_details.get("files", [])
+                # Get content using helper, passing indices
+                related_files_content_map = get_content_for_indices(files_data, related_file_indices)
+
+                # Get previous chapter info for transitions
+                prev_chapter = None
+                if i > 0:
+                    prev_idx = chapter_order[i-1]
+                    prev_chapter = chapter_filenames[prev_idx]
+
+                # Get next chapter info for transitions
+                next_chapter = None
+                if i < len(chapter_order) - 1:
+                    next_idx = chapter_order[i+1]
+                    next_chapter = chapter_filenames[next_idx]
+
+                items_to_process.append({
+                    "chapter_num": i + 1,
+                    "abstraction_index": abstraction_index,
+                    "abstraction_details": abstraction_details,
+                    "related_files_content_map": related_files_content_map,
+                    "project_name": shared["project_name"],  # Add project name
+                    "full_chapter_listing": full_chapter_listing,  # Add the full chapter listing
+                    "chapter_filenames": chapter_filenames,  # Add chapter filenames mapping
+                    "prev_chapter": prev_chapter,  # Add previous chapter info
+                    "next_chapter": next_chapter,  # Add next chapter info
+                    # previous_chapters_summary will be added dynamically in exec
+                })
+            else:
+                print(f"Warning: Invalid abstraction index {abstraction_index} in chapter_order. Skipping.")
+
+        print(f"Preparing to write {len(items_to_process)} chapters...")
+        return items_to_process  # Iterable for BatchNode
+
+    def exec(self, item):
+        # This runs for each item prepared above
+        abstraction_name = item["abstraction_details"]["name"]
+        chapter_num = item["chapter_num"]
+        project_name = item.get("project_name")  # Get from item
+        print(f"Writing chapter {chapter_num} for: {abstraction_name} using LLM...")
+
+        # Prepare file context string from the map
+        file_context_str = "\n\n".join(
+            f"--- File: {idx_path.split('# ')[1] if '# ' in idx_path else idx_path} ---\n{content}"
+            for idx_path, content in item["related_files_content_map"].items()
+        )
+
+        # Get summary of chapters written *before* this one
+        # Use the temporary instance variable
+        previous_chapters_summary = "\n---\n".join(self.chapters_written_so_far)
+
+        prompt = f"""
+Write a very beginner-friendly tutorial chapter (in Markdown format) for the project `{project_name}` about the concept: "{abstraction_name}". This is Chapter {chapter_num}.
+
+Concept Details:
+- Description:
+{item["abstraction_details"]["description"]}
+
+Complete Tutorial Structure:
+{item["full_chapter_listing"]}
+
+Context from previous chapters (summary):
+{previous_chapters_summary if previous_chapters_summary else "This is the first chapter."}
+
+Relevant Code Snippets:
+{file_context_str if file_context_str else "No specific code snippets provided for this abstraction."}
+
+Instructions for the chapter:
+- Start with a clear heading (e.g., `# Chapter {chapter_num}: {abstraction_name}`).
+
+- If this is not the first chapter, begin with a brief transition from the previous chapter, referencing it with a proper Markdown link.
+
+- Begin with a high-level motivation explaining what problem this abstraction solves. Start with a central use case as a concrete example. The whole chapter should guide the reader to understand how to solve this use case. Make it very minimal and friendly to beginners.
+
+- If the abstraction is complex, break it down into key concepts. Explain each concept one-by-one in a very beginner-friendly way.
+
+- Explain how to use this abstraction to solve the use case. Give example inputs and outputs for code snippets (if the output isn't values, describe at a high level what will happen).
+
+- Each code block should be BELOW 20 lines! If longer code blocks are needed, break them down into smaller pieces and walk through them one-by-one. Aggressively simplify the code to make it minimal. Use comments to skip non-important implementation details. Each code block should have a beginner friendly explanation right after it.
+
+- Describe the internal implementation to help understand what's under the hood. First provide a non-code or code-light walkthrough on what happens step-by-step when the abstraction is called. It's recommended to use a simple sequenceDiagram with a dummy example - keep it minimal with at most 5 participants to ensure clarity. If participant name has space, use:
+`participant QP as Query Processing`
+
+- Then dive deeper into code for the internal implementation with references to files. Provide example code blocks, but make them similarly simple and beginner-friendly.
+
+- IMPORTANT: When you need to refer to other core abstractions covered in other chapters, ALWAYS use proper Markdown links like this: [Chapter Title](filename.md). Use the Complete Tutorial Structure above to find the correct filename. Example: "we will talk about [Query Processing](03_query_processing.md) in Chapter 3".
+
+- Use mermaid diagrams to illustrate complex concepts (```mermaid``` format).
+
+- Heavily use analogies and examples throughout to help beginners understand.
+
+- End the chapter with a brief conclusion that summarizes what was learned and provides a transition to the next chapter. If there is a next chapter, use a proper Markdown link: [Next Chapter Title](next_chapter_filename).
+
+- Ensure the tone is welcoming and easy for a newcomer to understand.
+
+- Output *only* the Markdown content for this chapter.
+
+Now, directly provide a super beginner-friendly Markdown output (DON'T need ```markdown``` tags):
+"""
+        chapter_content = call_llm(prompt)
+        # Basic validation/cleanup
+        actual_heading = f"# Chapter {chapter_num}: {abstraction_name}"
+        if not chapter_content.strip().startswith(f"# Chapter {chapter_num}"):
+            # Add heading if missing or incorrect, trying to preserve content
+            lines = chapter_content.strip().split('\n')
+            if lines and lines[0].strip().startswith("#"):  # If there's some heading, replace it
+                lines[0] = actual_heading
+                chapter_content = "\n".join(lines)
+            else:  # Otherwise, prepend it
+                chapter_content = f"{actual_heading}\n\n{chapter_content}"
+
+        # Add the generated content to our temporary list for the next iteration's context
+        self.chapters_written_so_far.append(chapter_content)
+
+        return chapter_content  # Return the Markdown string
+
+    def post(self, shared, prep_res, exec_res_list):
+        # exec_res_list contains the generated Markdown for each chapter, in order
+        shared["chapters"] = exec_res_list
+        # Clean up the temporary instance variable
+        del self.chapters_written_so_far
+        print(f"Finished writing {len(exec_res_list)} chapters.")
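A small sketch of the chapter filename convention used above (hypothetical chapter name):

```python
chapter_name = "Query Processing & Caching"
safe_name = "".join(c if c.isalnum() else '_' for c in chapter_name).lower()
print(f"{1:02d}_{safe_name}.md")   # 01_query_processing___caching.md
```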
+class CombineTutorial(Node):
+    def prep(self, shared):
+        project_name = shared["project_name"]
+        output_base_dir = shared.get("output_dir", "output")  # Default output dir
+        output_path = os.path.join(output_base_dir, project_name)
+        repo_url = shared["repo_url"]  # Get the repository URL
+
+        # Use 'label' from relationships_data['details']
+        relationships_data = shared["relationships"]  # {"summary": str, "details": [{"from": int, "to": int, "label": str}]}
+        chapter_order = shared["chapter_order"]  # indices
+        abstractions = shared["abstractions"]  # list of dicts
+        chapters_content = shared["chapters"]  # list of strings
+
+        # --- Generate Mermaid Diagram ---
+        mermaid_lines = ["flowchart TD"]
+        # Add nodes for each abstraction
+        for i, abstr in enumerate(abstractions):
+            # Sanitize name for Mermaid ID and label
+            node_id = f"A{i}"
+            sanitized_name = abstr['name'].replace('"', '')
+            node_label = sanitized_name  # Using sanitized name only, no index
+            mermaid_lines.append(f'    {node_id}["{node_label}"]')
+        # Add edges for relationships using 'label'
+        for rel in relationships_data['details']:
+            from_node_id = f"A{rel['from']}"
+            to_node_id = f"A{rel['to']}"
+            # Sanitize 'label' for edge label
+            edge_label = rel['label'].replace('"', '').replace('\n', ' ')  # Basic sanitization
+            # Limit edge label length for readability (optional, but good for diagrams)
+            max_label_len = 30  # Make it shorter for labels
+            if len(edge_label) > max_label_len:
+                edge_label = edge_label[:max_label_len-3] + "..."
+            mermaid_lines.append(f'    {from_node_id} -- "{edge_label}" --> {to_node_id}')
+
+        mermaid_diagram = "\n".join(mermaid_lines)
+        # --- End Mermaid ---
+
+        # Prepare index.md content
+        index_content = f"# Tutorial: {project_name}\n\n"
+        index_content += f"{relationships_data['summary']}\n\n"
+        index_content += f"**Source Repository:** [{repo_url}]({repo_url})\n\n"
+
+        # Add Mermaid diagram for relationships
+        index_content += "```mermaid\n"
+        index_content += mermaid_diagram + "\n"
+        index_content += "```\n\n"
+
+        index_content += "## Chapters\n\n"
+
+        chapter_files = []
+        # Generate chapter links based on the determined order
+        for i, abstraction_index in enumerate(chapter_order):
+            # Ensure index is valid and we have content for it
+            if 0 <= abstraction_index < len(abstractions) and i < len(chapters_content):
+                abstraction_name = abstractions[abstraction_index]["name"]
+                # Sanitize name for filename
+                safe_name = "".join(c if c.isalnum() else '_' for c in abstraction_name).lower()
+                # Use chapter number (i+1) for ordering filename
+                filename = f"{i+1:02d}_{safe_name}.md"
+                index_content += f"{i+1}. [{abstraction_name}]({filename})\n"
+
+                # Add attribution to chapter content
+                chapter_content = chapters_content[i]
+                if not chapter_content.endswith("\n\n"):
+                    chapter_content += "\n\n"
+                chapter_content += "---\n\nGenerated by [AI Codebase Knowledge Builder](https://github.com/The-Pocket/Tutorial-Codebase-Knowledge)"
+
+                # Store filename and corresponding content
+                chapter_files.append({"filename": filename, "content": chapter_content})
+            else:
+                print(f"Warning: Mismatch between chapter order, abstractions, or content at index {i} (abstraction index {abstraction_index}). Skipping file generation for this entry.")
+
+        # Add attribution to index content
+        index_content += "\n\n---\n\nGenerated by [AI Codebase Knowledge Builder](https://github.com/The-Pocket/Tutorial-Codebase-Knowledge)"
+
+        return {
+            "output_path": output_path,
+            "index_content": index_content,
+            "chapter_files": chapter_files  # List of {"filename": str, "content": str}
+        }
+
+    def exec(self, prep_res):
+        output_path = prep_res["output_path"]
+        index_content = prep_res["index_content"]
+        chapter_files = prep_res["chapter_files"]
+
+        print(f"Combining tutorial into directory: {output_path}")
+        # Rely on Node's built-in retry/fallback
+        os.makedirs(output_path, exist_ok=True)
+
+        # Write index.md
+        index_filepath = os.path.join(output_path, "index.md")
+        with open(index_filepath, "w", encoding="utf-8") as f:
+            f.write(index_content)
+        print(f"  - Wrote {index_filepath}")
+
+        # Write chapter files
+        for chapter_info in chapter_files:
+            chapter_filepath = os.path.join(output_path, chapter_info["filename"])
+            with open(chapter_filepath, "w", encoding="utf-8") as f:
+                f.write(chapter_info["content"])
+            print(f"  - Wrote {chapter_filepath}")
+
+        return output_path  # Return the final path
+
+    def post(self, shared, prep_res, exec_res):
+        shared["final_output_dir"] = exec_res  # Store the output path
+        print(f"\nTutorial generation complete! Files are in: {exec_res}")
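For illustration only, a sketch of the Mermaid snippet that the `prep` logic above would build for two abstractions joined by one relationship (made-up data, not part of the commit):

```python
abstractions = [{"name": "FetchRepo"}, {"name": "IdentifyAbstractions"}]
details = [{"from": 0, "to": 1, "label": "Provides files"}]

mermaid_lines = ["flowchart TD"]
for i, abstr in enumerate(abstractions):
    mermaid_lines.append('    A{}["{}"]'.format(i, abstr["name"]))
for rel in details:
    mermaid_lines.append('    A{} -- "{}" --> A{}'.format(rel["from"], rel["label"], rel["to"]))
print("\n".join(mermaid_lines))
# flowchart TD
#     A0["FetchRepo"]
#     A1["IdentifyAbstractions"]
#     A0 -- "Provides files" --> A1
```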
requirements.txt
@@ -1 +1,4 @@
 pocketflow>=0.0.1
+pyyaml>=6.0
+requests>=2.28.0
+google-cloud-aiplatform>=1.25.0
utils/call_llm.py
@@ -1,14 +1,87 @@
-from openai import OpenAI
-
-# Learn more about calling the LLM: https://the-pocket.github.io/PocketFlow/utility_function/llm.html
-def call_llm(prompt):
-    client = OpenAI(api_key="YOUR_API_KEY_HERE")
-    r = client.chat.completions.create(
-        model="gpt-4o",
-        messages=[{"role": "user", "content": prompt}]
-    )
-    return r.choices[0].message.content
-
-if __name__ == "__main__":
-    prompt = "What is the meaning of life?"
-    print(call_llm(prompt))
+from google import genai
+import os
+import logging
+import json
+from datetime import datetime
+
+# Configure logging
+log_directory = os.getenv("LOG_DIR", "logs")
+os.makedirs(log_directory, exist_ok=True)
+log_file = os.path.join(log_directory, f"llm_calls_{datetime.now().strftime('%Y%m%d')}.log")
+
+# Set up logger
+logger = logging.getLogger("llm_logger")
+logger.setLevel(logging.INFO)
+logger.propagate = False  # Prevent propagation to root logger
+file_handler = logging.FileHandler(log_file)
+file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
+logger.addHandler(file_handler)
+
+# Simple cache configuration
+cache_file = "llm_cache.json"
+
+def call_llm(prompt: str, use_cache: bool = True) -> str:
+    # Log the prompt
+    logger.info(f"PROMPT: {prompt}")
+
+    # Check cache if enabled
+    if use_cache:
+        # Load cache from disk
+        cache = {}
+        if os.path.exists(cache_file):
+            try:
+                with open(cache_file, 'r') as f:
+                    cache = json.load(f)
+            except:
+                logger.warning(f"Failed to load cache, starting with empty cache")
+
+        # Return from cache if exists
+        if prompt in cache:
+            logger.info(f"RESPONSE: {cache[prompt]}")
+            return cache[prompt]
+
+    # Call the LLM if not in cache or cache disabled
+    client = genai.Client(
+        vertexai=True,
+        project=os.getenv("GEMINI_PROJECT_ID", "your-project-id"),
+        location=os.getenv("GEMINI_LOCATION", "us-central1")
+    )
+    model = os.getenv("GEMINI_MODEL", "gemini-2.5-pro-exp-03-25")
+    response = client.models.generate_content(
+        model=model,
+        contents=[prompt]
+    )
+    response_text = response.text
+
+    # Log the response
+    logger.info(f"RESPONSE: {response_text}")
+
+    # Update cache if enabled
+    if use_cache:
+        # Load cache again to avoid overwrites
+        cache = {}
+        if os.path.exists(cache_file):
+            try:
+                with open(cache_file, 'r') as f:
+                    cache = json.load(f)
+            except:
+                pass
+
+        # Add to cache and save
+        cache[prompt] = response_text
+        try:
+            with open(cache_file, 'w') as f:
+                json.dump(cache, f)
+        except Exception as e:
+            logger.error(f"Failed to save cache: {e}")
+
+    return response_text
+
+if __name__ == "__main__":
+    test_prompt = "Hello, how are you?"
+
+    # First call - should hit the API
+    print("Making call...")
+    response1 = call_llm(test_prompt, use_cache=False)
+    print(f"Response: {response1}")
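A tiny illustrative sketch of the prompt-to-response JSON cache used above (demo values only; the real file is written and read by `call_llm`):

```python
import json, os

cache_file = "llm_cache.json"
cache = {"What is PocketFlow?": "A 100-line LLM framework."}
with open(cache_file, "w") as f:
    json.dump(cache, f)

prompt = "What is PocketFlow?"
with open(cache_file) as f:
    cache = json.load(f)
if prompt in cache:
    print(cache[prompt])   # served from the cache, no API call
os.remove(cache_file)      # clean up the demo file
```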
utils/crawl_github_files.py (new file)
@@ -0,0 +1,236 @@
+import requests
+import base64
+import os
+import time
+import fnmatch
+from typing import Union, Set, List, Dict, Tuple, Any
+from urllib.parse import urlparse
+
+def crawl_github_files(
+    repo_url,
+    token=None,
+    max_file_size: int = 1 * 1024 * 1024,  # 1 MB
+    use_relative_paths: bool = False,
+    include_patterns: Union[str, Set[str]] = None,
+    exclude_patterns: Union[str, Set[str]] = None
+):
+    """
+    Crawl files from a specific path in a GitHub repository at a specific commit.
+
+    Args:
+        repo_url (str): URL of the GitHub repository with specific path and commit
+            (e.g., 'https://github.com/microsoft/autogen/tree/e45a15766746d95f8cfaaa705b0371267bec812e/python/packages/autogen-core/src/autogen_core')
+        token (str, optional): GitHub personal access token. Required for private repositories and recommended for public repos to avoid rate limits.
+        max_file_size (int, optional): Maximum file size in bytes to download (default: 1 MB)
+        use_relative_paths (bool, optional): If True, file paths will be relative to the specified subdirectory
+        include_patterns (str or set of str, optional): Pattern or set of patterns specifying which files to include (e.g., "*.py", {"*.md", "*.txt"}).
+            If None, all files are included.
+        exclude_patterns (str or set of str, optional): Pattern or set of patterns specifying which files to exclude.
+            If None, no files are excluded.
+
+    Returns:
+        dict: Dictionary with files and statistics
+    """
+    # Convert single pattern to set
+    if include_patterns and isinstance(include_patterns, str):
+        include_patterns = {include_patterns}
+    if exclude_patterns and isinstance(exclude_patterns, str):
+        exclude_patterns = {exclude_patterns}
+
+    # Parse GitHub URL to extract owner, repo, commit/branch, and path
+    parsed_url = urlparse(repo_url)
+    path_parts = parsed_url.path.strip('/').split('/')
+
+    if len(path_parts) < 2:
+        raise ValueError(f"Invalid GitHub URL: {repo_url}")
+
+    # Extract the basic components
+    owner = path_parts[0]
+    repo = path_parts[1]
+
+    # Check if URL contains a specific branch/commit
+    if 'tree' in path_parts:
+        tree_index = path_parts.index('tree')
+        ref = path_parts[tree_index + 1]
+        # Combine all parts after the ref as the path
+        path_start = tree_index + 2
+        specific_path = '/'.join(path_parts[path_start:]) if path_start < len(path_parts) else ""
+    else:
+        ref = "main"  # Default branch
+        specific_path = ""
+
+    # Setup for GitHub API
+    headers = {"Accept": "application/vnd.github.v3+json"}
+    if token:
+        headers["Authorization"] = f"token {token}"
+
+    # Dictionary to store path -> content mapping
+    files = {}
+    skipped_files = []
+
+    def should_include_file(file_path: str, file_name: str) -> bool:
+        """Determine if a file should be included based on patterns"""
+        # If no include patterns are specified, include all files
+        if not include_patterns:
+            include_file = True
+        else:
+            # Check if file matches any include pattern
+            include_file = any(fnmatch.fnmatch(file_name, pattern) for pattern in include_patterns)
+
+        # If exclude patterns are specified, check if file should be excluded
+        if exclude_patterns and include_file:
+            # Exclude if file matches any exclude pattern
+            exclude_file = any(fnmatch.fnmatch(file_path, pattern) for pattern in exclude_patterns)
+            return not exclude_file
+
+        return include_file
+
+    def fetch_contents(path):
+        """Fetch contents of the repository at a specific path and commit"""
+        url = f"https://api.github.com/repos/{owner}/{repo}/contents/{path}"
+        params = {"ref": ref}
+
+        response = requests.get(url, headers=headers, params=params)
+
+        if response.status_code == 403 and 'rate limit exceeded' in response.text.lower():
+            reset_time = int(response.headers.get('X-RateLimit-Reset', 0))
+            wait_time = max(reset_time - time.time(), 0) + 1
+            print(f"Rate limit exceeded. Waiting for {wait_time:.0f} seconds...")
+            time.sleep(wait_time)
+            return fetch_contents(path)
+
+        if response.status_code == 404:
+            if not token:
+                print(f"Error 404: Repository not found or is private. If this is a private repository, you need to provide a token.")
+            else:
+                print(f"Error 404: Path '{path}' not found in repository or insufficient permissions.")
+            return
+
+        if response.status_code != 200:
+            print(f"Error fetching {path}: {response.status_code} - {response.text}")
+            return
+
+        contents = response.json()
+
+        # Handle both single file and directory responses
+        if not isinstance(contents, list):
+            contents = [contents]
+
+        for item in contents:
+            item_path = item["path"]
+
+            # Calculate relative path if requested
+            if use_relative_paths and specific_path:
+                # Make sure the path is relative to the specified subdirectory
+                if item_path.startswith(specific_path):
+                    rel_path = item_path[len(specific_path):].lstrip('/')
+                else:
+                    rel_path = item_path
+            else:
+                rel_path = item_path
+
+            if item["type"] == "file":
+                # Check if file should be included based on patterns
+                if not should_include_file(rel_path, item["name"]):
+                    print(f"Skipping {rel_path}: Does not match include/exclude patterns")
+                    continue
+
+                # Check file size if available
+                file_size = item.get("size", 0)
+                if file_size > max_file_size:
+                    skipped_files.append((item_path, file_size))
+                    print(f"Skipping {rel_path}: File size ({file_size} bytes) exceeds limit ({max_file_size} bytes)")
+                    continue
+
+                # For files, get raw content
+                if "download_url" in item and item["download_url"]:
+                    file_url = item["download_url"]
+                    file_response = requests.get(file_url, headers=headers)
+
+                    # Final size check in case content-length header is available but differs from metadata
+                    content_length = int(file_response.headers.get('content-length', 0))
+                    if content_length > max_file_size:
+                        skipped_files.append((item_path, content_length))
+                        print(f"Skipping {rel_path}: Content length ({content_length} bytes) exceeds limit ({max_file_size} bytes)")
+                        continue
+
+                    if file_response.status_code == 200:
+                        files[rel_path] = file_response.text
+                        print(f"Downloaded: {rel_path} ({file_size} bytes)")
+                    else:
+                        print(f"Failed to download {rel_path}: {file_response.status_code}")
+                else:
+                    # Alternative method if download_url is not available
+                    content_response = requests.get(item["url"], headers=headers)
+                    if content_response.status_code == 200:
+                        content_data = content_response.json()
+                        if content_data.get("encoding") == "base64" and "content" in content_data:
+                            # Check size of base64 content before decoding
+                            if len(content_data["content"]) * 0.75 > max_file_size:  # Approximate size calculation
+                                estimated_size = int(len(content_data["content"]) * 0.75)
+                                skipped_files.append((item_path, estimated_size))
+                                print(f"Skipping {rel_path}: Encoded content exceeds size limit")
+                                continue
+
+                            file_content = base64.b64decode(content_data["content"]).decode('utf-8')
+                            files[rel_path] = file_content
+                            print(f"Downloaded: {rel_path} ({file_size} bytes)")
+                        else:
+                            print(f"Unexpected content format for {rel_path}")
+                    else:
+                        print(f"Failed to get content for {rel_path}: {content_response.status_code}")
+
+            elif item["type"] == "dir":
+                # Recursively process subdirectories
+                fetch_contents(item_path)
+
+    # Start crawling from the specified path
+    fetch_contents(specific_path)
+
+    return {
+        "files": files,
+        "stats": {
+            "downloaded_count": len(files),
+            "skipped_count": len(skipped_files),
+            "skipped_files": skipped_files,
+            "base_path": specific_path if use_relative_paths else None,
+            "include_patterns": include_patterns,
+            "exclude_patterns": exclude_patterns
+        }
+    }
+
+# Example usage
+if __name__ == "__main__":
+    # Get token from environment variable (more secure than hardcoding)
+    github_token = os.environ.get("GITHUB_TOKEN")
+
+    repo_url = "https://github.com/pydantic/pydantic/tree/6c38dc93f40a47f4d1350adca9ec0d72502e223f/pydantic"
+
+    # Example: Get Python and Markdown files, but exclude test files
+    result = crawl_github_files(
+        repo_url,
+        token=github_token,
+        max_file_size=1 * 1024 * 1024,  # 1 MB in bytes
+        use_relative_paths=True,  # Enable relative paths
+        include_patterns={"*.py", "*.md"},  # Include Python and Markdown files
+    )
+
+    files = result["files"]
+    stats = result["stats"]
+
+    print(f"\nDownloaded {stats['downloaded_count']} files.")
+    print(f"Skipped {stats['skipped_count']} files due to size limits or patterns.")
+    print(f"Base path for relative paths: {stats['base_path']}")
+    print(f"Include patterns: {stats['include_patterns']}")
+    print(f"Exclude patterns: {stats['exclude_patterns']}")
+
+    # Display all file paths in the dictionary
+    print("\nFiles in dictionary:")
+    for file_path in sorted(files.keys()):
+        print(f"  {file_path}")
+
+    # Example: accessing content of a specific file
+    if files:
+        sample_file = next(iter(files))
+        print(f"\nSample file: {sample_file}")
+        print(f"Content preview: {files[sample_file][:200]}...")
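Note how the filtering above splits responsibilities: include patterns are matched against the file *name*, while exclude patterns are matched against the (relative) *path*. A small illustrative sketch of that behavior with made-up paths:

```python
import fnmatch

include_patterns = {"*.py", "*.md"}
exclude_patterns = {"tests/*"}

for path in ("pydantic/main.py", "tests/test_main.py", "docs/logo.png"):
    name = path.split("/")[-1]
    included = any(fnmatch.fnmatch(name, p) for p in include_patterns)
    excluded = any(fnmatch.fnmatch(path, p) for p in exclude_patterns)
    print(path, included and not excluded)
# pydantic/main.py True
# tests/test_main.py False
# docs/logo.png False
```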