mirror of https://github.com/aljazceru/Tutorial-Codebase-Knowledge.git
synced 2025-12-18 15:04:20 +01:00

Commit: update readme examples

README.md
@@ -16,7 +16,7 @@
 
 This is a tutorial project of [Pocket Flow](https://github.com/The-Pocket/PocketFlow), a 100-line LLM framework. It crawls GitHub repositories and builds a knowledge base from the code. It analyzes entire codebases to identify core abstractions and how they interact, and transforms complex code into beginner-friendly tutorials with clear visualizations.
 
-## Example Tutorials for Popular GitHub Repositories!
+## ⭐ Example Tutorials for Popular GitHub Repositories!
 
 - [AutoGen Core](https://the-pocket.github.io/Tutorial-Codebase-Knowledge/AutoGen%20Core) - Build AI teams that talk, think, and solve problems together like coworkers!
 
flow.py (43 changes)
@@ -1,14 +1,33 @@
 from pocketflow import Flow
-from nodes import GetQuestionNode, AnswerNode
+# Import all node classes from nodes.py
+from nodes import (
+    FetchRepo,
+    IdentifyAbstractions,
+    AnalyzeRelationships,
+    OrderChapters,
+    WriteChapters,
+    CombineTutorial
+)
 
-def create_qa_flow():
-    """Create and return a question-answering flow."""
-    # Create nodes
-    get_question_node = GetQuestionNode()
-    answer_node = AnswerNode()
-
-    # Connect nodes in sequence
-    get_question_node >> answer_node
-
-    # Create flow starting with input node
-    return Flow(start=get_question_node)
+def create_tutorial_flow():
+    """Creates and returns the codebase tutorial generation flow."""
+
+    # Instantiate nodes
+    fetch_repo = FetchRepo()
+    identify_abstractions = IdentifyAbstractions(max_retries=3, wait=10)
+    analyze_relationships = AnalyzeRelationships(max_retries=3, wait=10)
+    order_chapters = OrderChapters(max_retries=3, wait=10)
+    write_chapters = WriteChapters(max_retries=3, wait=10) # This is a BatchNode
+    combine_tutorial = CombineTutorial()
+
+    # Connect nodes in sequence based on the design
+    fetch_repo >> identify_abstractions
+    identify_abstractions >> analyze_relationships
+    analyze_relationships >> order_chapters
+    order_chapters >> write_chapters
+    write_chapters >> combine_tutorial
+
+    # Create the flow starting with FetchRepo
+    tutorial_flow = Flow(start=fetch_repo)
+
+    return tutorial_flow
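Before the larger files below, here is a minimal sketch of the prep/exec/post contract these nodes follow, inferred from how the nodes in this commit use it. It is illustrative only and not part of the commit; the actual PocketFlow base classes may differ in detail.

```python
from pocketflow import Node, Flow

class GreetNode(Node):
    def prep(self, shared):
        # prep: read what you need from the shared store
        return shared["name"]

    def exec(self, prep_res):
        # exec: do the work using only prep's return value
        return f"Hello, {prep_res}!"

    def post(self, shared, prep_res, exec_res):
        # post: write results back to the shared store
        shared["greeting"] = exec_res

shared = {"name": "world"}
Flow(start=GreetNode()).run(shared)
print(shared["greeting"])  # Hello, world!
```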
main.py (67 changes)
@@ -1,16 +1,67 @@
-from flow import qa_flow
+import os
+import argparse
+# Import the function that creates the flow
+from flow import create_tutorial_flow
 
-# Example main function
-# Please replace this with your own main function
+# Default file patterns
+DEFAULT_INCLUDE_PATTERNS = {
+    "*.py", "*.js", "*.ts", "*.go", "*.java", "*.pyi", "*.pyx",
+    "*.c", "*.cc", "*.cpp", "*.h", "*.md", "*.rst", "Dockerfile",
+    "Makefile", "*.yaml", "*.yml"
+}
+
+DEFAULT_EXCLUDE_PATTERNS = {
+    "*test*", "tests/*", "docs/*", "examples/*", "v1/*",
+    "dist/*", "build/*", "experimental/*", "deprecated/*",
+    "legacy/*", ".git/*", ".github/*"
+}
+
+# --- Main Function ---
 def main():
+    parser = argparse.ArgumentParser(description="Generate a tutorial for a GitHub codebase.")
+    parser.add_argument("repo_url", help="URL of the public GitHub repository.")
+    parser.add_argument("-n", "--name", help="Project name (optional, derived from URL if omitted).")
+    parser.add_argument("-t", "--token", help="GitHub personal access token (optional, reads from GITHUB_TOKEN env var if not provided).")
+    parser.add_argument("-o", "--output", default="output", help="Base directory for output (default: ./output).")
+    parser.add_argument("-i", "--include", nargs="+", help="Include file patterns (e.g. '*.py' '*.js'). Defaults to common code files if not specified.")
+    parser.add_argument("-e", "--exclude", nargs="+", help="Exclude file patterns (e.g. 'tests/*' 'docs/*'). Defaults to test/build directories if not specified.")
+    parser.add_argument("-s", "--max-size", type=int, default=100000, help="Maximum file size in bytes (default: 100000, about 100KB).")
+
+    args = parser.parse_args()
+
+    # Get GitHub token from argument or environment variable
+    github_token = args.token or os.environ.get('GITHUB_TOKEN')
+    if not github_token:
+        print("Warning: No GitHub token provided. You might hit rate limits for public repositories.")
+
+    # Initialize the shared dictionary with inputs
     shared = {
-        "question": "In one sentence, what's the end of universe?",
-        "answer": None
+        "repo_url": args.repo_url,
+        "project_name": args.name, # Can be None, FetchRepo will derive it
+        "github_token": github_token,
+        "output_dir": args.output, # Base directory for CombineTutorial output
+
+        # Add include/exclude patterns and max file size
+        "include_patterns": set(args.include) if args.include else DEFAULT_INCLUDE_PATTERNS,
+        "exclude_patterns": set(args.exclude) if args.exclude else DEFAULT_EXCLUDE_PATTERNS,
+        "max_file_size": args.max_size,
+
+        # Outputs will be populated by the nodes
+        "files": [],
+        "abstractions": [],
+        "relationships": {},
+        "chapter_order": [],
+        "chapters": [],
+        "final_output_dir": None
     }
 
-    qa_flow.run(shared)
-    print("Question:", shared["question"])
-    print("Answer:", shared["answer"])
+    print(f"Starting tutorial generation for: {args.repo_url}")
+
+    # Create the flow instance
+    tutorial_flow = create_tutorial_flow()
+
+    # Run the flow
+    tutorial_flow.run(shared)
+
 if __name__ == "__main__":
     main()
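For reference, the same pipeline can also be driven without the CLI. A minimal sketch that reuses the shared-dict keys set up in main.py above; the repository URL is a placeholder and the token is optional:

```python
import os
from flow import create_tutorial_flow
from main import DEFAULT_INCLUDE_PATTERNS, DEFAULT_EXCLUDE_PATTERNS

shared = {
    "repo_url": "https://github.com/The-Pocket/PocketFlow",  # placeholder target repo
    "project_name": None,                      # FetchRepo derives it from the URL
    "github_token": os.environ.get("GITHUB_TOKEN"),
    "output_dir": "output",
    "include_patterns": DEFAULT_INCLUDE_PATTERNS,
    "exclude_patterns": DEFAULT_EXCLUDE_PATTERNS,
    "max_file_size": 100000,
    # Outputs, filled in by the nodes as the flow runs
    "files": [], "abstractions": [], "relationships": {},
    "chapter_order": [], "chapters": [], "final_output_dir": None,
}

create_tutorial_flow().run(shared)
print("Tutorial written to:", shared["final_output_dir"])
```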
nodes.py (660 changes)
@@ -1,26 +1,642 @@
-from pocketflow import Node
-from utils.call_llm import call_llm
+import os
+import yaml
+from pocketflow import Node, BatchNode
+from utils.crawl_github_files import crawl_github_files
+from utils.call_llm import call_llm # Assuming you have this utility
 
-class GetQuestionNode(Node):
-    def exec(self, _):
-        # Get question directly from user input
-        user_question = input("Enter your question: ")
-        return user_question
-
-    def post(self, shared, prep_res, exec_res):
-        # Store the user's question
-        shared["question"] = exec_res
-        return "default" # Go to the next node
+# Helper to create context from files, respecting limits (basic example)
+def create_llm_context(files_data):
+    context = ""
+    file_info = [] # Store tuples of (index, path)
+    for i, (path, content) in enumerate(files_data):
+        entry = f"--- File Index {i}: {path} ---\n{content}\n\n"
+        context += entry
+        file_info.append((i, path))
 
-class AnswerNode(Node):
+    return context, file_info # file_info is list of (index, path)
+
+# Helper to get content for specific file indices
+def get_content_for_indices(files_data, indices):
+    content_map = {}
+    for i in indices:
+        if 0 <= i < len(files_data):
+            path, content = files_data[i]
+            content_map[f"{i} # {path}"] = content # Use index + path as key for context
+    return content_map
+
+class FetchRepo(Node):
     def prep(self, shared):
-        # Read question from shared
-        return shared["question"]
-
-    def exec(self, question):
-        # Call LLM to get the answer
-        return call_llm(question)
-
+        repo_url = shared["repo_url"]
+        project_name = shared.get("project_name")
+        if not project_name:
+            # Basic name derivation from URL
+            project_name = repo_url.split('/')[-1].replace('.git', '')
+            shared["project_name"] = project_name
+
+        # Get file patterns directly from shared (defaults are defined in main.py)
+        include_patterns = shared["include_patterns"]
+        exclude_patterns = shared["exclude_patterns"]
+        max_file_size = shared["max_file_size"]
+
+        return {
+            "repo_url": repo_url,
+            "token": shared.get("github_token"),
+            "include_patterns": include_patterns,
+            "exclude_patterns": exclude_patterns,
+            "max_file_size": max_file_size,
+            "use_relative_paths": True
+        }
+
+    def exec(self, prep_res):
+        print(f"Crawling repository: {prep_res['repo_url']}...")
+        result = crawl_github_files(
+            repo_url=prep_res["repo_url"],
+            token=prep_res["token"],
+            include_patterns=prep_res["include_patterns"],
+            exclude_patterns=prep_res["exclude_patterns"],
+            max_file_size=prep_res["max_file_size"],
+            use_relative_paths=prep_res["use_relative_paths"]
+        )
+        # Convert dict to list of tuples: [(path, content), ...]
+        files_list = list(result.get("files", {}).items())
+        print(f"Fetched {len(files_list)} files.")
+        return files_list
 
     def post(self, shared, prep_res, exec_res):
-        # Store the answer in shared
-        shared["answer"] = exec_res
+        shared["files"] = exec_res # List of (path, content) tuples
+
+class IdentifyAbstractions(Node):
+    def prep(self, shared):
+        files_data = shared["files"]
+        project_name = shared["project_name"] # Get project name
+        context, file_info = create_llm_context(files_data)
+        # Format file info for the prompt (comment is just a hint for LLM)
+        file_listing_for_prompt = "\n".join([f"- {idx} # {path}" for idx, path in file_info])
+        return context, file_listing_for_prompt, len(files_data), project_name # Return project name
+
+    def exec(self, prep_res):
+        context, file_listing_for_prompt, file_count, project_name = prep_res # Unpack project name
+        print("Identifying abstractions using LLM...")
+        prompt = f"""
+For the project `{project_name}`:
+
+Codebase Context:
+{context}
+
+Analyze the codebase context.
+Identify the top 5-10 core most important abstractions to help those new to the codebase.
+
+For each abstraction, provide:
+1. A concise `name`.
+2. A beginner-friendly `description` explaining what it is with a simple analogy, in around 100 words.
+3. A list of relevant `file_indices` (integers) using the format `idx # path/comment`.
+
+List of file indices and paths present in the context:
+{file_listing_for_prompt}
+
+Format the output as a YAML list of dictionaries:
+
+```yaml
+- name: Query Processing
+  description: |
+    Explains what the abstraction does.
+    It's like a central dispatcher routing requests.
+  file_indices:
+    - 0 # path/to/file1.py
+    - 3 # path/to/related.py
+- name: Query Optimization
+  description: |
+    Another core concept, similar to a blueprint for objects.
+  file_indices:
+    - 5 # path/to/another.js
+# ... up to 10 abstractions
+```"""
+        response = call_llm(prompt)
+
+        # --- Validation ---
+        yaml_str = response.strip().split("```yaml")[1].split("```")[0].strip()
+        abstractions = yaml.safe_load(yaml_str)
+
+        if not isinstance(abstractions, list):
+            raise ValueError("LLM Output is not a list")
+
+        validated_abstractions = []
+        for item in abstractions:
+            if not isinstance(item, dict) or not all(k in item for k in ["name", "description", "file_indices"]):
+                raise ValueError(f"Missing keys in abstraction item: {item}")
+            if not isinstance(item["description"], str):
+                raise ValueError(f"description is not a string in item: {item}")
+            if not isinstance(item["file_indices"], list):
+                raise ValueError(f"file_indices is not a list in item: {item}")
+
+            # Validate indices
+            validated_indices = []
+            for idx_entry in item["file_indices"]:
+                try:
+                    if isinstance(idx_entry, int):
+                        idx = idx_entry
+                    elif isinstance(idx_entry, str) and '#' in idx_entry:
+                        idx = int(idx_entry.split('#')[0].strip())
+                    else:
+                        idx = int(str(idx_entry).strip())
+
+                    if not (0 <= idx < file_count):
+                        raise ValueError(f"Invalid file index {idx} found in item {item['name']}. Max index is {file_count - 1}.")
+                    validated_indices.append(idx)
+                except (ValueError, TypeError):
+                    raise ValueError(f"Could not parse index from entry: {idx_entry} in item {item['name']}")
+
+            item["files"] = sorted(list(set(validated_indices)))
+            # Store only the required fields
+            validated_abstractions.append({
+                "name": item["name"],
+                "description": item["description"],
+                "files": item["files"]
+            })
+
+        print(f"Identified {len(validated_abstractions)} abstractions.")
+        return validated_abstractions
+
+    def post(self, shared, prep_res, exec_res):
+        shared["abstractions"] = exec_res # List of {"name": str, "description": str, "files": [int]}
+
+class AnalyzeRelationships(Node):
+    def prep(self, shared):
+        abstractions = shared["abstractions"] # Now contains 'files' list of indices
+        files_data = shared["files"]
+        project_name = shared["project_name"] # Get project name
+
+        # Create context with abstraction names, indices, descriptions, and relevant file snippets
+        context = "Identified Abstractions:\n"
+        all_relevant_indices = set()
+        abstraction_info_for_prompt = []
+        for i, abstr in enumerate(abstractions):
+            # Use 'files' which contains indices directly
+            file_indices_str = ", ".join(map(str, abstr['files']))
+            info_line = f"- Index {i}: {abstr['name']} (Relevant file indices: [{file_indices_str}])\n Description: {abstr['description']}"
+            context += info_line + "\n"
+            abstraction_info_for_prompt.append(f"{i} # {abstr['name']}")
+            all_relevant_indices.update(abstr['files'])
+
+        context += "\nRelevant File Snippets (Referenced by Index and Path):\n"
+        # Get content for relevant files using helper
+        relevant_files_content_map = get_content_for_indices(
+            files_data,
+            sorted(list(all_relevant_indices))
+        )
+        # Format file content for context
+        file_context_str = "\n\n".join(
+            f"--- File: {idx_path} ---\n{content}"
+            for idx_path, content in relevant_files_content_map.items()
+        )
+        context += file_context_str
+
+        return context, "\n".join(abstraction_info_for_prompt), project_name # Return project name
+
+    def exec(self, prep_res):
+        context, abstraction_listing, project_name = prep_res # Unpack project name
+        print("Analyzing relationships using LLM...")
+        prompt = f"""
+Based on the following abstractions and relevant code snippets from the project `{project_name}`:
+
+List of Abstraction Indices and Names:
+{abstraction_listing}
+
+Context (Abstractions, Descriptions, Code):
+{context}
+
+Please provide:
+1. A high-level `summary` of the project's main purpose and functionality in a few beginner-friendly sentences. Use markdown formatting with **bold** and *italic* text to highlight important concepts.
+2. A list (`relationships`) describing the key interactions between these abstractions. For each relationship, specify:
+    - `from_abstraction`: Index of the source abstraction (e.g., `0 # AbstractionName1`)
+    - `to_abstraction`: Index of the target abstraction (e.g., `1 # AbstractionName2`)
+    - `label`: A brief label for the interaction **in just a few words** (e.g., "Manages", "Inherits", "Uses").
+    Ideally the relationship should be backed by one abstraction calling or passing parameters to another.
+    Simplify the relationship and exclude those non-important ones.
+
+IMPORTANT: Make sure EVERY abstraction is involved in at least ONE relationship (either as source or target). Each abstraction index must appear at least once across all relationships.
+
+Format the output as YAML:
+
+```yaml
+summary: |
+  A brief, simple explanation of the project.
+  Can span multiple lines with **bold** and *italic* for emphasis.
+relationships:
+  - from_abstraction: 0 # AbstractionName1
+    to_abstraction: 1 # AbstractionName2
+    label: "Manages"
+  - from_abstraction: 2 # AbstractionName3
+    to_abstraction: 0 # AbstractionName1
+    label: "Provides config"
+  # ... other relationships
+```
+
+Now, provide the YAML output:
+"""
+        response = call_llm(prompt)
+
+        # --- Validation ---
+        yaml_str = response.strip().split("```yaml")[1].split("```")[0].strip()
+        relationships_data = yaml.safe_load(yaml_str)
+
+        if not isinstance(relationships_data, dict) or not all(k in relationships_data for k in ["summary", "relationships"]):
+            raise ValueError("LLM output is not a dict or missing keys ('summary', 'relationships')")
+        if not isinstance(relationships_data["summary"], str):
+            raise ValueError("summary is not a string")
+        if not isinstance(relationships_data["relationships"], list):
+            raise ValueError("relationships is not a list")
+
+        # Validate relationships structure
+        validated_relationships = []
+        num_abstractions = len(abstraction_listing.split('\n'))
+        for rel in relationships_data["relationships"]:
+            # Check for 'label' key
+            if not isinstance(rel, dict) or not all(k in rel for k in ["from_abstraction", "to_abstraction", "label"]):
+                raise ValueError(f"Missing keys (expected from_abstraction, to_abstraction, label) in relationship item: {rel}")
+            # Validate 'label' is a string
+            if not isinstance(rel["label"], str):
+                raise ValueError(f"Relationship label is not a string: {rel}")
+
+            # Validate indices
+            try:
+                from_idx = int(str(rel["from_abstraction"]).split('#')[0].strip())
+                to_idx = int(str(rel["to_abstraction"]).split('#')[0].strip())
+                if not (0 <= from_idx < num_abstractions and 0 <= to_idx < num_abstractions):
+                    raise ValueError(f"Invalid index in relationship: from={from_idx}, to={to_idx}. Max index is {num_abstractions-1}.")
+                validated_relationships.append({
+                    "from": from_idx,
+                    "to": to_idx,
+                    "label": rel["label"]
+                })
+            except (ValueError, TypeError):
+                raise ValueError(f"Could not parse indices from relationship: {rel}")
+
+        print("Generated project summary and relationship details.")
+        return {
+            "summary": relationships_data["summary"],
+            "details": validated_relationships # Store validated, index-based relationships
+        }
+
+
+    def post(self, shared, prep_res, exec_res):
+        # Structure is now {"summary": str, "details": [{"from": int, "to": int, "label": str}]}
+        shared["relationships"] = exec_res
+
+class OrderChapters(Node):
+    def prep(self, shared):
+        abstractions = shared["abstractions"]
+        relationships = shared["relationships"]
+        project_name = shared["project_name"] # Get project name
+
+        # Prepare context for the LLM
+        abstraction_info_for_prompt = []
+        for i, a in enumerate(abstractions):
+            abstraction_info_for_prompt.append(f"- {i} # {a['name']}")
+        abstraction_listing = "\n".join(abstraction_info_for_prompt)
+
+        context = f"Project Summary:\n{relationships['summary']}\n\n"
+        context += "Relationships (Indices refer to abstractions above):\n"
+        for rel in relationships['details']:
+            from_name = abstractions[rel['from']]['name']
+            to_name = abstractions[rel['to']]['name']
+            # Use 'label' instead of 'desc'
+            context += f"- From {rel['from']} ({from_name}) to {rel['to']} ({to_name}): {rel['label']}\n"
+
+        return abstraction_listing, context, len(abstractions), project_name
+
+    def exec(self, prep_res):
+        abstraction_listing, context, num_abstractions, project_name = prep_res
+        print("Determining chapter order using LLM...")
+        prompt = f"""
+Given the following project abstractions and their relationships for the project ```` {project_name} ````:
+
+Abstractions (Index # Name):
+{abstraction_listing}
+
+Context about relationships and project summary:
+{context}
+
+If you are going to make a tutorial for ```` {project_name} ````, what is the best order to explain these abstractions, from first to last?
+Ideally, first explain those that are the most important or foundational, perhaps user-facing concepts or entry points. Then move to more detailed, lower-level implementation details or supporting concepts.
+
+Output the ordered list of abstraction indices, including the name in a comment for clarity. Use the format `idx # AbstractionName`.
+
+```yaml
+- 2 # FoundationalConcept
+- 0 # CoreClassA
+- 1 # CoreClassB (uses CoreClassA)
+- ...
+```
+
+Now, provide the YAML output:
+"""
+        response = call_llm(prompt)
+
+        # --- Validation ---
+        # Rely on Node's built-in retry/fallback
+        yaml_str = response.strip().split("```yaml")[1].split("```")[0].strip()
+        ordered_indices_raw = yaml.safe_load(yaml_str)
+
+        if not isinstance(ordered_indices_raw, list):
+            raise ValueError("LLM output is not a list")
+
+        ordered_indices = []
+        seen_indices = set()
+        for entry in ordered_indices_raw:
+            try:
+                if isinstance(entry, int):
+                    idx = entry
+                elif isinstance(entry, str) and '#' in entry:
+                    idx = int(entry.split('#')[0].strip())
+                else:
+                    idx = int(str(entry).strip())
+
+                if not (0 <= idx < num_abstractions):
+                    raise ValueError(f"Invalid index {idx} in ordered list. Max index is {num_abstractions-1}.")
+                if idx in seen_indices:
+                    raise ValueError(f"Duplicate index {idx} found in ordered list.")
+                ordered_indices.append(idx)
+                seen_indices.add(idx)
+
+            except (ValueError, TypeError):
+                raise ValueError(f"Could not parse index from ordered list entry: {entry}")
+
+        # Check if all abstractions are included
+        if len(ordered_indices) != num_abstractions:
+            raise ValueError(f"Ordered list length ({len(ordered_indices)}) does not match number of abstractions ({num_abstractions}). Missing indices: {set(range(num_abstractions)) - seen_indices}")
+
+        print(f"Determined chapter order (indices): {ordered_indices}")
+        return ordered_indices # Return the list of indices
+
+    def post(self, shared, prep_res, exec_res):
+        # exec_res is already the list of ordered indices
+        shared["chapter_order"] = exec_res # List of indices
+
+class WriteChapters(BatchNode):
+    def prep(self, shared):
+        chapter_order = shared["chapter_order"] # List of indices
+        abstractions = shared["abstractions"] # List of dicts, now using 'files' with indices
+        files_data = shared["files"]
+        # Get already written chapters to provide context
+        # We store them temporarily during the batch run, not in shared memory yet
+        # The 'previous_chapters_summary' will be built progressively in the exec context
+        self.chapters_written_so_far = [] # Use instance variable for temporary storage across exec calls
+
+        # Create a complete list of all chapters
+        all_chapters = []
+        chapter_filenames = {} # Store chapter filename mapping for linking
+        for i, abstraction_index in enumerate(chapter_order):
+            if 0 <= abstraction_index < len(abstractions):
+                chapter_num = i + 1
+                chapter_name = abstractions[abstraction_index]["name"]
+                # Create safe filename
+                safe_name = "".join(c if c.isalnum() else '_' for c in chapter_name).lower()
+                filename = f"{i+1:02d}_{safe_name}.md"
+                # Format with link
+                all_chapters.append(f"{chapter_num}. [{chapter_name}]({filename})")
+                # Store mapping of chapter index to filename for linking
+                chapter_filenames[abstraction_index] = {"num": chapter_num, "name": chapter_name, "filename": filename}
+
+        # Create a formatted string with all chapters
+        full_chapter_listing = "\n".join(all_chapters)
+
+        items_to_process = []
+        for i, abstraction_index in enumerate(chapter_order):
+            if 0 <= abstraction_index < len(abstractions):
+                abstraction_details = abstractions[abstraction_index]
+                # Use 'files' (list of indices) directly
+                related_file_indices = abstraction_details.get("files", [])
+                # Get content using helper, passing indices
+                related_files_content_map = get_content_for_indices(files_data, related_file_indices)
+
+                # Get previous chapter info for transitions
+                prev_chapter = None
+                if i > 0:
+                    prev_idx = chapter_order[i-1]
+                    prev_chapter = chapter_filenames[prev_idx]
+
+                # Get next chapter info for transitions
+                next_chapter = None
+                if i < len(chapter_order) - 1:
+                    next_idx = chapter_order[i+1]
+                    next_chapter = chapter_filenames[next_idx]
+
+                items_to_process.append({
+                    "chapter_num": i + 1,
+                    "abstraction_index": abstraction_index,
+                    "abstraction_details": abstraction_details,
+                    "related_files_content_map": related_files_content_map,
+                    "project_name": shared["project_name"], # Add project name
+                    "full_chapter_listing": full_chapter_listing, # Add the full chapter listing
+                    "chapter_filenames": chapter_filenames, # Add chapter filenames mapping
+                    "prev_chapter": prev_chapter, # Add previous chapter info
+                    "next_chapter": next_chapter, # Add next chapter info
+                    # previous_chapters_summary will be added dynamically in exec
+                })
+            else:
+                print(f"Warning: Invalid abstraction index {abstraction_index} in chapter_order. Skipping.")
+
+        print(f"Preparing to write {len(items_to_process)} chapters...")
+        return items_to_process # Iterable for BatchNode
+
+    def exec(self, item):
+        # This runs for each item prepared above
+        abstraction_name = item["abstraction_details"]["name"]
+        chapter_num = item["chapter_num"]
+        project_name = item.get("project_name") # Get from item
+        print(f"Writing chapter {chapter_num} for: {abstraction_name} using LLM...")
+
+        # Prepare file context string from the map
+        file_context_str = "\n\n".join(
+            f"--- File: {idx_path.split('# ')[1] if '# ' in idx_path else idx_path} ---\n{content}"
+            for idx_path, content in item["related_files_content_map"].items()
+        )
+
+        # Get summary of chapters written *before* this one
+        # Use the temporary instance variable
+        previous_chapters_summary = "\n---\n".join(self.chapters_written_so_far)
+
+
+        prompt = f"""
+Write a very beginner-friendly tutorial chapter (in Markdown format) for the project `{project_name}` about the concept: "{abstraction_name}". This is Chapter {chapter_num}.
+
+Concept Details:
+- Description:
+{item["abstraction_details"]["description"]}
+
+Complete Tutorial Structure:
+{item["full_chapter_listing"]}
+
+Context from previous chapters (summary):
+{previous_chapters_summary if previous_chapters_summary else "This is the first chapter."}
+
+Relevant Code Snippets:
+{file_context_str if file_context_str else "No specific code snippets provided for this abstraction."}
+
+Instructions for the chapter:
+- Start with a clear heading (e.g., `# Chapter {chapter_num}: {abstraction_name}`).
+
+- If this is not the first chapter, begin with a brief transition from the previous chapter, referencing it with a proper Markdown link.
+
+- Begin with a high-level motivation explaining what problem this abstraction solves. Start with a central use case as a concrete example. The whole chapter should guide the reader to understand how to solve this use case. Make it very minimal and friendly to beginners.
+
+- If the abstraction is complex, break it down into key concepts. Explain each concept one-by-one in a very beginner-friendly way.
+
+- Explain how to use this abstraction to solve the use case. Give example inputs and outputs for code snippets (if the output isn't values, describe at a high level what will happen).
+
+- Each code block should be BELOW 20 lines! If longer code blocks are needed, break them down into smaller pieces and walk through them one-by-one. Aggressively simplify the code to make it minimal. Use comments to skip non-important implementation details. Each code block should have a beginner friendly explanation right after it.
+
+- Describe the internal implementation to help understand what's under the hood. First provide a non-code or code-light walkthrough on what happens step-by-step when the abstraction is called. It's recommended to use a simple sequenceDiagram with a dummy example - keep it minimal with at most 5 participants to ensure clarity. If participant name has space, use:
+`participant QP as Query Processing`
+
+- Then dive deeper into code for the internal implementation with references to files. Provide example code blocks, but make them similarly simple and beginner-friendly.
+
+- IMPORTANT: When you need to refer to other core abstractions covered in other chapters, ALWAYS use proper Markdown links like this: [Chapter Title](filename.md). Use the Complete Tutorial Structure above to find the correct filename. Example: "we will talk about [Query Processing](03_query_processing.md) in Chapter 3".
+
+- Use mermaid diagrams to illustrate complex concepts (```mermaid``` format).
+
+- Heavily use analogies and examples throughout to help beginners understand.
+
+- End the chapter with a brief conclusion that summarizes what was learned and provides a transition to the next chapter. If there is a next chapter, use a proper Markdown link: [Next Chapter Title](next_chapter_filename).
+
+- Ensure the tone is welcoming and easy for a newcomer to understand.
+
+- Output *only* the Markdown content for this chapter.
+
+Now, directly provide a super beginner-friendly Markdown output (DON'T need ```markdown``` tags):
+"""
+        chapter_content = call_llm(prompt)
+        # Basic validation/cleanup
+        actual_heading = f"# Chapter {chapter_num}: {abstraction_name}"
+        if not chapter_content.strip().startswith(f"# Chapter {chapter_num}"):
+            # Add heading if missing or incorrect, trying to preserve content
+            lines = chapter_content.strip().split('\n')
+            if lines and lines[0].strip().startswith("#"): # If there's some heading, replace it
+                lines[0] = actual_heading
+                chapter_content = "\n".join(lines)
+            else: # Otherwise, prepend it
+                chapter_content = f"{actual_heading}\n\n{chapter_content}"
+
+        # Add the generated content to our temporary list for the next iteration's context
+        self.chapters_written_so_far.append(chapter_content)
+
+        return chapter_content # Return the Markdown string
+
+    def post(self, shared, prep_res, exec_res_list):
+        # exec_res_list contains the generated Markdown for each chapter, in order
+        shared["chapters"] = exec_res_list
+        # Clean up the temporary instance variable
+        del self.chapters_written_so_far
+        print(f"Finished writing {len(exec_res_list)} chapters.")
+
+class CombineTutorial(Node):
+    def prep(self, shared):
+        project_name = shared["project_name"]
+        output_base_dir = shared.get("output_dir", "output") # Default output dir
+        output_path = os.path.join(output_base_dir, project_name)
+        repo_url = shared["repo_url"] # Get the repository URL
+
+        # Use 'label' from relationships_data['details']
+        relationships_data = shared["relationships"] # {"summary": str, "details": [{"from": int, "to": int, "label": str}]}
+        chapter_order = shared["chapter_order"] # indices
+        abstractions = shared["abstractions"] # list of dicts
+        chapters_content = shared["chapters"] # list of strings
+
+        # --- Generate Mermaid Diagram ---
+        mermaid_lines = ["flowchart TD"]
+        # Add nodes for each abstraction
+        for i, abstr in enumerate(abstractions):
+            # Sanitize name for Mermaid ID and label
+            node_id = f"A{i}"
+            sanitized_name = abstr['name'].replace('"', '')
+            node_label = sanitized_name # Using sanitized name only, no index
+            mermaid_lines.append(f'    {node_id}["{node_label}"]')
+        # Add edges for relationships using 'label'
+        for rel in relationships_data['details']:
+            from_node_id = f"A{rel['from']}"
+            to_node_id = f"A{rel['to']}"
+            # Sanitize 'label' for edge label
+            edge_label = rel['label'].replace('"', '').replace('\n', ' ') # Basic sanitization
+            # Limit edge label length for readability (optional, but good for diagrams)
+            max_label_len = 30 # Make it shorter for labels
+            if len(edge_label) > max_label_len:
+                edge_label = edge_label[:max_label_len-3] + "..."
+            mermaid_lines.append(f'    {from_node_id} -- "{edge_label}" --> {to_node_id}')
+
+        mermaid_diagram = "\n".join(mermaid_lines)
+        # --- End Mermaid ---
+
+
+        # Prepare index.md content
+        index_content = f"# Tutorial: {project_name}\n\n"
+        index_content += f"{relationships_data['summary']}\n\n"
+        index_content += f"**Source Repository:** [{repo_url}]({repo_url})\n\n"
+
+        # Add Mermaid diagram for relationships
+        index_content += "```mermaid\n"
+        index_content += mermaid_diagram + "\n"
+        index_content += "```\n\n"
+
+        index_content += "## Chapters\n\n"
+
+        chapter_files = []
+        # Generate chapter links based on the determined order
+        for i, abstraction_index in enumerate(chapter_order):
+            # Ensure index is valid and we have content for it
+            if 0 <= abstraction_index < len(abstractions) and i < len(chapters_content):
+                abstraction_name = abstractions[abstraction_index]["name"]
+                # Sanitize name for filename
+                safe_name = "".join(c if c.isalnum() else '_' for c in abstraction_name).lower()
+                # Use chapter number (i+1) for ordering filename
+                filename = f"{i+1:02d}_{safe_name}.md"
+                index_content += f"{i+1}. [{abstraction_name}]({filename})\n"
+
+                # Add attribution to chapter content
+                chapter_content = chapters_content[i]
+                if not chapter_content.endswith("\n\n"):
+                    chapter_content += "\n\n"
+                chapter_content += "---\n\nGenerated by [AI Codebase Knowledge Builder](https://github.com/The-Pocket/Tutorial-Codebase-Knowledge)"
+
+                # Store filename and corresponding content
+                chapter_files.append({"filename": filename, "content": chapter_content})
+            else:
+                print(f"Warning: Mismatch between chapter order, abstractions, or content at index {i} (abstraction index {abstraction_index}). Skipping file generation for this entry.")
+
+        # Add attribution to index content
+        index_content += "\n\n---\n\nGenerated by [AI Codebase Knowledge Builder](https://github.com/The-Pocket/Tutorial-Codebase-Knowledge)"
+
+        return {
+            "output_path": output_path,
+            "index_content": index_content,
+            "chapter_files": chapter_files # List of {"filename": str, "content": str}
+        }
+
+    def exec(self, prep_res):
+        output_path = prep_res["output_path"]
+        index_content = prep_res["index_content"]
+        chapter_files = prep_res["chapter_files"]
+
+        print(f"Combining tutorial into directory: {output_path}")
+        # Rely on Node's built-in retry/fallback
+        os.makedirs(output_path, exist_ok=True)
+
+        # Write index.md
+        index_filepath = os.path.join(output_path, "index.md")
+        with open(index_filepath, "w", encoding="utf-8") as f:
+            f.write(index_content)
+        print(f"  - Wrote {index_filepath}")
+
+        # Write chapter files
+        for chapter_info in chapter_files:
+            chapter_filepath = os.path.join(output_path, chapter_info["filename"])
+            with open(chapter_filepath, "w", encoding="utf-8") as f:
+                f.write(chapter_info["content"])
+            print(f"  - Wrote {chapter_filepath}")
+
+        return output_path # Return the final path
+
+
+    def post(self, shared, prep_res, exec_res):
+        shared["final_output_dir"] = exec_res # Store the output path
+        print(f"\nTutorial generation complete! Files are in: {exec_res}")
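A pattern worth noting in nodes.py: every LLM-facing node asks for a fenced yaml block, slices it out of the raw response, parses it with yaml.safe_load, and shape-checks the result so that a bad response raises and the node's max_retries/wait settings take over. A condensed, standalone sketch of that pattern (illustrative only; parse_yaml_block is not a helper in this commit):

```python
import yaml

def parse_yaml_block(response: str, required_keys=None):
    # Slice out the fenced block the prompt asked for; an IndexError here
    # (no ```yaml fence in the response) surfaces and triggers the Node retry.
    yaml_str = response.strip().split("```yaml")[1].split("```")[0].strip()
    data = yaml.safe_load(yaml_str)

    # Shape-check so malformed output also fails loudly instead of propagating
    if required_keys is not None:
        if not isinstance(data, dict) or not all(k in data for k in required_keys):
            raise ValueError(f"Missing keys {required_keys} in LLM output: {data!r}")
    return data
```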
requirements.txt
@@ -1 +1,4 @@
 pocketflow>=0.0.1
+pyyaml>=6.0
+requests>=2.28.0
+google-cloud-aiplatform>=1.25.0
utils/call_llm.py
@@ -1,14 +1,87 @@
-from openai import OpenAI
+from google import genai
+import os
+import logging
+import json
+from datetime import datetime
 
 # Learn more about calling the LLM: https://the-pocket.github.io/PocketFlow/utility_function/llm.html
-def call_llm(prompt):
-    client = OpenAI(api_key="YOUR_API_KEY_HERE")
-    r = client.chat.completions.create(
-        model="gpt-4o",
-        messages=[{"role": "user", "content": prompt}]
-    )
-    return r.choices[0].message.content
+# Configure logging
+log_directory = os.getenv("LOG_DIR", "logs")
+os.makedirs(log_directory, exist_ok=True)
+log_file = os.path.join(log_directory, f"llm_calls_{datetime.now().strftime('%Y%m%d')}.log")
+
+# Set up logger
+logger = logging.getLogger("llm_logger")
+logger.setLevel(logging.INFO)
+logger.propagate = False # Prevent propagation to root logger
+file_handler = logging.FileHandler(log_file)
+file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
+logger.addHandler(file_handler)
+
+# Simple cache configuration
+cache_file = "llm_cache.json"
+
+def call_llm(prompt: str, use_cache: bool = True) -> str:
+    # Log the prompt
+    logger.info(f"PROMPT: {prompt}")
+
+    # Check cache if enabled
+    if use_cache:
+        # Load cache from disk
+        cache = {}
+        if os.path.exists(cache_file):
+            try:
+                with open(cache_file, 'r') as f:
+                    cache = json.load(f)
+            except:
+                logger.warning(f"Failed to load cache, starting with empty cache")
+
+        # Return from cache if exists
+        if prompt in cache:
+            logger.info(f"RESPONSE: {cache[prompt]}")
+            return cache[prompt]
+
+    # Call the LLM if not in cache or cache disabled
+    client = genai.Client(
+        vertexai=True,
+        project=os.getenv("GEMINI_PROJECT_ID", "your-project-id"),
+        location=os.getenv("GEMINI_LOCATION", "us-central1")
+    )
+    model = os.getenv("GEMINI_MODEL", "gemini-2.5-pro-exp-03-25")
+    response = client.models.generate_content(
+        model=model,
+        contents=[prompt]
+    )
+    response_text = response.text
+
+    # Log the response
+    logger.info(f"RESPONSE: {response_text}")
+
+    # Update cache if enabled
+    if use_cache:
+        # Load cache again to avoid overwrites
+        cache = {}
+        if os.path.exists(cache_file):
+            try:
+                with open(cache_file, 'r') as f:
+                    cache = json.load(f)
+            except:
+                pass
+
+        # Add to cache and save
+        cache[prompt] = response_text
+        try:
+            with open(cache_file, 'w') as f:
+                json.dump(cache, f)
+        except Exception as e:
+            logger.error(f"Failed to save cache: {e}")
+
+    return response_text
 
 if __name__ == "__main__":
-    prompt = "What is the meaning of life?"
-    print(call_llm(prompt))
+    test_prompt = "Hello, how are you?"
+
+    # First call - should hit the API
+    print("Making call...")
+    response1 = call_llm(test_prompt, use_cache=False)
+    print(f"Response: {response1}")
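call_llm is configured entirely through the environment variables read above (GEMINI_PROJECT_ID, GEMINI_LOCATION, GEMINI_MODEL, LOG_DIR). A minimal usage sketch, assuming it is run from the project root with a Vertex AI project already set up; the values shown are placeholders:

```python
import os

# Placeholders; set these to your own Vertex AI project values
os.environ["GEMINI_PROJECT_ID"] = "my-gcp-project"
os.environ["GEMINI_LOCATION"] = "us-central1"
os.environ["GEMINI_MODEL"] = "gemini-2.5-pro-exp-03-25"
os.environ["LOG_DIR"] = "logs"

from utils.call_llm import call_llm

# The first call hits the API and is written to llm_cache.json;
# repeating the same prompt with use_cache=True returns the cached text.
print(call_llm("Summarize what PocketFlow is in one sentence."))
print(call_llm("Summarize what PocketFlow is in one sentence.", use_cache=True))
```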
utils/crawl_github_files.py (new file, 236 lines)
@@ -0,0 +1,236 @@
+import requests
+import base64
+import os
+import time
+import fnmatch
+from typing import Union, Set, List, Dict, Tuple, Any
+from urllib.parse import urlparse
+
+def crawl_github_files(
+    repo_url,
+    token=None,
+    max_file_size: int = 1 * 1024 * 1024,  # 1 MB
+    use_relative_paths: bool = False,
+    include_patterns: Union[str, Set[str]] = None,
+    exclude_patterns: Union[str, Set[str]] = None
+):
+    """
+    Crawl files from a specific path in a GitHub repository at a specific commit.
+
+    Args:
+        repo_url (str): URL of the GitHub repository with specific path and commit
+            (e.g., 'https://github.com/microsoft/autogen/tree/e45a15766746d95f8cfaaa705b0371267bec812e/python/packages/autogen-core/src/autogen_core')
+        token (str, optional): GitHub personal access token. Required for private repositories and recommended for public repos to avoid rate limits.
+        max_file_size (int, optional): Maximum file size in bytes to download (default: 1 MB)
+        use_relative_paths (bool, optional): If True, file paths will be relative to the specified subdirectory
+        include_patterns (str or set of str, optional): Pattern or set of patterns specifying which files to include (e.g., "*.py", {"*.md", "*.txt"}).
+            If None, all files are included.
+        exclude_patterns (str or set of str, optional): Pattern or set of patterns specifying which files to exclude.
+            If None, no files are excluded.
+
+    Returns:
+        dict: Dictionary with files and statistics
+    """
+    # Convert single pattern to set
+    if include_patterns and isinstance(include_patterns, str):
+        include_patterns = {include_patterns}
+    if exclude_patterns and isinstance(exclude_patterns, str):
+        exclude_patterns = {exclude_patterns}
+
+    # Parse GitHub URL to extract owner, repo, commit/branch, and path
+    parsed_url = urlparse(repo_url)
+    path_parts = parsed_url.path.strip('/').split('/')
+
+    if len(path_parts) < 2:
+        raise ValueError(f"Invalid GitHub URL: {repo_url}")
+
+    # Extract the basic components
+    owner = path_parts[0]
+    repo = path_parts[1]
+
+    # Check if URL contains a specific branch/commit
+    if 'tree' in path_parts:
+        tree_index = path_parts.index('tree')
+        ref = path_parts[tree_index + 1]
+        # Combine all parts after the ref as the path
+        path_start = tree_index + 2
+        specific_path = '/'.join(path_parts[path_start:]) if path_start < len(path_parts) else ""
+    else:
+        ref = "main"  # Default branch
+        specific_path = ""
+
+    # Setup for GitHub API
+    headers = {"Accept": "application/vnd.github.v3+json"}
+    if token:
+        headers["Authorization"] = f"token {token}"
+
+    # Dictionary to store path -> content mapping
+    files = {}
+    skipped_files = []
+
+    def should_include_file(file_path: str, file_name: str) -> bool:
+        """Determine if a file should be included based on patterns"""
+        # If no include patterns are specified, include all files
+        if not include_patterns:
+            include_file = True
+        else:
+            # Check if file matches any include pattern
+            include_file = any(fnmatch.fnmatch(file_name, pattern) for pattern in include_patterns)
+
+        # If exclude patterns are specified, check if file should be excluded
+        if exclude_patterns and include_file:
+            # Exclude if file matches any exclude pattern
+            exclude_file = any(fnmatch.fnmatch(file_path, pattern) for pattern in exclude_patterns)
+            return not exclude_file
+
+        return include_file
+
+    def fetch_contents(path):
+        """Fetch contents of the repository at a specific path and commit"""
+        url = f"https://api.github.com/repos/{owner}/{repo}/contents/{path}"
+        params = {"ref": ref}
+
+        response = requests.get(url, headers=headers, params=params)
+
+        if response.status_code == 403 and 'rate limit exceeded' in response.text.lower():
+            reset_time = int(response.headers.get('X-RateLimit-Reset', 0))
+            wait_time = max(reset_time - time.time(), 0) + 1
+            print(f"Rate limit exceeded. Waiting for {wait_time:.0f} seconds...")
+            time.sleep(wait_time)
+            return fetch_contents(path)
+
+        if response.status_code == 404:
+            if not token:
+                print(f"Error 404: Repository not found or is private. If this is a private repository, you need to provide a token.")
+            else:
+                print(f"Error 404: Path '{path}' not found in repository or insufficient permissions.")
+            return
+
+        if response.status_code != 200:
+            print(f"Error fetching {path}: {response.status_code} - {response.text}")
+            return
+
+        contents = response.json()
+
+        # Handle both single file and directory responses
+        if not isinstance(contents, list):
+            contents = [contents]
+
+        for item in contents:
+            item_path = item["path"]
+
+            # Calculate relative path if requested
+            if use_relative_paths and specific_path:
+                # Make sure the path is relative to the specified subdirectory
+                if item_path.startswith(specific_path):
+                    rel_path = item_path[len(specific_path):].lstrip('/')
+                else:
+                    rel_path = item_path
+            else:
+                rel_path = item_path
+
+            if item["type"] == "file":
+                # Check if file should be included based on patterns
+                if not should_include_file(rel_path, item["name"]):
+                    print(f"Skipping {rel_path}: Does not match include/exclude patterns")
+                    continue
+
+                # Check file size if available
+                file_size = item.get("size", 0)
+                if file_size > max_file_size:
+                    skipped_files.append((item_path, file_size))
+                    print(f"Skipping {rel_path}: File size ({file_size} bytes) exceeds limit ({max_file_size} bytes)")
+                    continue
+
+                # For files, get raw content
+                if "download_url" in item and item["download_url"]:
+                    file_url = item["download_url"]
+                    file_response = requests.get(file_url, headers=headers)
+
+                    # Final size check in case content-length header is available but differs from metadata
+                    content_length = int(file_response.headers.get('content-length', 0))
+                    if content_length > max_file_size:
+                        skipped_files.append((item_path, content_length))
+                        print(f"Skipping {rel_path}: Content length ({content_length} bytes) exceeds limit ({max_file_size} bytes)")
+                        continue
+
+                    if file_response.status_code == 200:
+                        files[rel_path] = file_response.text
+                        print(f"Downloaded: {rel_path} ({file_size} bytes) ")
+                    else:
+                        print(f"Failed to download {rel_path}: {file_response.status_code}")
+                else:
+                    # Alternative method if download_url is not available
+                    content_response = requests.get(item["url"], headers=headers)
+                    if content_response.status_code == 200:
+                        content_data = content_response.json()
+                        if content_data.get("encoding") == "base64" and "content" in content_data:
+                            # Check size of base64 content before decoding
+                            if len(content_data["content"]) * 0.75 > max_file_size:  # Approximate size calculation
+                                estimated_size = int(len(content_data["content"]) * 0.75)
+                                skipped_files.append((item_path, estimated_size))
+                                print(f"Skipping {rel_path}: Encoded content exceeds size limit")
+                                continue
+
+                            file_content = base64.b64decode(content_data["content"]).decode('utf-8')
+                            files[rel_path] = file_content
+                            print(f"Downloaded: {rel_path} ({file_size} bytes)")
+                        else:
+                            print(f"Unexpected content format for {rel_path}")
+                    else:
+                        print(f"Failed to get content for {rel_path}: {content_response.status_code}")
+
+            elif item["type"] == "dir":
+                # Recursively process subdirectories
+                fetch_contents(item_path)
+
+    # Start crawling from the specified path
+    fetch_contents(specific_path)
+
+    return {
+        "files": files,
+        "stats": {
+            "downloaded_count": len(files),
+            "skipped_count": len(skipped_files),
+            "skipped_files": skipped_files,
+            "base_path": specific_path if use_relative_paths else None,
+            "include_patterns": include_patterns,
+            "exclude_patterns": exclude_patterns
+        }
+    }
+
+# Example usage
+if __name__ == "__main__":
+    # Get token from environment variable (more secure than hardcoding)
+    github_token = os.environ.get("GITHUB_TOKEN")
+
+    repo_url = "https://github.com/pydantic/pydantic/tree/6c38dc93f40a47f4d1350adca9ec0d72502e223f/pydantic"
+
+    # Example: Get Python and Markdown files, but exclude test files
+    result = crawl_github_files(
+        repo_url,
+        token=github_token,
+        max_file_size=1 * 1024 * 1024,  # 1 MB in bytes
+        use_relative_paths=True,  # Enable relative paths
+        include_patterns={"*.py", "*.md"},  # Include Python and Markdown files
+    )
+
+    files = result["files"]
+    stats = result["stats"]
+
+    print(f"\nDownloaded {stats['downloaded_count']} files.")
+    print(f"Skipped {stats['skipped_count']} files due to size limits or patterns.")
+    print(f"Base path for relative paths: {stats['base_path']}")
+    print(f"Include patterns: {stats['include_patterns']}")
+    print(f"Exclude patterns: {stats['exclude_patterns']}")
+
+    # Display all file paths in the dictionary
+    print("\nFiles in dictionary:")
+    for file_path in sorted(files.keys()):
+        print(f"  {file_path}")
+
+    # Example: accessing content of a specific file
+    if files:
+        sample_file = next(iter(files))
+        print(f"\nSample file: {sample_file}")
+        print(f"Content preview: {files[sample_file][:200]}...")
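The example above only passes include_patterns; exclude_patterns works the same way and is matched with fnmatch against the (relative) file path. A small variation, assuming the same imports and token as the example above; the exclude patterns are illustrative:

```python
# Sketch: same pinned pydantic crawl, but also skip illustrative path patterns.
result = crawl_github_files(
    "https://github.com/pydantic/pydantic/tree/6c38dc93f40a47f4d1350adca9ec0d72502e223f/pydantic",
    token=os.environ.get("GITHUB_TOKEN"),
    include_patterns={"*.py"},
    exclude_patterns={"_internal/*", "*_test.py"},  # example patterns only
    use_relative_paths=True,
)
print(f"Kept {result['stats']['downloaded_count']} files")
```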