diff --git a/README.md b/README.md index eca03b2..c24058b 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ This is a tutorial project of [Pocket Flow](https://github.com/The-Pocket/PocketFlow), a 100-line LLM framework. It crawls GitHub repositories and build a knowledge base from the code. It analyzes entire codebases to identify core abstractions and how they interact, and transforms complex code into beginner-friendly tutorials with clear visualizations. -## Example Tutorials for Popular GitHub Repositories! +## ⭐ Example Tutorials for Popular GitHub Repositories! - [AutoGen Core](https://the-pocket.github.io/Tutorial-Codebase-Knowledge/AutoGen%20Core) - Build AI teams that talk, think, and solve problems together like coworkers! diff --git a/flow.py b/flow.py index ec79f75..51813f8 100644 --- a/flow.py +++ b/flow.py @@ -1,14 +1,33 @@ from pocketflow import Flow -from nodes import GetQuestionNode, AnswerNode +# Import all node classes from nodes.py +from nodes import ( + FetchRepo, + IdentifyAbstractions, + AnalyzeRelationships, + OrderChapters, + WriteChapters, + CombineTutorial +) -def create_qa_flow(): - """Create and return a question-answering flow.""" - # Create nodes - get_question_node = GetQuestionNode() - answer_node = AnswerNode() - - # Connect nodes in sequence - get_question_node >> answer_node - - # Create flow starting with input node - return Flow(start=get_question_node) \ No newline at end of file +def create_tutorial_flow(): + """Creates and returns the codebase tutorial generation flow.""" + + # Instantiate nodes + fetch_repo = FetchRepo() + identify_abstractions = IdentifyAbstractions(max_retries=3, wait=10) + analyze_relationships = AnalyzeRelationships(max_retries=3, wait=10) + order_chapters = OrderChapters(max_retries=3, wait=10) + write_chapters = WriteChapters(max_retries=3, wait=10) # This is a BatchNode + combine_tutorial = CombineTutorial() + + # Connect nodes in sequence based on the design + fetch_repo >> identify_abstractions + identify_abstractions >> analyze_relationships + analyze_relationships >> order_chapters + order_chapters >> write_chapters + write_chapters >> combine_tutorial + + # Create the flow starting with FetchRepo + tutorial_flow = Flow(start=fetch_repo) + + return tutorial_flow \ No newline at end of file diff --git a/main.py b/main.py index 05805c5..8c536c2 100644 --- a/main.py +++ b/main.py @@ -1,16 +1,67 @@ -from flow import qa_flow +import os +import argparse +# Import the function that creates the flow +from flow import create_tutorial_flow -# Example main function -# Please replace this with your own main function +# Default file patterns +DEFAULT_INCLUDE_PATTERNS = { + "*.py", "*.js", "*.ts", "*.go", "*.java", "*.pyi", "*.pyx", + "*.c", "*.cc", "*.cpp", "*.h", "*.md", "*.rst", "Dockerfile", + "Makefile", "*.yaml", "*.yml" +} + +DEFAULT_EXCLUDE_PATTERNS = { + "*test*", "tests/*", "docs/*", "examples/*", "v1/*", + "dist/*", "build/*", "experimental/*", "deprecated/*", + "legacy/*", ".git/*", ".github/*" +} + +# --- Main Function --- def main(): + parser = argparse.ArgumentParser(description="Generate a tutorial for a GitHub codebase.") + parser.add_argument("repo_url", help="URL of the public GitHub repository.") + parser.add_argument("-n", "--name", help="Project name (optional, derived from URL if omitted).") + parser.add_argument("-t", "--token", help="GitHub personal access token (optional, reads from GITHUB_TOKEN env var if not provided).") + parser.add_argument("-o", "--output", default="output", help="Base directory 
for output (default: ./output).") + parser.add_argument("-i", "--include", nargs="+", help="Include file patterns (e.g. '*.py' '*.js'). Defaults to common code files if not specified.") + parser.add_argument("-e", "--exclude", nargs="+", help="Exclude file patterns (e.g. 'tests/*' 'docs/*'). Defaults to test/build directories if not specified.") + parser.add_argument("-s", "--max-size", type=int, default=100000, help="Maximum file size in bytes (default: 100000, about 100KB).") + + args = parser.parse_args() + + # Get GitHub token from argument or environment variable + github_token = args.token or os.environ.get('GITHUB_TOKEN') + if not github_token: + print("Warning: No GitHub token provided. You might hit rate limits for public repositories.") + + # Initialize the shared dictionary with inputs shared = { - "question": "In one sentence, what's the end of universe?", - "answer": None + "repo_url": args.repo_url, + "project_name": args.name, # Can be None, FetchRepo will derive it + "github_token": github_token, + "output_dir": args.output, # Base directory for CombineTutorial output + + # Add include/exclude patterns and max file size + "include_patterns": set(args.include) if args.include else DEFAULT_INCLUDE_PATTERNS, + "exclude_patterns": set(args.exclude) if args.exclude else DEFAULT_EXCLUDE_PATTERNS, + "max_file_size": args.max_size, + + # Outputs will be populated by the nodes + "files": [], + "abstractions": [], + "relationships": {}, + "chapter_order": [], + "chapters": [], + "final_output_dir": None } - qa_flow.run(shared) - print("Question:", shared["question"]) - print("Answer:", shared["answer"]) + print(f"Starting tutorial generation for: {args.repo_url}") + # Create the flow instance + tutorial_flow = create_tutorial_flow() + + # Run the flow + tutorial_flow.run(shared) + if __name__ == "__main__": main() \ No newline at end of file diff --git a/nodes.py b/nodes.py index 32e7f6e..2dc6df2 100644 --- a/nodes.py +++ b/nodes.py @@ -1,26 +1,642 @@ -from pocketflow import Node -from utils.call_llm import call_llm +import os +import yaml +from pocketflow import Node, BatchNode +from utils.crawl_github_files import crawl_github_files +from utils.call_llm import call_llm # Assuming you have this utility -class GetQuestionNode(Node): - def exec(self, _): - # Get question directly from user input - user_question = input("Enter your question: ") - return user_question - - def post(self, shared, prep_res, exec_res): - # Store the user's question - shared["question"] = exec_res - return "default" # Go to the next node +# Helper to create context from files, respecting limits (basic example) +def create_llm_context(files_data): + context = "" + file_info = [] # Store tuples of (index, path) + for i, (path, content) in enumerate(files_data): + entry = f"--- File Index {i}: {path} ---\n{content}\n\n" + context += entry + file_info.append((i, path)) -class AnswerNode(Node): + return context, file_info # file_info is list of (index, path) + +# Helper to get content for specific file indices +def get_content_for_indices(files_data, indices): + content_map = {} + for i in indices: + if 0 <= i < len(files_data): + path, content = files_data[i] + content_map[f"{i} # {path}"] = content # Use index + path as key for context + return content_map + +class FetchRepo(Node): def prep(self, shared): - # Read question from shared - return shared["question"] - - def exec(self, question): - # Call LLM to get the answer - return call_llm(question) - + repo_url = shared["repo_url"] + project_name = 
shared.get("project_name") + if not project_name: + # Basic name derivation from URL + project_name = repo_url.split('/')[-1].replace('.git', '') + shared["project_name"] = project_name + + # Get file patterns directly from shared (defaults are defined in main.py) + include_patterns = shared["include_patterns"] + exclude_patterns = shared["exclude_patterns"] + max_file_size = shared["max_file_size"] + + return { + "repo_url": repo_url, + "token": shared.get("github_token"), + "include_patterns": include_patterns, + "exclude_patterns": exclude_patterns, + "max_file_size": max_file_size, + "use_relative_paths": True + } + + def exec(self, prep_res): + print(f"Crawling repository: {prep_res['repo_url']}...") + result = crawl_github_files( + repo_url=prep_res["repo_url"], + token=prep_res["token"], + include_patterns=prep_res["include_patterns"], + exclude_patterns=prep_res["exclude_patterns"], + max_file_size=prep_res["max_file_size"], + use_relative_paths=prep_res["use_relative_paths"] + ) + # Convert dict to list of tuples: [(path, content), ...] + files_list = list(result.get("files", {}).items()) + print(f"Fetched {len(files_list)} files.") + return files_list + def post(self, shared, prep_res, exec_res): - # Store the answer in shared - shared["answer"] = exec_res \ No newline at end of file + shared["files"] = exec_res # List of (path, content) tuples + +class IdentifyAbstractions(Node): + def prep(self, shared): + files_data = shared["files"] + project_name = shared["project_name"] # Get project name + context, file_info = create_llm_context(files_data) + # Format file info for the prompt (comment is just a hint for LLM) + file_listing_for_prompt = "\n".join([f"- {idx} # {path}" for idx, path in file_info]) + return context, file_listing_for_prompt, len(files_data), project_name # Return project name + + def exec(self, prep_res): + context, file_listing_for_prompt, file_count, project_name = prep_res # Unpack project name + print("Identifying abstractions using LLM...") + prompt = f""" +For the project `{project_name}`: + +Codebase Context: +{context} + +Analyze the codebase context. +Identify the top 5-10 core most important abstractions to help those new to the codebase. + +For each abstraction, provide: +1. A concise `name`. +2. A beginner-friendly `description` explaining what it is with a simple analogy, in around 100 words. +3. A list of relevant `file_indices` (integers) using the format `idx # path/comment`. + +List of file indices and paths present in the context: +{file_listing_for_prompt} + +Format the output as a YAML list of dictionaries: + +```yaml +- name: Query Processing + description: | + Explains what the abstraction does. + It's like a central dispatcher routing requests. + file_indices: + - 0 # path/to/file1.py + - 3 # path/to/related.py +- name: Query Optimization + description: | + Another core concept, similar to a blueprint for objects. + file_indices: + - 5 # path/to/another.js +# ... 
up to 10 abstractions +```""" + response = call_llm(prompt) + + # --- Validation --- + yaml_str = response.strip().split("```yaml")[1].split("```")[0].strip() + abstractions = yaml.safe_load(yaml_str) + + if not isinstance(abstractions, list): + raise ValueError("LLM Output is not a list") + + validated_abstractions = [] + for item in abstractions: + if not isinstance(item, dict) or not all(k in item for k in ["name", "description", "file_indices"]): + raise ValueError(f"Missing keys in abstraction item: {item}") + if not isinstance(item["description"], str): + raise ValueError(f"description is not a string in item: {item}") + if not isinstance(item["file_indices"], list): + raise ValueError(f"file_indices is not a list in item: {item}") + + # Validate indices + validated_indices = [] + for idx_entry in item["file_indices"]: + try: + if isinstance(idx_entry, int): + idx = idx_entry + elif isinstance(idx_entry, str) and '#' in idx_entry: + idx = int(idx_entry.split('#')[0].strip()) + else: + idx = int(str(idx_entry).strip()) + + if not (0 <= idx < file_count): + raise ValueError(f"Invalid file index {idx} found in item {item['name']}. Max index is {file_count - 1}.") + validated_indices.append(idx) + except (ValueError, TypeError): + raise ValueError(f"Could not parse index from entry: {idx_entry} in item {item['name']}") + + item["files"] = sorted(list(set(validated_indices))) + # Store only the required fields + validated_abstractions.append({ + "name": item["name"], + "description": item["description"], + "files": item["files"] + }) + + print(f"Identified {len(validated_abstractions)} abstractions.") + return validated_abstractions + + def post(self, shared, prep_res, exec_res): + shared["abstractions"] = exec_res # List of {"name": str, "description": str, "files": [int]} + +class AnalyzeRelationships(Node): + def prep(self, shared): + abstractions = shared["abstractions"] # Now contains 'files' list of indices + files_data = shared["files"] + project_name = shared["project_name"] # Get project name + + # Create context with abstraction names, indices, descriptions, and relevant file snippets + context = "Identified Abstractions:\n" + all_relevant_indices = set() + abstraction_info_for_prompt = [] + for i, abstr in enumerate(abstractions): + # Use 'files' which contains indices directly + file_indices_str = ", ".join(map(str, abstr['files'])) + info_line = f"- Index {i}: {abstr['name']} (Relevant file indices: [{file_indices_str}])\n Description: {abstr['description']}" + context += info_line + "\n" + abstraction_info_for_prompt.append(f"{i} # {abstr['name']}") + all_relevant_indices.update(abstr['files']) + + context += "\nRelevant File Snippets (Referenced by Index and Path):\n" + # Get content for relevant files using helper + relevant_files_content_map = get_content_for_indices( + files_data, + sorted(list(all_relevant_indices)) + ) + # Format file content for context + file_context_str = "\n\n".join( + f"--- File: {idx_path} ---\n{content}" + for idx_path, content in relevant_files_content_map.items() + ) + context += file_context_str + + return context, "\n".join(abstraction_info_for_prompt), project_name # Return project name + + def exec(self, prep_res): + context, abstraction_listing, project_name = prep_res # Unpack project name + print("Analyzing relationships using LLM...") + prompt = f""" +Based on the following abstractions and relevant code snippets from the project `{project_name}`: + +List of Abstraction Indices and Names: +{abstraction_listing} + +Context 
(Abstractions, Descriptions, Code): +{context} + +Please provide: +1. A high-level `summary` of the project's main purpose and functionality in a few beginner-friendly sentences. Use markdown formatting with **bold** and *italic* text to highlight important concepts. +2. A list (`relationships`) describing the key interactions between these abstractions. For each relationship, specify: + - `from_abstraction`: Index of the source abstraction (e.g., `0 # AbstractionName1`) + - `to_abstraction`: Index of the target abstraction (e.g., `1 # AbstractionName2`) + - `label`: A brief label for the interaction **in just a few words** (e.g., "Manages", "Inherits", "Uses"). + Ideally the relationship should be backed by one abstraction calling or passing parameters to another. + Simplify the relationship and exclude those non-important ones. + +IMPORTANT: Make sure EVERY abstraction is involved in at least ONE relationship (either as source or target). Each abstraction index must appear at least once across all relationships. + +Format the output as YAML: + +```yaml +summary: | + A brief, simple explanation of the project. + Can span multiple lines with **bold** and *italic* for emphasis. +relationships: + - from_abstraction: 0 # AbstractionName1 + to_abstraction: 1 # AbstractionName2 + label: "Manages" + - from_abstraction: 2 # AbstractionName3 + to_abstraction: 0 # AbstractionName1 + label: "Provides config" + # ... other relationships +``` + +Now, provide the YAML output: +""" + response = call_llm(prompt) + + # --- Validation --- + yaml_str = response.strip().split("```yaml")[1].split("```")[0].strip() + relationships_data = yaml.safe_load(yaml_str) + + if not isinstance(relationships_data, dict) or not all(k in relationships_data for k in ["summary", "relationships"]): + raise ValueError("LLM output is not a dict or missing keys ('summary', 'relationships')") + if not isinstance(relationships_data["summary"], str): + raise ValueError("summary is not a string") + if not isinstance(relationships_data["relationships"], list): + raise ValueError("relationships is not a list") + + # Validate relationships structure + validated_relationships = [] + num_abstractions = len(abstraction_listing.split('\n')) + for rel in relationships_data["relationships"]: + # Check for 'label' key + if not isinstance(rel, dict) or not all(k in rel for k in ["from_abstraction", "to_abstraction", "label"]): + raise ValueError(f"Missing keys (expected from_abstraction, to_abstraction, label) in relationship item: {rel}") + # Validate 'label' is a string + if not isinstance(rel["label"], str): + raise ValueError(f"Relationship label is not a string: {rel}") + + # Validate indices + try: + from_idx = int(str(rel["from_abstraction"]).split('#')[0].strip()) + to_idx = int(str(rel["to_abstraction"]).split('#')[0].strip()) + if not (0 <= from_idx < num_abstractions and 0 <= to_idx < num_abstractions): + raise ValueError(f"Invalid index in relationship: from={from_idx}, to={to_idx}. 
Max index is {num_abstractions-1}.") + validated_relationships.append({ + "from": from_idx, + "to": to_idx, + "label": rel["label"] + }) + except (ValueError, TypeError): + raise ValueError(f"Could not parse indices from relationship: {rel}") + + print("Generated project summary and relationship details.") + return { + "summary": relationships_data["summary"], + "details": validated_relationships # Store validated, index-based relationships + } + + + def post(self, shared, prep_res, exec_res): + # Structure is now {"summary": str, "details": [{"from": int, "to": int, "label": str}]} + shared["relationships"] = exec_res + +class OrderChapters(Node): + def prep(self, shared): + abstractions = shared["abstractions"] + relationships = shared["relationships"] + project_name = shared["project_name"] # Get project name + + # Prepare context for the LLM + abstraction_info_for_prompt = [] + for i, a in enumerate(abstractions): + abstraction_info_for_prompt.append(f"- {i} # {a['name']}") + abstraction_listing = "\n".join(abstraction_info_for_prompt) + + context = f"Project Summary:\n{relationships['summary']}\n\n" + context += "Relationships (Indices refer to abstractions above):\n" + for rel in relationships['details']: + from_name = abstractions[rel['from']]['name'] + to_name = abstractions[rel['to']]['name'] + # Use 'label' instead of 'desc' + context += f"- From {rel['from']} ({from_name}) to {rel['to']} ({to_name}): {rel['label']}\n" + + return abstraction_listing, context, len(abstractions), project_name + + def exec(self, prep_res): + abstraction_listing, context, num_abstractions, project_name = prep_res + print("Determining chapter order using LLM...") + prompt = f""" +Given the following project abstractions and their relationships for the project ```` {project_name} ````: + +Abstractions (Index # Name): +{abstraction_listing} + +Context about relationships and project summary: +{context} + +If you are going to make a tutorial for ```` {project_name} ````, what is the best order to explain these abstractions, from first to last? +Ideally, first explain those that are the most important or foundational, perhaps user-facing concepts or entry points. Then move to more detailed, lower-level implementation details or supporting concepts. + +Output the ordered list of abstraction indices, including the name in a comment for clarity. Use the format `idx # AbstractionName`. + +```yaml +- 2 # FoundationalConcept +- 0 # CoreClassA +- 1 # CoreClassB (uses CoreClassA) +- ... +``` + +Now, provide the YAML output: +""" + response = call_llm(prompt) + + # --- Validation --- + # Rely on Node's built-in retry/fallback + yaml_str = response.strip().split("```yaml")[1].split("```")[0].strip() + ordered_indices_raw = yaml.safe_load(yaml_str) + + if not isinstance(ordered_indices_raw, list): + raise ValueError("LLM output is not a list") + + ordered_indices = [] + seen_indices = set() + for entry in ordered_indices_raw: + try: + if isinstance(entry, int): + idx = entry + elif isinstance(entry, str) and '#' in entry: + idx = int(entry.split('#')[0].strip()) + else: + idx = int(str(entry).strip()) + + if not (0 <= idx < num_abstractions): + raise ValueError(f"Invalid index {idx} in ordered list. 
Max index is {num_abstractions-1}.") + if idx in seen_indices: + raise ValueError(f"Duplicate index {idx} found in ordered list.") + ordered_indices.append(idx) + seen_indices.add(idx) + + except (ValueError, TypeError): + raise ValueError(f"Could not parse index from ordered list entry: {entry}") + + # Check if all abstractions are included + if len(ordered_indices) != num_abstractions: + raise ValueError(f"Ordered list length ({len(ordered_indices)}) does not match number of abstractions ({num_abstractions}). Missing indices: {set(range(num_abstractions)) - seen_indices}") + + print(f"Determined chapter order (indices): {ordered_indices}") + return ordered_indices # Return the list of indices + + def post(self, shared, prep_res, exec_res): + # exec_res is already the list of ordered indices + shared["chapter_order"] = exec_res # List of indices + +class WriteChapters(BatchNode): + def prep(self, shared): + chapter_order = shared["chapter_order"] # List of indices + abstractions = shared["abstractions"] # List of dicts, now using 'files' with indices + files_data = shared["files"] + # Get already written chapters to provide context + # We store them temporarily during the batch run, not in shared memory yet + # The 'previous_chapters_summary' will be built progressively in the exec context + self.chapters_written_so_far = [] # Use instance variable for temporary storage across exec calls + + # Create a complete list of all chapters + all_chapters = [] + chapter_filenames = {} # Store chapter filename mapping for linking + for i, abstraction_index in enumerate(chapter_order): + if 0 <= abstraction_index < len(abstractions): + chapter_num = i + 1 + chapter_name = abstractions[abstraction_index]["name"] + # Create safe filename + safe_name = "".join(c if c.isalnum() else '_' for c in chapter_name).lower() + filename = f"{i+1:02d}_{safe_name}.md" + # Format with link + all_chapters.append(f"{chapter_num}. 
[{chapter_name}]({filename})") + # Store mapping of chapter index to filename for linking + chapter_filenames[abstraction_index] = {"num": chapter_num, "name": chapter_name, "filename": filename} + + # Create a formatted string with all chapters + full_chapter_listing = "\n".join(all_chapters) + + items_to_process = [] + for i, abstraction_index in enumerate(chapter_order): + if 0 <= abstraction_index < len(abstractions): + abstraction_details = abstractions[abstraction_index] + # Use 'files' (list of indices) directly + related_file_indices = abstraction_details.get("files", []) + # Get content using helper, passing indices + related_files_content_map = get_content_for_indices(files_data, related_file_indices) + + # Get previous chapter info for transitions + prev_chapter = None + if i > 0: + prev_idx = chapter_order[i-1] + prev_chapter = chapter_filenames[prev_idx] + + # Get next chapter info for transitions + next_chapter = None + if i < len(chapter_order) - 1: + next_idx = chapter_order[i+1] + next_chapter = chapter_filenames[next_idx] + + items_to_process.append({ + "chapter_num": i + 1, + "abstraction_index": abstraction_index, + "abstraction_details": abstraction_details, + "related_files_content_map": related_files_content_map, + "project_name": shared["project_name"], # Add project name + "full_chapter_listing": full_chapter_listing, # Add the full chapter listing + "chapter_filenames": chapter_filenames, # Add chapter filenames mapping + "prev_chapter": prev_chapter, # Add previous chapter info + "next_chapter": next_chapter, # Add next chapter info + # previous_chapters_summary will be added dynamically in exec + }) + else: + print(f"Warning: Invalid abstraction index {abstraction_index} in chapter_order. Skipping.") + + print(f"Preparing to write {len(items_to_process)} chapters...") + return items_to_process # Iterable for BatchNode + + def exec(self, item): + # This runs for each item prepared above + abstraction_name = item["abstraction_details"]["name"] + chapter_num = item["chapter_num"] + project_name = item.get("project_name") # Get from item + print(f"Writing chapter {chapter_num} for: {abstraction_name} using LLM...") + + # Prepare file context string from the map + file_context_str = "\n\n".join( + f"--- File: {idx_path.split('# ')[1] if '# ' in idx_path else idx_path} ---\n{content}" + for idx_path, content in item["related_files_content_map"].items() + ) + + # Get summary of chapters written *before* this one + # Use the temporary instance variable + previous_chapters_summary = "\n---\n".join(self.chapters_written_so_far) + + + prompt = f""" +Write a very beginner-friendly tutorial chapter (in Markdown format) for the project `{project_name}` about the concept: "{abstraction_name}". This is Chapter {chapter_num}. + +Concept Details: +- Description: +{item["abstraction_details"]["description"]} + +Complete Tutorial Structure: +{item["full_chapter_listing"]} + +Context from previous chapters (summary): +{previous_chapters_summary if previous_chapters_summary else "This is the first chapter."} + +Relevant Code Snippets: +{file_context_str if file_context_str else "No specific code snippets provided for this abstraction."} + +Instructions for the chapter: +- Start with a clear heading (e.g., `# Chapter {chapter_num}: {abstraction_name}`). + +- If this is not the first chapter, begin with a brief transition from the previous chapter, referencing it with a proper Markdown link. + +- Begin with a high-level motivation explaining what problem this abstraction solves. 
Start with a central use case as a concrete example. The whole chapter should guide the reader to understand how to solve this use case. Make it very minimal and friendly to beginners.
+
+- If the abstraction is complex, break it down into key concepts. Explain each concept one-by-one in a very beginner-friendly way.
+
+- Explain how to use this abstraction to solve the use case. Give example inputs and outputs for code snippets (if the output isn't values, describe at a high level what will happen).
+
+- Each code block should be BELOW 20 lines! If longer code blocks are needed, break them down into smaller pieces and walk through them one-by-one. Aggressively simplify the code to make it minimal. Use comments to skip non-important implementation details. Each code block should have a beginner-friendly explanation right after it.
+
+- Describe the internal implementation to help understand what's under the hood. First provide a non-code or code-light walkthrough on what happens step-by-step when the abstraction is called. It's recommended to use a simple sequenceDiagram with a dummy example - keep it minimal with at most 5 participants to ensure clarity. If participant name has space, use:
+`participant QP as Query Processing`
+
+- Then dive deeper into code for the internal implementation with references to files. Provide example code blocks, but make them similarly simple and beginner-friendly.
+
+- IMPORTANT: When you need to refer to other core abstractions covered in other chapters, ALWAYS use proper Markdown links like this: [Chapter Title](filename.md). Use the Complete Tutorial Structure above to find the correct filename. Example: "we will talk about [Query Processing](03_query_processing.md) in Chapter 3".
+
+- Use mermaid diagrams to illustrate complex concepts (```mermaid``` format).
+
+- Heavily use analogies and examples throughout to help beginners understand.
+
+- End the chapter with a brief conclusion that summarizes what was learned and provides a transition to the next chapter. If there is a next chapter, use a proper Markdown link: [Next Chapter Title](next_chapter_filename).
+
+- Ensure the tone is welcoming and easy for a newcomer to understand.
+
+- Output *only* the Markdown content for this chapter.
+ +Now, directly provide a super beginner-friendly Markdown output (DON'T need ```markdown``` tags): +""" + chapter_content = call_llm(prompt) + # Basic validation/cleanup + actual_heading = f"# Chapter {chapter_num}: {abstraction_name}" + if not chapter_content.strip().startswith(f"# Chapter {chapter_num}"): + # Add heading if missing or incorrect, trying to preserve content + lines = chapter_content.strip().split('\n') + if lines and lines[0].strip().startswith("#"): # If there's some heading, replace it + lines[0] = actual_heading + chapter_content = "\n".join(lines) + else: # Otherwise, prepend it + chapter_content = f"{actual_heading}\n\n{chapter_content}" + + # Add the generated content to our temporary list for the next iteration's context + self.chapters_written_so_far.append(chapter_content) + + return chapter_content # Return the Markdown string + + def post(self, shared, prep_res, exec_res_list): + # exec_res_list contains the generated Markdown for each chapter, in order + shared["chapters"] = exec_res_list + # Clean up the temporary instance variable + del self.chapters_written_so_far + print(f"Finished writing {len(exec_res_list)} chapters.") + +class CombineTutorial(Node): + def prep(self, shared): + project_name = shared["project_name"] + output_base_dir = shared.get("output_dir", "output") # Default output dir + output_path = os.path.join(output_base_dir, project_name) + repo_url = shared["repo_url"] # Get the repository URL + + # Use 'label' from relationships_data['details'] + relationships_data = shared["relationships"] # {"summary": str, "details": [{"from": int, "to": int, "label": str}]} + chapter_order = shared["chapter_order"] # indices + abstractions = shared["abstractions"] # list of dicts + chapters_content = shared["chapters"] # list of strings + + # --- Generate Mermaid Diagram --- + mermaid_lines = ["flowchart TD"] + # Add nodes for each abstraction + for i, abstr in enumerate(abstractions): + # Sanitize name for Mermaid ID and label + node_id = f"A{i}" + sanitized_name = abstr['name'].replace('"', '') + node_label = sanitized_name # Using sanitized name only, no index + mermaid_lines.append(f' {node_id}["{node_label}"]') + # Add edges for relationships using 'label' + for rel in relationships_data['details']: + from_node_id = f"A{rel['from']}" + to_node_id = f"A{rel['to']}" + # Sanitize 'label' for edge label + edge_label = rel['label'].replace('"', '').replace('\n', ' ') # Basic sanitization + # Limit edge label length for readability (optional, but good for diagrams) + max_label_len = 30 # Make it shorter for labels + if len(edge_label) > max_label_len: + edge_label = edge_label[:max_label_len-3] + "..." 
+ mermaid_lines.append(f' {from_node_id} -- "{edge_label}" --> {to_node_id}') + + mermaid_diagram = "\n".join(mermaid_lines) + # --- End Mermaid --- + + + # Prepare index.md content + index_content = f"# Tutorial: {project_name}\n\n" + index_content += f"{relationships_data['summary']}\n\n" + index_content += f"**Source Repository:** [{repo_url}]({repo_url})\n\n" + + # Add Mermaid diagram for relationships + index_content += "```mermaid\n" + index_content += mermaid_diagram + "\n" + index_content += "```\n\n" + + index_content += "## Chapters\n\n" + + chapter_files = [] + # Generate chapter links based on the determined order + for i, abstraction_index in enumerate(chapter_order): + # Ensure index is valid and we have content for it + if 0 <= abstraction_index < len(abstractions) and i < len(chapters_content): + abstraction_name = abstractions[abstraction_index]["name"] + # Sanitize name for filename + safe_name = "".join(c if c.isalnum() else '_' for c in abstraction_name).lower() + # Use chapter number (i+1) for ordering filename + filename = f"{i+1:02d}_{safe_name}.md" + index_content += f"{i+1}. [{abstraction_name}]({filename})\n" + + # Add attribution to chapter content + chapter_content = chapters_content[i] + if not chapter_content.endswith("\n\n"): + chapter_content += "\n\n" + chapter_content += "---\n\nGenerated by [AI Codebase Knowledge Builder](https://github.com/The-Pocket/Tutorial-Codebase-Knowledge)" + + # Store filename and corresponding content + chapter_files.append({"filename": filename, "content": chapter_content}) + else: + print(f"Warning: Mismatch between chapter order, abstractions, or content at index {i} (abstraction index {abstraction_index}). Skipping file generation for this entry.") + + # Add attribution to index content + index_content += "\n\n---\n\nGenerated by [AI Codebase Knowledge Builder](https://github.com/The-Pocket/Tutorial-Codebase-Knowledge)" + + return { + "output_path": output_path, + "index_content": index_content, + "chapter_files": chapter_files # List of {"filename": str, "content": str} + } + + def exec(self, prep_res): + output_path = prep_res["output_path"] + index_content = prep_res["index_content"] + chapter_files = prep_res["chapter_files"] + + print(f"Combining tutorial into directory: {output_path}") + # Rely on Node's built-in retry/fallback + os.makedirs(output_path, exist_ok=True) + + # Write index.md + index_filepath = os.path.join(output_path, "index.md") + with open(index_filepath, "w", encoding="utf-8") as f: + f.write(index_content) + print(f" - Wrote {index_filepath}") + + # Write chapter files + for chapter_info in chapter_files: + chapter_filepath = os.path.join(output_path, chapter_info["filename"]) + with open(chapter_filepath, "w", encoding="utf-8") as f: + f.write(chapter_info["content"]) + print(f" - Wrote {chapter_filepath}") + + return output_path # Return the final path + + + def post(self, shared, prep_res, exec_res): + shared["final_output_dir"] = exec_res # Store the output path + print(f"\nTutorial generation complete! 
Files are in: {exec_res}") \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index fcb64c3..7351fef 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,4 @@ -pocketflow>=0.0.1 \ No newline at end of file +pocketflow>=0.0.1 +pyyaml>=6.0 +requests>=2.28.0 +google-cloud-aiplatform>=1.25.0 \ No newline at end of file diff --git a/utils/call_llm.py b/utils/call_llm.py index c6f68b2..07b0797 100644 --- a/utils/call_llm.py +++ b/utils/call_llm.py @@ -1,14 +1,87 @@ -from openai import OpenAI +from google import genai +import os +import logging +import json +from datetime import datetime -# Learn more about calling the LLM: https://the-pocket.github.io/PocketFlow/utility_function/llm.html -def call_llm(prompt): - client = OpenAI(api_key="YOUR_API_KEY_HERE") - r = client.chat.completions.create( - model="gpt-4o", - messages=[{"role": "user", "content": prompt}] - ) - return r.choices[0].message.content +# Configure logging +log_directory = os.getenv("LOG_DIR", "logs") +os.makedirs(log_directory, exist_ok=True) +log_file = os.path.join(log_directory, f"llm_calls_{datetime.now().strftime('%Y%m%d')}.log") + +# Set up logger +logger = logging.getLogger("llm_logger") +logger.setLevel(logging.INFO) +logger.propagate = False # Prevent propagation to root logger +file_handler = logging.FileHandler(log_file) +file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')) +logger.addHandler(file_handler) + +# Simple cache configuration +cache_file = "llm_cache.json" + +def call_llm(prompt: str, use_cache: bool = True) -> str: + # Log the prompt + logger.info(f"PROMPT: {prompt}") + # Check cache if enabled + if use_cache: + # Load cache from disk + cache = {} + if os.path.exists(cache_file): + try: + with open(cache_file, 'r') as f: + cache = json.load(f) + except: + logger.warning(f"Failed to load cache, starting with empty cache") + + # Return from cache if exists + if prompt in cache: + logger.info(f"RESPONSE: {cache[prompt]}") + return cache[prompt] + + # Call the LLM if not in cache or cache disabled + client = genai.Client( + vertexai=True, + project=os.getenv("GEMINI_PROJECT_ID", "your-project-id"), + location=os.getenv("GEMINI_LOCATION", "us-central1") + ) + model = os.getenv("GEMINI_MODEL", "gemini-2.5-pro-exp-03-25") + response = client.models.generate_content( + model=model, + contents=[prompt] + ) + response_text = response.text + + # Log the response + logger.info(f"RESPONSE: {response_text}") + + # Update cache if enabled + if use_cache: + # Load cache again to avoid overwrites + cache = {} + if os.path.exists(cache_file): + try: + with open(cache_file, 'r') as f: + cache = json.load(f) + except: + pass + + # Add to cache and save + cache[prompt] = response_text + try: + with open(cache_file, 'w') as f: + json.dump(cache, f) + except Exception as e: + logger.error(f"Failed to save cache: {e}") + + return response_text + if __name__ == "__main__": - prompt = "What is the meaning of life?" - print(call_llm(prompt)) + test_prompt = "Hello, how are you?" 
+ + # First call - should hit the API + print("Making call...") + response1 = call_llm(test_prompt, use_cache=False) + print(f"Response: {response1}") + \ No newline at end of file diff --git a/utils/crawl_github_files.py b/utils/crawl_github_files.py new file mode 100644 index 0000000..7e25b29 --- /dev/null +++ b/utils/crawl_github_files.py @@ -0,0 +1,236 @@ +import requests +import base64 +import os +import time +import fnmatch +from typing import Union, Set, List, Dict, Tuple, Any +from urllib.parse import urlparse + +def crawl_github_files( + repo_url, + token=None, + max_file_size: int = 1 * 1024 * 1024, # 1 MB + use_relative_paths: bool = False, + include_patterns: Union[str, Set[str]] = None, + exclude_patterns: Union[str, Set[str]] = None +): + """ + Crawl files from a specific path in a GitHub repository at a specific commit. + + Args: + repo_url (str): URL of the GitHub repository with specific path and commit + (e.g., 'https://github.com/microsoft/autogen/tree/e45a15766746d95f8cfaaa705b0371267bec812e/python/packages/autogen-core/src/autogen_core') + token (str, optional): GitHub personal access token. Required for private repositories and recommended for public repos to avoid rate limits. + max_file_size (int, optional): Maximum file size in bytes to download (default: 1 MB) + use_relative_paths (bool, optional): If True, file paths will be relative to the specified subdirectory + include_patterns (str or set of str, optional): Pattern or set of patterns specifying which files to include (e.g., "*.py", {"*.md", "*.txt"}). + If None, all files are included. + exclude_patterns (str or set of str, optional): Pattern or set of patterns specifying which files to exclude. + If None, no files are excluded. + + Returns: + dict: Dictionary with files and statistics + """ + # Convert single pattern to set + if include_patterns and isinstance(include_patterns, str): + include_patterns = {include_patterns} + if exclude_patterns and isinstance(exclude_patterns, str): + exclude_patterns = {exclude_patterns} + + # Parse GitHub URL to extract owner, repo, commit/branch, and path + parsed_url = urlparse(repo_url) + path_parts = parsed_url.path.strip('/').split('/') + + if len(path_parts) < 2: + raise ValueError(f"Invalid GitHub URL: {repo_url}") + + # Extract the basic components + owner = path_parts[0] + repo = path_parts[1] + + # Check if URL contains a specific branch/commit + if 'tree' in path_parts: + tree_index = path_parts.index('tree') + ref = path_parts[tree_index + 1] + # Combine all parts after the ref as the path + path_start = tree_index + 2 + specific_path = '/'.join(path_parts[path_start:]) if path_start < len(path_parts) else "" + else: + ref = "main" # Default branch + specific_path = "" + + # Setup for GitHub API + headers = {"Accept": "application/vnd.github.v3+json"} + if token: + headers["Authorization"] = f"token {token}" + + # Dictionary to store path -> content mapping + files = {} + skipped_files = [] + + def should_include_file(file_path: str, file_name: str) -> bool: + """Determine if a file should be included based on patterns""" + # If no include patterns are specified, include all files + if not include_patterns: + include_file = True + else: + # Check if file matches any include pattern + include_file = any(fnmatch.fnmatch(file_name, pattern) for pattern in include_patterns) + + # If exclude patterns are specified, check if file should be excluded + if exclude_patterns and include_file: + # Exclude if file matches any exclude pattern + exclude_file = 
any(fnmatch.fnmatch(file_path, pattern) for pattern in exclude_patterns) + return not exclude_file + + return include_file + + def fetch_contents(path): + """Fetch contents of the repository at a specific path and commit""" + url = f"https://api.github.com/repos/{owner}/{repo}/contents/{path}" + params = {"ref": ref} + + response = requests.get(url, headers=headers, params=params) + + if response.status_code == 403 and 'rate limit exceeded' in response.text.lower(): + reset_time = int(response.headers.get('X-RateLimit-Reset', 0)) + wait_time = max(reset_time - time.time(), 0) + 1 + print(f"Rate limit exceeded. Waiting for {wait_time:.0f} seconds...") + time.sleep(wait_time) + return fetch_contents(path) + + if response.status_code == 404: + if not token: + print(f"Error 404: Repository not found or is private. If this is a private repository, you need to provide a token.") + else: + print(f"Error 404: Path '{path}' not found in repository or insufficient permissions.") + return + + if response.status_code != 200: + print(f"Error fetching {path}: {response.status_code} - {response.text}") + return + + contents = response.json() + + # Handle both single file and directory responses + if not isinstance(contents, list): + contents = [contents] + + for item in contents: + item_path = item["path"] + + # Calculate relative path if requested + if use_relative_paths and specific_path: + # Make sure the path is relative to the specified subdirectory + if item_path.startswith(specific_path): + rel_path = item_path[len(specific_path):].lstrip('/') + else: + rel_path = item_path + else: + rel_path = item_path + + if item["type"] == "file": + # Check if file should be included based on patterns + if not should_include_file(rel_path, item["name"]): + print(f"Skipping {rel_path}: Does not match include/exclude patterns") + continue + + # Check file size if available + file_size = item.get("size", 0) + if file_size > max_file_size: + skipped_files.append((item_path, file_size)) + print(f"Skipping {rel_path}: File size ({file_size} bytes) exceeds limit ({max_file_size} bytes)") + continue + + # For files, get raw content + if "download_url" in item and item["download_url"]: + file_url = item["download_url"] + file_response = requests.get(file_url, headers=headers) + + # Final size check in case content-length header is available but differs from metadata + content_length = int(file_response.headers.get('content-length', 0)) + if content_length > max_file_size: + skipped_files.append((item_path, content_length)) + print(f"Skipping {rel_path}: Content length ({content_length} bytes) exceeds limit ({max_file_size} bytes)") + continue + + if file_response.status_code == 200: + files[rel_path] = file_response.text + print(f"Downloaded: {rel_path} ({file_size} bytes) ") + else: + print(f"Failed to download {rel_path}: {file_response.status_code}") + else: + # Alternative method if download_url is not available + content_response = requests.get(item["url"], headers=headers) + if content_response.status_code == 200: + content_data = content_response.json() + if content_data.get("encoding") == "base64" and "content" in content_data: + # Check size of base64 content before decoding + if len(content_data["content"]) * 0.75 > max_file_size: # Approximate size calculation + estimated_size = int(len(content_data["content"]) * 0.75) + skipped_files.append((item_path, estimated_size)) + print(f"Skipping {rel_path}: Encoded content exceeds size limit") + continue + + file_content = 
base64.b64decode(content_data["content"]).decode('utf-8') + files[rel_path] = file_content + print(f"Downloaded: {rel_path} ({file_size} bytes)") + else: + print(f"Unexpected content format for {rel_path}") + else: + print(f"Failed to get content for {rel_path}: {content_response.status_code}") + + elif item["type"] == "dir": + # Recursively process subdirectories + fetch_contents(item_path) + + # Start crawling from the specified path + fetch_contents(specific_path) + + return { + "files": files, + "stats": { + "downloaded_count": len(files), + "skipped_count": len(skipped_files), + "skipped_files": skipped_files, + "base_path": specific_path if use_relative_paths else None, + "include_patterns": include_patterns, + "exclude_patterns": exclude_patterns + } + } + +# Example usage +if __name__ == "__main__": + # Get token from environment variable (more secure than hardcoding) + github_token = os.environ.get("GITHUB_TOKEN") + + repo_url = "https://github.com/pydantic/pydantic/tree/6c38dc93f40a47f4d1350adca9ec0d72502e223f/pydantic" + + # Example: Get Python and Markdown files, but exclude test files + result = crawl_github_files( + repo_url, + token=github_token, + max_file_size=1 * 1024 * 1024, # 1 MB in bytes + use_relative_paths=True, # Enable relative paths + include_patterns={"*.py", "*.md"}, # Include Python and Markdown files + ) + + files = result["files"] + stats = result["stats"] + + print(f"\nDownloaded {stats['downloaded_count']} files.") + print(f"Skipped {stats['skipped_count']} files due to size limits or patterns.") + print(f"Base path for relative paths: {stats['base_path']}") + print(f"Include patterns: {stats['include_patterns']}") + print(f"Exclude patterns: {stats['exclude_patterns']}") + + # Display all file paths in the dictionary + print("\nFiles in dictionary:") + for file_path in sorted(files.keys()): + print(f" {file_path}") + + # Example: accessing content of a specific file + if files: + sample_file = next(iter(files)) + print(f"\nSample file: {sample_file}") + print(f"Content preview: {files[sample_file][:200]}...") \ No newline at end of file
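For reference, the CLI in `main.py` is a thin wrapper that builds the `shared` dictionary and runs the flow. Below is a minimal, illustrative sketch of the equivalent programmatic call; it is not part of the patch and assumes the packages in `requirements.txt` are installed, that `GITHUB_TOKEN` (and `GEMINI_PROJECT_ID` for `call_llm`) are set in the environment, and that the default patterns defined in `main.py` are acceptable.

```python
# Illustrative sketch: run the tutorial flow without the argparse CLI.
# Assumes this repository's modules are importable and the env vars above are set.
import os

from flow import create_tutorial_flow
from main import DEFAULT_INCLUDE_PATTERNS, DEFAULT_EXCLUDE_PATTERNS

shared = {
    "repo_url": "https://github.com/The-Pocket/PocketFlow",  # any public repo URL
    "project_name": None,              # FetchRepo derives it from the URL when None
    "github_token": os.environ.get("GITHUB_TOKEN"),
    "output_dir": "output",            # CombineTutorial writes to output/<project_name>/
    "include_patterns": DEFAULT_INCLUDE_PATTERNS,
    "exclude_patterns": DEFAULT_EXCLUDE_PATTERNS,
    "max_file_size": 100000,           # bytes, same default as the CLI's --max-size
    # Populated by the nodes as the flow runs:
    "files": [], "abstractions": [], "relationships": {},
    "chapter_order": [], "chapters": [], "final_output_dir": None,
}

create_tutorial_flow().run(shared)
print("Tutorial written to:", shared["final_output_dir"])
```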