mirror of https://github.com/aljazceru/Tutorial-Codebase-Knowledge.git
synced 2025-12-18 15:04:20 +01:00

Commit: update readme examples

README.md
@@ -16,7 +16,7 @@
 
 This is a tutorial project of [Pocket Flow](https://github.com/The-Pocket/PocketFlow), a 100-line LLM framework. It crawls GitHub repositories and builds a knowledge base from the code. It analyzes entire codebases to identify core abstractions and how they interact, and transforms complex code into beginner-friendly tutorials with clear visualizations.
 
-## Example Tutorials for Popular GitHub Repositories!
+## ⭐ Example Tutorials for Popular GitHub Repositories!
 
 - [AutoGen Core](https://the-pocket.github.io/Tutorial-Codebase-Knowledge/AutoGen%20Core) - Build AI teams that talk, think, and solve problems together like coworkers!
 
flow.py (43 changes)
@@ -1,14 +1,33 @@
 from pocketflow import Flow
-from nodes import GetQuestionNode, AnswerNode
+# Import all node classes from nodes.py
+from nodes import (
+    FetchRepo,
+    IdentifyAbstractions,
+    AnalyzeRelationships,
+    OrderChapters,
+    WriteChapters,
+    CombineTutorial
+)
 
-def create_qa_flow():
-    """Create and return a question-answering flow."""
-    # Create nodes
-    get_question_node = GetQuestionNode()
-    answer_node = AnswerNode()
-
-    # Connect nodes in sequence
-    get_question_node >> answer_node
-
-    # Create flow starting with input node
-    return Flow(start=get_question_node)
+def create_tutorial_flow():
+    """Creates and returns the codebase tutorial generation flow."""
+
+    # Instantiate nodes
+    fetch_repo = FetchRepo()
+    identify_abstractions = IdentifyAbstractions(max_retries=3, wait=10)
+    analyze_relationships = AnalyzeRelationships(max_retries=3, wait=10)
+    order_chapters = OrderChapters(max_retries=3, wait=10)
+    write_chapters = WriteChapters(max_retries=3, wait=10) # This is a BatchNode
+    combine_tutorial = CombineTutorial()
+
+    # Connect nodes in sequence based on the design
+    fetch_repo >> identify_abstractions
+    identify_abstractions >> analyze_relationships
+    analyze_relationships >> order_chapters
+    order_chapters >> write_chapters
+    write_chapters >> combine_tutorial
+
+    # Create the flow starting with FetchRepo
+    tutorial_flow = Flow(start=fetch_repo)
+
+    return tutorial_flow
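Before the larger files below, here is a minimal sketch of the prep/exec/post contract these nodes follow, inferred from how the nodes in this commit use it. It is illustrative only and not part of the commit; the actual PocketFlow base classes may differ in detail.

```python
from pocketflow import Node, Flow

class GreetNode(Node):
    def prep(self, shared):
        # prep: read what you need from the shared store
        return shared["name"]

    def exec(self, prep_res):
        # exec: do the work using only prep's return value
        return f"Hello, {prep_res}!"

    def post(self, shared, prep_res, exec_res):
        # post: write results back to the shared store
        shared["greeting"] = exec_res

shared = {"name": "world"}
Flow(start=GreetNode()).run(shared)
print(shared["greeting"])  # Hello, world!
```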
main.py (67 changes)
@@ -1,16 +1,67 @@
-from flow import qa_flow
+import os
+import argparse
+# Import the function that creates the flow
+from flow import create_tutorial_flow
 
-# Example main function
-# Please replace this with your own main function
+# Default file patterns
+DEFAULT_INCLUDE_PATTERNS = {
+    "*.py", "*.js", "*.ts", "*.go", "*.java", "*.pyi", "*.pyx",
+    "*.c", "*.cc", "*.cpp", "*.h", "*.md", "*.rst", "Dockerfile",
+    "Makefile", "*.yaml", "*.yml"
+}
+
+DEFAULT_EXCLUDE_PATTERNS = {
+    "*test*", "tests/*", "docs/*", "examples/*", "v1/*",
+    "dist/*", "build/*", "experimental/*", "deprecated/*",
+    "legacy/*", ".git/*", ".github/*"
+}
+
+# --- Main Function ---
 def main():
+    parser = argparse.ArgumentParser(description="Generate a tutorial for a GitHub codebase.")
+    parser.add_argument("repo_url", help="URL of the public GitHub repository.")
+    parser.add_argument("-n", "--name", help="Project name (optional, derived from URL if omitted).")
+    parser.add_argument("-t", "--token", help="GitHub personal access token (optional, reads from GITHUB_TOKEN env var if not provided).")
+    parser.add_argument("-o", "--output", default="output", help="Base directory for output (default: ./output).")
+    parser.add_argument("-i", "--include", nargs="+", help="Include file patterns (e.g. '*.py' '*.js'). Defaults to common code files if not specified.")
+    parser.add_argument("-e", "--exclude", nargs="+", help="Exclude file patterns (e.g. 'tests/*' 'docs/*'). Defaults to test/build directories if not specified.")
+    parser.add_argument("-s", "--max-size", type=int, default=100000, help="Maximum file size in bytes (default: 100000, about 100KB).")
+
+    args = parser.parse_args()
+
+    # Get GitHub token from argument or environment variable
+    github_token = args.token or os.environ.get('GITHUB_TOKEN')
+    if not github_token:
+        print("Warning: No GitHub token provided. You might hit rate limits for public repositories.")
+
+    # Initialize the shared dictionary with inputs
     shared = {
-        "question": "In one sentence, what's the end of universe?",
-        "answer": None
+        "repo_url": args.repo_url,
+        "project_name": args.name, # Can be None, FetchRepo will derive it
+        "github_token": github_token,
+        "output_dir": args.output, # Base directory for CombineTutorial output
+
+        # Add include/exclude patterns and max file size
+        "include_patterns": set(args.include) if args.include else DEFAULT_INCLUDE_PATTERNS,
+        "exclude_patterns": set(args.exclude) if args.exclude else DEFAULT_EXCLUDE_PATTERNS,
+        "max_file_size": args.max_size,
+
+        # Outputs will be populated by the nodes
+        "files": [],
+        "abstractions": [],
+        "relationships": {},
+        "chapter_order": [],
+        "chapters": [],
+        "final_output_dir": None
     }
 
-    qa_flow.run(shared)
-    print("Question:", shared["question"])
-    print("Answer:", shared["answer"])
+    print(f"Starting tutorial generation for: {args.repo_url}")
+
+    # Create the flow instance
+    tutorial_flow = create_tutorial_flow()
+
+    # Run the flow
+    tutorial_flow.run(shared)
+
 if __name__ == "__main__":
     main()
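For reference, the same pipeline can also be driven without the CLI. A minimal sketch that reuses the shared-dict keys set up in main.py above; the repository URL is a placeholder and the token is optional:

```python
import os
from flow import create_tutorial_flow
from main import DEFAULT_INCLUDE_PATTERNS, DEFAULT_EXCLUDE_PATTERNS

shared = {
    "repo_url": "https://github.com/The-Pocket/PocketFlow",  # placeholder target repo
    "project_name": None,                      # FetchRepo derives it from the URL
    "github_token": os.environ.get("GITHUB_TOKEN"),
    "output_dir": "output",
    "include_patterns": DEFAULT_INCLUDE_PATTERNS,
    "exclude_patterns": DEFAULT_EXCLUDE_PATTERNS,
    "max_file_size": 100000,
    # Outputs, filled in by the nodes as the flow runs
    "files": [], "abstractions": [], "relationships": {},
    "chapter_order": [], "chapters": [], "final_output_dir": None,
}

create_tutorial_flow().run(shared)
print("Tutorial written to:", shared["final_output_dir"])
```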
nodes.py (660 changes)
@@ -1,26 +1,642 @@
-from pocketflow import Node
-from utils.call_llm import call_llm
+import os
+import yaml
+from pocketflow import Node, BatchNode
+from utils.crawl_github_files import crawl_github_files
+from utils.call_llm import call_llm # Assuming you have this utility
 
-class GetQuestionNode(Node):
-    def exec(self, _):
-        # Get question directly from user input
-        user_question = input("Enter your question: ")
-        return user_question
-
-    def post(self, shared, prep_res, exec_res):
-        # Store the user's question
-        shared["question"] = exec_res
-        return "default" # Go to the next node
+# Helper to create context from files, respecting limits (basic example)
+def create_llm_context(files_data):
+    context = ""
+    file_info = [] # Store tuples of (index, path)
+    for i, (path, content) in enumerate(files_data):
+        entry = f"--- File Index {i}: {path} ---\n{content}\n\n"
+        context += entry
+        file_info.append((i, path))
 
-class AnswerNode(Node):
+    return context, file_info # file_info is list of (index, path)
+
+# Helper to get content for specific file indices
+def get_content_for_indices(files_data, indices):
+    content_map = {}
+    for i in indices:
+        if 0 <= i < len(files_data):
+            path, content = files_data[i]
+            content_map[f"{i} # {path}"] = content # Use index + path as key for context
+    return content_map
+
+class FetchRepo(Node):
     def prep(self, shared):
-        # Read question from shared
-        return shared["question"]
-
-    def exec(self, question):
-        # Call LLM to get the answer
-        return call_llm(question)
-
+        repo_url = shared["repo_url"]
+        project_name = shared.get("project_name")
+        if not project_name:
+            # Basic name derivation from URL
+            project_name = repo_url.split('/')[-1].replace('.git', '')
+            shared["project_name"] = project_name
+
+        # Get file patterns directly from shared (defaults are defined in main.py)
+        include_patterns = shared["include_patterns"]
+        exclude_patterns = shared["exclude_patterns"]
+        max_file_size = shared["max_file_size"]
+
+        return {
+            "repo_url": repo_url,
+            "token": shared.get("github_token"),
+            "include_patterns": include_patterns,
+            "exclude_patterns": exclude_patterns,
+            "max_file_size": max_file_size,
+            "use_relative_paths": True
+        }
+
+    def exec(self, prep_res):
+        print(f"Crawling repository: {prep_res['repo_url']}...")
+        result = crawl_github_files(
+            repo_url=prep_res["repo_url"],
+            token=prep_res["token"],
+            include_patterns=prep_res["include_patterns"],
+            exclude_patterns=prep_res["exclude_patterns"],
+            max_file_size=prep_res["max_file_size"],
+            use_relative_paths=prep_res["use_relative_paths"]
+        )
+        # Convert dict to list of tuples: [(path, content), ...]
+        files_list = list(result.get("files", {}).items())
+        print(f"Fetched {len(files_list)} files.")
+        return files_list
 
     def post(self, shared, prep_res, exec_res):
-        # Store the answer in shared
-        shared["answer"] = exec_res
+        shared["files"] = exec_res # List of (path, content) tuples
+
+class IdentifyAbstractions(Node):
+    def prep(self, shared):
+        files_data = shared["files"]
+        project_name = shared["project_name"] # Get project name
+        context, file_info = create_llm_context(files_data)
+        # Format file info for the prompt (comment is just a hint for LLM)
+        file_listing_for_prompt = "\n".join([f"- {idx} # {path}" for idx, path in file_info])
+        return context, file_listing_for_prompt, len(files_data), project_name # Return project name
+
+    def exec(self, prep_res):
+        context, file_listing_for_prompt, file_count, project_name = prep_res # Unpack project name
+        print("Identifying abstractions using LLM...")
+        prompt = f"""
+For the project `{project_name}`:
+
+Codebase Context:
+{context}
+
+Analyze the codebase context.
+Identify the top 5-10 core most important abstractions to help those new to the codebase.
+
+For each abstraction, provide:
+1. A concise `name`.
+2. A beginner-friendly `description` explaining what it is with a simple analogy, in around 100 words.
+3. A list of relevant `file_indices` (integers) using the format `idx # path/comment`.
+
+List of file indices and paths present in the context:
+{file_listing_for_prompt}
+
+Format the output as a YAML list of dictionaries:
+
+```yaml
+- name: Query Processing
+  description: |
+    Explains what the abstraction does.
+    It's like a central dispatcher routing requests.
+  file_indices:
+    - 0 # path/to/file1.py
+    - 3 # path/to/related.py
+- name: Query Optimization
+  description: |
+    Another core concept, similar to a blueprint for objects.
+  file_indices:
+    - 5 # path/to/another.js
+# ... up to 10 abstractions
+```"""
+        response = call_llm(prompt)
+
+        # --- Validation ---
+        yaml_str = response.strip().split("```yaml")[1].split("```")[0].strip()
+        abstractions = yaml.safe_load(yaml_str)
+
+        if not isinstance(abstractions, list):
+            raise ValueError("LLM Output is not a list")
+
+        validated_abstractions = []
+        for item in abstractions:
+            if not isinstance(item, dict) or not all(k in item for k in ["name", "description", "file_indices"]):
+                raise ValueError(f"Missing keys in abstraction item: {item}")
+            if not isinstance(item["description"], str):
+                raise ValueError(f"description is not a string in item: {item}")
+            if not isinstance(item["file_indices"], list):
+                raise ValueError(f"file_indices is not a list in item: {item}")
+
+            # Validate indices
+            validated_indices = []
+            for idx_entry in item["file_indices"]:
+                try:
+                    if isinstance(idx_entry, int):
+                        idx = idx_entry
+                    elif isinstance(idx_entry, str) and '#' in idx_entry:
+                        idx = int(idx_entry.split('#')[0].strip())
+                    else:
+                        idx = int(str(idx_entry).strip())
+
+                    if not (0 <= idx < file_count):
+                        raise ValueError(f"Invalid file index {idx} found in item {item['name']}. Max index is {file_count - 1}.")
+                    validated_indices.append(idx)
+                except (ValueError, TypeError):
+                    raise ValueError(f"Could not parse index from entry: {idx_entry} in item {item['name']}")
+
+            item["files"] = sorted(list(set(validated_indices)))
+            # Store only the required fields
+            validated_abstractions.append({
+                "name": item["name"],
+                "description": item["description"],
+                "files": item["files"]
+            })
+
+        print(f"Identified {len(validated_abstractions)} abstractions.")
+        return validated_abstractions
+
+    def post(self, shared, prep_res, exec_res):
+        shared["abstractions"] = exec_res # List of {"name": str, "description": str, "files": [int]}
+
+class AnalyzeRelationships(Node):
+    def prep(self, shared):
+        abstractions = shared["abstractions"] # Now contains 'files' list of indices
+        files_data = shared["files"]
+        project_name = shared["project_name"] # Get project name
+
+        # Create context with abstraction names, indices, descriptions, and relevant file snippets
+        context = "Identified Abstractions:\n"
+        all_relevant_indices = set()
+        abstraction_info_for_prompt = []
+        for i, abstr in enumerate(abstractions):
+            # Use 'files' which contains indices directly
+            file_indices_str = ", ".join(map(str, abstr['files']))
+            info_line = f"- Index {i}: {abstr['name']} (Relevant file indices: [{file_indices_str}])\n Description: {abstr['description']}"
+            context += info_line + "\n"
+            abstraction_info_for_prompt.append(f"{i} # {abstr['name']}")
+            all_relevant_indices.update(abstr['files'])
+
+        context += "\nRelevant File Snippets (Referenced by Index and Path):\n"
+        # Get content for relevant files using helper
+        relevant_files_content_map = get_content_for_indices(
+            files_data,
+            sorted(list(all_relevant_indices))
+        )
+        # Format file content for context
+        file_context_str = "\n\n".join(
+            f"--- File: {idx_path} ---\n{content}"
+            for idx_path, content in relevant_files_content_map.items()
+        )
+        context += file_context_str
+
+        return context, "\n".join(abstraction_info_for_prompt), project_name # Return project name
+
+    def exec(self, prep_res):
+        context, abstraction_listing, project_name = prep_res # Unpack project name
+        print("Analyzing relationships using LLM...")
+        prompt = f"""
+Based on the following abstractions and relevant code snippets from the project `{project_name}`:
+
+List of Abstraction Indices and Names:
+{abstraction_listing}
+
+Context (Abstractions, Descriptions, Code):
+{context}
+
+Please provide:
+1. A high-level `summary` of the project's main purpose and functionality in a few beginner-friendly sentences. Use markdown formatting with **bold** and *italic* text to highlight important concepts.
+2. A list (`relationships`) describing the key interactions between these abstractions. For each relationship, specify:
+    - `from_abstraction`: Index of the source abstraction (e.g., `0 # AbstractionName1`)
+    - `to_abstraction`: Index of the target abstraction (e.g., `1 # AbstractionName2`)
+    - `label`: A brief label for the interaction **in just a few words** (e.g., "Manages", "Inherits", "Uses").
+    Ideally the relationship should be backed by one abstraction calling or passing parameters to another.
+    Simplify the relationship and exclude those non-important ones.
+
+IMPORTANT: Make sure EVERY abstraction is involved in at least ONE relationship (either as source or target). Each abstraction index must appear at least once across all relationships.
+
+Format the output as YAML:
+
+```yaml
+summary: |
+  A brief, simple explanation of the project.
+  Can span multiple lines with **bold** and *italic* for emphasis.
+relationships:
+  - from_abstraction: 0 # AbstractionName1
+    to_abstraction: 1 # AbstractionName2
+    label: "Manages"
+  - from_abstraction: 2 # AbstractionName3
+    to_abstraction: 0 # AbstractionName1
+    label: "Provides config"
+  # ... other relationships
+```
+
+Now, provide the YAML output:
+"""
+        response = call_llm(prompt)
+
+        # --- Validation ---
+        yaml_str = response.strip().split("```yaml")[1].split("```")[0].strip()
+        relationships_data = yaml.safe_load(yaml_str)
+
+        if not isinstance(relationships_data, dict) or not all(k in relationships_data for k in ["summary", "relationships"]):
+            raise ValueError("LLM output is not a dict or missing keys ('summary', 'relationships')")
+        if not isinstance(relationships_data["summary"], str):
+            raise ValueError("summary is not a string")
+        if not isinstance(relationships_data["relationships"], list):
+            raise ValueError("relationships is not a list")
+
+        # Validate relationships structure
+        validated_relationships = []
+        num_abstractions = len(abstraction_listing.split('\n'))
+        for rel in relationships_data["relationships"]:
+            # Check for 'label' key
+            if not isinstance(rel, dict) or not all(k in rel for k in ["from_abstraction", "to_abstraction", "label"]):
+                raise ValueError(f"Missing keys (expected from_abstraction, to_abstraction, label) in relationship item: {rel}")
+            # Validate 'label' is a string
+            if not isinstance(rel["label"], str):
+                raise ValueError(f"Relationship label is not a string: {rel}")
+
+            # Validate indices
+            try:
+                from_idx = int(str(rel["from_abstraction"]).split('#')[0].strip())
+                to_idx = int(str(rel["to_abstraction"]).split('#')[0].strip())
+                if not (0 <= from_idx < num_abstractions and 0 <= to_idx < num_abstractions):
+                    raise ValueError(f"Invalid index in relationship: from={from_idx}, to={to_idx}. Max index is {num_abstractions-1}.")
+                validated_relationships.append({
+                    "from": from_idx,
+                    "to": to_idx,
+                    "label": rel["label"]
+                })
+            except (ValueError, TypeError):
+                raise ValueError(f"Could not parse indices from relationship: {rel}")
+
+        print("Generated project summary and relationship details.")
+        return {
+            "summary": relationships_data["summary"],
+            "details": validated_relationships # Store validated, index-based relationships
+        }
+
+
+    def post(self, shared, prep_res, exec_res):
+        # Structure is now {"summary": str, "details": [{"from": int, "to": int, "label": str}]}
+        shared["relationships"] = exec_res
+
+class OrderChapters(Node):
+    def prep(self, shared):
+        abstractions = shared["abstractions"]
+        relationships = shared["relationships"]
+        project_name = shared["project_name"] # Get project name
+
+        # Prepare context for the LLM
+        abstraction_info_for_prompt = []
+        for i, a in enumerate(abstractions):
+            abstraction_info_for_prompt.append(f"- {i} # {a['name']}")
+        abstraction_listing = "\n".join(abstraction_info_for_prompt)
+
+        context = f"Project Summary:\n{relationships['summary']}\n\n"
+        context += "Relationships (Indices refer to abstractions above):\n"
+        for rel in relationships['details']:
+            from_name = abstractions[rel['from']]['name']
+            to_name = abstractions[rel['to']]['name']
+            # Use 'label' instead of 'desc'
+            context += f"- From {rel['from']} ({from_name}) to {rel['to']} ({to_name}): {rel['label']}\n"
+
+        return abstraction_listing, context, len(abstractions), project_name
+
+    def exec(self, prep_res):
+        abstraction_listing, context, num_abstractions, project_name = prep_res
+        print("Determining chapter order using LLM...")
+        prompt = f"""
+Given the following project abstractions and their relationships for the project ```` {project_name} ````:
+
+Abstractions (Index # Name):
+{abstraction_listing}
+
+Context about relationships and project summary:
+{context}
+
+If you are going to make a tutorial for ```` {project_name} ````, what is the best order to explain these abstractions, from first to last?
+Ideally, first explain those that are the most important or foundational, perhaps user-facing concepts or entry points. Then move to more detailed, lower-level implementation details or supporting concepts.
+
+Output the ordered list of abstraction indices, including the name in a comment for clarity. Use the format `idx # AbstractionName`.
+
+```yaml
+- 2 # FoundationalConcept
+- 0 # CoreClassA
+- 1 # CoreClassB (uses CoreClassA)
+- ...
+```
+
+Now, provide the YAML output:
+"""
+        response = call_llm(prompt)
+
+        # --- Validation ---
+        # Rely on Node's built-in retry/fallback
+        yaml_str = response.strip().split("```yaml")[1].split("```")[0].strip()
+        ordered_indices_raw = yaml.safe_load(yaml_str)
+
+        if not isinstance(ordered_indices_raw, list):
+            raise ValueError("LLM output is not a list")
+
+        ordered_indices = []
+        seen_indices = set()
+        for entry in ordered_indices_raw:
+            try:
+                if isinstance(entry, int):
+                    idx = entry
+                elif isinstance(entry, str) and '#' in entry:
+                    idx = int(entry.split('#')[0].strip())
+                else:
+                    idx = int(str(entry).strip())
+
+                if not (0 <= idx < num_abstractions):
+                    raise ValueError(f"Invalid index {idx} in ordered list. Max index is {num_abstractions-1}.")
+                if idx in seen_indices:
+                    raise ValueError(f"Duplicate index {idx} found in ordered list.")
+                ordered_indices.append(idx)
+                seen_indices.add(idx)
+
+            except (ValueError, TypeError):
+                raise ValueError(f"Could not parse index from ordered list entry: {entry}")
+
+        # Check if all abstractions are included
+        if len(ordered_indices) != num_abstractions:
+            raise ValueError(f"Ordered list length ({len(ordered_indices)}) does not match number of abstractions ({num_abstractions}). Missing indices: {set(range(num_abstractions)) - seen_indices}")
+
+        print(f"Determined chapter order (indices): {ordered_indices}")
+        return ordered_indices # Return the list of indices
+
+    def post(self, shared, prep_res, exec_res):
+        # exec_res is already the list of ordered indices
+        shared["chapter_order"] = exec_res # List of indices
+
+class WriteChapters(BatchNode):
+    def prep(self, shared):
+        chapter_order = shared["chapter_order"] # List of indices
+        abstractions = shared["abstractions"] # List of dicts, now using 'files' with indices
+        files_data = shared["files"]
+        # Get already written chapters to provide context
+        # We store them temporarily during the batch run, not in shared memory yet
+        # The 'previous_chapters_summary' will be built progressively in the exec context
+        self.chapters_written_so_far = [] # Use instance variable for temporary storage across exec calls
+
+        # Create a complete list of all chapters
+        all_chapters = []
+        chapter_filenames = {} # Store chapter filename mapping for linking
+        for i, abstraction_index in enumerate(chapter_order):
+            if 0 <= abstraction_index < len(abstractions):
+                chapter_num = i + 1
+                chapter_name = abstractions[abstraction_index]["name"]
+                # Create safe filename
+                safe_name = "".join(c if c.isalnum() else '_' for c in chapter_name).lower()
+                filename = f"{i+1:02d}_{safe_name}.md"
+                # Format with link
+                all_chapters.append(f"{chapter_num}. [{chapter_name}]({filename})")
+                # Store mapping of chapter index to filename for linking
+                chapter_filenames[abstraction_index] = {"num": chapter_num, "name": chapter_name, "filename": filename}
+
+        # Create a formatted string with all chapters
+        full_chapter_listing = "\n".join(all_chapters)
+
+        items_to_process = []
+        for i, abstraction_index in enumerate(chapter_order):
+            if 0 <= abstraction_index < len(abstractions):
+                abstraction_details = abstractions[abstraction_index]
+                # Use 'files' (list of indices) directly
+                related_file_indices = abstraction_details.get("files", [])
+                # Get content using helper, passing indices
+                related_files_content_map = get_content_for_indices(files_data, related_file_indices)
+
+                # Get previous chapter info for transitions
+                prev_chapter = None
+                if i > 0:
+                    prev_idx = chapter_order[i-1]
+                    prev_chapter = chapter_filenames[prev_idx]
+
+                # Get next chapter info for transitions
+                next_chapter = None
+                if i < len(chapter_order) - 1:
+                    next_idx = chapter_order[i+1]
+                    next_chapter = chapter_filenames[next_idx]
+
+                items_to_process.append({
+                    "chapter_num": i + 1,
+                    "abstraction_index": abstraction_index,
+                    "abstraction_details": abstraction_details,
+                    "related_files_content_map": related_files_content_map,
+                    "project_name": shared["project_name"], # Add project name
+                    "full_chapter_listing": full_chapter_listing, # Add the full chapter listing
+                    "chapter_filenames": chapter_filenames, # Add chapter filenames mapping
+                    "prev_chapter": prev_chapter, # Add previous chapter info
+                    "next_chapter": next_chapter, # Add next chapter info
+                    # previous_chapters_summary will be added dynamically in exec
+                })
+            else:
+                print(f"Warning: Invalid abstraction index {abstraction_index} in chapter_order. Skipping.")
+
+        print(f"Preparing to write {len(items_to_process)} chapters...")
+        return items_to_process # Iterable for BatchNode
+
+    def exec(self, item):
+        # This runs for each item prepared above
+        abstraction_name = item["abstraction_details"]["name"]
+        chapter_num = item["chapter_num"]
+        project_name = item.get("project_name") # Get from item
+        print(f"Writing chapter {chapter_num} for: {abstraction_name} using LLM...")
+
+        # Prepare file context string from the map
+        file_context_str = "\n\n".join(
+            f"--- File: {idx_path.split('# ')[1] if '# ' in idx_path else idx_path} ---\n{content}"
+            for idx_path, content in item["related_files_content_map"].items()
+        )
+
+        # Get summary of chapters written *before* this one
+        # Use the temporary instance variable
+        previous_chapters_summary = "\n---\n".join(self.chapters_written_so_far)
+
+
+        prompt = f"""
+Write a very beginner-friendly tutorial chapter (in Markdown format) for the project `{project_name}` about the concept: "{abstraction_name}". This is Chapter {chapter_num}.
+
+Concept Details:
+- Description:
+{item["abstraction_details"]["description"]}
+
+Complete Tutorial Structure:
+{item["full_chapter_listing"]}
+
+Context from previous chapters (summary):
+{previous_chapters_summary if previous_chapters_summary else "This is the first chapter."}
+
+Relevant Code Snippets:
+{file_context_str if file_context_str else "No specific code snippets provided for this abstraction."}
+
+Instructions for the chapter:
+- Start with a clear heading (e.g., `# Chapter {chapter_num}: {abstraction_name}`).
+
+- If this is not the first chapter, begin with a brief transition from the previous chapter, referencing it with a proper Markdown link.
+
+- Begin with a high-level motivation explaining what problem this abstraction solves. Start with a central use case as a concrete example. The whole chapter should guide the reader to understand how to solve this use case. Make it very minimal and friendly to beginners.
+
+- If the abstraction is complex, break it down into key concepts. Explain each concept one-by-one in a very beginner-friendly way.
+
+- Explain how to use this abstraction to solve the use case. Give example inputs and outputs for code snippets (if the output isn't values, describe at a high level what will happen).
+
+- Each code block should be BELOW 20 lines! If longer code blocks are needed, break them down into smaller pieces and walk through them one-by-one. Aggressively simplify the code to make it minimal. Use comments to skip non-important implementation details. Each code block should have a beginner friendly explanation right after it.
+
+- Describe the internal implementation to help understand what's under the hood. First provide a non-code or code-light walkthrough on what happens step-by-step when the abstraction is called. It's recommended to use a simple sequenceDiagram with a dummy example - keep it minimal with at most 5 participants to ensure clarity. If participant name has space, use:
+`participant QP as Query Processing`
+
+- Then dive deeper into code for the internal implementation with references to files. Provide example code blocks, but make them similarly simple and beginner-friendly.
+
+- IMPORTANT: When you need to refer to other core abstractions covered in other chapters, ALWAYS use proper Markdown links like this: [Chapter Title](filename.md). Use the Complete Tutorial Structure above to find the correct filename. Example: "we will talk about [Query Processing](03_query_processing.md) in Chapter 3".
+
+- Use mermaid diagrams to illustrate complex concepts (```mermaid``` format).
+
+- Heavily use analogies and examples throughout to help beginners understand.
+
+- End the chapter with a brief conclusion that summarizes what was learned and provides a transition to the next chapter. If there is a next chapter, use a proper Markdown link: [Next Chapter Title](next_chapter_filename).
+
+- Ensure the tone is welcoming and easy for a newcomer to understand.
+
+- Output *only* the Markdown content for this chapter.
+
+Now, directly provide a super beginner-friendly Markdown output (DON'T need ```markdown``` tags):
+"""
+        chapter_content = call_llm(prompt)
+        # Basic validation/cleanup
+        actual_heading = f"# Chapter {chapter_num}: {abstraction_name}"
+        if not chapter_content.strip().startswith(f"# Chapter {chapter_num}"):
+            # Add heading if missing or incorrect, trying to preserve content
+            lines = chapter_content.strip().split('\n')
+            if lines and lines[0].strip().startswith("#"): # If there's some heading, replace it
+                lines[0] = actual_heading
+                chapter_content = "\n".join(lines)
+            else: # Otherwise, prepend it
+                chapter_content = f"{actual_heading}\n\n{chapter_content}"
+
+        # Add the generated content to our temporary list for the next iteration's context
+        self.chapters_written_so_far.append(chapter_content)
+
+        return chapter_content # Return the Markdown string
+
+    def post(self, shared, prep_res, exec_res_list):
+        # exec_res_list contains the generated Markdown for each chapter, in order
+        shared["chapters"] = exec_res_list
+        # Clean up the temporary instance variable
+        del self.chapters_written_so_far
+        print(f"Finished writing {len(exec_res_list)} chapters.")
+
+class CombineTutorial(Node):
+    def prep(self, shared):
+        project_name = shared["project_name"]
+        output_base_dir = shared.get("output_dir", "output") # Default output dir
+        output_path = os.path.join(output_base_dir, project_name)
+        repo_url = shared["repo_url"] # Get the repository URL
+
+        # Use 'label' from relationships_data['details']
+        relationships_data = shared["relationships"] # {"summary": str, "details": [{"from": int, "to": int, "label": str}]}
+        chapter_order = shared["chapter_order"] # indices
+        abstractions = shared["abstractions"] # list of dicts
+        chapters_content = shared["chapters"] # list of strings
+
+        # --- Generate Mermaid Diagram ---
+        mermaid_lines = ["flowchart TD"]
+        # Add nodes for each abstraction
+        for i, abstr in enumerate(abstractions):
+            # Sanitize name for Mermaid ID and label
+            node_id = f"A{i}"
+            sanitized_name = abstr['name'].replace('"', '')
+            node_label = sanitized_name # Using sanitized name only, no index
+            mermaid_lines.append(f'    {node_id}["{node_label}"]')
+        # Add edges for relationships using 'label'
+        for rel in relationships_data['details']:
+            from_node_id = f"A{rel['from']}"
+            to_node_id = f"A{rel['to']}"
+            # Sanitize 'label' for edge label
+            edge_label = rel['label'].replace('"', '').replace('\n', ' ') # Basic sanitization
+            # Limit edge label length for readability (optional, but good for diagrams)
+            max_label_len = 30 # Make it shorter for labels
+            if len(edge_label) > max_label_len:
+                edge_label = edge_label[:max_label_len-3] + "..."
+            mermaid_lines.append(f'    {from_node_id} -- "{edge_label}" --> {to_node_id}')
+
+        mermaid_diagram = "\n".join(mermaid_lines)
+        # --- End Mermaid ---
+
+
+        # Prepare index.md content
+        index_content = f"# Tutorial: {project_name}\n\n"
+        index_content += f"{relationships_data['summary']}\n\n"
+        index_content += f"**Source Repository:** [{repo_url}]({repo_url})\n\n"
+
+        # Add Mermaid diagram for relationships
+        index_content += "```mermaid\n"
+        index_content += mermaid_diagram + "\n"
+        index_content += "```\n\n"
+
+        index_content += "## Chapters\n\n"
+
+        chapter_files = []
+        # Generate chapter links based on the determined order
+        for i, abstraction_index in enumerate(chapter_order):
+            # Ensure index is valid and we have content for it
+            if 0 <= abstraction_index < len(abstractions) and i < len(chapters_content):
+                abstraction_name = abstractions[abstraction_index]["name"]
+                # Sanitize name for filename
+                safe_name = "".join(c if c.isalnum() else '_' for c in abstraction_name).lower()
+                # Use chapter number (i+1) for ordering filename
+                filename = f"{i+1:02d}_{safe_name}.md"
+                index_content += f"{i+1}. [{abstraction_name}]({filename})\n"
+
+                # Add attribution to chapter content
+                chapter_content = chapters_content[i]
+                if not chapter_content.endswith("\n\n"):
+                    chapter_content += "\n\n"
+                chapter_content += "---\n\nGenerated by [AI Codebase Knowledge Builder](https://github.com/The-Pocket/Tutorial-Codebase-Knowledge)"
+
+                # Store filename and corresponding content
+                chapter_files.append({"filename": filename, "content": chapter_content})
+            else:
+                print(f"Warning: Mismatch between chapter order, abstractions, or content at index {i} (abstraction index {abstraction_index}). Skipping file generation for this entry.")
+
+        # Add attribution to index content
+        index_content += "\n\n---\n\nGenerated by [AI Codebase Knowledge Builder](https://github.com/The-Pocket/Tutorial-Codebase-Knowledge)"
+
+        return {
+            "output_path": output_path,
+            "index_content": index_content,
+            "chapter_files": chapter_files # List of {"filename": str, "content": str}
+        }
+
+    def exec(self, prep_res):
+        output_path = prep_res["output_path"]
+        index_content = prep_res["index_content"]
+        chapter_files = prep_res["chapter_files"]
+
+        print(f"Combining tutorial into directory: {output_path}")
+        # Rely on Node's built-in retry/fallback
+        os.makedirs(output_path, exist_ok=True)
+
+        # Write index.md
+        index_filepath = os.path.join(output_path, "index.md")
+        with open(index_filepath, "w", encoding="utf-8") as f:
+            f.write(index_content)
+        print(f"  - Wrote {index_filepath}")
+
+        # Write chapter files
+        for chapter_info in chapter_files:
+            chapter_filepath = os.path.join(output_path, chapter_info["filename"])
+            with open(chapter_filepath, "w", encoding="utf-8") as f:
+                f.write(chapter_info["content"])
+            print(f"  - Wrote {chapter_filepath}")
+
+        return output_path # Return the final path
+
+
+    def post(self, shared, prep_res, exec_res):
+        shared["final_output_dir"] = exec_res # Store the output path
+        print(f"\nTutorial generation complete! Files are in: {exec_res}")
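A pattern worth noting in nodes.py: every LLM-facing node asks for a fenced yaml block, slices it out of the raw response, parses it with yaml.safe_load, and shape-checks the result so that a bad response raises and the node's max_retries/wait settings take over. A condensed, standalone sketch of that pattern (illustrative only; parse_yaml_block is not a helper in this commit):

```python
import yaml

def parse_yaml_block(response: str, required_keys=None):
    # Slice out the fenced block the prompt asked for; an IndexError here
    # (no ```yaml fence in the response) surfaces and triggers the Node retry.
    yaml_str = response.strip().split("```yaml")[1].split("```")[0].strip()
    data = yaml.safe_load(yaml_str)

    # Shape-check so malformed output also fails loudly instead of propagating
    if required_keys is not None:
        if not isinstance(data, dict) or not all(k in data for k in required_keys):
            raise ValueError(f"Missing keys {required_keys} in LLM output: {data!r}")
    return data
```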
requirements.txt
@@ -1 +1,4 @@
 pocketflow>=0.0.1
+pyyaml>=6.0
+requests>=2.28.0
+google-cloud-aiplatform>=1.25.0
utils/call_llm.py
@@ -1,14 +1,87 @@
-from openai import OpenAI
+from google import genai
+import os
+import logging
+import json
+from datetime import datetime
 
 # Learn more about calling the LLM: https://the-pocket.github.io/PocketFlow/utility_function/llm.html
-def call_llm(prompt):
-    client = OpenAI(api_key="YOUR_API_KEY_HERE")
-    r = client.chat.completions.create(
-        model="gpt-4o",
-        messages=[{"role": "user", "content": prompt}]
-    )
-    return r.choices[0].message.content
+# Configure logging
+log_directory = os.getenv("LOG_DIR", "logs")
+os.makedirs(log_directory, exist_ok=True)
+log_file = os.path.join(log_directory, f"llm_calls_{datetime.now().strftime('%Y%m%d')}.log")
+
+# Set up logger
+logger = logging.getLogger("llm_logger")
+logger.setLevel(logging.INFO)
+logger.propagate = False # Prevent propagation to root logger
+file_handler = logging.FileHandler(log_file)
+file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
+logger.addHandler(file_handler)
+
+# Simple cache configuration
+cache_file = "llm_cache.json"
+
+def call_llm(prompt: str, use_cache: bool = True) -> str:
+    # Log the prompt
+    logger.info(f"PROMPT: {prompt}")
+
+    # Check cache if enabled
+    if use_cache:
+        # Load cache from disk
+        cache = {}
+        if os.path.exists(cache_file):
+            try:
+                with open(cache_file, 'r') as f:
+                    cache = json.load(f)
+            except:
+                logger.warning(f"Failed to load cache, starting with empty cache")
+
+        # Return from cache if exists
+        if prompt in cache:
+            logger.info(f"RESPONSE: {cache[prompt]}")
+            return cache[prompt]
+
+    # Call the LLM if not in cache or cache disabled
+    client = genai.Client(
+        vertexai=True,
+        project=os.getenv("GEMINI_PROJECT_ID", "your-project-id"),
+        location=os.getenv("GEMINI_LOCATION", "us-central1")
+    )
+    model = os.getenv("GEMINI_MODEL", "gemini-2.5-pro-exp-03-25")
+    response = client.models.generate_content(
+        model=model,
+        contents=[prompt]
+    )
+    response_text = response.text
+
+    # Log the response
+    logger.info(f"RESPONSE: {response_text}")
+
+    # Update cache if enabled
+    if use_cache:
+        # Load cache again to avoid overwrites
+        cache = {}
+        if os.path.exists(cache_file):
+            try:
+                with open(cache_file, 'r') as f:
+                    cache = json.load(f)
+            except:
+                pass
+
+        # Add to cache and save
+        cache[prompt] = response_text
+        try:
+            with open(cache_file, 'w') as f:
+                json.dump(cache, f)
+        except Exception as e:
+            logger.error(f"Failed to save cache: {e}")
+
+    return response_text
 
 if __name__ == "__main__":
-    prompt = "What is the meaning of life?"
-    print(call_llm(prompt))
+    test_prompt = "Hello, how are you?"
+
+    # First call - should hit the API
+    print("Making call...")
+    response1 = call_llm(test_prompt, use_cache=False)
+    print(f"Response: {response1}")
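call_llm is configured entirely through the environment variables read above (GEMINI_PROJECT_ID, GEMINI_LOCATION, GEMINI_MODEL, LOG_DIR). A minimal usage sketch, assuming it is run from the project root with a Vertex AI project already set up; the values shown are placeholders:

```python
import os

# Placeholders; set these to your own Vertex AI project values
os.environ["GEMINI_PROJECT_ID"] = "my-gcp-project"
os.environ["GEMINI_LOCATION"] = "us-central1"
os.environ["GEMINI_MODEL"] = "gemini-2.5-pro-exp-03-25"
os.environ["LOG_DIR"] = "logs"

from utils.call_llm import call_llm

# The first call hits the API and is written to llm_cache.json;
# repeating the same prompt with use_cache=True returns the cached text.
print(call_llm("Summarize what PocketFlow is in one sentence."))
print(call_llm("Summarize what PocketFlow is in one sentence.", use_cache=True))
```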
utils/crawl_github_files.py (new file, 236 lines)
@@ -0,0 +1,236 @@
+import requests
+import base64
+import os
+import time
+import fnmatch
+from typing import Union, Set, List, Dict, Tuple, Any
+from urllib.parse import urlparse
+
+def crawl_github_files(
+    repo_url,
+    token=None,
+    max_file_size: int = 1 * 1024 * 1024,  # 1 MB
+    use_relative_paths: bool = False,
+    include_patterns: Union[str, Set[str]] = None,
+    exclude_patterns: Union[str, Set[str]] = None
+):
+    """
+    Crawl files from a specific path in a GitHub repository at a specific commit.
+
+    Args:
+        repo_url (str): URL of the GitHub repository with specific path and commit
+            (e.g., 'https://github.com/microsoft/autogen/tree/e45a15766746d95f8cfaaa705b0371267bec812e/python/packages/autogen-core/src/autogen_core')
+        token (str, optional): GitHub personal access token. Required for private repositories and recommended for public repos to avoid rate limits.
+        max_file_size (int, optional): Maximum file size in bytes to download (default: 1 MB)
+        use_relative_paths (bool, optional): If True, file paths will be relative to the specified subdirectory
+        include_patterns (str or set of str, optional): Pattern or set of patterns specifying which files to include (e.g., "*.py", {"*.md", "*.txt"}).
+            If None, all files are included.
+        exclude_patterns (str or set of str, optional): Pattern or set of patterns specifying which files to exclude.
+            If None, no files are excluded.
+
+    Returns:
+        dict: Dictionary with files and statistics
+    """
+    # Convert single pattern to set
+    if include_patterns and isinstance(include_patterns, str):
+        include_patterns = {include_patterns}
+    if exclude_patterns and isinstance(exclude_patterns, str):
+        exclude_patterns = {exclude_patterns}
+
+    # Parse GitHub URL to extract owner, repo, commit/branch, and path
+    parsed_url = urlparse(repo_url)
+    path_parts = parsed_url.path.strip('/').split('/')
+
+    if len(path_parts) < 2:
+        raise ValueError(f"Invalid GitHub URL: {repo_url}")
+
+    # Extract the basic components
+    owner = path_parts[0]
+    repo = path_parts[1]
+
+    # Check if URL contains a specific branch/commit
+    if 'tree' in path_parts:
+        tree_index = path_parts.index('tree')
+        ref = path_parts[tree_index + 1]
+        # Combine all parts after the ref as the path
+        path_start = tree_index + 2
+        specific_path = '/'.join(path_parts[path_start:]) if path_start < len(path_parts) else ""
+    else:
+        ref = "main"  # Default branch
+        specific_path = ""
+
+    # Setup for GitHub API
+    headers = {"Accept": "application/vnd.github.v3+json"}
+    if token:
+        headers["Authorization"] = f"token {token}"
+
+    # Dictionary to store path -> content mapping
+    files = {}
+    skipped_files = []
+
+    def should_include_file(file_path: str, file_name: str) -> bool:
+        """Determine if a file should be included based on patterns"""
+        # If no include patterns are specified, include all files
+        if not include_patterns:
+            include_file = True
+        else:
+            # Check if file matches any include pattern
+            include_file = any(fnmatch.fnmatch(file_name, pattern) for pattern in include_patterns)
+
+        # If exclude patterns are specified, check if file should be excluded
+        if exclude_patterns and include_file:
+            # Exclude if file matches any exclude pattern
+            exclude_file = any(fnmatch.fnmatch(file_path, pattern) for pattern in exclude_patterns)
+            return not exclude_file
+
+        return include_file
+
+    def fetch_contents(path):
+        """Fetch contents of the repository at a specific path and commit"""
+        url = f"https://api.github.com/repos/{owner}/{repo}/contents/{path}"
+        params = {"ref": ref}
+
+        response = requests.get(url, headers=headers, params=params)
+
+        if response.status_code == 403 and 'rate limit exceeded' in response.text.lower():
+            reset_time = int(response.headers.get('X-RateLimit-Reset', 0))
+            wait_time = max(reset_time - time.time(), 0) + 1
+            print(f"Rate limit exceeded. Waiting for {wait_time:.0f} seconds...")
+            time.sleep(wait_time)
+            return fetch_contents(path)
+
+        if response.status_code == 404:
+            if not token:
+                print(f"Error 404: Repository not found or is private. If this is a private repository, you need to provide a token.")
+            else:
+                print(f"Error 404: Path '{path}' not found in repository or insufficient permissions.")
+            return
+
+        if response.status_code != 200:
+            print(f"Error fetching {path}: {response.status_code} - {response.text}")
+            return
+
+        contents = response.json()
+
+        # Handle both single file and directory responses
+        if not isinstance(contents, list):
+            contents = [contents]
+
+        for item in contents:
+            item_path = item["path"]
+
+            # Calculate relative path if requested
+            if use_relative_paths and specific_path:
+                # Make sure the path is relative to the specified subdirectory
+                if item_path.startswith(specific_path):
+                    rel_path = item_path[len(specific_path):].lstrip('/')
+                else:
+                    rel_path = item_path
+            else:
+                rel_path = item_path
+
+            if item["type"] == "file":
+                # Check if file should be included based on patterns
+                if not should_include_file(rel_path, item["name"]):
+                    print(f"Skipping {rel_path}: Does not match include/exclude patterns")
+                    continue
+
+                # Check file size if available
+                file_size = item.get("size", 0)
+                if file_size > max_file_size:
+                    skipped_files.append((item_path, file_size))
+                    print(f"Skipping {rel_path}: File size ({file_size} bytes) exceeds limit ({max_file_size} bytes)")
+                    continue
+
+                # For files, get raw content
+                if "download_url" in item and item["download_url"]:
+                    file_url = item["download_url"]
+                    file_response = requests.get(file_url, headers=headers)
+
+                    # Final size check in case content-length header is available but differs from metadata
+                    content_length = int(file_response.headers.get('content-length', 0))
+                    if content_length > max_file_size:
+                        skipped_files.append((item_path, content_length))
+                        print(f"Skipping {rel_path}: Content length ({content_length} bytes) exceeds limit ({max_file_size} bytes)")
+                        continue
+
+                    if file_response.status_code == 200:
+                        files[rel_path] = file_response.text
+                        print(f"Downloaded: {rel_path} ({file_size} bytes) ")
+                    else:
+                        print(f"Failed to download {rel_path}: {file_response.status_code}")
+                else:
+                    # Alternative method if download_url is not available
+                    content_response = requests.get(item["url"], headers=headers)
+                    if content_response.status_code == 200:
+                        content_data = content_response.json()
+                        if content_data.get("encoding") == "base64" and "content" in content_data:
+                            # Check size of base64 content before decoding
+                            if len(content_data["content"]) * 0.75 > max_file_size:  # Approximate size calculation
+                                estimated_size = int(len(content_data["content"]) * 0.75)
+                                skipped_files.append((item_path, estimated_size))
+                                print(f"Skipping {rel_path}: Encoded content exceeds size limit")
+                                continue
+
+                            file_content = base64.b64decode(content_data["content"]).decode('utf-8')
+                            files[rel_path] = file_content
+                            print(f"Downloaded: {rel_path} ({file_size} bytes)")
+                        else:
+                            print(f"Unexpected content format for {rel_path}")
+                    else:
+                        print(f"Failed to get content for {rel_path}: {content_response.status_code}")
+
+            elif item["type"] == "dir":
+                # Recursively process subdirectories
+                fetch_contents(item_path)
+
+    # Start crawling from the specified path
+    fetch_contents(specific_path)
+
+    return {
+        "files": files,
+        "stats": {
+            "downloaded_count": len(files),
+            "skipped_count": len(skipped_files),
+            "skipped_files": skipped_files,
+            "base_path": specific_path if use_relative_paths else None,
+            "include_patterns": include_patterns,
+            "exclude_patterns": exclude_patterns
+        }
+    }
+
+# Example usage
+if __name__ == "__main__":
+    # Get token from environment variable (more secure than hardcoding)
+    github_token = os.environ.get("GITHUB_TOKEN")
+
+    repo_url = "https://github.com/pydantic/pydantic/tree/6c38dc93f40a47f4d1350adca9ec0d72502e223f/pydantic"
+
+    # Example: Get Python and Markdown files, but exclude test files
+    result = crawl_github_files(
+        repo_url,
+        token=github_token,
+        max_file_size=1 * 1024 * 1024,  # 1 MB in bytes
+        use_relative_paths=True,  # Enable relative paths
+        include_patterns={"*.py", "*.md"},  # Include Python and Markdown files
+    )
+
+    files = result["files"]
+    stats = result["stats"]
+
+    print(f"\nDownloaded {stats['downloaded_count']} files.")
+    print(f"Skipped {stats['skipped_count']} files due to size limits or patterns.")
+    print(f"Base path for relative paths: {stats['base_path']}")
+    print(f"Include patterns: {stats['include_patterns']}")
+    print(f"Exclude patterns: {stats['exclude_patterns']}")
+
+    # Display all file paths in the dictionary
+    print("\nFiles in dictionary:")
+    for file_path in sorted(files.keys()):
+        print(f"  {file_path}")
+
+    # Example: accessing content of a specific file
+    if files:
+        sample_file = next(iter(files))
+        print(f"\nSample file: {sample_file}")
+        print(f"Content preview: {files[sample_file][:200]}...")
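The example above only passes include_patterns; exclude_patterns works the same way and is matched with fnmatch against the (relative) file path. A small variation, assuming the same imports and token as the example above; the exclude patterns are illustrative:

```python
# Sketch: same pinned pydantic crawl, but also skip illustrative path patterns.
result = crawl_github_files(
    "https://github.com/pydantic/pydantic/tree/6c38dc93f40a47f4d1350adca9ec0d72502e223f/pydantic",
    token=os.environ.get("GITHUB_TOKEN"),
    include_patterns={"*.py"},
    exclude_patterns={"_internal/*", "*_test.py"},  # example patterns only
    use_relative_paths=True,
)
print(f"Kept {result['stats']['downloaded_count']} files")
```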