update readme examples

zachary62
2025-04-04 15:06:41 -04:00
parent b63a1c14b0
commit dc4d445676
7 changed files with 1053 additions and 55 deletions


@@ -1,14 +1,87 @@
from openai import OpenAI
from google import genai
import os
import logging
import json
from datetime import datetime

# Learn more about calling the LLM: https://the-pocket.github.io/PocketFlow/utility_function/llm.html
def call_llm(prompt):
    client = OpenAI(api_key="YOUR_API_KEY_HERE")
    r = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}]
    )
    return r.choices[0].message.content

# Configure logging
log_directory = os.getenv("LOG_DIR", "logs")
os.makedirs(log_directory, exist_ok=True)
log_file = os.path.join(log_directory, f"llm_calls_{datetime.now().strftime('%Y%m%d')}.log")

# Set up logger
logger = logging.getLogger("llm_logger")
logger.setLevel(logging.INFO)
logger.propagate = False  # Prevent propagation to root logger
file_handler = logging.FileHandler(log_file)
file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
logger.addHandler(file_handler)

# Simple cache configuration
cache_file = "llm_cache.json"

def call_llm(prompt: str, use_cache: bool = True) -> str:
    # Log the prompt
    logger.info(f"PROMPT: {prompt}")

    # Check cache if enabled
    if use_cache:
        # Load cache from disk
        cache = {}
        if os.path.exists(cache_file):
            try:
                with open(cache_file, 'r') as f:
                    cache = json.load(f)
            except:
                logger.warning("Failed to load cache, starting with empty cache")

        # Return from cache if exists
        if prompt in cache:
            logger.info(f"RESPONSE: {cache[prompt]}")
            return cache[prompt]

    # Call the LLM if not in cache or cache disabled
    client = genai.Client(
        vertexai=True,
        project=os.getenv("GEMINI_PROJECT_ID", "your-project-id"),
        location=os.getenv("GEMINI_LOCATION", "us-central1")
    )
    model = os.getenv("GEMINI_MODEL", "gemini-2.5-pro-exp-03-25")
    response = client.models.generate_content(
        model=model,
        contents=[prompt]
    )
    response_text = response.text

    # Log the response
    logger.info(f"RESPONSE: {response_text}")

    # Update cache if enabled
    if use_cache:
        # Load cache again to avoid overwrites
        cache = {}
        if os.path.exists(cache_file):
            try:
                with open(cache_file, 'r') as f:
                    cache = json.load(f)
            except:
                pass

        # Add to cache and save
        cache[prompt] = response_text
        try:
            with open(cache_file, 'w') as f:
                json.dump(cache, f)
        except Exception as e:
            logger.error(f"Failed to save cache: {e}")

    return response_text

if __name__ == "__main__":
    prompt = "What is the meaning of life?"
    print(call_llm(prompt))

    test_prompt = "Hello, how are you?"

    # First call - should hit the API
    print("Making call...")
    response1 = call_llm(test_prompt, use_cache=False)
    print(f"Response: {response1}")

utils/crawl_github_files.py Normal file

@@ -0,0 +1,236 @@
import requests
import base64
import os
import time
import fnmatch
from typing import Union, Set, List, Dict, Tuple, Any
from urllib.parse import urlparse

def crawl_github_files(
    repo_url,
    token=None,
    max_file_size: int = 1 * 1024 * 1024,  # 1 MB
    use_relative_paths: bool = False,
    include_patterns: Union[str, Set[str]] = None,
    exclude_patterns: Union[str, Set[str]] = None
):
    """
    Crawl files from a specific path in a GitHub repository at a specific commit.

    Args:
        repo_url (str): URL of the GitHub repository with specific path and commit
            (e.g., 'https://github.com/microsoft/autogen/tree/e45a15766746d95f8cfaaa705b0371267bec812e/python/packages/autogen-core/src/autogen_core')
        token (str, optional): GitHub personal access token. Required for private repositories and recommended for public repos to avoid rate limits.
        max_file_size (int, optional): Maximum file size in bytes to download (default: 1 MB)
        use_relative_paths (bool, optional): If True, file paths will be relative to the specified subdirectory
        include_patterns (str or set of str, optional): Pattern or set of patterns specifying which files to include (e.g., "*.py", {"*.md", "*.txt"}).
            If None, all files are included.
        exclude_patterns (str or set of str, optional): Pattern or set of patterns specifying which files to exclude.
            If None, no files are excluded.

    Returns:
        dict: Dictionary with files and statistics
    """
    # Convert single pattern to set
    if include_patterns and isinstance(include_patterns, str):
        include_patterns = {include_patterns}
    if exclude_patterns and isinstance(exclude_patterns, str):
        exclude_patterns = {exclude_patterns}

    # Parse GitHub URL to extract owner, repo, commit/branch, and path
    parsed_url = urlparse(repo_url)
    path_parts = parsed_url.path.strip('/').split('/')
    if len(path_parts) < 2:
        raise ValueError(f"Invalid GitHub URL: {repo_url}")

    # Extract the basic components
    owner = path_parts[0]
    repo = path_parts[1]

    # Check if URL contains a specific branch/commit
    if 'tree' in path_parts:
        tree_index = path_parts.index('tree')
        ref = path_parts[tree_index + 1]
        # Combine all parts after the ref as the path
        path_start = tree_index + 2
        specific_path = '/'.join(path_parts[path_start:]) if path_start < len(path_parts) else ""
    else:
        ref = "main"  # Default branch
        specific_path = ""

    # Setup for GitHub API
    headers = {"Accept": "application/vnd.github.v3+json"}
    if token:
        headers["Authorization"] = f"token {token}"

    # Dictionary to store path -> content mapping
    files = {}
    skipped_files = []

    def should_include_file(file_path: str, file_name: str) -> bool:
        """Determine if a file should be included based on patterns"""
        # If no include patterns are specified, include all files
        if not include_patterns:
            include_file = True
        else:
            # Check if file matches any include pattern
            include_file = any(fnmatch.fnmatch(file_name, pattern) for pattern in include_patterns)

        # If exclude patterns are specified, check if file should be excluded
        if exclude_patterns and include_file:
            # Exclude if file matches any exclude pattern
            exclude_file = any(fnmatch.fnmatch(file_path, pattern) for pattern in exclude_patterns)
            return not exclude_file

        return include_file

    def fetch_contents(path):
        """Fetch contents of the repository at a specific path and commit"""
        url = f"https://api.github.com/repos/{owner}/{repo}/contents/{path}"
        params = {"ref": ref}

        response = requests.get(url, headers=headers, params=params)

        if response.status_code == 403 and 'rate limit exceeded' in response.text.lower():
            reset_time = int(response.headers.get('X-RateLimit-Reset', 0))
            wait_time = max(reset_time - time.time(), 0) + 1
            print(f"Rate limit exceeded. Waiting for {wait_time:.0f} seconds...")
            time.sleep(wait_time)
            return fetch_contents(path)

        if response.status_code == 404:
            if not token:
                print("Error 404: Repository not found or is private. If this is a private repository, you need to provide a token.")
            else:
                print(f"Error 404: Path '{path}' not found in repository or insufficient permissions.")
            return

        if response.status_code != 200:
            print(f"Error fetching {path}: {response.status_code} - {response.text}")
            return

        contents = response.json()

        # Handle both single file and directory responses
        if not isinstance(contents, list):
            contents = [contents]

        for item in contents:
            item_path = item["path"]

            # Calculate relative path if requested
            if use_relative_paths and specific_path:
                # Make sure the path is relative to the specified subdirectory
                if item_path.startswith(specific_path):
                    rel_path = item_path[len(specific_path):].lstrip('/')
                else:
                    rel_path = item_path
            else:
                rel_path = item_path

            if item["type"] == "file":
                # Check if file should be included based on patterns
                if not should_include_file(rel_path, item["name"]):
                    print(f"Skipping {rel_path}: Does not match include/exclude patterns")
                    continue

                # Check file size if available
                file_size = item.get("size", 0)
                if file_size > max_file_size:
                    skipped_files.append((item_path, file_size))
                    print(f"Skipping {rel_path}: File size ({file_size} bytes) exceeds limit ({max_file_size} bytes)")
                    continue

                # For files, get raw content
                if "download_url" in item and item["download_url"]:
                    file_url = item["download_url"]
                    file_response = requests.get(file_url, headers=headers)

                    # Final size check in case content-length header is available but differs from metadata
                    content_length = int(file_response.headers.get('content-length', 0))
                    if content_length > max_file_size:
                        skipped_files.append((item_path, content_length))
                        print(f"Skipping {rel_path}: Content length ({content_length} bytes) exceeds limit ({max_file_size} bytes)")
                        continue

                    if file_response.status_code == 200:
                        files[rel_path] = file_response.text
                        print(f"Downloaded: {rel_path} ({file_size} bytes)")
                    else:
                        print(f"Failed to download {rel_path}: {file_response.status_code}")
                else:
                    # Alternative method if download_url is not available
                    content_response = requests.get(item["url"], headers=headers)
                    if content_response.status_code == 200:
                        content_data = content_response.json()
                        if content_data.get("encoding") == "base64" and "content" in content_data:
                            # Check size of base64 content before decoding
                            if len(content_data["content"]) * 0.75 > max_file_size:  # Approximate size calculation
                                estimated_size = int(len(content_data["content"]) * 0.75)
                                skipped_files.append((item_path, estimated_size))
                                print(f"Skipping {rel_path}: Encoded content exceeds size limit")
                                continue

                            file_content = base64.b64decode(content_data["content"]).decode('utf-8')
                            files[rel_path] = file_content
                            print(f"Downloaded: {rel_path} ({file_size} bytes)")
                        else:
                            print(f"Unexpected content format for {rel_path}")
                    else:
                        print(f"Failed to get content for {rel_path}: {content_response.status_code}")

            elif item["type"] == "dir":
                # Recursively process subdirectories
                fetch_contents(item_path)

    # Start crawling from the specified path
    fetch_contents(specific_path)

    return {
        "files": files,
        "stats": {
            "downloaded_count": len(files),
            "skipped_count": len(skipped_files),
            "skipped_files": skipped_files,
            "base_path": specific_path if use_relative_paths else None,
            "include_patterns": include_patterns,
            "exclude_patterns": exclude_patterns
        }
    }
# Example usage
if __name__ == "__main__":
    # Get token from environment variable (more secure than hardcoding)
    github_token = os.environ.get("GITHUB_TOKEN")

    repo_url = "https://github.com/pydantic/pydantic/tree/6c38dc93f40a47f4d1350adca9ec0d72502e223f/pydantic"

    # Example: Get Python and Markdown files
    result = crawl_github_files(
        repo_url,
        token=github_token,
        max_file_size=1 * 1024 * 1024,  # 1 MB in bytes
        use_relative_paths=True,  # Enable relative paths
        include_patterns={"*.py", "*.md"},  # Include Python and Markdown files
    )

    files = result["files"]
    stats = result["stats"]

    print(f"\nDownloaded {stats['downloaded_count']} files.")
    print(f"Skipped {stats['skipped_count']} files due to size limits or patterns.")
    print(f"Base path for relative paths: {stats['base_path']}")
    print(f"Include patterns: {stats['include_patterns']}")
    print(f"Exclude patterns: {stats['exclude_patterns']}")

    # Display all file paths in the dictionary
    print("\nFiles in dictionary:")
    for file_path in sorted(files.keys()):
        print(f"  {file_path}")

    # Example: accessing content of a specific file
    if files:
        sample_file = next(iter(files))
        print(f"\nSample file: {sample_file}")
        print(f"Content preview: {files[sample_file][:200]}...")