move crawl local to utils

This commit is contained in:
zachary62
2025-04-09 11:09:33 -04:00
parent 9bd29cde73
commit 0d2c29e7c7
2 changed files with 74 additions and 65 deletions

View File

@@ -1,72 +1,9 @@
import os import os
import yaml import yaml
import fnmatch
from pocketflow import Node, BatchNode from pocketflow import Node, BatchNode
from utils.crawl_github_files import crawl_github_files from utils.crawl_github_files import crawl_github_files
from utils.call_llm import call_llm # Assuming you have this utility from utils.call_llm import call_llm
from utils.crawl_local_files import crawl_local_files
def crawl_local_files(directory, include_patterns=None, exclude_patterns=None, max_file_size=None, use_relative_paths=True):
"""
Crawl files in a local directory with similar interface as crawl_github_files.
Args:
directory (str): Path to local directory
include_patterns (set): File patterns to include (e.g. {"*.py", "*.js"})
exclude_patterns (set): File patterns to exclude (e.g. {"tests/*"})
max_file_size (int): Maximum file size in bytes
use_relative_paths (bool): Whether to use paths relative to directory
Returns:
dict: {"files": {filepath: content}}
"""
if not os.path.isdir(directory):
raise ValueError(f"Directory does not exist: {directory}")
files_dict = {}
for root, _, files in os.walk(directory):
for filename in files:
filepath = os.path.join(root, filename)
# Get path relative to directory if requested
if use_relative_paths:
relpath = os.path.relpath(filepath, directory)
else:
relpath = filepath
# Check if file matches any include pattern
included = False
if include_patterns:
for pattern in include_patterns:
if fnmatch.fnmatch(relpath, pattern):
included = True
break
else:
included = True
# Check if file matches any exclude pattern
excluded = False
if exclude_patterns:
for pattern in exclude_patterns:
if fnmatch.fnmatch(relpath, pattern):
excluded = True
break
if not included or excluded:
continue
# Check file size
if max_file_size and os.path.getsize(filepath) > max_file_size:
continue
try:
with open(filepath, 'r', encoding='utf-8') as f:
content = f.read()
files_dict[relpath] = content
except Exception as e:
print(f"Warning: Could not read file {filepath}: {e}")
return {"files": files_dict}
# Helper to create context from files, respecting limits (basic example) # Helper to create context from files, respecting limits (basic example)
def create_llm_context(files_data): def create_llm_context(files_data):

View File

@@ -0,0 +1,72 @@
import os
import fnmatch
def crawl_local_files(directory, include_patterns=None, exclude_patterns=None, max_file_size=None, use_relative_paths=True):
    """
    Collect the text content of files under a local directory.

    The return shape mirrors crawl_github_files so both crawlers can be
    used interchangeably by callers.

    Patterns are matched with ``fnmatch`` against the same path string that
    is used as the result key (relative to ``directory`` when
    ``use_relative_paths`` is true, otherwise the joined walk path). Note
    that ``fnmatch``'s ``*`` also matches path separators, so ``"*.py"``
    matches files at any depth.

    Args:
        directory (str): Path to local directory
        include_patterns (set): File patterns to include (e.g. {"*.py", "*.js"}).
            When empty or None, every file is a candidate.
        exclude_patterns (set): File patterns to exclude (e.g. {"tests/*"}).
            Exclusion wins over inclusion.
        max_file_size (int): Maximum file size in bytes; larger files are
            skipped. A falsy value (None or 0) disables the check.
        use_relative_paths (bool): Whether to use paths relative to directory

    Returns:
        dict: {"files": {filepath: content}}

    Raises:
        ValueError: If ``directory`` is not an existing directory.
    """
    if not os.path.isdir(directory):
        raise ValueError(f"Directory does not exist: {directory}")

    files_dict = {}

    for root, _, files in os.walk(directory):
        for filename in files:
            filepath = os.path.join(root, filename)

            # Key (and pattern-matching subject) for this file.
            if use_relative_paths:
                relpath = os.path.relpath(filepath, directory)
            else:
                relpath = filepath

            # With no include patterns everything is a candidate.
            if include_patterns and not any(
                fnmatch.fnmatch(relpath, pattern) for pattern in include_patterns
            ):
                continue

            if exclude_patterns and any(
                fnmatch.fnmatch(relpath, pattern) for pattern in exclude_patterns
            ):
                continue

            try:
                # The size lookup lives inside the guarded region: a dangling
                # symlink or a file removed mid-walk makes getsize() raise
                # OSError, which must skip this one file rather than abort
                # the whole crawl.
                if max_file_size and os.path.getsize(filepath) > max_file_size:
                    continue

                with open(filepath, 'r', encoding='utf-8') as f:
                    files_dict[relpath] = f.read()
            except (OSError, ValueError) as e:
                # OSError: unreadable/missing file. ValueError covers
                # UnicodeDecodeError for non-UTF-8 (e.g. binary) content.
                print(f"Warning: Could not read file {filepath}: {e}")

    return {"files": files_dict}
if __name__ == "__main__":
    # Manual smoke run: crawl the parent directory, skipping build/VCS
    # artefacts, and list every path that was collected.
    print("--- Crawling parent directory ('..') ---")
    skip_patterns = {"*.pyc", "__pycache__/*", ".git/*", "output/*"}
    crawled = crawl_local_files("..", exclude_patterns=skip_patterns)
    collected = crawled["files"]
    print(f"Found {len(collected)} files:")
    for found_path in collected:
        print(f" {found_path}")