mirror of
https://github.com/aljazceru/Auto-GPT.git
synced 2025-12-22 00:14:23 +01:00
Add file ingestion methods in file_operations.py
Add the following functions to ingest data into memory before Auto-GPT runs. - split_file: given some content, split it into chunks of max_length with (or without) a specified overlap - ingest_file: read a file, use split_file to split it into chunks and load each chunk into memory - ingest_directory: ingest all files in a directory into memory
This commit is contained in:
@@ -1,5 +1,10 @@
|
|||||||
import os
|
import os
|
||||||
import os.path
|
import os.path
|
||||||
|
from config import Config
|
||||||
|
from memory import get_memory
|
||||||
|
|
||||||
|
cfg = Config()
|
||||||
|
memory = get_memory(cfg)
|
||||||
|
|
||||||
# Set a dedicated folder for file I/O
|
# Set a dedicated folder for file I/O
|
||||||
working_directory = "auto_gpt_workspace"
|
working_directory = "auto_gpt_workspace"
|
||||||
@@ -20,6 +25,30 @@ def safe_join(base, *paths):
|
|||||||
return norm_new_path
|
return norm_new_path
|
||||||
|
|
||||||
|
|
||||||
|
def split_file(content, max_length=4000, overlap=0):
    """
    Split text into chunks of a specified maximum length with a specified overlap
    between chunks.

    Consecutive chunks start ``max_length - overlap`` characters apart, so each
    chunk begins with the last ``overlap`` characters of the previous one. The
    final chunk may be shorter than ``max_length``. Every character of
    ``content`` appears in at least one chunk; empty content yields nothing.

    :param content: The input text to be split into chunks
    :param max_length: The maximum length of each chunk, default is 4000 (about 1k token)
    :param overlap: The number of overlapping characters between chunks, default is no overlap
    :return: A generator yielding chunks of text
    :raises ValueError: If max_length is not positive, or if overlap is negative
        or not smaller than max_length (raised when the generator is first advanced)
    """
    if max_length <= 0:
        raise ValueError("max_length must be a positive integer")
    # An overlap >= max_length would never advance the window (endless loop).
    if not 0 <= overlap < max_length:
        raise ValueError("overlap must satisfy 0 <= overlap < max_length")

    step = max_length - overlap
    content_length = len(content)
    start = 0

    while start < content_length:
        end = start + max_length
        yield content[start:end]
        # Stop once the window has reached the end of the text; stepping again
        # would only re-emit characters already covered by this chunk.
        if end >= content_length:
            break
        start += step
|
||||||
|
|
||||||
|
|
||||||
def read_file(filename):
|
def read_file(filename):
|
||||||
"""Read a file and return the contents"""
|
"""Read a file and return the contents"""
|
||||||
try:
|
try:
|
||||||
@@ -31,6 +60,52 @@ def read_file(filename):
|
|||||||
return "Error: " + str(e)
|
return "Error: " + str(e)
|
||||||
|
|
||||||
|
|
||||||
|
def ingest_file(filename, memory, max_length=4000, overlap=200):
    """
    Read a file, cut its content into overlapping chunks and store every chunk
    in the given memory object, printing progress along the way.

    Any exception raised during the process is caught and reported with a
    printed message; nothing is re-raised and nothing is returned.

    NOTE(review): read_file appears to return an "Error: ..." string instead of
    raising when reading fails (only part of it is visible here) -- if so, that
    error text would be ingested as if it were file content. Confirm.

    :param filename: The name of the file to ingest
    :param memory: An object with an add() method to store the chunks in memory
    :param max_length: The maximum length of each chunk, default is 4000
    :param overlap: The number of overlapping characters between chunks, default is 200
    """
    try:
        print(f"Working with file {filename}")
        text = read_file(filename)
        print(f"File length: {len(text)} characters")

        # Materialise the generator so the total count is known up front.
        pieces = list(split_file(text, max_length=max_length, overlap=overlap))
        total = len(pieces)

        for part_number, piece in enumerate(pieces, start=1):
            print(f"Ingesting chunk {part_number} / {total} into memory")
            header = f"Filename: {filename}\n"
            body = f"Content part#{part_number}/{total}: {piece}"
            memory.add(header + body)

        print(f"Done ingesting {total} chunks from {filename}.")
    except Exception as err:
        print(f"Error while ingesting file '{filename}': {str(err)}")
|
||||||
|
|
||||||
|
|
||||||
|
def ingest_directory(directory, memory):
    """
    Ingest every file reported by search_files(directory), handing each one to
    ingest_file with its default chunk length and overlap.

    Any exception raised while listing or ingesting is caught and reported with
    a printed message; nothing is re-raised and nothing is returned.

    :param directory: The directory containing the files to ingest
    :param memory: An object with an add() method to store the chunks in memory
    """
    try:
        for found_file in search_files(directory):
            ingest_file(found_file, memory)
    except Exception as err:
        print(f"Error while ingesting directory '{directory}': {str(err)}")
|
||||||
|
|
||||||
|
|
||||||
def write_to_file(filename, text):
|
def write_to_file(filename, text):
|
||||||
"""Write text to a file"""
|
"""Write text to a file"""
|
||||||
try:
|
try:
|
||||||
|
|||||||
Reference in New Issue
Block a user