Mirror of https://github.com/aljazceru/Auto-GPT.git, synced 2026-01-04 23:04:28 +01:00

Commit: Merge remote-tracking branch 'origin/master'
@@ -1,10 +1,15 @@
 import requests
 from bs4 import BeautifulSoup
+from memory import get_memory
 from config import Config
 from llm_utils import create_chat_completion
 from urllib.parse import urlparse, urljoin
 
 cfg = Config()
+memory = get_memory(cfg)
+
+session = requests.Session()
+session.headers.update({'User-Agent': cfg.user_agent})
 
 
 # Function to check if the URL is valid
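Note on the session added above: a shared requests.Session reuses one connection pool and one set of default headers for every call in this module, so the configurable USER_AGENT is applied uniformly. A minimal standalone sketch of the pattern (the URL and agent string are illustrative, not from the repo):

    import requests

    session = requests.Session()
    session.headers.update({'User-Agent': 'my-custom-agent'})  # illustrative value
    # Both calls share the pooled connections and the default headers set above
    first = session.get('https://example.com/a')
    second = session.get('https://example.com/b')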
@@ -27,7 +32,7 @@ def check_local_file_access(url):
     return any(url.startswith(prefix) for prefix in local_prefixes)
 
 
-def get_response(url, headers=cfg.user_agent_header, timeout=10):
+def get_response(url, timeout=10):
     try:
         # Restrict access to local files
         if check_local_file_access(url):
@@ -39,7 +44,7 @@ def get_response(url, headers=cfg.user_agent_header, timeout=10):
 
         sanitized_url = sanitize_url(url)
 
-        response = requests.get(sanitized_url, headers=headers, timeout=timeout)
+        response = session.get(sanitized_url, timeout=timeout)
 
         # Check if the response contains an HTTP error
         if response.status_code >= 400:
@@ -106,7 +111,7 @@ def scrape_links(url):
     return format_hyperlinks(hyperlinks)
 
 
-def split_text(text, max_length=8192):
+def split_text(text, max_length=cfg.browse_chunk_max_length):
     """Split text into chunks of a maximum length"""
     paragraphs = text.split("\n")
     current_length = 0
@@ -133,7 +138,7 @@ def create_message(chunk, question):
     }
 
 
-def summarize_text(text, question):
+def summarize_text(url, text, question):
     """Summarize text using the LLM model"""
     if not text:
         return "Error: No text to summarize"
@@ -145,15 +150,28 @@ def summarize_text(text, question):
     chunks = list(split_text(text))
 
     for i, chunk in enumerate(chunks):
+        print(f"Adding chunk {i + 1} / {len(chunks)} to memory")
+
+        memory_to_add = f"Source: {url}\n" \
+                        f"Raw content part#{i + 1}: {chunk}"
+
+        memory.add(memory_to_add)
+
         print(f"Summarizing chunk {i + 1} / {len(chunks)}")
         messages = [create_message(chunk, question)]
 
         summary = create_chat_completion(
             model=cfg.fast_llm_model,
             messages=messages,
-            max_tokens=300,
+            max_tokens=cfg.browse_summary_max_token,
         )
         summaries.append(summary)
+        print(f"Added chunk {i + 1} summary to memory")
+
+        memory_to_add = f"Source: {url}\n" \
+                        f"Content summary part#{i + 1}: {summary}"
+
+        memory.add(memory_to_add)
 
     print(f"Summarized {len(chunks)} chunks.")
@@ -163,7 +181,7 @@ def summarize_text(text, question):
     final_summary = create_chat_completion(
         model=cfg.fast_llm_model,
         messages=messages,
-        max_tokens=300,
+        max_tokens=cfg.browse_summary_max_token,
    )
 
     return final_summary
@@ -194,7 +194,7 @@ def browse_website(url, question):
 def get_text_summary(url, question):
     """Return the results of a google search"""
     text = browse.scrape_text(url)
-    summary = browse.summarize_text(text, question)
+    summary = browse.summarize_text(url, text, question)
     return """ "Result" : """ + summary
@@ -38,11 +38,15 @@ class Config(metaclass=Singleton):
         self.continuous_mode = False
+        self.continuous_limit = 0
         self.speak_mode = False
+        self.skip_reprompt = False
 
+        self.ai_settings_file = os.getenv("AI_SETTINGS_FILE", "ai_settings.yaml")
         self.fast_llm_model = os.getenv("FAST_LLM_MODEL", "gpt-3.5-turbo")
         self.smart_llm_model = os.getenv("SMART_LLM_MODEL", "gpt-4")
         self.fast_token_limit = int(os.getenv("FAST_TOKEN_LIMIT", 4000))
         self.smart_token_limit = int(os.getenv("SMART_TOKEN_LIMIT", 8000))
+        self.browse_chunk_max_length = int(os.getenv("BROWSE_CHUNK_MAX_LENGTH", 8192))
+        self.browse_summary_max_token = int(os.getenv("BROWSE_SUMMARY_MAX_TOKEN", 300))
 
         self.openai_api_key = os.getenv("OPENAI_API_KEY")
         self.temperature = float(os.getenv("TEMPERATURE", "1"))
@@ -76,7 +80,7 @@ class Config(metaclass=Singleton):
 
         # User agent headers to use when browsing web
         # Some websites might just completely deny request with an error code if no user agent was found.
-        self.user_agent_header = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"}
+        self.user_agent = os.getenv("USER_AGENT", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36")
         self.redis_host = os.getenv("REDIS_HOST", "localhost")
         self.redis_port = os.getenv("REDIS_PORT", "6379")
         self.redis_password = os.getenv("REDIS_PASSWORD", "")
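The new Config fields map straight onto environment variables. A hedged sketch of the corresponding .env entries (names taken from the getenv calls above; values are the defaults shown in the diff):

    AI_SETTINGS_FILE=ai_settings.yaml
    BROWSE_CHUNK_MAX_LENGTH=8192
    BROWSE_SUMMARY_MAX_TOKEN=300
    USER_AGENT=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36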
@@ -157,6 +161,14 @@ class Config(metaclass=Singleton):
         """Set the smart token limit value."""
         self.smart_token_limit = value
 
+    def set_browse_chunk_max_length(self, value: int):
+        """Set the browse_website command chunk max length value."""
+        self.browse_chunk_max_length = value
+
+    def set_browse_summary_max_token(self, value: int):
+        """Set the browse_website command summary max token value."""
+        self.browse_summary_max_token = value
+
     def set_openai_api_key(self, value: str):
         """Set the OpenAI API key value."""
         self.openai_api_key = value
scripts/data_ingestion.py (new file, 70 lines)
@@ -0,0 +1,70 @@
+import argparse
+import logging
+from config import Config
+from memory import get_memory
+from file_operations import ingest_file, search_files
+
+cfg = Config()
+
+
+def configure_logging():
+    logging.basicConfig(filename='log-ingestion.txt',
+                        filemode='a',
+                        format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s',
+                        datefmt='%H:%M:%S',
+                        level=logging.DEBUG)
+    return logging.getLogger('AutoGPT-Ingestion')
+
+
+def ingest_directory(directory, memory, args):
+    """
+    Ingest all files in a directory by calling the ingest_file function for each file.
+
+    :param directory: The directory containing the files to ingest
+    :param memory: An object with an add() method to store the chunks in memory
+    """
+    try:
+        files = search_files(directory)
+        for file in files:
+            ingest_file(file, memory, args.max_length, args.overlap)
+    except Exception as e:
+        print(f"Error while ingesting directory '{directory}': {str(e)}")
+
+
+def main():
+    logger = configure_logging()
+
+    parser = argparse.ArgumentParser(description="Ingest a file or a directory with multiple files into memory. Make sure to set your .env before running this script.")
+    group = parser.add_mutually_exclusive_group(required=True)
+    group.add_argument("--file", type=str, help="The file to ingest.")
+    group.add_argument("--dir", type=str, help="The directory containing the files to ingest.")
+    parser.add_argument("--init", action='store_true', help="Init the memory and wipe its content (default: False)", default=False)
+    parser.add_argument("--overlap", type=int, help="The overlap size between chunks when ingesting files (default: 200)", default=200)
+    parser.add_argument("--max_length", type=int, help="The max_length of each chunk when ingesting files (default: 4000)", default=4000)
+
+    args = parser.parse_args()
+
+    # Initialize memory
+    memory = get_memory(cfg, init=args.init)
+    print('Using memory of type: ' + memory.__class__.__name__)
+
+    if args.file:
+        try:
+            ingest_file(args.file, memory, args.max_length, args.overlap)
+            print(f"File '{args.file}' ingested successfully.")
+        except Exception as e:
+            logger.error(f"Error while ingesting file '{args.file}': {str(e)}")
+            print(f"Error while ingesting file '{args.file}': {str(e)}")
+    elif args.dir:
+        try:
+            ingest_directory(args.dir, memory, args)
+            print(f"Directory '{args.dir}' ingested successfully.")
+        except Exception as e:
+            logger.error(f"Error while ingesting directory '{args.dir}': {str(e)}")
+            print(f"Error while ingesting directory '{args.dir}': {str(e)}")
+    else:
+        print("Please provide either a file path (--file) or a directory name (--dir) inside the auto_gpt_workspace directory as input.")
+
+
+if __name__ == "__main__":
+    main()
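Assuming the script is invoked from the scripts directory with a populated .env, usage looks roughly like this (the paths are illustrative):

    python data_ingestion.py --dir DataFolder --init --overlap 100 --max_length 2000
    python data_ingestion.py --file auto_gpt_workspace/notes.txt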
@@ -19,53 +19,60 @@ def execute_python_file(file):
     if not os.path.isfile(file_path):
         return f"Error: File '{file}' does not exist."
 
-    try:
-        client = docker.from_env()
-
-        image_name = 'python:3.10'
-        try:
-            client.images.get(image_name)
-            print(f"Image '{image_name}' found locally")
-        except docker.errors.ImageNotFound:
-            print(f"Image '{image_name}' not found locally, pulling from Docker Hub")
-            # Use the low-level API to stream the pull response
-            low_level_client = docker.APIClient()
-            for line in low_level_client.pull(image_name, stream=True, decode=True):
-                # Print the status and progress, if available
-                status = line.get('status')
-                progress = line.get('progress')
-                if status and progress:
-                    print(f"{status}: {progress}")
-                elif status:
-                    print(status)
-
-        # You can replace 'python:3.8' with the desired Python image/version
-        # You can find available Python images on Docker Hub:
-        # https://hub.docker.com/_/python
-        container = client.containers.run(
-            image_name,
-            f'python {file}',
-            volumes={
-                os.path.abspath(WORKSPACE_FOLDER): {
-                    'bind': '/workspace',
-                    'mode': 'ro'}},
-            working_dir='/workspace',
-            stderr=True,
-            stdout=True,
-            detach=True,
-        )
-
-        output = container.wait()
-        logs = container.logs().decode('utf-8')
-        container.remove()
-
-        # print(f"Execution complete. Output: {output}")
-        # print(f"Logs: {logs}")
-
-        return logs
-
-    except Exception as e:
-        return f"Error: {str(e)}"
+    if we_are_running_in_a_docker_container():
+        result = subprocess.run(f'python {file_path}', capture_output=True, encoding="utf8", shell=True)
+        if result.returncode == 0:
+            return result.stdout
+        else:
+            return f"Error: {result.stderr}"
+    else:
+        try:
+            client = docker.from_env()
+
+            image_name = 'python:3.10'
+            try:
+                client.images.get(image_name)
+                print(f"Image '{image_name}' found locally")
+            except docker.errors.ImageNotFound:
+                print(f"Image '{image_name}' not found locally, pulling from Docker Hub")
+                # Use the low-level API to stream the pull response
+                low_level_client = docker.APIClient()
+                for line in low_level_client.pull(image_name, stream=True, decode=True):
+                    # Print the status and progress, if available
+                    status = line.get('status')
+                    progress = line.get('progress')
+                    if status and progress:
+                        print(f"{status}: {progress}")
+                    elif status:
+                        print(status)
+
+            # You can replace 'python:3.8' with the desired Python image/version
+            # You can find available Python images on Docker Hub:
+            # https://hub.docker.com/_/python
+            container = client.containers.run(
+                image_name,
+                f'python {file}',
+                volumes={
+                    os.path.abspath(WORKSPACE_FOLDER): {
+                        'bind': '/workspace',
+                        'mode': 'ro'}},
+                working_dir='/workspace',
+                stderr=True,
+                stdout=True,
+                detach=True,
+            )
+
+            output = container.wait()
+            logs = container.logs().decode('utf-8')
+            container.remove()
+
+            # print(f"Execution complete. Output: {output}")
+            # print(f"Logs: {logs}")
+
+            return logs
+
+        except Exception as e:
+            return f"Error: {str(e)}"
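The detach=True / wait() / logs() / remove() sequence kept in the new branch is the usual docker-py recipe for capturing the output of a one-shot container. A minimal standalone sketch (not from the repo):

    import docker

    client = docker.from_env()
    container = client.containers.run('python:3.10', 'python -c "print(42)"',
                                      detach=True)
    container.wait()                          # block until the process exits
    print(container.logs().decode('utf-8'))   # "42"
    container.remove()                        # clean up the stopped container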
@@ -86,3 +93,7 @@ def execute_shell(command_line):
     os.chdir(current_dir)
 
     return output
+
+
+def we_are_running_in_a_docker_container():
+    return os.path.exists('/.dockerenv')
@@ -20,6 +20,29 @@ def safe_join(base, *paths):
     return norm_new_path
 
 
+def split_file(content, max_length=4000, overlap=0):
+    """
+    Split text into chunks of a specified maximum length with a specified overlap
+    between chunks.
+
+    :param content: The input text to be split into chunks
+    :param max_length: The maximum length of each chunk, default is 4000 (about 1k token)
+    :param overlap: The number of overlapping characters between chunks, default is no overlap
+    :return: A generator yielding chunks of text
+    """
+    start = 0
+    content_length = len(content)
+
+    while start < content_length:
+        end = start + max_length
+        if end + overlap < content_length:
+            chunk = content[start:end + overlap]
+        else:
+            chunk = content[start:content_length]
+        yield chunk
+        start += max_length - overlap
+
+
 def read_file(filename):
     """Read a file and return the contents"""
     try:
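A quick worked example of the chunking arithmetic above: because end is extended by overlap while start only advances by max_length - overlap, consecutive chunks actually share up to 2 * overlap characters.

    text = "abcdefghijklmnopqrstuvwxy"            # 25 characters
    list(split_file(text, max_length=10, overlap=3))
    # -> ['abcdefghijklm', 'hijklmnopqrst', 'opqrstuvwxy', 'vwxy']
    #    i.e. content[0:13], content[7:20], content[14:25], content[21:25]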
@@ -31,6 +54,37 @@ def read_file(filename):
         return "Error: " + str(e)
 
 
+def ingest_file(filename, memory, max_length=4000, overlap=200):
+    """
+    Ingest a file by reading its content, splitting it into chunks with a specified
+    maximum length and overlap, and adding the chunks to the memory storage.
+
+    :param filename: The name of the file to ingest
+    :param memory: An object with an add() method to store the chunks in memory
+    :param max_length: The maximum length of each chunk, default is 4000
+    :param overlap: The number of overlapping characters between chunks, default is 200
+    """
+    try:
+        print(f"Working with file {filename}")
+        content = read_file(filename)
+        content_length = len(content)
+        print(f"File length: {content_length} characters")
+
+        chunks = list(split_file(content, max_length=max_length, overlap=overlap))
+
+        num_chunks = len(chunks)
+        for i, chunk in enumerate(chunks):
+            print(f"Ingesting chunk {i + 1} / {num_chunks} into memory")
+            memory_to_add = f"Filename: {filename}\n" \
+                            f"Content part#{i + 1}/{num_chunks}: {chunk}"
+
+            memory.add(memory_to_add)
+
+        print(f"Done ingesting {num_chunks} chunks from {filename}.")
+    except Exception as e:
+        print(f"Error while ingesting file '{filename}': {str(e)}")
+
+
 def write_to_file(filename, text):
     """Write text to a file"""
     try:
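A hedged sketch of calling ingest_file directly, mirroring what data_ingestion.py does (the file path is hypothetical):

    from config import Config
    from file_operations import ingest_file
    from memory import get_memory

    memory = get_memory(Config(), init=False)
    ingest_file('auto_gpt_workspace/notes.txt', memory, max_length=4000, overlap=200)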
@@ -131,8 +131,12 @@ def print_assistant_thoughts(assistant_reply):
 def construct_prompt():
     """Construct the prompt for the AI to respond to"""
-    config = AIConfig.load()
-    if config.ai_name:
+    config = AIConfig.load(cfg.ai_settings_file)
+    if cfg.skip_reprompt and config.ai_name:
+        logger.typewriter_log("Name :", Fore.GREEN, config.ai_name)
+        logger.typewriter_log("Role :", Fore.GREEN, config.ai_role)
+        logger.typewriter_log("Goals:", Fore.GREEN, config.ai_goals)
+    elif config.ai_name:
         logger.typewriter_log(
             f"Welcome back! ",
             Fore.GREEN,
@@ -220,13 +224,15 @@ def parse_arguments():
     cfg.set_speak_mode(False)
 
     parser = argparse.ArgumentParser(description='Process arguments.')
-    parser.add_argument('--continuous', action='store_true', help='Enable Continuous Mode')
+    parser.add_argument('--continuous', '-c', action='store_true', help='Enable Continuous Mode')
+    parser.add_argument('--continuous-limit', '-l', type=int, dest="continuous_limit", help='Defines the number of times to run in continuous mode')
     parser.add_argument('--speak', action='store_true', help='Enable Speak Mode')
     parser.add_argument('--debug', action='store_true', help='Enable Debug Mode')
     parser.add_argument('--gpt3only', action='store_true', help='Enable GPT3.5 Only Mode')
     parser.add_argument('--gpt4only', action='store_true', help='Enable GPT4 Only Mode')
     parser.add_argument('--use-memory', '-m', dest="memory_type", help='Defines which Memory backend to use')
+    parser.add_argument('--skip-reprompt', '-y', dest='skip_reprompt', action='store_true', help='Skips the re-prompting messages at the beginning of the script')
+    parser.add_argument('--ai-settings', '-C', dest='ai_settings_file', help="Specifies which ai_settings.yaml file to use, will also automatically skip the re-prompt.")
     args = parser.parse_args()
 
     if args.debug:
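Putting the new flags together, an invocation might look roughly like this (assuming the usual scripts/main.py entry point; the settings filename is illustrative):

    python scripts/main.py --continuous -l 5 --ai-settings my_settings.yaml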
@@ -273,6 +279,24 @@ def parse_arguments():
     else:
         cfg.memory_backend = chosen
 
+    if args.skip_reprompt:
+        logger.typewriter_log("Skip Re-prompt: ", Fore.GREEN, "ENABLED")
+        cfg.skip_reprompt = True
+
+    if args.ai_settings_file:
+        file = args.ai_settings_file
+
+        # Validate file
+        (validated, message) = utils.validate_yaml_file(file)
+        if not validated:
+            logger.typewriter_log("FAILED FILE VALIDATION", Fore.RED, message)
+            logger.double_check()
+            exit(1)
+
+        logger.typewriter_log("Using AI Settings File:", Fore.GREEN, file)
+        cfg.ai_settings_file = file
+        cfg.skip_reprompt = True
+
 
 def main():
     global ai_name, memory
@@ -3,7 +3,7 @@ from memory.no_memory import NoMemory
 
 # List of supported memory backends
 # Add a backend to this list if the import attempt is successful
-supported_memory = ['local']
+supported_memory = ['local', 'no_memory']
 
 try:
     from memory.redismem import RedisMemory
@@ -1,3 +1,7 @@
+import yaml
+from colorama import Fore
+
+
 def clean_input(prompt: str=''):
     try:
         return input(prompt)
@@ -6,3 +10,14 @@ def clean_input(prompt: str=''):
         print("Quitting...")
         exit(0)
 
+
+def validate_yaml_file(file: str):
+    try:
+        with open(file) as file:
+            yaml.load(file, Loader=yaml.FullLoader)
+    except FileNotFoundError:
+        return (False, f"The file {Fore.CYAN}`{file}`{Fore.RESET} wasn't found")
+    except yaml.YAMLError as e:
+        return (False, f"There was an issue while trying to read your AI Settings file: {e}")
+
+    return (True, f"Successfully validated {Fore.CYAN}`{file}`{Fore.RESET}!")
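A hedged sketch of how parse_arguments consumes the validator above (the filename is illustrative):

    validated, message = validate_yaml_file("ai_settings.yaml")
    if not validated:
        print(message)   # e.g. "The file `ai_settings.yaml` wasn't found"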