Merge remote-tracking branch 'origin/master'

Author: Jedakiah
Date:   2023-04-14 20:18:04 +02:00
12 changed files with 321 additions and 53 deletions

scripts/browse.py

@@ -1,10 +1,15 @@
 import requests
 from bs4 import BeautifulSoup
+from memory import get_memory
 from config import Config
 from llm_utils import create_chat_completion
 from urllib.parse import urlparse, urljoin
 
 cfg = Config()
+memory = get_memory(cfg)
+
+session = requests.Session()
+session.headers.update({'User-Agent': cfg.user_agent})
 
 # Function to check if the URL is valid
@@ -27,7 +32,7 @@ def check_local_file_access(url):
return any(url.startswith(prefix) for prefix in local_prefixes)
-def get_response(url, headers=cfg.user_agent_header, timeout=10):
+def get_response(url, timeout=10):
try:
# Restrict access to local files
if check_local_file_access(url):
@@ -39,7 +44,7 @@ def get_response(url, headers=cfg.user_agent_header, timeout=10):
sanitized_url = sanitize_url(url)
-response = requests.get(sanitized_url, headers=headers, timeout=timeout)
+response = session.get(sanitized_url, timeout=timeout)
# Check if the response contains an HTTP error
if response.status_code >= 400:
@@ -106,7 +111,7 @@ def scrape_links(url):
return format_hyperlinks(hyperlinks)
-def split_text(text, max_length=8192):
+def split_text(text, max_length=cfg.browse_chunk_max_length):
"""Split text into chunks of a maximum length"""
paragraphs = text.split("\n")
current_length = 0
@@ -133,7 +138,7 @@ def create_message(chunk, question):
}
-def summarize_text(text, question):
+def summarize_text(url, text, question):
"""Summarize text using the LLM model"""
if not text:
return "Error: No text to summarize"
@@ -145,15 +150,28 @@ def summarize_text(text, question):
chunks = list(split_text(text))
for i, chunk in enumerate(chunks):
print(f"Adding chunk {i + 1} / {len(chunks)} to memory")
memory_to_add = f"Source: {url}\n" \
f"Raw content part#{i + 1}: {chunk}"
memory.add(memory_to_add)
print(f"Summarizing chunk {i + 1} / {len(chunks)}")
messages = [create_message(chunk, question)]
summary = create_chat_completion(
model=cfg.fast_llm_model,
messages=messages,
-max_tokens=300,
+max_tokens=cfg.browse_summary_max_token,
)
summaries.append(summary)
print(f"Added chunk {i + 1} summary to memory")
memory_to_add = f"Source: {url}\n" \
f"Content summary part#{i + 1}: {summary}"
memory.add(memory_to_add)
print(f"Summarized {len(chunks)} chunks.")
@@ -163,7 +181,7 @@ def summarize_text(text, question):
final_summary = create_chat_completion(
model=cfg.fast_llm_model,
messages=messages,
-max_tokens=300,
+max_tokens=cfg.browse_summary_max_token,
)
return final_summary
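
Two things change in this file: a single requests.Session now carries the User-Agent header for every request (so get_response no longer takes a headers parameter), and summarize_text stores each raw chunk and each chunk summary in the memory backend, tagged with its source URL, before producing the final summary. A minimal standalone sketch of the shared-session pattern (the agent string here is a placeholder; the real code uses cfg.user_agent):

import requests

session = requests.Session()
session.headers.update({'User-Agent': 'AutoGPT/0.1'})  # placeholder agent string

# Every request made through the session sends the header automatically:
response = session.get('https://example.com', timeout=10)
print(response.status_code)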

scripts/commands.py

@@ -194,7 +194,7 @@ def browse_website(url, question):
def get_text_summary(url, question):
"""Return the results of a google search"""
text = browse.scrape_text(url)
-summary = browse.summarize_text(text, question)
+summary = browse.summarize_text(url, text, question)
return """ "Result" : """ + summary

scripts/config.py

@@ -38,11 +38,15 @@ class Config(metaclass=Singleton):
self.continuous_mode = False
self.continuous_limit = 0
self.speak_mode = False
+self.skip_reprompt = False
+self.ai_settings_file = os.getenv("AI_SETTINGS_FILE", "ai_settings.yaml")
self.fast_llm_model = os.getenv("FAST_LLM_MODEL", "gpt-3.5-turbo")
self.smart_llm_model = os.getenv("SMART_LLM_MODEL", "gpt-4")
self.fast_token_limit = int(os.getenv("FAST_TOKEN_LIMIT", 4000))
self.smart_token_limit = int(os.getenv("SMART_TOKEN_LIMIT", 8000))
+self.browse_chunk_max_length = int(os.getenv("BROWSE_CHUNK_MAX_LENGTH", 8192))
+self.browse_summary_max_token = int(os.getenv("BROWSE_SUMMARY_MAX_TOKEN", 300))
self.openai_api_key = os.getenv("OPENAI_API_KEY")
self.temperature = float(os.getenv("TEMPERATURE", "1"))
@@ -76,7 +80,7 @@ class Config(metaclass=Singleton):
# User agent headers to use when browsing web
# Some websites might just completely deny request with an error code if no user agent was found.
-self.user_agent_header = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"}
+self.user_agent = os.getenv("USER_AGENT", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36")
self.redis_host = os.getenv("REDIS_HOST", "localhost")
self.redis_port = os.getenv("REDIS_PORT", "6379")
self.redis_password = os.getenv("REDIS_PASSWORD", "")
@@ -157,6 +161,14 @@ class Config(metaclass=Singleton):
"""Set the smart token limit value."""
self.smart_token_limit = value
+    def set_browse_chunk_max_length(self, value: int):
+        """Set the browse_website command chunk max length value."""
+        self.browse_chunk_max_length = value
+
+    def set_browse_summary_max_token(self, value: int):
+        """Set the browse_website command summary max token value."""
+        self.browse_summary_max_token = value
def set_openai_api_key(self, value: str):
"""Set the OpenAI API key value."""
self.openai_api_key = value
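
The two new limits follow the file's existing pattern: an environment variable with a hard-coded fallback, cast to int. Assuming the project's usual .env-based configuration (the values below are illustrative, not the defaults):

# .env
BROWSE_CHUNK_MAX_LENGTH=4096
BROWSE_SUMMARY_MAX_TOKEN=150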

scripts/data_ingestion.py (new file, 70 lines)

@@ -0,0 +1,70 @@
+import argparse
+import logging
+
+from config import Config
+from memory import get_memory
+from file_operations import ingest_file, search_files
+
+cfg = Config()
+
+
+def configure_logging():
+    logging.basicConfig(filename='log-ingestion.txt',
+                        filemode='a',
+                        format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s',
+                        datefmt='%H:%M:%S',
+                        level=logging.DEBUG)
+    return logging.getLogger('AutoGPT-Ingestion')
+
+
+def ingest_directory(directory, memory, args):
+    """
+    Ingest all files in a directory by calling the ingest_file function for each file.
+
+    :param directory: The directory containing the files to ingest
+    :param memory: An object with an add() method to store the chunks in memory
+    """
+    try:
+        files = search_files(directory)
+        for file in files:
+            ingest_file(file, memory, args.max_length, args.overlap)
+    except Exception as e:
+        print(f"Error while ingesting directory '{directory}': {str(e)}")
+
+
+def main():
+    logger = configure_logging()
+
+    parser = argparse.ArgumentParser(description="Ingest a file or a directory with multiple files into memory. Make sure to set your .env before running this script.")
+    group = parser.add_mutually_exclusive_group(required=True)
+    group.add_argument("--file", type=str, help="The file to ingest.")
+    group.add_argument("--dir", type=str, help="The directory containing the files to ingest.")
+    parser.add_argument("--init", action='store_true', help="Init the memory and wipe its content (default: False)", default=False)
+    parser.add_argument("--overlap", type=int, help="The overlap size between chunks when ingesting files (default: 200)", default=200)
+    parser.add_argument("--max_length", type=int, help="The max_length of each chunk when ingesting files (default: 4000)", default=4000)
+    args = parser.parse_args()
+
+    # Initialize memory
+    memory = get_memory(cfg, init=args.init)
+    print('Using memory of type: ' + memory.__class__.__name__)
+
+    if args.file:
+        try:
+            ingest_file(args.file, memory, args.max_length, args.overlap)
+            print(f"File '{args.file}' ingested successfully.")
+        except Exception as e:
+            logger.error(f"Error while ingesting file '{args.file}': {str(e)}")
+            print(f"Error while ingesting file '{args.file}': {str(e)}")
+    elif args.dir:
+        try:
+            ingest_directory(args.dir, memory, args)
+            print(f"Directory '{args.dir}' ingested successfully.")
+        except Exception as e:
+            logger.error(f"Error while ingesting directory '{args.dir}': {str(e)}")
+            print(f"Error while ingesting directory '{args.dir}': {str(e)}")
+    else:
+        print("Please provide either a file path (--file) or a directory name (--dir) inside the auto_gpt_workspace directory as input.")
+
+
+if __name__ == "__main__":
+    main()
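
For reference, a couple of illustrative invocations of the new script (the file and directory names are made up; per the help text they should live inside the workspace and .env must be configured first):

python scripts/data_ingestion.py --file notes.txt --max_length 2000 --overlap 100
python scripts/data_ingestion.py --dir DataFolder --init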

scripts/execute_code.py

@@ -19,53 +19,60 @@ def execute_python_file(file):
 if not os.path.isfile(file_path):
     return f"Error: File '{file}' does not exist."
 
 try:
-    client = docker.from_env()
-
-    image_name = 'python:3.10'
-    try:
-        client.images.get(image_name)
-        print(f"Image '{image_name}' found locally")
-    except docker.errors.ImageNotFound:
-        print(f"Image '{image_name}' not found locally, pulling from Docker Hub")
-        # Use the low-level API to stream the pull response
-        low_level_client = docker.APIClient()
-        for line in low_level_client.pull(image_name, stream=True, decode=True):
-            # Print the status and progress, if available
-            status = line.get('status')
-            progress = line.get('progress')
-            if status and progress:
-                print(f"{status}: {progress}")
-            elif status:
-                print(status)
-
-    # You can replace 'python:3.8' with the desired Python image/version
-    # You can find available Python images on Docker Hub:
-    # https://hub.docker.com/_/python
-    container = client.containers.run(
-        image_name,
-        f'python {file}',
-        volumes={
-            os.path.abspath(WORKSPACE_FOLDER): {
-                'bind': '/workspace',
-                'mode': 'ro'}},
-        working_dir='/workspace',
-        stderr=True,
-        stdout=True,
-        detach=True,
-    )
-
-    output = container.wait()
-    logs = container.logs().decode('utf-8')
-    container.remove()
-
-    # print(f"Execution complete. Output: {output}")
-    # print(f"Logs: {logs}")
-
-    return logs
+    if we_are_running_in_a_docker_container():
+        result = subprocess.run(f'python {file_path}', capture_output=True, encoding="utf8", shell=True)
+        if result.returncode == 0:
+            return result.stdout
+        else:
+            return f"Error: {result.stderr}"
+    else:
+        client = docker.from_env()
+
+        image_name = 'python:3.10'
+        try:
+            client.images.get(image_name)
+            print(f"Image '{image_name}' found locally")
+        except docker.errors.ImageNotFound:
+            print(f"Image '{image_name}' not found locally, pulling from Docker Hub")
+            # Use the low-level API to stream the pull response
+            low_level_client = docker.APIClient()
+            for line in low_level_client.pull(image_name, stream=True, decode=True):
+                # Print the status and progress, if available
+                status = line.get('status')
+                progress = line.get('progress')
+                if status and progress:
+                    print(f"{status}: {progress}")
+                elif status:
+                    print(status)
+
+        # You can replace 'python:3.8' with the desired Python image/version
+        # You can find available Python images on Docker Hub:
+        # https://hub.docker.com/_/python
+        container = client.containers.run(
+            image_name,
+            f'python {file}',
+            volumes={
+                os.path.abspath(WORKSPACE_FOLDER): {
+                    'bind': '/workspace',
+                    'mode': 'ro'}},
+            working_dir='/workspace',
+            stderr=True,
+            stdout=True,
+            detach=True,
+        )
+
+        output = container.wait()
+        logs = container.logs().decode('utf-8')
+        container.remove()
+
+        # print(f"Execution complete. Output: {output}")
+        # print(f"Logs: {logs}")
+
+        return logs
 
 except Exception as e:
     return f"Error: {str(e)}"
 
 
 def execute_shell(command_line):
@@ -86,3 +93,7 @@ def execute_shell(command_line):
os.chdir(current_dir)
return output
+def we_are_running_in_a_docker_container():
+    return os.path.exists('/.dockerenv')
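
A note on the guard: the Docker engine creates a /.dockerenv file at the root of every container's filesystem, so checking for it is a cheap way to detect that we are already inside a container and should run the script directly with subprocess instead of nesting another container. A quick sanity check (hypothetical interpreter session):

>>> import os
>>> os.path.exists('/.dockerenv')  # True inside a Docker container, False on the host
False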

scripts/file_operations.py

@@ -20,6 +20,29 @@ def safe_join(base, *paths):
return norm_new_path
+def split_file(content, max_length=4000, overlap=0):
+    """
+    Split text into chunks of a specified maximum length with a specified overlap
+    between chunks.
+
+    :param content: The input text to be split into chunks
+    :param max_length: The maximum length of each chunk, default is 4000 (about 1k tokens)
+    :param overlap: The number of overlapping characters between chunks, default is no overlap
+    :return: A generator yielding chunks of text
+    """
+    start = 0
+    content_length = len(content)
+
+    while start < content_length:
+        end = start + max_length
+        if end + overlap < content_length:
+            chunk = content[start:end+overlap]
+        else:
+            chunk = content[start:content_length]
+        yield chunk
+        start += max_length - overlap
def read_file(filename):
"""Read a file and return the contents"""
try:
@@ -31,6 +54,37 @@ def read_file(filename):
return "Error: " + str(e)
+def ingest_file(filename, memory, max_length=4000, overlap=200):
+    """
+    Ingest a file by reading its content, splitting it into chunks with a specified
+    maximum length and overlap, and adding the chunks to the memory storage.
+
+    :param filename: The name of the file to ingest
+    :param memory: An object with an add() method to store the chunks in memory
+    :param max_length: The maximum length of each chunk, default is 4000
+    :param overlap: The number of overlapping characters between chunks, default is 200
+    """
+    try:
+        print(f"Working with file {filename}")
+        content = read_file(filename)
+        content_length = len(content)
+        print(f"File length: {content_length} characters")
+
+        chunks = list(split_file(content, max_length=max_length, overlap=overlap))
+
+        num_chunks = len(chunks)
+        for i, chunk in enumerate(chunks):
+            print(f"Ingesting chunk {i + 1} / {num_chunks} into memory")
+            memory_to_add = f"Filename: {filename}\n" \
+                            f"Content part#{i + 1}/{num_chunks}: {chunk}"
+
+            memory.add(memory_to_add)
+
+        print(f"Done ingesting {num_chunks} chunks from {filename}.")
+    except Exception as e:
+        print(f"Error while ingesting file '{filename}': {str(e)}")
def write_to_file(filename, text):
"""Write text to a file"""
try:

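A worked example of the overlap arithmetic in split_file, small enough to check by hand (the input string is made up). Note that a chunk extends overlap characters past its nominal end and the next chunk also starts overlap characters early, so adjacent chunks share up to 2*overlap characters:

content = "abcdefghijklmn"  # 14 characters
chunks = list(split_file(content, max_length=10, overlap=3))
# start=0: end=10, end+overlap=13 < 14, so yield content[0:13]
# start=7: end=17, end+overlap=20 >= 14, so yield content[7:14]
print(chunks)  # ['abcdefghijklm', 'hijklmn']
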
scripts/main.py

@@ -131,8 +131,12 @@ def print_assistant_thoughts(assistant_reply):
def construct_prompt():
"""Construct the prompt for the AI to respond to"""
-config = AIConfig.load()
-if config.ai_name:
+config = AIConfig.load(cfg.ai_settings_file)
+if cfg.skip_reprompt and config.ai_name:
+    logger.typewriter_log("Name :", Fore.GREEN, config.ai_name)
+    logger.typewriter_log("Role :", Fore.GREEN, config.ai_role)
+    logger.typewriter_log("Goals:", Fore.GREEN, config.ai_goals)
+elif config.ai_name:
logger.typewriter_log(
f"Welcome back! ",
Fore.GREEN,
@@ -220,13 +224,15 @@ def parse_arguments():
cfg.set_speak_mode(False)
parser = argparse.ArgumentParser(description='Process arguments.')
-parser.add_argument('--continuous', action='store_true', help='Enable Continuous Mode')
+parser.add_argument('--continuous', '-c', action='store_true', help='Enable Continuous Mode')
parser.add_argument('--continuous-limit', '-l', type=int, dest="continuous_limit", help='Defines the number of times to run in continuous mode')
parser.add_argument('--speak', action='store_true', help='Enable Speak Mode')
parser.add_argument('--debug', action='store_true', help='Enable Debug Mode')
parser.add_argument('--gpt3only', action='store_true', help='Enable GPT3.5 Only Mode')
parser.add_argument('--gpt4only', action='store_true', help='Enable GPT4 Only Mode')
parser.add_argument('--use-memory', '-m', dest="memory_type", help='Defines which Memory backend to use')
+parser.add_argument('--skip-reprompt', '-y', dest='skip_reprompt', action='store_true', help='Skips the re-prompting messages at the beginning of the script')
+parser.add_argument('--ai-settings', '-C', dest='ai_settings_file', help="Specifies which ai_settings.yaml file to use, will also automatically skip the re-prompt.")
args = parser.parse_args()
if args.debug:
@@ -273,6 +279,24 @@ def parse_arguments():
else:
cfg.memory_backend = chosen
+if args.skip_reprompt:
+    logger.typewriter_log("Skip Re-prompt: ", Fore.GREEN, "ENABLED")
+    cfg.skip_reprompt = True
+
+if args.ai_settings_file:
+    file = args.ai_settings_file
+
+    # Validate file
+    (validated, message) = utils.validate_yaml_file(file)
+    if not validated:
+        logger.typewriter_log("FAILED FILE VALIDATION", Fore.RED, message)
+        logger.double_check()
+        exit(1)
+
+    logger.typewriter_log("Using AI Settings File:", Fore.GREEN, file)
+    cfg.ai_settings_file = file
+    cfg.skip_reprompt = True
def main():
global ai_name, memory

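Taken together, the new flags enable invocations like these (the YAML file name is illustrative; per the help text, --ai-settings skips the re-prompt on its own):

python scripts/main.py --continuous -l 5      # continuous mode, capped at 5 cycles
python scripts/main.py -y                     # keep the saved AI config, skip the re-prompt
python scripts/main.py --ai-settings my_project.yaml
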
scripts/memory/__init__.py

@@ -3,7 +3,7 @@ from memory.no_memory import NoMemory
# List of supported memory backends
# Add a backend to this list if the import attempt is successful
-supported_memory = ['local']
+supported_memory = ['local', 'no_memory']
try:
from memory.redismem import RedisMemory

scripts/utils.py

@@ -1,3 +1,7 @@
+import yaml
+from colorama import Fore
def clean_input(prompt: str=''):
try:
return input(prompt)
@@ -6,3 +10,14 @@ def clean_input(prompt: str=''):
print("Quitting...")
exit(0)
+def validate_yaml_file(file: str):
+    try:
+        with open(file) as file:
+            yaml.load(file, Loader=yaml.FullLoader)
+    except FileNotFoundError:
+        return (False, f"The file {Fore.CYAN}`{file}`{Fore.RESET} wasn't found")
+    except yaml.YAMLError as e:
+        return (False, f"There was an issue while trying to read your AI Settings file: {e}")
+
+    return (True, f"Successfully validated {Fore.CYAN}`{file}`{Fore.RESET}!")
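
The helper reports problems as a (validated, message) tuple instead of raising, which is the contract parse_arguments in main.py relies on above. A minimal usage sketch (the file name is illustrative):

from utils import validate_yaml_file

(validated, message) = validate_yaml_file('ai_settings.yaml')
if not validated:
    print(message)
    exit(1)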