Merge remote-tracking branch 'origin/master'

Author: Jedakiah
Date:   2023-04-14 20:18:04 +02:00
12 changed files with 321 additions and 53 deletions

scripts/browse.py

@@ -1,10 +1,15 @@
 import requests
 from bs4 import BeautifulSoup
+from memory import get_memory
 from config import Config
 from llm_utils import create_chat_completion
 from urllib.parse import urlparse, urljoin
 
 cfg = Config()
+memory = get_memory(cfg)
+
+session = requests.Session()
+session.headers.update({'User-Agent': cfg.user_agent})
 
 # Function to check if the URL is valid
@@ -27,7 +32,7 @@ def check_local_file_access(url):
return any(url.startswith(prefix) for prefix in local_prefixes)
-def get_response(url, headers=cfg.user_agent_header, timeout=10):
+def get_response(url, timeout=10):
try:
# Restrict access to local files
if check_local_file_access(url):
@@ -39,7 +44,7 @@ def get_response(url, headers=cfg.user_agent_header, timeout=10):
sanitized_url = sanitize_url(url)
-response = requests.get(sanitized_url, headers=headers, timeout=timeout)
+response = session.get(sanitized_url, timeout=timeout)
# Check if the response contains an HTTP error
if response.status_code >= 400:
@@ -106,7 +111,7 @@ def scrape_links(url):
return format_hyperlinks(hyperlinks)
-def split_text(text, max_length=8192):
+def split_text(text, max_length=cfg.browse_chunk_max_length):
"""Split text into chunks of a maximum length"""
paragraphs = text.split("\n")
current_length = 0
@@ -133,7 +138,7 @@ def create_message(chunk, question):
}
-def summarize_text(text, question):
+def summarize_text(url, text, question):
"""Summarize text using the LLM model"""
if not text:
return "Error: No text to summarize"
@@ -145,15 +150,28 @@ def summarize_text(text, question):
chunks = list(split_text(text))
for i, chunk in enumerate(chunks):
print(f"Adding chunk {i + 1} / {len(chunks)} to memory")
memory_to_add = f"Source: {url}\n" \
f"Raw content part#{i + 1}: {chunk}"
memory.add(memory_to_add)
print(f"Summarizing chunk {i + 1} / {len(chunks)}")
messages = [create_message(chunk, question)]
summary = create_chat_completion(
model=cfg.fast_llm_model,
messages=messages,
-max_tokens=300,
+max_tokens=cfg.browse_summary_max_token,
)
summaries.append(summary)
print(f"Added chunk {i + 1} summary to memory")
memory_to_add = f"Source: {url}\n" \
f"Content summary part#{i + 1}: {summary}"
memory.add(memory_to_add)
print(f"Summarized {len(chunks)} chunks.")
@@ -163,7 +181,7 @@ def summarize_text(text, question):
final_summary = create_chat_completion(
model=cfg.fast_llm_model,
messages=messages,
-max_tokens=300,
+max_tokens=cfg.browse_summary_max_token,
)
return final_summary
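
Two things change in this file: a single requests.Session now carries the User-Agent header for every request (so get_response no longer takes a headers parameter), and summarize_text stores each raw chunk and each chunk summary in the memory backend, tagged with its source URL, before producing the final summary. A minimal standalone sketch of the shared-session pattern (the agent string here is a placeholder; the real code uses cfg.user_agent):

import requests

session = requests.Session()
session.headers.update({'User-Agent': 'AutoGPT/0.1'})  # placeholder agent string

# Every request made through the session sends the header automatically:
response = session.get('https://example.com', timeout=10)
print(response.status_code)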

scripts/commands.py

@@ -194,7 +194,7 @@ def browse_website(url, question):
def get_text_summary(url, question):
"""Return the results of a google search"""
text = browse.scrape_text(url)
-summary = browse.summarize_text(text, question)
+summary = browse.summarize_text(url, text, question)
return """ "Result" : """ + summary

scripts/config.py

@@ -38,11 +38,15 @@ class Config(metaclass=Singleton):
self.continuous_mode = False
self.continuous_limit = 0
self.speak_mode = False
+self.skip_reprompt = False
+self.ai_settings_file = os.getenv("AI_SETTINGS_FILE", "ai_settings.yaml")
self.fast_llm_model = os.getenv("FAST_LLM_MODEL", "gpt-3.5-turbo")
self.smart_llm_model = os.getenv("SMART_LLM_MODEL", "gpt-4")
self.fast_token_limit = int(os.getenv("FAST_TOKEN_LIMIT", 4000))
self.smart_token_limit = int(os.getenv("SMART_TOKEN_LIMIT", 8000))
+self.browse_chunk_max_length = int(os.getenv("BROWSE_CHUNK_MAX_LENGTH", 8192))
+self.browse_summary_max_token = int(os.getenv("BROWSE_SUMMARY_MAX_TOKEN", 300))
self.openai_api_key = os.getenv("OPENAI_API_KEY")
self.temperature = float(os.getenv("TEMPERATURE", "1"))
@@ -76,7 +80,7 @@ class Config(metaclass=Singleton):
# User agent headers to use when browsing web
# Some websites might just completely deny request with an error code if no user agent was found.
-self.user_agent_header = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"}
+self.user_agent = os.getenv("USER_AGENT", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36")
self.redis_host = os.getenv("REDIS_HOST", "localhost")
self.redis_port = os.getenv("REDIS_PORT", "6379")
self.redis_password = os.getenv("REDIS_PASSWORD", "")
@@ -157,6 +161,14 @@ class Config(metaclass=Singleton):
"""Set the smart token limit value."""
self.smart_token_limit = value
+    def set_browse_chunk_max_length(self, value: int):
+        """Set the browse_website command chunk max length value."""
+        self.browse_chunk_max_length = value
+
+    def set_browse_summary_max_token(self, value: int):
+        """Set the browse_website command summary max token value."""
+        self.browse_summary_max_token = value
def set_openai_api_key(self, value: str):
"""Set the OpenAI API key value."""
self.openai_api_key = value
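
The two new limits follow the file's existing pattern: an environment variable with a hard-coded fallback, cast to int. Assuming the project's usual .env-based configuration (the values below are illustrative, not the defaults):

# .env
BROWSE_CHUNK_MAX_LENGTH=4096
BROWSE_SUMMARY_MAX_TOKEN=150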

scripts/data_ingestion.py (new file, 70 lines)

@@ -0,0 +1,70 @@
+import argparse
+import logging
+
+from config import Config
+from memory import get_memory
+from file_operations import ingest_file, search_files
+
+cfg = Config()
+
+
+def configure_logging():
+    logging.basicConfig(filename='log-ingestion.txt',
+                        filemode='a',
+                        format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s',
+                        datefmt='%H:%M:%S',
+                        level=logging.DEBUG)
+    return logging.getLogger('AutoGPT-Ingestion')
+
+
+def ingest_directory(directory, memory, args):
+    """
+    Ingest all files in a directory by calling the ingest_file function for each file.
+
+    :param directory: The directory containing the files to ingest
+    :param memory: An object with an add() method to store the chunks in memory
+    """
+    try:
+        files = search_files(directory)
+        for file in files:
+            ingest_file(file, memory, args.max_length, args.overlap)
+    except Exception as e:
+        print(f"Error while ingesting directory '{directory}': {str(e)}")
+
+
+def main():
+    logger = configure_logging()
+
+    parser = argparse.ArgumentParser(description="Ingest a file or a directory with multiple files into memory. Make sure to set your .env before running this script.")
+    group = parser.add_mutually_exclusive_group(required=True)
+    group.add_argument("--file", type=str, help="The file to ingest.")
+    group.add_argument("--dir", type=str, help="The directory containing the files to ingest.")
+    parser.add_argument("--init", action='store_true', help="Init the memory and wipe its content (default: False)", default=False)
+    parser.add_argument("--overlap", type=int, help="The overlap size between chunks when ingesting files (default: 200)", default=200)
+    parser.add_argument("--max_length", type=int, help="The max_length of each chunk when ingesting files (default: 4000)", default=4000)
+    args = parser.parse_args()
+
+    # Initialize memory
+    memory = get_memory(cfg, init=args.init)
+    print('Using memory of type: ' + memory.__class__.__name__)
+
+    if args.file:
+        try:
+            ingest_file(args.file, memory, args.max_length, args.overlap)
+            print(f"File '{args.file}' ingested successfully.")
+        except Exception as e:
+            logger.error(f"Error while ingesting file '{args.file}': {str(e)}")
+            print(f"Error while ingesting file '{args.file}': {str(e)}")
+    elif args.dir:
+        try:
+            ingest_directory(args.dir, memory, args)
+            print(f"Directory '{args.dir}' ingested successfully.")
+        except Exception as e:
+            logger.error(f"Error while ingesting directory '{args.dir}': {str(e)}")
+            print(f"Error while ingesting directory '{args.dir}': {str(e)}")
+    else:
+        print("Please provide either a file path (--file) or a directory name (--dir) inside the auto_gpt_workspace directory as input.")
+
+
+if __name__ == "__main__":
+    main()
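
For reference, a couple of illustrative invocations of the new script (the file and directory names are made up; per the help text they should live inside the workspace and .env must be configured first):

python scripts/data_ingestion.py --file notes.txt --max_length 2000 --overlap 100
python scripts/data_ingestion.py --dir DataFolder --init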

scripts/execute_code.py

@@ -19,53 +19,60 @@ def execute_python_file(file):
 if not os.path.isfile(file_path):
     return f"Error: File '{file}' does not exist."
 
 try:
-    client = docker.from_env()
-
-    image_name = 'python:3.10'
-    try:
-        client.images.get(image_name)
-        print(f"Image '{image_name}' found locally")
-    except docker.errors.ImageNotFound:
-        print(f"Image '{image_name}' not found locally, pulling from Docker Hub")
-        # Use the low-level API to stream the pull response
-        low_level_client = docker.APIClient()
-        for line in low_level_client.pull(image_name, stream=True, decode=True):
-            # Print the status and progress, if available
-            status = line.get('status')
-            progress = line.get('progress')
-            if status and progress:
-                print(f"{status}: {progress}")
-            elif status:
-                print(status)
-
-    # You can replace 'python:3.8' with the desired Python image/version
-    # You can find available Python images on Docker Hub:
-    # https://hub.docker.com/_/python
-    container = client.containers.run(
-        image_name,
-        f'python {file}',
-        volumes={
-            os.path.abspath(WORKSPACE_FOLDER): {
-                'bind': '/workspace',
-                'mode': 'ro'}},
-        working_dir='/workspace',
-        stderr=True,
-        stdout=True,
-        detach=True,
-    )
-
-    output = container.wait()
-    logs = container.logs().decode('utf-8')
-    container.remove()
-
-    # print(f"Execution complete. Output: {output}")
-    # print(f"Logs: {logs}")
-
-    return logs
+    if we_are_running_in_a_docker_container():
+        result = subprocess.run(f'python {file_path}', capture_output=True, encoding="utf8", shell=True)
+        if result.returncode == 0:
+            return result.stdout
+        else:
+            return f"Error: {result.stderr}"
+    else:
+        client = docker.from_env()
+
+        image_name = 'python:3.10'
+        try:
+            client.images.get(image_name)
+            print(f"Image '{image_name}' found locally")
+        except docker.errors.ImageNotFound:
+            print(f"Image '{image_name}' not found locally, pulling from Docker Hub")
+            # Use the low-level API to stream the pull response
+            low_level_client = docker.APIClient()
+            for line in low_level_client.pull(image_name, stream=True, decode=True):
+                # Print the status and progress, if available
+                status = line.get('status')
+                progress = line.get('progress')
+                if status and progress:
+                    print(f"{status}: {progress}")
+                elif status:
+                    print(status)
+
+        # You can replace 'python:3.8' with the desired Python image/version
+        # You can find available Python images on Docker Hub:
+        # https://hub.docker.com/_/python
+        container = client.containers.run(
+            image_name,
+            f'python {file}',
+            volumes={
+                os.path.abspath(WORKSPACE_FOLDER): {
+                    'bind': '/workspace',
+                    'mode': 'ro'}},
+            working_dir='/workspace',
+            stderr=True,
+            stdout=True,
+            detach=True,
+        )
+
+        output = container.wait()
+        logs = container.logs().decode('utf-8')
+        container.remove()
+
+        # print(f"Execution complete. Output: {output}")
+        # print(f"Logs: {logs}")
+
+        return logs
 
 except Exception as e:
     return f"Error: {str(e)}"
 
 
 def execute_shell(command_line):
@@ -86,3 +93,7 @@ def execute_shell(command_line):
os.chdir(current_dir)
return output
+def we_are_running_in_a_docker_container():
+    return os.path.exists('/.dockerenv')
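
A note on the guard: the Docker engine creates a /.dockerenv file at the root of every container's filesystem, so checking for it is a cheap way to detect that we are already inside a container and should run the script directly with subprocess instead of nesting another container. A quick sanity check (hypothetical interpreter session):

>>> import os
>>> os.path.exists('/.dockerenv')  # True inside a Docker container, False on the host
False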

scripts/file_operations.py

@@ -20,6 +20,29 @@ def safe_join(base, *paths):
return norm_new_path
+def split_file(content, max_length=4000, overlap=0):
+    """
+    Split text into chunks of a specified maximum length with a specified overlap
+    between chunks.
+
+    :param content: The input text to be split into chunks
+    :param max_length: The maximum length of each chunk, default is 4000 (about 1k tokens)
+    :param overlap: The number of overlapping characters between chunks, default is no overlap
+    :return: A generator yielding chunks of text
+    """
+    start = 0
+    content_length = len(content)
+
+    while start < content_length:
+        end = start + max_length
+        if end + overlap < content_length:
+            chunk = content[start:end+overlap]
+        else:
+            chunk = content[start:content_length]
+        yield chunk
+        start += max_length - overlap
def read_file(filename):
"""Read a file and return the contents"""
try:
@@ -31,6 +54,37 @@ def read_file(filename):
return "Error: " + str(e)
+def ingest_file(filename, memory, max_length=4000, overlap=200):
+    """
+    Ingest a file by reading its content, splitting it into chunks with a specified
+    maximum length and overlap, and adding the chunks to the memory storage.
+
+    :param filename: The name of the file to ingest
+    :param memory: An object with an add() method to store the chunks in memory
+    :param max_length: The maximum length of each chunk, default is 4000
+    :param overlap: The number of overlapping characters between chunks, default is 200
+    """
+    try:
+        print(f"Working with file {filename}")
+        content = read_file(filename)
+        content_length = len(content)
+        print(f"File length: {content_length} characters")
+
+        chunks = list(split_file(content, max_length=max_length, overlap=overlap))
+
+        num_chunks = len(chunks)
+        for i, chunk in enumerate(chunks):
+            print(f"Ingesting chunk {i + 1} / {num_chunks} into memory")
+            memory_to_add = f"Filename: {filename}\n" \
+                            f"Content part#{i + 1}/{num_chunks}: {chunk}"
+
+            memory.add(memory_to_add)
+
+        print(f"Done ingesting {num_chunks} chunks from {filename}.")
+    except Exception as e:
+        print(f"Error while ingesting file '{filename}': {str(e)}")
def write_to_file(filename, text):
"""Write text to a file"""
try:

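A worked example of the overlap arithmetic in split_file, small enough to check by hand (the input string is made up). Note that a chunk extends overlap characters past its nominal end and the next chunk also starts overlap characters early, so adjacent chunks share up to 2*overlap characters:

content = "abcdefghijklmn"  # 14 characters
chunks = list(split_file(content, max_length=10, overlap=3))
# start=0: end=10, end+overlap=13 < 14, so yield content[0:13]
# start=7: end=17, end+overlap=20 >= 14, so yield content[7:14]
print(chunks)  # ['abcdefghijklm', 'hijklmn']
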
scripts/main.py

@@ -131,8 +131,12 @@ def print_assistant_thoughts(assistant_reply):
def construct_prompt():
"""Construct the prompt for the AI to respond to"""
-config = AIConfig.load()
-if config.ai_name:
+config = AIConfig.load(cfg.ai_settings_file)
+if cfg.skip_reprompt and config.ai_name:
+    logger.typewriter_log("Name :", Fore.GREEN, config.ai_name)
+    logger.typewriter_log("Role :", Fore.GREEN, config.ai_role)
+    logger.typewriter_log("Goals:", Fore.GREEN, config.ai_goals)
+elif config.ai_name:
logger.typewriter_log(
f"Welcome back! ",
Fore.GREEN,
@@ -220,13 +224,15 @@ def parse_arguments():
cfg.set_speak_mode(False)
parser = argparse.ArgumentParser(description='Process arguments.')
-parser.add_argument('--continuous', action='store_true', help='Enable Continuous Mode')
+parser.add_argument('--continuous', '-c', action='store_true', help='Enable Continuous Mode')
parser.add_argument('--continuous-limit', '-l', type=int, dest="continuous_limit", help='Defines the number of times to run in continuous mode')
parser.add_argument('--speak', action='store_true', help='Enable Speak Mode')
parser.add_argument('--debug', action='store_true', help='Enable Debug Mode')
parser.add_argument('--gpt3only', action='store_true', help='Enable GPT3.5 Only Mode')
parser.add_argument('--gpt4only', action='store_true', help='Enable GPT4 Only Mode')
parser.add_argument('--use-memory', '-m', dest="memory_type", help='Defines which Memory backend to use')
+parser.add_argument('--skip-reprompt', '-y', dest='skip_reprompt', action='store_true', help='Skips the re-prompting messages at the beginning of the script')
+parser.add_argument('--ai-settings', '-C', dest='ai_settings_file', help="Specifies which ai_settings.yaml file to use, will also automatically skip the re-prompt.")
args = parser.parse_args()
if args.debug:
@@ -273,6 +279,24 @@ def parse_arguments():
else:
cfg.memory_backend = chosen
+if args.skip_reprompt:
+    logger.typewriter_log("Skip Re-prompt: ", Fore.GREEN, "ENABLED")
+    cfg.skip_reprompt = True
+
+if args.ai_settings_file:
+    file = args.ai_settings_file
+
+    # Validate file
+    (validated, message) = utils.validate_yaml_file(file)
+    if not validated:
+        logger.typewriter_log("FAILED FILE VALIDATION", Fore.RED, message)
+        logger.double_check()
+        exit(1)
+
+    logger.typewriter_log("Using AI Settings File:", Fore.GREEN, file)
+    cfg.ai_settings_file = file
+    cfg.skip_reprompt = True
def main():
global ai_name, memory

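Taken together, the new flags enable invocations like these (the YAML file name is illustrative; per the help text, --ai-settings skips the re-prompt on its own):

python scripts/main.py --continuous -l 5      # continuous mode, capped at 5 cycles
python scripts/main.py -y                     # keep the saved AI config, skip the re-prompt
python scripts/main.py --ai-settings my_project.yaml
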
scripts/memory/__init__.py

@@ -3,7 +3,7 @@ from memory.no_memory import NoMemory
# List of supported memory backends
# Add a backend to this list if the import attempt is successful
-supported_memory = ['local']
+supported_memory = ['local', 'no_memory']
try:
from memory.redismem import RedisMemory

scripts/utils.py

@@ -1,3 +1,7 @@
+import yaml
+from colorama import Fore
def clean_input(prompt: str=''):
try:
return input(prompt)
@@ -6,3 +10,14 @@ def clean_input(prompt: str=''):
print("Quitting...")
exit(0)
+def validate_yaml_file(file: str):
+    try:
+        with open(file) as file:
+            yaml.load(file, Loader=yaml.FullLoader)
+    except FileNotFoundError:
+        return (False, f"The file {Fore.CYAN}`{file}`{Fore.RESET} wasn't found")
+    except yaml.YAMLError as e:
+        return (False, f"There was an issue while trying to read your AI Settings file: {e}")
+
+    return (True, f"Successfully validated {Fore.CYAN}`{file}`{Fore.RESET}!")
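
The helper reports problems as a (validated, message) tuple instead of raising, which is the contract parse_arguments in main.py relies on above. A minimal usage sketch (the file name is illustrative):

from utils import validate_yaml_file

(validated, message) = validate_yaml_file('ai_settings.yaml')
if not validated:
    print(message)
    exit(1)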