Vector memory revamp (part 1: refactoring) (#4208)

Additional changes:

* Improve typing

* Modularize message history memory & fix/refactor related components

* Fix summarization

* Move memory relevance calculation to MemoryItem & improve test (relevance scoring is sketched after this list)

* Fix import warnings in web_selenium.py

* Remove `memory_add` ghost command

* Implement overlap in `split_text` (see the sketch after this list)

* Move memory tests into subdirectory

* Remove deprecated `get_ada_embedding()` and helpers

* Fix used token calculation in `chat_with_ai`

* Replace Message TypedDict by dataclass (see the sketch after this list)

* Fix AgentManager singleton issues in tests
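For reference, relevance in embedding-based memory stores is typically scored as cosine similarity between a query embedding and each stored chunk embedding. A minimal sketch of that calculation follows; it is illustrative only, not the exact MemoryItem implementation:

    # Sketch: cosine-similarity relevance scoring (illustrative, not the
    # actual MemoryItem code).
    import numpy as np

    def relevance_scores(query_embedding: np.ndarray, chunk_embeddings: np.ndarray) -> np.ndarray:
        """Cosine similarity between one query vector (d,) and a matrix of
        chunk vectors (n, d); returns one score per chunk, in [-1, 1]."""
        q = query_embedding / np.linalg.norm(query_embedding)
        c = chunk_embeddings / np.linalg.norm(chunk_embeddings, axis=1, keepdims=True)
        return c @ q

With pre-normalized embeddings this reduces to a plain dot product.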
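The overlap added to `split_text` is the standard sliding-window trick: consecutive chunks share a region so that content cut at a chunk boundary still appears whole in at least one chunk. A character-based sketch of the mechanic (the real function is token-aware and more involved):

    # Sketch: chunk splitting with overlap. Character-based for clarity;
    # parameter defaults here are arbitrary, not the project's.
    from typing import Generator

    def split_text(text: str, max_length: int = 300, overlap: int = 50) -> Generator[str, None, None]:
        """Yield chunks of at most max_length characters; consecutive chunks
        share `overlap` characters."""
        if overlap >= max_length:
            raise ValueError("overlap must be smaller than max_length")
        step = max_length - overlap
        for start in range(0, len(text), step):
            yield text[start : start + max_length]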
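Replacing the Message TypedDict with a dataclass trades a bare dict shape for a real class: attribute access, `isinstance()` checks, and methods. Roughly the shape of the change (simplified; the actual class may carry more fields):

    # Sketch: Message as a dataclass instead of a TypedDict (simplified).
    from dataclasses import dataclass
    from typing import Literal

    MessageRole = Literal["system", "user", "assistant"]

    @dataclass
    class Message:
        """A single message in a chat conversation."""
        role: MessageRole
        content: str

        def raw(self) -> dict[str, str]:
            # Plain dict form for the OpenAI chat API
            return {"role": self.role, "content": self.content}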

---------

Co-authored-by: Auto-GPT-Bot <github-bot@agpt.co>
Author: Reinier van der Leer
Date: 2023-05-25 20:31:11 +02:00
Committed by: GitHub
Parent: 10489e0df2
Commit: bfbe613960
92 changed files with 7282 additions and 7989 deletions


@@ -4,7 +4,7 @@ from __future__ import annotations
 import hashlib
 import os
 import os.path
-from typing import Dict, Generator, Literal, Tuple
+from typing import Generator, Literal
 
 import charset_normalizer
 import requests
@@ -15,6 +15,7 @@ from autogpt.commands.command import command
 from autogpt.commands.file_operations_utils import read_textual_file
 from autogpt.config import Config
 from autogpt.logs import logger
+from autogpt.memory.vector import MemoryItem, VectorMemory
 from autogpt.spinner import Spinner
 from autogpt.utils import readable_file_size
@@ -28,7 +29,9 @@ def text_checksum(text: str) -> str:
     return hashlib.md5(text.encode("utf-8")).hexdigest()
 
 
-def operations_from_log(log_path: str) -> Generator[Tuple[Operation, str, str | None]]:
+def operations_from_log(
+    log_path: str,
+) -> Generator[tuple[Operation, str, str | None], None, None]:
     """Parse the file operations log and return a tuple containing the log entries"""
     try:
         log = open(log_path, "r", encoding="utf-8")
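The new annotation above spells out all three type parameters of `Generator[YieldType, SendType, ReturnType]`: a generator that is only iterated over (never sent values, no `return` value) uses `None` for the last two. Since Python 3.9 the built-in `tuple`/`dict` generics replace `typing.Tuple`/`typing.Dict`, which is why those imports were dropped. A quick illustration:

    # Sketch: fully-parameterized Generator annotation with built-in generics.
    from typing import Generator

    def pairs(n: int) -> Generator[tuple[int, int], None, None]:
        # Yields tuples; accepts no sent values; returns nothing.
        for i in range(n):
            yield (i, i * i)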
@@ -45,6 +48,7 @@ def operations_from_log(log_path: str) -> Generator[Tuple[Operation, str, str |
             try:
                 path, checksum = (x.strip() for x in tail.rsplit(" #", maxsplit=1))
             except ValueError:
+                logger.warn(f"File log entry lacks checksum: '{line}'")
                 path, checksum = tail.strip(), None
             yield (operation, path, checksum)
         elif operation == "delete":
@@ -53,7 +57,7 @@ def operations_from_log(log_path: str) -> Generator[Tuple[Operation, str, str |
     log.close()
 
 
-def file_operations_state(log_path: str) -> Dict:
+def file_operations_state(log_path: str) -> dict[str, str]:
     """Iterates over the operations log and returns the expected state.
 
     Parses a log file at CFG.file_logger_path to construct a dictionary that maps
@@ -156,43 +160,41 @@ def read_file(filename: str) -> str:
     """
     try:
         content = read_textual_file(filename, logger)
+
+        # TODO: invalidate/update memory when file is edited
+        file_memory = MemoryItem.from_text_file(content, filename)
+        if len(file_memory.chunks) > 1:
+            return file_memory.summary
+
         return content
     except Exception as e:
         return f"Error: {str(e)}"
 
 
 def ingest_file(
-    filename: str, memory, max_length: int = 4000, overlap: int = 200
+    filename: str,
+    memory: VectorMemory,
 ) -> None:
     """
     Ingest a file by reading its content, splitting it into chunks with a specified
     maximum length and overlap, and adding the chunks to the memory storage.
 
-    :param filename: The name of the file to ingest
-    :param memory: An object with an add() method to store the chunks in memory
-    :param max_length: The maximum length of each chunk, default is 4000
-    :param overlap: The number of overlapping characters between chunks, default is 200
+    Args:
+        filename: The name of the file to ingest
+        memory: An object with an add() method to store the chunks in memory
     """
     try:
-        logger.info(f"Working with file {filename}")
+        logger.info(f"Ingesting file {filename}")
         content = read_file(filename)
-        content_length = len(content)
-        logger.info(f"File length: {content_length} characters")
-        chunks = list(split_file(content, max_length=max_length, overlap=overlap))
+
+        # TODO: differentiate between different types of files
+        file_memory = MemoryItem.from_text_file(content, filename)
+        logger.debug(f"Created memory: {file_memory.dump()}")
+        memory.add(file_memory)
 
-        num_chunks = len(chunks)
-        for i, chunk in enumerate(chunks):
-            logger.info(f"Ingesting chunk {i + 1} / {num_chunks} into memory")
-            memory_to_add = (
-                f"Filename: {filename}\n" f"Content part#{i + 1}/{num_chunks}: {chunk}"
-            )
-            memory.add(memory_to_add)
-
-        logger.info(f"Done ingesting {num_chunks} chunks from {filename}.")
+        logger.info(f"Ingested {len(file_memory.e_chunks)} chunks from {filename}")
     except Exception as err:
-        logger.info(f"Error while ingesting file '{filename}': {err}")
+        logger.warn(f"Error while ingesting file '{filename}': {err}")
 
 
 @command("write_to_file", "Write to file", '"filename": "<filename>", "text": "<text>"')