refactor(agent/file_operations): Refactor file opening/reading and parsing

- Update the signature of `FileWorkspace.open_file` and fix implementations in every workspace backend - Replace `open()` with `workspace.open_file` in the `read_file` command to use the workspace's file opening functionality - Fix the parametrization of the `test_text_file_parsers` test to correctly test text file parsers
2025-12-17 14:04:27 +01:00 · 2023-12-12 17:41:04 +01:00
parent 198a0ecad6
commit d95e3b5b54
8 changed files with 125 additions and 111 deletions
--- a/autogpts/autogpt/autogpt/commands/file_operations.py
+++ b/autogpts/autogpt/autogpt/commands/file_operations.py
@@ -16,7 +16,7 @@ from autogpt.core.utils.json_schema import JSONSchema
 from autogpt.memory.vector import MemoryItemFactory, VectorMemory

 from .decorators import sanitize_path_arg
-from .file_operations_utils import read_textual_file
+from .file_operations_utils import decode_textual_file

 COMMAND_CATEGORY = "file_operations"
 COMMAND_CATEGORY_TITLE = "File Operations"
@@ -140,8 +140,7 @@ def log_operation(
        )
    },
 )
-@sanitize_path_arg("filename")
-def read_file(filename: Path, agent: Agent) -> str:
+def read_file(filename: str | Path, agent: Agent) -> str:
    """Read a file and return the contents

    Args:
@@ -150,8 +149,8 @@ def read_file(filename: Path, agent: Agent) -> str:
    Returns:
        str: The contents of the file
    """
-    content = read_textual_file(filename, logger)
-    # TODO: content = agent.workspace.read_file(filename)
+    file = agent.workspace.open_file(filename, binary=True)
+    content = decode_textual_file(file, logger)

    # # TODO: invalidate/update memory when file is edited
    # file_memory = MemoryItem.from_text_file(content, str(filename), agent.config)
--- a/autogpts/autogpt/autogpt/commands/file_operations_utils.py
+++ b/autogpts/autogpt/autogpt/commands/file_operations_utils.py
@@ -1,11 +1,11 @@
 import json
 import logging
 import os
-from pathlib import Path
+from abc import ABC, abstractmethod
+from typing import BinaryIO

 import charset_normalizer
 import docx
-import markdown
 import pypdf
 import yaml
 from bs4 import BeautifulSoup
@@ -14,23 +14,24 @@ from pylatexenc.latex2text import LatexNodes2Text
 logger = logging.getLogger(__name__)


-class ParserStrategy:
-    def read(self, file_path: Path) -> str:
-        raise NotImplementedError
+class ParserStrategy(ABC):
+    @abstractmethod
+    def read(self, file: BinaryIO) -> str:
+        ...


 # Basic text file reading
 class TXTParser(ParserStrategy):
-    def read(self, file_path: Path) -> str:
-        charset_match = charset_normalizer.from_path(file_path).best()
-        logger.debug(f"Reading '{file_path}' with encoding '{charset_match.encoding}'")
+    def read(self, file: BinaryIO) -> str:
+        charset_match = charset_normalizer.from_bytes(file.read()).best()
+        logger.debug(f"Reading '{file.name}' with encoding '{charset_match.encoding}'")
        return str(charset_match)


 # Reading text from binary file using pdf parser
 class PDFParser(ParserStrategy):
-    def read(self, file_path: Path) -> str:
-        parser = pypdf.PdfReader(file_path)
+    def read(self, file: BinaryIO) -> str:
+        parser = pypdf.PdfReader(file)
        text = ""
        for page_idx in range(len(parser.pages)):
            text += parser.pages[page_idx].extract_text()
@@ -39,8 +40,8 @@ class PDFParser(ParserStrategy):

 # Reading text from binary file using docs parser
 class DOCXParser(ParserStrategy):
-    def read(self, file_path: Path) -> str:
-        doc_file = docx.Document(file_path)
+    def read(self, file: BinaryIO) -> str:
+        doc_file = docx.Document(file)
        text = ""
        for para in doc_file.paragraphs:
            text += para.text
@@ -49,50 +50,37 @@ class DOCXParser(ParserStrategy):

 # Reading as dictionary and returning string format
 class JSONParser(ParserStrategy):
-    def read(self, file_path: Path) -> str:
-        with open(file_path, "r") as f:
-            data = json.load(f)
+    def read(self, file: BinaryIO) -> str:
+        data = json.load(file)
        text = str(data)
        return text


 class XMLParser(ParserStrategy):
-    def read(self, file_path: Path) -> str:
-        with open(file_path, "r") as f:
-            soup = BeautifulSoup(f, "xml")
+    def read(self, file: BinaryIO) -> str:
+        soup = BeautifulSoup(file, "xml")
        text = soup.get_text()
        return text


 # Reading as dictionary and returning string format
 class YAMLParser(ParserStrategy):
-    def read(self, file_path: Path) -> str:
-        with open(file_path, "r") as f:
-            data = yaml.load(f, Loader=yaml.FullLoader)
+    def read(self, file: BinaryIO) -> str:
+        data = yaml.load(file, Loader=yaml.FullLoader)
        text = str(data)
        return text


 class HTMLParser(ParserStrategy):
-    def read(self, file_path: Path) -> str:
-        with open(file_path, "r") as f:
-            soup = BeautifulSoup(f, "html.parser")
+    def read(self, file: BinaryIO) -> str:
+        soup = BeautifulSoup(file, "html.parser")
        text = soup.get_text()
        return text


-class MarkdownParser(ParserStrategy):
-    def read(self, file_path: Path) -> str:
-        with open(file_path, "r") as f:
-            html = markdown.markdown(f.read())
-            text = "".join(BeautifulSoup(html, "html.parser").findAll(string=True))
-        return text
-
-
 class LaTeXParser(ParserStrategy):
-    def read(self, file_path: Path) -> str:
-        with open(file_path, "r") as f:
-            latex = f.read()
+    def read(self, file: BinaryIO) -> str:
+        latex = file.read().decode()
        text = LatexNodes2Text().latex_to_text(latex)
        return text

@@ -106,13 +94,15 @@ class FileContext:
        self.logger.debug(f"Setting Context Parser to {parser}")
        self.parser = parser

-    def read_file(self, file_path) -> str:
-        self.logger.debug(f"Reading file {file_path} with parser {self.parser}")
-        return self.parser.read(file_path)
+    def decode_file(self, file: BinaryIO) -> str:
+        self.logger.debug(f"Reading file {file.name} with parser {self.parser}")
+        return self.parser.read(file)


 extension_to_parser = {
    ".txt": TXTParser(),
+    ".md": TXTParser(),
+    ".markdown": TXTParser(),
    ".csv": TXTParser(),
    ".pdf": PDFParser(),
    ".docx": DOCXParser(),
@@ -123,47 +113,36 @@ extension_to_parser = {
    ".html": HTMLParser(),
    ".htm": HTMLParser(),
    ".xhtml": HTMLParser(),
-    ".md": MarkdownParser(),
-    ".markdown": MarkdownParser(),
    ".tex": LaTeXParser(),
 }


-def is_file_binary_fn(file_path: Path):
+def is_file_binary_fn(file: BinaryIO):
    """Given a file path load all its content and checks if the null bytes is present

    Args:
-        file_path (_type_): _description_
+        file (_type_): _description_

    Returns:
        bool: is_binary
    """
-    with open(file_path, "rb") as f:
-        file_data = f.read()
+    file_data = file.read()
+    file.seek(0)
    if b"\x00" in file_data:
        return True
    return False


-def read_textual_file(file_path: Path, logger: logging.Logger) -> str:
-    if not file_path.is_absolute():
-        raise ValueError("File path must be absolute")
+def decode_textual_file(file: BinaryIO, logger: logging.Logger) -> str:
+    if not file.readable():
+        raise ValueError(f"read_file failed: {file.name} is not a file")

-    if not file_path.is_file():
-        if not file_path.exists():
-            raise FileNotFoundError(
-                f"read_file {file_path} failed: no such file or directory"
-            )
-        else:
-            raise ValueError(f"read_file failed: {file_path} is not a file")
-
-    is_binary = is_file_binary_fn(file_path)
-    file_extension = os.path.splitext(file_path)[1].lower()
+    file_extension = os.path.splitext(file.name)[1].lower()
    parser = extension_to_parser.get(file_extension)
    if not parser:
-        if is_binary:
+        if is_file_binary_fn(file):
            raise ValueError(f"Unsupported binary file format: {file_extension}")
        # fallback to txt file parser (to support script and code files loading)
        parser = TXTParser()
    file_context = FileContext(parser, logger)
-    return file_context.read_file(file_path)
+    return file_context.decode_file(file)
--- a/autogpts/autogpt/autogpt/file_workspace/base.py
+++ b/autogpts/autogpt/autogpt/file_workspace/base.py
@@ -5,8 +5,9 @@ from __future__ import annotations

 import logging
 from abc import ABC, abstractmethod
+from io import IOBase, TextIOBase
 from pathlib import Path
-from typing import Any, Callable, Literal, Optional, overload
+from typing import IO, Any, BinaryIO, Callable, Literal, Optional, TextIO, overload

 from autogpt.core.configuration.schema import SystemConfiguration

@@ -47,9 +48,23 @@ class FileWorkspace(ABC):
        doesn't exist yet. E.g. a folder on disk, or an S3 Bucket.
        """

+    @overload
    @abstractmethod
-    def open_file(self, path: str | Path, mode: str = "r"):
-        """Open a file in the workspace."""
+    def open_file(
+        self, path: str | Path, binary: Literal[False] = False
+    ) -> TextIO | TextIOBase:
+        """Returns a readable text file-like object representing the file."""
+
+    @overload
+    @abstractmethod
+    def open_file(
+        self, path: str | Path, binary: Literal[True] = True
+    ) -> BinaryIO | IOBase:
+        """Returns a readable binary file-like object representing the file."""
+
+    @abstractmethod
+    def open_file(self, path: str | Path, binary: bool = False) -> IO | IOBase:
+        """Returns a readable file-like object representing the file."""

    @overload
    @abstractmethod
--- a/autogpts/autogpt/autogpt/file_workspace/gcs.py
+++ b/autogpts/autogpt/autogpt/file_workspace/gcs.py
@@ -6,6 +6,7 @@ from __future__ import annotations

 import inspect
 import logging
+from io import IOBase
 from pathlib import Path

 from google.cloud import storage
@@ -40,7 +41,7 @@ class GCSFileWorkspace(FileWorkspace):
        return self._root

    @property
-    def restrict_to_root(self):
+    def restrict_to_root(self) -> bool:
        """Whether to restrict generated paths to the root."""
        return True

@@ -50,26 +51,28 @@ class GCSFileWorkspace(FileWorkspace):
    def get_path(self, relative_path: str | Path) -> Path:
        return super().get_path(relative_path).relative_to("/")

-    def open_file(self, path: str | Path, mode: str = "r"):
-        """Open a file in the workspace."""
+    def _get_blob(self, path: str | Path) -> storage.Blob:
        path = self.get_path(path)
-        blob = self._bucket.blob(str(path))
-        return blob
+        return self._bucket.blob(str(path))
+
+    def open_file(self, path: str | Path, binary: bool = False) -> IOBase:
+        """Open a file in the workspace."""
+        blob = self._get_blob(path)
+        blob.reload()  # pin revision number to prevent version mixing while reading
+        return blob.open("rb" if binary else "r")

    def read_file(self, path: str | Path, binary: bool = False) -> str | bytes:
        """Read a file in the workspace."""
-        blob = self.open_file(path, "r")
-        file_content = (
-            blob.download_as_text() if not binary else blob.download_as_bytes()
-        )
-        return file_content
+        return self.open_file(path, binary).read()

-    async def write_file(self, path: str | Path, content: str | bytes):
+    async def write_file(self, path: str | Path, content: str | bytes) -> None:
        """Write to a file in the workspace."""
-        blob = self.open_file(path, "w")
-        blob.upload_from_string(content) if isinstance(
-            content, str
-        ) else blob.upload_from_file(content)
+        blob = self._get_blob(path)
+
+        if isinstance(content, str):
+            blob.upload_from_string(content)
+        else:
+            blob.upload_from_file(content)

        if self.on_write_file:
            path = Path(path)
--- a/autogpts/autogpt/autogpt/file_workspace/local.py
+++ b/autogpts/autogpt/autogpt/file_workspace/local.py
@@ -6,6 +6,7 @@ from __future__ import annotations
 import inspect
 import logging
 from pathlib import Path
+from typing import IO

 from .base import FileWorkspace, FileWorkspaceConfiguration

@@ -26,26 +27,29 @@ class LocalFileWorkspace(FileWorkspace):
        return self._root

    @property
-    def restrict_to_root(self):
+    def restrict_to_root(self) -> bool:
        """Whether to restrict generated paths to the root."""
        return self._restrict_to_root

    def initialize(self) -> None:
        self.root.mkdir(exist_ok=True, parents=True)

-    def open_file(self, path: str | Path, mode: str = "r"):
+    def open_file(self, path: str | Path, binary: bool = False) -> IO:
        """Open a file in the workspace."""
-        full_path = self.get_path(path)
-        return open(full_path, mode)
+        return self._open_file(path, "rb" if binary else "r")

-    def read_file(self, path: str | Path, binary: bool = False):
+    def _open_file(self, path: str | Path, mode: str = "r") -> IO:
+        full_path = self.get_path(path)
+        return open(full_path, mode)  # type: ignore
+
+    def read_file(self, path: str | Path, binary: bool = False) -> str | bytes:
        """Read a file in the workspace."""
-        with self.open_file(path, "rb" if binary else "r") as file:
+        with self._open_file(path, "rb" if binary else "r") as file:
            return file.read()

-    async def write_file(self, path: str | Path, content: str | bytes):
+    async def write_file(self, path: str | Path, content: str | bytes) -> None:
        """Write to a file in the workspace."""
-        with self.open_file(path, "wb" if type(content) is bytes else "w") as file:
+        with self._open_file(path, "wb" if type(content) is bytes else "w") as file:
            file.write(content)

        if self.on_write_file:
@@ -61,7 +65,7 @@ class LocalFileWorkspace(FileWorkspace):
        path = self.get_path(path)
        return [file.relative_to(path) for file in path.rglob("*") if file.is_file()]

-    def delete_file(self, path: str | Path):
+    def delete_file(self, path: str | Path) -> None:
        """Delete a file in the workspace."""
        full_path = self.get_path(path)
        full_path.unlink()
--- a/autogpts/autogpt/autogpt/file_workspace/s3.py
+++ b/autogpts/autogpt/autogpt/file_workspace/s3.py
@@ -8,6 +8,7 @@ import contextlib
 import inspect
 import logging
 import os
+from io import IOBase, TextIOWrapper
 from pathlib import Path
 from typing import TYPE_CHECKING, Optional

@@ -74,22 +75,27 @@ class S3FileWorkspace(FileWorkspace):
    def get_path(self, relative_path: str | Path) -> Path:
        return super().get_path(relative_path).relative_to("/")

-    def open_file(self, path: str | Path, mode: str = "r"):
-        """Open a file in the workspace."""
+    def _get_obj(self, path: str | Path) -> mypy_boto3_s3.service_resource.Object:
+        """Get an S3 object."""
        path = self.get_path(path)
        obj = self._bucket.Object(str(path))
        with contextlib.suppress(botocore.exceptions.ClientError):
            obj.load()
        return obj

+    def open_file(self, path: str | Path, binary: bool = False) -> IOBase:
+        """Open a file in the workspace."""
+        obj = self._get_obj(path)
+        return obj.get()["Body"] if not binary else TextIOWrapper(obj.get()["Body"])
+
    def read_file(self, path: str | Path, binary: bool = False) -> str | bytes:
        """Read a file in the workspace."""
-        file_content = self.open_file(path, "r").get()["Body"].read()
+        file_content = self.open_file(path, binary).read()
        return file_content if binary else file_content.decode()

-    async def write_file(self, path: str | Path, content: str | bytes):
+    async def write_file(self, path: str | Path, content: str | bytes) -> None:
        """Write to a file in the workspace."""
-        obj = self.open_file(path, "w")
+        obj = self._get_obj(path)
        obj.put(Body=content)

        if self.on_write_file:
--- a/autogpts/autogpt/autogpt/models/context_item.py
+++ b/autogpts/autogpt/autogpt/models/context_item.py
@@ -5,7 +5,7 @@ from typing import Optional

 from pydantic import BaseModel, Field

-from autogpt.commands.file_operations_utils import read_textual_file
+from autogpt.commands.file_operations_utils import decode_textual_file

 logger = logging.getLogger(__name__)

@@ -56,7 +56,8 @@ class FileContextItem(BaseModel, ContextItem):

    @property
    def content(self) -> str:
-        return read_textual_file(self.file_path, logger)
+        with open(self.file_path, "rb") as file:
+            return decode_textual_file(file, logger)


 class FolderContextItem(BaseModel, ContextItem):
--- a/autogpts/autogpt/tests/unit/test_text_file_parsers.py
+++ b/autogpts/autogpt/tests/unit/test_text_file_parsers.py
@@ -5,10 +5,14 @@ from pathlib import Path
 from xml.etree import ElementTree

 import docx
+import pytest
 import yaml
 from bs4 import BeautifulSoup

-from autogpt.commands.file_operations_utils import is_file_binary_fn, read_textual_file
+from autogpt.commands.file_operations_utils import (
+    decode_textual_file,
+    is_file_binary_fn,
+)

 logger = logging.getLogger(__name__)

@@ -148,15 +152,18 @@ respective_file_creation_functions = {
 binary_files_extensions = [".pdf", ".docx"]


-def test_parsers():
-    for (
-        file_extension,
-        c_file_creator,
-    ) in respective_file_creation_functions.items():
+@pytest.mark.parametrize(
+    "file_extension, c_file_creator",
+    respective_file_creation_functions.items(),
+)
+def test_parsers(file_extension, c_file_creator):
    created_file_path = Path(c_file_creator())
-        loaded_text = read_textual_file(created_file_path, logger)
+    with open(created_file_path, "rb") as file:
+        loaded_text = decode_textual_file(file, logger)

        assert plain_text_str in loaded_text

        should_be_binary = file_extension in binary_files_extensions
-        assert should_be_binary == is_file_binary_fn(created_file_path)
+        assert should_be_binary == is_file_binary_fn(file)
+
+    created_file_path.unlink()  # cleanup