refactor(agent/file_operations): Refactor file opening/reading and parsing

- Update the signature of `FileWorkspace.open_file` and fix implementations in every workspace backend
- Replace `open()` with `workspace.open_file` in the `read_file` command to use the workspace's file opening functionality
- Fix the parametrization of the `test_text_file_parsers` test to correctly test text file parsers
This commit is contained in:
Reinier van der Leer
2023-12-12 17:41:04 +01:00
parent 198a0ecad6
commit d95e3b5b54
8 changed files with 125 additions and 111 deletions

View File

@@ -16,7 +16,7 @@ from autogpt.core.utils.json_schema import JSONSchema
from autogpt.memory.vector import MemoryItemFactory, VectorMemory from autogpt.memory.vector import MemoryItemFactory, VectorMemory
from .decorators import sanitize_path_arg from .decorators import sanitize_path_arg
from .file_operations_utils import read_textual_file from .file_operations_utils import decode_textual_file
COMMAND_CATEGORY = "file_operations" COMMAND_CATEGORY = "file_operations"
COMMAND_CATEGORY_TITLE = "File Operations" COMMAND_CATEGORY_TITLE = "File Operations"
@@ -140,8 +140,7 @@ def log_operation(
) )
}, },
) )
@sanitize_path_arg("filename") def read_file(filename: str | Path, agent: Agent) -> str:
def read_file(filename: Path, agent: Agent) -> str:
"""Read a file and return the contents """Read a file and return the contents
Args: Args:
@@ -150,8 +149,8 @@ def read_file(filename: Path, agent: Agent) -> str:
Returns: Returns:
str: The contents of the file str: The contents of the file
""" """
content = read_textual_file(filename, logger) file = agent.workspace.open_file(filename, binary=True)
# TODO: content = agent.workspace.read_file(filename) content = decode_textual_file(file, logger)
# # TODO: invalidate/update memory when file is edited # # TODO: invalidate/update memory when file is edited
# file_memory = MemoryItem.from_text_file(content, str(filename), agent.config) # file_memory = MemoryItem.from_text_file(content, str(filename), agent.config)

View File

@@ -1,11 +1,11 @@
import json import json
import logging import logging
import os import os
from pathlib import Path from abc import ABC, abstractmethod
from typing import BinaryIO
import charset_normalizer import charset_normalizer
import docx import docx
import markdown
import pypdf import pypdf
import yaml import yaml
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
@@ -14,23 +14,24 @@ from pylatexenc.latex2text import LatexNodes2Text
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
class ParserStrategy: class ParserStrategy(ABC):
def read(self, file_path: Path) -> str: @abstractmethod
raise NotImplementedError def read(self, file: BinaryIO) -> str:
...
# Basic text file reading # Basic text file reading
class TXTParser(ParserStrategy): class TXTParser(ParserStrategy):
def read(self, file_path: Path) -> str: def read(self, file: BinaryIO) -> str:
charset_match = charset_normalizer.from_path(file_path).best() charset_match = charset_normalizer.from_bytes(file.read()).best()
logger.debug(f"Reading '{file_path}' with encoding '{charset_match.encoding}'") logger.debug(f"Reading '{file.name}' with encoding '{charset_match.encoding}'")
return str(charset_match) return str(charset_match)
# Reading text from binary file using pdf parser # Reading text from binary file using pdf parser
class PDFParser(ParserStrategy): class PDFParser(ParserStrategy):
def read(self, file_path: Path) -> str: def read(self, file: BinaryIO) -> str:
parser = pypdf.PdfReader(file_path) parser = pypdf.PdfReader(file)
text = "" text = ""
for page_idx in range(len(parser.pages)): for page_idx in range(len(parser.pages)):
text += parser.pages[page_idx].extract_text() text += parser.pages[page_idx].extract_text()
@@ -39,8 +40,8 @@ class PDFParser(ParserStrategy):
# Reading text from binary file using docs parser # Reading text from binary file using docs parser
class DOCXParser(ParserStrategy): class DOCXParser(ParserStrategy):
def read(self, file_path: Path) -> str: def read(self, file: BinaryIO) -> str:
doc_file = docx.Document(file_path) doc_file = docx.Document(file)
text = "" text = ""
for para in doc_file.paragraphs: for para in doc_file.paragraphs:
text += para.text text += para.text
@@ -49,50 +50,37 @@ class DOCXParser(ParserStrategy):
# Reading as dictionary and returning string format # Reading as dictionary and returning string format
class JSONParser(ParserStrategy): class JSONParser(ParserStrategy):
def read(self, file_path: Path) -> str: def read(self, file: BinaryIO) -> str:
with open(file_path, "r") as f: data = json.load(file)
data = json.load(f)
text = str(data) text = str(data)
return text return text
class XMLParser(ParserStrategy): class XMLParser(ParserStrategy):
def read(self, file_path: Path) -> str: def read(self, file: BinaryIO) -> str:
with open(file_path, "r") as f: soup = BeautifulSoup(file, "xml")
soup = BeautifulSoup(f, "xml")
text = soup.get_text() text = soup.get_text()
return text return text
# Reading as dictionary and returning string format # Reading as dictionary and returning string format
class YAMLParser(ParserStrategy): class YAMLParser(ParserStrategy):
def read(self, file_path: Path) -> str: def read(self, file: BinaryIO) -> str:
with open(file_path, "r") as f: data = yaml.load(file, Loader=yaml.FullLoader)
data = yaml.load(f, Loader=yaml.FullLoader)
text = str(data) text = str(data)
return text return text
class HTMLParser(ParserStrategy): class HTMLParser(ParserStrategy):
def read(self, file_path: Path) -> str: def read(self, file: BinaryIO) -> str:
with open(file_path, "r") as f: soup = BeautifulSoup(file, "html.parser")
soup = BeautifulSoup(f, "html.parser")
text = soup.get_text() text = soup.get_text()
return text return text
class MarkdownParser(ParserStrategy):
def read(self, file_path: Path) -> str:
with open(file_path, "r") as f:
html = markdown.markdown(f.read())
text = "".join(BeautifulSoup(html, "html.parser").findAll(string=True))
return text
class LaTeXParser(ParserStrategy): class LaTeXParser(ParserStrategy):
def read(self, file_path: Path) -> str: def read(self, file: BinaryIO) -> str:
with open(file_path, "r") as f: latex = file.read().decode()
latex = f.read()
text = LatexNodes2Text().latex_to_text(latex) text = LatexNodes2Text().latex_to_text(latex)
return text return text
@@ -106,13 +94,15 @@ class FileContext:
self.logger.debug(f"Setting Context Parser to {parser}") self.logger.debug(f"Setting Context Parser to {parser}")
self.parser = parser self.parser = parser
def read_file(self, file_path) -> str: def decode_file(self, file: BinaryIO) -> str:
self.logger.debug(f"Reading file {file_path} with parser {self.parser}") self.logger.debug(f"Reading file {file.name} with parser {self.parser}")
return self.parser.read(file_path) return self.parser.read(file)
extension_to_parser = { extension_to_parser = {
".txt": TXTParser(), ".txt": TXTParser(),
".md": TXTParser(),
".markdown": TXTParser(),
".csv": TXTParser(), ".csv": TXTParser(),
".pdf": PDFParser(), ".pdf": PDFParser(),
".docx": DOCXParser(), ".docx": DOCXParser(),
@@ -123,47 +113,36 @@ extension_to_parser = {
".html": HTMLParser(), ".html": HTMLParser(),
".htm": HTMLParser(), ".htm": HTMLParser(),
".xhtml": HTMLParser(), ".xhtml": HTMLParser(),
".md": MarkdownParser(),
".markdown": MarkdownParser(),
".tex": LaTeXParser(), ".tex": LaTeXParser(),
} }
def is_file_binary_fn(file_path: Path): def is_file_binary_fn(file: BinaryIO):
"""Given a file path load all its content and checks if the null bytes is present """Given a file path load all its content and checks if the null bytes is present
Args: Args:
file_path (_type_): _description_ file (_type_): _description_
Returns: Returns:
bool: is_binary bool: is_binary
""" """
with open(file_path, "rb") as f: file_data = file.read()
file_data = f.read() file.seek(0)
if b"\x00" in file_data: if b"\x00" in file_data:
return True return True
return False return False
def read_textual_file(file_path: Path, logger: logging.Logger) -> str: def decode_textual_file(file: BinaryIO, logger: logging.Logger) -> str:
if not file_path.is_absolute(): if not file.readable():
raise ValueError("File path must be absolute") raise ValueError(f"read_file failed: {file.name} is not a file")
if not file_path.is_file(): file_extension = os.path.splitext(file.name)[1].lower()
if not file_path.exists():
raise FileNotFoundError(
f"read_file {file_path} failed: no such file or directory"
)
else:
raise ValueError(f"read_file failed: {file_path} is not a file")
is_binary = is_file_binary_fn(file_path)
file_extension = os.path.splitext(file_path)[1].lower()
parser = extension_to_parser.get(file_extension) parser = extension_to_parser.get(file_extension)
if not parser: if not parser:
if is_binary: if is_file_binary_fn(file):
raise ValueError(f"Unsupported binary file format: {file_extension}") raise ValueError(f"Unsupported binary file format: {file_extension}")
# fallback to txt file parser (to support script and code files loading) # fallback to txt file parser (to support script and code files loading)
parser = TXTParser() parser = TXTParser()
file_context = FileContext(parser, logger) file_context = FileContext(parser, logger)
return file_context.read_file(file_path) return file_context.decode_file(file)

View File

@@ -5,8 +5,9 @@ from __future__ import annotations
import logging import logging
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from io import IOBase, TextIOBase
from pathlib import Path from pathlib import Path
from typing import Any, Callable, Literal, Optional, overload from typing import IO, Any, BinaryIO, Callable, Literal, Optional, TextIO, overload
from autogpt.core.configuration.schema import SystemConfiguration from autogpt.core.configuration.schema import SystemConfiguration
@@ -47,9 +48,23 @@ class FileWorkspace(ABC):
doesn't exist yet. E.g. a folder on disk, or an S3 Bucket. doesn't exist yet. E.g. a folder on disk, or an S3 Bucket.
""" """
@overload
@abstractmethod @abstractmethod
def open_file(self, path: str | Path, mode: str = "r"): def open_file(
"""Open a file in the workspace.""" self, path: str | Path, binary: Literal[False] = False
) -> TextIO | TextIOBase:
"""Returns a readable text file-like object representing the file."""
@overload
@abstractmethod
def open_file(
self, path: str | Path, binary: Literal[True] = True
) -> BinaryIO | IOBase:
"""Returns a readable binary file-like object representing the file."""
@abstractmethod
def open_file(self, path: str | Path, binary: bool = False) -> IO | IOBase:
"""Returns a readable file-like object representing the file."""
@overload @overload
@abstractmethod @abstractmethod

View File

@@ -6,6 +6,7 @@ from __future__ import annotations
import inspect import inspect
import logging import logging
from io import IOBase
from pathlib import Path from pathlib import Path
from google.cloud import storage from google.cloud import storage
@@ -40,7 +41,7 @@ class GCSFileWorkspace(FileWorkspace):
return self._root return self._root
@property @property
def restrict_to_root(self): def restrict_to_root(self) -> bool:
"""Whether to restrict generated paths to the root.""" """Whether to restrict generated paths to the root."""
return True return True
@@ -50,26 +51,28 @@ class GCSFileWorkspace(FileWorkspace):
def get_path(self, relative_path: str | Path) -> Path: def get_path(self, relative_path: str | Path) -> Path:
return super().get_path(relative_path).relative_to("/") return super().get_path(relative_path).relative_to("/")
def open_file(self, path: str | Path, mode: str = "r"): def _get_blob(self, path: str | Path) -> storage.Blob:
"""Open a file in the workspace."""
path = self.get_path(path) path = self.get_path(path)
blob = self._bucket.blob(str(path)) return self._bucket.blob(str(path))
return blob
def open_file(self, path: str | Path, binary: bool = False) -> IOBase:
"""Open a file in the workspace."""
blob = self._get_blob(path)
blob.reload() # pin revision number to prevent version mixing while reading
return blob.open("rb" if binary else "r")
def read_file(self, path: str | Path, binary: bool = False) -> str | bytes: def read_file(self, path: str | Path, binary: bool = False) -> str | bytes:
"""Read a file in the workspace.""" """Read a file in the workspace."""
blob = self.open_file(path, "r") return self.open_file(path, binary).read()
file_content = (
blob.download_as_text() if not binary else blob.download_as_bytes()
)
return file_content
async def write_file(self, path: str | Path, content: str | bytes): async def write_file(self, path: str | Path, content: str | bytes) -> None:
"""Write to a file in the workspace.""" """Write to a file in the workspace."""
blob = self.open_file(path, "w") blob = self._get_blob(path)
blob.upload_from_string(content) if isinstance(
content, str if isinstance(content, str):
) else blob.upload_from_file(content) blob.upload_from_string(content)
else:
blob.upload_from_file(content)
if self.on_write_file: if self.on_write_file:
path = Path(path) path = Path(path)

View File

@@ -6,6 +6,7 @@ from __future__ import annotations
import inspect import inspect
import logging import logging
from pathlib import Path from pathlib import Path
from typing import IO
from .base import FileWorkspace, FileWorkspaceConfiguration from .base import FileWorkspace, FileWorkspaceConfiguration
@@ -26,26 +27,29 @@ class LocalFileWorkspace(FileWorkspace):
return self._root return self._root
@property @property
def restrict_to_root(self): def restrict_to_root(self) -> bool:
"""Whether to restrict generated paths to the root.""" """Whether to restrict generated paths to the root."""
return self._restrict_to_root return self._restrict_to_root
def initialize(self) -> None: def initialize(self) -> None:
self.root.mkdir(exist_ok=True, parents=True) self.root.mkdir(exist_ok=True, parents=True)
def open_file(self, path: str | Path, mode: str = "r"): def open_file(self, path: str | Path, binary: bool = False) -> IO:
"""Open a file in the workspace.""" """Open a file in the workspace."""
full_path = self.get_path(path) return self._open_file(path, "rb" if binary else "r")
return open(full_path, mode)
def read_file(self, path: str | Path, binary: bool = False): def _open_file(self, path: str | Path, mode: str = "r") -> IO:
full_path = self.get_path(path)
return open(full_path, mode) # type: ignore
def read_file(self, path: str | Path, binary: bool = False) -> str | bytes:
"""Read a file in the workspace.""" """Read a file in the workspace."""
with self.open_file(path, "rb" if binary else "r") as file: with self._open_file(path, "rb" if binary else "r") as file:
return file.read() return file.read()
async def write_file(self, path: str | Path, content: str | bytes): async def write_file(self, path: str | Path, content: str | bytes) -> None:
"""Write to a file in the workspace.""" """Write to a file in the workspace."""
with self.open_file(path, "wb" if type(content) is bytes else "w") as file: with self._open_file(path, "wb" if type(content) is bytes else "w") as file:
file.write(content) file.write(content)
if self.on_write_file: if self.on_write_file:
@@ -61,7 +65,7 @@ class LocalFileWorkspace(FileWorkspace):
path = self.get_path(path) path = self.get_path(path)
return [file.relative_to(path) for file in path.rglob("*") if file.is_file()] return [file.relative_to(path) for file in path.rglob("*") if file.is_file()]
def delete_file(self, path: str | Path): def delete_file(self, path: str | Path) -> None:
"""Delete a file in the workspace.""" """Delete a file in the workspace."""
full_path = self.get_path(path) full_path = self.get_path(path)
full_path.unlink() full_path.unlink()

View File

@@ -8,6 +8,7 @@ import contextlib
import inspect import inspect
import logging import logging
import os import os
from io import IOBase, TextIOWrapper
from pathlib import Path from pathlib import Path
from typing import TYPE_CHECKING, Optional from typing import TYPE_CHECKING, Optional
@@ -74,22 +75,27 @@ class S3FileWorkspace(FileWorkspace):
def get_path(self, relative_path: str | Path) -> Path: def get_path(self, relative_path: str | Path) -> Path:
return super().get_path(relative_path).relative_to("/") return super().get_path(relative_path).relative_to("/")
def open_file(self, path: str | Path, mode: str = "r"): def _get_obj(self, path: str | Path) -> mypy_boto3_s3.service_resource.Object:
"""Open a file in the workspace.""" """Get an S3 object."""
path = self.get_path(path) path = self.get_path(path)
obj = self._bucket.Object(str(path)) obj = self._bucket.Object(str(path))
with contextlib.suppress(botocore.exceptions.ClientError): with contextlib.suppress(botocore.exceptions.ClientError):
obj.load() obj.load()
return obj return obj
def open_file(self, path: str | Path, binary: bool = False) -> IOBase:
"""Open a file in the workspace."""
obj = self._get_obj(path)
return obj.get()["Body"] if not binary else TextIOWrapper(obj.get()["Body"])
def read_file(self, path: str | Path, binary: bool = False) -> str | bytes: def read_file(self, path: str | Path, binary: bool = False) -> str | bytes:
"""Read a file in the workspace.""" """Read a file in the workspace."""
file_content = self.open_file(path, "r").get()["Body"].read() file_content = self.open_file(path, binary).read()
return file_content if binary else file_content.decode() return file_content if binary else file_content.decode()
async def write_file(self, path: str | Path, content: str | bytes): async def write_file(self, path: str | Path, content: str | bytes) -> None:
"""Write to a file in the workspace.""" """Write to a file in the workspace."""
obj = self.open_file(path, "w") obj = self._get_obj(path)
obj.put(Body=content) obj.put(Body=content)
if self.on_write_file: if self.on_write_file:

View File

@@ -5,7 +5,7 @@ from typing import Optional
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
from autogpt.commands.file_operations_utils import read_textual_file from autogpt.commands.file_operations_utils import decode_textual_file
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -56,7 +56,8 @@ class FileContextItem(BaseModel, ContextItem):
@property @property
def content(self) -> str: def content(self) -> str:
return read_textual_file(self.file_path, logger) with open(self.file_path, "rb") as file:
return decode_textual_file(file, logger)
class FolderContextItem(BaseModel, ContextItem): class FolderContextItem(BaseModel, ContextItem):

View File

@@ -5,10 +5,14 @@ from pathlib import Path
from xml.etree import ElementTree from xml.etree import ElementTree
import docx import docx
import pytest
import yaml import yaml
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from autogpt.commands.file_operations_utils import is_file_binary_fn, read_textual_file from autogpt.commands.file_operations_utils import (
decode_textual_file,
is_file_binary_fn,
)
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -148,15 +152,18 @@ respective_file_creation_functions = {
binary_files_extensions = [".pdf", ".docx"] binary_files_extensions = [".pdf", ".docx"]
def test_parsers(): @pytest.mark.parametrize(
for ( "file_extension, c_file_creator",
file_extension, respective_file_creation_functions.items(),
c_file_creator, )
) in respective_file_creation_functions.items(): def test_parsers(file_extension, c_file_creator):
created_file_path = Path(c_file_creator()) created_file_path = Path(c_file_creator())
loaded_text = read_textual_file(created_file_path, logger) with open(created_file_path, "rb") as file:
loaded_text = decode_textual_file(file, logger)
assert plain_text_str in loaded_text assert plain_text_str in loaded_text
should_be_binary = file_extension in binary_files_extensions should_be_binary = file_extension in binary_files_extensions
assert should_be_binary == is_file_binary_fn(created_file_path) assert should_be_binary == is_file_binary_fn(file)
created_file_path.unlink() # cleanup