refactor(agent/file_operations): Refactor file opening/reading and parsing

- Update the signature of `FileWorkspace.open_file` and fix implementations in every workspace backend
- Replace `open()` with `workspace.open_file` in the `read_file` command, so that file reads go through the workspace's file opening functionality
- Fix the parametrization of the `test_text_file_parsers` test to correctly test text file parsers
Reinier van der Leer
2023-12-12 17:41:04 +01:00
parent 198a0ecad6
commit d95e3b5b54
8 changed files with 125 additions and 111 deletions

View File

@@ -16,7 +16,7 @@ from autogpt.core.utils.json_schema import JSONSchema
from autogpt.memory.vector import MemoryItemFactory, VectorMemory
from .decorators import sanitize_path_arg
from .file_operations_utils import read_textual_file
from .file_operations_utils import decode_textual_file
COMMAND_CATEGORY = "file_operations"
COMMAND_CATEGORY_TITLE = "File Operations"
@@ -140,8 +140,7 @@ def log_operation(
)
},
)
@sanitize_path_arg("filename")
def read_file(filename: Path, agent: Agent) -> str:
def read_file(filename: str | Path, agent: Agent) -> str:
"""Read a file and return the contents
Args:
@@ -150,8 +149,8 @@ def read_file(filename: Path, agent: Agent) -> str:
Returns:
str: The contents of the file
"""
content = read_textual_file(filename, logger)
# TODO: content = agent.workspace.read_file(filename)
file = agent.workspace.open_file(filename, binary=True)
content = decode_textual_file(file, logger)
# # TODO: invalidate/update memory when file is edited
# file_memory = MemoryItem.from_text_file(content, str(filename), agent.config)
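
The resulting read path is compact enough to restate on its own. Below is a minimal sketch of what the command now does; `read_via_workspace` is a hypothetical helper, and `workspace` stands in for whichever `FileWorkspace` backend the agent is configured with:

```python
from __future__ import annotations

import logging
from pathlib import Path

from autogpt.commands.file_operations_utils import decode_textual_file

logger = logging.getLogger(__name__)


def read_via_workspace(workspace, filename: str | Path) -> str:
    # The workspace hands back a readable binary stream; decode_textual_file
    # picks a parser from the file extension and returns the decoded text.
    file = workspace.open_file(filename, binary=True)
    return decode_textual_file(file, logger)
```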

View File

@@ -1,11 +1,11 @@
import json
import logging
import os
from pathlib import Path
from abc import ABC, abstractmethod
from typing import BinaryIO
import charset_normalizer
import docx
import markdown
import pypdf
import yaml
from bs4 import BeautifulSoup
@@ -14,23 +14,24 @@ from pylatexenc.latex2text import LatexNodes2Text
logger = logging.getLogger(__name__)
class ParserStrategy:
def read(self, file_path: Path) -> str:
raise NotImplementedError
class ParserStrategy(ABC):
@abstractmethod
def read(self, file: BinaryIO) -> str:
...
# Basic text file reading
class TXTParser(ParserStrategy):
def read(self, file_path: Path) -> str:
charset_match = charset_normalizer.from_path(file_path).best()
logger.debug(f"Reading '{file_path}' with encoding '{charset_match.encoding}'")
def read(self, file: BinaryIO) -> str:
charset_match = charset_normalizer.from_bytes(file.read()).best()
logger.debug(f"Reading '{file.name}' with encoding '{charset_match.encoding}'")
return str(charset_match)
# Reading text from binary file using pdf parser
class PDFParser(ParserStrategy):
def read(self, file_path: Path) -> str:
parser = pypdf.PdfReader(file_path)
def read(self, file: BinaryIO) -> str:
parser = pypdf.PdfReader(file)
text = ""
for page_idx in range(len(parser.pages)):
text += parser.pages[page_idx].extract_text()
@@ -39,8 +40,8 @@ class PDFParser(ParserStrategy):
# Reading text from binary file using docs parser
class DOCXParser(ParserStrategy):
def read(self, file_path: Path) -> str:
doc_file = docx.Document(file_path)
def read(self, file: BinaryIO) -> str:
doc_file = docx.Document(file)
text = ""
for para in doc_file.paragraphs:
text += para.text
@@ -49,50 +50,37 @@ class DOCXParser(ParserStrategy):
# Reading as dictionary and returning string format
class JSONParser(ParserStrategy):
def read(self, file_path: Path) -> str:
with open(file_path, "r") as f:
data = json.load(f)
text = str(data)
def read(self, file: BinaryIO) -> str:
data = json.load(file)
text = str(data)
return text
class XMLParser(ParserStrategy):
def read(self, file_path: Path) -> str:
with open(file_path, "r") as f:
soup = BeautifulSoup(f, "xml")
text = soup.get_text()
def read(self, file: BinaryIO) -> str:
soup = BeautifulSoup(file, "xml")
text = soup.get_text()
return text
# Reading as dictionary and returning string format
class YAMLParser(ParserStrategy):
def read(self, file_path: Path) -> str:
with open(file_path, "r") as f:
data = yaml.load(f, Loader=yaml.FullLoader)
text = str(data)
def read(self, file: BinaryIO) -> str:
data = yaml.load(file, Loader=yaml.FullLoader)
text = str(data)
return text
class HTMLParser(ParserStrategy):
def read(self, file_path: Path) -> str:
with open(file_path, "r") as f:
soup = BeautifulSoup(f, "html.parser")
text = soup.get_text()
return text
class MarkdownParser(ParserStrategy):
def read(self, file_path: Path) -> str:
with open(file_path, "r") as f:
html = markdown.markdown(f.read())
text = "".join(BeautifulSoup(html, "html.parser").findAll(string=True))
def read(self, file: BinaryIO) -> str:
soup = BeautifulSoup(file, "html.parser")
text = soup.get_text()
return text
class LaTeXParser(ParserStrategy):
def read(self, file_path: Path) -> str:
with open(file_path, "r") as f:
latex = f.read()
def read(self, file: BinaryIO) -> str:
latex = file.read().decode()
text = LatexNodes2Text().latex_to_text(latex)
return text
@@ -106,13 +94,15 @@ class FileContext:
self.logger.debug(f"Setting Context Parser to {parser}")
self.parser = parser
def read_file(self, file_path) -> str:
self.logger.debug(f"Reading file {file_path} with parser {self.parser}")
return self.parser.read(file_path)
def decode_file(self, file: BinaryIO) -> str:
self.logger.debug(f"Reading file {file.name} with parser {self.parser}")
return self.parser.read(file)
extension_to_parser = {
".txt": TXTParser(),
".md": TXTParser(),
".markdown": TXTParser(),
".csv": TXTParser(),
".pdf": PDFParser(),
".docx": DOCXParser(),
@@ -123,47 +113,36 @@ extension_to_parser = {
".html": HTMLParser(),
".htm": HTMLParser(),
".xhtml": HTMLParser(),
".md": MarkdownParser(),
".markdown": MarkdownParser(),
".tex": LaTeXParser(),
}
def is_file_binary_fn(file_path: Path):
def is_file_binary_fn(file: BinaryIO):
"""Given a file path load all its content and checks if the null bytes is present
Args:
file_path (_type_): _description_
file (_type_): _description_
Returns:
bool: is_binary
"""
with open(file_path, "rb") as f:
file_data = f.read()
file_data = file.read()
file.seek(0)
if b"\x00" in file_data:
return True
return False
def read_textual_file(file_path: Path, logger: logging.Logger) -> str:
if not file_path.is_absolute():
raise ValueError("File path must be absolute")
def decode_textual_file(file: BinaryIO, logger: logging.Logger) -> str:
if not file.readable():
raise ValueError(f"read_file failed: {file.name} is not a file")
if not file_path.is_file():
if not file_path.exists():
raise FileNotFoundError(
f"read_file {file_path} failed: no such file or directory"
)
else:
raise ValueError(f"read_file failed: {file_path} is not a file")
is_binary = is_file_binary_fn(file_path)
file_extension = os.path.splitext(file_path)[1].lower()
file_extension = os.path.splitext(file.name)[1].lower()
parser = extension_to_parser.get(file_extension)
if not parser:
if is_binary:
if is_file_binary_fn(file):
raise ValueError(f"Unsupported binary file format: {file_extension}")
# fallback to txt file parser (to support script and code files loading)
parser = TXTParser()
file_context = FileContext(parser, logger)
return file_context.read_file(file_path)
return file_context.decode_file(file)
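
A minimal usage sketch of the reworked helpers: any readable binary stream that carries a `name` attribute will do, so a plain `open(..., "rb")` handle works outside the workspace. The file name and contents below are placeholders:

```python
import logging
from pathlib import Path

from autogpt.commands.file_operations_utils import (
    decode_textual_file,
    is_file_binary_fn,
)

logger = logging.getLogger(__name__)

path = Path("example.md")  # placeholder file, created only for this sketch
path.write_text("# Hello\n\nplain text body\n")

with open(path, "rb") as file:
    print(is_file_binary_fn(file))            # False: no NUL bytes; the read is rewound
    print(decode_textual_file(file, logger))  # parser chosen from the ".md" extension
```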

View File

@@ -5,8 +5,9 @@ from __future__ import annotations
import logging
from abc import ABC, abstractmethod
from io import IOBase, TextIOBase
from pathlib import Path
from typing import Any, Callable, Literal, Optional, overload
from typing import IO, Any, BinaryIO, Callable, Literal, Optional, TextIO, overload
from autogpt.core.configuration.schema import SystemConfiguration
@@ -47,9 +48,23 @@ class FileWorkspace(ABC):
doesn't exist yet. E.g. a folder on disk, or an S3 Bucket.
"""
@overload
@abstractmethod
def open_file(self, path: str | Path, mode: str = "r"):
"""Open a file in the workspace."""
def open_file(
self, path: str | Path, binary: Literal[False] = False
) -> TextIO | TextIOBase:
"""Returns a readable text file-like object representing the file."""
@overload
@abstractmethod
def open_file(
self, path: str | Path, binary: Literal[True] = True
) -> BinaryIO | IOBase:
"""Returns a readable binary file-like object representing the file."""
@abstractmethod
def open_file(self, path: str | Path, binary: bool = False) -> IO | IOBase:
"""Returns a readable file-like object representing the file."""
@overload
@abstractmethod
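
In practice, the overloads mean callers stop passing mode strings, and a type checker can narrow the return type from the `binary` flag alone. A hedged sketch (the import path is assumed; `peek` is a hypothetical helper):

```python
from __future__ import annotations

from pathlib import Path

from autogpt.file_workspace.base import FileWorkspace  # assumed import path


def peek(workspace: FileWorkspace, path: str | Path) -> None:
    text_stream = workspace.open_file(path)                 # inferred: TextIO | TextIOBase
    binary_stream = workspace.open_file(path, binary=True)  # inferred: BinaryIO | IOBase
    print(text_stream.readline())
    print(binary_stream.read(16))
```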

View File

@@ -6,6 +6,7 @@ from __future__ import annotations
import inspect
import logging
from io import IOBase
from pathlib import Path
from google.cloud import storage
@@ -40,7 +41,7 @@ class GCSFileWorkspace(FileWorkspace):
return self._root
@property
def restrict_to_root(self):
def restrict_to_root(self) -> bool:
"""Whether to restrict generated paths to the root."""
return True
@@ -50,26 +51,28 @@ class GCSFileWorkspace(FileWorkspace):
def get_path(self, relative_path: str | Path) -> Path:
return super().get_path(relative_path).relative_to("/")
def open_file(self, path: str | Path, mode: str = "r"):
"""Open a file in the workspace."""
def _get_blob(self, path: str | Path) -> storage.Blob:
path = self.get_path(path)
blob = self._bucket.blob(str(path))
return blob
return self._bucket.blob(str(path))
def open_file(self, path: str | Path, binary: bool = False) -> IOBase:
"""Open a file in the workspace."""
blob = self._get_blob(path)
blob.reload() # pin revision number to prevent version mixing while reading
return blob.open("rb" if binary else "r")
def read_file(self, path: str | Path, binary: bool = False) -> str | bytes:
"""Read a file in the workspace."""
blob = self.open_file(path, "r")
file_content = (
blob.download_as_text() if not binary else blob.download_as_bytes()
)
return file_content
return self.open_file(path, binary).read()
async def write_file(self, path: str | Path, content: str | bytes):
async def write_file(self, path: str | Path, content: str | bytes) -> None:
"""Write to a file in the workspace."""
blob = self.open_file(path, "w")
blob.upload_from_string(content) if isinstance(
content, str
) else blob.upload_from_file(content)
blob = self._get_blob(path)
if isinstance(content, str):
blob.upload_from_string(content)
else:
blob.upload_from_file(content)
if self.on_write_file:
path = Path(path)
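
For reference, the new read path follows this google-cloud-storage pattern: reloading the blob first records its current generation, so the subsequent open reads a single, consistent revision. Bucket and object names below are placeholders:

```python
from google.cloud import storage

client = storage.Client()
blob = client.bucket("example-workspace-bucket").blob("agent/notes.txt")

blob.reload()  # pin the blob's generation before reading
with blob.open("rb") as file:
    data = file.read()
print(len(data))
```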

View File

@@ -6,6 +6,7 @@ from __future__ import annotations
import inspect
import logging
from pathlib import Path
from typing import IO
from .base import FileWorkspace, FileWorkspaceConfiguration
@@ -26,26 +27,29 @@ class LocalFileWorkspace(FileWorkspace):
return self._root
@property
def restrict_to_root(self):
def restrict_to_root(self) -> bool:
"""Whether to restrict generated paths to the root."""
return self._restrict_to_root
def initialize(self) -> None:
self.root.mkdir(exist_ok=True, parents=True)
def open_file(self, path: str | Path, mode: str = "r"):
def open_file(self, path: str | Path, binary: bool = False) -> IO:
"""Open a file in the workspace."""
full_path = self.get_path(path)
return open(full_path, mode)
return self._open_file(path, "rb" if binary else "r")
def read_file(self, path: str | Path, binary: bool = False):
def _open_file(self, path: str | Path, mode: str = "r") -> IO:
full_path = self.get_path(path)
return open(full_path, mode) # type: ignore
def read_file(self, path: str | Path, binary: bool = False) -> str | bytes:
"""Read a file in the workspace."""
with self.open_file(path, "rb" if binary else "r") as file:
with self._open_file(path, "rb" if binary else "r") as file:
return file.read()
async def write_file(self, path: str | Path, content: str | bytes):
async def write_file(self, path: str | Path, content: str | bytes) -> None:
"""Write to a file in the workspace."""
with self.open_file(path, "wb" if type(content) is bytes else "w") as file:
with self._open_file(path, "wb" if type(content) is bytes else "w") as file:
file.write(content)
if self.on_write_file:
@@ -61,7 +65,7 @@ class LocalFileWorkspace(FileWorkspace):
path = self.get_path(path)
return [file.relative_to(path) for file in path.rglob("*") if file.is_file()]
def delete_file(self, path: str | Path):
def delete_file(self, path: str | Path) -> None:
"""Delete a file in the workspace."""
full_path = self.get_path(path)
full_path.unlink()
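
The local backend's split between the public `open_file(path, binary=...)` and the private, mode-string `_open_file()` can be mimicked with plain `open()` calls. The directory and functions below are stand-ins for illustration, not the real class:

```python
import tempfile
from pathlib import Path

root = Path(tempfile.mkdtemp())          # stand-in for the workspace root
(root / "hello.txt").write_text("hi\n")


def _open_file(path, mode: str = "r"):
    return open(root / path, mode)       # internal: mode string stays private


def open_file(path, binary: bool = False):
    return _open_file(path, "rb" if binary else "r")  # public: only a binary flag


with open_file("hello.txt") as f:
    print(f.read())                      # 'hi\n' (text mode)
with open_file("hello.txt", binary=True) as f:
    print(f.read())                      # b'hi\n' (binary mode)
```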

View File

@@ -8,6 +8,7 @@ import contextlib
import inspect
import logging
import os
from io import IOBase, TextIOWrapper
from pathlib import Path
from typing import TYPE_CHECKING, Optional
@@ -74,22 +75,27 @@ class S3FileWorkspace(FileWorkspace):
def get_path(self, relative_path: str | Path) -> Path:
return super().get_path(relative_path).relative_to("/")
def open_file(self, path: str | Path, mode: str = "r"):
"""Open a file in the workspace."""
def _get_obj(self, path: str | Path) -> mypy_boto3_s3.service_resource.Object:
"""Get an S3 object."""
path = self.get_path(path)
obj = self._bucket.Object(str(path))
with contextlib.suppress(botocore.exceptions.ClientError):
obj.load()
return obj
def open_file(self, path: str | Path, binary: bool = False) -> IOBase:
"""Open a file in the workspace."""
obj = self._get_obj(path)
return obj.get()["Body"] if not binary else TextIOWrapper(obj.get()["Body"])
def read_file(self, path: str | Path, binary: bool = False) -> str | bytes:
"""Read a file in the workspace."""
file_content = self.open_file(path, "r").get()["Body"].read()
file_content = self.open_file(path, binary).read()
return file_content if binary else file_content.decode()
async def write_file(self, path: str | Path, content: str | bytes):
async def write_file(self, path: str | Path, content: str | bytes) -> None:
"""Write to a file in the workspace."""
obj = self.open_file(path, "w")
obj = self._get_obj(path)
obj.put(Body=content)
if self.on_write_file:
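
The S3 side relies on the object's `Body` being a binary stream, with `TextIOWrapper` providing the text-mode view. A hedged boto3 sketch (bucket and key are placeholders):

```python
import io

import boto3

s3 = boto3.resource("s3")
obj = s3.Bucket("example-workspace-bucket").Object("agent/notes.txt")

body = obj.get()["Body"]              # StreamingBody: raw bytes
text = io.TextIOWrapper(body).read()  # decoded text view of the same stream
print(text[:80])
```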

View File

@@ -5,7 +5,7 @@ from typing import Optional
from pydantic import BaseModel, Field
from autogpt.commands.file_operations_utils import read_textual_file
from autogpt.commands.file_operations_utils import decode_textual_file
logger = logging.getLogger(__name__)
@@ -56,7 +56,8 @@ class FileContextItem(BaseModel, ContextItem):
@property
def content(self) -> str:
return read_textual_file(self.file_path, logger)
with open(self.file_path, "rb") as file:
return decode_textual_file(file, logger)
class FolderContextItem(BaseModel, ContextItem):

View File

@@ -5,10 +5,14 @@ from pathlib import Path
from xml.etree import ElementTree
import docx
import pytest
import yaml
from bs4 import BeautifulSoup
from autogpt.commands.file_operations_utils import is_file_binary_fn, read_textual_file
from autogpt.commands.file_operations_utils import (
decode_textual_file,
is_file_binary_fn,
)
logger = logging.getLogger(__name__)
@@ -148,15 +152,18 @@ respective_file_creation_functions = {
binary_files_extensions = [".pdf", ".docx"]
def test_parsers():
for (
file_extension,
c_file_creator,
) in respective_file_creation_functions.items():
created_file_path = Path(c_file_creator())
loaded_text = read_textual_file(created_file_path, logger)
@pytest.mark.parametrize(
"file_extension, c_file_creator",
respective_file_creation_functions.items(),
)
def test_parsers(file_extension, c_file_creator):
created_file_path = Path(c_file_creator())
with open(created_file_path, "rb") as file:
loaded_text = decode_textual_file(file, logger)
assert plain_text_str in loaded_text
should_be_binary = file_extension in binary_files_extensions
assert should_be_binary == is_file_binary_fn(created_file_path)
assert should_be_binary == is_file_binary_fn(file)
created_file_path.unlink() # cleanup
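
The parametrization used above is plain pytest behaviour: `dict.items()` supplies one argument tuple per entry, so each parser gets its own test case. A toy example with made-up data:

```python
import pytest

extension_to_kind = {".txt": "text", ".md": "text", ".pdf": "binary"}  # made-up mapping


@pytest.mark.parametrize("extension, kind", extension_to_kind.items())
def test_extension_kind(extension, kind):
    # One test case is generated per dictionary entry.
    assert extension.startswith(".")
    assert kind in ("text", "binary")
```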