fix(agent/file_operations): Fix read_file command in GCS and S3 workspaces

- Update the `read_file` function in `file_operations.py` to pass the file's extension to the `decode_textual_file` function.
- Modify the `decode_textual_file` function in `file_operations_utils.py` to accept the file extension as an argument, so that parser selection no longer depends on reading `file.name`.
- Update the `content` property in the `FileContextItem` class in `context_item.py` to pass the file's extension to the `decode_textual_file` function.
- Update the `test_parsers` function in `test_text_file_parsers.py` to pass the file extension to the `decode_textual_file` function.
This commit is contained in:
Reinier van der Leer
2023-12-14 02:04:56 +01:00
parent 5978031f7a
commit e428130e4a
4 changed files with 17 additions and 10 deletions

View File

@@ -150,7 +150,7 @@ def read_file(filename: str | Path, agent: Agent) -> str:
str: The contents of the file
"""
file = agent.workspace.open_file(filename, binary=True)
content = decode_textual_file(file, logger)
content = decode_textual_file(file, os.path.splitext(filename)[1], logger)
# # TODO: invalidate/update memory when file is edited
# file_memory = MemoryItem.from_text_file(content, str(filename), agent.config)

View File

@@ -24,7 +24,10 @@ class ParserStrategy(ABC):
class TXTParser(ParserStrategy):
def read(self, file: BinaryIO) -> str:
charset_match = charset_normalizer.from_bytes(file.read()).best()
logger.debug(f"Reading '{file.name}' with encoding '{charset_match.encoding}'")
logger.debug(
f"Reading {getattr(file, 'name', 'file')} "
f"with encoding '{charset_match.encoding}'"
)
return str(charset_match)
@@ -95,7 +98,9 @@ class FileContext:
self.parser = parser
def decode_file(self, file: BinaryIO) -> str:
self.logger.debug(f"Reading file {file.name} with parser {self.parser}")
self.logger.debug(
f"Reading {getattr(file, 'name', 'file')} with parser {self.parser}"
)
return self.parser.read(file)
@@ -133,15 +138,14 @@ def is_file_binary_fn(file: BinaryIO):
return False
def decode_textual_file(file: BinaryIO, logger: logging.Logger) -> str:
def decode_textual_file(file: BinaryIO, ext: str, logger: logging.Logger) -> str:
if not file.readable():
raise ValueError(f"read_file failed: {file.name} is not a file")
raise ValueError(f"{repr(file)} is not readable")
file_extension = os.path.splitext(file.name)[1].lower()
parser = extension_to_parser.get(file_extension)
parser = extension_to_parser.get(ext.lower())
if not parser:
if is_file_binary_fn(file):
raise ValueError(f"Unsupported binary file format: {file_extension}")
raise ValueError(f"Unsupported binary file format: {ext}")
# fallback to txt file parser (to support script and code files loading)
parser = TXTParser()
file_context = FileContext(parser, logger)

View File

@@ -1,4 +1,5 @@
import logging
import os.path
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Optional
@@ -56,8 +57,9 @@ class FileContextItem(BaseModel, ContextItem):
@property
def content(self) -> str:
# TODO: use workspace.open_file()
with open(self.file_path, "rb") as file:
return decode_textual_file(file, logger)
return decode_textual_file(file, os.path.splitext(file.name)[1], logger)
class FolderContextItem(BaseModel, ContextItem):

View File

@@ -1,5 +1,6 @@
import json
import logging
import os.path
import tempfile
from pathlib import Path
from xml.etree import ElementTree
@@ -159,7 +160,7 @@ binary_files_extensions = [".pdf", ".docx"]
def test_parsers(file_extension, c_file_creator):
created_file_path = Path(c_file_creator())
with open(created_file_path, "rb") as file:
loaded_text = decode_textual_file(file, logger)
loaded_text = decode_textual_file(file, os.path.splitext(file.name)[1], logger)
assert plain_text_str in loaded_text