Text file loaders (#3031)

* adding required packages for loading pdf, docx, md, tex files (preferably pure python packages) * adding text file utils providing function to load file based on extension && adding read_text_file command * adding test cases for text file loading (pdf file creation is hardcoded due to external package requirement for creation (a sample file can be added)) * formatting * changing command name from 'read_text_file' to 'parse_text_document' * fallback to txtParser if file extension is not known to read script and code files * adding extension respective parsers * adding binary file check function * adding file existence check && raising ValueError for unsupported binary file formats * adding check file type (binary) in test_parsers for specific extensions && fixing mock pdf generation to include null bytes * adding .yml extension parser * removal of .doc parser * updating file loading commands names * updating test (removing .doc mock function) * fix: import sort * new cassette for mem A * feat: update Cassettes * feat: consolidate commands * feat: linting * feat: updates to cassettes --------- Co-authored-by: Reinier van der Leer <github@pwuts.nl> Co-authored-by: Nicholas Tindle <nick@ntindle.com> Co-authored-by: k-boikov <64261260+k-boikov@users.noreply.github.com>
2025-12-20 15:34:23 +01:00 · 2023-05-21 21:48:40 +02:00
parent e1c6778c3a
commit 31525dfef7
7 changed files with 4053 additions and 622 deletions
--- a/autogpt/commands/file_operations.py
+++ b/autogpt/commands/file_operations.py
@@ -12,6 +12,7 @@ from colorama import Back, Fore
 from requests.adapters import HTTPAdapter, Retry

 from autogpt.commands.command import command
+from autogpt.commands.file_operations_utils import read_textual_file
 from autogpt.config import Config
 from autogpt.logs import logger
 from autogpt.spinner import Spinner
@@ -143,7 +144,7 @@ def split_file(
        start += max_length - overlap


-@command("read_file", "Read file", '"filename": "<filename>"')
+@command("read_file", "Read a file", '"filename": "<filename>"')
 def read_file(filename: str) -> str:
    """Read a file and return the contents

@@ -154,12 +155,10 @@ def read_file(filename: str) -> str:
        str: The contents of the file
    """
    try:
-        charset_match = charset_normalizer.from_path(filename).best()
-        encoding = charset_match.encoding
-        logger.debug(f"Read file '{filename}' with encoding '{encoding}'")
-        return str(charset_match)
-    except Exception as err:
-        return f"Error: {err}"
+        content = read_textual_file(filename, logger)
+        return content
+    except Exception as e:
+        return f"Error: {str(e)}"


 def ingest_file(