Text file loaders (#3031)

* adding required packages for loading pdf, docx, md, tex files (preferably pure python packages) * adding text file utils providing function to load file based on extension && adding read_text_file command * adding test cases for text file loading (pdf file creation is hardcoded due to external package requirement for creation (a sample file can be added)) * formatting * changing command name from 'read_text_file' to 'parse_text_document' * fallback to txtParser if file extension is not known to read script and code files * adding extension respective parsers * adding binary file check function * adding file existence check && raising valueError for unsupported binary file formats * adding check file type (binary) in test_parsers for specific extensions && fixing mock pdf generation to include null bytes * adding .yml extension parser * removal of .doc parser * updating file loading commands names * updating test (removing .doc mock function) * fix: import sort * new cassette for mem A * feat: update Cassettes * feat: consolidate commands * feat: linting * feat: updates to cassettes --------- Co-authored-by: Reinier van der Leer <github@pwuts.nl> Co-authored-by: Nicholas Tindle <nick@ntindle.com> Co-authored-by: k-boikov <64261260+k-boikov@users.noreply.github.com>
2025-12-20 23:44:19 +01:00 · 2023-05-21 21:48:40 +02:00
parent e1c6778c3a
commit 31525dfef7
7 changed files with 4053 additions and 622 deletions
--- a/autogpt/commands/file_operations_utils.py
+++ b/autogpt/commands/file_operations_utils.py
@@ -0,0 +1,159 @@
+import json
+import os
+
+import charset_normalizer
+import docx
+import markdown
+import PyPDF2
+import yaml
+from bs4 import BeautifulSoup
+from pylatexenc.latex2text import LatexNodes2Text
+
+from autogpt import logs
+
+
+class ParserStrategy:
+    """Interface for format-specific readers that turn a file into text.
+
+    Subclasses override ``read``; the base implementation always raises.
+    """
+
+    def read(self, file_path: str) -> str:
+        """Return the text extracted from the file at ``file_path``.
+
+        Raises:
+            NotImplementedError: Always, in this base class.
+        """
+        raise NotImplementedError()
+
+
+# Basic text file reading
+class TXTParser(ParserStrategy):
+    """Read plain-text files, guessing their character encoding."""
+
+    def read(self, file_path):
+        """Return the decoded contents of ``file_path``.
+
+        Raises:
+            ValueError: If charset_normalizer finds no plausible encoding.
+        """
+        charset_match = charset_normalizer.from_path(file_path).best()
+        if charset_match is None:
+            # best() yields None when nothing decodes plausibly; str(None) would
+            # otherwise hand the literal text "None" back as the file's content.
+            raise ValueError(f"Unable to detect a text encoding for {file_path}")
+        # str() on a CharsetMatch gives the payload decoded with the detected
+        # encoding.
+        return str(charset_match)
+
+
+# Reading text from binary file using pdf parser
+class PDFParser(ParserStrategy):
+    """Extract text from PDF documents with PyPDF2."""
+
+    def read(self, file_path):
+        """Return the text of every page of ``file_path``, joined in page order."""
+        reader = PyPDF2.PdfReader(file_path)
+        return "".join(page.extract_text() for page in reader.pages)
+
+
+# Reading text from binary file using docx parser
+class DOCXParser(ParserStrategy):
+    """Extract text from Word ``.docx`` documents with python-docx."""
+
+    def read(self, file_path):
+        """Return the text of the document's paragraphs, one paragraph per line."""
+        doc_file = docx.Document(file_path)
+        # Paragraph.text carries no trailing line break, so joining with "" would
+        # fuse the last word of one paragraph with the first word of the next.
+        return "\n".join(para.text for para in doc_file.paragraphs)
+
+
+# Reading as dictionary and returning string format
+class JSONParser(ParserStrategy):
+    """Parse a JSON file and return its data in Python ``str()`` form."""
+
+    def read(self, file_path):
+        """Return ``str()`` of the object decoded from the JSON file.
+
+        The result is the Python rendering of the data (single quotes, ``None``,
+        ``True``), not re-serialized JSON.
+
+        Raises:
+            json.JSONDecodeError: If the file does not contain valid JSON.
+        """
+        # Binary mode lets json detect UTF-8/16/32 (with or without BOM) itself
+        # instead of decoding with the platform's locale encoding.
+        with open(file_path, "rb") as f:
+            data = json.load(f)
+        return str(data)
+
+
+class XMLParser(ParserStrategy):
+    """Extract the text content of an XML file with BeautifulSoup."""
+
+    def read(self, file_path):
+        """Return the concatenated text nodes of the XML document."""
+        # Hand BeautifulSoup raw bytes so it can work out the encoding from the
+        # document itself (BOM / XML declaration) rather than the locale default.
+        with open(file_path, "rb") as f:
+            soup = BeautifulSoup(f, "xml")
+        return soup.get_text()
+
+
+# Reading as dictionary and returning string format
+class YAMLParser(ParserStrategy):
+    """Parse a YAML file and return its data in Python ``str()`` form."""
+
+    def read(self, file_path):
+        """Return ``str()`` of the object loaded from the YAML file."""
+        # Binary mode lets PyYAML pick the encoding (UTF-8, or UTF-16 via BOM)
+        # instead of relying on the platform's locale encoding.
+        with open(file_path, "rb") as f:
+            # NOTE(review): FullLoader can construct more than plain data types;
+            # if these files may be untrusted, yaml.safe_load is the safer choice.
+            data = yaml.load(f, Loader=yaml.FullLoader)
+        return str(data)
+
+
+class HTMLParser(ParserStrategy):
+    """Extract the visible text of an HTML file with BeautifulSoup."""
+
+    def read(self, file_path):
+        """Return the concatenated text nodes of the HTML document."""
+        # Hand BeautifulSoup raw bytes so it can detect the encoding from the
+        # document (BOM / <meta charset>) rather than the locale default.
+        with open(file_path, "rb") as f:
+            soup = BeautifulSoup(f, "html.parser")
+        return soup.get_text()
+
+
+class MarkdownParser(ParserStrategy):
+    """Render Markdown to HTML, then strip the markup to plain text."""
+
+    def read(self, file_path):
+        """Return the text of the Markdown file with formatting removed."""
+        # Explicit encoding: the default for open() depends on the platform
+        # locale, which garbles or rejects UTF-8 files on some systems.
+        with open(file_path, "r", encoding="utf-8") as f:
+            html = markdown.markdown(f.read())
+        # find_all is the current name of the deprecated findAll alias.
+        return "".join(BeautifulSoup(html, "html.parser").find_all(string=True))
+
+
+class LaTeXParser(ParserStrategy):
+    """Convert LaTeX source to plain text with pylatexenc."""
+
+    def read(self, file_path):
+        """Return the plain-text rendering of the LaTeX file."""
+        # Explicit encoding: the default for open() depends on the platform
+        # locale, which garbles or rejects UTF-8 files on some systems.
+        with open(file_path, "r", encoding="utf-8") as f:
+            latex = f.read()
+        return LatexNodes2Text().latex_to_text(latex)
+
+
+class FileContext:
+    """Holds the parser currently in use and reads files through it."""
+
+    def __init__(self, parser: ParserStrategy, logger: logs.Logger):
+        self.parser, self.logger = parser, logger
+
+    def set_parser(self, parser: ParserStrategy):
+        """Switch to a different parser, logging the change at debug level."""
+        self.logger.debug(f"Setting Context Parser to {parser}")
+        self.parser = parser
+
+    def read_file(self, file_path):
+        """Log the read at debug level and return the active parser's output."""
+        active_parser = self.parser
+        self.logger.debug(f"Reading file {file_path} with parser {active_parser}")
+        return active_parser.read(file_path)
+
+
+# Maps a lower-case file extension (including the leading dot) to the parser
+# instance used for it. read_textual_file looks extensions up here and falls
+# back to TXTParser for anything that is not listed.
+extension_to_parser = {
+    ".txt": TXTParser(),
+    ".csv": TXTParser(),
+    ".pdf": PDFParser(),
+    ".docx": DOCXParser(),
+    ".json": JSONParser(),
+    ".xml": XMLParser(),
+    ".yaml": YAMLParser(),
+    ".yml": YAMLParser(),
+    ".html": HTMLParser(),
+    ".htm": HTMLParser(),
+    ".xhtml": HTMLParser(),
+    ".md": MarkdownParser(),
+    ".markdown": MarkdownParser(),
+    ".tex": LaTeXParser(),
+}
+
+
+def is_file_binary_fn(file_path: str) -> bool:
+    """Report whether a file contains a null byte, which marks it as binary.
+
+    The file is scanned in fixed-size chunks so large files are never loaded
+    into memory in one piece.
+
+    Args:
+        file_path (str): Path of the file to inspect.
+
+    Returns:
+        bool: True if a null byte occurs anywhere in the file, False otherwise
+            (including for an empty file).
+    """
+    with open(file_path, "rb") as f:
+        # iter() with a sentinel keeps reading until read() returns b"" at EOF.
+        return any(b"\x00" in chunk for chunk in iter(lambda: f.read(65536), b""))
+
+
+def read_textual_file(file_path: str, logger: logs.Logger):
+    """Read a file as text, choosing the parser from the file extension.
+
+    Files with an unrecognized extension are read as plain text, unless they
+    contain a null byte, in which case they are rejected as binary.
+
+    Args:
+        file_path (str): Path of the file to read.
+        logger (logs.Logger): Logger handed to the FileContext for debug output.
+
+    Returns:
+        The text produced by the selected parser.
+
+    Raises:
+        FileNotFoundError: If ``file_path`` is not an existing regular file.
+        ValueError: If the extension is unknown and the file looks binary.
+    """
+    if not os.path.isfile(file_path):
+        raise FileNotFoundError(f"{file_path} not found!")
+    file_extension = os.path.splitext(file_path)[1].lower()
+    parser = extension_to_parser.get(file_extension)
+    if parser is None:
+        # Only sniff for binary content when it matters: the check reads the
+        # file, which is wasted work when a dedicated parser handles it anyway.
+        if is_file_binary_fn(file_path):
+            raise ValueError(f"Unsupported binary file format: {file_extension}")
+        # fallback to txt file parser (to support script and code files loading)
+        parser = TXTParser()
+    file_context = FileContext(parser, logger)
+    return file_context.read_file(file_path)