Text file loaders (#3031)

* adding requiered packages for loading pdf, docx, md, tex files (preferably pure python packages)

* adding text file utils providing function to load file based on extension && adding read_text_file command

* adding test cases for text file loading (pdf file creation is hardcoded due to external package requierment for creation (a sample file can be added))

* formatting

* changing command name from 'read_text_file' to 'parse_text_document'

* fallback to txtParser if file extension is not known to read script and code files

* adding extension respective parsers

* adding binary file check function

* adding file existance check && raising valueError for unsupported binary file formats

* adding check file type (binary) in test_parsers for specific extensions && fixing mock pdf generation to include null bytes

* adding .yml extension parser

* removal of .doc parser

* updating file loading commands names

* updating test (removing .doc mock function)

* fix: import sort

* new cassette for mem A

* feat: update Cassettes

* feat: consolidate commands

* feat: linting

* feat: updates to cassettes

---------

Co-authored-by: Reinier van der Leer <github@pwuts.nl>
Co-authored-by: Nicholas Tindle <nick@ntindle.com>
Co-authored-by: k-boikov <64261260+k-boikov@users.noreply.github.com>
This commit is contained in:
sherif-med
2023-05-21 21:48:40 +02:00
committed by GitHub
parent e1c6778c3a
commit 31525dfef7
7 changed files with 4053 additions and 622 deletions

View File

@@ -0,0 +1,159 @@
import json
import os
import charset_normalizer
import docx
import markdown
import PyPDF2
import yaml
from bs4 import BeautifulSoup
from pylatexenc.latex2text import LatexNodes2Text
from autogpt import logs
class ParserStrategy:
def read(self, file_path: str):
raise NotImplementedError
# Basic text file reading
class TXTParser(ParserStrategy):
def read(self, file_path):
charset_match = charset_normalizer.from_path(file_path).best()
return str(charset_match)
# Reading text from binary file using pdf parser
class PDFParser(ParserStrategy):
def read(self, file_path):
parser = PyPDF2.PdfReader(file_path)
text = ""
for page_idx in range(len(parser.pages)):
text += parser.pages[page_idx].extract_text()
return text
# Reading text from binary file using docs parser
class DOCXParser(ParserStrategy):
def read(self, file_path):
doc_file = docx.Document(file_path)
text = ""
for para in doc_file.paragraphs:
text += para.text
return text
# Reading as dictionary and returning string format
class JSONParser(ParserStrategy):
def read(self, file_path):
with open(file_path, "r") as f:
data = json.load(f)
text = str(data)
return text
class XMLParser(ParserStrategy):
def read(self, file_path):
with open(file_path, "r") as f:
soup = BeautifulSoup(f, "xml")
text = soup.get_text()
return text
# Reading as dictionary and returning string format
class YAMLParser(ParserStrategy):
def read(self, file_path):
with open(file_path, "r") as f:
data = yaml.load(f, Loader=yaml.FullLoader)
text = str(data)
return text
class HTMLParser(ParserStrategy):
def read(self, file_path):
with open(file_path, "r") as f:
soup = BeautifulSoup(f, "html.parser")
text = soup.get_text()
return text
class MarkdownParser(ParserStrategy):
def read(self, file_path):
with open(file_path, "r") as f:
html = markdown.markdown(f.read())
text = "".join(BeautifulSoup(html, "html.parser").findAll(string=True))
return text
class LaTeXParser(ParserStrategy):
def read(self, file_path):
with open(file_path, "r") as f:
latex = f.read()
text = LatexNodes2Text().latex_to_text(latex)
return text
class FileContext:
def __init__(self, parser: ParserStrategy, logger: logs.Logger):
self.parser = parser
self.logger = logger
def set_parser(self, parser: ParserStrategy):
self.logger.debug(f"Setting Context Parser to {parser}")
self.parser = parser
def read_file(self, file_path):
self.logger.debug(f"Reading file {file_path} with parser {self.parser}")
return self.parser.read(file_path)
extension_to_parser = {
".txt": TXTParser(),
".csv": TXTParser(),
".pdf": PDFParser(),
".docx": DOCXParser(),
".json": JSONParser(),
".xml": XMLParser(),
".yaml": YAMLParser(),
".yml": YAMLParser(),
".html": HTMLParser(),
".htm": HTMLParser(),
".xhtml": HTMLParser(),
".md": MarkdownParser(),
".markdown": MarkdownParser(),
".tex": LaTeXParser(),
}
def is_file_binary_fn(file_path: str):
"""Given a file path load all its content and checks if the null bytes is present
Args:
file_path (_type_): _description_
Returns:
bool: is_binary
"""
with open(file_path, "rb") as f:
file_data = f.read()
if b"\x00" in file_data:
return True
return False
def read_textual_file(file_path: str, logger: logs.Logger):
if not os.path.isfile(file_path):
raise FileNotFoundError(f"{file_path} not found!")
is_binary = is_file_binary_fn(file_path)
file_extension = os.path.splitext(file_path)[1].lower()
parser = extension_to_parser.get(file_extension)
if not parser:
if is_binary:
raise ValueError(
"Unsupported binary file format: {}".format(file_extension)
)
# fallback to txt file parser (to support script and code files loading)
parser = TXTParser()
file_context = FileContext(parser, logger)
return file_context.read_file(file_path)