Files
Auto-GPT/tests/test_text_file_parsers.py
sherif-med 31525dfef7 Text file loaders (#3031)
* adding required packages for loading pdf, docx, md, tex files (preferably pure python packages)

* adding text file utils providing function to load file based on extension && adding read_text_file command

* adding test cases for text file loading (pdf file creation is hardcoded due to external package requirement for creation (a sample file can be added))

* formatting

* changing command name from 'read_text_file' to 'parse_text_document'

* fallback to txtParser if file extension is not known to read script and code files

* adding extension respective parsers

* adding binary file check function

* adding file existence check && raising valueError for unsupported binary file formats

* adding check file type (binary) in test_parsers for specific extensions && fixing mock pdf generation to include null bytes

* adding .yml extension parser

* removal of .doc parser

* updating file loading commands names

* updating test (removing .doc mock function)

* fix: import sort

* new cassette for mem A

* feat: update Cassettes

* feat: consolidate commands

* feat: linting

* feat: updates to cassettes

---------

Co-authored-by: Reinier van der Leer <github@pwuts.nl>
Co-authored-by: Nicholas Tindle <nick@ntindle.com>
Co-authored-by: k-boikov <64261260+k-boikov@users.noreply.github.com>
2023-05-21 14:48:40 -05:00

153 lines
4.6 KiB
Python

import json
import os
import tempfile
from functools import partial
from unittest import TestCase
from xml.etree import ElementTree

import docx
import yaml
from bs4 import BeautifulSoup
from PyPDF2 import PdfWriter

from autogpt.commands.file_operations_utils import is_file_binary_fn, read_textual_file
from autogpt.logs import logger
# Text embedded in the mock files; test_parsers asserts that it appears in the
# text returned by the parser for every mock file.
plain_text_str = "Hello, world!"
def mock_text_file():
    """Create a temporary ``.txt`` file containing ``plain_text_str``.

    The file is created with ``delete=False``, so it stays on disk after it
    is closed.

    Returns:
        The path of the created file.
    """
    txt_file = tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt")
    with txt_file:
        txt_file.write(plain_text_str)
    return txt_file.name
def mock_csv_file():
    """Create a temporary ``.csv`` file containing ``plain_text_str``.

    The content is the plain string itself (a single cell, no delimiter).
    The file is created with ``delete=False``, so it stays on disk after it
    is closed.

    Returns:
        The path of the created file.
    """
    csv_file = tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".csv")
    with csv_file:
        csv_file.write(plain_text_str)
    return csv_file.name
def mock_pdf_file():
    """Create a temporary ``.pdf`` file from a hard-coded PDF byte sequence.

    The document is written by hand (no PDF library is used) and its single
    content stream draws the text "Hello, world!". The file is created with
    ``delete=False``, so it stays on disk after it is closed.

    Returns:
        The path of the created file.
    """
    pdf_chunks = (
        # PDF header
        b"%PDF-1.7\n",
        # Document catalog
        b"1 0 obj\n",
        b"<< /Type /Catalog /Pages 2 0 R >>\n",
        b"endobj\n",
        # Page object
        b"2 0 obj\n",
        b"<< /Type /Page /Parent 1 0 R /Resources << /Font << /F1 3 0 R >> >> /MediaBox [0 0 612 792] /Contents 4 0 R >>\n",
        b"endobj\n",
        # Font object
        b"3 0 obj\n",
        b"<< /Type /Font /Subtype /Type1 /Name /F1 /BaseFont /Helvetica-Bold >>\n",
        b"endobj\n",
        # Page contents object holding the text-drawing operators
        b"4 0 obj\n",
        b"<< /Length 25 >>\n",
        b"stream\n",
        b"BT\n/F1 12 Tf\n72 720 Td\n(Hello, world!) Tj\nET\n",
        b"endstream\n",
        b"endobj\n",
        # Cross-reference table
        b"xref\n",
        b"0 5\n",
        b"0000000000 65535 f \n",
        b"0000000017 00000 n \n",
        b"0000000073 00000 n \n",
        b"0000000123 00000 n \n",
        b"0000000271 00000 n \n",
        # Trailer
        b"trailer\n",
        b"<< /Size 5 /Root 1 0 R >>\n",
        b"startxref\n",
        b"380\n",
        b"%%EOF\n",
        # Trailing NUL byte; test_parsers expects .pdf files to be
        # classified as binary.
        b"\x00",
    )
    with tempfile.NamedTemporaryFile(mode="wb", delete=False, suffix=".pdf") as pdf_file:
        pdf_file.write(b"".join(pdf_chunks))
        pdf_path = pdf_file.name
    return pdf_path
def mock_docx_file():
    """Create a temporary ``.docx`` file with one paragraph of ``plain_text_str``.

    The file is created with ``delete=False``, so it stays on disk after it
    is closed.

    Returns:
        The path of the created file.
    """
    with tempfile.NamedTemporaryFile(mode="wb", delete=False, suffix=".docx") as f:
        document = docx.Document()
        document.add_paragraph(plain_text_str)
        # Save through the already-open handle instead of reopening the file
        # by name: opening a second handle on a still-open temporary file
        # fails on Windows and is redundant elsewhere.
        document.save(f)
    return f.name
def mock_json_file():
    """Create a temporary ``.json`` file holding ``{"text": plain_text_str}``.

    The file is created with ``delete=False``, so it stays on disk after it
    is closed.

    Returns:
        The path of the created file.
    """
    payload = {"text": plain_text_str}
    json_file = tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".json")
    with json_file:
        json.dump(payload, json_file)
    return json_file.name
def mock_xml_file():
    """Create a temporary ``.xml`` file with a ``<text>`` root element.

    The root element's text is ``plain_text_str``. The file is created with
    ``delete=False``, so it stays on disk after it is closed.

    Returns:
        The path of the created file.
    """
    text_element = ElementTree.Element("text")
    text_element.text = plain_text_str
    xml_tree = ElementTree.ElementTree(text_element)
    xml_file = tempfile.NamedTemporaryFile(mode="wb", delete=False, suffix=".xml")
    with xml_file:
        xml_tree.write(xml_file)
    return xml_file.name
def mock_yaml_file():
    """Create a temporary ``.yaml`` file holding ``{"text": plain_text_str}``.

    The file is created with ``delete=False``, so it stays on disk after it
    is closed.

    Returns:
        The path of the created file.
    """
    payload = {"text": plain_text_str}
    yaml_file = tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".yaml")
    with yaml_file:
        yaml.dump(payload, yaml_file)
    return yaml_file.name
def mock_html_file():
    """Create a temporary ``.html`` file whose body contains ``plain_text_str``.

    The markup is passed through BeautifulSoup's ``html.parser`` and the
    resulting document is written back out as a string. The file is created
    with ``delete=False``, so it stays on disk after it is closed.

    Returns:
        The path of the created file.
    """
    markup = f"<html><head><title>This is a test</title></head><body><p>{plain_text_str}</p></body></html>"
    soup = BeautifulSoup(markup, "html.parser")
    html_file = tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".html")
    with html_file:
        html_file.write(str(soup))
    return html_file.name
def mock_md_file():
    """Create a temporary ``.md`` file with ``plain_text_str`` as a heading.

    The heading line is ``plain_text_str`` followed by an extra ``!``. The
    file is created with ``delete=False``, so it stays on disk after it is
    closed.

    Returns:
        The path of the created file.
    """
    md_file = tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".md")
    with md_file:
        md_file.write(f"# {plain_text_str}!\n")
    return md_file.name
def mock_latex_file():
    """Create a temporary ``.tex`` file wrapping ``plain_text_str``.

    The text is placed inside a minimal ``article`` document environment.
    The file is created with ``delete=False``, so it stays on disk after it
    is closed.

    Returns:
        The path of the created file.
    """
    tex_file = tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".tex")
    with tex_file:
        tex_file.write(
            rf"\documentclass{{article}}\begin{{document}}{plain_text_str}\end{{document}}"
        )
    return tex_file.name
# Maps each file extension under test to the zero-argument factory that
# creates a temporary mock file with that suffix and returns its path.
respective_file_creation_functions = {
    ".txt": mock_text_file,
    ".csv": mock_csv_file,
    ".pdf": mock_pdf_file,
    ".docx": mock_docx_file,
    ".json": mock_json_file,
    ".xml": mock_xml_file,
    ".yaml": mock_yaml_file,
    ".html": mock_html_file,
    ".md": mock_md_file,
    ".tex": mock_latex_file,
}
class TestConfig(TestCase):
    """Tests for ``read_textual_file`` and ``is_file_binary_fn`` on mock files."""

    def test_parsers(self):
        """Parse one mock file per extension and check text and binary flag.

        For every entry in ``respective_file_creation_functions`` the mock
        file is created, read with ``read_textual_file`` and checked to
        contain ``plain_text_str``. ``is_file_binary_fn`` must report binary
        only for the extensions listed in ``binary_files_extensions``.
        """
        binary_files_extensions = [".pdf", ".docx"]
        for (
            file_extension,
            c_file_creator,
        ) in respective_file_creation_functions.items():
            # subTest labels any failure with the extension being checked.
            with self.subTest(file_extension=file_extension):
                created_filepath = c_file_creator()
                # The mock files are created with delete=False, so remove
                # them once the test finishes, even when an assertion fails.
                self.addCleanup(os.remove, created_filepath)
                loaded_text = read_textual_file(created_filepath, logger)
                self.assertIn(plain_text_str, loaded_text)
                should_be_binary = file_extension in binary_files_extensions
                self.assertEqual(
                    should_be_binary, is_file_binary_fn(created_filepath)
                )