mirror of
https://github.com/aljazceru/Auto-GPT.git
synced 2025-12-19 06:54:22 +01:00
* adding required packages for loading pdf, docx, md, tex files (preferably pure python packages) * adding text file utils providing function to load file based on extension && adding read_text_file command * adding test cases for text file loading (pdf file creation is hardcoded due to external package requirement for creation (a sample file can be added)) * formatting * changing command name from 'read_text_file' to 'parse_text_document' * fallback to txtParser if file extension is not known to read script and code files * adding extension respective parsers * adding binary file check function * adding file existence check && raising ValueError for unsupported binary file formats * adding check file type (binary) in test_parsers for specific extensions && fixing mock pdf generation to include null bytes * adding .yml extension parser * removal of .doc parser * updating file loading commands names * updating test (removing .doc mock function) * fix: import sort * new cassette for mem A * feat: update Cassettes * feat: consolidate commands * feat: linting * feat: updates to cassettes --------- Co-authored-by: Reinier van der Leer <github@pwuts.nl> Co-authored-by: Nicholas Tindle <nick@ntindle.com> Co-authored-by: k-boikov <64261260+k-boikov@users.noreply.github.com>
153 lines
4.6 KiB
Python
153 lines
4.6 KiB
Python
import json
|
|
import tempfile
|
|
from functools import partial
|
|
from unittest import TestCase
|
|
from xml.etree import ElementTree
|
|
|
|
import docx
|
|
import yaml
|
|
from bs4 import BeautifulSoup
|
|
from PyPDF2 import PdfWriter
|
|
|
|
from autogpt.commands.file_operations_utils import is_file_binary_fn, read_textual_file
|
|
from autogpt.logs import logger
|
|
|
|
# Sample text embedded in every generated mock file; the test asserts that it
# appears in the text returned by the parser.
plain_text_str = "Hello, world!"
|
|
|
|
|
|
def mock_text_file():
    """Create a temporary ``.txt`` file holding ``plain_text_str``.

    Returns:
        The path of the created file. It is opened with ``delete=False``,
        so it stays on disk after this function returns.
    """
    tmp = tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt")
    with tmp:
        tmp.write(plain_text_str)
    # The name is returned only after the file has been closed (and flushed).
    return tmp.name
|
|
|
|
|
|
def mock_csv_file():
    """Create a temporary ``.csv`` file holding ``plain_text_str``.

    Returns:
        The path of the created file. It is opened with ``delete=False``,
        so it stays on disk after this function returns.
    """
    with tempfile.NamedTemporaryFile(
        mode="w", delete=False, suffix=".csv"
    ) as csv_file:
        csv_path = csv_file.name
        csv_file.write(plain_text_str)
    return csv_path
|
|
|
|
|
|
def mock_pdf_file():
    """Create a temporary ``.pdf`` file from a hand-written byte sequence.

    The document is assembled from literal PDF syntax (header, catalog, page,
    font, content stream, cross-reference table, trailer) and contains the
    text ``Hello, world!``. A single NUL byte is appended after ``%%EOF``.

    Returns:
        The path of the created file. It is opened with ``delete=False``,
        so it stays on disk after this function returns.
    """
    # NOTE(review): the declared /Length (25) and the xref offsets appear not
    # to match the actual byte layout; they are kept verbatim on purpose so the
    # generated file stays exactly what the test suite was written against.
    pdf_chunks = (
        # PDF header
        b"%PDF-1.7\n",
        # Document catalog
        b"1 0 obj\n",
        b"<< /Type /Catalog /Pages 2 0 R >>\n",
        b"endobj\n",
        # Page object
        b"2 0 obj\n",
        b"<< /Type /Page /Parent 1 0 R /Resources << /Font << /F1 3 0 R >> >> /MediaBox [0 0 612 792] /Contents 4 0 R >>\n",
        b"endobj\n",
        # Font object
        b"3 0 obj\n",
        b"<< /Type /Font /Subtype /Type1 /Name /F1 /BaseFont /Helvetica-Bold >>\n",
        b"endobj\n",
        # Page contents object
        b"4 0 obj\n",
        b"<< /Length 25 >>\n",
        b"stream\n",
        b"BT\n/F1 12 Tf\n72 720 Td\n(Hello, world!) Tj\nET\n",
        b"endstream\n",
        b"endobj\n",
        # Cross-reference table
        b"xref\n",
        b"0 5\n",
        b"0000000000 65535 f \n",
        b"0000000017 00000 n \n",
        b"0000000073 00000 n \n",
        b"0000000123 00000 n \n",
        b"0000000271 00000 n \n",
        # Trailer
        b"trailer\n",
        b"<< /Size 5 /Root 1 0 R >>\n",
        b"startxref\n",
        b"380\n",
        b"%%EOF\n",
        # Trailing NUL byte -- presumably so the file is detected as binary
        b"\x00",
    )
    with tempfile.NamedTemporaryFile(
        mode="wb", delete=False, suffix=".pdf"
    ) as pdf_file:
        pdf_file.writelines(pdf_chunks)
    return pdf_file.name
|
|
|
|
|
|
def mock_docx_file():
    """Create a temporary ``.docx`` file with one paragraph of ``plain_text_str``.

    Returns:
        The path of the created file. It is opened with ``delete=False``,
        so it stays on disk after this function returns.
    """
    with tempfile.NamedTemporaryFile(
        mode="wb", delete=False, suffix=".docx"
    ) as docx_file:
        docx_path = docx_file.name
        word_document = docx.Document()
        word_document.add_paragraph(plain_text_str)
        # The document is saved by path, not through the open handle.
        word_document.save(docx_path)
    return docx_path
|
|
|
|
|
|
def mock_json_file():
    """Create a temporary ``.json`` file containing ``{"text": plain_text_str}``.

    Returns:
        The path of the created file. It is opened with ``delete=False``,
        so it stays on disk after this function returns.
    """
    payload = {"text": plain_text_str}
    json_file = tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".json")
    with json_file:
        json.dump(payload, json_file)
    return json_file.name
|
|
|
|
|
|
def mock_xml_file():
    """Create a temporary ``.xml`` file with a single ``<text>`` element.

    The element's text content is ``plain_text_str``.

    Returns:
        The path of the created file. It is opened with ``delete=False``,
        so it stays on disk after this function returns.
    """
    text_element = ElementTree.Element("text")
    text_element.text = plain_text_str
    xml_document = ElementTree.ElementTree(text_element)

    xml_file = tempfile.NamedTemporaryFile(mode="wb", delete=False, suffix=".xml")
    with xml_file:
        xml_document.write(xml_file)
    return xml_file.name
|
|
|
|
|
|
def mock_yaml_file():
    """Create a temporary ``.yaml`` file containing a ``text`` key.

    The value stored under ``text`` is ``plain_text_str``.

    Returns:
        The path of the created file. It is opened with ``delete=False``,
        so it stays on disk after this function returns.
    """
    payload = {"text": plain_text_str}
    with tempfile.NamedTemporaryFile(
        mode="w", delete=False, suffix=".yaml"
    ) as yaml_file:
        yaml_path = yaml_file.name
        yaml.dump(payload, yaml_file)
    return yaml_path
|
|
|
|
|
|
def mock_html_file():
    """Create a temporary ``.html`` file whose body paragraph is ``plain_text_str``.

    The markup is passed through BeautifulSoup's ``html.parser`` and the
    string form of the resulting soup is what gets written.

    Returns:
        The path of the created file. It is opened with ``delete=False``,
        so it stays on disk after this function returns.
    """
    markup = f"<html><head><title>This is a test</title></head><body><p>{plain_text_str}</p></body></html>"
    rendered = str(BeautifulSoup(markup, "html.parser"))

    html_file = tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".html")
    with html_file:
        html_file.write(rendered)
    return html_file.name
|
|
|
|
|
|
def mock_md_file():
    """Create a temporary ``.md`` file with ``plain_text_str`` as a heading.

    The heading line is ``plain_text_str`` followed by an extra ``!``.

    Returns:
        The path of the created file. It is opened with ``delete=False``,
        so it stays on disk after this function returns.
    """
    heading_line = f"# {plain_text_str}!\n"
    md_file = tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".md")
    with md_file:
        md_file.write(heading_line)
    return md_file.name
|
|
|
|
|
|
def mock_latex_file():
    """Create a temporary ``.tex`` file wrapping ``plain_text_str`` in a document.

    Returns:
        The path of the created file. It is opened with ``delete=False``,
        so it stays on disk after this function returns.
    """
    with tempfile.NamedTemporaryFile(
        mode="w", delete=False, suffix=".tex"
    ) as tex_file:
        tex_path = tex_file.name
        # Doubled braces are literal braces inside the raw f-string.
        tex_file.write(
            rf"\documentclass{{article}}\begin{{document}}{plain_text_str}\end{{document}}"
        )
    return tex_path
|
|
|
|
|
|
# Maps each file extension under test to the zero-argument factory that
# creates a temporary sample file of that type and returns its path.
respective_file_creation_functions = {
    # Plain-text formats
    ".txt": mock_text_file,
    ".csv": mock_csv_file,
    # Formats the test expects to be detected as binary
    ".pdf": mock_pdf_file,
    ".docx": mock_docx_file,
    # Structured / markup text formats
    ".json": mock_json_file,
    ".xml": mock_xml_file,
    ".yaml": mock_yaml_file,
    ".html": mock_html_file,
    ".md": mock_md_file,
    ".tex": mock_latex_file,
}
|
|
|
|
|
|
class TestConfig(TestCase):
    """Tests for loading textual content from the supported file types."""

    def test_parsers(self):
        """Generate one sample file per extension and check how it is read.

        For every entry in ``respective_file_creation_functions`` the test
        verifies that ``read_textual_file`` returns text containing
        ``plain_text_str`` and that ``is_file_binary_fn`` reports the file as
        binary exactly for the extensions listed in ``binary_files_extensions``.
        """
        # Local import: only needed here, for removing the generated files.
        import os

        binary_files_extensions = [".pdf", ".docx"]
        for (
            file_extension,
            c_file_creator,
        ) in respective_file_creation_functions.items():
            # One sub-test per extension so a failure names the offending
            # file type and does not hide the remaining ones.
            with self.subTest(file_extension=file_extension):
                created_filepath = c_file_creator()
                # The factories create files with delete=False; remove them
                # once the test finishes so runs do not leak temp files.
                self.addCleanup(os.remove, created_filepath)

                loaded_text = read_textual_file(created_filepath, logger)
                self.assertIn(plain_text_str, loaded_text)

                should_be_binary = file_extension in binary_files_extensions
                self.assertEqual(
                    should_be_binary, is_file_binary_fn(created_filepath)
                )
|