Text file loaders (#3031)

* adding required packages for loading pdf, docx, md, tex files (preferably pure Python packages)

* adding text file utils that provide a function to load a file based on its extension && adding a read_text_file command

* adding test cases for text file loading (PDF file creation is hardcoded because generating a real PDF would require an external package; a sample file could be added instead)

* formatting

* changing command name from 'read_text_file' to 'parse_text_document'

* falling back to TXTParser when the file extension is unknown, so script and code files can still be read

* adding extension-specific parsers

* adding a binary file check function

* adding a file existence check && raising ValueError for unsupported binary file formats

* adding a binary file type check in test_parsers for specific extensions && fixing mock PDF generation to include null bytes

* adding .yml extension parser

* removing the .doc parser

* updating file loading command names

* updating tests (removing the .doc mock function)

* fix: import sort

* new cassette for mem A

* feat: update Cassettes

* feat: consolidate commands

* feat: linting

* feat: updates to cassettes

---------

Co-authored-by: Reinier van der Leer <github@pwuts.nl>
Co-authored-by: Nicholas Tindle <nick@ntindle.com>
Co-authored-by: k-boikov <64261260+k-boikov@users.noreply.github.com>
sherif-med
2023-05-21 21:48:40 +02:00
committed by GitHub
parent e1c6778c3a
commit 31525dfef7
7 changed files with 4053 additions and 622 deletions

View File

@@ -12,6 +12,7 @@ from colorama import Back, Fore
 from requests.adapters import HTTPAdapter, Retry
 
 from autogpt.commands.command import command
+from autogpt.commands.file_operations_utils import read_textual_file
 from autogpt.config import Config
 from autogpt.logs import logger
 from autogpt.spinner import Spinner
@@ -143,7 +144,7 @@ def split_file(
         start += max_length - overlap
 
 
-@command("read_file", "Read file", '"filename": "<filename>"')
+@command("read_file", "Read a file", '"filename": "<filename>"')
 def read_file(filename: str) -> str:
     """Read a file and return the contents
 
@@ -154,12 +155,10 @@ def read_file(filename: str) -> str:
         str: The contents of the file
     """
     try:
-        charset_match = charset_normalizer.from_path(filename).best()
-        encoding = charset_match.encoding
-        logger.debug(f"Read file '{filename}' with encoding '{encoding}'")
-        return str(charset_match)
-    except Exception as err:
-        return f"Error: {err}"
+        content = read_textual_file(filename, logger)
+        return content
+    except Exception as e:
+        return f"Error: {str(e)}"
 
 
 def ingest_file(
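
A quick illustration of the change above (a sketch, not part of the diff): read_file now delegates to read_textual_file, which picks a parser based on the file extension. The file name below is hypothetical and must exist on disk.

# Hypothetical usage of the updated command function
from autogpt.commands.file_operations import read_file

text = read_file("report.pdf")  # dispatched to PDFParser via its extension
print(text[:200])               # first 200 characters of the extracted text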

View File

@@ -0,0 +1,159 @@
import json
import os

import charset_normalizer
import docx
import markdown
import PyPDF2
import yaml
from bs4 import BeautifulSoup
from pylatexenc.latex2text import LatexNodes2Text

from autogpt import logs


class ParserStrategy:
    def read(self, file_path: str):
        raise NotImplementedError


# Basic text file reading
class TXTParser(ParserStrategy):
    def read(self, file_path):
        charset_match = charset_normalizer.from_path(file_path).best()
        return str(charset_match)


# Reading text from binary file using pdf parser
class PDFParser(ParserStrategy):
    def read(self, file_path):
        parser = PyPDF2.PdfReader(file_path)
        text = ""
        for page_idx in range(len(parser.pages)):
            text += parser.pages[page_idx].extract_text()
        return text


# Reading text from binary file using docs parser
class DOCXParser(ParserStrategy):
    def read(self, file_path):
        doc_file = docx.Document(file_path)
        text = ""
        for para in doc_file.paragraphs:
            text += para.text
        return text


# Reading as dictionary and returning string format
class JSONParser(ParserStrategy):
    def read(self, file_path):
        with open(file_path, "r") as f:
            data = json.load(f)
            text = str(data)
        return text


class XMLParser(ParserStrategy):
    def read(self, file_path):
        with open(file_path, "r") as f:
            soup = BeautifulSoup(f, "xml")
            text = soup.get_text()
        return text


# Reading as dictionary and returning string format
class YAMLParser(ParserStrategy):
    def read(self, file_path):
        with open(file_path, "r") as f:
            data = yaml.load(f, Loader=yaml.FullLoader)
            text = str(data)
        return text


class HTMLParser(ParserStrategy):
    def read(self, file_path):
        with open(file_path, "r") as f:
            soup = BeautifulSoup(f, "html.parser")
            text = soup.get_text()
        return text


class MarkdownParser(ParserStrategy):
    def read(self, file_path):
        with open(file_path, "r") as f:
            html = markdown.markdown(f.read())
            text = "".join(BeautifulSoup(html, "html.parser").findAll(string=True))
        return text


class LaTeXParser(ParserStrategy):
    def read(self, file_path):
        with open(file_path, "r") as f:
            latex = f.read()
        text = LatexNodes2Text().latex_to_text(latex)
        return text


class FileContext:
    def __init__(self, parser: ParserStrategy, logger: logs.Logger):
        self.parser = parser
        self.logger = logger

    def set_parser(self, parser: ParserStrategy):
        self.logger.debug(f"Setting Context Parser to {parser}")
        self.parser = parser

    def read_file(self, file_path):
        self.logger.debug(f"Reading file {file_path} with parser {self.parser}")
        return self.parser.read(file_path)


extension_to_parser = {
    ".txt": TXTParser(),
    ".csv": TXTParser(),
    ".pdf": PDFParser(),
    ".docx": DOCXParser(),
    ".json": JSONParser(),
    ".xml": XMLParser(),
    ".yaml": YAMLParser(),
    ".yml": YAMLParser(),
    ".html": HTMLParser(),
    ".htm": HTMLParser(),
    ".xhtml": HTMLParser(),
    ".md": MarkdownParser(),
    ".markdown": MarkdownParser(),
    ".tex": LaTeXParser(),
}


def is_file_binary_fn(file_path: str):
    """Given a file path load all its content and checks if the null bytes is present

    Args:
        file_path (_type_): _description_

    Returns:
        bool: is_binary
    """
    with open(file_path, "rb") as f:
        file_data = f.read()
    if b"\x00" in file_data:
        return True
    return False


def read_textual_file(file_path: str, logger: logs.Logger):
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"{file_path} not found!")
    is_binary = is_file_binary_fn(file_path)
    file_extension = os.path.splitext(file_path)[1].lower()
    parser = extension_to_parser.get(file_extension)
    if not parser:
        if is_binary:
            raise ValueError(
                "Unsupported binary file format: {}".format(file_extension)
            )
        # fallback to txt file parser (to support script and code files loading)
        parser = TXTParser()
    file_context = FileContext(parser, logger)
    return file_context.read_file(file_path)
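
A minimal usage sketch of the module above (not part of the diff; the file names are hypothetical and must exist on disk, otherwise read_textual_file raises FileNotFoundError):

from autogpt.commands.file_operations_utils import read_textual_file
from autogpt.logs import logger

print(read_textual_file("notes.md", logger))    # dispatched to MarkdownParser
print(read_textual_file("script.py", logger))   # no ".py" entry, falls back to TXTParser
try:
    read_textual_file("archive.zip", logger)    # unknown extension on a binary file
except ValueError as err:
    print(err)                                  # Unsupported binary file format: .zip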

View File

@@ -5,6 +5,10 @@ openai==0.27.2
 playsound==1.2.2
 python-dotenv==1.0.0
 pyyaml==6.0
+PyPDF2
+python-docx
+markdown
+pylatexenc
 readability-lxml==0.8.1
 requests
 tiktoken==0.3.3
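
A quick sanity check that the newly added, unpinned dependencies resolve (a sketch; note that the python-docx package is imported as docx):

# Import check for the packages added above
import PyPDF2
import docx      # provided by the python-docx package
import markdown
from pylatexenc.latex2text import LatexNodes2Text

print(LatexNodes2Text().latex_to_text(r"\textbf{Hello}"))  # -> Hello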

View File

@@ -442,4 +442,505 @@ interactions:
status:
code: 200
message: OK
- request:
body: '{"model": "gpt-3.5-turbo", "messages": [{"role": "system", "content": "You
are write_to_file-GPT, an AI designed to use the write_to_file command to write
''Hello World'' into a file named \"hello_world.txt\" and then use the task_complete
command to complete the task.\nYour decisions must always be made independently
without seeking user assistance. Play to your strengths as an LLM and pursue
simple strategies with no legal complications.\n\nGOALS:\n\n1. Use the write_to_file
command to write ''Hello World'' into a file named \"hello_world.txt\".\n2.
Use the task_complete command to complete the task.\n3. Do not use any other
commands.\n\n\nConstraints:\n1. ~4000 word limit for short term memory. Your
short term memory is short, so immediately save important information to files.\n2.
If you are unsure how you previously did something or want to recall past events,
thinking about similar events will help you remember.\n3. No user assistance\n4.
Exclusively use the commands listed below e.g. command_name\n\nCommands:\n1.
append_to_file: Append to file, args: \"filename\": \"<filename>\", \"text\":
\"<text>\"\n2. delete_file: Delete file, args: \"filename\": \"<filename>\"\n3.
list_files: List Files in Directory, args: \"directory\": \"<directory>\"\n4.
read_file: Read text from file, args: \"filename\": \"<filename>\"\n5. read_raw_file:
Read file, args: \"filename\": \"<filename>\"\n6. write_to_file: Write to file,
args: \"filename\": \"<filename>\", \"text\": \"<text>\"\n7. delete_agent: Delete
GPT Agent, args: \"key\": \"<key>\"\n8. get_hyperlinks: Get text summary, args:
\"url\": \"<url>\"\n9. get_text_summary: Get text summary, args: \"url\": \"<url>\",
\"question\": \"<question>\"\n10. list_agents: List GPT Agents, args: () ->
str\n11. message_agent: Message GPT Agent, args: \"key\": \"<key>\", \"message\":
\"<message>\"\n12. start_agent: Start GPT Agent, args: \"name\": \"<name>\",
\"task\": \"<short_task_desc>\", \"prompt\": \"<prompt>\"\n13. task_complete:
Task Complete (Shutdown), args: \"reason\": \"<reason>\"\n\nResources:\n1. Internet
access for searches and information gathering.\n2. Long Term memory management.\n3.
GPT-3.5 powered Agents for delegation of simple tasks.\n4. File output.\n\nPerformance
Evaluation:\n1. Continuously review and analyze your actions to ensure you are
performing to the best of your abilities.\n2. Constructively self-criticize
your big-picture behavior constantly.\n3. Reflect on past decisions and strategies
to refine your approach.\n4. Every command has a cost, so be smart and efficient.
Aim to complete tasks in the least number of steps.\n5. Write all code to a
file.\n\nYou should only respond in JSON format as described below \nResponse
Format: \n{\n \"thoughts\": {\n \"text\": \"thought\",\n \"reasoning\":
\"reasoning\",\n \"plan\": \"- short bulleted\\n- list that conveys\\n-
long-term plan\",\n \"criticism\": \"constructive self-criticism\",\n \"speak\":
\"thoughts summary to say to user\"\n },\n \"command\": {\n \"name\":
\"command name\",\n \"args\": {\n \"arg name\": \"value\"\n }\n }\n}
\nEnsure the response can be parsed by Python json.loads"}, {"role": "system",
"content": "The current time and date is Tue Jan 1 00:00:00 2000"}, {"role":
"user", "content": "Determine which next command to use, and respond using the
format specified above:"}], "temperature": 0, "max_tokens": 0}'
headers:
Accept:
- '*/*'
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
Content-Length:
- '3467'
Content-Type:
- application/json
method: POST
uri: https://api.openai.com/v1/chat/completions
response:
body:
string: !!binary |
H4sIAAAAAAAAA7STz27TQBDG7zzFaC65OFFC2iT1ExCEWqAFhDCytuuJvcTeMbvjJijyuyP/SdIY
9QScVppv5pvf7M4e0CQYos6U6KLMx8v1/e2T0p/f36YfVh/5/m579fbu3fRhtf6qPAbIjz9IS18x
0VyUOYlhiwFqR0oowXC2WF0t5vP5chZgwQnlGGJayng+uR5L5R55PJ1PZxhg5VVKGB6wdFyUEgtv
yXoMl8tFgGfvU/z17DpAYVH5KXRzM6sD1BkbTR7DbwcsyB9tHeeEISrvjRdlpYFkK2SbAQ6RBQCI
UDKu0kx8hCH0wV6gvTTBCNewM3kOlSeQjGDnjFAsHG9MTqC5KJRNQLgTYPSG8pzhC7s8GYGxwqCg
TbWqoARGWaPHu0afyF5GkwiD540dKc/W2LTr/pARiPJbcPSzMo48FPQX3QJoaV+cw/hWLNgLqLJ0
XDqjhGDDDiRrVOW3Q+QyV7ajHcOn/3ZNkT27NxRxvyQX7qfYMWvIqp0Ro40vjo9ridpKsr5yTZ0S
WEOmnjoPzc6RljNbf4FKWrkNGw/9/oOxF1WJaQ52v4YYviS1fXm/GqB/t19d4zo4Ln1v+sfONw4d
08XrDdCVS4ffpROa3LPFgOPCZPjFns10pG2Je/DI1lgHuDHW+CzuPgiG6IVLDNDYhPYYTuvv9avf
AAAA//8DANPpphvSBAAA
headers:
CF-Cache-Status:
- DYNAMIC
CF-RAY:
- 7ca920092efb0c34-DFW
Cache-Control:
- no-cache, must-revalidate
Connection:
- keep-alive
Content-Encoding:
- gzip
Content-Type:
- application/json
Date:
- Sun, 21 May 2023 01:43:08 GMT
Server:
- cloudflare
access-control-allow-origin:
- '*'
alt-svc:
- h3=":443"; ma=86400, h3-29=":443"; ma=86400
openai-model:
- gpt-3.5-turbo-0301
openai-organization:
- significant-gravitas
openai-processing-ms:
- '17065'
openai-version:
- '2020-10-01'
strict-transport-security:
- max-age=15724800; includeSubDomains
x-ratelimit-limit-requests:
- '3500'
x-ratelimit-limit-tokens:
- '90000'
x-ratelimit-remaining-requests:
- '3499'
x-ratelimit-remaining-tokens:
- '86497'
x-ratelimit-reset-requests:
- 17ms
x-ratelimit-reset-tokens:
- 2.335s
x-request-id:
- 90577bd1f9895f3d042b1a9287fa5f73
status:
code: 200
message: OK
- request:
body: '{"model": "gpt-3.5-turbo", "messages": [{"role": "system", "content": "You
are write_to_file-GPT, an AI designed to use the write_to_file command to write
''Hello World'' into a file named \"hello_world.txt\" and then use the task_complete
command to complete the task.\nYour decisions must always be made independently
without seeking user assistance. Play to your strengths as an LLM and pursue
simple strategies with no legal complications.\n\nGOALS:\n\n1. Use the write_to_file
command to write ''Hello World'' into a file named \"hello_world.txt\".\n2.
Use the task_complete command to complete the task.\n3. Do not use any other
commands.\n\n\nConstraints:\n1. ~4000 word limit for short term memory. Your
short term memory is short, so immediately save important information to files.\n2.
If you are unsure how you previously did something or want to recall past events,
thinking about similar events will help you remember.\n3. No user assistance\n4.
Exclusively use the commands listed below e.g. command_name\n\nCommands:\n1.
append_to_file: Append to file, args: \"filename\": \"<filename>\", \"text\":
\"<text>\"\n2. delete_file: Delete file, args: \"filename\": \"<filename>\"\n3.
list_files: List Files in Directory, args: \"directory\": \"<directory>\"\n4.
read_file: Read text from file, args: \"filename\": \"<filename>\"\n5. read_raw_file:
Read file, args: \"filename\": \"<filename>\"\n6. write_to_file: Write to file,
args: \"filename\": \"<filename>\", \"text\": \"<text>\"\n7. delete_agent: Delete
GPT Agent, args: \"key\": \"<key>\"\n8. get_hyperlinks: Get text summary, args:
\"url\": \"<url>\"\n9. get_text_summary: Get text summary, args: \"url\": \"<url>\",
\"question\": \"<question>\"\n10. list_agents: List GPT Agents, args: () ->
str\n11. message_agent: Message GPT Agent, args: \"key\": \"<key>\", \"message\":
\"<message>\"\n12. start_agent: Start GPT Agent, args: \"name\": \"<name>\",
\"task\": \"<short_task_desc>\", \"prompt\": \"<prompt>\"\n13. task_complete:
Task Complete (Shutdown), args: \"reason\": \"<reason>\"\n\nResources:\n1. Internet
access for searches and information gathering.\n2. Long Term memory management.\n3.
GPT-3.5 powered Agents for delegation of simple tasks.\n4. File output.\n\nPerformance
Evaluation:\n1. Continuously review and analyze your actions to ensure you are
performing to the best of your abilities.\n2. Constructively self-criticize
your big-picture behavior constantly.\n3. Reflect on past decisions and strategies
to refine your approach.\n4. Every command has a cost, so be smart and efficient.
Aim to complete tasks in the least number of steps.\n5. Write all code to a
file.\n\nYou should only respond in JSON format as described below \nResponse
Format: \n{\n \"thoughts\": {\n \"text\": \"thought\",\n \"reasoning\":
\"reasoning\",\n \"plan\": \"- short bulleted\\n- list that conveys\\n-
long-term plan\",\n \"criticism\": \"constructive self-criticism\",\n \"speak\":
\"thoughts summary to say to user\"\n },\n \"command\": {\n \"name\":
\"command name\",\n \"args\": {\n \"arg name\": \"value\"\n }\n }\n}
\nEnsure the response can be parsed by Python json.loads"}, {"role": "system",
"content": "The current time and date is Tue Jan 1 00:00:00 2000"}, {"role":
"system", "content": "This reminds you of these events from your past: \nI was
created and nothing new has happened."}, {"role": "user", "content": "Determine
which next command to use, and respond using the format specified above:"},
{"role": "assistant", "content": "{\n \"thoughts\": {\n \"text\":
\"I will use the write_to_file command to write ''Hello World'' into a file
named ''hello_world.txt''.\",\n \"reasoning\": \"The task requires me
to write ''Hello World'' into a file named ''hello_world.txt'', and the write_to_file
command is the most appropriate for this task.\",\n \"plan\": \"- Use
the write_to_file command to write ''Hello World'' into a file named ''hello_world.txt''.\\n-
Use the task_complete command to complete the task.\",\n \"criticism\":
\"I need to ensure that I have the correct file name and that the file is created
in the correct directory.\",\n \"speak\": \"I will use the write to file
command to write ''Hello World'' into a file named ''hello_world.txt''.\"\n },\n \"command\":
{\n \"name\": \"write_to_file\",\n \"args\": {\n \"filename\":
\"hello_world.txt\",\n \"text\": \"Hello World\"\n }\n }\n}"},
{"role": "system", "content": "Command write_to_file returned: File written
to successfully."}, {"role": "user", "content": "Determine which next command
to use, and respond using the format specified above:"}], "temperature": 0,
"max_tokens": 0}'
headers:
Accept:
- '*/*'
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
Content-Length:
- '4790'
Content-Type:
- application/json
method: POST
uri: https://api.openai.com/v1/chat/completions
response:
body:
string: !!binary |
H4sIAAAAAAAAA6ySTW/TQBCG7/yK0Vxy2UQJIWnwEYlCqkpFUIRUjKLtehIvWe+Y3TFJFfm/I8fO
R4OQkOA6H+88M/Pu0GaYoMm1mKJ0/av5p7vr+7fTh+ub4c3dx3e3byrzYaZ5ph+eblEhP34nI13H
wHBROhLLHhWaQFoow2Q0nb2ajsfj1yOFBWfkMMFVKf3xYNKXKjxyfzgejlBhFfWKMNlhGbgoZSG8
Jh8xGQ0nM4Un8VNicqVQWLQ7hV6OJrVCk7M1FDH5usOC4kE4sCNMUMdoo2gvDSZ7Id+ssEs9AECK
knO1yiWmmEAX7BK0lSaY4hw21jmoIoHkBKLjetEBEhguCu0zEIZj7FA1SFGdSwbSkb31q1b3viuD
QD8qGyhCQY3OJlgh6L0n5xi+cHBZD6wXBg1L6wi8LiiDXt7kF5smP5Ct9BQ0HHPI9U+CWBlDMS4r
556OYBlIbmOLBvc5BVpyIAVzMNr/h/1Kp327Wh8+/7OaCVassbE4fMET7TvJxyo0fVr2ze1ZYcmh
PefJPGAj6LIMXAarhS4nxJL0+o8/hr+lbiVrdbBUV/6bo5q/tdOeHeUCSofVpRnP3dNZ55wve/bt
I9AeqmNLfY21wqX1NuaLVgkTjMIlKrQ+oy0mw/pb/eIXAAAA//8DAKgaLA4VBAAA
headers:
CF-Cache-Status:
- DYNAMIC
CF-RAY:
- 7ca920871f130c34-DFW
Cache-Control:
- no-cache, must-revalidate
Connection:
- keep-alive
Content-Encoding:
- gzip
Content-Type:
- application/json
Date:
- Sun, 21 May 2023 01:43:24 GMT
Server:
- cloudflare
access-control-allow-origin:
- '*'
alt-svc:
- h3=":443"; ma=86400, h3-29=":443"; ma=86400
openai-model:
- gpt-3.5-turbo-0301
openai-organization:
- significant-gravitas
openai-processing-ms:
- '13071'
openai-version:
- '2020-10-01'
strict-transport-security:
- max-age=15724800; includeSubDomains
x-ratelimit-limit-requests:
- '3500'
x-ratelimit-limit-tokens:
- '90000'
x-ratelimit-remaining-requests:
- '3499'
x-ratelimit-remaining-tokens:
- '86478'
x-ratelimit-reset-requests:
- 17ms
x-ratelimit-reset-tokens:
- 2.348s
x-request-id:
- 137423c120d49f49824a2fbbd7cbb558
status:
code: 200
message: OK
- request:
body: '{"model": "gpt-3.5-turbo", "messages": [{"role": "system", "content": "You
are write_to_file-GPT, an AI designed to use the write_to_file command to write
''Hello World'' into a file named \"hello_world.txt\" and then use the task_complete
command to complete the task.\nYour decisions must always be made independently
without seeking user assistance. Play to your strengths as an LLM and pursue
simple strategies with no legal complications.\n\nGOALS:\n\n1. Use the write_to_file
command to write ''Hello World'' into a file named \"hello_world.txt\".\n2.
Use the task_complete command to complete the task.\n3. Do not use any other
commands.\n\n\nConstraints:\n1. ~4000 word limit for short term memory. Your
short term memory is short, so immediately save important information to files.\n2.
If you are unsure how you previously did something or want to recall past events,
thinking about similar events will help you remember.\n3. No user assistance\n4.
Exclusively use the commands listed below e.g. command_name\n\nCommands:\n1.
append_to_file: Append to file, args: \"filename\": \"<filename>\", \"text\":
\"<text>\"\n2. delete_file: Delete file, args: \"filename\": \"<filename>\"\n3.
list_files: List Files in Directory, args: \"directory\": \"<directory>\"\n4.
read_file: Read a file, args: \"filename\": \"<filename>\"\n5. write_to_file:
Write to file, args: \"filename\": \"<filename>\", \"text\": \"<text>\"\n6.
delete_agent: Delete GPT Agent, args: \"key\": \"<key>\"\n7. get_hyperlinks:
Get text summary, args: \"url\": \"<url>\"\n8. get_text_summary: Get text summary,
args: \"url\": \"<url>\", \"question\": \"<question>\"\n9. list_agents: List
GPT Agents, args: () -> str\n10. message_agent: Message GPT Agent, args: \"key\":
\"<key>\", \"message\": \"<message>\"\n11. start_agent: Start GPT Agent, args:
\"name\": \"<name>\", \"task\": \"<short_task_desc>\", \"prompt\": \"<prompt>\"\n12.
task_complete: Task Complete (Shutdown), args: \"reason\": \"<reason>\"\n\nResources:\n1.
Internet access for searches and information gathering.\n2. Long Term memory
management.\n3. GPT-3.5 powered Agents for delegation of simple tasks.\n4. File
output.\n\nPerformance Evaluation:\n1. Continuously review and analyze your
actions to ensure you are performing to the best of your abilities.\n2. Constructively
self-criticize your big-picture behavior constantly.\n3. Reflect on past decisions
and strategies to refine your approach.\n4. Every command has a cost, so be
smart and efficient. Aim to complete tasks in the least number of steps.\n5.
Write all code to a file.\n\nYou should only respond in JSON format as described
below \nResponse Format: \n{\n \"thoughts\": {\n \"text\": \"thought\",\n \"reasoning\":
\"reasoning\",\n \"plan\": \"- short bulleted\\n- list that conveys\\n-
long-term plan\",\n \"criticism\": \"constructive self-criticism\",\n \"speak\":
\"thoughts summary to say to user\"\n },\n \"command\": {\n \"name\":
\"command name\",\n \"args\": {\n \"arg name\": \"value\"\n }\n }\n}
\nEnsure the response can be parsed by Python json.loads"}, {"role": "system",
"content": "The current time and date is Tue Jan 1 00:00:00 2000"}, {"role":
"user", "content": "Determine which next command to use, and respond using the
format specified above:"}], "temperature": 0, "max_tokens": 0}'
headers:
Accept:
- '*/*'
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
Content-Length:
- '3393'
Content-Type:
- application/json
method: POST
uri: https://api.openai.com/v1/chat/completions
response:
body:
string: !!binary |
H4sIAAAAAAAAA7yTT2/TQBDF73yK0Vxy2URJ0/zBRyoBEQL1UIQKRtHWnsRLdnfM7phEivzdK9tJ
2hpxQnCdN37vN7PjI5ocE8wKLZkr7XCxuru9f1PZ6+qde6tvPn74db843Mw+XX3d21tUyA8/KJPT
F6OMXWlJDHtUmAXSQjkmk/nyej5dTCdzhY5zspjgtpThdDQbShUeeDiejieosIp6S5gcsQzsSlkL
78hHTBazpcIn70v9ajxWKCzaXkqvZ8taYVawyShi8u2IjuLZNrAlTFDHaKJoLw0keyHfDHBMPQBA
ilJwtS0kppjAqXgS6CBNMcUVeKIchKGKBFIQ7IMRWguvN8YSZOyc9m1DK8DgPVnL8IWDzQdgvDBo
aFu9dpTDoGj09b7RR3KQwShF9Tw7kI7sjd92AHcFgei4g0A/KxMogqO/SFPQ0v5xDhNb0XEU0GUZ
uAxGC8GGA0jRqDru+sil1b6jHcLn/7SmLBgxmYmu/07kYxUaBi2wurxaxiFQJm1Ek9CtgQ7yhCTc
djYd/bBYkt6dg/bG2n95DV1wrc5XejL97Ugbh47pBUQPXYdt/7474byJzqLH8cKk/088m+lM2xKf
wFNfY61wY7yJxbo7Z0wwCpeo0PicDpiM6+/1q0cAAAD//wMAx8X8+IMEAAA=
headers:
CF-Cache-Status:
- DYNAMIC
CF-RAY:
- 7ca9805e4d7328e0-DFW
Cache-Control:
- no-cache, must-revalidate
Connection:
- keep-alive
Content-Encoding:
- gzip
Content-Type:
- application/json
Date:
- Sun, 21 May 2023 02:48:51 GMT
Server:
- cloudflare
access-control-allow-origin:
- '*'
alt-svc:
- h3=":443"; ma=86400, h3-29=":443"; ma=86400
openai-model:
- gpt-3.5-turbo-0301
openai-organization:
- significant-gravitas
openai-processing-ms:
- '14375'
openai-version:
- '2020-10-01'
strict-transport-security:
- max-age=15724800; includeSubDomains
x-ratelimit-limit-requests:
- '3500'
x-ratelimit-limit-tokens:
- '90000'
x-ratelimit-remaining-requests:
- '3499'
x-ratelimit-remaining-tokens:
- '86497'
x-ratelimit-reset-requests:
- 17ms
x-ratelimit-reset-tokens:
- 2.335s
x-request-id:
- dd442d3790e304394237402085fd3bc5
status:
code: 200
message: OK
- request:
body: '{"model": "gpt-3.5-turbo", "messages": [{"role": "system", "content": "You
are write_to_file-GPT, an AI designed to use the write_to_file command to write
''Hello World'' into a file named \"hello_world.txt\" and then use the task_complete
command to complete the task.\nYour decisions must always be made independently
without seeking user assistance. Play to your strengths as an LLM and pursue
simple strategies with no legal complications.\n\nGOALS:\n\n1. Use the write_to_file
command to write ''Hello World'' into a file named \"hello_world.txt\".\n2.
Use the task_complete command to complete the task.\n3. Do not use any other
commands.\n\n\nConstraints:\n1. ~4000 word limit for short term memory. Your
short term memory is short, so immediately save important information to files.\n2.
If you are unsure how you previously did something or want to recall past events,
thinking about similar events will help you remember.\n3. No user assistance\n4.
Exclusively use the commands listed below e.g. command_name\n\nCommands:\n1.
append_to_file: Append to file, args: \"filename\": \"<filename>\", \"text\":
\"<text>\"\n2. delete_file: Delete file, args: \"filename\": \"<filename>\"\n3.
list_files: List Files in Directory, args: \"directory\": \"<directory>\"\n4.
read_file: Read a file, args: \"filename\": \"<filename>\"\n5. write_to_file:
Write to file, args: \"filename\": \"<filename>\", \"text\": \"<text>\"\n6.
delete_agent: Delete GPT Agent, args: \"key\": \"<key>\"\n7. get_hyperlinks:
Get text summary, args: \"url\": \"<url>\"\n8. get_text_summary: Get text summary,
args: \"url\": \"<url>\", \"question\": \"<question>\"\n9. list_agents: List
GPT Agents, args: () -> str\n10. message_agent: Message GPT Agent, args: \"key\":
\"<key>\", \"message\": \"<message>\"\n11. start_agent: Start GPT Agent, args:
\"name\": \"<name>\", \"task\": \"<short_task_desc>\", \"prompt\": \"<prompt>\"\n12.
task_complete: Task Complete (Shutdown), args: \"reason\": \"<reason>\"\n\nResources:\n1.
Internet access for searches and information gathering.\n2. Long Term memory
management.\n3. GPT-3.5 powered Agents for delegation of simple tasks.\n4. File
output.\n\nPerformance Evaluation:\n1. Continuously review and analyze your
actions to ensure you are performing to the best of your abilities.\n2. Constructively
self-criticize your big-picture behavior constantly.\n3. Reflect on past decisions
and strategies to refine your approach.\n4. Every command has a cost, so be
smart and efficient. Aim to complete tasks in the least number of steps.\n5.
Write all code to a file.\n\nYou should only respond in JSON format as described
below \nResponse Format: \n{\n \"thoughts\": {\n \"text\": \"thought\",\n \"reasoning\":
\"reasoning\",\n \"plan\": \"- short bulleted\\n- list that conveys\\n-
long-term plan\",\n \"criticism\": \"constructive self-criticism\",\n \"speak\":
\"thoughts summary to say to user\"\n },\n \"command\": {\n \"name\":
\"command name\",\n \"args\": {\n \"arg name\": \"value\"\n }\n }\n}
\nEnsure the response can be parsed by Python json.loads"}, {"role": "system",
"content": "The current time and date is Tue Jan 1 00:00:00 2000"}, {"role":
"system", "content": "This reminds you of these events from your past: \nI was
created and nothing new has happened."}, {"role": "user", "content": "Determine
which next command to use, and respond using the format specified above:"},
{"role": "assistant", "content": "{\n \"thoughts\": {\n \"text\":
\"I need to use the write_to_file command to write ''Hello World'' into a file
named ''hello_world.txt''.\",\n \"reasoning\": \"The task requires me
to write ''Hello World'' into a file named ''hello_world.txt'', and the write_to_file
command is the most appropriate for this task.\",\n \"plan\": \"- Use
the write_to_file command to write ''Hello World'' into a file named ''hello_world.txt''.\",\n \"criticism\":
\"I need to ensure that I use the correct filename and text to write to the
file.\",\n \"speak\": \"I will use the write_to_file command to write
''Hello World'' into a file named ''hello_world.txt''.\"\n },\n \"command\":
{\n \"name\": \"write_to_file\",\n \"args\": {\n \"filename\":
\"hello_world.txt\",\n \"text\": \"Hello World\"\n }\n }\n}"},
{"role": "system", "content": "Command write_to_file returned: File written
to successfully."}, {"role": "user", "content": "Determine which next command
to use, and respond using the format specified above:"}], "temperature": 0,
"max_tokens": 0}'
headers:
Accept:
- '*/*'
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
Content-Length:
- '4637'
Content-Type:
- application/json
method: POST
uri: https://api.openai.com/v1/chat/completions
response:
body:
string: !!binary |
H4sIAAAAAAAAA7RSwY4SQRC9+xWVunAZCOzILpmbFyOHTUwWo0YM6e0pZlp6utruGsAQ/t00zMi6
sheN13qvXr3X/Q5oSixQ10p04+3wbr547x+mn+/X929pnHPc3dWfHt5ow7Npgxny4zfS0m2MNDfe
khh2mKEOpIRKLCa3s9e3+V2e5xk2XJLFAisvw3w0HUobHnk4zscTzLCNqiIsDugDN15WwhtyEYvJ
+Gaa4UX8AszyDIVF2cvoZjw7ZqhrNpoiFl8O2FDshQNbwgJVjCaKcpJsshNyKcJh6QAAlig1t1Ut
cYkFdMMOoL2k4RLnUKstQWy1phjXrbU/YBeMCDkYvCNrGT5ysOUAjBMGBWtjCZxqqIRBnfDVLuEj
2ctgBIuawNFeIAp5MBGEoY0EUhOIiptVF55Ac9MoVybCr1nPGi0xe2o3kIrsjKvOnhcdDQJ9b02g
CA1d1QG1FgqnPMZV1/IkZkqUwcnLiy5Tkpqg4SigvA/sg1FCsOYAUif0im1vlTs7HsKHf34EnVJo
E5v+4xzRaZNcbEPaUwJz8IG3piRQsFXWlHB+u5PRvniuevFI9KQ2/YGdsfbvfu8secz6Inb0P3qY
enS+9pv8M1MqVM8r/LQX/6PIfYRTjC7N0h3xmOHaOBPr1fk2FhiFPWZoXEl7LMbHr8dXPwEAAP//
AwCEJsUGfQQAAA==
headers:
CF-Cache-Status:
- DYNAMIC
CF-RAY:
- 7ca980c86e2128e0-DFW
Cache-Control:
- no-cache, must-revalidate
Connection:
- keep-alive
Content-Encoding:
- gzip
Content-Type:
- application/json
Date:
- Sun, 21 May 2023 02:49:08 GMT
Server:
- cloudflare
access-control-allow-origin:
- '*'
alt-svc:
- h3=":443"; ma=86400, h3-29=":443"; ma=86400
openai-model:
- gpt-3.5-turbo-0301
openai-organization:
- significant-gravitas
openai-processing-ms:
- '14391'
openai-version:
- '2020-10-01'
strict-transport-security:
- max-age=15724800; includeSubDomains
x-ratelimit-limit-requests:
- '3500'
x-ratelimit-limit-tokens:
- '90000'
x-ratelimit-remaining-requests:
- '3499'
x-ratelimit-remaining-tokens:
- '86481'
x-ratelimit-reset-requests:
- 17ms
x-ratelimit-reset-tokens:
- 2.346s
x-request-id:
- 7efa7a4b949a3c79b750abf348c98545
status:
code: 200
message: OK
version: 1

View File

@@ -0,0 +1,152 @@
import json
import tempfile
from functools import partial
from unittest import TestCase
from xml.etree import ElementTree

import docx
import yaml
from bs4 import BeautifulSoup
from PyPDF2 import PdfWriter

from autogpt.commands.file_operations_utils import is_file_binary_fn, read_textual_file
from autogpt.logs import logger

plain_text_str = "Hello, world!"


def mock_text_file():
    with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as f:
        f.write(plain_text_str)
    return f.name


def mock_csv_file():
    with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".csv") as f:
        f.write(plain_text_str)
    return f.name


def mock_pdf_file():
    with tempfile.NamedTemporaryFile(mode="wb", delete=False, suffix=".pdf") as f:
        # Create a new PDF and add a page with the text plain_text_str
        # Write the PDF header
        f.write(b"%PDF-1.7\n")
        # Write the document catalog
        f.write(b"1 0 obj\n")
        f.write(b"<< /Type /Catalog /Pages 2 0 R >>\n")
        f.write(b"endobj\n")
        # Write the page object
        f.write(b"2 0 obj\n")
        f.write(
            b"<< /Type /Page /Parent 1 0 R /Resources << /Font << /F1 3 0 R >> >> /MediaBox [0 0 612 792] /Contents 4 0 R >>\n"
        )
        f.write(b"endobj\n")
        # Write the font object
        f.write(b"3 0 obj\n")
        f.write(
            b"<< /Type /Font /Subtype /Type1 /Name /F1 /BaseFont /Helvetica-Bold >>\n"
        )
        f.write(b"endobj\n")
        # Write the page contents object
        f.write(b"4 0 obj\n")
        f.write(b"<< /Length 25 >>\n")
        f.write(b"stream\n")
        f.write(b"BT\n/F1 12 Tf\n72 720 Td\n(Hello, world!) Tj\nET\n")
        f.write(b"endstream\n")
        f.write(b"endobj\n")
        # Write the cross-reference table
        f.write(b"xref\n")
        f.write(b"0 5\n")
        f.write(b"0000000000 65535 f \n")
        f.write(b"0000000017 00000 n \n")
        f.write(b"0000000073 00000 n \n")
        f.write(b"0000000123 00000 n \n")
        f.write(b"0000000271 00000 n \n")
        f.write(b"trailer\n")
        f.write(b"<< /Size 5 /Root 1 0 R >>\n")
        f.write(b"startxref\n")
        f.write(b"380\n")
        f.write(b"%%EOF\n")
        f.write(b"\x00")
    return f.name


def mock_docx_file():
    with tempfile.NamedTemporaryFile(mode="wb", delete=False, suffix=".docx") as f:
        document = docx.Document()
        document.add_paragraph(plain_text_str)
        document.save(f.name)
    return f.name


def mock_json_file():
    with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".json") as f:
        json.dump({"text": plain_text_str}, f)
    return f.name


def mock_xml_file():
    root = ElementTree.Element("text")
    root.text = plain_text_str
    tree = ElementTree.ElementTree(root)
    with tempfile.NamedTemporaryFile(mode="wb", delete=False, suffix=".xml") as f:
        tree.write(f)
    return f.name


def mock_yaml_file():
    with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".yaml") as f:
        yaml.dump({"text": plain_text_str}, f)
    return f.name


def mock_html_file():
    html = BeautifulSoup(
        f"<html><head><title>This is a test</title></head><body><p>{plain_text_str}</p></body></html>",
        "html.parser",
    )
    with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".html") as f:
        f.write(str(html))
    return f.name


def mock_md_file():
    with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".md") as f:
        f.write(f"# {plain_text_str}!\n")
    return f.name


def mock_latex_file():
    with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".tex") as f:
        latex_str = rf"\documentclass{{article}}\begin{{document}}{plain_text_str}\end{{document}}"
        f.write(latex_str)
    return f.name


respective_file_creation_functions = {
    ".txt": mock_text_file,
    ".csv": mock_csv_file,
    ".pdf": mock_pdf_file,
    ".docx": mock_docx_file,
    ".json": mock_json_file,
    ".xml": mock_xml_file,
    ".yaml": mock_yaml_file,
    ".html": mock_html_file,
    ".md": mock_md_file,
    ".tex": mock_latex_file,
}


class TestConfig(TestCase):
    def test_parsers(self):
        binary_files_extensions = [".pdf", ".docx"]
        for (
            file_extension,
            c_file_creator,
        ) in respective_file_creation_functions.items():
            created_filepath = c_file_creator()
            loaded_text = read_textual_file(created_filepath, logger)

            self.assertIn(plain_text_str, loaded_text)

            should_be_binary = file_extension in binary_files_extensions
            self.assertEqual(should_be_binary, is_file_binary_fn(created_filepath))
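
To run these tests locally (a sketch; the module path is an assumption, adjust it to wherever this test file lives in the repository):

# From the repository root, assuming the file is at tests/unit/test_text_file_parsers.py:
#   python -m pytest tests/unit/test_text_file_parsers.py -v
# or, since the class is a plain unittest.TestCase:
#   python -m unittest tests.unit.test_text_file_parsers -v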