Many small improvements, new benchmark results!

This commit is contained in:
Anton Osika
2023-06-21 00:37:18 +02:00
parent 4c68126295
commit 16dd96f50d
6 changed files with 97 additions and 33 deletions

1
.gitignore vendored
View File

@@ -44,6 +44,7 @@ archive
# any log file # any log file
*log.txt *log.txt
todo todo
scratchpad
# Ignore GPT Engineer files # Ignore GPT Engineer files
projects projects

View File

@@ -4,6 +4,36 @@
$ python scripts/benchmark.py $ python scripts/benchmark.py
``` ```
# 2023-06-21
| Benchmark | Ran | Works | Perfect |
|--------------------|-----|-------|---------|
| currency_converter | ✅ | ❌ | ❌ |
| image_resizer | ✅ | ✅ | ✅ |
| pomodoro_timer | ✅ | ✅ | ✅ |
| url_shortener | ✅ | ✅ | ✅ |
| file_explorer | ✅ | ✅ | ✅ |
| markdown_editor | ✅ | ✅ | ❌ |
| timer_app | ✅ | ❌ | ❌ |
| weather_app | ✅ | ✅ | ✅ |
| file_organizer | ✅ | ✅ | ✅ |
| password_generator | ✅ | ✅ | ✅ |
| todo_list | ✅ | ✅ | ✅ |
# Notes on the errors
Most errors come from the "generate entrypoint" step being incorrect. Ignoring
those, we get 8/11 fully correct.
All errors are very easy to fix.
One error was trying to modify a constant.
One error was that the html template was not fully filled in.
One error was that a dependency was used incorrectly; it is easy to fix.
# 2023-06-19
| Benchmark | Ran | Works | Perfect | | Benchmark | Ran | Works | Perfect |
|--------------------|-----|-------|---------| |--------------------|-----|-------|---------|
| currency_converter | ❌ | ❌ | ❌ | | currency_converter | ❌ | ❌ | ❌ |

View File

@@ -7,6 +7,7 @@ from pathlib import Path
import typer import typer
from gpt_engineer import steps
from gpt_engineer.ai import AI from gpt_engineer.ai import AI
from gpt_engineer.db import DB, DBs from gpt_engineer.db import DB, DBs
from gpt_engineer.steps import STEPS from gpt_engineer.steps import STEPS
@@ -20,7 +21,9 @@ def main(
delete_existing: bool = typer.Argument(False, help="delete existing files"), delete_existing: bool = typer.Argument(False, help="delete existing files"),
model: str = "gpt-4", model: str = "gpt-4",
temperature: float = 0.1, temperature: float = 0.1,
steps_config: str = "default", steps_config: steps.Config = typer.Option(
steps.Config.DEFAULT, "--steps", "-s", help="decide which steps to run"
),
verbose: bool = typer.Option(False, "--verbose", "-v"), verbose: bool = typer.Option(False, "--verbose", "-v"),
run_prefix: str = typer.Option( run_prefix: str = typer.Option(
"", "",

View File

@@ -2,6 +2,8 @@ import json
import re import re
import subprocess import subprocess
from enum import Enum
from gpt_engineer.ai import AI from gpt_engineer.ai import AI
from gpt_engineer.chat_to_files import to_files from gpt_engineer.chat_to_files import to_files
from gpt_engineer.db import DBs from gpt_engineer.db import DBs
@@ -34,10 +36,10 @@ def clarify(ai: AI, dbs: DBs):
break break
print() print()
user = input('(answer in text, or "q" to move on)\n') user = input('(answer in text, or "c" to move on)\n')
print() print()
if not user or user == "q": if not user or user == "c":
break break
user += ( user += (
@@ -145,7 +147,7 @@ def execute_entrypoint(ai, dbs):
print() print()
print('If yes, press enter. Otherwise, type "no"') print('If yes, press enter. Otherwise, type "no"')
print() print()
if input() != "": if input() not in ["", "y", "yes"]:
print("Ok, not executing the code.") print("Ok, not executing the code.")
return [] return []
print("Executing the code...") print("Executing the code...")
@@ -183,7 +185,7 @@ def use_feedback(ai: AI, dbs: DBs):
ai.fassistant(dbs.workspace["all_output.txt"]), ai.fassistant(dbs.workspace["all_output.txt"]),
ai.fsystem(dbs.identity["use_feedback"]), ai.fsystem(dbs.identity["use_feedback"]),
] ]
messages = ai.next(messages, dbs.memory["feedback"]) messages = ai.next(messages, dbs.input["feedback"])
to_files(messages[-1]["content"], dbs.workspace) to_files(messages[-1]["content"], dbs.workspace)
return messages return messages
@@ -201,13 +203,36 @@ def fix_code(ai: AI, dbs: DBs):
return messages return messages
# Names of the selectable step configurations.
# Config derives from both `str` and `Enum`, so each member is also a plain
# string equal to its value. The members serve as the keys of the STEPS
# mapping, and the command-line entry point uses Config as the type of its
# `--steps` / `-s` option (default: Config.DEFAULT).
class Config(str, Enum):
DEFAULT = "default"
# "benchmark" is the value the benchmark runner passes via `--steps`.
BENCHMARK = "benchmark"
SIMPLE = "simple"
TDD = "tdd"
# Note the value is "tdd+", which differs from the member name TDD_PLUS.
TDD_PLUS = "tdd+"
CLARIFY = "clarify"
RESPEC = "respec"
# "execute_only" is also passed via `--steps` by the benchmark runner.
EXECUTE_ONLY = "execute_only"
USE_FEEDBACK = "use_feedback"
# Different configs of what steps to run # Different configs of what steps to run
STEPS = { STEPS = {
"default": [simple_gen, gen_entrypoint, execute_entrypoint], Config.DEFAULT: [
"benchmark": [simple_gen, gen_entrypoint], clarify,
"simple": [simple_gen, gen_entrypoint, execute_entrypoint], gen_clarified_code,
"tdd": [gen_spec, gen_unit_tests, gen_code, gen_entrypoint, execute_entrypoint], gen_entrypoint,
"tdd+": [ execute_entrypoint,
],
Config.BENCHMARK: [simple_gen, gen_entrypoint],
Config.SIMPLE: [simple_gen, gen_entrypoint, execute_entrypoint],
Config.TDD: [
gen_spec,
gen_unit_tests,
gen_code,
gen_entrypoint,
execute_entrypoint,
],
Config.TDD_PLUS: [
gen_spec, gen_spec,
gen_unit_tests, gen_unit_tests,
gen_code, gen_code,
@@ -215,8 +240,13 @@ STEPS = {
gen_entrypoint, gen_entrypoint,
execute_entrypoint, execute_entrypoint,
], ],
"clarify": [clarify, gen_clarified_code, gen_entrypoint, execute_entrypoint], Config.CLARIFY: [
"respec": [ clarify,
gen_clarified_code,
gen_entrypoint,
execute_entrypoint,
],
Config.RESPEC: [
gen_spec, gen_spec,
respec, respec,
gen_unit_tests, gen_unit_tests,
@@ -224,12 +254,9 @@ STEPS = {
gen_entrypoint, gen_entrypoint,
execute_entrypoint, execute_entrypoint,
], ],
"execute_only": [execute_entrypoint], Config.USE_FEEDBACK: [use_feedback, gen_entrypoint, execute_entrypoint],
"use_feedback": [use_feedback], Config.EXECUTE_ONLY: [gen_entrypoint, execute_entrypoint],
} }
# Future steps that can be added: # Future steps that can be added:
# self_reflect_and_improve_files, # run_tests_and_fix_files
# add_tests
# run_tests_and_fix_files,
# improve_based_on_in_file_feedback_comments

View File

@@ -35,7 +35,7 @@ def main(
"-m", "-m",
"gpt_engineer.main", "gpt_engineer.main",
bench_folder, bench_folder,
"--steps-config", "--steps",
"benchmark", "benchmark",
], ],
stdout=log_file, stdout=log_file,
@@ -66,7 +66,7 @@ def main(
"-m", "-m",
"gpt_engineer.main", "gpt_engineer.main",
bench_folder, bench_folder,
"--steps-config", "--steps",
"execute_only", "execute_only",
], ],
) )

View File

@@ -16,6 +16,11 @@ def pretty_print_conversation(messages):
} }
formatted_messages = [] formatted_messages = []
for message in messages: for message in messages:
if message["role"] == "function":
formatted_messages.append(
f"function ({message['name']}): {message['content']}\n"
)
else:
assistant_content = ( assistant_content = (
message["function_call"] message["function_call"]
if message.get("function_call") if message.get("function_call")
@@ -25,9 +30,7 @@ def pretty_print_conversation(messages):
"system": f"system: {message['content']}\n", "system": f"system: {message['content']}\n",
"user": f"user: {message['content']}\n", "user": f"user: {message['content']}\n",
"assistant": f"assistant: {assistant_content}\n", "assistant": f"assistant: {assistant_content}\n",
"function": f"function ({message['name']}): {message['content']}\n",
} }
formatted_messages.append(role_to_message[message["role"]]) formatted_messages.append(role_to_message[message["role"]])
for formatted_message in formatted_messages: for formatted_message in formatted_messages: