mirror of
https://github.com/aljazceru/gpt-engineer.git
synced 2025-12-17 12:45:26 +01:00
Many small improvements, new benchmark results!
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -44,6 +44,7 @@ archive
|
|||||||
# any log file
|
# any log file
|
||||||
*log.txt
|
*log.txt
|
||||||
todo
|
todo
|
||||||
|
scratchpad
|
||||||
|
|
||||||
# Ignore GPT Engineer files
|
# Ignore GPT Engineer files
|
||||||
projects
|
projects
|
||||||
|
|||||||
@@ -4,6 +4,36 @@
|
|||||||
$ python scripts/benchmark.py
|
$ python scripts/benchmark.py
|
||||||
```
|
```
|
||||||
|
|
||||||
|
# 2023-06-21
|
||||||
|
|
||||||
|
| Benchmark | Ran | Works | Perfect |
|
||||||
|
|--------------------|-----|-------|---------|
|
||||||
|
| currency_converter | ✅ | ❌ | ❌ |
|
||||||
|
| image_resizer | ✅ | ✅ | ✅ |
|
||||||
|
| pomodoro_timer | ✅ | ✅ | ✅ |
|
||||||
|
| url_shortener | ✅ | ✅ | ✅ |
|
||||||
|
| file_explorer | ✅ | ✅ | ✅ |
|
||||||
|
| markdown_editor | ✅ | ✅ | ❌ |
|
||||||
|
| timer_app | ✅ | ❌ | ❌ |
|
||||||
|
| weather_app | ✅ | ✅ | ✅ |
|
||||||
|
| file_organizer | ✅ | ✅ | ✅ |
|
||||||
|
| password_generator | ✅ | ✅ | ✅ |
|
||||||
|
| todo_list | ✅ | ✅ | ✅ |
|
||||||
|
|
||||||
|
|
||||||
|
# Notes on the errors
|
||||||
|
Most errors come from the "generate entrypoint" step being incorrect. Ignoring
|
||||||
|
those, we get 8/11 fully correct.
|
||||||
|
|
||||||
|
All errors are very easy to fix.
|
||||||
|
|
||||||
|
One error was trying to modify a constant.
|
||||||
|
One error was that the html template was not fully filled in.
|
||||||
|
One error was that a dependency was used incorrectly; this is easy to fix.
|
||||||
|
|
||||||
|
|
||||||
|
# 2023-06-19
|
||||||
|
|
||||||
| Benchmark | Ran | Works | Perfect |
|
| Benchmark | Ran | Works | Perfect |
|
||||||
|--------------------|-----|-------|---------|
|
|--------------------|-----|-------|---------|
|
||||||
| currency_converter | ❌ | ❌ | ❌ |
|
| currency_converter | ❌ | ❌ | ❌ |
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ from pathlib import Path
|
|||||||
|
|
||||||
import typer
|
import typer
|
||||||
|
|
||||||
|
from gpt_engineer import steps
|
||||||
from gpt_engineer.ai import AI
|
from gpt_engineer.ai import AI
|
||||||
from gpt_engineer.db import DB, DBs
|
from gpt_engineer.db import DB, DBs
|
||||||
from gpt_engineer.steps import STEPS
|
from gpt_engineer.steps import STEPS
|
||||||
@@ -20,7 +21,9 @@ def main(
|
|||||||
delete_existing: bool = typer.Argument(False, help="delete existing files"),
|
delete_existing: bool = typer.Argument(False, help="delete existing files"),
|
||||||
model: str = "gpt-4",
|
model: str = "gpt-4",
|
||||||
temperature: float = 0.1,
|
temperature: float = 0.1,
|
||||||
steps_config: str = "default",
|
steps_config: steps.Config = typer.Option(
|
||||||
|
steps.Config.DEFAULT, "--steps", "-s", help="decide which steps to run"
|
||||||
|
),
|
||||||
verbose: bool = typer.Option(False, "--verbose", "-v"),
|
verbose: bool = typer.Option(False, "--verbose", "-v"),
|
||||||
run_prefix: str = typer.Option(
|
run_prefix: str = typer.Option(
|
||||||
"",
|
"",
|
||||||
|
|||||||
@@ -2,6 +2,8 @@ import json
|
|||||||
import re
|
import re
|
||||||
import subprocess
|
import subprocess
|
||||||
|
|
||||||
|
from enum import Enum
|
||||||
|
|
||||||
from gpt_engineer.ai import AI
|
from gpt_engineer.ai import AI
|
||||||
from gpt_engineer.chat_to_files import to_files
|
from gpt_engineer.chat_to_files import to_files
|
||||||
from gpt_engineer.db import DBs
|
from gpt_engineer.db import DBs
|
||||||
@@ -34,10 +36,10 @@ def clarify(ai: AI, dbs: DBs):
|
|||||||
break
|
break
|
||||||
|
|
||||||
print()
|
print()
|
||||||
user = input('(answer in text, or "q" to move on)\n')
|
user = input('(answer in text, or "c" to move on)\n')
|
||||||
print()
|
print()
|
||||||
|
|
||||||
if not user or user == "q":
|
if not user or user == "c":
|
||||||
break
|
break
|
||||||
|
|
||||||
user += (
|
user += (
|
||||||
@@ -145,7 +147,7 @@ def execute_entrypoint(ai, dbs):
|
|||||||
print()
|
print()
|
||||||
print('If yes, press enter. Otherwise, type "no"')
|
print('If yes, press enter. Otherwise, type "no"')
|
||||||
print()
|
print()
|
||||||
if input() != "":
|
if input() not in ["", "y", "yes"]:
|
||||||
print("Ok, not executing the code.")
|
print("Ok, not executing the code.")
|
||||||
return []
|
return []
|
||||||
print("Executing the code...")
|
print("Executing the code...")
|
||||||
@@ -183,7 +185,7 @@ def use_feedback(ai: AI, dbs: DBs):
|
|||||||
ai.fassistant(dbs.workspace["all_output.txt"]),
|
ai.fassistant(dbs.workspace["all_output.txt"]),
|
||||||
ai.fsystem(dbs.identity["use_feedback"]),
|
ai.fsystem(dbs.identity["use_feedback"]),
|
||||||
]
|
]
|
||||||
messages = ai.next(messages, dbs.memory["feedback"])
|
messages = ai.next(messages, dbs.input["feedback"])
|
||||||
to_files(messages[-1]["content"], dbs.workspace)
|
to_files(messages[-1]["content"], dbs.workspace)
|
||||||
return messages
|
return messages
|
||||||
|
|
||||||
@@ -201,13 +203,36 @@ def fix_code(ai: AI, dbs: DBs):
|
|||||||
return messages
|
return messages
|
||||||
|
|
||||||
|
|
||||||
|
class Config(str, Enum):
|
||||||
|
DEFAULT = "default"
|
||||||
|
BENCHMARK = "benchmark"
|
||||||
|
SIMPLE = "simple"
|
||||||
|
TDD = "tdd"
|
||||||
|
TDD_PLUS = "tdd+"
|
||||||
|
CLARIFY = "clarify"
|
||||||
|
RESPEC = "respec"
|
||||||
|
EXECUTE_ONLY = "execute_only"
|
||||||
|
USE_FEEDBACK = "use_feedback"
|
||||||
|
|
||||||
|
|
||||||
# Different configs of what steps to run
|
# Different configs of what steps to run
|
||||||
STEPS = {
|
STEPS = {
|
||||||
"default": [simple_gen, gen_entrypoint, execute_entrypoint],
|
Config.DEFAULT: [
|
||||||
"benchmark": [simple_gen, gen_entrypoint],
|
clarify,
|
||||||
"simple": [simple_gen, gen_entrypoint, execute_entrypoint],
|
gen_clarified_code,
|
||||||
"tdd": [gen_spec, gen_unit_tests, gen_code, gen_entrypoint, execute_entrypoint],
|
gen_entrypoint,
|
||||||
"tdd+": [
|
execute_entrypoint,
|
||||||
|
],
|
||||||
|
Config.BENCHMARK: [simple_gen, gen_entrypoint],
|
||||||
|
Config.SIMPLE: [simple_gen, gen_entrypoint, execute_entrypoint],
|
||||||
|
Config.TDD: [
|
||||||
|
gen_spec,
|
||||||
|
gen_unit_tests,
|
||||||
|
gen_code,
|
||||||
|
gen_entrypoint,
|
||||||
|
execute_entrypoint,
|
||||||
|
],
|
||||||
|
Config.TDD_PLUS: [
|
||||||
gen_spec,
|
gen_spec,
|
||||||
gen_unit_tests,
|
gen_unit_tests,
|
||||||
gen_code,
|
gen_code,
|
||||||
@@ -215,8 +240,13 @@ STEPS = {
|
|||||||
gen_entrypoint,
|
gen_entrypoint,
|
||||||
execute_entrypoint,
|
execute_entrypoint,
|
||||||
],
|
],
|
||||||
"clarify": [clarify, gen_clarified_code, gen_entrypoint, execute_entrypoint],
|
Config.CLARIFY: [
|
||||||
"respec": [
|
clarify,
|
||||||
|
gen_clarified_code,
|
||||||
|
gen_entrypoint,
|
||||||
|
execute_entrypoint,
|
||||||
|
],
|
||||||
|
Config.RESPEC: [
|
||||||
gen_spec,
|
gen_spec,
|
||||||
respec,
|
respec,
|
||||||
gen_unit_tests,
|
gen_unit_tests,
|
||||||
@@ -224,12 +254,9 @@ STEPS = {
|
|||||||
gen_entrypoint,
|
gen_entrypoint,
|
||||||
execute_entrypoint,
|
execute_entrypoint,
|
||||||
],
|
],
|
||||||
"execute_only": [execute_entrypoint],
|
Config.USE_FEEDBACK: [use_feedback, gen_entrypoint, execute_entrypoint],
|
||||||
"use_feedback": [use_feedback],
|
Config.EXECUTE_ONLY: [gen_entrypoint, execute_entrypoint],
|
||||||
}
|
}
|
||||||
|
|
||||||
# Future steps that can be added:
|
# Future steps that can be added:
|
||||||
# self_reflect_and_improve_files,
|
# run_tests_and_fix_files
|
||||||
# add_tests
|
|
||||||
# run_tests_and_fix_files,
|
|
||||||
# improve_based_on_in_file_feedback_comments
|
|
||||||
|
|||||||
@@ -35,7 +35,7 @@ def main(
|
|||||||
"-m",
|
"-m",
|
||||||
"gpt_engineer.main",
|
"gpt_engineer.main",
|
||||||
bench_folder,
|
bench_folder,
|
||||||
"--steps-config",
|
"--steps",
|
||||||
"benchmark",
|
"benchmark",
|
||||||
],
|
],
|
||||||
stdout=log_file,
|
stdout=log_file,
|
||||||
@@ -66,7 +66,7 @@ def main(
|
|||||||
"-m",
|
"-m",
|
||||||
"gpt_engineer.main",
|
"gpt_engineer.main",
|
||||||
bench_folder,
|
bench_folder,
|
||||||
"--steps-config",
|
"--steps",
|
||||||
"execute_only",
|
"execute_only",
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -16,6 +16,11 @@ def pretty_print_conversation(messages):
|
|||||||
}
|
}
|
||||||
formatted_messages = []
|
formatted_messages = []
|
||||||
for message in messages:
|
for message in messages:
|
||||||
|
if message["role"] == "function":
|
||||||
|
formatted_messages.append(
|
||||||
|
f"function ({message['name']}): {message['content']}\n"
|
||||||
|
)
|
||||||
|
else:
|
||||||
assistant_content = (
|
assistant_content = (
|
||||||
message["function_call"]
|
message["function_call"]
|
||||||
if message.get("function_call")
|
if message.get("function_call")
|
||||||
@@ -25,9 +30,7 @@ def pretty_print_conversation(messages):
|
|||||||
"system": f"system: {message['content']}\n",
|
"system": f"system: {message['content']}\n",
|
||||||
"user": f"user: {message['content']}\n",
|
"user": f"user: {message['content']}\n",
|
||||||
"assistant": f"assistant: {assistant_content}\n",
|
"assistant": f"assistant: {assistant_content}\n",
|
||||||
"function": f"function ({message['name']}): {message['content']}\n",
|
|
||||||
}
|
}
|
||||||
|
|
||||||
formatted_messages.append(role_to_message[message["role"]])
|
formatted_messages.append(role_to_message[message["role"]])
|
||||||
|
|
||||||
for formatted_message in formatted_messages:
|
for formatted_message in formatted_messages:
|
||||||
|
|||||||
Reference in New Issue
Block a user