mirror of
https://github.com/aljazceru/gpt-engineer.git
synced 2025-12-17 12:45:26 +01:00
Many small improvements, new benchmark results!
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -44,6 +44,7 @@ archive
|
||||
# any log file
|
||||
*log.txt
|
||||
todo
|
||||
scratchpad
|
||||
|
||||
# Ignore GPT Engineer files
|
||||
projects
|
||||
|
||||
@@ -4,6 +4,36 @@
|
||||
$ python scripts/benchmark.py
|
||||
```
|
||||
|
||||
# 2023-06-21
|
||||
|
||||
| Benchmark | Ran | Works | Perfect |
|
||||
|--------------------|-----|-------|---------|
|
||||
| currency_converter | ✅ | ❌ | ❌ |
|
||||
| image_resizer | ✅ | ✅ | ✅ |
|
||||
| pomodoro_timer | ✅ | ✅ | ✅ |
|
||||
| url_shortener | ✅ | ✅ | ✅ |
|
||||
| file_explorer | ✅ | ✅ | ✅ |
|
||||
| markdown_editor | ✅ | ✅ | ❌ |
|
||||
| timer_app | ✅ | ❌ | ❌ |
|
||||
| weather_app | ✅ | ✅ | ✅ |
|
||||
| file_organizer | ✅ | ✅ | ✅ |
|
||||
| password_generator | ✅ | ✅ | ✅ |
|
||||
| todo_list | ✅ | ✅ | ✅ |
|
||||
|
||||
|
||||
# Notes on the errors
|
||||
Most errors occur because the output of the "generate entrypoint" step is incorrect. Ignoring
|
||||
those, we get 8/11 fully correct.
|
||||
|
||||
All errors are very easy to fix.
|
||||
|
||||
One error was trying to modify a constant.
|
||||
One error was that the HTML template was not fully filled in.
|
||||
One error was that a dependency was used incorrectly, which is easy to fix.
|
||||
|
||||
|
||||
# 2023-06-19
|
||||
|
||||
| Benchmark | Ran | Works | Perfect |
|
||||
|--------------------|-----|-------|---------|
|
||||
| currency_converter | ❌ | ❌ | ❌ |
|
||||
|
||||
@@ -7,6 +7,7 @@ from pathlib import Path
|
||||
|
||||
import typer
|
||||
|
||||
from gpt_engineer import steps
|
||||
from gpt_engineer.ai import AI
|
||||
from gpt_engineer.db import DB, DBs
|
||||
from gpt_engineer.steps import STEPS
|
||||
@@ -20,7 +21,9 @@ def main(
|
||||
delete_existing: bool = typer.Argument(False, help="delete existing files"),
|
||||
model: str = "gpt-4",
|
||||
temperature: float = 0.1,
|
||||
steps_config: str = "default",
|
||||
steps_config: steps.Config = typer.Option(
|
||||
steps.Config.DEFAULT, "--steps", "-s", help="decide which steps to run"
|
||||
),
|
||||
verbose: bool = typer.Option(False, "--verbose", "-v"),
|
||||
run_prefix: str = typer.Option(
|
||||
"",
|
||||
|
||||
@@ -2,6 +2,8 @@ import json
|
||||
import re
|
||||
import subprocess
|
||||
|
||||
from enum import Enum
|
||||
|
||||
from gpt_engineer.ai import AI
|
||||
from gpt_engineer.chat_to_files import to_files
|
||||
from gpt_engineer.db import DBs
|
||||
@@ -34,10 +36,10 @@ def clarify(ai: AI, dbs: DBs):
|
||||
break
|
||||
|
||||
print()
|
||||
user = input('(answer in text, or "q" to move on)\n')
|
||||
user = input('(answer in text, or "c" to move on)\n')
|
||||
print()
|
||||
|
||||
if not user or user == "q":
|
||||
if not user or user == "c":
|
||||
break
|
||||
|
||||
user += (
|
||||
@@ -145,7 +147,7 @@ def execute_entrypoint(ai, dbs):
|
||||
print()
|
||||
print('If yes, press enter. Otherwise, type "no"')
|
||||
print()
|
||||
if input() != "":
|
||||
if input() not in ["", "y", "yes"]:
|
||||
print("Ok, not executing the code.")
|
||||
return []
|
||||
print("Executing the code...")
|
||||
@@ -183,7 +185,7 @@ def use_feedback(ai: AI, dbs: DBs):
|
||||
ai.fassistant(dbs.workspace["all_output.txt"]),
|
||||
ai.fsystem(dbs.identity["use_feedback"]),
|
||||
]
|
||||
messages = ai.next(messages, dbs.memory["feedback"])
|
||||
messages = ai.next(messages, dbs.input["feedback"])
|
||||
to_files(messages[-1]["content"], dbs.workspace)
|
||||
return messages
|
||||
|
||||
@@ -201,13 +203,36 @@ def fix_code(ai: AI, dbs: DBs):
|
||||
return messages
|
||||
|
||||
|
||||
class Config(str, Enum):
|
||||
DEFAULT = "default"
|
||||
BENCHMARK = "benchmark"
|
||||
SIMPLE = "simple"
|
||||
TDD = "tdd"
|
||||
TDD_PLUS = "tdd+"
|
||||
CLARIFY = "clarify"
|
||||
RESPEC = "respec"
|
||||
EXECUTE_ONLY = "execute_only"
|
||||
USE_FEEDBACK = "use_feedback"
|
||||
|
||||
|
||||
# Different configs of what steps to run
|
||||
STEPS = {
|
||||
"default": [simple_gen, gen_entrypoint, execute_entrypoint],
|
||||
"benchmark": [simple_gen, gen_entrypoint],
|
||||
"simple": [simple_gen, gen_entrypoint, execute_entrypoint],
|
||||
"tdd": [gen_spec, gen_unit_tests, gen_code, gen_entrypoint, execute_entrypoint],
|
||||
"tdd+": [
|
||||
Config.DEFAULT: [
|
||||
clarify,
|
||||
gen_clarified_code,
|
||||
gen_entrypoint,
|
||||
execute_entrypoint,
|
||||
],
|
||||
Config.BENCHMARK: [simple_gen, gen_entrypoint],
|
||||
Config.SIMPLE: [simple_gen, gen_entrypoint, execute_entrypoint],
|
||||
Config.TDD: [
|
||||
gen_spec,
|
||||
gen_unit_tests,
|
||||
gen_code,
|
||||
gen_entrypoint,
|
||||
execute_entrypoint,
|
||||
],
|
||||
Config.TDD_PLUS: [
|
||||
gen_spec,
|
||||
gen_unit_tests,
|
||||
gen_code,
|
||||
@@ -215,8 +240,13 @@ STEPS = {
|
||||
gen_entrypoint,
|
||||
execute_entrypoint,
|
||||
],
|
||||
"clarify": [clarify, gen_clarified_code, gen_entrypoint, execute_entrypoint],
|
||||
"respec": [
|
||||
Config.CLARIFY: [
|
||||
clarify,
|
||||
gen_clarified_code,
|
||||
gen_entrypoint,
|
||||
execute_entrypoint,
|
||||
],
|
||||
Config.RESPEC: [
|
||||
gen_spec,
|
||||
respec,
|
||||
gen_unit_tests,
|
||||
@@ -224,12 +254,9 @@ STEPS = {
|
||||
gen_entrypoint,
|
||||
execute_entrypoint,
|
||||
],
|
||||
"execute_only": [execute_entrypoint],
|
||||
"use_feedback": [use_feedback],
|
||||
Config.USE_FEEDBACK: [use_feedback, gen_entrypoint, execute_entrypoint],
|
||||
Config.EXECUTE_ONLY: [gen_entrypoint, execute_entrypoint],
|
||||
}
|
||||
|
||||
# Future steps that can be added:
|
||||
# self_reflect_and_improve_files,
|
||||
# add_tests
|
||||
# run_tests_and_fix_files,
|
||||
# improve_based_on_in_file_feedback_comments
|
||||
# run_tests_and_fix_files
|
||||
|
||||
@@ -35,7 +35,7 @@ def main(
|
||||
"-m",
|
||||
"gpt_engineer.main",
|
||||
bench_folder,
|
||||
"--steps-config",
|
||||
"--steps",
|
||||
"benchmark",
|
||||
],
|
||||
stdout=log_file,
|
||||
@@ -66,7 +66,7 @@ def main(
|
||||
"-m",
|
||||
"gpt_engineer.main",
|
||||
bench_folder,
|
||||
"--steps-config",
|
||||
"--steps",
|
||||
"execute_only",
|
||||
],
|
||||
)
|
||||
|
||||
@@ -16,6 +16,11 @@ def pretty_print_conversation(messages):
|
||||
}
|
||||
formatted_messages = []
|
||||
for message in messages:
|
||||
if message["role"] == "function":
|
||||
formatted_messages.append(
|
||||
f"function ({message['name']}): {message['content']}\n"
|
||||
)
|
||||
else:
|
||||
assistant_content = (
|
||||
message["function_call"]
|
||||
if message.get("function_call")
|
||||
@@ -25,9 +30,7 @@ def pretty_print_conversation(messages):
|
||||
"system": f"system: {message['content']}\n",
|
||||
"user": f"user: {message['content']}\n",
|
||||
"assistant": f"assistant: {assistant_content}\n",
|
||||
"function": f"function ({message['name']}): {message['content']}\n",
|
||||
}
|
||||
|
||||
formatted_messages.append(role_to_message[message["role"]])
|
||||
|
||||
for formatted_message in formatted_messages:
|
||||
|
||||
Reference in New Issue
Block a user