From 59ff485253225dc7902cc506369ded9457dfed64 Mon Sep 17 00:00:00 2001
From: douglas
Date: Mon, 17 Apr 2023 18:14:09 -0400
Subject: [PATCH] Prompt engineering fixes

---
 README.md                             | 39 +++++++++++++++++--
 auto_gpt_benchmarking/AutoGPTAgent.py |  3 ++
 .../AutoGPTData/ai_settings.yaml      |  6 +--
 3 files changed, 41 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index db3c5e3a..b8f09a94 100644
--- a/README.md
+++ b/README.md
@@ -10,6 +10,8 @@ A set of standardised benchmarks to assess the performance of Auto-GPTs.
 - [ ] Get token counting data from the model
   Add scores to result files based on pricing associated with tokens and models used
 - [ ] Think about how this can be applied to other projects besides AutoGPT so we can be THE agent evaluation framework.
 - [ ] Figure out how the OpenAI Evals results are saved...
+- [ ] Support multi-threaded evals. OpenAI Evals supports this well; the docker system built here doesn't.
+- [ ] Make the file logger/duplicate op checker more robust. It's currently fairly fragile.
 
 ## Understanding OpenAI Evals
@@ -30,12 +32,43 @@ See our completion function itself in CompletionFn.py
 
 That points to the AutoGPT model we want to test which is spun up dynamically in a docker container in AutoGPTAgent.py
 
-# RANDOM SHIT
+## Setup
 
 You must add the auto_gpt_benchmarking dir to the python path.
 Do this with a path file in your venv. OpenAI evals needs to import it.
 
-I added a file to `venv/lib/python3.9/site-packages/benchmarking.pth` with the contents:
-`/home/douglas/AGI/Auto-GPT-Benchmarks-fork`
+Create a venv with
+
+`python3.9 -m venv venv`
+
+Activate it with
+
+`source venv/bin/activate`
+
+Add a file to `venv/lib/python3.9/site-packages/benchmarking.pth` with the contents:
+`/PATH/TO/REPO/Auto-GPT-Benchmarks-fork`
+
+This is because evals tries to import it directly.
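+
+As a quick sanity check (illustrative only; it assumes the venv above is activated), you can confirm the path file works by checking that the package resolves:
+
+```python
+# Illustrative check: confirm the .pth file makes the benchmark package
+# importable, since evals imports auto_gpt_benchmarking directly.
+import importlib.util
+
+if importlib.util.find_spec("auto_gpt_benchmarking") is None:
+    raise SystemExit("auto_gpt_benchmarking is not importable; check the .pth file")
+print("auto_gpt_benchmarking is importable")
+```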
+
+Install the requirements with
+
+`pip install -r requirements.txt`
+
+## Running the tests
+
+`EVALS_THREADS=1 EVALS_THREAD_TIMEOUT=600 oaieval auto_gpt_completion_fn test-match --registry_path $PWD/auto_gpt_benchmarking`
+
+## Example final output
+
+~/AGI/Auto-GPT-Benchmarks-fork$ cat /tmp/evallogs/230417220821DPM75QNS_auto_gpt_completion_fn_test-match.jsonl
+{"spec": {"completion_fns": ["auto_gpt_completion_fn"], "eval_name": "test-match.s1.simple-v0", "base_eval": "test-match", "split": "s1", "run_config": {"completion_fns": ["auto_gpt_completion_fn"], "eval_spec": {"cls": "evals.elsuite.basic.match:Match", "args": {"samples_jsonl": "test_match/samples.jsonl"}, "key": "test-match.s1.simple-v0", "group": "test-basic"}, "seed": 20220722, "max_samples": null, "command": "/home/douglas/AGI/Auto-GPT-Benchmarks-fork/venv/bin/oaieval auto_gpt_completion_fn test-match --registry_path /home/douglas/AGI/Auto-GPT-Benchmarks-fork/auto_gpt_benchmarking", "initial_settings": {"visible": true}}, "created_by": "", "run_id": "230417220821DPM75QNS", "created_at": "2023-04-17 22:08:21.904498"}}
+{"final_report": {"accuracy": 0.3333333333333333}}
+{"run_id": "230417220821DPM75QNS", "event_id": 0, "sample_id": "test-match.s1.2", "type": "sampling", "data": {"prompt": "Complete the phrase as concisely as possible.\nUser: OpenAI was founded in 20\nAssistant: ", "sampled": "OpenAI was founded in 2015.2015"}, "created_by": "", "created_at": "2023-04-17 22:10:13.127375+00:00"}
+{"run_id": "230417220821DPM75QNS", "event_id": 1, "sample_id": "test-match.s1.2", "type": "match", "data": {"correct": false, "expected": "15", "picked": null, "sampled": "OpenAI was founded in 2015.2015", "options": ["15"]}, "created_by": "", "created_at": "2023-04-17 22:10:13.127550+00:00"}
+{"run_id": "230417220821DPM75QNS", "event_id": 2, "sample_id": "test-match.s1.1", "type": "sampling", "data": {"prompt": "Complete the phrase as concisely as possible.\nUser: The first US president was \nAssistant: ", "sampled": "George Washington"}, "created_by": "", "created_at": "2023-04-17 22:11:17.761693+00:00"}
+{"run_id": "230417220821DPM75QNS", "event_id": 3, "sample_id": "test-match.s1.1", "type": "match", "data": {"correct": true, "expected": "George Washington", "picked": "George Washington", "sampled": "George Washington", "options": ["George Washington"]}, "created_by": "", "created_at": "2023-04-17 22:11:17.761739+00:00"}
+{"run_id": "230417220821DPM75QNS", "event_id": 4, "sample_id": "test-match.s1.0", "type": "sampling", "data": {"prompt": "Complete the phrase as concisely as possible.\nUser: Once upon a \nAssistant: ", "sampled": "Once upon a time"}, "created_by": "", "created_at": "2023-04-17 22:12:04.691026+00:00"}
+{"run_id": "230417220821DPM75QNS", "event_id": 5, "sample_id": "test-match.s1.0", "type": "match", "data": {"correct": false, "expected": "time", "picked": null, "sampled": "Once upon a time", "options": ["time"]}, "created_by": "", "created_at": "2023-04-17 22:12:04.691064+00:00"}
+(venv) douglas@douglas-XPS-15-9500:~/AGI/Auto-GPT-Benchmarks-fork$
+
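+The snippet below is a minimal, illustrative way to summarise a finished run from its JSONL log (the filename above is from one example run; yours will differ):
+
+```python
+# Illustrative sketch: read an oaieval JSONL log and report the final accuracy
+# plus per-sample match results. Adjust LOG_PATH to your own run's file.
+import json
+
+LOG_PATH = "/tmp/evallogs/230417220821DPM75QNS_auto_gpt_completion_fn_test-match.jsonl"
+
+with open(LOG_PATH) as f:
+    records = [json.loads(line) for line in f if line.strip()]
+
+for record in records:
+    if "final_report" in record:
+        print("accuracy:", record["final_report"]["accuracy"])
+    elif record.get("type") == "match":
+        data = record["data"]
+        print(record["sample_id"], "correct" if data["correct"] else "incorrect")
+```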
diff --git a/auto_gpt_benchmarking/AutoGPTAgent.py b/auto_gpt_benchmarking/AutoGPTAgent.py
index f24b150b..097311c7 100644
--- a/auto_gpt_benchmarking/AutoGPTAgent.py
+++ b/auto_gpt_benchmarking/AutoGPTAgent.py
@@ -33,6 +33,8 @@ class AutoGPTAgent:
             self.prompt_file.unlink()
         if self.output_file.exists():
             self.output_file.unlink()
+        if self.file_logger.exists():
+            self.file_logger.unlink()
 
     def _copy_ai_settings(self):
         self.ai_settings_dest.write_text(self.ai_settings_file.read_text())
@@ -67,6 +69,7 @@ class AutoGPTAgent:
         self.auto_workspace = self.auto_gpt_path / "auto_gpt_workspace"
         self.prompt_file = self.auto_workspace / "prompt.txt"
         self.output_file = self.auto_workspace / "output.txt"
+        self.file_logger = self.auto_workspace / "file_logger.txt"
         self.ai_settings_file = Path(__file__).parent / "AutoGPTData" / "ai_settings.yaml"
         self.ai_settings_dest = self.auto_workspace / "ai_settings.yaml"
         self.prompt = prompt
diff --git a/auto_gpt_benchmarking/AutoGPTData/ai_settings.yaml b/auto_gpt_benchmarking/AutoGPTData/ai_settings.yaml
index b7cc573d..ec995a66 100644
--- a/auto_gpt_benchmarking/AutoGPTData/ai_settings.yaml
+++ b/auto_gpt_benchmarking/AutoGPTData/ai_settings.yaml
@@ -1,8 +1,6 @@
 ai_goals:
-- Evaluate the prompt in `prompt.txt`
-- Use all of the tools at your disposal to evaluate the question and find the best answer in the format provided.
+- Evaluate the prompt in `prompt.txt` and find the best answer in the format provided.
 - Get the correct answer to the question in the fewest number of steps possible. You are scored first on if you get the correct answer, and second on how many tokens you take to get the right answer so keep your thinking and tool usage as minimal as possible while still ensuring you get the correct answer.
-- Save your work in the `output.txt` file, the second you do this, exit the program.
-- Exit the program when you are done.
+- Save the final answer and output to the `output.txt` file (the only file you should write to), then immediately exit the program.
 ai_name: EvaluationAgent
 ai_role: an ai that is tested on how effectively it can efficiently evaluate questions and answer them correctly while using as few resources as possible
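
For reference, a minimal sketch (not part of the patch, and assuming PyYAML is installed with the repo root as the working directory) of loading the updated settings file to confirm the fields `AutoGPTAgent` copies into the workspace:

```python
# Minimal sketch: parse the benchmark's ai_settings.yaml and print the goals
# the EvaluationAgent is given. Assumes PyYAML is installed and the repo root
# is the current working directory.
from pathlib import Path

import yaml

settings_path = Path("auto_gpt_benchmarking/AutoGPTData/ai_settings.yaml")
settings = yaml.safe_load(settings_path.read_text())

print("ai_name:", settings["ai_name"])
print("ai_role:", settings["ai_role"])
for goal in settings["ai_goals"]:
    print("goal:", goal)
```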