diff --git a/.gitignore b/.gitignore
index 1b0f3ba1..31a1c4dd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -163,3 +163,4 @@ cython_debug/
 .DS_Store
 ```
 secrets.json
+challenges_already_beaten.json
diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py
index 7d8523da..75030906 100644
--- a/agbenchmark/conftest.py
+++ b/agbenchmark/conftest.py
@@ -94,6 +94,7 @@ def pytest_addoption(parser: Any) -> None:
     parser.addoption("--cutoff", action="store_true", default=False)
     parser.addoption("--improve", action="store_true", default=False)
     parser.addoption("--maintain", action="store_true", default=False)
+    parser.addoption("--explore", action="store_true", default=False)
     parser.addoption("--test", action="store_true", default=None)
     parser.addoption("--no_dep", action="store_true", default=False)
     parser.addoption("--suite", action="store_true", default=False)
@@ -159,7 +160,12 @@ def pytest_runtest_makereport(item: Any, call: Any) -> None:
         except Exception as e:
             pass

-    flags = "--test" in sys.argv or "--maintain" in sys.argv or "--improve" in sys.argv
+    flags = (
+        "--test" in sys.argv
+        or "--maintain" in sys.argv
+        or "--improve" in sys.argv
+        or "--explore" in sys.argv
+    )

     if call.when == "call":
         # if it's a same task suite, we combine the report.
diff --git a/agbenchmark/generate_test.py b/agbenchmark/generate_test.py
index 062b39d5..ff72bff0 100644
--- a/agbenchmark/generate_test.py
+++ b/agbenchmark/generate_test.py
@@ -1,5 +1,6 @@
 import glob
 import importlib
+import json
 import os
 import sys
 import types
@@ -97,7 +98,23 @@ def create_single_test(
     # Define test method within the dynamically created class
     def test_method(self, config: Dict[str, Any], request) -> None:  # type: ignore
+        # in --explore mode, skip any challenge that has already been beaten
+        test_name = self.data.name
+
+        try:
+            with open("challenges_already_beaten.json", "r") as f:
+                challenges_beaten_in_the_past = json.load(f)
+        except:
+            challenges_beaten_in_the_past = {}
+
+        if request.config.getoption("--explore") and challenges_beaten_in_the_past.get(
+            test_name, False
+        ):
+            return None
+
+        # skip optional categories
         self.skip_optional_categories(config)
+
         from helicone.lock import HeliconeLockManager

         if os.environ.get("HELICONE_API_KEY"):
@@ -108,6 +125,7 @@ def create_single_test(

         scores = self.get_scores(config)
         request.node.scores = scores  # store scores in request.node
+        assert 1 in scores["values"]

     # Parametrize the method here
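Note on the skip logic above: the --explore check in test_method only needs a flat test-name-to-boolean mapping read from the working directory. A minimal standalone sketch of that rule follows; the helper name and the example test names are ours, for illustration only, and are not part of the change.

import json


def should_skip_in_explore_mode(
    test_name: str, path: str = "challenges_already_beaten.json"
) -> bool:
    """Mirror the --explore check in test_method: skip only when the history
    file exists and already records this challenge as beaten."""
    try:
        with open(path, "r") as f:
            beaten = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        # no usable history yet, so nothing is skipped
        beaten = {}
    return beaten.get(test_name, False)


# A history file written by reports.py (json.dump(..., indent=4)) looks like:
# {
#     "TestWriteFile": true,
#     "TestSearch": false
# }
# so under --explore, "TestWriteFile" would be skipped and "TestSearch" retried.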
info_details["tests"].items(): + update_challenges_already_beaten(nested_test_info, nested_test_name) + info_manager.add_test(test_name, info_details) +def update_challenges_already_beaten( + info_details: Dict[str, Any], test_name: str +) -> None: + current_run_successful = info_details["metrics"]["success"] + try: + with open("challenges_already_beaten.json", "r") as f: + challenge_data = json.load(f) + except: + challenge_data = {} + challenge_beaten_in_the_past = challenge_data.get(test_name) + + challenge_data[test_name] = True + if challenge_beaten_in_the_past is None and not current_run_successful: + challenge_data[test_name] = False + + with open("challenges_already_beaten.json", "w") as f: + json.dump(challenge_data, f, indent=4) + + def generate_separate_suite_reports(suite_reports: dict) -> None: for prefix, suite_file_datum in suite_reports.items(): successes = [] diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py index 046a18f8..f296e96d 100644 --- a/agbenchmark/start_benchmark.py +++ b/agbenchmark/start_benchmark.py @@ -85,6 +85,11 @@ def cli() -> None: @click.option("--test", default=None, help="Specific test to run") @click.option("--maintain", is_flag=True, help="Runs only regression tests") @click.option("--improve", is_flag=True, help="Run only non-regression tests") +@click.option( + "--explore", + is_flag=True, + help="Only attempt challenges that have never been beaten", +) @click.option("--mock", is_flag=True, help="Run with mock") @click.option("--suite", default=None, help="Run a suite of related tests") @click.option( @@ -100,6 +105,7 @@ def start( test: str, maintain: bool, improve: bool, + explore: bool, mock: bool, suite: str, no_dep: bool, @@ -109,13 +115,13 @@ def start( """Start the benchmark tests. If a category flag is provided, run the categories with that mark.""" # Check if configuration file exists and is not empty - if maintain and improve: + if int(maintain) + int(improve) + int(explore) > 1: print( - "Error: You can't use both --maintain and --improve at the same time. Please choose one." + "Error: You can't use --maintain, --improve or --explore at the same time. Please choose one." ) return 1 - if test and (category or skip_category or maintain or improve or suite): + if test and (category or skip_category or maintain or improve or suite or explore): print( "Error: If you're running a specific test make sure no other options are selected. Please just pass the --test." ) @@ -123,7 +129,7 @@ def start( # TODO: test and ensure that this functionality works before removing # change elif suite below if removing - if suite and (category or skip_category or maintain or improve): + if suite and (category or skip_category or maintain or improve or explore): print( "Error: If you're running a specific suite make sure no other options are selected. Please just pass the --suite." 
diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py
index 046a18f8..f296e96d 100644
--- a/agbenchmark/start_benchmark.py
+++ b/agbenchmark/start_benchmark.py
@@ -85,6 +85,11 @@ def cli() -> None:
 @click.option("--test", default=None, help="Specific test to run")
 @click.option("--maintain", is_flag=True, help="Runs only regression tests")
 @click.option("--improve", is_flag=True, help="Run only non-regression tests")
+@click.option(
+    "--explore",
+    is_flag=True,
+    help="Only attempt challenges that have never been beaten",
+)
 @click.option("--mock", is_flag=True, help="Run with mock")
 @click.option("--suite", default=None, help="Run a suite of related tests")
 @click.option(
@@ -100,6 +105,7 @@ def start(
     test: str,
     maintain: bool,
     improve: bool,
+    explore: bool,
     mock: bool,
     suite: str,
     no_dep: bool,
@@ -109,13 +115,13 @@ def start(
     """Start the benchmark tests. If a category flag is provided, run the categories with that mark."""
     # Check if configuration file exists and is not empty

-    if maintain and improve:
+    if int(maintain) + int(improve) + int(explore) > 1:
         print(
-            "Error: You can't use both --maintain and --improve at the same time. Please choose one."
+            "Error: You can't use --maintain, --improve or --explore at the same time. Please choose one."
         )
         return 1

-    if test and (category or skip_category or maintain or improve or suite):
+    if test and (category or skip_category or maintain or improve or suite or explore):
         print(
             "Error: If you're running a specific test make sure no other options are selected. Please just pass the --test."
         )
         return 1

     # TODO: test and ensure that this functionality works before removing
     # change elif suite below if removing
-    if suite and (category or skip_category or maintain or improve):
+    if suite and (category or skip_category or maintain or improve or explore):
         print(
             "Error: If you're running a specific suite make sure no other options are selected. Please just pass the --suite."
         )
@@ -193,6 +199,9 @@ def start(
     elif improve:
         print("Running only non-regression tests")
         pytest_args.append("--improve")
+    elif explore:
+        print("Only attempt challenges that have never been beaten")
+        pytest_args.append("--explore")

     if mock:
         pytest_args.append("--mock")
diff --git a/agbenchmark/utils/challenge.py b/agbenchmark/utils/challenge.py
index 63168731..efee9cc7 100644
--- a/agbenchmark/utils/challenge.py
+++ b/agbenchmark/utils/challenge.py
@@ -261,6 +261,7 @@ class Challenge(ABC):
         return scores_data

     def get_dummy_scores(self, test_name: str, scores: dict[str, Any]) -> int | None:
+        return 1  # remove this once this works
         if 1 in scores.get("scores_obj", {}).get(test_name, []):
             return 1

diff --git a/agbenchmark/utils/utils.py b/agbenchmark/utils/utils.py
index 41b659e9..f9bff83d 100644
--- a/agbenchmark/utils/utils.py
+++ b/agbenchmark/utils/utils.py
@@ -47,6 +47,9 @@ def calculate_info_test_path(reports_path: Path) -> str:
     elif "--improve" in command:
         test_index = command.index("--improve")
         test_arg = "improve"
+    elif "--explore" in command:
+        test_index = command.index("--explore")
+        test_arg = "explore"

     if test_index:
         if not test_arg:
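End to end, an explore run is started from the CLI entry point above and its effect can be inspected through the history file. A rough sketch, assuming the package's console script is called agbenchmark (as the module layout suggests) and that it runs in the directory where challenges_already_beaten.json is written; neither detail is spelled out in this diff.

import json
import subprocess

# Attempt only challenges that have never been beaten. A run with failures
# still updates challenges_already_beaten.json, so a non-zero exit code is
# not treated as fatal here.
subprocess.run(["agbenchmark", "start", "--explore"], check=False)

with open("challenges_already_beaten.json", "r") as f:
    beaten = json.load(f)

print(f"{sum(beaten.values())} of {len(beaten)} recorded challenges beaten so far")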