diff --git a/agbenchmark/config.json b/agbenchmark/config.json
deleted file mode 100644
index 3a03b741..00000000
--- a/agbenchmark/config.json
+++ /dev/null
@@ -1,3 +0,0 @@
-{
-  "workspace": "${os.path.join(Path.home(), 'miniagi')}"
-}
diff --git a/agbenchmark/conftest.py b/agbenchmark/conftest.py
index f865a3d8..0a239201 100644
--- a/agbenchmark/conftest.py
+++ b/agbenchmark/conftest.py
@@ -134,9 +134,6 @@ suite_reports: dict[str, list] = {}
 def pytest_runtest_makereport(item: Any, call: Any) -> None:
     challenge_data = item.funcargs.get("challenge_data", None)
-    HeliconeLockManager.write_custom_property("challenge", challenge_data["name"])
-
-
     if not challenge_data:
         # this will only happen for dummy dependency setup tests
         return
 
@@ -158,14 +155,18 @@ def pytest_runtest_makereport(item: Any, call: Any) -> None:
     flags = "--test" in sys.argv or "--maintain" in sys.argv or "--improve" in sys.argv
 
     if call.when == "call":
+        test_name = ""
         # if it's a same task suite, we combine the report.
         # but not if it's a single --test
        if is_suite and is_suite.same_task and not flags:
+            test_name = is_suite.prefix
             generate_combined_suite_report(item, challenge_data, challenge_location)
         else:  # single non suite test
+            test_name = challenge_data["name"]
             generate_single_call_report(item, call, challenge_data)
         # else: it's a same_task=false suite (tests aren't combined)
+        HeliconeLockManager.write_custom_property("challenge", test_name)
 
     if call.when == "teardown":
         finalize_reports(item, challenge_data)
 
diff --git a/agbenchmark/reports/reports.py b/agbenchmark/reports/reports.py
index 17ddd399..e051c444 100644
--- a/agbenchmark/reports/reports.py
+++ b/agbenchmark/reports/reports.py
@@ -65,7 +65,6 @@ def generate_combined_suite_report(
             "success": False,
         },
     }
-
     if scores["scores_obj"][test_name] == 1:
         # add dependency successful here
diff --git a/agbenchmark/start_benchmark.py b/agbenchmark/start_benchmark.py
index 0ca906be..c5078408 100644
--- a/agbenchmark/start_benchmark.py
+++ b/agbenchmark/start_benchmark.py
@@ -76,22 +76,14 @@ def start(
         )
         return 1
 
+    print("CONFIG_PATH", CONFIG_PATH)
+
     if not os.path.exists(CONFIG_PATH) or os.stat(CONFIG_PATH).st_size == 0:
         config = {}
         config["workspace"] = click.prompt(
             "Please enter a new workspace path",
-            default=os.path.join(Path.home(), "workspace"),
-        )
-
-        config["entry_path"] = click.prompt(
-            "Please enter a the path to your run_specific_agent function implementation within the benchmarks folder",
-            default="agbenchmark/benchmarks.py",
-        )
-
-        config["cutoff"] = click.prompt(
-            "Please enter a hard cutoff runtime for your agent per test",
-            default="60",
+            default=os.path.join("workspace"),
         )
 
         with open(CONFIG_PATH, "w") as f:
diff --git a/agbenchmark/utils/challenge.py b/agbenchmark/utils/challenge.py
index 5a8e1273..d23d3ec2 100644
--- a/agbenchmark/utils/challenge.py
+++ b/agbenchmark/utils/challenge.py
@@ -120,7 +120,7 @@ class Challenge(ABC):
             print_content = (
                 f"\033[1;34mWord that should exist\033[0m - {should_contain_word}:"
             )
-            if self.data.ground.type == "file_llm_evaluation":
+            if ground.type == "file_llm_evaluation":
                 return self.llm_eval(content, should_contain_word)
             elif should_contain_word not in content:
                 print(print_content, "False")
@@ -164,46 +164,49 @@ ANSWER:
         scores_dict = {}
         percentage = None
 
-        if isinstance(self.data.ground, Ground):
-            files_contents = self.get_artifacts_out(
-                config["workspace"], self.data.ground
-            )
-
-            for file_content in files_contents:
-                score = self.scoring(file_content, self.data.ground)
-                print("\033[1;32mYour score is:\033[0m", score)
-                scores.append(score)
-        elif isinstance(self.data.ground, dict):
-            # if it's a dict then we know its a combined suite
-            for ground_key in self.data.ground:
-                ground = self.data.ground[ground_key]
-                files_contents = self.get_artifacts_out(config["workspace"], ground)
+        try:
+            if isinstance(self.data.ground, Ground):
+                files_contents = self.get_artifacts_out(
+                    config["workspace"], self.data.ground
+                )
 
                 for file_content in files_contents:
-                    score = self.scoring(file_content, ground)
-                    scores_dict[ground_key] = score
-                    print(
-                        f"\033[1;35mScore for {ground_key}:\033[0m",
-                        scores_dict[ground_key],
-                    )
+                    score = self.scoring(file_content, self.data.ground)
+                    print("\033[1;32mYour score is:\033[0m", score)
+                    scores.append(score)
+            elif isinstance(self.data.ground, dict):
+                # if it's a dict then we know its a combined suite
+                for ground_key in self.data.ground:
+                    ground = self.data.ground[ground_key]
+                    files_contents = self.get_artifacts_out(config["workspace"], ground)
 
-            # Count the number of times the value 1.0 appears in the dictionary
-            num_ones = sum(1 for score in scores_dict.values() if score == 1.0)
+                    for file_content in files_contents:
+                        score = self.scoring(file_content, ground)
+                        scores_dict[ground_key] = score
+                        print(
+                            f"\033[1;35mScore for {ground_key}:\033[0m",
+                            scores_dict[ground_key],
+                        )
 
-            # Calculate the percentage
-            percentage = round((num_ones / len(scores_dict)) * 100, 2)
+                # Count the number of times the value 1.0 appears in the dictionary
+                num_ones = sum(1 for score in scores_dict.values() if score == 1.0)
 
-            # Print the result in green
-            print(f"\033[1;92mPercentage of 1.0 scores:\033[0m {percentage}%")
+                # Calculate the percentage
+                percentage = round((num_ones / len(scores_dict)) * 100, 2)
 
-            # TODO: in an ideal world it only returns 1.0 if all of the tests pass but then the dependencies break.
-            # So for now we return 1.0 if there's any that pass
-            if percentage > 0:
-                scores.append(1.0)
-                if percentage != 100:
-                    print(
-                        "\033[1;93mWARNING:\033[0m Your agent did not pass all the tests in the suite."
-                    )
+                # Print the result in green
+                print(f"\033[1;92mPercentage of 1.0 scores:\033[0m {percentage}%")
+
+                # TODO: in an ideal world it only returns 1.0 if all of the tests pass but then the dependencies break.
+                # So for now we return 1.0 if there's any that pass
+                if percentage > 0:
+                    scores.append(1.0)
+                    if percentage != 100:
+                        print(
+                            "\033[1;93mWARNING:\033[0m Your agent did not pass all the tests in the suite."
+                        )
+        except Exception as e:
+            print("Error getting scores", e)
 
         scores_data = {
             "values": scores,