fixing backslashes, adding basic metrics (#89)

Silen Naihin
2023-07-12 01:37:59 -04:00
committed by GitHub
parent e292ffebaf
commit 8d0c5179ed
26 changed files with 418 additions and 206 deletions

View File

@@ -3,7 +3,9 @@ import os
import sys
import time
from datetime import datetime
from typing import Any, Dict, Union
from typing import Any, Dict
from agbenchmark.utils import get_highest_success_difficulty
class ReportManager:
@@ -23,7 +25,6 @@ class ReportManager:
if file_content: # if file is not empty, load the json
data = json.loads(file_content)
self.tests = {k: data[k] for k in sorted(data)}
data = self.replace_backslash(data)
else: # if file is empty, assign an empty dictionary
self.tests = {}
except FileNotFoundError:
@@ -36,8 +37,9 @@ class ReportManager:
with open(self.filename, "w") as f:
json.dump(self.tests, f, indent=4)
def add_test(self, test_name: str, test_details: dict) -> None:
def add_test(self, test_name: str, test_details: dict | list) -> None:
self.tests[test_name] = test_details
self.save()
def remove_test(self, test_name: str) -> None:
@@ -50,19 +52,12 @@ class ReportManager:
self.tests = {
"command": command.split(os.sep)[-1],
"completion_time": datetime.now().strftime("%Y-%m-%d-%H:%M"),
"time_elapsed": str(round(time.time() - self.start_time, 2)) + " seconds",
"metrics": {
"run_time": str(round(time.time() - self.start_time, 2)) + " seconds",
"highest_difficulty": get_highest_success_difficulty(self.tests),
},
"tests": self.tests,
"config": config,
}
self.save()
def replace_backslash(self, value: str) -> Union[str, list[str], dict]:
if isinstance(value, str):
return value.replace("\\\\", "/")  # replace escaped backslashes with forward slashes
elif isinstance(value, list):
return [self.replace_backslash(i) for i in value]
elif isinstance(value, dict):
return {k: self.replace_backslash(v) for k, v in value.items()}
else:
return value
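To make the new report layout concrete, here is a minimal self-contained sketch of the dictionary that end_info_report now builds. The field names come from the hunk above; the tests/config values, the command path, and the one-line get_highest_success_difficulty stand-in are illustrative only.

import os
import time
from datetime import datetime

# Illustrative stand-ins for the manager's state; field names mirror end_info_report above.
start_time = time.time()
tests = {
    "TestWriteFile": {"metrics": {"difficulty": "interface", "success": True}},
}

def get_highest_success_difficulty(data: dict) -> str:
    # simplified stand-in for the helper in agbenchmark/utils.py
    return "interface: 1"

command = "/usr/local/bin/agbenchmark"  # illustrative command path
config = {"entry_path": "agbenchmark.benchmarks", "cutoff": 60}

report = {
    "command": command.split(os.sep)[-1],
    "completion_time": datetime.now().strftime("%Y-%m-%d-%H:%M"),
    "metrics": {
        "run_time": str(round(time.time() - start_time, 2)) + " seconds",
        "highest_difficulty": get_highest_success_difficulty(tests),
    },
    "tests": tests,
    "config": config,
}
print(report["metrics"])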

View File

@@ -23,26 +23,10 @@ def run_agent(
"""Calling to get a response"""
if MOCK_FLAG:
print("ITS A MOCK TEST", challenge_location)
copy_artifacts_into_workspace(
config["workspace"], "artifacts_out", challenge_location
)
else:
timeout = config["cutoff"]
print(
f"Running Python function '{config['entry_path']}' with timeout {timeout}"
)
command = [sys.executable, "-m", config["entry_path"], str(task)]
process = subprocess.Popen(
command,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
universal_newlines=True,
cwd=os.getcwd(),
)
start_time = time.time()
print(
f"Running Python function '{config['entry_path']}' with timeout {config['cutoff']}"
)
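For context on the non-mock branch, a hedged sketch of invoking the configured entry point as a module with the cutoff applied as a timeout. The command construction and Popen arguments mirror lines shown in the hunk above; the config values, the task string, and the communicate/timeout handling are assumptions added for illustration.

import os
import subprocess
import sys
import time

# Values mirror the sample config later in this commit; the task string is illustrative.
config = {"entry_path": "agbenchmark.benchmarks", "cutoff": 60}
task = "Write the word 'Washington' to a .txt file"

command = [sys.executable, "-m", config["entry_path"], str(task)]
process = subprocess.Popen(
    command,
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,
    universal_newlines=True,
    cwd=os.getcwd(),
)
start_time = time.time()
try:
    # Illustrative cutoff handling; the hunk above only shows the Popen call being set up.
    output, _ = process.communicate(timeout=config["cutoff"])
except subprocess.TimeoutExpired:
    process.kill()
    output = ""
print(f"Ran '{config['entry_path']}' for {round(time.time() - start_time, 2)} seconds")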

View File

@@ -13,6 +13,6 @@
"info": {
"difficulty": "basic",
"description": "Tests ability for the agent to debug python code with a simple typo in it.",
"side_effects": ["tests if there is in fact an LLM attached"]
"side_effects": []
}
}

View File

@@ -11,8 +11,8 @@
"type": "execute_python_code"
},
"info": {
"difficulty": "medium",
"difficulty": "novice",
"description": "Tests ability for the agent to debug python code with a simple typo in it, using a very broad prompt without guidance",
"side_effects": ["tests if there is in fact an LLM attached"]
"side_effects": []
}
}

View File

@@ -11,7 +11,7 @@
"type": "custom_python"
},
"info": {
"difficulty": "medium",
"difficulty": "advanced",
"description": "Tests ability for the agent to build a simple web server locally",
"side_effects": []
}

View File

@@ -1,15 +1,52 @@
import json
from enum import Enum
from pathlib import Path
from typing import List, Optional
from pydantic import BaseModel
from pydantic import BaseModel, validator
class DifficultyLevel(Enum):
interface = "interface"
basic = "basic"
novice = "novice"
intermediate = "intermediate"
advanced = "advanced"
expert = "expert"
human = "human"
# map from enum to difficulty level (numeric)
DIFFICULTY_MAP = {
DifficultyLevel.interface: 1,
DifficultyLevel.basic: 2,
DifficultyLevel.novice: 3,
DifficultyLevel.intermediate: 4,
DifficultyLevel.advanced: 5,
DifficultyLevel.expert: 6,
DifficultyLevel.human: 7,
}
class Info(BaseModel):
difficulty: str
difficulty: DifficultyLevel
description: str
side_effects: List[str]
@validator("difficulty", pre=True)
def difficulty_to_enum(cls: "Info", v: str | DifficultyLevel) -> DifficultyLevel:
"""Convert a string to an instance of DifficultyLevel."""
if isinstance(v, DifficultyLevel):
return v
if isinstance(v, str):
try:
return DifficultyLevel(v.lower())
except ValueError:
pass
raise ValueError(f"Cannot convert {v} to DifficultyLevel.")
class Ground(BaseModel):
answer: str
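A short usage sketch of the new validator: difficulty strings loaded from data.json are coerced into DifficultyLevel when an Info model is constructed, and unknown strings (such as the old "medium" label this commit removes) are rejected. The description and side_effects values here are illustrative.

from agbenchmark.challenges.define_task_types import DifficultyLevel, Info

# A plain string from data.json is coerced into the enum.
info = Info(
    difficulty="novice",
    description="Tests the writing to file",
    side_effects=[],
)
assert info.difficulty is DifficultyLevel.novice

# An unrecognised difficulty string is rejected (pydantic surfaces it as a ValidationError).
try:
    Info(difficulty="medium", description="placeholder", side_effects=[])
except ValueError as err:
    print(err)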

View File

@@ -11,7 +11,7 @@
},
"info": {
"description": "This reads the file quickly",
"difficulty": "basic",
"difficulty": "interface",
"side_effects": [""]
}
}

View File

@@ -11,7 +11,7 @@
"type": "file"
},
"info": {
"difficulty": "basic",
"difficulty": "interface",
"description": "Tests if an llm can search",
"side_effects": [""]
}

View File

@@ -11,7 +11,7 @@
"type": "file"
},
"info": {
"difficulty": "basic",
"difficulty": "interface",
"description": "Tests the writing to file",
"side_effects": ["tests if there is in fact an LLM attached"]
}

View File

@@ -10,13 +10,9 @@
"files": [".txt"],
"type": "file"
},
"mock": {
"mock_func": "basic_memory_mock",
"mock_task": "Follow the instructions in the instructions_1.txt file"
},
"info": {
"difficulty": "basic",
"description": "Tests ability for the agent to remember information between each action. An id is presented initially and the agent has to remember it after reading 4 other files",
"side_effects": ["tests if there is in fact an LLM attached"]
"side_effects": []
}
}

View File

@@ -11,8 +11,8 @@
"type": "file"
},
"info": {
"difficulty": "basic",
"difficulty": "novice",
"description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file.",
"side_effects": ["tests if there is in fact an LLM attached"]
"side_effects": []
}
}

View File

@@ -11,8 +11,8 @@
"type": "file"
},
"info": {
"difficulty": "medium",
"difficulty": "intermediate",
"description": "Tests ability for the agent to remember information between each action. Multiple ids are presented in different files and the last file instructs the agent to write these ids into another file. Some noise is also added to the files to test the agent's ability to filter out noise.",
"side_effects": ["tests if there is in fact an LLM attached"]
"side_effects": []
}
}

View File

@@ -16,8 +16,8 @@
"type": "file"
},
"info": {
"difficulty": "medium",
"difficulty": "advanced",
"description": "Tests ability for the agent to remember information between each action. Multiple phrases are presented in different files and the last file instructs the agent to write these phrases into another file. Some noise is also added to the files to test the agent's ability to filter out noise.",
"side_effects": ["tests if there is in fact an LLM attached"]
"side_effects": []
}
}

View File

@@ -13,6 +13,6 @@
"info": {
"difficulty": "basic",
"description": "Tests ability to retrieve information from a website.",
"side_effects": ["tests if there is in fact an LLM attached"]
"side_effects": []
}
}

View File

@@ -11,7 +11,7 @@
"type": "file"
},
"info": {
"difficulty": "basic",
"difficulty": "novice",
"description": "Tests ability to retrieve information.",
"side_effects": ["tests if there is in fact an LLM attached"]
}

View File

@@ -27,7 +27,7 @@
"type": "file"
},
"info": {
"difficulty": "basic",
"difficulty": "intermediate",
"description": "Tests ability to retrieve information.",
"side_effects": ["tests if there is in fact an LLM attached"]
}

View File

@@ -9,15 +9,10 @@ from pathlib import Path
from typing import Any, Dict
import pytest
from dotenv import load_dotenv
from agbenchmark.challenge import Challenge
from agbenchmark.start_benchmark import CURRENT_DIRECTORY
load_dotenv()
IMPROVE = os.getenv("IMPROVE", "False")
from agbenchmark.utils import replace_backslash
json_files = glob.glob(f"{CURRENT_DIRECTORY}/**/data.json", recursive=True)
@@ -36,7 +31,11 @@ def get_test_path(json_file: str) -> str:
# Create the path from "agbenchmark" onwards
challenge_location = Path(*path.parts[agbenchmark_index:])
return str(challenge_location)
formatted_location = replace_backslash(str(challenge_location))
if isinstance(formatted_location, str):
return formatted_location
else:
return str(challenge_location)
def generate_tests() -> None:
@@ -68,7 +67,7 @@ def generate_tests() -> None:
)
sys.path.append(str(custom_python_location))
for (module_loader, name, ispkg) in pkgutil.iter_modules(
for module_loader, name, ispkg in pkgutil.iter_modules(
[str(custom_python_location)]
):
module = importlib.import_module(name)
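The effect of routing the challenge location through replace_backslash (added to agbenchmark/utils.py later in this commit) is to normalise Windows-style separators in the reported path, for example:

from agbenchmark.utils import replace_backslash

# On Windows, Path parts join with backslashes; the helper normalises them for reporting.
print(replace_backslash("agbenchmark\\challenges\\memory\\m1"))
# -> agbenchmark/challenges/memory/m1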

View File

@@ -1,6 +1,8 @@
import json
import os
import shutil
import sys
import time
from pathlib import Path # noqa
from typing import Any, Dict, Generator
@@ -13,6 +15,7 @@ from agbenchmark.start_benchmark import (
REGRESSION_TESTS_PATH,
get_regression_data,
)
from agbenchmark.utils import calculate_success_percentage
def resolve_workspace(config: Dict[str, Any]) -> str:
@@ -107,9 +110,29 @@ def challenge_data(request: Any) -> None:
return request.param
@pytest.fixture(autouse=True, scope="session")
def mock(request: Any) -> None:
return request.config.getoption("--mock")
@pytest.fixture(autouse=True, scope="function")
def timer(request: Any) -> Any:
start_time = time.time()
yield
run_time = time.time() - start_time
request.node.user_properties.append(("run_time", run_time))
# tests that consistently pass are considered regression tests
regression_manager = ReportManager(REGRESSION_TESTS_PATH)
# user facing reporting information
info_manager = ReportManager(INFO_TESTS_PATH)
INTERNAL_LOGS = Path(__file__).resolve().parent # agbenchmark/conftest.py
# internal db stand-in; tracks the pass/fail history for each test
internal_info = ReportManager(str(INTERNAL_LOGS / "internal_info.json"))
def pytest_runtest_makereport(item: Any, call: Any) -> None:
if call.when == "call":
@@ -122,23 +145,66 @@ def pytest_runtest_makereport(item: Any, call: Any) -> None:
)
# Extract the challenge_location from the class
challenge_location: str = getattr(item.cls, "CHALLENGE_LOCATION", "")
test_name = item.nodeid.split("::")[1]
item.test_name = test_name
test_details = {
"difficulty": difficulty,
"dependencies": dependencies,
"test": challenge_location,
"data_path": challenge_location,
}
print("pytest_runtest_makereport", test_details)
if call.excinfo is None:
regression_manager.add_test(item.nodeid.split("::")[1], test_details)
test_details["success"] = True
else:
regression_manager.remove_test(item.nodeid.split("::")[1])
test_details["success"] = False
test_details["fail_reason"] = str(call.excinfo.value)
info_details: Any = {
"data_path": challenge_location,
"is_regression": False,
"metrics": {
"difficulty": difficulty,
"success": False,
},
}
info_manager.add_test(item.nodeid.split("::")[1], test_details)
mock = "--mock" in sys.argv # Check if --mock is in sys.argv
if call.excinfo is None:
info_details["metrics"]["success"] = True
else:
if not mock: # don't remove if it's a mock test
regression_manager.remove_test(test_name)
info_details["metrics"]["fail_reason"] = str(call.excinfo.value)
prev_test_results: list[bool] = []
if not mock:
# only add if it's an actual test
prev_test_results = internal_info.tests.get(test_name, [])
prev_test_results.append(info_details["metrics"]["success"])
internal_info.add_test(test_name, prev_test_results)
# can calculate success rate regardless of mock
info_details["metrics"]["success_%"] = calculate_success_percentage(
prev_test_results
)
if len(prev_test_results) >= 3 and prev_test_results[-3:] == [True, True, True]:
# if the last 3 tests were successful, add to the regression tests
info_details["is_regression"] = True
regression_manager.add_test(test_name, test_details)
# user facing reporting
item.info_details = info_details
if call.when == "teardown":
run_time = dict(item.user_properties).get("run_time")
info_details = getattr(item, "info_details", {})
test_name = getattr(item, "test_name", "")
if info_details and test_name:
if run_time:
info_details["metrics"][
"run_time"
] = f"{str(round(run_time, 3))} seconds"
info_manager.add_test(test_name, info_details)
def pytest_sessionfinish(session: Any) -> None:
@@ -146,6 +212,7 @@ def pytest_sessionfinish(session: Any) -> None:
with open(CONFIG_PATH, "r") as f:
config = json.load(f)
internal_info.save()
info_manager.end_info_report(config)
regression_manager.save()
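To make the new bookkeeping concrete, a small self-contained sketch of the pass/fail history update and the promotion rule used above. The history values are illustrative; the real hook reads and writes this history through the internal_info ReportManager backed by internal_info.json.

from agbenchmark.utils import calculate_success_percentage

# Illustrative prior history for one test, as stored in internal_info.json.
prev_test_results = [False, True, True]

# Append the outcome of the current (non-mock) run.
prev_test_results.append(True)

success_percentage = calculate_success_percentage(prev_test_results)  # 75.0

# Promotion rule from pytest_runtest_makereport: three consecutive passes
# mark the test as a regression test.
is_regression = len(prev_test_results) >= 3 and prev_test_results[-3:] == [True, True, True]

print(success_percentage, is_regression)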

View File

@@ -0,0 +1,67 @@
{
"TestBasicMemory": [
true,
true,
true
],
"TestBasicRetrieval": [
true,
true,
true
],
"TestCreateSimpleWebServer": [
false,
false,
false
],
"TestDebugSimpleTypoWithGuidance": [
false,
false,
false
],
"TestDebugSimpleTypoWithoutGuidance": [
false,
false,
false
],
"TestReadFile": [
true,
true,
true
],
"TestRememberMultipleIds": [
true,
true,
true
],
"TestRememberMultipleIdsWithNoise": [
true,
true,
true
],
"TestRememberMultiplePhrasesWithNoise": [
true,
true,
true
],
"TestRetrieval2": [
true,
true,
true
],
"TestRetrieval3": [
true,
true,
true
],
"TestSearch": [
true,
true,
true
],
"TestWriteFile": [
true,
true,
true
]
}

View File

@@ -1,20 +1,11 @@
{
"TestReadFile": {
"difficulty": "basic",
"dependencies": [
"TestWriteFile"
],
"test": "agbenchmark/challenges/interface/read_file",
"success": true
},
"TestBasicMemory": {
"difficulty": "basic",
"dependencies": [
"TestReadFile",
"TestWriteFile"
],
"test": "agbenchmark/challenges/memory/m1",
"success": true
"data_path": "agbenchmark/challenges/memory/m1"
},
"TestBasicRetrieval": {
"difficulty": "basic",
@@ -22,78 +13,60 @@
"TestWriteFile",
"TestSearch"
],
"test": "agbenchmark/challenges/retrieval/r1",
"success": true
"data_path": "agbenchmark/challenges/retrieval/r1"
},
"TestReadFile": {
"difficulty": "basic",
"dependencies": [
"TestWriteFile"
],
"data_path": "agbenchmark/challenges/interface/read_file"
},
"TestRememberMultipleIds": {
"difficulty": "basic",
"dependencies": [
"TestBasicMemory"
],
"test": "agbenchmark/challenges/memory/m2",
"success": true
},
"TestRetrieval2": {
"difficulty": "basic",
"dependencies": [
"TestBasicRetrieval"
],
"test": "agbenchmark/challenges/retrieval/r2",
"success": true
"data_path": "agbenchmark/challenges/memory/m2"
},
"TestRememberMultipleIdsWithNoise": {
"difficulty": "medium",
"dependencies": [
"TestRememberMultipleIds"
],
"test": "agbenchmark/challenges/memory/m3",
"success": true
},
"TestRetrieval3": {
"difficulty": "basic",
"dependencies": [
"TestRetrieval2"
],
"test": "agbenchmark/challenges/retrieval/r3",
"success": true
"data_path": "agbenchmark/challenges/memory/m3"
},
"TestRememberMultiplePhrasesWithNoise": {
"difficulty": "medium",
"dependencies": [
"TestRememberMultipleIdsWithNoise"
],
"test": "agbenchmark/challenges/memory/m4",
"success": true
"data_path": "agbenchmark/challenges/memory/m4"
},
"TestRetrieval2": {
"difficulty": "basic",
"dependencies": [
"TestBasicRetrieval"
],
"data_path": "agbenchmark/challenges/retrieval/r2"
},
"TestRetrieval3": {
"difficulty": "basic",
"dependencies": [
"TestRetrieval2"
],
"data_path": "agbenchmark/challenges/retrieval/r3"
},
"TestSearch": {
"difficulty": "basic",
"dependencies": [
"TestWriteFile"
],
"test": "agbenchmark/challenges/interface/search",
"success": true
"data_path": "agbenchmark/challenges/interface/search"
},
"TestWriteFile": {
"difficulty": "basic",
"dependencies": [],
"test": "agbenchmark/challenges/interface/write_file",
"success": true
},
"TestDebugSimpleTypoWithGuidance": {
"difficulty": "basic",
"dependencies": [
"TestReadFile",
"TestWriteFile"
],
"test": "agbenchmark/challenges/code/d1",
"success": true
},
"TestDebugSimpleTypoWithoutGuidance": {
"difficulty": "medium",
"dependencies": [
"TestDebugSimpleTypoWithGuidance"
],
"test": "agbenchmark/challenges/code/d2",
"success": true
"data_path": "agbenchmark/challenges/interface/write_file"
}
}

View File

@@ -1,109 +1,148 @@
{
"command": "agbenchmark start --mock",
"completion_time": "2023-07-10-21:19",
"time_elapsed": "8.75 seconds",
"completion_time": "2023-07-11-21:09",
"metrics": {
"run_time": "0.96 seconds",
"highest_difficulty": "advanced: 5"
},
"tests": {
"TestWriteFile": {
"difficulty": "basic",
"dependencies": [],
"test": "agbenchmark/challenges/interface/write_file",
"success": true
"data_path": "agbenchmark/challenges/interface/write_file",
"is_regression": false,
"metrics": {
"difficulty": "interface",
"success": true,
"success_%": 0,
"run_time": "0.008 seconds"
}
},
"TestReadFile": {
"difficulty": "basic",
"dependencies": [
"TestWriteFile"
],
"test": "agbenchmark/challenges/interface/read_file",
"success": true
"data_path": "agbenchmark/challenges/interface/read_file",
"is_regression": false,
"metrics": {
"difficulty": "interface",
"success": true,
"success_%": 0,
"run_time": "0.005 seconds"
}
},
"TestSearch": {
"difficulty": "basic",
"dependencies": [
"TestWriteFile"
],
"test": "agbenchmark/challenges/interface/search",
"success": true
"data_path": "agbenchmark/challenges/interface/search",
"is_regression": false,
"metrics": {
"difficulty": "interface",
"success": true,
"success_%": 0,
"run_time": "0.006 seconds"
}
},
"TestDebugSimpleTypoWithGuidance": {
"difficulty": "basic",
"dependencies": [
"TestReadFile",
"TestWriteFile"
],
"test": "agbenchmark/challenges/code/d1",
"success": true
"data_path": "agbenchmark/challenges/code/d1",
"is_regression": false,
"metrics": {
"difficulty": "basic",
"success": false,
"fail_reason": "assert 1 in [0.0]",
"success_%": 0,
"run_time": "0.489 seconds"
}
},
"TestBasicMemory": {
"difficulty": "basic",
"dependencies": [
"TestReadFile",
"TestWriteFile"
],
"test": "agbenchmark/challenges/memory/m1",
"success": true
"data_path": "agbenchmark/challenges/memory/m1",
"is_regression": false,
"metrics": {
"difficulty": "basic",
"success": true,
"success_%": 0,
"run_time": "0.02 seconds"
}
},
"TestBasicRetrieval": {
"difficulty": "basic",
"dependencies": [
"TestWriteFile",
"TestSearch"
],
"test": "agbenchmark/challenges/retrieval/r1",
"success": true
"data_path": "agbenchmark/challenges/retrieval/r1",
"is_regression": false,
"metrics": {
"difficulty": "basic",
"success": true,
"success_%": 0,
"run_time": "0.01 seconds"
}
},
"TestDebugSimpleTypoWithoutGuidance": {
"difficulty": "medium",
"dependencies": [
"TestDebugSimpleTypoWithGuidance"
],
"test": "agbenchmark/challenges/code/d2",
"success": true
"data_path": "agbenchmark/challenges/code/d2",
"is_regression": false,
"metrics": {
"difficulty": "novice",
"success": false,
"fail_reason": "agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithoutGuidance::test_method[challenge_data0] depends on agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]",
"success_%": 0,
"run_time": "0.001 seconds"
}
},
"TestCreateSimpleWebServer": {
"data_path": "agbenchmark/challenges/code/d3",
"is_regression": false,
"metrics": {
"difficulty": "advanced",
"success": false,
"fail_reason": "agbenchmark/challenges/test_all.py::TestCreateSimpleWebServer::test_method[challenge_data0] depends on agbenchmark/challenges/test_all.py::TestDebugSimpleTypoWithGuidance::test_method[challenge_data0]",
"success_%": 0,
"run_time": "0.001 seconds"
}
},
"TestRememberMultipleIds": {
"difficulty": "basic",
"dependencies": [
"TestBasicMemory"
],
"test": "agbenchmark/challenges/memory/m2",
"success": true
"data_path": "agbenchmark/challenges/memory/m2",
"is_regression": false,
"metrics": {
"difficulty": "novice",
"success": true,
"success_%": 0,
"run_time": "0.018 seconds"
}
},
"TestRetrieval2": {
"difficulty": "basic",
"dependencies": [
"TestBasicRetrieval"
],
"test": "agbenchmark/challenges/retrieval/r2",
"success": true
"data_path": "agbenchmark/challenges/retrieval/r2",
"is_regression": false,
"metrics": {
"difficulty": "novice",
"success": true,
"success_%": 0,
"run_time": "0.009 seconds"
}
},
"TestRememberMultipleIdsWithNoise": {
"difficulty": "medium",
"dependencies": [
"TestRememberMultipleIds"
],
"test": "agbenchmark/challenges/memory/m3",
"success": true
"data_path": "agbenchmark/challenges/memory/m3",
"is_regression": false,
"metrics": {
"difficulty": "intermediate",
"success": true,
"success_%": 0,
"run_time": "0.022 seconds"
}
},
"TestRetrieval3": {
"difficulty": "basic",
"dependencies": [
"TestRetrieval2"
],
"test": "agbenchmark/challenges/retrieval/r3",
"success": true
"data_path": "agbenchmark/challenges/retrieval/r3",
"is_regression": false,
"metrics": {
"difficulty": "intermediate",
"success": true,
"success_%": 0,
"run_time": "0.01 seconds"
}
},
"TestRememberMultiplePhrasesWithNoise": {
"difficulty": "medium",
"dependencies": [
"TestRememberMultipleIdsWithNoise"
],
"test": "agbenchmark/challenges/memory/m4",
"success": true
"data_path": "agbenchmark/challenges/memory/m4",
"is_regression": false,
"metrics": {
"difficulty": "advanced",
"success": true,
"success_%": 0,
"run_time": "0.021 seconds"
}
}
},
"config": {
"workspace": "${os.path.join(Path.home(), 'miniagi')}",
"entry_path": "agbenchmark/benchmarks.py",
"entry_path": "agbenchmark.benchmarks",
"cutoff": 60
}
}

View File

@@ -1,6 +1,10 @@
# radio charts, logs, helper functions for tests, anything else relevant.
import glob
import re
from pathlib import Path
from typing import Any
from agbenchmark.challenges.define_task_types import DIFFICULTY_MAP, DifficultyLevel
def calculate_info_test_path(benchmarks_folder_path: Path) -> str:
@@ -15,3 +19,54 @@ def calculate_info_test_path(benchmarks_folder_path: Path) -> str:
run_name = f"{file_count + 1}.json"
new_file_path = INFO_TESTS_PATH / run_name
return str(new_file_path)
def replace_backslash(value: Any) -> Any:
if isinstance(value, str):
return re.sub(
r"\\+", "/", value
) # replace one or more backslashes with a forward slash
elif isinstance(value, list):
return [replace_backslash(i) for i in value]
elif isinstance(value, dict):
return {k: replace_backslash(v) for k, v in value.items()}
else:
return value
def calculate_success_percentage(results: list[bool]) -> float:
success_count = results.count(True)
total_count = len(results)
if total_count == 0:
return 0
success_percentage = (success_count / total_count) * 100 # as a percentage
return round(success_percentage, 2)
def get_highest_success_difficulty(data: dict) -> str:
highest_difficulty = None
highest_difficulty_level = -1
for test_name, test_data in data.items():
if test_data["metrics"]["success"]:
# difficulty is stored as a plain string in the metrics block
difficulty_str = test_data["metrics"]["difficulty"]
try:
difficulty_enum = DifficultyLevel[difficulty_str.lower()]
difficulty_level = DIFFICULTY_MAP[difficulty_enum]
if difficulty_level > highest_difficulty_level:
highest_difficulty = difficulty_enum
highest_difficulty_level = difficulty_level
except KeyError:
print(
f"Unexpected difficulty level '{difficulty_str}' in test '{test_name}'"
)
if highest_difficulty is not None:
highest_difficulty_str = highest_difficulty.name # convert enum to string
else:
highest_difficulty_str = ""
return f"{highest_difficulty_str}: {highest_difficulty_level}"