feat: add additional goosebench evals (#1571)

Co-authored-by: Alice Hau <alice.a.hau@gmail.com>
2025-12-18 14:44:21 +01:00 · 2025-03-10 15:11:44 -04:00
parent 8689d24407
commit bb4feacf03
14 changed files with 859 additions and 3 deletions
--- a/crates/goose-cli/src/commands/bench.rs
+++ b/crates/goose-cli/src/commands/bench.rs
@@ -56,6 +56,20 @@ impl BenchAgent for BenchSession {
        let errors = self.errors.lock().await;
        errors.clone()
    }
+
+    async fn get_token_usage(&self) -> Option<i32> {
+        // Ask the session for its usage records; a failed lookup yields `None`.
+        let records = self.session.get_usage().await.ok()?;
+
+        // Add up the total token count of every record, treating a record
+        // that reports no total as contributing zero tokens.
+        let mut token_sum = 0;
+        for record in records.iter() {
+            token_sum += record.usage.total_tokens.unwrap_or(0);
+        }
+
+        Some(token_sum)
+    }
 }

 // Wrapper struct to implement BenchAgent for Arc<Mutex<BenchSession>>
@@ -72,6 +86,11 @@ impl BenchAgent for BenchAgentWrapper {
        let session = self.0.lock().await;
        session.get_errors().await
    }
+
+    async fn get_token_usage(&self) -> Option<i32> {
+        // Hold the session lock while delegating to the inner implementation.
+        self.0.lock().await.get_token_usage().await
+    }
 }

 async fn run_eval(