feat: goose to read PDFs (#1522)

This commit is contained in:
Michael Neale
2025-03-05 16:12:18 +11:00
committed by GitHub
parent b06cbdd6ff
commit e03e3c6037
7 changed files with 576 additions and 1 deletions

View File

@@ -19,6 +19,8 @@ use mcp_core::{
use mcp_server::router::CapabilitiesBuilder;
use mcp_server::Router;
mod pdf_tool;
mod platform;
use platform::{create_system_automation, SystemAutomation};
@@ -232,6 +234,33 @@ impl ComputerControllerRouter {
}),
);
let pdf_tool = Tool::new(
"pdf_tool",
indoc! {r#"
Process PDF files to extract text and images.
Supports operations:
- extract_text: Extract all text content from the PDF
- extract_images: Extract and save embedded images to PNG files
Use this when there is a .pdf file or files that need to be processed.
"#},
json!({
"type": "object",
"required": ["path", "operation"],
"properties": {
"path": {
"type": "string",
"description": "Path to the PDF file"
},
"operation": {
"type": "string",
"enum": ["extract_text", "extract_images"],
"description": "Operation to perform on the PDF"
}
}
}),
);
// choose_app_strategy().cache_dir()
// - macOS/Linux: ~/.cache/goose/computer_controller/
// - Windows: ~\AppData\Local\Block\goose\cache\computer_controller\
@@ -359,6 +388,7 @@ impl ComputerControllerRouter {
quick_script_tool,
computer_control_tool,
cache_tool,
pdf_tool,
],
cache_dir,
active_resources: Arc::new(Mutex::new(HashMap::new())),
@@ -653,6 +683,20 @@ impl ComputerControllerRouter {
}
// Implement cache tool functionality
async fn pdf_tool(&self, params: Value) -> Result<Vec<Content>, ToolError> {
let path = params
.get("path")
.and_then(|v| v.as_str())
.ok_or_else(|| ToolError::InvalidParameters("Missing 'path' parameter".into()))?;
let operation = params
.get("operation")
.and_then(|v| v.as_str())
.ok_or_else(|| ToolError::InvalidParameters("Missing 'operation' parameter".into()))?;
crate::computercontroller::pdf_tool::pdf_tool(path, operation, &self.cache_dir).await
}
async fn cache(&self, params: Value) -> Result<Vec<Content>, ToolError> {
let command = params
.get("command")
@@ -764,6 +808,7 @@ impl Router for ComputerControllerRouter {
"automation_script" => this.quick_script(arguments).await,
"computer_control" => this.computer_control(arguments).await,
"cache" => this.cache(arguments).await,
"pdf_tool" => this.pdf_tool(arguments).await,
_ => Err(ToolError::NotFound(format!("Tool {} not found", tool_name))),
}
})

View File

@@ -0,0 +1,422 @@
use lopdf::{content::Content as PdfContent, Document, Object};
use mcp_core::{Content, ToolError};
use std::{fs, path::Path};
pub async fn pdf_tool(
path: &str,
operation: &str,
cache_dir: &Path,
) -> Result<Vec<Content>, ToolError> {
// Open and parse the PDF file
let doc = Document::load(path)
.map_err(|e| ToolError::ExecutionError(format!("Failed to open PDF file: {}", e)))?;
let result = match operation {
"extract_text" => {
let mut text = String::new();
// Iterate over each page in the document
for (page_num, page_id) in doc.get_pages() {
text.push_str(&format!("Page {}:\n", page_num));
// Try to get text from page contents
if let Ok(page_obj) = doc.get_object(page_id) {
if let Ok(page_dict) = page_obj.as_dict() {
// Try to get text from Contents stream
if let Ok(contents) =
page_dict.get(b"Contents").and_then(|c| c.as_reference())
{
if let Ok(content_obj) = doc.get_object(contents) {
if let Ok(stream) = content_obj.as_stream() {
if let Ok(content_data) = stream.get_plain_content() {
if let Ok(content) = PdfContent::decode(&content_data) {
// Process each operation in the content stream
for operation in content.operations {
match operation.operator.as_ref() {
// "Tj" operator: show text
"Tj" => {
for operand in operation.operands {
if let Object::String(ref bytes, _) =
operand
{
if let Ok(s) =
std::str::from_utf8(bytes)
{
text.push_str(s);
}
}
}
text.push(' ');
}
// "TJ" operator: show text with positioning
"TJ" => {
if let Some(Object::Array(ref arr)) =
operation.operands.first()
{
let mut last_was_text = false;
for element in arr {
match element {
Object::String(
ref bytes,
_,
) => {
if let Ok(s) =
std::str::from_utf8(
bytes,
)
{
if last_was_text {
text.push(' ');
}
text.push_str(s);
last_was_text = true;
}
}
Object::Integer(offset) => {
// Large negative offsets often indicate word spacing
if *offset < -100 {
text.push(' ');
last_was_text = false;
}
}
Object::Real(offset) => {
if *offset < -100.0 {
text.push(' ');
last_was_text = false;
}
}
_ => {}
}
}
text.push(' ');
}
}
_ => (), // Ignore other operators
}
}
}
}
}
}
}
}
}
text.push('\n');
}
if text.trim().is_empty() {
"No text found in PDF".to_string()
} else {
format!("Extracted text from PDF:\n\n{}", text)
}
}
"extract_images" => {
let cache_dir = cache_dir.join("pdf_images");
fs::create_dir_all(&cache_dir).map_err(|e| {
ToolError::ExecutionError(format!("Failed to create image cache directory: {}", e))
})?;
let mut images = Vec::new();
let mut image_count = 0;
// Helper function to determine file extension based on stream dict
fn get_image_extension(dict: &lopdf::Dictionary) -> &'static str {
if let Ok(filter) = dict.get(b"Filter") {
match filter {
Object::Name(name) => {
match name.as_slice() {
b"DCTDecode" => ".jpg",
b"JBIG2Decode" => ".jbig2",
b"JPXDecode" => ".jp2",
b"CCITTFaxDecode" => ".tiff",
b"FlateDecode" => {
// PNG-like images often use FlateDecode
// Check color space to confirm
if let Ok(cs) = dict.get(b"ColorSpace") {
if let Ok(name) = cs.as_name() {
if name == b"DeviceRGB" || name == b"DeviceGray" {
return ".png";
}
}
}
".raw"
}
_ => ".raw",
}
}
Object::Array(filters) => {
// If multiple filters, check the last one
if let Some(Object::Name(name)) = filters.last() {
match name.as_slice() {
b"DCTDecode" => return ".jpg",
b"JPXDecode" => return ".jp2",
_ => {}
}
}
".raw"
}
_ => ".raw",
}
} else {
".raw"
}
}
// Process each page
for (page_num, page_id) in doc.get_pages() {
let page = doc.get_object(page_id).map_err(|e| {
ToolError::ExecutionError(format!("Failed to get page {}: {}", page_num, e))
})?;
let page_dict = page.as_dict().map_err(|e| {
ToolError::ExecutionError(format!(
"Failed to get page dict {}: {}",
page_num, e
))
})?;
// Get page resources - handle both direct dict and reference
let resources = match page_dict.get(b"Resources") {
Ok(res) => match res {
Object::Dictionary(dict) => Ok(dict),
Object::Reference(id) => doc
.get_object(*id)
.map_err(|e| {
ToolError::ExecutionError(format!(
"Failed to get resource reference: {}",
e
))
})
.and_then(|obj| {
obj.as_dict().map_err(|e| {
ToolError::ExecutionError(format!(
"Resource reference is not a dictionary: {}",
e
))
})
}),
_ => Err(ToolError::ExecutionError(
"Resources is neither dictionary nor reference".to_string(),
)),
},
Err(e) => Err(ToolError::ExecutionError(format!(
"Failed to get Resources: {}",
e
))),
}?;
// Look for XObject dictionary - handle both direct dict and reference
let xobjects = match resources.get(b"XObject") {
Ok(xobj) => match xobj {
Object::Dictionary(dict) => Ok(dict),
Object::Reference(id) => doc
.get_object(*id)
.map_err(|e| {
ToolError::ExecutionError(format!(
"Failed to get XObject reference: {}",
e
))
})
.and_then(|obj| {
obj.as_dict().map_err(|e| {
ToolError::ExecutionError(format!(
"XObject reference is not a dictionary: {}",
e
))
})
}),
_ => Err(ToolError::ExecutionError(
"XObject is neither dictionary nor reference".to_string(),
)),
},
Err(e) => Err(ToolError::ExecutionError(format!(
"Failed to get XObject: {}",
e
))),
};
if let Ok(xobjects) = xobjects {
for (name, xobject) in xobjects.iter() {
let xobject_id = xobject.as_reference().map_err(|_| {
ToolError::ExecutionError("Failed to get XObject reference".to_string())
})?;
let xobject = doc.get_object(xobject_id).map_err(|e| {
ToolError::ExecutionError(format!("Failed to get XObject: {}", e))
})?;
if let Ok(stream) = xobject.as_stream() {
// Check if it's an image
if let Ok(subtype) =
stream.dict.get(b"Subtype").and_then(|s| s.as_name())
{
if subtype == b"Image" {
let extension = get_image_extension(&stream.dict);
// Get image metadata
let width = stream
.dict
.get(b"Width")
.and_then(|w| w.as_i64())
.unwrap_or(0);
let height = stream
.dict
.get(b"Height")
.and_then(|h| h.as_i64())
.unwrap_or(0);
let bpc = stream
.dict
.get(b"BitsPerComponent")
.and_then(|b| b.as_i64())
.unwrap_or(0);
// Get the image data
if let Ok(data) = stream.get_plain_content() {
let image_path = cache_dir.join(format!(
"page{}_obj{}_{}{}",
page_num,
xobject_id.0,
String::from_utf8_lossy(name),
extension
));
fs::write(&image_path, &data).map_err(|e| {
ToolError::ExecutionError(format!(
"Failed to write image: {}",
e
))
})?;
images.push(format!(
"Saved image to: {} ({}x{}, {} bits per component)",
image_path.display(),
width,
height,
bpc
));
image_count += 1;
}
}
}
}
}
}
}
if images.is_empty() {
"No images found in PDF".to_string()
} else {
format!("Found {} images:\n{}", image_count, images.join("\n"))
}
}
_ => {
return Err(ToolError::InvalidParameters(format!(
"Invalid operation: {}. Valid operations are: 'extract_text', 'extract_images'",
operation
)))
}
};
Ok(vec![Content::text(result)])
}
#[cfg(test)]
mod tests {
use super::*;
use std::path::PathBuf;
#[tokio::test]
async fn test_pdf_text_extraction() {
let test_pdf_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
.join("src/computercontroller/tests/data/test.pdf");
let cache_dir = tempfile::tempdir().unwrap().into_path();
println!("Testing text extraction from: {}", test_pdf_path.display());
let result = pdf_tool(test_pdf_path.to_str().unwrap(), "extract_text", &cache_dir).await;
assert!(result.is_ok(), "PDF text extraction should succeed");
let content = result.unwrap();
assert!(!content.is_empty(), "Extracted text should not be empty");
let text = content[0].as_text().unwrap();
println!("Extracted text:\n{}", text);
assert!(text.contains("Page 1"), "Should contain page marker");
assert!(
text.contains("This is a test PDF"),
"Should contain expected test content"
);
}
#[tokio::test]
async fn test_pdf_image_extraction() {
let test_pdf_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
.join("src/computercontroller/tests/data/test_image.pdf");
let cache_dir = tempfile::tempdir().unwrap().into_path();
println!("Testing image extraction from: {}", test_pdf_path.display());
// Now try image extraction
let result = pdf_tool(
test_pdf_path.to_str().unwrap(),
"extract_images",
&cache_dir,
)
.await;
println!("Image extraction result: {:?}", result);
assert!(result.is_ok(), "PDF image extraction should succeed");
let content = result.unwrap();
assert!(
!content.is_empty(),
"Image extraction result should not be empty"
);
let text = content[0].as_text().unwrap();
println!("Extracted content: {}", text);
// Should either find images or explicitly state none were found
assert!(
text.contains("Saved image to:") || text.contains("No images found"),
"Should either save images or report none found"
);
// If we found images, verify they exist
if text.contains("Saved image to:") {
// Extract the file path from the output
let file_path = text
.lines()
.find(|line| line.contains("Saved image to:"))
.and_then(|line| line.split(": ").nth(1))
.and_then(|path| path.split(" (").next())
.expect("Should have a valid file path");
println!("Verifying image file exists: {}", file_path);
assert!(PathBuf::from(file_path).exists(), "Image file should exist");
}
}
#[tokio::test]
async fn test_pdf_invalid_path() {
let cache_dir = tempfile::tempdir().unwrap().into_path();
let result = pdf_tool("nonexistent.pdf", "extract_text", &cache_dir).await;
assert!(result.is_err(), "Should fail with invalid path");
}
#[tokio::test]
async fn test_pdf_invalid_operation() {
let test_pdf_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
.join("src/computercontroller/tests/data/test.pdf");
let cache_dir = tempfile::tempdir().unwrap().into_path();
let result = pdf_tool(
test_pdf_path.to_str().unwrap(),
"invalid_operation",
&cache_dir,
)
.await;
assert!(result.is_err(), "Should fail with invalid operation");
}
}