mirror of
https://github.com/aljazceru/goose.git
synced 2025-12-18 14:44:21 +01:00
feat: goose to read PDFs (#1522)
This commit is contained in:
107
Cargo.lock
generated
107
Cargo.lock
generated
@@ -17,6 +17,17 @@ version = "2.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627"
|
||||
|
||||
[[package]]
|
||||
name = "aes"
|
||||
version = "0.8.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b169f7a6d4742236a0a00c541b845991d0ac43e546831af1249753ab4c3aa3a0"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"cipher",
|
||||
"cpufeatures",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ahash"
|
||||
version = "0.8.11"
|
||||
@@ -896,6 +907,15 @@ dependencies = [
|
||||
"generic-array",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "block-padding"
|
||||
version = "0.3.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a8894febbff9f758034a5b8e12d87918f56dfc64a8e1fe757d65e29041538d93"
|
||||
dependencies = [
|
||||
"generic-array",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "brotli"
|
||||
version = "7.0.0"
|
||||
@@ -951,6 +971,12 @@ version = "3.17.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1628fb46dfa0b37568d12e5edd512553eccf6a22a78e8bde00bb4aed84d5bdbf"
|
||||
|
||||
[[package]]
|
||||
name = "bytecount"
|
||||
version = "0.6.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5ce89b21cab1437276d2650d57e971f9d548a2d9037cc231abdc0562b97498ce"
|
||||
|
||||
[[package]]
|
||||
name = "bytemuck"
|
||||
version = "1.22.0"
|
||||
@@ -997,6 +1023,15 @@ version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
|
||||
|
||||
[[package]]
|
||||
name = "cbc"
|
||||
version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "26b52a9543ae338f279b96b0b9fed9c8093744685043739079ce85cd58f289a6"
|
||||
dependencies = [
|
||||
"cipher",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cc"
|
||||
version = "1.2.16"
|
||||
@@ -1087,6 +1122,16 @@ dependencies = [
|
||||
"half",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cipher"
|
||||
version = "0.4.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "773f3b9af64447d2ce9850330c473515014aa235e6a783b02db81ff39e4a3dad"
|
||||
dependencies = [
|
||||
"crypto-common",
|
||||
"inout",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "clang-sys"
|
||||
version = "1.8.1"
|
||||
@@ -2205,6 +2250,7 @@ dependencies = [
|
||||
"indoc",
|
||||
"kill_tree",
|
||||
"lazy_static",
|
||||
"lopdf",
|
||||
"mcp-core",
|
||||
"mcp-server",
|
||||
"once_cell",
|
||||
@@ -2860,6 +2906,16 @@ version = "2.0.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f4c7245a08504955605670dbf141fceab975f15ca21570696aebe9d2e71576bd"
|
||||
|
||||
[[package]]
|
||||
name = "inout"
|
||||
version = "0.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "879f10e63c20629ecabbb64a8010319738c66a5cd0c29b02d63d272b03751d01"
|
||||
dependencies = [
|
||||
"block-padding",
|
||||
"generic-array",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "interpolate_name"
|
||||
version = "0.2.4"
|
||||
@@ -3171,6 +3227,30 @@ dependencies = [
|
||||
"imgref",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lopdf"
|
||||
version = "0.35.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5c7c1d3350d071cb86987a6bcb205c7019a0eb70dcad92b454fec722cca8d68b"
|
||||
dependencies = [
|
||||
"aes",
|
||||
"cbc",
|
||||
"chrono",
|
||||
"encoding_rs",
|
||||
"flate2",
|
||||
"indexmap 2.7.1",
|
||||
"itoa",
|
||||
"log",
|
||||
"md-5",
|
||||
"nom",
|
||||
"nom_locate",
|
||||
"rangemap",
|
||||
"rayon",
|
||||
"thiserror 2.0.11",
|
||||
"time",
|
||||
"weezl",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "macro_rules_attribute"
|
||||
version = "0.2.0"
|
||||
@@ -3303,6 +3383,16 @@ dependencies = [
|
||||
"tracing-subscriber",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "md-5"
|
||||
version = "0.10.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"digest",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "memchr"
|
||||
version = "2.7.4"
|
||||
@@ -3461,6 +3551,17 @@ dependencies = [
|
||||
"minimal-lexical",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "nom_locate"
|
||||
version = "4.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1e3c83c053b0713da60c5b8de47fe8e494fe3ece5267b2f23090a07a053ba8f3"
|
||||
dependencies = [
|
||||
"bytecount",
|
||||
"memchr",
|
||||
"nom",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "noop_proc_macro"
|
||||
version = "0.3.0"
|
||||
@@ -4152,6 +4253,12 @@ dependencies = [
|
||||
"getrandom 0.2.15",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rangemap"
|
||||
version = "1.5.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f60fcc7d6849342eff22c4350c8b9a989ee8ceabc4b481253e8946b9fe83d684"
|
||||
|
||||
[[package]]
|
||||
name = "rav1e"
|
||||
version = "0.7.1"
|
||||
|
||||
@@ -39,6 +39,7 @@ regex = "1.11.1"
|
||||
once_cell = "1.20.2"
|
||||
ignore = "0.4"
|
||||
temp-env = "0.3"
|
||||
lopdf = "0.35.0"
|
||||
|
||||
[dev-dependencies]
|
||||
serial_test = "3.0.0"
|
||||
|
||||
@@ -19,6 +19,8 @@ use mcp_core::{
|
||||
use mcp_server::router::CapabilitiesBuilder;
|
||||
use mcp_server::Router;
|
||||
|
||||
mod pdf_tool;
|
||||
|
||||
mod platform;
|
||||
use platform::{create_system_automation, SystemAutomation};
|
||||
|
||||
@@ -232,6 +234,33 @@ impl ComputerControllerRouter {
|
||||
}),
|
||||
);
|
||||
|
||||
let pdf_tool = Tool::new(
|
||||
"pdf_tool",
|
||||
indoc! {r#"
|
||||
Process PDF files to extract text and images.
|
||||
Supports operations:
|
||||
- extract_text: Extract all text content from the PDF
|
||||
- extract_images: Extract and save embedded images to PNG files
|
||||
|
||||
Use this when there is a .pdf file or files that need to be processed.
|
||||
"#},
|
||||
json!({
|
||||
"type": "object",
|
||||
"required": ["path", "operation"],
|
||||
"properties": {
|
||||
"path": {
|
||||
"type": "string",
|
||||
"description": "Path to the PDF file"
|
||||
},
|
||||
"operation": {
|
||||
"type": "string",
|
||||
"enum": ["extract_text", "extract_images"],
|
||||
"description": "Operation to perform on the PDF"
|
||||
}
|
||||
}
|
||||
}),
|
||||
);
|
||||
|
||||
// choose_app_strategy().cache_dir()
|
||||
// - macOS/Linux: ~/.cache/goose/computer_controller/
|
||||
// - Windows: ~\AppData\Local\Block\goose\cache\computer_controller\
|
||||
@@ -359,6 +388,7 @@ impl ComputerControllerRouter {
|
||||
quick_script_tool,
|
||||
computer_control_tool,
|
||||
cache_tool,
|
||||
pdf_tool,
|
||||
],
|
||||
cache_dir,
|
||||
active_resources: Arc::new(Mutex::new(HashMap::new())),
|
||||
@@ -653,6 +683,20 @@ impl ComputerControllerRouter {
|
||||
}
|
||||
|
||||
// Implement PDF tool functionality: validate the JSON arguments and delegate
// to the standalone `pdf_tool` module (text/image extraction from a PDF).
async fn pdf_tool(&self, params: Value) -> Result<Vec<Content>, ToolError> {
    // Required string argument: filesystem path of the PDF to process.
    let path = params
        .get("path")
        .and_then(|v| v.as_str())
        .ok_or_else(|| ToolError::InvalidParameters("Missing 'path' parameter".into()))?;

    // Required string argument: "extract_text" or "extract_images".
    // Unknown values are rejected by the delegate, not here.
    let operation = params
        .get("operation")
        .and_then(|v| v.as_str())
        .ok_or_else(|| ToolError::InvalidParameters("Missing 'operation' parameter".into()))?;

    // Extracted images (if any) are written under this router's cache_dir.
    crate::computercontroller::pdf_tool::pdf_tool(path, operation, &self.cache_dir).await
}
|
||||
|
||||
async fn cache(&self, params: Value) -> Result<Vec<Content>, ToolError> {
|
||||
let command = params
|
||||
.get("command")
|
||||
@@ -764,6 +808,7 @@ impl Router for ComputerControllerRouter {
|
||||
"automation_script" => this.quick_script(arguments).await,
|
||||
"computer_control" => this.computer_control(arguments).await,
|
||||
"cache" => this.cache(arguments).await,
|
||||
"pdf_tool" => this.pdf_tool(arguments).await,
|
||||
_ => Err(ToolError::NotFound(format!("Tool {} not found", tool_name))),
|
||||
}
|
||||
})
|
||||
|
||||
422
crates/goose-mcp/src/computercontroller/pdf_tool.rs
Normal file
422
crates/goose-mcp/src/computercontroller/pdf_tool.rs
Normal file
@@ -0,0 +1,422 @@
|
||||
use lopdf::{content::Content as PdfContent, Document, Object};
|
||||
use mcp_core::{Content, ToolError};
|
||||
use std::{fs, path::Path};
|
||||
|
||||
/// Process a PDF file with the given operation.
///
/// Supported operations:
/// - `"extract_text"`: walks each page's content stream and concatenates the
///   string operands of the `Tj`/`TJ` text-showing operators, prefixing each
///   page with a `Page N:` marker.
/// - `"extract_images"`: saves every image XObject found in the page
///   resources into `<cache_dir>/pdf_images/` and reports the saved paths.
///
/// Returns a single text `Content` describing the result. Errors with
/// `ToolError::ExecutionError` when the file cannot be opened or a required
/// PDF structure is malformed, and `ToolError::InvalidParameters` for an
/// unknown operation.
///
/// NOTE(review): text extraction decodes string operands with
/// `std::str::from_utf8`, which only recovers text for fonts whose strings
/// happen to be ASCII/UTF-8; PDFs using CID/custom font encodings will yield
/// little or no text — confirm this is acceptable for the intended use.
pub async fn pdf_tool(
    path: &str,
    operation: &str,
    cache_dir: &Path,
) -> Result<Vec<Content>, ToolError> {
    // Open and parse the PDF file
    let doc = Document::load(path)
        .map_err(|e| ToolError::ExecutionError(format!("Failed to open PDF file: {}", e)))?;

    let result = match operation {
        "extract_text" => {
            let mut text = String::new();

            // Iterate over each page in the document
            for (page_num, page_id) in doc.get_pages() {
                text.push_str(&format!("Page {}:\n", page_num));

                // Try to get text from page contents. Every level below is a
                // best-effort `if let`: any page whose structure doesn't match
                // is silently skipped (only its "Page N:" header remains).
                if let Ok(page_obj) = doc.get_object(page_id) {
                    if let Ok(page_dict) = page_obj.as_dict() {
                        // Try to get text from the Contents stream.
                        // Note: only a single indirect reference is handled;
                        // pages whose /Contents is an array of streams (or a
                        // direct stream object) fail `as_reference()` and are
                        // skipped.
                        if let Ok(contents) =
                            page_dict.get(b"Contents").and_then(|c| c.as_reference())
                        {
                            if let Ok(content_obj) = doc.get_object(contents) {
                                if let Ok(stream) = content_obj.as_stream() {
                                    if let Ok(content_data) = stream.get_plain_content() {
                                        if let Ok(content) = PdfContent::decode(&content_data) {
                                            // Process each operation in the content stream
                                            for operation in content.operations {
                                                match operation.operator.as_ref() {
                                                    // "Tj" operator: show text
                                                    "Tj" => {
                                                        for operand in operation.operands {
                                                            if let Object::String(ref bytes, _) =
                                                                operand
                                                            {
                                                                // Non-UTF-8 strings are dropped.
                                                                if let Ok(s) =
                                                                    std::str::from_utf8(bytes)
                                                                {
                                                                    text.push_str(s);
                                                                }
                                                            }
                                                        }
                                                        text.push(' ');
                                                    }
                                                    // "TJ" operator: show text with positioning
                                                    "TJ" => {
                                                        if let Some(Object::Array(ref arr)) =
                                                            operation.operands.first()
                                                        {
                                                            let mut last_was_text = false;
                                                            for element in arr {
                                                                match element {
                                                                    Object::String(
                                                                        ref bytes,
                                                                        _,
                                                                    ) => {
                                                                        if let Ok(s) =
                                                                            std::str::from_utf8(
                                                                                bytes,
                                                                            )
                                                                        {
                                                                            // Separate adjacent
                                                                            // string runs with a
                                                                            // space.
                                                                            if last_was_text {
                                                                                text.push(' ');
                                                                            }
                                                                            text.push_str(s);
                                                                            last_was_text = true;
                                                                        }
                                                                    }
                                                                    Object::Integer(offset) => {
                                                                        // Large negative offsets often indicate word spacing
                                                                        // (threshold -100 is a heuristic, in text-space
                                                                        // units — TODO confirm against typical font metrics)
                                                                        if *offset < -100 {
                                                                            text.push(' ');
                                                                            last_was_text = false;
                                                                        }
                                                                    }
                                                                    Object::Real(offset) => {
                                                                        if *offset < -100.0 {
                                                                            text.push(' ');
                                                                            last_was_text = false;
                                                                        }
                                                                    }
                                                                    _ => {}
                                                                }
                                                            }
                                                            text.push(' ');
                                                        }
                                                    }
                                                    _ => (), // Ignore other operators
                                                }
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
                text.push('\n');
            }

            if text.trim().is_empty() {
                "No text found in PDF".to_string()
            } else {
                format!("Extracted text from PDF:\n\n{}", text)
            }
        }

        "extract_images" => {
            // All extracted images go into a dedicated subdirectory of the
            // caller-supplied cache dir (shadowing the parameter on purpose).
            let cache_dir = cache_dir.join("pdf_images");
            fs::create_dir_all(&cache_dir).map_err(|e| {
                ToolError::ExecutionError(format!("Failed to create image cache directory: {}", e))
            })?;

            let mut images = Vec::new();
            let mut image_count = 0;

            // Helper function to determine file extension based on stream dict.
            // Maps the /Filter entry to a file extension; ".raw" when the
            // filter is unknown or absent. NOTE(review): the extension only
            // reflects the filter — the written bytes come from
            // `get_plain_content()` and may not be a valid file of that type
            // for every filter/color-space combination; verify per format.
            fn get_image_extension(dict: &lopdf::Dictionary) -> &'static str {
                if let Ok(filter) = dict.get(b"Filter") {
                    match filter {
                        Object::Name(name) => {
                            match name.as_slice() {
                                b"DCTDecode" => ".jpg",
                                b"JBIG2Decode" => ".jbig2",
                                b"JPXDecode" => ".jp2",
                                b"CCITTFaxDecode" => ".tiff",
                                b"FlateDecode" => {
                                    // PNG-like images often use FlateDecode
                                    // Check color space to confirm
                                    if let Ok(cs) = dict.get(b"ColorSpace") {
                                        if let Ok(name) = cs.as_name() {
                                            if name == b"DeviceRGB" || name == b"DeviceGray" {
                                                return ".png";
                                            }
                                        }
                                    }
                                    ".raw"
                                }
                                _ => ".raw",
                            }
                        }
                        Object::Array(filters) => {
                            // If multiple filters, check the last one
                            if let Some(Object::Name(name)) = filters.last() {
                                match name.as_slice() {
                                    b"DCTDecode" => return ".jpg",
                                    b"JPXDecode" => return ".jp2",
                                    _ => {}
                                }
                            }
                            ".raw"
                        }
                        _ => ".raw",
                    }
                } else {
                    ".raw"
                }
            }

            // Process each page
            for (page_num, page_id) in doc.get_pages() {
                let page = doc.get_object(page_id).map_err(|e| {
                    ToolError::ExecutionError(format!("Failed to get page {}: {}", page_num, e))
                })?;

                let page_dict = page.as_dict().map_err(|e| {
                    ToolError::ExecutionError(format!(
                        "Failed to get page dict {}: {}",
                        page_num, e
                    ))
                })?;

                // Get page resources - handle both direct dict and reference
                let resources = match page_dict.get(b"Resources") {
                    Ok(res) => match res {
                        Object::Dictionary(dict) => Ok(dict),
                        Object::Reference(id) => doc
                            .get_object(*id)
                            .map_err(|e| {
                                ToolError::ExecutionError(format!(
                                    "Failed to get resource reference: {}",
                                    e
                                ))
                            })
                            .and_then(|obj| {
                                obj.as_dict().map_err(|e| {
                                    ToolError::ExecutionError(format!(
                                        "Resource reference is not a dictionary: {}",
                                        e
                                    ))
                                })
                            }),
                        _ => Err(ToolError::ExecutionError(
                            "Resources is neither dictionary nor reference".to_string(),
                        )),
                    },
                    Err(e) => Err(ToolError::ExecutionError(format!(
                        "Failed to get Resources: {}",
                        e
                    ))),
                }?; // `?` here: a page without usable Resources aborts the whole operation

                // Look for XObject dictionary - handle both direct dict and reference
                let xobjects = match resources.get(b"XObject") {
                    Ok(xobj) => match xobj {
                        Object::Dictionary(dict) => Ok(dict),
                        Object::Reference(id) => doc
                            .get_object(*id)
                            .map_err(|e| {
                                ToolError::ExecutionError(format!(
                                    "Failed to get XObject reference: {}",
                                    e
                                ))
                            })
                            .and_then(|obj| {
                                obj.as_dict().map_err(|e| {
                                    ToolError::ExecutionError(format!(
                                        "XObject reference is not a dictionary: {}",
                                        e
                                    ))
                                })
                            }),
                        _ => Err(ToolError::ExecutionError(
                            "XObject is neither dictionary nor reference".to_string(),
                        )),
                    },
                    Err(e) => Err(ToolError::ExecutionError(format!(
                        "Failed to get XObject: {}",
                        e
                    ))),
                };

                // No `?` on xobjects: a page with no XObject dictionary simply
                // contributes no images instead of failing the operation.
                if let Ok(xobjects) = xobjects {
                    for (name, xobject) in xobjects.iter() {
                        let xobject_id = xobject.as_reference().map_err(|_| {
                            ToolError::ExecutionError("Failed to get XObject reference".to_string())
                        })?;

                        let xobject = doc.get_object(xobject_id).map_err(|e| {
                            ToolError::ExecutionError(format!("Failed to get XObject: {}", e))
                        })?;

                        if let Ok(stream) = xobject.as_stream() {
                            // Check if it's an image
                            if let Ok(subtype) =
                                stream.dict.get(b"Subtype").and_then(|s| s.as_name())
                            {
                                if subtype == b"Image" {
                                    let extension = get_image_extension(&stream.dict);

                                    // Get image metadata (0 when the entry is
                                    // missing or not an integer)
                                    let width = stream
                                        .dict
                                        .get(b"Width")
                                        .and_then(|w| w.as_i64())
                                        .unwrap_or(0);
                                    let height = stream
                                        .dict
                                        .get(b"Height")
                                        .and_then(|h| h.as_i64())
                                        .unwrap_or(0);
                                    let bpc = stream
                                        .dict
                                        .get(b"BitsPerComponent")
                                        .and_then(|b| b.as_i64())
                                        .unwrap_or(0);

                                    // Get the image data
                                    if let Ok(data) = stream.get_plain_content() {
                                        // File name encodes page, object id and
                                        // XObject name so repeated extractions
                                        // overwrite deterministically.
                                        let image_path = cache_dir.join(format!(
                                            "page{}_obj{}_{}{}",
                                            page_num,
                                            xobject_id.0,
                                            String::from_utf8_lossy(name),
                                            extension
                                        ));

                                        fs::write(&image_path, &data).map_err(|e| {
                                            ToolError::ExecutionError(format!(
                                                "Failed to write image: {}",
                                                e
                                            ))
                                        })?;

                                        images.push(format!(
                                            "Saved image to: {} ({}x{}, {} bits per component)",
                                            image_path.display(),
                                            width,
                                            height,
                                            bpc
                                        ));
                                        image_count += 1;
                                    }
                                }
                            }
                        }
                    }
                }
            }

            if images.is_empty() {
                "No images found in PDF".to_string()
            } else {
                format!("Found {} images:\n{}", image_count, images.join("\n"))
            }
        }

        _ => {
            return Err(ToolError::InvalidParameters(format!(
                "Invalid operation: {}. Valid operations are: 'extract_text', 'extract_images'",
                operation
            )))
        }
    };

    Ok(vec![Content::text(result)])
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    // NOTE: each test binds the `TempDir` guard to a local so the directory is
    // deleted when the test ends. The previous `tempfile::tempdir().unwrap()
    // .into_path()` pattern consumed the guard and disabled cleanup, leaking a
    // temp directory on every test run.

    /// Text extraction from the checked-in sample PDF should find the page
    /// marker and the known sentence.
    #[tokio::test]
    async fn test_pdf_text_extraction() {
        let test_pdf_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
            .join("src/computercontroller/tests/data/test.pdf");
        let temp_dir = tempfile::tempdir().unwrap();
        let cache_dir = temp_dir.path().to_path_buf();

        println!("Testing text extraction from: {}", test_pdf_path.display());

        let result = pdf_tool(test_pdf_path.to_str().unwrap(), "extract_text", &cache_dir).await;

        assert!(result.is_ok(), "PDF text extraction should succeed");
        let content = result.unwrap();
        assert!(!content.is_empty(), "Extracted text should not be empty");
        let text = content[0].as_text().unwrap();
        println!("Extracted text:\n{}", text);
        assert!(text.contains("Page 1"), "Should contain page marker");
        assert!(
            text.contains("This is a test PDF"),
            "Should contain expected test content"
        );
    }

    /// Image extraction should either save image files into the cache dir or
    /// explicitly report that none were found; any reported file must exist.
    #[tokio::test]
    async fn test_pdf_image_extraction() {
        let test_pdf_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
            .join("src/computercontroller/tests/data/test_image.pdf");
        let temp_dir = tempfile::tempdir().unwrap();
        let cache_dir = temp_dir.path().to_path_buf();

        println!("Testing image extraction from: {}", test_pdf_path.display());

        // Now try image extraction
        let result = pdf_tool(
            test_pdf_path.to_str().unwrap(),
            "extract_images",
            &cache_dir,
        )
        .await;

        println!("Image extraction result: {:?}", result);
        assert!(result.is_ok(), "PDF image extraction should succeed");
        let content = result.unwrap();
        assert!(
            !content.is_empty(),
            "Image extraction result should not be empty"
        );
        let text = content[0].as_text().unwrap();
        println!("Extracted content: {}", text);

        // Should either find images or explicitly state none were found
        assert!(
            text.contains("Saved image to:") || text.contains("No images found"),
            "Should either save images or report none found"
        );

        // If we found images, verify they exist (must run before `temp_dir`
        // is dropped, which removes the directory)
        if text.contains("Saved image to:") {
            // Extract the file path from the output line, e.g.
            // "Saved image to: /tmp/.../page1_obj4_Im0.jpg (640x480, 8 bits per component)"
            let file_path = text
                .lines()
                .find(|line| line.contains("Saved image to:"))
                .and_then(|line| line.split(": ").nth(1))
                .and_then(|path| path.split(" (").next())
                .expect("Should have a valid file path");

            println!("Verifying image file exists: {}", file_path);
            assert!(PathBuf::from(file_path).exists(), "Image file should exist");
        }
    }

    /// A nonexistent file must surface as an error, not a panic.
    #[tokio::test]
    async fn test_pdf_invalid_path() {
        let temp_dir = tempfile::tempdir().unwrap();
        let cache_dir = temp_dir.path().to_path_buf();
        let result = pdf_tool("nonexistent.pdf", "extract_text", &cache_dir).await;

        assert!(result.is_err(), "Should fail with invalid path");
    }

    /// An unknown operation name must be rejected with an error.
    #[tokio::test]
    async fn test_pdf_invalid_operation() {
        let test_pdf_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
            .join("src/computercontroller/tests/data/test.pdf");
        let temp_dir = tempfile::tempdir().unwrap();
        let cache_dir = temp_dir.path().to_path_buf();

        let result = pdf_tool(
            test_pdf_path.to_str().unwrap(),
            "invalid_operation",
            &cache_dir,
        )
        .await;

        assert!(result.is_err(), "Should fail with invalid operation");
    }
}
|
||||
BIN
crates/goose-mcp/src/computercontroller/tests/data/test.pdf
Normal file
BIN
crates/goose-mcp/src/computercontroller/tests/data/test.pdf
Normal file
Binary file not shown.
Binary file not shown.
@@ -7,7 +7,7 @@ pub static APP_STRATEGY: Lazy<AppStrategyArgs> = Lazy::new(|| AppStrategyArgs {
|
||||
app_name: "goose".to_string(),
|
||||
});
|
||||
|
||||
mod computercontroller;
|
||||
pub mod computercontroller;
|
||||
mod developer;
|
||||
mod google_drive;
|
||||
mod jetbrains;
|
||||
|
||||
Reference in New Issue
Block a user