mirror of
https://github.com/aljazceru/goose.git
synced 2025-12-18 14:44:21 +01:00
feat: handling larger more complex PDF docs (and fix) (#1663)
This commit is contained in:
@@ -42,6 +42,7 @@ ignore = "0.4"
|
||||
lopdf = "0.35.0"
|
||||
docx-rs = "0.4.7"
|
||||
image = "0.24.9"
|
||||
extractous = "0.3.0"
|
||||
umya-spreadsheet = "2.2.3"
|
||||
keyring = { version = "3.6.1", features = ["apple-native", "windows-native", "sync-secret-service"] }
|
||||
|
||||
|
||||
269
crates/goose-mcp/src/computercontroller/document_tool.rs
Normal file
269
crates/goose-mcp/src/computercontroller/document_tool.rs
Normal file
@@ -0,0 +1,269 @@
|
||||
use extractous::Extractor;
|
||||
use mcp_core::{Content, ToolError};
|
||||
use std::{
|
||||
fs,
|
||||
io::Read,
|
||||
path::{Path, PathBuf},
|
||||
};
|
||||
|
||||
// Threshold for large text files (0.22MB - about 1/18 of the 4,194,304 bytes limit)
|
||||
const LARGE_TEXT_THRESHOLD: usize = (2 * 1024 * 1024) / 9; // ~0.22MB in bytes
|
||||
|
||||
pub async fn document_tool(
|
||||
path: &str,
|
||||
operation: &str,
|
||||
cache_dir: &Path,
|
||||
) -> Result<Vec<Content>, ToolError> {
|
||||
match operation {
|
||||
"get_text" => {
|
||||
// Extract text from a local file (PDF, DOCX, XLSX, etc.)
|
||||
extract_text_from_file(path, cache_dir)
|
||||
}
|
||||
"get_text_url" => {
|
||||
// Extract text from a URL
|
||||
extract_text_from_url(path, cache_dir)
|
||||
}
|
||||
_ => Err(ToolError::InvalidParameters(format!(
|
||||
"Invalid operation: {}. Valid operations are: 'get_text', 'get_text_url'",
|
||||
operation
|
||||
))),
|
||||
}
|
||||
}
|
||||
|
||||
fn extract_text_from_file(path: &str, cache_dir: &Path) -> Result<Vec<Content>, ToolError> {
|
||||
// Use extractous library for text extraction
|
||||
let extractor = Extractor::new();
|
||||
|
||||
// Extract text from the file
|
||||
let (text, metadata) = extractor.extract_file_to_string(path).map_err(|e| {
|
||||
ToolError::ExecutionError(format!("Failed to extract text from file: {}", e))
|
||||
})?;
|
||||
|
||||
process_extracted_text(text, metadata, path, cache_dir)
|
||||
}
|
||||
|
||||
fn extract_text_from_url(url: &str, cache_dir: &Path) -> Result<Vec<Content>, ToolError> {
|
||||
// Validate that the input is actually a URL
|
||||
if !url.starts_with("http://") && !url.starts_with("https://") {
|
||||
return Err(ToolError::InvalidParameters(format!(
|
||||
"Invalid URL: {}. URL must start with http:// or https://",
|
||||
url
|
||||
)));
|
||||
}
|
||||
|
||||
// Use extractous library for text extraction
|
||||
let extractor = Extractor::new();
|
||||
|
||||
// Handle URL extraction
|
||||
let (mut stream_reader, metadata) = extractor.extract_url(url).map_err(|e| {
|
||||
ToolError::ExecutionError(format!("Failed to extract text from URL: {}", e))
|
||||
})?;
|
||||
|
||||
// Convert StreamReader to String
|
||||
let mut text = String::new();
|
||||
stream_reader
|
||||
.read_to_string(&mut text)
|
||||
.map_err(|e| ToolError::ExecutionError(format!("Failed to read text from URL: {}", e)))?;
|
||||
|
||||
process_extracted_text(text, metadata, url, cache_dir)
|
||||
}
|
||||
|
||||
fn process_extracted_text(
|
||||
text: String,
|
||||
metadata: std::collections::HashMap<String, Vec<String>>,
|
||||
source_path: &str,
|
||||
cache_dir: &Path,
|
||||
) -> Result<Vec<Content>, ToolError> {
|
||||
// Check if the extracted text is large
|
||||
let text_size = text.len();
|
||||
if text_size > LARGE_TEXT_THRESHOLD {
|
||||
// Create a directory for large text files if it doesn't exist
|
||||
let large_text_dir = cache_dir.join("large_document_texts");
|
||||
fs::create_dir_all(&large_text_dir).map_err(|e| {
|
||||
ToolError::ExecutionError(format!("Failed to create directory for large text: {}", e))
|
||||
})?;
|
||||
|
||||
// Create a filename based on the original document name
|
||||
let doc_path = PathBuf::from(source_path);
|
||||
let doc_filename = doc_path
|
||||
.file_name()
|
||||
.and_then(|name| name.to_str())
|
||||
.unwrap_or("unnamed_document");
|
||||
|
||||
let text_file_path = large_text_dir.join(format!("{}.txt", doc_filename));
|
||||
|
||||
// Write the text to a file
|
||||
fs::write(&text_file_path, &text).map_err(|e| {
|
||||
ToolError::ExecutionError(format!("Failed to write large text to file: {}", e))
|
||||
})?;
|
||||
|
||||
// Format size in human-readable form
|
||||
let size_str = if text_size < 1024 * 1024 {
|
||||
format!("{:.2} KB", text_size as f64 / 1024.0)
|
||||
} else {
|
||||
format!("{:.2} MB", text_size as f64 / (1024.0 * 1024.0))
|
||||
};
|
||||
|
||||
Ok(vec![Content::text(format!(
|
||||
"Large text extracted from document ({})\n\n\
|
||||
The extracted text is too large to display directly.\n\
|
||||
Text has been written to: {}\n\n\
|
||||
You can search through this file using ripgrep:\n\
|
||||
rg 'search term' {}\n\n\
|
||||
Or view portions of it:\n\
|
||||
head -n 50 {}\n\
|
||||
tail -n 50 {}\n\
|
||||
less {}",
|
||||
size_str,
|
||||
text_file_path.display(),
|
||||
text_file_path.display(),
|
||||
text_file_path.display(),
|
||||
text_file_path.display(),
|
||||
text_file_path.display()
|
||||
))])
|
||||
} else {
|
||||
// Include metadata information in the output
|
||||
let metadata_info = if metadata.is_empty() {
|
||||
"Document Metadata: None\n\n".to_string()
|
||||
} else {
|
||||
let mut formatted_metadata = String::from("Document Metadata:\n");
|
||||
|
||||
// Format each metadata entry
|
||||
for (key, values) in &metadata {
|
||||
formatted_metadata.push_str(&format!(" {}: ", key));
|
||||
|
||||
// Single value case
|
||||
if values.len() == 1 {
|
||||
formatted_metadata.push_str(&format!("{}\n", values[0]));
|
||||
continue;
|
||||
}
|
||||
|
||||
// Multiple values case
|
||||
formatted_metadata.push_str("[\n");
|
||||
for value in values {
|
||||
formatted_metadata.push_str(&format!(" {}\n", value));
|
||||
}
|
||||
formatted_metadata.push_str(" ]\n");
|
||||
}
|
||||
|
||||
formatted_metadata.push('\n');
|
||||
formatted_metadata
|
||||
};
|
||||
|
||||
Ok(vec![Content::text(format!(
|
||||
"{}Extracted text from document:\n\n{}",
|
||||
metadata_info, text
|
||||
))])
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use std::path::PathBuf;
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_docx_text_extraction() {
|
||||
let test_docx_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("src/computercontroller/tests/data/sample.docx");
|
||||
let cache_dir = tempfile::tempdir().unwrap().into_path();
|
||||
|
||||
println!(
|
||||
"Testing text extraction from DOCX: {}",
|
||||
test_docx_path.display()
|
||||
);
|
||||
|
||||
let result = document_tool(test_docx_path.to_str().unwrap(), "get_text", &cache_dir).await;
|
||||
|
||||
assert!(result.is_ok(), "DOCX text extraction should succeed");
|
||||
let content = result.unwrap();
|
||||
assert!(!content.is_empty(), "Extracted text should not be empty");
|
||||
let text = content[0].as_text().unwrap();
|
||||
println!("Extracted text:\n{}", text);
|
||||
assert!(
|
||||
text.contains("Document Metadata") || !text.is_empty(),
|
||||
"Should contain metadata or at least some text content"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_url_text_extraction() {
|
||||
// Skip this test if we're not online
|
||||
// This is a simple test URL that should be stable
|
||||
let test_url = "https://example.com";
|
||||
let cache_dir = tempfile::tempdir().unwrap().into_path();
|
||||
|
||||
println!("Testing text extraction from URL: {}", test_url);
|
||||
|
||||
let result = document_tool(test_url, "get_text_url", &cache_dir).await;
|
||||
|
||||
// If the test fails due to network issues, just skip it
|
||||
if let Err(err) = &result {
|
||||
if err.to_string().contains("network") || err.to_string().contains("connection") {
|
||||
println!("Skipping URL extraction test due to network issues");
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
assert!(result.is_ok(), "URL text extraction should succeed");
|
||||
let content = result.unwrap();
|
||||
assert!(!content.is_empty(), "Extracted text should not be empty");
|
||||
let text = content[0].as_text().unwrap();
|
||||
println!("Extracted text from URL:\n{}", text);
|
||||
assert!(
|
||||
text.contains("Example Domain"),
|
||||
"Should contain expected content from example.com"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_document_invalid_path() {
|
||||
let cache_dir = tempfile::tempdir().unwrap().into_path();
|
||||
let result = document_tool("nonexistent.pdf", "get_text", &cache_dir).await;
|
||||
|
||||
assert!(result.is_err(), "Should fail with invalid path");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_document_invalid_operation() {
|
||||
let test_pdf_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("src/computercontroller/tests/data/test.pdf");
|
||||
let cache_dir = tempfile::tempdir().unwrap().into_path();
|
||||
|
||||
let result = document_tool(
|
||||
test_pdf_path.to_str().unwrap(),
|
||||
"invalid_operation",
|
||||
&cache_dir,
|
||||
)
|
||||
.await;
|
||||
|
||||
assert!(result.is_err(), "Should fail with invalid operation");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_url_with_get_text() {
|
||||
let test_url = "https://example.com";
|
||||
let cache_dir = tempfile::tempdir().unwrap().into_path();
|
||||
|
||||
let result = document_tool(test_url, "get_text", &cache_dir).await;
|
||||
|
||||
// This should fail since URLs should use get_text_url
|
||||
assert!(result.is_err(), "Using get_text with URL should fail");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_file_with_get_text_url() {
|
||||
let test_docx_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("src/computercontroller/tests/data/sample.docx");
|
||||
let cache_dir = tempfile::tempdir().unwrap().into_path();
|
||||
|
||||
let result =
|
||||
document_tool(test_docx_path.to_str().unwrap(), "get_text_url", &cache_dir).await;
|
||||
|
||||
// This should fail since local files should use get_text
|
||||
assert!(
|
||||
result.is_err(),
|
||||
"Using get_text_url with local file should fail"
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -19,6 +19,7 @@ use mcp_core::{
|
||||
use mcp_server::router::CapabilitiesBuilder;
|
||||
use mcp_server::Router;
|
||||
|
||||
mod document_tool;
|
||||
mod docx_tool;
|
||||
mod pdf_tool;
|
||||
mod presentation_tool;
|
||||
@@ -67,10 +68,10 @@ impl ComputerControllerRouter {
|
||||
}),
|
||||
);
|
||||
|
||||
let web_scrape_tool = Tool::new(
|
||||
"web_scrape",
|
||||
let web_fetch_tool = Tool::new(
|
||||
"web_fetch",
|
||||
indoc! {r#"
|
||||
Fetch and save content from a web page. The content can be saved as:
|
||||
Fetch and save content from a web page using http(s). The content can be saved as:
|
||||
- text (for HTML pages)
|
||||
- json (for API responses)
|
||||
- binary (for images and other files)
|
||||
@@ -122,6 +123,7 @@ impl ComputerControllerRouter {
|
||||
- File Operations: Organize files/folders
|
||||
- Integration: Calendar, reminders, messages
|
||||
- Data: Interact with spreadsheets and documents
|
||||
- Text: extract content from many file formats
|
||||
|
||||
Can be combined with screenshot tool for visual task assistance.
|
||||
"#},
|
||||
@@ -242,10 +244,10 @@ impl ComputerControllerRouter {
|
||||
indoc! {r#"
|
||||
Process PDF files to extract text and images.
|
||||
Supports operations:
|
||||
- extract_text: Extract all text content from the PDF
|
||||
- extract_text: Extract all text content from the PDF (file or url to file)
|
||||
- extract_images: Extract and save embedded images to PNG files
|
||||
|
||||
Use this when there is a .pdf file or files that need to be processed.
|
||||
Use this when there is a .pdf file or files that need to be processed.
|
||||
"#},
|
||||
json!({
|
||||
"type": "object",
|
||||
@@ -253,7 +255,7 @@ impl ComputerControllerRouter {
|
||||
"properties": {
|
||||
"path": {
|
||||
"type": "string",
|
||||
"description": "Path to the PDF file"
|
||||
"description": "Path to the PDF file or URL to pdf"
|
||||
},
|
||||
"operation": {
|
||||
"type": "string",
|
||||
@@ -264,6 +266,74 @@ impl ComputerControllerRouter {
|
||||
}),
|
||||
);
|
||||
|
||||
// Check if Tesseract OCR is installed
|
||||
let has_tesseract = match std::env::consts::OS {
|
||||
"macos" | "linux" => {
|
||||
let output = std::process::Command::new("which")
|
||||
.arg("tesseract")
|
||||
.output()
|
||||
.map(|output| output.status.success())
|
||||
.unwrap_or(false);
|
||||
output
|
||||
}
|
||||
"windows" => {
|
||||
let output = std::process::Command::new("where")
|
||||
.arg("tesseract")
|
||||
.output()
|
||||
.map(|output| output.status.success())
|
||||
.unwrap_or(false);
|
||||
output
|
||||
}
|
||||
_ => false,
|
||||
};
|
||||
|
||||
// Conditionally include OCR information in the description
|
||||
let image_formats_desc = if has_tesseract {
|
||||
"This will also extract any embedded text via OCR for the following: png, jpeg, tiff, bmp, gif, ico, psd, svg and pdf (use this if there are embedded images in PDF)"
|
||||
} else {
|
||||
"metadata only: png, jpeg, tiff, bmp, gif, ico, psd, svg (metadata only, OCR not available as tesseract not installed)"
|
||||
};
|
||||
|
||||
let document_tool = Tool::new(
|
||||
"document_tool",
|
||||
formatdoc! {r#"
|
||||
Extract plain text from various file formats. Use this when you see a file extension of the following,
|
||||
OR a url to treat as a document to get text from.
|
||||
Formats:
|
||||
doc, docx, ppt, pptx, xls, xlsx, rtf, odt, ods, odp
|
||||
(consider using docx and xlsx tools for those first)
|
||||
csv, tsv
|
||||
(when not handled by other tools)
|
||||
html, xml,epub, txt
|
||||
|
||||
{image_formats_desc}
|
||||
E-Mail: eml, msg, mbox, pst (extracts content, headers, attachments)
|
||||
|
||||
Supports operations:
|
||||
- get_text: Extract all text content from local document files
|
||||
- get_text_url: Extract all text content from a document at a URL
|
||||
|
||||
Use this for general text extraction from misc document types.
|
||||
"#,
|
||||
image_formats_desc = image_formats_desc
|
||||
},
|
||||
json!({
|
||||
"type": "object",
|
||||
"required": ["path", "operation"],
|
||||
"properties": {
|
||||
"path": {
|
||||
"type": "string",
|
||||
"description": "Path to the document file or URL to load content from"
|
||||
},
|
||||
"operation": {
|
||||
"type": "string",
|
||||
"enum": ["get_text", "get_text_url"],
|
||||
"description": "Operation to perform on the document"
|
||||
}
|
||||
}
|
||||
}),
|
||||
);
|
||||
|
||||
let docx_tool = Tool::new(
|
||||
"docx_tool",
|
||||
indoc! {r#"
|
||||
@@ -564,14 +634,13 @@ impl ComputerControllerRouter {
|
||||
|
||||
{os_instructions}
|
||||
|
||||
web_search
|
||||
- Search the web using DuckDuckGo's API for general topics or keywords
|
||||
web_scrape
|
||||
- Fetch content from html websites and APIs
|
||||
- Save as text, JSON, or binary files
|
||||
- Content is cached locally for later use
|
||||
- This is not optimised for complex websites, so don't use this as the first tool.
|
||||
cache
|
||||
This extension has many tools to automate, for example:
|
||||
|
||||
web_search, web_fetch, quick_script, computer_control for automation,
|
||||
pdf_tool (pdfs text),
|
||||
document_tool (many doc types and URLs), docx_tool, xlsx_tool, make_presentation
|
||||
|
||||
cache of content:
|
||||
- Manage your cached files
|
||||
- List, view, delete files
|
||||
- Clear all cached data
|
||||
@@ -586,11 +655,12 @@ impl ComputerControllerRouter {
|
||||
Self {
|
||||
tools: vec![
|
||||
web_search_tool,
|
||||
web_scrape_tool,
|
||||
web_fetch_tool,
|
||||
quick_script_tool,
|
||||
computer_control_tool,
|
||||
cache_tool,
|
||||
pdf_tool,
|
||||
document_tool,
|
||||
docx_tool,
|
||||
xlsx_tool,
|
||||
make_presentation_tool,
|
||||
@@ -685,7 +755,7 @@ impl ComputerControllerRouter {
|
||||
))])
|
||||
}
|
||||
|
||||
async fn web_scrape(&self, params: Value) -> Result<Vec<Content>, ToolError> {
|
||||
async fn web_fetch(&self, params: Value) -> Result<Vec<Content>, ToolError> {
|
||||
let url = params
|
||||
.get("url")
|
||||
.and_then(|v| v.as_str())
|
||||
@@ -1082,6 +1152,21 @@ impl ComputerControllerRouter {
|
||||
crate::computercontroller::pdf_tool::pdf_tool(path, operation, &self.cache_dir).await
|
||||
}
|
||||
|
||||
async fn document_tool(&self, params: Value) -> Result<Vec<Content>, ToolError> {
|
||||
let path = params
|
||||
.get("path")
|
||||
.and_then(|v| v.as_str())
|
||||
.ok_or_else(|| ToolError::InvalidParameters("Missing 'path' parameter".into()))?;
|
||||
|
||||
let operation = params
|
||||
.get("operation")
|
||||
.and_then(|v| v.as_str())
|
||||
.ok_or_else(|| ToolError::InvalidParameters("Missing 'operation' parameter".into()))?;
|
||||
|
||||
crate::computercontroller::document_tool::document_tool(path, operation, &self.cache_dir)
|
||||
.await
|
||||
}
|
||||
|
||||
async fn cache(&self, params: Value) -> Result<Vec<Content>, ToolError> {
|
||||
let command = params
|
||||
.get("command")
|
||||
@@ -1189,11 +1274,12 @@ impl Router for ComputerControllerRouter {
|
||||
Box::pin(async move {
|
||||
match tool_name.as_str() {
|
||||
"web_search" => this.web_search(arguments).await,
|
||||
"web_scrape" => this.web_scrape(arguments).await,
|
||||
"web_fetch" => this.web_fetch(arguments).await,
|
||||
"automation_script" => this.quick_script(arguments).await,
|
||||
"computer_control" => this.computer_control(arguments).await,
|
||||
"cache" => this.cache(arguments).await,
|
||||
"pdf_tool" => this.pdf_tool(arguments).await,
|
||||
"document_tool" => this.document_tool(arguments).await,
|
||||
"docx_tool" => this.docx_tool(arguments).await,
|
||||
"xlsx_tool" => this.xlsx_tool(arguments).await,
|
||||
"make_presentation" => {
|
||||
|
||||
@@ -1,117 +1,124 @@
|
||||
use lopdf::{content::Content as PdfContent, Document, Object};
|
||||
use extractous::Extractor;
|
||||
use lopdf::{Document, Object};
|
||||
use mcp_core::{Content, ToolError};
|
||||
use std::{fs, path::Path};
|
||||
use std::{
|
||||
fs,
|
||||
io::Read,
|
||||
path::{Path, PathBuf},
|
||||
};
|
||||
|
||||
// Threshold for large text files (0.22MB - about 1/18 of the 4,194,304 bytes limit)
|
||||
const LARGE_TEXT_THRESHOLD: usize = (2 * 1024 * 1024) / 9; // ~0.22MB in bytes
|
||||
|
||||
pub async fn pdf_tool(
|
||||
path: &str,
|
||||
operation: &str,
|
||||
cache_dir: &Path,
|
||||
) -> Result<Vec<Content>, ToolError> {
|
||||
// Open and parse the PDF file
|
||||
let doc = Document::load(path)
|
||||
.map_err(|e| ToolError::ExecutionError(format!("Failed to open PDF file: {}", e)))?;
|
||||
|
||||
let result = match operation {
|
||||
match operation {
|
||||
"extract_text" => {
|
||||
let mut text = String::new();
|
||||
// Use extractous library for text extraction
|
||||
let extractor = Extractor::new();
|
||||
|
||||
// Iterate over each page in the document
|
||||
for (page_num, page_id) in doc.get_pages() {
|
||||
text.push_str(&format!("Page {}:\n", page_num));
|
||||
// Check if the path is a URL or a file
|
||||
let (text, metadata) = if path.starts_with("http://") || path.starts_with("https://") {
|
||||
// Handle URL extraction
|
||||
let (mut stream_reader, metadata) = extractor.extract_url(path).map_err(|e| {
|
||||
ToolError::ExecutionError(format!("Failed to extract text from URL: {}", e))
|
||||
})?;
|
||||
|
||||
// Try to get text from page contents
|
||||
if let Ok(page_obj) = doc.get_object(page_id) {
|
||||
if let Ok(page_dict) = page_obj.as_dict() {
|
||||
// Try to get text from Contents stream
|
||||
if let Ok(contents) =
|
||||
page_dict.get(b"Contents").and_then(|c| c.as_reference())
|
||||
{
|
||||
if let Ok(content_obj) = doc.get_object(contents) {
|
||||
if let Ok(stream) = content_obj.as_stream() {
|
||||
if let Ok(content_data) = stream.get_plain_content() {
|
||||
if let Ok(content) = PdfContent::decode(&content_data) {
|
||||
// Process each operation in the content stream
|
||||
for operation in content.operations {
|
||||
match operation.operator.as_ref() {
|
||||
// "Tj" operator: show text
|
||||
"Tj" => {
|
||||
for operand in operation.operands {
|
||||
if let Object::String(ref bytes, _) =
|
||||
operand
|
||||
{
|
||||
if let Ok(s) =
|
||||
std::str::from_utf8(bytes)
|
||||
{
|
||||
text.push_str(s);
|
||||
}
|
||||
}
|
||||
}
|
||||
text.push(' ');
|
||||
}
|
||||
// "TJ" operator: show text with positioning
|
||||
"TJ" => {
|
||||
if let Some(Object::Array(ref arr)) =
|
||||
operation.operands.first()
|
||||
{
|
||||
let mut last_was_text = false;
|
||||
for element in arr {
|
||||
match element {
|
||||
Object::String(
|
||||
ref bytes,
|
||||
_,
|
||||
) => {
|
||||
if let Ok(s) =
|
||||
std::str::from_utf8(
|
||||
bytes,
|
||||
)
|
||||
{
|
||||
if last_was_text {
|
||||
text.push(' ');
|
||||
}
|
||||
text.push_str(s);
|
||||
last_was_text = true;
|
||||
}
|
||||
}
|
||||
Object::Integer(offset) => {
|
||||
// Large negative offsets often indicate word spacing
|
||||
if *offset < -100 {
|
||||
text.push(' ');
|
||||
last_was_text = false;
|
||||
}
|
||||
}
|
||||
Object::Real(offset) => {
|
||||
if *offset < -100.0 {
|
||||
text.push(' ');
|
||||
last_was_text = false;
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
text.push(' ');
|
||||
}
|
||||
}
|
||||
_ => (), // Ignore other operators
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
text.push('\n');
|
||||
}
|
||||
// Convert StreamReader to String - assuming it has a read_to_string method
|
||||
let mut text = String::new();
|
||||
stream_reader.read_to_string(&mut text).map_err(|e| {
|
||||
ToolError::ExecutionError(format!("Failed to read text from URL: {}", e))
|
||||
})?;
|
||||
|
||||
if text.trim().is_empty() {
|
||||
"No text found in PDF".to_string()
|
||||
(text, metadata)
|
||||
} else {
|
||||
format!("Extracted text from PDF:\n\n{}", text)
|
||||
// Extract text from the file (PDF or other)
|
||||
extractor.extract_file_to_string(path).map_err(|e| {
|
||||
ToolError::ExecutionError(format!("Failed to extract text from file: {}", e))
|
||||
})?
|
||||
};
|
||||
|
||||
// Check if the extracted text is large
|
||||
let text_size = text.len();
|
||||
if text_size > LARGE_TEXT_THRESHOLD {
|
||||
// Create a directory for large text files if it doesn't exist
|
||||
let large_text_dir = cache_dir.join("large_pdf_texts");
|
||||
fs::create_dir_all(&large_text_dir).map_err(|e| {
|
||||
ToolError::ExecutionError(format!(
|
||||
"Failed to create directory for large text: {}",
|
||||
e
|
||||
))
|
||||
})?;
|
||||
|
||||
// Create a filename based on the original PDF name
|
||||
let pdf_path = PathBuf::from(path);
|
||||
let pdf_filename = pdf_path
|
||||
.file_name()
|
||||
.and_then(|name| name.to_str())
|
||||
.unwrap_or("unnamed_pdf");
|
||||
|
||||
let text_file_path = large_text_dir.join(format!("{}.txt", pdf_filename));
|
||||
|
||||
// Write the text to a file
|
||||
fs::write(&text_file_path, &text).map_err(|e| {
|
||||
ToolError::ExecutionError(format!("Failed to write large text to file: {}", e))
|
||||
})?;
|
||||
|
||||
// Format size in human-readable form
|
||||
let size_str = if text_size < 1024 * 1024 {
|
||||
format!("{:.2} KB", text_size as f64 / 1024.0)
|
||||
} else {
|
||||
format!("{:.2} MB", text_size as f64 / (1024.0 * 1024.0))
|
||||
};
|
||||
|
||||
Ok(vec![Content::text(format!(
|
||||
"Large text extracted from PDF ({})\n\n\
|
||||
The extracted text is too large to display directly.\n\
|
||||
Text has been written to: {}\n\n\
|
||||
You can search through this file using ripgrep:\n\
|
||||
rg 'search term' {}\n\n\
|
||||
Or view portions of it:\n\
|
||||
head -n 50 {}\n\
|
||||
tail -n 50 {}\n\
|
||||
less {}",
|
||||
size_str,
|
||||
text_file_path.display(),
|
||||
text_file_path.display(),
|
||||
text_file_path.display(),
|
||||
text_file_path.display(),
|
||||
text_file_path.display()
|
||||
))])
|
||||
} else {
|
||||
// Include metadata information in the output
|
||||
let metadata_info = format!(
|
||||
"PDF Metadata:\n{}\n\n",
|
||||
serde_json::to_string_pretty(&metadata)
|
||||
.unwrap_or_else(|_| "Unable to format metadata".to_string())
|
||||
);
|
||||
|
||||
Ok(vec![Content::text(format!(
|
||||
"{}Extracted text from PDF:\n\n{}",
|
||||
metadata_info, text
|
||||
))])
|
||||
}
|
||||
}
|
||||
|
||||
"extract_images" => {
|
||||
// Check if the path is a URL (not supported for image extraction)
|
||||
if path.starts_with("http://") || path.starts_with("https://") {
|
||||
return Err(ToolError::InvalidParameters(
|
||||
"Image extraction is not supported for URLs. Please provide a local PDF file path.".to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
// Open and parse the PDF file for image extraction
|
||||
let doc = Document::load(path).map_err(|e| {
|
||||
ToolError::ExecutionError(format!("Failed to open PDF file: {}", e))
|
||||
})?;
|
||||
|
||||
let cache_dir = cache_dir.join("pdf_images");
|
||||
fs::create_dir_all(&cache_dir).map_err(|e| {
|
||||
ToolError::ExecutionError(format!("Failed to create image cache directory: {}", e))
|
||||
@@ -305,21 +312,21 @@ pub async fn pdf_tool(
|
||||
}
|
||||
|
||||
if images.is_empty() {
|
||||
"No images found in PDF".to_string()
|
||||
Ok(vec![Content::text("No images found in PDF".to_string())])
|
||||
} else {
|
||||
format!("Found {} images:\n{}", image_count, images.join("\n"))
|
||||
Ok(vec![Content::text(format!(
|
||||
"Found {} images:\n{}",
|
||||
image_count,
|
||||
images.join("\n")
|
||||
))])
|
||||
}
|
||||
}
|
||||
|
||||
_ => {
|
||||
return Err(ToolError::InvalidParameters(format!(
|
||||
"Invalid operation: {}. Valid operations are: 'extract_text', 'extract_images'",
|
||||
operation
|
||||
)))
|
||||
}
|
||||
};
|
||||
|
||||
Ok(vec![Content::text(result)])
|
||||
_ => Err(ToolError::InvalidParameters(format!(
|
||||
"Invalid operation: {}. Valid operations are: 'extract_text', 'extract_images'",
|
||||
operation
|
||||
))),
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -342,10 +349,39 @@ mod tests {
|
||||
assert!(!content.is_empty(), "Extracted text should not be empty");
|
||||
let text = content[0].as_text().unwrap();
|
||||
println!("Extracted text:\n{}", text);
|
||||
assert!(text.contains("Page 1"), "Should contain page marker");
|
||||
assert!(
|
||||
text.contains("This is a test PDF"),
|
||||
"Should contain expected test content"
|
||||
text.contains("This is a test PDF") || text.contains("PDF Metadata"),
|
||||
"Should contain expected test content or metadata"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_url_text_extraction() {
|
||||
// Skip this test if we're not online
|
||||
// This is a simple test URL that should be stable
|
||||
let test_url = "https://example.com";
|
||||
let cache_dir = tempfile::tempdir().unwrap().into_path();
|
||||
|
||||
println!("Testing text extraction from URL: {}", test_url);
|
||||
|
||||
let result = pdf_tool(test_url, "extract_text", &cache_dir).await;
|
||||
|
||||
// If the test fails due to network issues, just skip it
|
||||
if let Err(err) = &result {
|
||||
if err.to_string().contains("network") || err.to_string().contains("connection") {
|
||||
println!("Skipping URL extraction test due to network issues");
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
assert!(result.is_ok(), "URL text extraction should succeed");
|
||||
let content = result.unwrap();
|
||||
assert!(!content.is_empty(), "Extracted text should not be empty");
|
||||
let text = content[0].as_text().unwrap();
|
||||
println!("Extracted text from URL:\n{}", text);
|
||||
assert!(
|
||||
text.contains("Example Domain"),
|
||||
"Should contain expected content from example.com"
|
||||
);
|
||||
}
|
||||
|
||||
@@ -396,6 +432,29 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_url_image_extraction_fails() {
|
||||
// Test that image extraction from URLs is properly rejected
|
||||
let test_url = "https://example.com";
|
||||
let cache_dir = tempfile::tempdir().unwrap().into_path();
|
||||
|
||||
println!(
|
||||
"Testing image extraction from URL (should fail): {}",
|
||||
test_url
|
||||
);
|
||||
|
||||
let result = pdf_tool(test_url, "extract_images", &cache_dir).await;
|
||||
assert!(result.is_err(), "URL image extraction should fail");
|
||||
|
||||
let error = result.unwrap_err();
|
||||
assert!(
|
||||
error
|
||||
.to_string()
|
||||
.contains("Image extraction is not supported for URLs"),
|
||||
"Should return the correct error message for URL image extraction"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_pdf_invalid_path() {
|
||||
let cache_dir = tempfile::tempdir().unwrap().into_path();
|
||||
@@ -419,4 +478,65 @@ mod tests {
|
||||
|
||||
assert!(result.is_err(), "Should fail with invalid operation");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_large_pdf_text_extraction() {
|
||||
let large_pdf_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("src/computercontroller/tests/data/visa-rules-public.pdf");
|
||||
|
||||
// Skip test if the large PDF file doesn't exist (may not be committed to git)
|
||||
if !large_pdf_path.exists() {
|
||||
println!(
|
||||
"Skipping large PDF test as file doesn't exist: {}",
|
||||
large_pdf_path.display()
|
||||
);
|
||||
return;
|
||||
}
|
||||
|
||||
let cache_dir = tempfile::tempdir().unwrap().into_path();
|
||||
|
||||
println!(
|
||||
"Testing large text extraction from: {}",
|
||||
large_pdf_path.display()
|
||||
);
|
||||
|
||||
let result = pdf_tool(large_pdf_path.to_str().unwrap(), "extract_text", &cache_dir).await;
|
||||
|
||||
assert!(result.is_ok(), "Large PDF text extraction should succeed");
|
||||
let content = result.unwrap();
|
||||
assert!(!content.is_empty(), "Extracted text should not be empty");
|
||||
let text = content[0].as_text().unwrap();
|
||||
|
||||
// Check if the text is large enough to be written to a file
|
||||
if text.contains("Large text extracted from PDF") {
|
||||
// For large PDFs, we should get the message about writing to a file
|
||||
assert!(
|
||||
text.contains("Text has been written to:"),
|
||||
"Should indicate where text was written"
|
||||
);
|
||||
|
||||
// Extract the file path from the output and verify it exists
|
||||
let file_path = text
|
||||
.lines()
|
||||
.find(|line| line.contains("Text has been written to:"))
|
||||
.and_then(|line| line.split(": ").nth(1))
|
||||
.expect("Should have a valid file path");
|
||||
|
||||
println!("Verifying text file exists: {}", file_path);
|
||||
assert!(PathBuf::from(file_path).exists(), "Text file should exist");
|
||||
|
||||
// Verify file contains actual content
|
||||
let file_content =
|
||||
fs::read_to_string(file_path).expect("Should be able to read text file");
|
||||
assert!(!file_content.is_empty(), "Text file should not be empty");
|
||||
} else {
|
||||
// If the text is not written to a file, it should contain PDF content directly
|
||||
assert!(
|
||||
text.contains("PDF Metadata:"),
|
||||
"Should contain PDF metadata"
|
||||
);
|
||||
// The text should not be empty (beyond just metadata)
|
||||
assert!(text.len() > 100, "Should contain substantial text content");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Binary file not shown.
Reference in New Issue
Block a user