feat: linux computer control (WIP) (#1227)

This commit is contained in:
Max Novich
2025-02-14 13:00:38 -08:00
committed by GitHub
parent 08b58c313e
commit 559bfe67e6
3 changed files with 313 additions and 3 deletions

View File

@@ -102,7 +102,7 @@ impl ComputerControllerRouter {
Can be combined with screenshot tool for visual task assistance.
"#},
_ => indoc! {r#"
"macos" => indoc! {r#"
Control the computer using AppleScript (macOS only). Automate applications and system features.
Key capabilities:
@@ -117,6 +117,20 @@ impl ComputerControllerRouter {
- Integration: Calendar, reminders, messages
- Data: Interact with spreadsheets and documents
Can be combined with screenshot tool for visual task assistance.
"#},
_ => indoc! {r#"
Control the computer using Linux system automation.
Features available:
- Shell scripting for system control
- X11/Wayland window management
- D-Bus for system services
- File and system management
- Desktop environment control (GNOME, KDE, etc.)
- Process management and monitoring
- System settings and configurations
Can be combined with screenshot tool for visual task assistance.
"#},
};
@@ -252,7 +266,7 @@ impl ComputerControllerRouter {
- System automation using PowerShell
- Consider the screenshot tool to work out what is on screen and what to do to help with the control task.
"#},
_ => indoc! {r#"
"macos" => indoc! {r#"
Here are some extra tools:
automation_script
- Create and run Shell and Ruby scripts
@@ -275,6 +289,30 @@ impl ComputerControllerRouter {
- Handle web-based workflows
This is often more reliable than web scraping for modern web applications.
"#},
_ => indoc! {r#"
Here are some extra tools:
automation_script
- Create and run Shell scripts
- Shell (bash) is recommended for most tasks
- Scripts can save their output to files
- Linux-specific features:
- System automation through shell scripting
- X11/Wayland window management
- D-Bus system services integration
- Desktop environment control
- Use the screenshot tool if needed to help with tasks
computer_control
- System automation using shell commands and system tools
- Desktop environment automation (GNOME, KDE, etc.)
- Consider the screenshot tool to work out what is on screen and what to do to help with the control task.
When you need to interact with websites or web applications, consider using tools like xdotool or wmctrl for:
- Window management
- Simulating keyboard/mouse input
- Automating UI interactions
- Desktop environment control
"#},
};
let instructions = formatdoc! {r#"

View File

@@ -0,0 +1,259 @@
use super::SystemAutomation;
use std::io::Result;
#[cfg(unix)]
use std::os::unix::fs::PermissionsExt;
use std::path::PathBuf;
use std::process::Command;
use std::sync::Once;
static INIT: Once = Once::new();
#[derive(Debug)]
pub enum DisplayServer {
X11,
Wayland,
Unknown,
}
pub struct LinuxAutomation {
display_server: DisplayServer,
}
impl Default for LinuxAutomation {
fn default() -> Self {
Self::new()
}
}
impl LinuxAutomation {
pub fn new() -> Self {
let automation = LinuxAutomation {
display_server: Self::detect_display_server(),
};
INIT.call_once(|| {
automation.initialize().unwrap_or_else(|e| {
eprintln!("Warning: Failed to initialize Linux automation: {}", e);
});
});
automation
}
fn detect_display_server() -> DisplayServer {
if let Ok(wayland_display) = std::env::var("WAYLAND_DISPLAY") {
if !wayland_display.is_empty() {
return DisplayServer::Wayland;
}
}
if let Ok(display) = std::env::var("DISPLAY") {
if !display.is_empty() {
return DisplayServer::X11;
}
}
DisplayServer::Unknown
}
fn initialize(&self) -> Result<()> {
// Check for common dependencies first
self.check_common_dependencies()?;
// Check display server specific dependencies
match self.display_server {
DisplayServer::X11 => self.check_x11_dependencies()?,
DisplayServer::Wayland => self.check_wayland_dependencies()?,
DisplayServer::Unknown => {
return Err(std::io::Error::new(
std::io::ErrorKind::Other,
"Unable to detect display server",
));
}
}
Ok(())
}
fn check_common_dependencies(&self) -> Result<()> {
let common_deps = ["bash", "python3"];
self.check_dependencies(&common_deps)
}
fn check_x11_dependencies(&self) -> Result<()> {
let x11_deps = ["xdotool", "wmctrl", "xclip", "xwininfo"];
self.check_dependencies(&x11_deps)
}
fn check_wayland_dependencies(&self) -> Result<()> {
let wayland_deps = ["wtype", "wl-clipboard"];
self.check_dependencies(&wayland_deps)
}
fn check_dependencies(&self, deps: &[&str]) -> Result<()> {
for dep in deps {
if !Command::new("which").arg(dep).output()?.status.success() {
return Err(std::io::Error::new(
std::io::ErrorKind::NotFound,
format!("Required dependency '{}' not found", dep),
));
}
}
Ok(())
}
fn execute_input_command(&self, cmd: &str) -> Result<String> {
match self.display_server {
DisplayServer::X11 => self.execute_x11_command(cmd),
DisplayServer::Wayland => self.execute_wayland_command(cmd),
DisplayServer::Unknown => Err(std::io::Error::new(
std::io::ErrorKind::Other,
"Unknown display server",
)),
}
}
fn execute_x11_command(&self, cmd: &str) -> Result<String> {
if cmd.starts_with("click") {
Command::new("xdotool").arg("click").arg("1").output()?;
Ok(String::new())
} else if let Some(text) = cmd.strip_prefix("type ") {
Command::new("xdotool").arg("type").arg(text).output()?;
Ok(String::new())
} else if let Some(key) = cmd.strip_prefix("key ") {
Command::new("xdotool").arg("key").arg(key).output()?;
Ok(String::new())
} else if let Some(window) = cmd.strip_prefix("activate ") {
Command::new("wmctrl").arg("-a").arg(window).output()?;
Ok(String::new())
} else if cmd == "get clipboard" {
let output = Command::new("xclip")
.arg("-o")
.arg("-selection")
.arg("clipboard")
.output()?;
Ok(String::from_utf8_lossy(&output.stdout).into_owned())
} else if let Some(text) = cmd.strip_prefix("set clipboard ") {
let mut child = Command::new("xclip")
.arg("-selection")
.arg("clipboard")
.stdin(std::process::Stdio::piped())
.spawn()?;
if let Some(mut stdin) = child.stdin.take() {
use std::io::Write;
stdin.write_all(text.as_bytes())?;
}
child.wait()?;
Ok(String::new())
} else {
Ok(String::new())
}
}
fn execute_wayland_command(&self, cmd: &str) -> Result<String> {
if let Some(text) = cmd.strip_prefix("type ") {
Command::new("wtype").arg(text).output()?;
Ok(String::new())
} else if let Some(key) = cmd.strip_prefix("key ") {
Command::new("wtype").arg(key).output()?;
Ok(String::new())
} else if cmd == "get clipboard" {
let output = Command::new("wl-paste").output()?;
Ok(String::from_utf8_lossy(&output.stdout).into_owned())
} else if let Some(text) = cmd.strip_prefix("set clipboard ") {
let mut child = Command::new("wl-copy")
.stdin(std::process::Stdio::piped())
.spawn()?;
if let Some(mut stdin) = child.stdin.take() {
use std::io::Write;
stdin.write_all(text.as_bytes())?;
}
child.wait()?;
Ok(String::new())
} else {
// Some commands might not be available in Wayland
Ok(String::new())
}
}
fn create_python_script(&self, commands: &[&str]) -> String {
let mut script = String::from(
r#"#!/usr/bin/env python3
import subprocess
import os
import sys
import time
def run_command(cmd):
try:
result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
return result.stdout
except Exception as e:
print(f"Error executing {cmd}: {e}", file=sys.stderr)
return ""
"#,
);
for cmd in commands {
script.push_str(&format!("run_command('{}')\n", cmd));
}
script
}
}
impl SystemAutomation for LinuxAutomation {
fn execute_system_script(&self, script: &str) -> Result<String> {
// Parse the script into individual commands
let commands: Vec<_> = script
.lines()
.map(str::trim)
.filter(|line| !line.is_empty())
.collect();
// For complex automation sequences, use Python as an intermediary
if commands.len() > 1 {
let python_script = self.create_python_script(&commands);
let mut temp_path = self.get_temp_path();
temp_path.push("automation_script.py");
std::fs::write(&temp_path, python_script)?;
#[cfg(unix)]
std::fs::set_permissions(&temp_path, std::fs::Permissions::from_mode(0o755))?;
#[cfg(not(unix))]
{
// On non-Unix systems, we don't set execute permissions
// The script will be executed by the Python interpreter directly
}
let output = Command::new("python3").arg(&temp_path).output()?;
std::fs::remove_file(temp_path)?;
if output.status.success() {
Ok(String::from_utf8_lossy(&output.stdout).into_owned())
} else {
Err(std::io::Error::new(
std::io::ErrorKind::Other,
String::from_utf8_lossy(&output.stderr).into_owned(),
))
}
} else if let Some(cmd) = commands.first() {
// For single commands, execute directly
self.execute_input_command(cmd)
} else {
Ok(String::new())
}
}
fn get_shell_command(&self) -> (&'static str, &'static str) {
("bash", "-c")
}
fn get_temp_path(&self) -> PathBuf {
std::env::temp_dir()
}
}

View File

@@ -1,3 +1,4 @@
mod linux;
mod macos;
mod windows;
@@ -7,6 +8,9 @@ pub use self::windows::WindowsAutomation;
#[cfg(target_os = "macos")]
pub use self::macos::MacOSAutomation;
#[cfg(target_os = "linux")]
pub use self::linux::LinuxAutomation;
pub trait SystemAutomation: Send + Sync {
fn execute_system_script(&self, script: &str) -> std::io::Result<String>;
fn get_shell_command(&self) -> (&'static str, &'static str); // (shell, arg)
@@ -22,7 +26,16 @@ pub fn create_system_automation() -> Box<dyn SystemAutomation + Send + Sync> {
{
Box::new(MacOSAutomation)
}
#[cfg(not(any(target_os = "windows", target_os = "macos")))]
#[cfg(not(any(
target_os = "macos",
target_os = "windows",
target_os = "ios",
target_os = "none"
)))]
{
Box::new(LinuxAutomation::new())
}
#[cfg(any(target_os = "ios", target_os = "none"))]
{
unimplemented!("Unsupported operating system")
}