From 1707bf917d4b6b8519135cafd9e97deecb15f9e9 Mon Sep 17 00:00:00 2001
From: Aljaz Ceru <aljaz@ceru.si>
Date: Thu, 11 Sep 2025 16:33:46 +0200
Subject: [PATCH] transcription api

---
 examples/rust-client/Cargo.toml               |  10 +-
 examples/rust-client/src/realtime_playback.rs |   2 +-
 examples/rust-client/src/stdin_transcribe.rs  | 195 ++++++++++++++
 examples/rust-client/src/stream_transcribe.rs |   2 -
 .../src/system_audio_transcribe.rs            | 250 ++++++++++++++++++
 examples/rust-client/transcribe_video_call.sh | 139 ++++++++++
 6 files changed, 594 insertions(+), 4 deletions(-)
 create mode 100644 examples/rust-client/src/stdin_transcribe.rs
 create mode 100644 examples/rust-client/src/system_audio_transcribe.rs
 create mode 100755 examples/rust-client/transcribe_video_call.sh
diff --git a/examples/rust-client/Cargo.toml b/examples/rust-client/Cargo.toml
index 008fd98..bef0cc2 100644
--- a/examples/rust-client/Cargo.toml
+++ b/examples/rust-client/Cargo.toml
@@ -39,4 +39,12 @@ path = "src/live_transcribe.rs"
 
 [[bin]]
 name = "realtime-playback"
-path = "src/realtime_playback.rs"
\ No newline at end of file
+path = "src/realtime_playback.rs"
+
+[[bin]]
+name = "system-audio"
+path = "src/system_audio_transcribe.rs"
+
+[[bin]]
+name = "stdin-transcribe"
+path = "src/stdin_transcribe.rs"
\ No newline at end of file
diff --git a/examples/rust-client/src/realtime_playback.rs b/examples/rust-client/src/realtime_playback.rs
index 3c64cb5..81fd311 100644
--- a/examples/rust-client/src/realtime_playback.rs
+++ b/examples/rust-client/src/realtime_playback.rs
@@ -76,7 +76,7 @@ async fn main() -> Result<()> {
     // Also open the file for streaming to transcription service
     // We need to read the raw audio data for transcription
     let mut wav_reader = WavReader::open(&file_path)?;
-    let wav_spec = wav_reader.spec();
+    let _wav_spec = wav_reader.spec();
     
     // Collect all samples for streaming
     let samples: Vec<i16> = wav_reader.samples::<i16>()
diff --git a/examples/rust-client/src/stdin_transcribe.rs b/examples/rust-client/src/stdin_transcribe.rs
new file mode 100644
index 0000000..0d2dcb6
--- /dev/null
+++ b/examples/rust-client/src/stdin_transcribe.rs
@@ -0,0 +1,195 @@
+use anyhow::Result;
+use clap::Parser;
+use futures_util::StreamExt;
+use std::io::{self, Read};
+use tokio::sync::mpsc;
+use tokio_stream::wrappers::ReceiverStream;
+use tracing::{error, info};
+
+// Import generated protobuf types
+pub mod transcription {
+    tonic::include_proto!("transcription");
+}
+
+use transcription::{
+    transcription_service_client::TranscriptionServiceClient, AudioChunk, AudioConfig,
+};
+
+#[derive(Parser, Debug)]
+#[command(author, version, about = "Transcribe audio from stdin (for piping from parec)", long_about = None)]
+struct Args {
+    /// gRPC server address
+    #[arg(short, long, default_value = "http://localhost:50051")]
+    server: String,
+
+    /// Language code (e.g., "en", "es", "auto")
+    #[arg(short, long, default_value = "en")]
+    language: String,
+
+    /// Task: transcribe or translate
+    #[arg(short, long, default_value = "transcribe")]
+    task: String,
+
+    /// Model to use
+    #[arg(short, long, default_value = "base")]
+    model: String,
+
+    /// Show timestamps
+    #[arg(short = 'T', long)]
+    timestamps: bool,
+
+    /// Chunk size in seconds (for buffering)
+    #[arg(short, long, default_value = "3.0")]
+    chunk_seconds: f32,
+
+    /// Disable VAD (Voice Activity Detection) - useful for music/system audio
+    #[arg(long)]
+    no_vad: bool,
+}
+
+/// Streams raw audio from stdin to the transcription service and prints results.
+/// The first request message carries the `AudioConfig`; audio chunks read from
+/// stdin follow. Interim results are redrawn in place on the current line.
+#[tokio::main]
+async fn main() -> Result<()> {
+    use std::io::Write; // brings `flush` into scope for stdout
+    tracing_subscriber::fmt::init();
+    let args = Args::parse();
+
+    info!("Connecting to transcription service at {}", args.server);
+    let mut client = TranscriptionServiceClient::connect(args.server.clone()).await?;
+
+    // Create channel for audio chunks
+    let (tx, rx) = mpsc::channel::<AudioChunk>(100);
+
+    // Read stdin on a dedicated thread. The only sender is moved into it, so the
+    // request stream ends once stdin reaches EOF instead of staying open forever.
+    let chunk_seconds = args.chunk_seconds;
+    std::thread::spawn(move || {
+        if let Err(e) = read_stdin_and_send(tx, chunk_seconds) {
+            error!("Error reading stdin: {}", e);
+        }
+    });
+
+    // Create the first chunk with configuration
+    let config = AudioConfig {
+        language: args.language.clone(),
+        task: args.task.clone(),
+        model: args.model.clone(),
+        sample_rate: 16000,
+        vad_enabled: !args.no_vad,  // Disable VAD if --no-vad flag is used
+    };
+
+    // Send a configuration chunk first
+    let config_chunk = AudioChunk {
+        audio_data: vec![],
+        session_id: "stdin-transcribe".to_string(),
+        config: Some(config),
+    };
+
+    // Create stream from receiver
+    let stream = ReceiverStream::new(rx);
+    let stream = futures_util::stream::iter(vec![config_chunk]).chain(stream);
+
+    // Start streaming transcription
+    let request = tonic::Request::new(stream);
+    let mut response = client.stream_transcribe(request).await?.into_inner();
+
+    println!("\n🎧 Transcribing audio from stdin...");
+    println!("Press Ctrl+C to stop\n");
+    println!("{}", "─".repeat(80));
+
+    // Last interim text printed without a trailing newline (empty if none)
+    let mut current_line = String::new();
+
+    // Process transcription responses
+    while let Some(result) = response.message().await? {
+        if result.text.is_empty() {
+            continue;
+        }
+        // Optional "[12.3s] " prefix, shared by interim and final output
+        let prefix = if args.timestamps {
+            format!("[{:.1}s] ", result.start_time)
+        } else {
+            String::new()
+        };
+        if result.is_final {
+            println!("{}{}", prefix, result.text);
+            current_line.clear();
+        } else {
+            // `\r` returns to column 0; padding to 80 columns covers older text
+            print!("\r{}{:<80}", prefix, result.text);
+            std::io::stdout().flush()?;
+            current_line = result.text.clone();
+        }
+    }
+
+    // Clear any remaining interim text
+    if !current_line.is_empty() {
+        println!();
+    }
+
+    Ok(())
+}
+
+/// Reads raw PCM from stdin and forwards it over `tx` in fixed-size chunks.
+///
+/// Each chunk holds `chunk_seconds` of audio, assuming 16 kHz, 16-bit mono input.
+/// Returns `Ok(())` at EOF or once the receiver is dropped, and the I/O error if
+/// reading stdin fails.
+fn read_stdin_and_send(tx: mpsc::Sender<AudioChunk>, chunk_seconds: f32) -> Result<()> {
+    let stdin = io::stdin();
+    let mut handle = stdin.lock();
+
+    // Calculate chunk size in bytes (16kHz, 16-bit mono). Clamp to one sample so a
+    // zero, negative or NaN `chunk_seconds` cannot yield an empty buffer, which
+    // would make the loop below spin forever sending empty chunks.
+    let samples_per_chunk = ((16000.0 * chunk_seconds) as usize).max(1);
+    let bytes_per_chunk = samples_per_chunk * 2; // 16-bit = 2 bytes
+    let mut buffer = vec![0u8; bytes_per_chunk];
+
+    info!("Reading audio from stdin (chunk size: {} bytes, {} seconds)",
+          bytes_per_chunk, chunk_seconds);
+
+    loop {
+        // Read a chunk from stdin
+        let mut total_read = 0;
+        while total_read < bytes_per_chunk {
+            match handle.read(&mut buffer[total_read..]) {
+                Ok(0) => {
+                    // EOF reached
+                    if total_read > 0 {
+                        // Send remaining data
+                        let audio_chunk = AudioChunk {
+                            audio_data: buffer[..total_read].to_vec(),
+                            session_id: String::new(),
+                            config: None,
+                        };
+                        let _ = tx.blocking_send(audio_chunk);
+                    }
+                    info!("End of stdin reached");
+                    return Ok(());
+                }
+                Ok(n) => total_read += n,
+                // Retry on interrupt
+                Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
+                // The caller logs the returned error, so it is not logged here too
+                Err(e) => return Err(e.into()),
+            }
+        }
+
+        // Send the chunk
+        let audio_chunk = AudioChunk {
+            audio_data: buffer.clone(),
+            session_id: String::new(),
+            config: None,
+        };
+
+        if tx.blocking_send(audio_chunk).is_err() {
+            // Receiver dropped, exit
+            break;
+        }
+    }
+
+    Ok(())
+}
\ No newline at end of file
diff --git a/examples/rust-client/src/stream_transcribe.rs b/examples/rust-client/src/stream_transcribe.rs
index e00747f..c5fb443 100644
--- a/examples/rust-client/src/stream_transcribe.rs
+++ b/examples/rust-client/src/stream_transcribe.rs
@@ -7,12 +7,10 @@ use anyhow::Result;
 use clap::Parser;
 use futures_util::StreamExt;
 use hound::WavReader;
-use std::fs::File;
 use std::time::Duration;
 use tokio::sync::mpsc;
 use tokio::time;
 use tokio_stream::wrappers::ReceiverStream;
-use tonic::transport::Channel;
 use tracing::info;
 
 // Import generated protobuf types
diff --git a/examples/rust-client/src/system_audio_transcribe.rs b/examples/rust-client/src/system_audio_transcribe.rs
new file mode 100644
index 0000000..b6d96d9
--- /dev/null
+++ b/examples/rust-client/src/system_audio_transcribe.rs
@@ -0,0 +1,250 @@
+use anyhow::Result;
+use clap::Parser;
+use cpal::traits::{DeviceTrait, HostTrait, StreamTrait};
+use futures_util::StreamExt;
+use std::sync::{Arc, Mutex};
+use tokio::sync::mpsc as tokio_mpsc;
+use tokio_stream::wrappers::ReceiverStream;
+use tracing::{error, info, warn};
+
+// Import generated protobuf types
+pub mod transcription {
+    tonic::include_proto!("transcription");
+}
+
+use transcription::{
+    transcription_service_client::TranscriptionServiceClient, AudioChunk, AudioConfig,
+};
+
+#[derive(Parser, Debug)]
+#[command(author, version, about = "Capture and transcribe system audio", long_about = None)]
+struct Args {
+    /// gRPC server address
+    #[arg(short, long, default_value = "http://localhost:50051")]
+    server: String,
+
+    /// Language code (e.g., "en", "es", "auto")
+    #[arg(short, long, default_value = "en")]
+    language: String,
+
+    /// Task: transcribe or translate
+    #[arg(short, long, default_value = "transcribe")]
+    task: String,
+
+    /// Model to use
+    #[arg(short, long, default_value = "base")]
+    model: String,
+
+    /// List available audio devices
+    #[arg(long)]
+    list_devices: bool,
+
+    /// Audio device name or index to use (e.g., "pulse.monitor" for PulseAudio monitor)
+    #[arg(short, long)]
+    device: Option<String>,
+}
+/// Entry point: lists devices if asked, else streams captured audio to the server and prints results.
+#[tokio::main]
+async fn main() -> Result<()> {
+    tracing_subscriber::fmt::init();
+    let args = Args::parse();
+
+    // With --list-devices, print the available devices and exit without connecting
+    if args.list_devices {
+        list_audio_devices()?;
+        return Ok(());
+    }
+
+    info!("Connecting to transcription service at {}", args.server);
+    let mut client = TranscriptionServiceClient::connect(args.server.clone()).await?;
+
+    // Bounded channel (capacity 100) carrying audio chunks from the capture thread
+    let (tx, rx) = tokio_mpsc::channel::<AudioChunk>(100);
+
+    // Run audio capture on its own OS thread; a capture failure is only logged
+    let device_name = args.device.clone();
+    std::thread::spawn(move || {
+        if let Err(e) = capture_system_audio(tx, device_name) {
+            error!("Audio capture error: {}", e);
+        }
+    });
+
+    // Stream settings sent to the server; the sample rate is fixed at 16000
+    let config = AudioConfig {
+        language: args.language.clone(),
+        task: args.task.clone(),
+        model: args.model.clone(),
+        sample_rate: 16000,
+        vad_enabled: true,  // Enable VAD to filter silence
+    };
+
+    // Config-only chunk (no audio data) that leads the request stream
+    let config_chunk = AudioChunk {
+        audio_data: vec![],
+        session_id: "system-audio".to_string(),
+        config: Some(config),
+    };
+
+    // Request stream = the config chunk followed by everything received on rx
+    let stream_vec = vec![config_chunk];
+    let stream = ReceiverStream::new(rx);
+    let stream = futures_util::stream::iter(stream_vec).chain(stream);
+
+    // Open the streaming call and take the stream of responses
+    let request = tonic::Request::new(stream);
+    let mut response = client.stream_transcribe(request).await?.into_inner();
+
+    println!("\n🎧 Capturing system audio for transcription...");
+    println!("Press Ctrl+C to stop\n");
+    println!("{}", "─".repeat(80));
+
+    // Print results: final ones end with a newline, interim ones are redrawn via \r
+    while let Some(result) = response.message().await? {
+        if !result.text.is_empty() {
+            if result.is_final {
+                println!("[FINAL] {}", result.text);
+            } else {
+                print!("\r[INTERIM] {:<80}", result.text);
+                use std::io::{self, Write};
+                io::stdout().flush()?;
+            }
+        }
+    }
+
+    Ok(())
+}
+
+/// Print the input devices of the default cpal host, tagging likely system-audio sources
+fn list_audio_devices() -> Result<()> {
+    let host = cpal::default_host();
+    
+    println!("\n📊 Available Audio Devices:");
+    println!("{}", "─".repeat(80));
+    
+    // Enumerate input devices; the printed index is the enumeration position
+    println!("\n🎤 Input Devices:");
+    for (idx, device) in host.input_devices()?.enumerate() {
+        let name = device.name()?;
+        let is_monitor = name.contains("monitor") || name.contains("Monitor") || 
+                        name.contains("loopback") || name.contains("Loopback") ||
+                        name.contains("stereo mix") || name.contains("Stereo Mix");
+        // Names matching monitor/loopback/stereo mix are tagged as system audio
+        if is_monitor {
+            println!("  [{}] {} 🔊 (System Audio)", idx, name);
+        } else {
+            println!("  [{}] {}", idx, name);
+        }
+    }
+    
+    // Show the default input device, if the host reports one
+    if let Some(device) = host.default_input_device() {
+        println!("\n⭐ Default Input: {}", device.name()?);
+    }
+    
+    println!("\n💡 Tips for capturing system audio:");
+    println!("  Linux: Look for devices with 'monitor' in the name (PulseAudio/PipeWire)");
+    println!("  Windows: Install VB-Cable or enable 'Stereo Mix' in sound settings");
+    println!("  macOS: Install BlackHole or Loopback for system audio capture");
+    
+    Ok(())
+}
+
+/// Capture audio from system (or specified device) and forward it over `tx`.
+/// Uses the first input device whose name contains `device_name` if given, else
+/// the first monitor/loopback-like input, else the host's default input device.
+/// On success this never returns: it sleeps in a loop to keep the stream alive.
+fn capture_system_audio(tx: tokio_mpsc::Sender<AudioChunk>, device_name: Option<String>) -> Result<()> {
+    let host = cpal::default_host();
+
+    // Find the appropriate audio device
+    let device = if let Some(name) = device_name {
+        // Try to find device by name
+        let mut found_device = None;
+        for input_device in host.input_devices()? {
+            if input_device.name()?.contains(&name) {
+                found_device = Some(input_device);
+                break;
+            }
+        }
+        found_device.ok_or_else(|| anyhow::anyhow!("Device '{}' not found. Use --list-devices to see available devices.", name))?
+    } else {
+        // Try to find a monitor/loopback device automatically
+        let mut monitor_device = None;
+        for input_device in host.input_devices()? {
+            let name = input_device.name()?;
+            if name.contains("monitor") || name.contains("Monitor") ||
+               name.contains("loopback") || name.contains("Loopback") ||
+               name.contains("stereo mix") || name.contains("Stereo Mix") {
+                info!("Found system audio device: {}", name);
+                monitor_device = Some(input_device);
+                break;
+            }
+        }
+
+        if let Some(device) = monitor_device {
+            device
+        } else {
+            warn!("No system audio device found. Using default input device.");
+            warn!("To capture system audio, you may need to:");
+            warn!("  - Linux: Enable PulseAudio monitor");
+            warn!("  - Windows: Enable Stereo Mix or install VB-Cable");
+            warn!("  - macOS: Install BlackHole or Loopback");
+            host.default_input_device()
+                .ok_or_else(|| anyhow::anyhow!("No input device available"))?
+        }
+    };
+
+    info!("Using audio device: {}", device.name()?);
+
+    // Configure audio capture for 16kHz mono PCM16
+    let config = cpal::StreamConfig {
+        channels: 1,
+        sample_rate: cpal::SampleRate(16000),
+        buffer_size: cpal::BufferSize::Default,
+    };
+
+    // Buffer to accumulate audio samples
+    let buffer = Arc::new(Mutex::new(Vec::new()));
+    let buffer_clone = buffer.clone();
+
+    // Create audio stream
+    let stream = device.build_input_stream(
+        &config,
+        move |data: &[i16], _: &cpal::InputCallbackInfo| {
+            let mut buf = buffer_clone.lock().unwrap();
+            buf.extend_from_slice(data);
+            // Send chunks of ~3 seconds (48000 samples at 16kHz) for better accuracy
+            while buf.len() >= 48000 {
+                let chunk: Vec<i16> = buf.drain(..48000).collect();
+                // Convert i16 to bytes
+                let bytes: Vec<u8> = chunk.iter()
+                    .flat_map(|&sample| sample.to_le_bytes())
+                    .collect();
+
+                let audio_chunk = AudioChunk {
+                    audio_data: bytes,
+                    session_id: String::new(),
+                    config: None,
+                };
+
+                // cpal invokes this callback outside any Tokio runtime context, where
+                // `tokio::spawn` panics. `try_send` needs no runtime, never blocks and
+                // keeps chunks in order; a full or closed channel drops the chunk.
+                let _ = tx.try_send(audio_chunk);
+            }
+        },
+        move |err| {
+            error!("Audio stream error: {}", err);
+        },
+        None
+    )?;
+
+    // Start the stream
+    stream.play()?;
+    info!("Audio capture started");
+
+    // Keep the stream alive
+    loop {
+        std::thread::sleep(std::time::Duration::from_secs(1));
+    }
+}
\ No newline at end of file
diff --git a/examples/rust-client/transcribe_video_call.sh b/examples/rust-client/transcribe_video_call.sh
new file mode 100755
index 0000000..809434a
--- /dev/null
+++ b/examples/rust-client/transcribe_video_call.sh
@@ -0,0 +1,139 @@
+#!/bin/bash
+
+# Enhanced script for transcribing video calls on Ubuntu with PipeWire
+# Uses parec (PulseAudio compatibility) to capture system audio
+
+set -e
+
+echo "🎥 Video Call Transcription Service"
+echo "===================================="
+echo ""
+
+# Colors for output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+NC='\033[0m' # No Color
+
+# check_dependency CMD PKG: return 0 if CMD is on PATH; else print an apt hint for PKG and return 1
+check_dependency() {
+    if ! command -v "$1" &> /dev/null; then
+        echo -e "${RED}❌ $1 not found.${NC}"
+        echo "Please install: sudo apt-get install $2"
+        return 1
+    fi
+    return 0
+}
+
+echo "Checking dependencies..."
+check_dependency "parec" "pulseaudio-utils" || exit 1
+check_dependency "sox" "sox" || echo -e "${YELLOW}⚠️  sox not installed (optional but recommended)${NC}"
+
+# Print the name of the first monitor source found on stdout (an empty line if none)
+find_monitor_source() {
+    # Column 2 of "pactl list sources short" for every line that mentions "monitor"
+    local monitors=$(pactl list sources short 2>/dev/null | grep -i "monitor" | awk '{print $2}')
+    
+    if [ -z "$monitors" ]; then
+        # pactl produced nothing: fall back to parsing "pacmd list-sources"
+        monitors=$(pacmd list-sources 2>/dev/null | grep "name:" | grep "monitor" | sed 's/.*<\(.*\)>.*/\1/')
+    fi
+    
+    if [ -z "$monitors" ]; then
+        # Fallback: try to construct monitor name from default sink
+        local default_sink=$(pactl info 2>/dev/null | grep "Default Sink" | cut -d: -f2 | xargs)
+        if [ -n "$default_sink" ]; then
+            monitors="${default_sink}.monitor"
+        fi
+    fi
+    # Several monitors may have matched; emit only the first line
+    echo "$monitors" | head -1
+}
+
+# List available sources
+if [ "$1" == "--list" ]; then
+    echo -e "${GREEN}📊 Available Audio Sources:${NC}"
+    echo ""
+    pactl list sources short 2>/dev/null || pacmd list-sources 2>/dev/null | grep "name:"
+    echo ""
+    echo -e "${GREEN}💡 Monitor sources (system audio):${NC}"
+    pactl list sources short 2>/dev/null | grep -i "monitor" || echo "No monitor sources found"
+    exit 0
+fi
+
+# Help message
+if [ "$1" == "--help" ] || [ "$1" == "-h" ]; then
+    echo "Usage: $0 [OPTIONS]"
+    echo ""
+    echo "Options:"
+    echo "  --list             List all available audio sources"
+    echo "  --source SOURCE    Use specific audio source"
+    echo "  --microphone       Capture microphone instead of system audio"
+    echo "  --combined         Capture both microphone and system audio"
+    echo "  --help, -h         Show this help message"
+    echo ""
+    echo "Examples:"
+    echo "  $0                 # Auto-detect and transcribe system audio"
+    echo "  $0 --microphone    # Transcribe from microphone"
+    echo "  $0 --combined      # Transcribe both mic and system audio"
+    echo ""
+    exit 0
+fi
+
+# Determine what to capture
+if [ "$1" == "--microphone" ]; then
+    echo -e "${GREEN}🎤 Using microphone input${NC}"
+    # Run the existing live-transcribe for microphone
+    exec cargo run --bin live-transcribe
+    exit 0
+elif [ "$1" == "--combined" ]; then
+    echo -e "${YELLOW}🎤+🔊 Combined audio capture not yet implemented${NC}"
+    echo "For now, please run two separate instances:"
+    echo "  1. $0 (for system audio)"
+    echo "  2. $0 --microphone (for mic)"
+    exit 1
+elif [ "$1" == "--source" ] && [ -n "$2" ]; then
+    SOURCE="$2"
+    echo -e "${GREEN}📡 Using specified source: $SOURCE${NC}"
+else
+    # Auto-detect monitor source
+    SOURCE=$(find_monitor_source)
+    if [ -z "$SOURCE" ]; then
+        echo -e "${RED}❌ Could not find system audio monitor source${NC}"
+        echo ""
+        echo "This might happen if:"
+        echo "  1. No audio is currently playing"
+        echo "  2. PipeWire/PulseAudio is not running"
+        echo ""
+        echo "Try:"
+        echo "  1. Play some audio (music/video)"
+        echo "  2. Run: $0 --list"
+        echo "  3. Use a specific source: $0 --source <source_name>"
+        exit 1
+    fi
+    echo -e "${GREEN}📡 Found system audio source: $SOURCE${NC}"
+fi
+
+echo ""
+echo -e "${GREEN}🎬 Starting video call transcription...${NC}"
+echo -e "${YELLOW}Press Ctrl+C to stop${NC}"
+echo ""
+echo "💡 Tips for best results:"
+echo "  • Join your video call first"
+echo "  • Use headphones to avoid echo"
+echo "  • Close other audio sources (music, videos)"
+echo "  • Speak clearly for better transcription"
+echo ""
+echo "────────────────────────────────────────────────────────────────────"
+echo ""
+
+# Start audio capture and transcription
+echo -e "${GREEN}Starting audio capture from: $SOURCE${NC}"
+echo -e "${GREEN}Starting transcription service...${NC}"
+echo ""
+
+# Use our new stdin-transcribe binary that accepts piped audio
+# parec captures system audio and pipes it directly to our transcriber
+# --no-vad disables Voice Activity Detection for system audio (YouTube, music, etc.)
+parec --format=s16le --rate=16000 --channels=1 --device="$SOURCE" 2>/dev/null | \
+cargo run --bin stdin-transcribe -- --language en --chunk-seconds 2.5 --no-vad