transcription api

2025-09-11 16:33:46 +02:00
parent 080dd6776d
commit 1707bf917d
6 changed files with 594 additions and 4 deletions

View File: Cargo.toml

@@ -40,3 +40,11 @@ path = "src/live_transcribe.rs"
[[bin]]
name = "realtime-playback"
path = "src/realtime_playback.rs"

[[bin]]
name = "system-audio"
path = "src/system_audio_transcribe.rs"

[[bin]]
name = "stdin-transcribe"
path = "src/stdin_transcribe.rs"

View File

@@ -76,7 +76,7 @@ async fn main() -> Result<()> {
    // Also open the file for streaming to transcription service
    // We need to read the raw audio data for transcription
    let mut wav_reader = WavReader::open(&file_path)?;
-   let wav_spec = wav_reader.spec();
+   let _wav_spec = wav_reader.spec();

    // Collect all samples for streaming
    let samples: Vec<i16> = wav_reader.samples::<i16>()

View File: src/stdin_transcribe.rs

@@ -0,0 +1,195 @@
use anyhow::Result;
use clap::Parser;
use futures_util::StreamExt;
use std::io::{self, Read};
use tokio::sync::mpsc;
use tokio_stream::wrappers::ReceiverStream;
use tracing::{error, info};

// Import generated protobuf types
pub mod transcription {
    tonic::include_proto!("transcription");
}

use transcription::{
    transcription_service_client::TranscriptionServiceClient, AudioChunk, AudioConfig,
};
#[derive(Parser, Debug)]
#[command(author, version, about = "Transcribe audio from stdin (for piping from parec)", long_about = None)]
struct Args {
    /// gRPC server address
    #[arg(short, long, default_value = "http://localhost:50051")]
    server: String,

    /// Language code (e.g., "en", "es", "auto")
    #[arg(short, long, default_value = "en")]
    language: String,

    /// Task: transcribe or translate
    #[arg(short, long, default_value = "transcribe")]
    task: String,

    /// Model to use
    #[arg(short, long, default_value = "base")]
    model: String,

    /// Show timestamps
    #[arg(short = 'T', long)]
    timestamps: bool,

    /// Chunk size in seconds (for buffering)
    #[arg(short, long, default_value = "3.0")]
    chunk_seconds: f32,

    /// Disable VAD (Voice Activity Detection) - useful for music/system audio
    #[arg(long)]
    no_vad: bool,
}
#[tokio::main]
async fn main() -> Result<()> {
    tracing_subscriber::fmt::init();
    let args = Args::parse();

    info!("Connecting to transcription service at {}", args.server);
    let mut client = TranscriptionServiceClient::connect(args.server.clone()).await?;

    // Create channel for audio chunks
    let (tx, rx) = mpsc::channel::<AudioChunk>(100);

    // Spawn task to read from stdin and send chunks
    let tx_clone = tx.clone();
    let chunk_seconds = args.chunk_seconds;
    std::thread::spawn(move || {
        if let Err(e) = read_stdin_and_send(tx_clone, chunk_seconds) {
            error!("Error reading stdin: {}", e);
        }
    });
    // Drop our copy of the sender so the channel (and the request stream)
    // closes once the reader thread finishes at stdin EOF
    drop(tx);

    // Create the first chunk with configuration
    let config = AudioConfig {
        language: args.language.clone(),
        task: args.task.clone(),
        model: args.model.clone(),
        sample_rate: 16000,
        vad_enabled: !args.no_vad, // Disable VAD if --no-vad flag is used
    };

    // Send a configuration chunk first
    let config_chunk = AudioChunk {
        audio_data: vec![],
        session_id: "stdin-transcribe".to_string(),
        config: Some(config),
    };

    // Create stream from receiver
    let stream = ReceiverStream::new(rx);
    let stream = futures_util::stream::iter(vec![config_chunk]).chain(stream);

    // Start streaming transcription
    let request = tonic::Request::new(stream);
    let mut response = client.stream_transcribe(request).await?.into_inner();

    println!("\n🎧 Transcribing audio from stdin...");
    println!("Press Ctrl+C to stop\n");
    println!("{}", "─".repeat(80));

    let mut current_line = String::new();

    // Process transcription responses
    while let Some(result) = response.message().await? {
        if !result.text.is_empty() {
            if args.timestamps {
                if result.is_final {
                    println!("[{:.1}s] {}", result.start_time, result.text);
                    current_line.clear();
                } else {
                    print!("\r[{:.1}s] {:<80}", result.start_time, result.text);
                    use std::io::{self as stdio, Write};
                    stdio::stdout().flush()?;
                    current_line = result.text.clone();
                }
            } else {
                if result.is_final {
                    println!("{}", result.text);
                    current_line.clear();
                } else {
                    print!("\r{:<80}", result.text);
                    use std::io::{self as stdio, Write};
                    stdio::stdout().flush()?;
                    current_line = result.text.clone();
                }
            }
        }
    }

    // Clear any remaining interim text
    if !current_line.is_empty() {
        println!();
    }

    Ok(())
}
fn read_stdin_and_send(tx: mpsc::Sender<AudioChunk>, chunk_seconds: f32) -> Result<()> {
    let stdin = io::stdin();
    let mut handle = stdin.lock();

    // Calculate chunk size in bytes (16kHz, 16-bit mono):
    // e.g. 16000 samples/s * 3.0 s = 48000 samples = 96000 bytes per chunk
    let samples_per_chunk = (16000.0 * chunk_seconds) as usize;
    let bytes_per_chunk = samples_per_chunk * 2; // 16-bit = 2 bytes

    let mut buffer = vec![0u8; bytes_per_chunk];

    info!(
        "Reading audio from stdin (chunk size: {} bytes, {} seconds)",
        bytes_per_chunk, chunk_seconds
    );

    loop {
        // Read a full chunk from stdin, retrying short reads
        let mut total_read = 0;
        while total_read < bytes_per_chunk {
            match handle.read(&mut buffer[total_read..]) {
                Ok(0) => {
                    // EOF reached
                    if total_read > 0 {
                        // Send remaining data
                        let audio_chunk = AudioChunk {
                            audio_data: buffer[..total_read].to_vec(),
                            session_id: String::new(),
                            config: None,
                        };
                        let _ = tx.blocking_send(audio_chunk);
                    }
                    info!("End of stdin reached");
                    return Ok(());
                }
                Ok(n) => {
                    total_read += n;
                }
                Err(e) if e.kind() == io::ErrorKind::Interrupted => {
                    // Retry on interrupt
                    continue;
                }
                Err(e) => {
                    error!("Error reading stdin: {}", e);
                    return Err(e.into());
                }
            }
        }

        // Send the chunk
        let audio_chunk = AudioChunk {
            audio_data: buffer.clone(),
            session_id: String::new(),
            config: None,
        };
        if tx.blocking_send(audio_chunk).is_err() {
            // Receiver dropped, exit
            break;
        }
    }

    Ok(())
}
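For a quick offline test of stdin-transcribe, a WAV file can be converted to the expected raw format (s16le, 16 kHz, mono). A sketch assuming sox is available (the helper script below treats it as an optional dependency); input.wav is a placeholder:

    sox input.wav -t raw -r 16000 -e signed -b 16 -c 1 - | \
        cargo run --bin stdin-transcribe -- --timestamps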

View File

@@ -7,12 +7,10 @@ use anyhow::Result;
 use clap::Parser;
 use futures_util::StreamExt;
 use hound::WavReader;
-use std::fs::File;
 use std::time::Duration;
 use tokio::sync::mpsc;
 use tokio::time;
 use tokio_stream::wrappers::ReceiverStream;
-use tonic::transport::Channel;
 use tracing::info;

 // Import generated protobuf types

View File: src/system_audio_transcribe.rs

@@ -0,0 +1,250 @@
use anyhow::Result;
use clap::Parser;
use cpal::traits::{DeviceTrait, HostTrait, StreamTrait};
use futures_util::StreamExt;
use std::sync::{Arc, Mutex};
use tokio::sync::mpsc as tokio_mpsc;
use tokio_stream::wrappers::ReceiverStream;
use tracing::{error, info, warn};

// Import generated protobuf types
pub mod transcription {
    tonic::include_proto!("transcription");
}

use transcription::{
    transcription_service_client::TranscriptionServiceClient, AudioChunk, AudioConfig,
};
#[derive(Parser, Debug)]
#[command(author, version, about = "Capture and transcribe system audio", long_about = None)]
struct Args {
    /// gRPC server address
    #[arg(short, long, default_value = "http://localhost:50051")]
    server: String,

    /// Language code (e.g., "en", "es", "auto")
    #[arg(short, long, default_value = "en")]
    language: String,

    /// Task: transcribe or translate
    #[arg(short, long, default_value = "transcribe")]
    task: String,

    /// Model to use
    #[arg(short, long, default_value = "base")]
    model: String,

    /// List available audio devices
    #[arg(long)]
    list_devices: bool,

    /// Audio device name or index to use (e.g., "pulse.monitor" for PulseAudio monitor)
    #[arg(short, long)]
    device: Option<String>,
}
#[tokio::main]
async fn main() -> Result<()> {
    tracing_subscriber::fmt::init();
    let args = Args::parse();

    // List devices if requested
    if args.list_devices {
        list_audio_devices()?;
        return Ok(());
    }

    info!("Connecting to transcription service at {}", args.server);
    let mut client = TranscriptionServiceClient::connect(args.server.clone()).await?;

    // Create channel for audio chunks
    let (tx, rx) = tokio_mpsc::channel::<AudioChunk>(100);

    // Start audio capture in a separate thread
    let device_name = args.device.clone();
    std::thread::spawn(move || {
        if let Err(e) = capture_system_audio(tx, device_name) {
            error!("Audio capture error: {}", e);
        }
    });

    // Create the first chunk with configuration
    let config = AudioConfig {
        language: args.language.clone(),
        task: args.task.clone(),
        model: args.model.clone(),
        sample_rate: 16000,
        vad_enabled: true, // Enable VAD to filter silence
    };

    // Send a configuration chunk first
    let config_chunk = AudioChunk {
        audio_data: vec![],
        session_id: "system-audio".to_string(),
        config: Some(config),
    };

    // Create stream from receiver
    let stream_vec = vec![config_chunk];
    let stream = ReceiverStream::new(rx);
    let stream = futures_util::stream::iter(stream_vec).chain(stream);

    // Start streaming transcription
    let request = tonic::Request::new(stream);
    let mut response = client.stream_transcribe(request).await?.into_inner();

    println!("\n🎧 Capturing system audio for transcription...");
    println!("Press Ctrl+C to stop\n");
    println!("{}", "─".repeat(80));

    // Process transcription responses
    while let Some(result) = response.message().await? {
        if !result.text.is_empty() {
            if result.is_final {
                println!("[FINAL] {}", result.text);
            } else {
                print!("\r[INTERIM] {:<80}", result.text);
                use std::io::{self, Write};
                io::stdout().flush()?;
            }
        }
    }

    Ok(())
}
/// List all available audio devices
fn list_audio_devices() -> Result<()> {
    let host = cpal::default_host();

    println!("\n📊 Available Audio Devices:");
    println!("{}", "─".repeat(80));

    // List input devices
    println!("\n🎤 Input Devices:");
    for (idx, device) in host.input_devices()?.enumerate() {
        let name = device.name()?;
        let is_monitor = name.contains("monitor") || name.contains("Monitor")
            || name.contains("loopback") || name.contains("Loopback")
            || name.contains("stereo mix") || name.contains("Stereo Mix");
        if is_monitor {
            println!("  [{}] {} 🔊 (System Audio)", idx, name);
        } else {
            println!("  [{}] {}", idx, name);
        }
    }

    // Show default device
    if let Some(device) = host.default_input_device() {
        println!("\n⭐ Default Input: {}", device.name()?);
    }

    println!("\n💡 Tips for capturing system audio:");
    println!("  Linux:   Look for devices with 'monitor' in the name (PulseAudio/PipeWire)");
    println!("  Windows: Install VB-Cable or enable 'Stereo Mix' in sound settings");
    println!("  macOS:   Install BlackHole or Loopback for system audio capture");

    Ok(())
}
/// Capture audio from system (or specified device)
fn capture_system_audio(tx: tokio_mpsc::Sender<AudioChunk>, device_name: Option<String>) -> Result<()> {
    let host = cpal::default_host();

    // Find the appropriate audio device
    let device = if let Some(name) = device_name {
        // Try to find device by name (substring match)
        let mut found_device = None;
        for input_device in host.input_devices()? {
            if input_device.name()?.contains(&name) {
                found_device = Some(input_device);
                break;
            }
        }
        found_device.ok_or_else(|| {
            anyhow::anyhow!("Device '{}' not found. Use --list-devices to see available devices.", name)
        })?
    } else {
        // Try to find a monitor/loopback device automatically
        let mut monitor_device = None;
        for input_device in host.input_devices()? {
            let name = input_device.name()?;
            if name.contains("monitor") || name.contains("Monitor")
                || name.contains("loopback") || name.contains("Loopback")
                || name.contains("stereo mix") || name.contains("Stereo Mix")
            {
                info!("Found system audio device: {}", name);
                monitor_device = Some(input_device);
                break;
            }
        }
        if let Some(device) = monitor_device {
            device
        } else {
            warn!("No system audio device found. Using default input device.");
            warn!("To capture system audio, you may need to:");
            warn!("  - Linux:   Enable PulseAudio monitor");
            warn!("  - Windows: Enable Stereo Mix or install VB-Cable");
            warn!("  - macOS:   Install BlackHole or Loopback");
            host.default_input_device()
                .ok_or_else(|| anyhow::anyhow!("No input device available"))?
        }
    };

    info!("Using audio device: {}", device.name()?);

    // Configure audio capture for 16kHz mono PCM16
    let config = cpal::StreamConfig {
        channels: 1,
        sample_rate: cpal::SampleRate(16000),
        buffer_size: cpal::BufferSize::Default,
    };

    // Buffer to accumulate audio samples
    let buffer = Arc::new(Mutex::new(Vec::new()));
    let buffer_clone = buffer.clone();

    // Create audio stream
    let stream = device.build_input_stream(
        &config,
        move |data: &[i16], _: &cpal::InputCallbackInfo| {
            let mut buf = buffer_clone.lock().unwrap();
            buf.extend_from_slice(data);

            // Send chunks of ~3 seconds (16000 samples/s * 3 s = 48000 samples) for better accuracy
            while buf.len() >= 48000 {
                let chunk: Vec<i16> = buf.drain(..48000).collect();

                // Convert i16 samples to little-endian bytes
                let bytes: Vec<u8> = chunk.iter()
                    .flat_map(|&sample| sample.to_le_bytes())
                    .collect();

                let audio_chunk = AudioChunk {
                    audio_data: bytes,
                    session_id: String::new(),
                    config: None,
                };

                // Send without blocking the audio callback; try_send is used here
                // because tokio::spawn would panic on this non-runtime cpal thread.
                // The chunk is dropped if the channel is full or closed.
                let _ = tx.try_send(audio_chunk);
            }
        },
        move |err| {
            error!("Audio stream error: {}", err);
        },
        None,
    )?;

    // Start the stream
    stream.play()?;
    info!("Audio capture started");

    // Keep the stream alive
    loop {
        std::thread::sleep(std::time::Duration::from_secs(1));
    }
}
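Since the --device lookup above is a substring (contains) match, any unique fragment of the device name is enough. A hedged sketch for finding a PulseAudio/PipeWire monitor source, assuming the cpal host exposes it under a similar name:

    # List candidate monitor sources
    pactl list sources short | grep -i monitor

    # Any unique substring of the reported name should match
    cargo run --bin system-audio -- --device monitor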

View File

@@ -0,0 +1,139 @@
#!/bin/bash
# Enhanced script for transcribing video calls on Ubuntu with PipeWire
# Uses parec (PulseAudio compatibility) to capture system audio

set -e

echo "🎥 Video Call Transcription Service"
echo "===================================="
echo ""

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# Check dependencies
check_dependency() {
    if ! command -v "$1" &> /dev/null; then
        echo -e "${RED}$1 not found.${NC}"
        echo "Please install: sudo apt-get install $2"
        return 1
    fi
    return 0
}
echo "Checking dependencies..."
check_dependency "parec" "pulseaudio-utils" || exit 1
check_dependency "sox" "sox" || echo -e "${YELLOW}⚠️ sox not installed (optional but recommended)${NC}"
# Function to find the monitor source for system audio
find_monitor_source() {
# List all sources and find monitors (what you hear)
local monitors=$(pactl list sources short 2>/dev/null | grep -i "monitor" | awk '{print $2}')
if [ -z "$monitors" ]; then
# Try pacmd if pactl doesn't work
monitors=$(pacmd list-sources 2>/dev/null | grep "name:" | grep "monitor" | sed 's/.*<\(.*\)>.*/\1/')
fi
if [ -z "$monitors" ]; then
# Fallback: try to construct monitor name from default sink
local default_sink=$(pactl info 2>/dev/null | grep "Default Sink" | cut -d: -f2 | xargs)
if [ -n "$default_sink" ]; then
monitors="${default_sink}.monitor"
fi
fi
echo "$monitors" | head -1
}
# List available sources
if [ "$1" == "--list" ]; then
    echo -e "${GREEN}📊 Available Audio Sources:${NC}"
    echo ""
    pactl list sources short 2>/dev/null || pacmd list-sources 2>/dev/null | grep "name:"
    echo ""
    echo -e "${GREEN}💡 Monitor sources (system audio):${NC}"
    pactl list sources short 2>/dev/null | grep -i "monitor" || echo "No monitor sources found"
    exit 0
fi

# Help message
if [ "$1" == "--help" ] || [ "$1" == "-h" ]; then
    echo "Usage: $0 [OPTIONS]"
    echo ""
    echo "Options:"
    echo "  --list            List all available audio sources"
    echo "  --source SOURCE   Use specific audio source"
    echo "  --microphone      Capture microphone instead of system audio"
    echo "  --combined        Capture both microphone and system audio"
    echo "  --help, -h        Show this help message"
    echo ""
    echo "Examples:"
    echo "  $0                # Auto-detect and transcribe system audio"
    echo "  $0 --microphone   # Transcribe from microphone"
    echo "  $0 --combined     # Transcribe both mic and system audio"
    echo ""
    exit 0
fi
# Determine what to capture
if [ "$1" == "--microphone" ]; then
    echo -e "${GREEN}🎤 Using microphone input${NC}"
    # Run the existing live-transcribe for microphone
    # (exec replaces this shell, so nothing after it runs)
    exec cargo run --bin live-transcribe
elif [ "$1" == "--combined" ]; then
    echo -e "${YELLOW}🎤+🔊 Combined audio capture not yet implemented${NC}"
    echo "For now, please run two separate instances:"
    echo "  1. $0               (for system audio)"
    echo "  2. $0 --microphone  (for mic)"
    exit 1
elif [ "$1" == "--source" ] && [ -n "$2" ]; then
    SOURCE="$2"
    echo -e "${GREEN}📡 Using specified source: $SOURCE${NC}"
else
    # Auto-detect monitor source
    SOURCE=$(find_monitor_source)
    if [ -z "$SOURCE" ]; then
        echo -e "${RED}❌ Could not find system audio monitor source${NC}"
        echo ""
        echo "This might happen if:"
        echo "  1. No audio is currently playing"
        echo "  2. PipeWire/PulseAudio is not running"
        echo ""
        echo "Try:"
        echo "  1. Play some audio (music/video)"
        echo "  2. Run: $0 --list"
        echo "  3. Use a specific source: $0 --source <source_name>"
        exit 1
    fi
    echo -e "${GREEN}📡 Found system audio source: $SOURCE${NC}"
fi
echo ""
echo -e "${GREEN}🎬 Starting video call transcription...${NC}"
echo -e "${YELLOW}Press Ctrl+C to stop${NC}"
echo ""
echo "💡 Tips for best results:"
echo " • Join your video call first"
echo " • Use headphones to avoid echo"
echo " • Close other audio sources (music, videos)"
echo " • Speak clearly for better transcription"
echo ""
echo "────────────────────────────────────────────────────────────────────"
echo ""
# Start audio capture and transcription
echo -e "${GREEN}Starting audio capture from: $SOURCE${NC}"
echo -e "${GREEN}Starting transcription service...${NC}"
echo ""
# Use our new stdin-transcribe binary that accepts piped audio
# parec captures system audio and pipes it directly to our transcriber
# --no-vad disables Voice Activity Detection for system audio (YouTube, music, etc.)
parec --format=s16le --rate=16000 --channels=1 --device="$SOURCE" 2>/dev/null | \
cargo run --bin stdin-transcribe -- --language en --chunk-seconds 2.5 --no-vad
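
The script's --combined mode is still a stub; one possible approach is to mix the microphone and the system monitor into a PulseAudio null sink and capture that sink's monitor. A sketch only: module-null-sink and module-loopback are standard PulseAudio modules, but the <...> source names are placeholders to fill in from pactl list sources short.

    # Hypothetical setup for --combined: mix mic + system audio into one sink
    pactl load-module module-null-sink sink_name=combined
    pactl load-module module-loopback source=<mic_source> sink=combined
    pactl load-module module-loopback source=<output_sink>.monitor sink=combined

    # Capture the mix and pipe it to the transcriber
    parec --format=s16le --rate=16000 --channels=1 --device=combined.monitor | \
        cargo run --bin stdin-transcribe -- --no-vad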