From 1707bf917d4b6b8519135cafd9e97deecb15f9e9 Mon Sep 17 00:00:00 2001
From: Aljaz Ceru
Date: Thu, 11 Sep 2025 16:33:46 +0200
Subject: [PATCH] transcription api

---
 examples/rust-client/Cargo.toml               |  10 +-
 examples/rust-client/src/realtime_playback.rs |   2 +-
 examples/rust-client/src/stdin_transcribe.rs  | 209 +++++++++++++
 examples/rust-client/src/stream_transcribe.rs |   2 -
 .../src/system_audio_transcribe.rs            | 286 ++++++++++++++++++
 examples/rust-client/transcribe_video_call.sh | 146 +++++++++
 6 files changed, 651 insertions(+), 4 deletions(-)
 create mode 100644 examples/rust-client/src/stdin_transcribe.rs
 create mode 100644 examples/rust-client/src/system_audio_transcribe.rs
 create mode 100755 examples/rust-client/transcribe_video_call.sh

diff --git a/examples/rust-client/Cargo.toml b/examples/rust-client/Cargo.toml
index 008fd98..bef0cc2 100644
--- a/examples/rust-client/Cargo.toml
+++ b/examples/rust-client/Cargo.toml
@@ -39,4 +39,12 @@ path = "src/live_transcribe.rs"
 
 [[bin]]
 name = "realtime-playback"
-path = "src/realtime_playback.rs"
\ No newline at end of file
+path = "src/realtime_playback.rs"
+
+[[bin]]
+name = "system-audio"
+path = "src/system_audio_transcribe.rs"
+
+[[bin]]
+name = "stdin-transcribe"
+path = "src/stdin_transcribe.rs"
\ No newline at end of file
diff --git a/examples/rust-client/src/realtime_playback.rs b/examples/rust-client/src/realtime_playback.rs
index 3c64cb5..81fd311 100644
--- a/examples/rust-client/src/realtime_playback.rs
+++ b/examples/rust-client/src/realtime_playback.rs
@@ -76,7 +76,7 @@ async fn main() -> Result<()> {
     // Also open the file for streaming to transcription service
     // We need to read the raw audio data for transcription
     let mut wav_reader = WavReader::open(&file_path)?;
-    let wav_spec = wav_reader.spec();
+    let _wav_spec = wav_reader.spec();
 
     // Collect all samples for streaming
     let samples: Vec<i16> = wav_reader.samples::<i16>()
diff --git a/examples/rust-client/src/stdin_transcribe.rs b/examples/rust-client/src/stdin_transcribe.rs
new file mode 100644
index 0000000..0d2dcb6
--- /dev/null
+++ b/examples/rust-client/src/stdin_transcribe.rs
@@ -0,0 +1,209 @@
+//! Transcribe raw PCM audio piped in on stdin.
+//!
+//! Expects 16 kHz, 16-bit little-endian, mono samples (for example the output of
+//! `parec --format=s16le --rate=16000 --channels=1`) and streams them to the
+//! gRPC transcription service in fixed-size chunks.
+
+use anyhow::{ensure, Result};
+use clap::Parser;
+use futures_util::StreamExt;
+use std::io::{self, Read, Write};
+use tokio::sync::mpsc;
+use tokio_stream::wrappers::ReceiverStream;
+use tracing::{error, info};
+
+// Import generated protobuf types
+pub mod transcription {
+    tonic::include_proto!("transcription");
+}
+
+use transcription::{
+    transcription_service_client::TranscriptionServiceClient, AudioChunk, AudioConfig,
+};
+
+#[derive(Parser, Debug)]
+#[command(author, version, about = "Transcribe audio from stdin (for piping from parec)", long_about = None)]
+struct Args {
+    /// gRPC server address
+    #[arg(short, long, default_value = "http://localhost:50051")]
+    server: String,
+
+    /// Language code (e.g., "en", "es", "auto")
+    #[arg(short, long, default_value = "en")]
+    language: String,
+
+    /// Task: transcribe or translate
+    #[arg(short, long, default_value = "transcribe")]
+    task: String,
+
+    /// Model to use
+    #[arg(short, long, default_value = "base")]
+    model: String,
+
+    /// Show timestamps
+    #[arg(short = 'T', long)]
+    timestamps: bool,
+
+    /// Chunk size in seconds (for buffering); must be a positive number
+    #[arg(short, long, default_value = "3.0")]
+    chunk_seconds: f32,
+
+    /// Disable VAD (Voice Activity Detection) - useful for music/system audio
+    #[arg(long)]
+    no_vad: bool,
+}
+
+/// Connects to the transcription service, streams stdin audio to it and prints
+/// interim and final results as they arrive.
+#[tokio::main]
+async fn main() -> Result<()> {
+    tracing_subscriber::fmt::init();
+    let args = Args::parse();
+    ensure!(
+        args.chunk_seconds.is_finite() && args.chunk_seconds > 0.0,
+        "--chunk-seconds must be a positive number (got {})",
+        args.chunk_seconds
+    );
+
+    info!("Connecting to transcription service at {}", args.server);
+    let mut client = TranscriptionServiceClient::connect(args.server.clone()).await?;
+
+    // Create channel for audio chunks
+    let (tx, rx) = mpsc::channel::<AudioChunk>(100);
+
+    // Read stdin on a dedicated thread because the reads block. The sender is
+    // moved (not cloned) into the thread: when stdin reaches EOF the only sender
+    // is dropped, which ends the request stream so the server can finish and
+    // this program can exit instead of hanging.
+    let chunk_seconds = args.chunk_seconds;
+    std::thread::spawn(move || {
+        if let Err(e) = read_stdin_and_send(tx, chunk_seconds) {
+            error!("Error reading stdin: {}", e);
+        }
+    });
+
+    // Create the first chunk with configuration
+    let config = AudioConfig {
+        language: args.language.clone(),
+        task: args.task.clone(),
+        model: args.model.clone(),
+        sample_rate: 16000,
+        vad_enabled: !args.no_vad, // Disable VAD if --no-vad flag is used
+    };
+
+    // Send a configuration chunk first
+    let config_chunk = AudioChunk {
+        audio_data: vec![],
+        session_id: "stdin-transcribe".to_string(),
+        config: Some(config),
+    };
+
+    // Create stream from receiver
+    let stream = ReceiverStream::new(rx);
+    let stream = futures_util::stream::iter(vec![config_chunk]).chain(stream);
+
+    // Start streaming transcription
+    let request = tonic::Request::new(stream);
+    let mut response = client.stream_transcribe(request).await?.into_inner();
+
+    println!("\n🎧 Transcribing audio from stdin...");
+    println!("Press Ctrl+C to stop\n");
+    println!("{}", "─".repeat(80));
+
+    // True while an interim result is on screen without a trailing newline
+    let mut interim_shown = false;
+
+    // Process transcription responses
+    while let Some(result) = response.message().await? {
+        if result.text.is_empty() {
+            continue;
+        }
+
+        let prefix = if args.timestamps {
+            format!("[{:.1}s] ", result.start_time)
+        } else {
+            String::new()
+        };
+
+        if result.is_final {
+            if interim_shown {
+                // Overwrite the interim line rather than appending to it
+                println!("\r{}{:<80}", prefix, result.text);
+            } else {
+                println!("{}{}", prefix, result.text);
+            }
+            interim_shown = false;
+        } else {
+            print!("\r{}{:<80}", prefix, result.text);
+            io::stdout().flush()?;
+            interim_shown = true;
+        }
+    }
+
+    // Clear any remaining interim text
+    if interim_shown {
+        println!();
+    }
+
+    Ok(())
+}
+
+/// Reads raw PCM from stdin and forwards it to `tx` in fixed-size chunks.
+///
+/// Blocks the calling thread, so it must not run on the async runtime. Returns
+/// `Ok(())` at end of input (after sending any partial chunk) or once the
+/// receiver has been dropped, and `Err` on a non-recoverable read error.
+fn read_stdin_and_send(tx: mpsc::Sender<AudioChunk>, chunk_seconds: f32) -> Result<()> {
+    let stdin = io::stdin();
+    let mut handle = stdin.lock();
+
+    // Calculate chunk size in bytes (16kHz, 16-bit mono). Clamp to at least one
+    // sample: a zero-length buffer would make the loop below spin forever,
+    // sending empty chunks without ever reading stdin.
+    let samples_per_chunk = ((16000.0 * chunk_seconds) as usize).max(1);
+    let bytes_per_chunk = samples_per_chunk.saturating_mul(2); // 16-bit = 2 bytes
+
+    let mut buffer = vec![0u8; bytes_per_chunk];
+
+    info!(
+        "Reading audio from stdin (chunk size: {} bytes, {} seconds)",
+        bytes_per_chunk, chunk_seconds
+    );
+
+    loop {
+        // Fill the buffer completely unless EOF arrives first
+        let mut filled = 0;
+        let mut eof = false;
+        while filled < buffer.len() {
+            match handle.read(&mut buffer[filled..]) {
+                Ok(0) => {
+                    eof = true;
+                    break;
+                }
+                Ok(n) => filled += n,
+                // Retry on interrupt
+                Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
+                Err(e) => return Err(e.into()),
+            }
+        }
+
+        // At EOF a dangling odd byte is half a sample; drop it
+        let usable = filled - filled % 2;
+        if usable > 0 {
+            let audio_chunk = AudioChunk {
+                audio_data: buffer[..usable].to_vec(),
+                session_id: String::new(),
+                config: None,
+            };
+            if tx.blocking_send(audio_chunk).is_err() {
+                // Receiver dropped, exit
+                return Ok(());
+            }
+        }
+
+        if eof {
+            info!("End of stdin reached");
+            return Ok(());
+        }
+    }
+}
diff --git a/examples/rust-client/src/stream_transcribe.rs b/examples/rust-client/src/stream_transcribe.rs
index e00747f..c5fb443 100644
--- a/examples/rust-client/src/stream_transcribe.rs
+++ b/examples/rust-client/src/stream_transcribe.rs
@@ -7,12 +7,10 @@ use anyhow::Result;
 use clap::Parser;
 use futures_util::StreamExt;
 use hound::WavReader;
-use std::fs::File;
 use std::time::Duration;
 use tokio::sync::mpsc;
 use tokio::time;
 use tokio_stream::wrappers::ReceiverStream;
-use tonic::transport::Channel;
 use tracing::info;
 
 // Import generated protobuf types
diff --git a/examples/rust-client/src/system_audio_transcribe.rs b/examples/rust-client/src/system_audio_transcribe.rs
new file mode 100644
index 0000000..b6d96d9
--- /dev/null
+++ b/examples/rust-client/src/system_audio_transcribe.rs
@@ -0,0 +1,286 @@
+//! Capture system audio (or any input device) with cpal and stream it to the
+//! gRPC transcription service as 16 kHz, 16-bit mono PCM.
+
+use anyhow::Result;
+use clap::Parser;
+use cpal::traits::{DeviceTrait, HostTrait, StreamTrait};
+use futures_util::StreamExt;
+use std::io::{self, Write};
+use tokio::sync::mpsc as tokio_mpsc;
+use tokio_stream::wrappers::ReceiverStream;
+use tracing::{error, info, warn};
+
+// Import generated protobuf types
+pub mod transcription {
+    tonic::include_proto!("transcription");
+}
+
+use transcription::{
+    transcription_service_client::TranscriptionServiceClient, AudioChunk, AudioConfig,
+};
+
+/// Samples per chunk sent to the server: ~3 seconds at 16kHz for better accuracy
+const SAMPLES_PER_CHUNK: usize = 48_000;
+
+#[derive(Parser, Debug)]
+#[command(author, version, about = "Capture and transcribe system audio", long_about = None)]
+struct Args {
+    /// gRPC server address
+    #[arg(short, long, default_value = "http://localhost:50051")]
+    server: String,
+
+    /// Language code (e.g., "en", "es", "auto")
+    #[arg(short, long, default_value = "en")]
+    language: String,
+
+    /// Task: transcribe or translate
+    #[arg(short, long, default_value = "transcribe")]
+    task: String,
+
+    /// Model to use
+    #[arg(short, long, default_value = "base")]
+    model: String,
+
+    /// List available audio devices
+    #[arg(long)]
+    list_devices: bool,
+
+    /// Audio device name or index to use (e.g., "pulse.monitor" for PulseAudio monitor)
+    #[arg(short, long)]
+    device: Option<String>,
+}
+
+/// Streams captured audio to the transcription service and prints the results.
+#[tokio::main]
+async fn main() -> Result<()> {
+    tracing_subscriber::fmt::init();
+    let args = Args::parse();
+
+    // List devices if requested
+    if args.list_devices {
+        list_audio_devices()?;
+        return Ok(());
+    }
+
+    info!("Connecting to transcription service at {}", args.server);
+    let mut client = TranscriptionServiceClient::connect(args.server.clone()).await?;
+
+    // Create channel for audio chunks
+    let (tx, rx) = tokio_mpsc::channel::<AudioChunk>(100);
+
+    // Start audio capture in a separate thread
+    let device_name = args.device.clone();
+    std::thread::spawn(move || {
+        if let Err(e) = capture_system_audio(tx, device_name) {
+            error!("Audio capture error: {}", e);
+        }
+    });
+
+    // Create the first chunk with configuration
+    let config = AudioConfig {
+        language: args.language.clone(),
+        task: args.task.clone(),
+        model: args.model.clone(),
+        sample_rate: 16000,
+        vad_enabled: true, // Enable VAD to filter silence
+    };
+
+    // Send a configuration chunk first
+    let config_chunk = AudioChunk {
+        audio_data: vec![],
+        session_id: "system-audio".to_string(),
+        config: Some(config),
+    };
+
+    // Create stream from receiver
+    let stream = ReceiverStream::new(rx);
+    let stream = futures_util::stream::iter(vec![config_chunk]).chain(stream);
+
+    // Start streaming transcription
+    let request = tonic::Request::new(stream);
+    let mut response = client.stream_transcribe(request).await?.into_inner();
+
+    println!("\n🎧 Capturing system audio for transcription...");
+    println!("Press Ctrl+C to stop\n");
+    println!("{}", "─".repeat(80));
+
+    // True while an interim result is on screen without a trailing newline
+    let mut interim_shown = false;
+
+    // Process transcription responses
+    while let Some(result) = response.message().await? {
+        if result.text.is_empty() {
+            continue;
+        }
+
+        if result.is_final {
+            if interim_shown {
+                // Overwrite the interim line rather than appending to it
+                println!("\r[FINAL] {:<82}", result.text);
+            } else {
+                println!("[FINAL] {}", result.text);
+            }
+            interim_shown = false;
+        } else {
+            print!("\r[INTERIM] {:<80}", result.text);
+            io::stdout().flush()?;
+            interim_shown = true;
+        }
+    }
+
+    Ok(())
+}
+
+/// Returns true when a device name looks like a system-audio (loopback) source.
+fn is_system_audio_device(name: &str) -> bool {
+    let name = name.to_lowercase();
+    ["monitor", "loopback", "stereo mix"]
+        .iter()
+        .any(|keyword| name.contains(*keyword))
+}
+
+/// List all available audio devices
+fn list_audio_devices() -> Result<()> {
+    let host = cpal::default_host();
+
+    println!("\n📊 Available Audio Devices:");
+    println!("{}", "─".repeat(80));
+
+    // List input devices
+    println!("\n🎤 Input Devices:");
+    for (idx, device) in host.input_devices()?.enumerate() {
+        let name = device.name()?;
+        if is_system_audio_device(&name) {
+            println!("  [{}] {} 🔊 (System Audio)", idx, name);
+        } else {
+            println!("  [{}] {}", idx, name);
+        }
+    }
+
+    // Show default device
+    if let Some(device) = host.default_input_device() {
+        println!("\n⭐ Default Input: {}", device.name()?);
+    }
+
+    println!("\n💡 Tips for capturing system audio:");
+    println!("  Linux: Look for devices with 'monitor' in the name (PulseAudio/PipeWire)");
+    println!("  Windows: Install VB-Cable or enable 'Stereo Mix' in sound settings");
+    println!("  macOS: Install BlackHole or Loopback for system audio capture");
+
+    Ok(())
+}
+
+/// Picks the input device to capture from.
+///
+/// `requested` may be an index as printed by `--list-devices` or a substring of
+/// the device name. Without a request, the first monitor/loopback device is
+/// used, falling back to the default input device.
+fn select_input_device(host: &cpal::Host, requested: Option<&str>) -> Result<cpal::Device> {
+    if let Some(wanted) = requested {
+        // A numeric argument is an index from --list-devices
+        if let Ok(index) = wanted.parse::<usize>() {
+            if let Some(device) = host.input_devices()?.nth(index) {
+                return Ok(device);
+            }
+        }
+
+        // Otherwise try to find device by name
+        for device in host.input_devices()? {
+            if device.name()?.contains(wanted) {
+                return Ok(device);
+            }
+        }
+        anyhow::bail!(
+            "Device '{}' not found. Use --list-devices to see available devices.",
+            wanted
+        );
+    }
+
+    // Try to find a monitor/loopback device automatically
+    for device in host.input_devices()? {
+        let name = device.name()?;
+        if is_system_audio_device(&name) {
+            info!("Found system audio device: {}", name);
+            return Ok(device);
+        }
+    }
+
+    warn!("No system audio device found. Using default input device.");
+    warn!("To capture system audio, you may need to:");
+    warn!("  - Linux: Enable PulseAudio monitor");
+    warn!("  - Windows: Enable Stereo Mix or install VB-Cable");
+    warn!("  - macOS: Install BlackHole or Loopback");
+    host.default_input_device()
+        .ok_or_else(|| anyhow::anyhow!("No input device available"))
+}
+
+/// Capture audio from system (or specified device)
+///
+/// Blocks the calling thread for as long as the transcription stream is open,
+/// forwarding ~3 second chunks of little-endian PCM16 to `tx`.
+fn capture_system_audio(
+    tx: tokio_mpsc::Sender<AudioChunk>,
+    device_name: Option<String>,
+) -> Result<()> {
+    let host = cpal::default_host();
+    let device = select_input_device(&host, device_name.as_deref())?;
+    info!("Using audio device: {}", device.name()?);
+
+    // Configure audio capture for 16kHz mono PCM16
+    let config = cpal::StreamConfig {
+        channels: 1,
+        sample_rate: cpal::SampleRate(16000),
+        buffer_size: cpal::BufferSize::Default,
+    };
+
+    // Samples accumulated between callbacks; owned by the callback itself
+    let mut pending: Vec<i16> = Vec::with_capacity(SAMPLES_PER_CHUNK * 2);
+    let chunk_tx = tx.clone();
+
+    // Create audio stream
+    let stream = device.build_input_stream(
+        &config,
+        move |data: &[i16], _: &cpal::InputCallbackInfo| {
+            pending.extend_from_slice(data);
+
+            while pending.len() >= SAMPLES_PER_CHUNK {
+                // Convert i16 to bytes
+                let bytes: Vec<u8> = pending
+                    .drain(..SAMPLES_PER_CHUNK)
+                    .flat_map(|sample| sample.to_le_bytes())
+                    .collect();
+
+                let audio_chunk = AudioChunk {
+                    audio_data: bytes,
+                    session_id: String::new(),
+                    config: None,
+                };
+
+                // This callback runs on cpal's audio thread, outside the Tokio
+                // runtime, so `tokio::spawn` would panic here. `try_send` needs
+                // no runtime and never blocks the audio thread.
+                if let Err(tokio_mpsc::error::TrySendError::Full(_)) =
+                    chunk_tx.try_send(audio_chunk)
+                {
+                    warn!("Transcription backlog is full; dropping an audio chunk");
+                }
+            }
+        },
+        move |err| {
+            error!("Audio stream error: {}", err);
+        },
+        None,
+    )?;
+
+    // Start the stream
+    stream.play()?;
+    info!("Audio capture started");
+
+    // Keep the stream alive until the receiving side goes away
+    while !tx.is_closed() {
+        std::thread::sleep(std::time::Duration::from_secs(1));
+    }
+    info!("Transcription stream closed; stopping audio capture");
+
+    Ok(())
+}
diff --git a/examples/rust-client/transcribe_video_call.sh b/examples/rust-client/transcribe_video_call.sh
new file mode 100755
index 0000000..809434a
--- /dev/null
+++ b/examples/rust-client/transcribe_video_call.sh
@@ -0,0 +1,146 @@
+#!/bin/bash
+
+# Enhanced script for transcribing video calls on Ubuntu with PipeWire
+# Uses parec (PulseAudio compatibility) to capture system audio
+
+set -e
+
+# cargo must be run from inside the crate, wherever the script is invoked from
+cd "$(dirname "$0")"
+
+echo "🎥 Video Call Transcription Service"
+echo "===================================="
+echo ""
+
+# Colors for output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+NC='\033[0m' # No Color
+
+# Help message (handled first so it works even when dependencies are missing)
+if [ "$1" == "--help" ] || [ "$1" == "-h" ]; then
+    echo "Usage: $0 [OPTIONS]"
+    echo ""
+    echo "Options:"
+    echo "  --list              List all available audio sources"
+    echo "  --source SOURCE     Use specific audio source"
+    echo "  --microphone        Capture microphone instead of system audio"
+    echo "  --combined          Capture both microphone and system audio"
+    echo "  --help, -h          Show this help message"
+    echo ""
+    echo "Examples:"
+    echo "  $0                  # Auto-detect and transcribe system audio"
+    echo "  $0 --microphone     # Transcribe from microphone"
+    echo "  $0 --combined       # Transcribe both mic and system audio"
+    echo ""
+    exit 0
+fi
+
+# Check dependencies
+# $1: command to look for, $2: apt package that provides it
+check_dependency() {
+    if ! command -v "$1" &> /dev/null; then
+        echo -e "${RED}❌ $1 not found.${NC}"
+        echo "Please install: sudo apt-get install $2"
+        return 1
+    fi
+    return 0
+}
+
+echo "Checking dependencies..."
+check_dependency "parec" "pulseaudio-utils" || exit 1
+check_dependency "sox" "sox" || echo -e "${YELLOW}⚠️  sox not installed (optional but recommended)${NC}"
+
+# Function to find the monitor source for system audio
+find_monitor_source() {
+    # List all sources and find monitors (what you hear)
+    local monitors=$(pactl list sources short 2>/dev/null | grep -i "monitor" | awk '{print $2}')
+
+    if [ -z "$monitors" ]; then
+        # Try pacmd if pactl doesn't work
+        monitors=$(pacmd list-sources 2>/dev/null | grep "name:" | grep "monitor" | sed 's/.*<\(.*\)>.*/\1/')
+    fi
+
+    if [ -z "$monitors" ]; then
+        # Fallback: try to construct monitor name from default sink
+        local default_sink=$(pactl info 2>/dev/null | grep "Default Sink" | cut -d: -f2 | xargs)
+        if [ -n "$default_sink" ]; then
+            monitors="${default_sink}.monitor"
+        fi
+    fi
+
+    echo "$monitors" | head -1
+}
+
+# List available sources
+if [ "$1" == "--list" ]; then
+    echo -e "${GREEN}📊 Available Audio Sources:${NC}"
+    echo ""
+    # The trailing fallback keeps `set -e` from aborting when both tools fail
+    pactl list sources short 2>/dev/null || pacmd list-sources 2>/dev/null | grep "name:" || echo "No audio sources found"
+    echo ""
+    echo -e "${GREEN}💡 Monitor sources (system audio):${NC}"
+    pactl list sources short 2>/dev/null | grep -i "monitor" || echo "No monitor sources found"
+    exit 0
+fi
+
+# Determine what to capture
+if [ "$1" == "--microphone" ]; then
+    echo -e "${GREEN}🎤 Using microphone input${NC}"
+    # Run the existing live-transcribe for microphone (exec replaces this shell)
+    exec cargo run --bin live-transcribe
+elif [ "$1" == "--combined" ]; then
+    echo -e "${YELLOW}🎤+🔊 Combined audio capture not yet implemented${NC}"
+    echo "For now, please run two separate instances:"
+    echo "  1. $0 (for system audio)"
+    echo "  2. $0 --microphone (for mic)"
+    exit 1
+elif [ "$1" == "--source" ]; then
+    if [ -z "$2" ]; then
+        echo -e "${RED}❌ --source requires a source name (see: $0 --list)${NC}"
+        exit 1
+    fi
+    SOURCE="$2"
+    echo -e "${GREEN}📡 Using specified source: $SOURCE${NC}"
+else
+    # Auto-detect monitor source
+    SOURCE=$(find_monitor_source)
+    if [ -z "$SOURCE" ]; then
+        echo -e "${RED}❌ Could not find system audio monitor source${NC}"
+        echo ""
+        echo "This might happen if:"
+        echo "  1. No audio is currently playing"
+        echo "  2. PipeWire/PulseAudio is not running"
+        echo ""
+        echo "Try:"
+        echo "  1. Play some audio (music/video)"
+        echo "  2. Run: $0 --list"
+        echo "  3. Use a specific source: $0 --source SOURCE"
+        exit 1
+    fi
+    echo -e "${GREEN}📡 Found system audio source: $SOURCE${NC}"
+fi
+
+echo ""
+echo -e "${GREEN}🎬 Starting video call transcription...${NC}"
+echo -e "${YELLOW}Press Ctrl+C to stop${NC}"
+echo ""
+echo "💡 Tips for best results:"
+echo "   • Join your video call first"
+echo "   • Use headphones to avoid echo"
+echo "   • Close other audio sources (music, videos)"
+echo "   • Speak clearly for better transcription"
+echo ""
+echo "────────────────────────────────────────────────────────────────────"
+echo ""
+
+# Start audio capture and transcription
+echo -e "${GREEN}Starting audio capture from: $SOURCE${NC}"
+echo -e "${GREEN}Starting transcription service...${NC}"
+echo ""
+
+# Use our new stdin-transcribe binary that accepts piped audio
+# parec captures system audio and pipes it directly to our transcriber
+# --no-vad disables Voice Activity Detection for system audio (YouTube, music, etc.)
+parec --format=s16le --rate=16000 --channels=1 --device="$SOURCE" 2>/dev/null | \
+cargo run --bin stdin-transcribe -- --language en --chunk-seconds 2.5 --no-vad