transcription api

2025-09-11 16:33:46 +02:00
parent 080dd6776d
commit 1707bf917d
6 changed files with 594 additions and 4 deletions

View File: Cargo.toml

@@ -40,3 +40,11 @@ path = "src/live_transcribe.rs"
[[bin]]
name = "realtime-playback"
path = "src/realtime_playback.rs"

[[bin]]
name = "system-audio"
path = "src/system_audio_transcribe.rs"

[[bin]]
name = "stdin-transcribe"
path = "src/stdin_transcribe.rs"

View File

@@ -76,7 +76,7 @@ async fn main() -> Result<()> {
    // Also open the file for streaming to transcription service
    // We need to read the raw audio data for transcription
    let mut wav_reader = WavReader::open(&file_path)?;
-   let wav_spec = wav_reader.spec();
+   let _wav_spec = wav_reader.spec();

    // Collect all samples for streaming
    let samples: Vec<i16> = wav_reader.samples::<i16>()

View File: src/stdin_transcribe.rs

@@ -0,0 +1,195 @@
use anyhow::Result;
use clap::Parser;
use futures_util::StreamExt;
use std::io::{self, Read};
use tokio::sync::mpsc;
use tokio_stream::wrappers::ReceiverStream;
use tracing::{error, info};

// Import generated protobuf types
pub mod transcription {
    tonic::include_proto!("transcription");
}

use transcription::{
    transcription_service_client::TranscriptionServiceClient, AudioChunk, AudioConfig,
};
#[derive(Parser, Debug)]
#[command(author, version, about = "Transcribe audio from stdin (for piping from parec)", long_about = None)]
struct Args {
    /// gRPC server address
    #[arg(short, long, default_value = "http://localhost:50051")]
    server: String,

    /// Language code (e.g., "en", "es", "auto")
    #[arg(short, long, default_value = "en")]
    language: String,

    /// Task: transcribe or translate
    #[arg(short, long, default_value = "transcribe")]
    task: String,

    /// Model to use
    #[arg(short, long, default_value = "base")]
    model: String,

    /// Show timestamps
    #[arg(short = 'T', long)]
    timestamps: bool,

    /// Chunk size in seconds (for buffering)
    #[arg(short, long, default_value = "3.0")]
    chunk_seconds: f32,

    /// Disable VAD (Voice Activity Detection) - useful for music/system audio
    #[arg(long)]
    no_vad: bool,
}
#[tokio::main]
async fn main() -> Result<()> {
    tracing_subscriber::fmt::init();
    let args = Args::parse();

    info!("Connecting to transcription service at {}", args.server);
    let mut client = TranscriptionServiceClient::connect(args.server.clone()).await?;

    // Create channel for audio chunks
    let (tx, rx) = mpsc::channel::<AudioChunk>(100);

    // Spawn task to read from stdin and send chunks
    let tx_clone = tx.clone();
    let chunk_seconds = args.chunk_seconds;
    std::thread::spawn(move || {
        if let Err(e) = read_stdin_and_send(tx_clone, chunk_seconds) {
            error!("Error reading stdin: {}", e);
        }
    });
    // Drop our copy of the sender so the channel (and the request stream)
    // closes once the reader thread finishes at stdin EOF
    drop(tx);

    // Create the first chunk with configuration
    let config = AudioConfig {
        language: args.language.clone(),
        task: args.task.clone(),
        model: args.model.clone(),
        sample_rate: 16000,
        vad_enabled: !args.no_vad, // Disable VAD if --no-vad flag is used
    };

    // Send a configuration chunk first
    let config_chunk = AudioChunk {
        audio_data: vec![],
        session_id: "stdin-transcribe".to_string(),
        config: Some(config),
    };

    // Create stream from receiver
    let stream = ReceiverStream::new(rx);
    let stream = futures_util::stream::iter(vec![config_chunk]).chain(stream);

    // Start streaming transcription
    let request = tonic::Request::new(stream);
    let mut response = client.stream_transcribe(request).await?.into_inner();

    println!("\n🎧 Transcribing audio from stdin...");
    println!("Press Ctrl+C to stop\n");
    println!("{}", "─".repeat(80));

    let mut current_line = String::new();

    // Process transcription responses
    while let Some(result) = response.message().await? {
        if !result.text.is_empty() {
            if args.timestamps {
                if result.is_final {
                    println!("[{:.1}s] {}", result.start_time, result.text);
                    current_line.clear();
                } else {
                    print!("\r[{:.1}s] {:<80}", result.start_time, result.text);
                    use std::io::{self as stdio, Write};
                    stdio::stdout().flush()?;
                    current_line = result.text.clone();
                }
            } else {
                if result.is_final {
                    println!("{}", result.text);
                    current_line.clear();
                } else {
                    print!("\r{:<80}", result.text);
                    use std::io::{self as stdio, Write};
                    stdio::stdout().flush()?;
                    current_line = result.text.clone();
                }
            }
        }
    }

    // Clear any remaining interim text
    if !current_line.is_empty() {
        println!();
    }

    Ok(())
}
fn read_stdin_and_send(tx: mpsc::Sender<AudioChunk>, chunk_seconds: f32) -> Result<()> {
    let stdin = io::stdin();
    let mut handle = stdin.lock();

    // Calculate chunk size in bytes (16kHz, 16-bit mono):
    // e.g. 16000 samples/s * 3.0 s = 48000 samples = 96000 bytes per chunk
    let samples_per_chunk = (16000.0 * chunk_seconds) as usize;
    let bytes_per_chunk = samples_per_chunk * 2; // 16-bit = 2 bytes

    let mut buffer = vec![0u8; bytes_per_chunk];

    info!(
        "Reading audio from stdin (chunk size: {} bytes, {} seconds)",
        bytes_per_chunk, chunk_seconds
    );

    loop {
        // Read a full chunk from stdin, retrying short reads
        let mut total_read = 0;
        while total_read < bytes_per_chunk {
            match handle.read(&mut buffer[total_read..]) {
                Ok(0) => {
                    // EOF reached
                    if total_read > 0 {
                        // Send remaining data
                        let audio_chunk = AudioChunk {
                            audio_data: buffer[..total_read].to_vec(),
                            session_id: String::new(),
                            config: None,
                        };
                        let _ = tx.blocking_send(audio_chunk);
                    }
                    info!("End of stdin reached");
                    return Ok(());
                }
                Ok(n) => {
                    total_read += n;
                }
                Err(e) if e.kind() == io::ErrorKind::Interrupted => {
                    // Retry on interrupt
                    continue;
                }
                Err(e) => {
                    error!("Error reading stdin: {}", e);
                    return Err(e.into());
                }
            }
        }

        // Send the chunk
        let audio_chunk = AudioChunk {
            audio_data: buffer.clone(),
            session_id: String::new(),
            config: None,
        };
        if tx.blocking_send(audio_chunk).is_err() {
            // Receiver dropped, exit
            break;
        }
    }

    Ok(())
}
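For a quick offline test of stdin-transcribe, a WAV file can be converted to the expected raw format (s16le, 16 kHz, mono). A sketch assuming sox is available (the helper script below treats it as an optional dependency); input.wav is a placeholder:

    sox input.wav -t raw -r 16000 -e signed -b 16 -c 1 - | \
        cargo run --bin stdin-transcribe -- --timestamps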

View File

@@ -7,12 +7,10 @@ use anyhow::Result;
 use clap::Parser;
 use futures_util::StreamExt;
 use hound::WavReader;
-use std::fs::File;
 use std::time::Duration;
 use tokio::sync::mpsc;
 use tokio::time;
 use tokio_stream::wrappers::ReceiverStream;
-use tonic::transport::Channel;
 use tracing::info;

 // Import generated protobuf types

View File: src/system_audio_transcribe.rs

@@ -0,0 +1,250 @@
use anyhow::Result;
use clap::Parser;
use cpal::traits::{DeviceTrait, HostTrait, StreamTrait};
use futures_util::StreamExt;
use std::sync::{Arc, Mutex};
use tokio::sync::mpsc as tokio_mpsc;
use tokio_stream::wrappers::ReceiverStream;
use tracing::{error, info, warn};

// Import generated protobuf types
pub mod transcription {
    tonic::include_proto!("transcription");
}

use transcription::{
    transcription_service_client::TranscriptionServiceClient, AudioChunk, AudioConfig,
};
#[derive(Parser, Debug)]
#[command(author, version, about = "Capture and transcribe system audio", long_about = None)]
struct Args {
    /// gRPC server address
    #[arg(short, long, default_value = "http://localhost:50051")]
    server: String,

    /// Language code (e.g., "en", "es", "auto")
    #[arg(short, long, default_value = "en")]
    language: String,

    /// Task: transcribe or translate
    #[arg(short, long, default_value = "transcribe")]
    task: String,

    /// Model to use
    #[arg(short, long, default_value = "base")]
    model: String,

    /// List available audio devices
    #[arg(long)]
    list_devices: bool,

    /// Audio device name or index to use (e.g., "pulse.monitor" for PulseAudio monitor)
    #[arg(short, long)]
    device: Option<String>,
}
#[tokio::main]
async fn main() -> Result<()> {
    tracing_subscriber::fmt::init();
    let args = Args::parse();

    // List devices if requested
    if args.list_devices {
        list_audio_devices()?;
        return Ok(());
    }

    info!("Connecting to transcription service at {}", args.server);
    let mut client = TranscriptionServiceClient::connect(args.server.clone()).await?;

    // Create channel for audio chunks
    let (tx, rx) = tokio_mpsc::channel::<AudioChunk>(100);

    // Start audio capture in a separate thread
    let device_name = args.device.clone();
    std::thread::spawn(move || {
        if let Err(e) = capture_system_audio(tx, device_name) {
            error!("Audio capture error: {}", e);
        }
    });

    // Create the first chunk with configuration
    let config = AudioConfig {
        language: args.language.clone(),
        task: args.task.clone(),
        model: args.model.clone(),
        sample_rate: 16000,
        vad_enabled: true, // Enable VAD to filter silence
    };

    // Send a configuration chunk first
    let config_chunk = AudioChunk {
        audio_data: vec![],
        session_id: "system-audio".to_string(),
        config: Some(config),
    };

    // Create stream from receiver
    let stream_vec = vec![config_chunk];
    let stream = ReceiverStream::new(rx);
    let stream = futures_util::stream::iter(stream_vec).chain(stream);

    // Start streaming transcription
    let request = tonic::Request::new(stream);
    let mut response = client.stream_transcribe(request).await?.into_inner();

    println!("\n🎧 Capturing system audio for transcription...");
    println!("Press Ctrl+C to stop\n");
    println!("{}", "─".repeat(80));

    // Process transcription responses
    while let Some(result) = response.message().await? {
        if !result.text.is_empty() {
            if result.is_final {
                println!("[FINAL] {}", result.text);
            } else {
                print!("\r[INTERIM] {:<80}", result.text);
                use std::io::{self, Write};
                io::stdout().flush()?;
            }
        }
    }

    Ok(())
}
/// List all available audio devices
fn list_audio_devices() -> Result<()> {
    let host = cpal::default_host();

    println!("\n📊 Available Audio Devices:");
    println!("{}", "─".repeat(80));

    // List input devices
    println!("\n🎤 Input Devices:");
    for (idx, device) in host.input_devices()?.enumerate() {
        let name = device.name()?;
        let is_monitor = name.contains("monitor") || name.contains("Monitor")
            || name.contains("loopback") || name.contains("Loopback")
            || name.contains("stereo mix") || name.contains("Stereo Mix");
        if is_monitor {
            println!("  [{}] {} 🔊 (System Audio)", idx, name);
        } else {
            println!("  [{}] {}", idx, name);
        }
    }

    // Show default device
    if let Some(device) = host.default_input_device() {
        println!("\n⭐ Default Input: {}", device.name()?);
    }

    println!("\n💡 Tips for capturing system audio:");
    println!("  Linux:   Look for devices with 'monitor' in the name (PulseAudio/PipeWire)");
    println!("  Windows: Install VB-Cable or enable 'Stereo Mix' in sound settings");
    println!("  macOS:   Install BlackHole or Loopback for system audio capture");

    Ok(())
}
/// Capture audio from system (or specified device)
fn capture_system_audio(tx: tokio_mpsc::Sender<AudioChunk>, device_name: Option<String>) -> Result<()> {
    let host = cpal::default_host();

    // Find the appropriate audio device
    let device = if let Some(name) = device_name {
        // Try to find device by name (substring match)
        let mut found_device = None;
        for input_device in host.input_devices()? {
            if input_device.name()?.contains(&name) {
                found_device = Some(input_device);
                break;
            }
        }
        found_device.ok_or_else(|| {
            anyhow::anyhow!("Device '{}' not found. Use --list-devices to see available devices.", name)
        })?
    } else {
        // Try to find a monitor/loopback device automatically
        let mut monitor_device = None;
        for input_device in host.input_devices()? {
            let name = input_device.name()?;
            if name.contains("monitor") || name.contains("Monitor")
                || name.contains("loopback") || name.contains("Loopback")
                || name.contains("stereo mix") || name.contains("Stereo Mix")
            {
                info!("Found system audio device: {}", name);
                monitor_device = Some(input_device);
                break;
            }
        }
        if let Some(device) = monitor_device {
            device
        } else {
            warn!("No system audio device found. Using default input device.");
            warn!("To capture system audio, you may need to:");
            warn!("  - Linux:   Enable PulseAudio monitor");
            warn!("  - Windows: Enable Stereo Mix or install VB-Cable");
            warn!("  - macOS:   Install BlackHole or Loopback");
            host.default_input_device()
                .ok_or_else(|| anyhow::anyhow!("No input device available"))?
        }
    };

    info!("Using audio device: {}", device.name()?);

    // Configure audio capture for 16kHz mono PCM16
    let config = cpal::StreamConfig {
        channels: 1,
        sample_rate: cpal::SampleRate(16000),
        buffer_size: cpal::BufferSize::Default,
    };

    // Buffer to accumulate audio samples
    let buffer = Arc::new(Mutex::new(Vec::new()));
    let buffer_clone = buffer.clone();

    // Create audio stream
    let stream = device.build_input_stream(
        &config,
        move |data: &[i16], _: &cpal::InputCallbackInfo| {
            let mut buf = buffer_clone.lock().unwrap();
            buf.extend_from_slice(data);

            // Send chunks of ~3 seconds (16000 samples/s * 3 s = 48000 samples) for better accuracy
            while buf.len() >= 48000 {
                let chunk: Vec<i16> = buf.drain(..48000).collect();

                // Convert i16 samples to little-endian bytes
                let bytes: Vec<u8> = chunk.iter()
                    .flat_map(|&sample| sample.to_le_bytes())
                    .collect();

                let audio_chunk = AudioChunk {
                    audio_data: bytes,
                    session_id: String::new(),
                    config: None,
                };

                // Send without blocking the audio callback; try_send is used here
                // because tokio::spawn would panic on this non-runtime cpal thread.
                // The chunk is dropped if the channel is full or closed.
                let _ = tx.try_send(audio_chunk);
            }
        },
        move |err| {
            error!("Audio stream error: {}", err);
        },
        None,
    )?;

    // Start the stream
    stream.play()?;
    info!("Audio capture started");

    // Keep the stream alive
    loop {
        std::thread::sleep(std::time::Duration::from_secs(1));
    }
}
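Since the --device lookup above is a substring (contains) match, any unique fragment of the device name is enough. A hedged sketch for finding a PulseAudio/PipeWire monitor source, assuming the cpal host exposes it under a similar name:

    # List candidate monitor sources
    pactl list sources short | grep -i monitor

    # Any unique substring of the reported name should match
    cargo run --bin system-audio -- --device monitor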

View File

@@ -0,0 +1,139 @@
#!/bin/bash
# Enhanced script for transcribing video calls on Ubuntu with PipeWire
# Uses parec (PulseAudio compatibility) to capture system audio

set -e

echo "🎥 Video Call Transcription Service"
echo "===================================="
echo ""

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# Check dependencies
check_dependency() {
    if ! command -v "$1" &> /dev/null; then
        echo -e "${RED}$1 not found.${NC}"
        echo "Please install: sudo apt-get install $2"
        return 1
    fi
    return 0
}
echo "Checking dependencies..."
check_dependency "parec" "pulseaudio-utils" || exit 1
check_dependency "sox" "sox" || echo -e "${YELLOW}⚠️ sox not installed (optional but recommended)${NC}"
# Function to find the monitor source for system audio
find_monitor_source() {
# List all sources and find monitors (what you hear)
local monitors=$(pactl list sources short 2>/dev/null | grep -i "monitor" | awk '{print $2}')
if [ -z "$monitors" ]; then
# Try pacmd if pactl doesn't work
monitors=$(pacmd list-sources 2>/dev/null | grep "name:" | grep "monitor" | sed 's/.*<\(.*\)>.*/\1/')
fi
if [ -z "$monitors" ]; then
# Fallback: try to construct monitor name from default sink
local default_sink=$(pactl info 2>/dev/null | grep "Default Sink" | cut -d: -f2 | xargs)
if [ -n "$default_sink" ]; then
monitors="${default_sink}.monitor"
fi
fi
echo "$monitors" | head -1
}
# List available sources
if [ "$1" == "--list" ]; then
    echo -e "${GREEN}📊 Available Audio Sources:${NC}"
    echo ""
    pactl list sources short 2>/dev/null || pacmd list-sources 2>/dev/null | grep "name:"
    echo ""
    echo -e "${GREEN}💡 Monitor sources (system audio):${NC}"
    pactl list sources short 2>/dev/null | grep -i "monitor" || echo "No monitor sources found"
    exit 0
fi

# Help message
if [ "$1" == "--help" ] || [ "$1" == "-h" ]; then
    echo "Usage: $0 [OPTIONS]"
    echo ""
    echo "Options:"
    echo "  --list            List all available audio sources"
    echo "  --source SOURCE   Use specific audio source"
    echo "  --microphone      Capture microphone instead of system audio"
    echo "  --combined        Capture both microphone and system audio"
    echo "  --help, -h        Show this help message"
    echo ""
    echo "Examples:"
    echo "  $0                # Auto-detect and transcribe system audio"
    echo "  $0 --microphone   # Transcribe from microphone"
    echo "  $0 --combined     # Transcribe both mic and system audio"
    echo ""
    exit 0
fi
# Determine what to capture
if [ "$1" == "--microphone" ]; then
    echo -e "${GREEN}🎤 Using microphone input${NC}"
    # Run the existing live-transcribe for microphone
    # (exec replaces this shell, so nothing after it runs)
    exec cargo run --bin live-transcribe
elif [ "$1" == "--combined" ]; then
    echo -e "${YELLOW}🎤+🔊 Combined audio capture not yet implemented${NC}"
    echo "For now, please run two separate instances:"
    echo "  1. $0               (for system audio)"
    echo "  2. $0 --microphone  (for mic)"
    exit 1
elif [ "$1" == "--source" ] && [ -n "$2" ]; then
    SOURCE="$2"
    echo -e "${GREEN}📡 Using specified source: $SOURCE${NC}"
else
    # Auto-detect monitor source
    SOURCE=$(find_monitor_source)
    if [ -z "$SOURCE" ]; then
        echo -e "${RED}❌ Could not find system audio monitor source${NC}"
        echo ""
        echo "This might happen if:"
        echo "  1. No audio is currently playing"
        echo "  2. PipeWire/PulseAudio is not running"
        echo ""
        echo "Try:"
        echo "  1. Play some audio (music/video)"
        echo "  2. Run: $0 --list"
        echo "  3. Use a specific source: $0 --source <source_name>"
        exit 1
    fi
    echo -e "${GREEN}📡 Found system audio source: $SOURCE${NC}"
fi
echo ""
echo -e "${GREEN}🎬 Starting video call transcription...${NC}"
echo -e "${YELLOW}Press Ctrl+C to stop${NC}"
echo ""
echo "💡 Tips for best results:"
echo " • Join your video call first"
echo " • Use headphones to avoid echo"
echo " • Close other audio sources (music, videos)"
echo " • Speak clearly for better transcription"
echo ""
echo "────────────────────────────────────────────────────────────────────"
echo ""
# Start audio capture and transcription
echo -e "${GREEN}Starting audio capture from: $SOURCE${NC}"
echo -e "${GREEN}Starting transcription service...${NC}"
echo ""
# Use our new stdin-transcribe binary that accepts piped audio
# parec captures system audio and pipes it directly to our transcriber
# --no-vad disables Voice Activity Detection for system audio (YouTube, music, etc.)
parec --format=s16le --rate=16000 --channels=1 --device="$SOURCE" 2>/dev/null | \
cargo run --bin stdin-transcribe -- --language en --chunk-seconds 2.5 --no-vad
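
The script's --combined mode is still a stub; one possible approach is to mix the microphone and the system monitor into a PulseAudio null sink and capture that sink's monitor. A sketch only: module-null-sink and module-loopback are standard PulseAudio modules, but the <...> source names are placeholders to fill in from pactl list sources short.

    # Hypothetical setup for --combined: mix mic + system audio into one sink
    pactl load-module module-null-sink sink_name=combined
    pactl load-module module-loopback source=<mic_source> sink=combined
    pactl load-module module-loopback source=<output_sink>.monitor sink=combined

    # Capture the mix and pipe it to the transcriber
    parec --format=s16le --rate=16000 --channels=1 --device=combined.monitor | \
        cargo run --bin stdin-transcribe -- --no-vad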