From 0408a1fc07960f47f1a66b856d33be20426fef57 Mon Sep 17 00:00:00 2001
From: Aljaz Ceru
Date: Mon, 1 Dec 2025 15:06:29 +0100
Subject: [PATCH] cleanup

---
 PERFORMANCE_OPTIMIZATIONS.md                  | 215 ------------------
 README.md                                     |  16 +-
 examples/rust-client/src/realtime_playback.rs |   2 +-
 .../src/system_audio_transcribe.rs            |   6 +-
 examples/rust-client/transcribe_video_call.sh |  22 +-
 5 files changed, 23 insertions(+), 238 deletions(-)
 delete mode 100644 PERFORMANCE_OPTIMIZATIONS.md

diff --git a/PERFORMANCE_OPTIMIZATIONS.md b/PERFORMANCE_OPTIMIZATIONS.md
deleted file mode 100644
index 55c7dff..0000000
--- a/PERFORMANCE_OPTIMIZATIONS.md
+++ /dev/null
@@ -1,215 +0,0 @@
-# Performance Optimizations
-
-This document outlines the performance optimizations implemented in the Transcription API.
-
-## 1. Model Management
-
-### Shared Model Instance
-- **Location**: `transcription_server.py:73-137`
-- **Optimization**: Single Whisper model instance shared across all connections (gRPC, WebSocket, REST)
-- **Benefit**: Eliminates redundant model loading, reduces memory usage by ~50-80%
-
-### Model Evaluation Mode
-- **Location**: `transcription_server.py:119-122`
-- **Optimization**: Set the model to eval mode and disable gradient computation
-- **Benefit**: Reduces memory usage and improves inference speed by ~15-20%
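-
-For illustration, a minimal sketch of this pattern, assuming the standard `openai-whisper` package (the `get_model` helper and its names are illustrative, not the literal `transcription_server.py` code):
-
-```python
-import threading
-
-import torch
-import whisper
-
-_model = None
-_lock = threading.Lock()
-
-def get_model(name: str = "large-v3"):
-    """Load the Whisper model once and share it across all API frontends."""
-    global _model
-    with _lock:  # guard against concurrent first calls
-        if _model is None:
-            device = "cuda" if torch.cuda.is_available() else "cpu"
-            _model = whisper.load_model(name, device=device)
-            _model.eval()  # inference-only mode: disables dropout, etc.
-    return _model
-```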
-
-## 2. GPU Optimizations
-
-### TF32 Precision (Ampere GPUs)
-- **Location**: `transcription_server.py:105-111`
-- **Optimization**: Enable TF32 for matrix multiplications on compatible GPUs
-- **Benefit**: Up to 3x faster inference on A100/RTX 3000+ series GPUs with minimal accuracy loss
-
-### cuDNN Benchmarking
-- **Location**: `transcription_server.py:110`
-- **Optimization**: Enable cuDNN autotuning for optimal convolution algorithms
-- **Benefit**: 10-30% speedup after initial warmup
-
-### FP16 Inference
-- **Location**: `transcription_server.py:253`
-- **Optimization**: Use FP16 precision on CUDA devices
-- **Benefit**: 2x faster inference, 50% less GPU memory usage
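-
-The first two of these boil down to a few PyTorch switches at startup (a sketch, assuming PyTorch 1.12+ where these flags exist; FP16 is selected per call, see the sketch after section 3):
-
-```python
-import torch
-
-if torch.cuda.is_available():
-    # TF32 matmuls: large speedup on Ampere+ GPUs, negligible accuracy loss
-    torch.backends.cuda.matmul.allow_tf32 = True
-    torch.backends.cudnn.allow_tf32 = True
-    # Let cuDNN autotune convolution algorithms for the shapes it sees
-    torch.backends.cudnn.benchmark = True
-```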
-
-## 3. Inference Optimizations
-
-### No Gradient Context
-- **Location**: `transcription_server.py:249-260, 340-346`
-- **Optimization**: Wrap all inference calls in a `torch.no_grad()` context
-- **Benefit**: 10-15% speed improvement, reduces memory usage
-
-### Optimized Audio Processing
-- **Location**: `transcription_server.py:208-219`
-- **Optimization**: Direct numpy operations, inline energy calculations
-- **Benefit**: Faster VAD processing, reduced memory allocations
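-
-Combining this with the FP16 setting from section 2 and the conservative decoding parameters from section 6 below, a single inference call might look roughly like this (an illustrative sketch, not the literal server code):
-
-```python
-import torch
-
-def transcribe_chunk(model, audio, language=None):
-    """One inference call: no autograd, FP16 on GPU, conservative decoding."""
-    with torch.no_grad():  # skip gradient bookkeeping entirely
-        result = model.transcribe(
-            audio,
-            language=language,
-            fp16=torch.cuda.is_available(),    # FP16 on CUDA, FP32 on CPU
-            temperature=0.0,                   # deterministic decoding
-            no_speech_threshold=0.8,           # skip likely non-speech
-            logprob_threshold=-0.5,            # drop low-confidence output
-            condition_on_previous_text=False,  # avoid compounding hallucinations
-        )
-    return result["text"].strip()
-```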
-
-## 4. Network Optimizations
-
-### gRPC Threading
-- **Location**: `transcription_server.py:512-527`
-- **Optimization**: Dynamic thread pool sizing based on CPU cores
-- **Configuration**: `max_workers = min(cpu_count * 2, 20)`
-- **Benefit**: Better handling of concurrent connections
-
-### gRPC Keepalive
-- **Location**: `transcription_server.py:522-526`
-- **Optimization**: Configured keepalive and ping settings
-- **Benefit**: More stable long-running connections, faster failure detection
-
-### Message Size Limits
-- **Location**: `transcription_server.py:519-520`
-- **Optimization**: 100MB message size limits for large audio files
-- **Benefit**: Support for longer audio files without chunking
-
-(A configuration sketch for these server options follows section 5 below.)
-
-## 5. Voice Activity Detection (VAD)
-
-### Smart Filtering
-- **Location**: `transcription_server.py:162-203`
-- **Optimization**: Fast energy-based VAD to skip silent audio
-- **Configuration**:
-  - Energy threshold: 0.005
-  - Zero-crossing threshold: 50
-- **Benefit**: 40-60% reduction in transcription calls for audio with silence
-
-### Early Return
-- **Location**: `transcription_server.py:215-217`
-- **Optimization**: Skip transcription for non-speech audio
-- **Benefit**: Reduces unnecessary inference calls, improves overall throughput
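-
-A sketch of such an energy/zero-crossing gate (exactly how the two thresholds are applied is an assumption here; the real implementation may differ):
-
-```python
-import numpy as np
-
-ENERGY_THRESHOLD = 0.005
-ZERO_CROSSING_THRESHOLD = 50
-
-def is_speech(audio: np.ndarray) -> bool:
-    """Cheap VAD gate; `audio` is float32 PCM in [-1, 1]."""
-    energy = float(np.sqrt(np.mean(audio ** 2)))  # RMS energy
-    if energy < ENERGY_THRESHOLD:
-        return False  # too quiet: treat as silence and skip transcription
-    # Count sign changes; very few usually means hum/rumble rather than speech
-    crossings = int(np.count_nonzero(np.diff(np.sign(audio))))
-    return crossings >= ZERO_CROSSING_THRESHOLD
-```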
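-
-And the server-options sketch promised in section 4 (the keepalive intervals are illustrative values, not confirmed settings):
-
-```python
-import os
-from concurrent import futures
-
-import grpc
-
-def build_server() -> grpc.Server:
-    """gRPC server with a sized thread pool, keepalive, and 100 MB messages."""
-    max_workers = min((os.cpu_count() or 4) * 2, 20)
-    options = [
-        ("grpc.max_send_message_length", 100 * 1024 * 1024),
-        ("grpc.max_receive_message_length", 100 * 1024 * 1024),
-        ("grpc.keepalive_time_ms", 30_000),     # ping idle connections every 30 s
-        ("grpc.keepalive_timeout_ms", 10_000),  # declare dead peers quickly
-        ("grpc.keepalive_permit_without_calls", 1),
-    ]
-    return grpc.server(
-        futures.ThreadPoolExecutor(max_workers=max_workers),
-        options=options,
-    )
-```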
-
-## 6. Anti-hallucination Filters
-
-### Aggressive Filtering
-- **Location**: `transcription_server.py:262-310`
-- **Optimization**: Comprehensive hallucination detection and filtering
-- **Filters**:
-  - Common hallucination phrases
-  - Repetitive text
-  - Low alphanumeric ratio
-  - Cross-language detection
-- **Benefit**: Better transcription quality, fewer false positives
-
-### Conservative Parameters
-- **Location**: `transcription_server.py:254-259`
-- **Optimization**: Tuned Whisper parameters to reduce hallucinations (see the sketch after section 3)
-- **Settings**:
-  - `temperature=0.0` (deterministic)
-  - `no_speech_threshold=0.8` (high)
-  - `logprob_threshold=-0.5` (strict)
-  - `condition_on_previous_text=False`
-- **Benefit**: More accurate transcriptions, fewer hallucinations
-
-## 7. Logging Optimizations
-
-### Debug-Level Logging for VAD
-- **Location**: `transcription_server.py:216-219`
-- **Optimization**: Log VAD messages at DEBUG level instead of INFO
-- **Benefit**: Reduced log volume, better performance in high-throughput scenarios
-
-## 8. REST API Optimizations
-
-### Async Operations
-- **Location**: `rest_api.py`
-- **Optimization**: Fully async FastAPI served by uvicorn
-- **Benefit**: Non-blocking I/O, better concurrency
-
-### Streaming Responses
-- **Location**: `rest_api.py:223-278`
-- **Optimization**: Server-Sent Events for streaming transcription
-- **Benefit**: Real-time results without buffering the entire response
-
-### Connection Pooling
-- **Built-in**: FastAPI/Uvicorn connection pooling
-- **Benefit**: Efficient handling of concurrent HTTP connections
-
-## Performance Benchmarks
-
-### Typical Performance (RTX 3090, large-v3 model)
-
-| Metric | Value |
-|--------|-------|
-| Cold start | 5-8 seconds |
-| Transcription speed (with VAD) | 0.1-0.3x real-time |
-| Memory usage | 3-4 GB VRAM |
-| Concurrent sessions | 5-10 (GPU memory dependent) |
-| API latency | 50-200ms (excluding inference) |
-
-### Before vs. After Optimizations
-
-| Metric | Previous | Optimized | Improvement |
-|--------|----------|-----------|-------------|
-| Inference speed | 0.2x | 0.1x | 2x faster |
-| Memory per session | 4 GB | 0.5 GB | 8x reduction |
-| Startup time | 8s | 6s | 25% faster |
-
-## Recommendations
-
-### For Maximum Performance
-
-1. **Use GPU**: CUDA is 10-50x faster than CPU
-2. **Use smaller models**: `base` or `small` for real-time applications
-3. **Enable VAD**: Reduces unnecessary transcriptions
-4. **Batch audio**: Send 3-5 second chunks for optimal throughput
-5. **Use gRPC**: Lower overhead than REST for high-frequency calls
-
-### For Best Quality
-
-1. **Use larger models**: `large-v3` for best accuracy
-2. **Disable VAD**: If you need to transcribe everything
-3. **Specify language**: Avoid auto-detection if you know the language
-4. **Longer audio chunks**: 5-10 seconds for better context
-
-### For High Throughput
-
-1. **Multiple replicas**: Scale horizontally with a load balancer
-2. **GPU per replica**: Each replica needs dedicated GPU memory
-3. **Use gRPC streaming**: Most efficient for continuous transcription
-4. **Monitor GPU utilization**: Keep it above 80% for best efficiency
-
-## Future Optimizations
-
-Potential improvements not yet implemented:
-
-1. **Batch Inference**: Process multiple audio chunks in parallel
-2. **Model Quantization**: INT8 quantization for faster inference
-3. **Faster Whisper**: Use the faster-whisper library (2-3x speedup)
-4. **KV Cache**: Reuse the key-value cache across streaming steps
-5. **TensorRT**: Use TensorRT for optimized inference on NVIDIA GPUs
-6. **Distillation**: Use distilled Whisper models (e.g., Distil-Whisper)
-
-## Monitoring
-
-Use these endpoints to monitor performance:
-
-```bash
-# Health and metrics
-curl http://localhost:8000/health
-
-# Active sessions
-curl http://localhost:8000/sessions
-
-# GPU utilization (if nvidia-smi is available)
-nvidia-smi --query-gpu=utilization.gpu,memory.used --format=csv -l 1
-```
-
-## Tuning Parameters
-
-Key environment variables for performance tuning:
-
-```env
-# Model selection (smaller = faster)
-MODEL_PATH=base  # tiny, base, small, medium, large-v3
-
-# Thread count (CPU inference)
-OMP_NUM_THREADS=4
-
-# GPU selection
-CUDA_VISIBLE_DEVICES=0
-
-# Enable optimizations
-ENABLE_REST=true
-ENABLE_WEBSOCKET=true
-```
-
-## Contact
-
-For performance issues or optimization suggestions, please open an issue on GitHub.
diff --git a/README.md b/README.md
index c038d60..cafd799 100644
--- a/README.md
+++ b/README.md
@@ -4,14 +4,14 @@ A high-performance, standalone transcription service with **REST API**, **gRPC**
 
 ## Features
 
-- šŸš€ **Multiple API Interfaces**: REST API, gRPC, and WebSocket
-- šŸŽÆ **High Performance**: Optimized with TF32, cuDNN, and efficient batching
-- 🧠 **Whisper Models**: Support for all Whisper models (tiny to large-v3)
-- šŸŽ¤ **Real-time Streaming**: Bidirectional streaming for live transcription
-- šŸ”‡ **Voice Activity Detection**: Smart VAD to filter silence and noise
-- 🚫 **Anti-hallucination**: Advanced filtering to reduce Whisper hallucinations
-- 🐳 **Docker Ready**: Easy deployment with GPU support
-- šŸ“Š **Interactive Docs**: Auto-generated API documentation (Swagger/OpenAPI)
+- **Multiple API Interfaces**: REST API, gRPC, and WebSocket
+- **High Performance**: Optimized with TF32, cuDNN, and efficient batching
+- **Whisper Models**: Support for all Whisper models (tiny to large-v3)
+- **Real-time Streaming**: Bidirectional streaming for live transcription
+- **Voice Activity Detection**: Smart VAD to filter silence and noise
+- **Anti-hallucination**: Advanced filtering to reduce Whisper hallucinations
+- **Docker Ready**: Easy deployment with GPU support
+- **Interactive Docs**: Auto-generated API documentation (Swagger/OpenAPI)
 
 ## Quick Start
 
diff --git a/examples/rust-client/src/realtime_playback.rs b/examples/rust-client/src/realtime_playback.rs
index 81fd311..2a5750b 100644
--- a/examples/rust-client/src/realtime_playback.rs
+++ b/examples/rust-client/src/realtime_playback.rs
@@ -207,7 +207,7 @@ async fn main() -> Result<()> {
     }
 
     println!("\n{}", "─".repeat(80));
-    println!("āœ… Playback and transcription complete!");
+    println!("Playback and transcription complete!");
 
     // Keep the program alive until playback finishes
     time::sleep(Duration::from_secs(2)).await;
diff --git a/examples/rust-client/src/system_audio_transcribe.rs b/examples/rust-client/src/system_audio_transcribe.rs
index b6d96d9..55577b0 100644
--- a/examples/rust-client/src/system_audio_transcribe.rs
+++ b/examples/rust-client/src/system_audio_transcribe.rs
@@ -118,7 +118,7 @@ async fn main() -> Result<()> {
 fn list_audio_devices() -> Result<()> {
     let host = cpal::default_host();
 
-    println!("\nšŸ“Š Available Audio Devices:");
+    println!("\nAvailable Audio Devices:");
     println!("{}", "─".repeat(80));
 
     // List input devices
@@ -138,10 +138,10 @@ fn list_audio_devices() -> Result<()> {
 
     // Show default device
     if let Some(device) = host.default_input_device() {
-        println!("\n⭐ Default Input: {}", device.name()?);
+        println!("\nDefault Input: {}", device.name()?);
     }
 
-    println!("\nšŸ’” Tips for capturing system audio:");
+    println!("\nTips for capturing system audio:");
     println!("  Linux: Look for devices with 'monitor' in the name (PulseAudio/PipeWire)");
     println!("  Windows: Install VB-Cable or enable 'Stereo Mix' in sound settings");
     println!("  macOS: Install BlackHole or Loopback for system audio capture");
diff --git a/examples/rust-client/transcribe_video_call.sh b/examples/rust-client/transcribe_video_call.sh
index 809434a..8dc516b 100755
--- a/examples/rust-client/transcribe_video_call.sh
+++ b/examples/rust-client/transcribe_video_call.sh
@@ -18,7 +18,7 @@ NC='\033[0m' # No Color
 # Check dependencies
 check_dependency() {
     if ! command -v $1 &> /dev/null; then
-        echo -e "${RED}āŒ $1 not found.${NC}"
+        echo -e "${RED}$1 not found.${NC}"
         echo "Please install: sudo apt-get install $2"
         return 1
     fi
@@ -27,7 +27,7 @@ check_dependency() {
 
 echo "Checking dependencies..."
 check_dependency "parec" "pulseaudio-utils" || exit 1
-check_dependency "sox" "sox" || echo -e "${YELLOW}āš ļø  sox not installed (optional but recommended)${NC}"
+check_dependency "sox" "sox" || echo -e "${YELLOW}sox not installed (optional but recommended)${NC}"
 
 # Function to find the monitor source for system audio
 find_monitor_source() {
@@ -52,11 +52,11 @@ find_monitor_source() {
 
 # List available sources
 if [ "$1" == "--list" ]; then
-    echo -e "${GREEN}šŸ“Š Available Audio Sources:${NC}"
+    echo -e "${GREEN}Available Audio Sources:${NC}"
     echo ""
    pactl list sources short 2>/dev/null || pacmd list-sources 2>/dev/null | grep "name:"
     echo ""
-    echo -e "${GREEN}šŸ’” Monitor sources (system audio):${NC}"
+    echo -e "${GREEN}Monitor sources (system audio):${NC}"
     pactl list sources short 2>/dev/null | grep -i "monitor" || echo "No monitor sources found"
     exit 0
 fi
@@ -82,24 +82,24 @@ fi
 
 # Determine what to capture
 if [ "$1" == "--microphone" ]; then
-    echo -e "${GREEN}šŸŽ¤ Using microphone input${NC}"
+    echo -e "${GREEN}Using microphone input${NC}"
     # Run the existing live-transcribe for microphone
     exec cargo run --bin live-transcribe
     exit 0
 elif [ "$1" == "--combined" ]; then
-    echo -e "${YELLOW}šŸŽ¤+šŸ”Š Combined audio capture not yet implemented${NC}"
+    echo -e "${YELLOW}Combined audio capture not yet implemented${NC}"
     echo "For now, please run two separate instances:"
     echo "  1. $0 (for system audio)"
     echo "  2. $0 --microphone (for mic)"
     exit 1
 elif [ "$1" == "--source" ] && [ -n "$2" ]; then
     SOURCE="$2"
-    echo -e "${GREEN}šŸ“” Using specified source: $SOURCE${NC}"
+    echo -e "${GREEN}Using specified source: $SOURCE${NC}"
 else
     # Auto-detect monitor source
     SOURCE=$(find_monitor_source)
     if [ -z "$SOURCE" ]; then
-        echo -e "${RED}āŒ Could not find system audio monitor source${NC}"
+        echo -e "${RED}Could not find system audio monitor source${NC}"
         echo ""
         echo "This might happen if:"
         echo "  1. No audio is currently playing"
@@ -111,14 +111,14 @@ else
         echo "  3. Use a specific source: $0 --source <source_name>"
         exit 1
     fi
-    echo -e "${GREEN}šŸ“” Found system audio source: $SOURCE${NC}"
+    echo -e "${GREEN}Found system audio source: $SOURCE${NC}"
 fi
 
 echo ""
-echo -e "${GREEN}šŸŽ¬ Starting video call transcription...${NC}"
+echo -e "${GREEN}Starting video call transcription...${NC}"
 echo -e "${YELLOW}Press Ctrl+C to stop${NC}"
 echo ""
-echo "šŸ’” Tips for best results:"
+echo "Tips for best results:"
 echo "  • Join your video call first"
 echo "  • Use headphones to avoid echo"
 echo "  • Close other audio sources (music, videos)"