Mirror of https://github.com/aljazceru/transcription-api.git, synced 2025-12-17 07:14:24 +01:00.
initial commit
.gitignore (vendored, Normal file, 7 lines)
@@ -0,0 +1,7 @@
*/*/target/*
*/target/*
.history
*.log
*.tmp
*.bak
*.swp
Dockerfile (Normal file, 81 lines)
@@ -0,0 +1,81 @@
# Use NVIDIA CUDA base image with PyTorch pre-installed for faster builds
# This image includes CUDA, cuDNN, and PyTorch - saving significant build time
FROM nvidia/cuda:12.1.0-cudnn8-runtime-ubuntu22.04 AS base

# Set timezone to avoid interactive prompts
ENV DEBIAN_FRONTEND=noninteractive
ENV TZ=UTC

# Install Python 3.11 and system dependencies
RUN apt-get update && apt-get install -y \
    python3.11 \
    python3.11-dev \
    python3-pip \
    ffmpeg \
    libsndfile1 \
    git \
    tzdata \
    && rm -rf /var/lib/apt/lists/* \
    && update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1 \
    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1

# Set Python environment
ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    PIP_NO_CACHE_DIR=1 \
    PIP_DISABLE_PIP_VERSION_CHECK=1

# Set working directory
WORKDIR /app

# Install PyTorch with CUDA support first (if not using pre-built image)
# This is much faster than installing via requirements.txt
RUN pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

# Copy requirements and install remaining dependencies
COPY requirements.txt .
# Remove torch from requirements since we already installed it
RUN grep -v "^torch" requirements.txt > requirements_no_torch.txt && \
    pip install -r requirements_no_torch.txt

# Install grpcio-tools for protobuf generation
RUN pip install grpcio-tools==1.60.0

# Copy proto files and generate gRPC code
COPY proto/ ./proto/
RUN mkdir -p ./src && \
    python -m grpc_tools.protoc \
    -I./proto \
    --python_out=./src \
    --grpc_python_out=./src \
    ./proto/transcription.proto
# Note: Don't modify imports - they work as-is when sys.path is set correctly

# Copy application code
COPY src/ ./src/
COPY entrypoint.sh ./
RUN chmod +x ./entrypoint.sh

# Environment variables
ENV MODEL_PATH=large-v3 \
    GRPC_PORT=50051 \
    WEBSOCKET_PORT=8765 \
    ENABLE_WEBSOCKET=true \
    CACHE_DIR=/app/models \
    TORCH_HOME=/app/models \
    HF_HOME=/app/models

# Create model cache directory
RUN mkdir -p /app/models

# Volume for model cache
VOLUME ["/app/models"]

# Expose ports
EXPOSE 50051 8765

# Health check (sync grpc channels have no channel_ready() method; use the
# channel_ready_future helper instead)
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD python -c "import grpc; channel = grpc.insecure_channel('localhost:50051'); grpc.channel_ready_future(channel).result(timeout=5)" || exit 1

ENTRYPOINT ["/app/entrypoint.sh"]
Dockerfile.pytorch (Normal file, 69 lines)
@@ -0,0 +1,69 @@
# Alternative: Use official PyTorch image with CUDA support
# This image has PyTorch, CUDA, and cuDNN pre-installed
FROM pytorch/pytorch:2.1.0-cuda12.1-cudnn8-runtime

# Set timezone to avoid interactive prompts
ENV DEBIAN_FRONTEND=noninteractive
ENV TZ=UTC

# Install system dependencies
RUN apt-get update && apt-get install -y \
    ffmpeg \
    libsndfile1 \
    git \
    tzdata \
    && rm -rf /var/lib/apt/lists/*

# Set Python environment
ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    PIP_NO_CACHE_DIR=1 \
    PIP_DISABLE_PIP_VERSION_CHECK=1

# Set working directory
WORKDIR /app

# Copy requirements and install dependencies (excluding torch)
COPY requirements.txt .
RUN grep -v "^torch" requirements.txt > requirements_no_torch.txt && \
    pip install -r requirements_no_torch.txt && \
    pip install grpcio-tools==1.60.0

# Copy proto files and generate gRPC code
COPY proto/ ./proto/
RUN mkdir -p ./src && \
    python -m grpc_tools.protoc \
    -I./proto \
    --python_out=./src \
    --grpc_python_out=./src \
    ./proto/transcription.proto
# Note: Don't modify imports - they work as-is when sys.path is set correctly

# Copy application code
COPY src/ ./src/
COPY entrypoint.sh ./
RUN chmod +x ./entrypoint.sh

# Environment variables
ENV MODEL_PATH=large-v3 \
    GRPC_PORT=50051 \
    WEBSOCKET_PORT=8765 \
    ENABLE_WEBSOCKET=true \
    CACHE_DIR=/app/models \
    TORCH_HOME=/app/models \
    HF_HOME=/app/models

# Create model cache directory
RUN mkdir -p /app/models

# Volume for model cache
VOLUME ["/app/models"]

# Expose ports
EXPOSE 50051 8765

# Health check (use channel_ready_future; sync channels have no channel_ready())
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD python -c "import grpc; channel = grpc.insecure_channel('localhost:50051'); grpc.channel_ready_future(channel).result(timeout=5)" || exit 1

ENTRYPOINT ["/app/entrypoint.sh"]
Makefile (Normal file, 102 lines)
@@ -0,0 +1,102 @@
# Makefile for Transcription API Service

.PHONY: help build run stop clean test proto rust-client

# Variables
DOCKER_IMAGE = transcription-api
DOCKER_TAG = latest
GRPC_PORT = 50051
WEBSOCKET_PORT = 8765
MODEL = base

help: ## Show this help message
	@echo "Usage: make [target]"
	@echo ""
	@echo "Available targets:"
	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf " %-15s %s\n", $$1, $$2}'

build: ## Build Docker image
	docker build -t $(DOCKER_IMAGE):$(DOCKER_TAG) .

run: ## Run service with docker-compose
	MODEL_PATH=$(MODEL) docker compose up -d
	@echo "Service started!"
	@echo "gRPC endpoint: localhost:$(GRPC_PORT)"
	@echo "WebSocket endpoint: ws://localhost:$(WEBSOCKET_PORT)"

run-gpu: ## Run service with GPU support
	MODEL_PATH=large-v3 CUDA_VISIBLE_DEVICES=0 docker compose up -d
	@echo "Service started with GPU support!"

stop: ## Stop the service
	docker compose down

logs: ## Show service logs
	docker compose logs -f

clean: ## Clean up containers and volumes
	docker compose down -v
	docker system prune -f

proto: ## Generate protobuf code
	python -m grpc_tools.protoc \
		-I./proto \
		--python_out=./src \
		--grpc_python_out=./src \
		./proto/transcription.proto
	@echo "Generated Python protobuf code in src/"

rust-client: ## Build Rust client examples
	cd examples/rust-client && cargo build --release
	@echo "Rust clients built in examples/rust-client/target/release/"

test-grpc: ## Test gRPC connection
	@command -v grpcurl >/dev/null 2>&1 || { echo "grpcurl not installed. Install from https://github.com/fullstorydev/grpcurl"; exit 1; }
	grpcurl -plaintext localhost:$(GRPC_PORT) list
	grpcurl -plaintext localhost:$(GRPC_PORT) transcription.TranscriptionService/HealthCheck

# python -c cannot contain an "async def" statement after semicolons, so the
# test awaits the connect/recv futures explicitly on an event loop instead
test-websocket: ## Test WebSocket connection
	@echo "Testing WebSocket connection..."
	@python3 -c "import asyncio, websockets, json; \
	loop = asyncio.new_event_loop(); \
	ws = loop.run_until_complete(websockets.connect('ws://localhost:$(WEBSOCKET_PORT)')); \
	print('Connected:', json.loads(loop.run_until_complete(ws.recv()))); \
	loop.run_until_complete(ws.close())"

install-deps: ## Install Python dependencies
	pip install -r requirements.txt

docker-push: ## Push Docker image to registry
	docker tag $(DOCKER_IMAGE):$(DOCKER_TAG) $(DOCKER_REGISTRY)/$(DOCKER_IMAGE):$(DOCKER_TAG)
	docker push $(DOCKER_REGISTRY)/$(DOCKER_IMAGE):$(DOCKER_TAG)

# Model management (a "for" statement is not valid in a one-liner, so use a
# comprehension instead)
download-models: ## Download Whisper models
	@echo "Downloading Whisper models..."
	python -c "import whisper; [(print(f'Downloading {m}...'), whisper.load_model(m)) for m in ['tiny', 'base', 'small']]"

# Development
dev-run: ## Run service locally (without Docker)
	cd src && python transcription_server.py

dev-install: ## Install development dependencies
	pip install -r requirements.txt
	pip install black flake8 pytest pytest-asyncio

format: ## Format Python code
	black src/

lint: ## Lint Python code
	flake8 src/

# Benchmarking
benchmark: ## Run performance benchmark
	@echo "Running transcription benchmark..."
	time curl -X POST \
		-H "Content-Type: application/octet-stream" \
		--data-binary @test_audio.wav \
		http://localhost:$(GRPC_PORT)/benchmark
README.md (Normal file, 412 lines)
@@ -0,0 +1,412 @@
# Transcription API Service

A high-performance, standalone transcription service with gRPC and WebSocket support, optimized for real-time speech-to-text applications. Perfect for desktop applications, web services, and IoT devices.

## Features

- **Dual Protocol Support**: Both gRPC (recommended) and WebSocket
- **Real-Time Streaming**: Bidirectional audio streaming with immediate transcription
- **Multiple Models**: Support for all Whisper models (tiny to large-v3)
- **Language Support**: 50+ languages with automatic detection
- **Docker Ready**: Simple deployment with Docker Compose
- **Production Ready**: Health checks, monitoring, and graceful shutdown
- **Rust Client Examples**: Ready-to-use Rust clients for desktop applications

## Quick Start

### Using Docker Compose (Recommended)

```bash
# Clone the repository
cd transcription-api

# Start the service (uses 'base' model by default)
docker compose up -d

# Check logs
docker compose logs -f

# Stop the service
docker compose down
```

### Configuration

Edit `.env` or `docker-compose.yml` to configure:

```env
MODEL_PATH=base          # tiny, base, small, medium, large, large-v3
GRPC_PORT=50051          # gRPC service port
WEBSOCKET_PORT=8765      # WebSocket service port
ENABLE_WEBSOCKET=true    # Enable WebSocket support
CUDA_VISIBLE_DEVICES=0   # GPU device ID (if available)
```

## API Protocols

### gRPC (Recommended for Desktop Apps)

**Why gRPC?**
- Strongly typed with Protocol Buffers
- Excellent performance with HTTP/2
- Built-in streaming support
- Auto-generated client code
- Better error handling

**Proto Definition**: See `proto/transcription.proto`

**Service Methods**:
- `StreamTranscribe`: Bidirectional streaming for real-time transcription
- `TranscribeFile`: Single file transcription
- `GetCapabilities`: Query available models and languages
- `HealthCheck`: Service health status
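
As a quick smoke test from Python, a minimal sketch (it assumes the stubs were generated into importable `transcription_pb2` and `transcription_pb2_grpc` modules, as the container's entrypoint does):

```python
# Minimal health probe against the gRPC API (sketch).
import grpc
import transcription_pb2
import transcription_pb2_grpc

channel = grpc.insecure_channel("localhost:50051")
stub = transcription_pb2_grpc.TranscriptionServiceStub(channel)

# Empty and HealthCheck are defined in proto/transcription.proto.
print(stub.HealthCheck(transcription_pb2.Empty()))
```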

### WebSocket (Alternative)

**Protocol**:
```javascript
// Connect
ws://localhost:8765

// Send audio
{
  "type": "audio",
  "data": "base64_encoded_pcm16_audio"
}

// Receive transcription
{
  "type": "transcription",
  "text": "Hello world",
  "start_time": 0.0,
  "end_time": 1.5,
  "is_final": true,
  "timestamp": 1234567890
}

// Stop
{
  "type": "stop"
}
```
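
The same message protocol can be exercised from Python with the `websockets` package; a minimal sketch that sends one base64-encoded PCM16 chunk (a second of silence) and prints the reply:

```python
import asyncio
import base64
import json

import websockets  # pip install websockets

async def main():
    async with websockets.connect("ws://localhost:8765") as ws:
        # One second of silence: 16000 samples x 2 bytes (PCM16, 16 kHz mono).
        pcm16 = bytes(16000 * 2)
        await ws.send(json.dumps({
            "type": "audio",
            "data": base64.b64encode(pcm16).decode("ascii"),
        }))
        reply = json.loads(await ws.recv())
        # The first message may be a connection acknowledgment rather than a result.
        if reply.get("type") == "transcription":
            print(reply["text"], "(final)" if reply["is_final"] else "(partial)")
        await ws.send(json.dumps({"type": "stop"}))

asyncio.run(main())
```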

## Rust Client Usage

### Installation

```toml
# Add to your Cargo.toml
[dependencies]
tonic = "0.10"
tokio = { version = "1.35", features = ["full"] }
# ... see examples/rust-client/Cargo.toml for full list
```

### Live Microphone Transcription

```rust
use anyhow::Result;
use futures_util::StreamExt;
use transcription_client::TranscriptionClient;

#[tokio::main]
async fn main() -> Result<()> {
    // Connect to service
    let mut client = TranscriptionClient::connect("http://localhost:50051").await?;

    // Start streaming from microphone
    let mut stream = client.stream_from_microphone(
        "auto",       // language
        "transcribe", // task
        "base",       // model
    ).await?;

    // Process transcriptions
    while let Some(transcription) = stream.next().await {
        println!("{}", transcription.text);
    }

    Ok(())
}
```

### Build and Run Examples

```bash
cd examples/rust-client

# Build
cargo build --release

# Run live transcription from microphone
cargo run --bin live-transcribe

# Transcribe a file
cargo run --bin file-transcribe -- audio.wav

# Stream a WAV file
cargo run --bin stream-transcribe -- audio.wav --realtime
```

## Audio Requirements

- **Format**: PCM16 (16-bit signed integer)
- **Sample Rate**: 16kHz
- **Channels**: Mono
- **Chunk Size**: Minimum ~500 bytes (flexible for real-time)
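
If your source audio is in another format, `ffmpeg` (installed in both Docker images) can produce compliant raw audio; a sketch:

```python
# Sketch: normalize any input file to raw PCM16, 16 kHz, mono.
import subprocess

def to_pcm16(path: str) -> bytes:
    """Decode `path` with ffmpeg and return raw little-endian PCM16 bytes."""
    return subprocess.run(
        ["ffmpeg", "-i", path, "-f", "s16le", "-acodec", "pcm_s16le",
         "-ac", "1", "-ar", "16000", "-"],
        check=True, capture_output=True,
    ).stdout

audio = to_pcm16("audio.mp3")
print(f"{len(audio)} bytes = {len(audio) / 32000:.1f} s")  # 32,000 bytes per second
```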

## Performance Optimization

### For Real-Time Applications

1. **Use gRPC**: Lower latency than WebSocket
2. **Small Chunks**: Send audio in 0.5-1 second chunks (see the pacing sketch after this list)
3. **Model Selection**:
   - `tiny`: Fastest, lowest accuracy (real-time on CPU)
   - `base`: Good balance (near real-time on CPU)
   - `small`: Better accuracy (may lag on CPU)
   - `large-v3`: Best accuracy (requires GPU for real-time)
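
At 16 kHz mono PCM16 the stream carries 32,000 bytes per second, so a 0.5 s chunk is 16,000 bytes. A sketch of pacing chunks at real-time speed, e.g. when replaying a recorded file against the streaming API:

```python
import time

BYTES_PER_SECOND = 16000 * 2  # 16 kHz mono PCM16
CHUNK_SECONDS = 0.5
CHUNK_BYTES = int(BYTES_PER_SECOND * CHUNK_SECONDS)  # 16,000 bytes

def paced_chunks(pcm16: bytes):
    """Yield 0.5 s chunks of raw PCM16 no faster than real time."""
    for offset in range(0, len(pcm16), CHUNK_BYTES):
        started = time.monotonic()
        yield pcm16[offset:offset + CHUNK_BYTES]
        remaining = CHUNK_SECONDS - (time.monotonic() - started)
        if remaining > 0:
            time.sleep(remaining)
```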

### GPU Acceleration

```yaml
# docker-compose.yml
environment:
  - CUDA_VISIBLE_DEVICES=0
deploy:
  resources:
    reservations:
      devices:
        - driver: nvidia
          count: 1
          capabilities: [gpu]
```

## Architecture

```
┌─────────────┐
│  Rust App   │
│  (Desktop)  │
└──────┬──────┘
       │ gRPC/HTTP2
       ▼
┌─────────────┐
│Transcription│
│   Service   │
│  ┌────────┐ │
│  │Whisper │ │
│  │ Model  │ │
│  └────────┘ │
└─────────────┘
```

### Components

1. **gRPC Server**: Handles streaming audio and returns transcriptions
2. **WebSocket Server**: Alternative protocol for web clients
3. **Transcription Engine**: Whisper/SimulStreaming for speech-to-text
4. **Session Manager**: Handles multiple concurrent streams
5. **Model Cache**: Prevents re-downloading models

## Advanced Configuration

### Using SimulStreaming

For even lower latency, mount SimulStreaming:

```yaml
volumes:
  - ./SimulStreaming:/app/SimulStreaming
environment:
  - SIMULSTREAMING_PATH=/app/SimulStreaming
```

### Custom Models

Mount your own Whisper models:

```yaml
volumes:
  - ./models:/app/models
environment:
  - MODEL_PATH=/app/models/custom-model.pt
```

### Monitoring

The service exposes metrics on `/metrics` (when enabled):

```bash
curl http://localhost:9090/metrics
```

## API Reference

### gRPC Methods

#### StreamTranscribe
```protobuf
rpc StreamTranscribe(stream AudioChunk) returns (stream TranscriptionResult);
```

Bidirectional streaming for real-time transcription. Send audio chunks, receive transcriptions.
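
From Python, the bidirectional call takes an iterator of `AudioChunk` messages and yields results as they arrive. A sketch following the convention the Rust clients use (a config-only first chunk, then raw PCM16 payloads); the stub and message names assume the generated `transcription_pb2*` modules:

```python
import grpc
import transcription_pb2
import transcription_pb2_grpc

def chunks(pcm16: bytes):
    # First chunk carries only the session configuration.
    yield transcription_pb2.AudioChunk(
        session_id="readme-example",
        config=transcription_pb2.AudioConfig(
            language="auto", task="transcribe", model="base",
            sample_rate=16000, vad_enabled=False,
        ),
    )
    for i in range(0, len(pcm16), 16000):  # 0.5 s of 16 kHz PCM16 per chunk
        yield transcription_pb2.AudioChunk(audio_data=pcm16[i:i + 16000])

channel = grpc.insecure_channel("localhost:50051")
stub = transcription_pb2_grpc.TranscriptionServiceStub(channel)
for result in stub.StreamTranscribe(chunks(bytes(32000))):  # 1 s of silence
    print(f"[{result.start_time:.2f}s - {result.end_time:.2f}s] {result.text}")
```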

#### TranscribeFile
```protobuf
rpc TranscribeFile(AudioFile) returns (TranscriptionResponse);
```

Transcribe a complete audio file in one request.
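
A one-shot call from Python might look like this (same assumed stubs; the `AudioFile` fields match the Rust `file-transcribe` example):

```python
import grpc
import transcription_pb2
import transcription_pb2_grpc

channel = grpc.insecure_channel("localhost:50051")
stub = transcription_pb2_grpc.TranscriptionServiceStub(channel)

with open("audio.wav", "rb") as f:
    response = stub.TranscribeFile(transcription_pb2.AudioFile(
        audio_data=f.read(),
        format="wav",
        config=transcription_pb2.AudioConfig(
            language="auto", task="transcribe", model="base",
            sample_rate=16000, vad_enabled=False,
        ),
    ))
print(response.detected_language, response.full_text)
```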

#### GetCapabilities
```protobuf
rpc GetCapabilities(Empty) returns (Capabilities);
```

Query available models, languages, and features.

#### HealthCheck
```protobuf
rpc HealthCheck(Empty) returns (HealthStatus);
```

Check service health and status.

## Language Support

Supports 50+ languages including:
- English (en)
- Spanish (es)
- French (fr)
- German (de)
- Italian (it)
- Portuguese (pt)
- Russian (ru)
- Chinese (zh)
- Japanese (ja)
- Korean (ko)
- And many more...

Use `"auto"` for automatic language detection.

## Troubleshooting

### Service won't start
- Check if ports 50051 and 8765 are available
- Ensure Docker has enough memory (minimum 4GB)
- Check logs: `docker compose logs transcription-api`

### Slow transcription
- Use a smaller model (tiny or base)
- Enable GPU if available
- Reduce audio quality to 16kHz mono
- Send smaller chunks more frequently

### Connection refused
- Check firewall settings
- Ensure the service is running: `docker compose ps`
- Verify correct ports in client configuration

### High memory usage
- Models are cached in memory for performance
- Use smaller models on memory-limited systems
- Set memory limits in docker-compose.yml

## Development

### Building from Source

```bash
# Install dependencies
pip install -r requirements.txt

# Generate gRPC code
python -m grpc_tools.protoc \
    -I./proto \
    --python_out=./src \
    --grpc_python_out=./src \
    ./proto/transcription.proto

# Run the service
python src/transcription_server.py
```

### Running Tests

```bash
# Test gRPC connection
grpcurl -plaintext localhost:50051 list

# Test health check
grpcurl -plaintext localhost:50051 transcription.TranscriptionService/HealthCheck

# Test with example audio
python test_client.py
```

## Production Deployment

### Docker Swarm

```bash
docker stack deploy -c docker-compose.yml transcription
```

### Kubernetes

```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: transcription-api
spec:
  replicas: 3
  selector:
    matchLabels:
      app: transcription-api
  template:
    metadata:
      labels:
        app: transcription-api
    spec:
      containers:
        - name: transcription-api
          image: transcription-api:latest
          ports:
            - containerPort: 50051
              name: grpc
            - containerPort: 8765
              name: websocket
          env:
            - name: MODEL_PATH
              value: "base"
          resources:
            requests:
              memory: "4Gi"
              cpu: "2"
            limits:
              memory: "8Gi"
              cpu: "4"
```

### Security

For production:
1. Enable TLS for gRPC
2. Use WSS for WebSocket
3. Add authentication
4. Add rate limiting
5. Validate all input

## License

MIT License - See LICENSE file for details

## Contributing

Contributions welcome! Please read CONTRIBUTING.md for guidelines.

## Support

- GitHub Issues: [Report bugs or request features]
- Documentation: [Full API documentation]
- Examples: See `examples/` directory
build.sh (Executable file, 78 lines)
@@ -0,0 +1,78 @@
#!/bin/bash
# Build script with options for different configurations

set -e

# Default values
DOCKERFILE="Dockerfile"
USE_CACHE=true
PLATFORM="linux/amd64"

# Parse arguments
while [[ $# -gt 0 ]]; do
    case $1 in
        --pytorch)
            DOCKERFILE="Dockerfile.pytorch"
            echo "Using PyTorch base image (faster build)"
            shift
            ;;
        --cuda)
            DOCKERFILE="Dockerfile"
            echo "Using NVIDIA CUDA base image"
            shift
            ;;
        --no-cache)
            USE_CACHE=false
            echo "Building without cache"
            shift
            ;;
        --platform)
            PLATFORM="$2"
            echo "Building for platform: $PLATFORM"
            shift 2
            ;;
        --help)
            echo "Usage: ./build.sh [options]"
            echo "Options:"
            echo "  --pytorch    Use PyTorch base image (fastest)"
            echo "  --cuda       Use NVIDIA CUDA base image (default)"
            echo "  --no-cache   Build without using cache"
            echo "  --platform   Target platform (default: linux/amd64)"
            echo "  --help       Show this help message"
            exit 0
            ;;
        *)
            echo "Unknown option: $1"
            exit 1
            ;;
    esac
done

# Build command
BUILD_CMD="docker build"

if [ "$USE_CACHE" = false ]; then
    BUILD_CMD="$BUILD_CMD --no-cache"
fi

BUILD_CMD="$BUILD_CMD --platform $PLATFORM -f $DOCKERFILE -t transcription-api:latest ."

echo "Building transcription-api..."
echo "Command: $BUILD_CMD"

# Execute build; test the exit status directly ("set -e" would otherwise
# abort before a separate $? check could report the failure)
if eval "$BUILD_CMD"; then
    echo ""
    echo "Build successful!"
    echo ""
    echo "To run the service:"
    echo "  docker compose up -d"
    echo ""
    echo "Or with GPU support:"
    echo "  docker compose --profile gpu up -d"
else
    echo "Build failed!"
    exit 1
fi
docker-compose.yml (Normal file, 105 lines)
@@ -0,0 +1,105 @@
services:
  # Main service with GPU support (if available)
  # For CPU-only: Use --profile cpu or set CUDA_VISIBLE_DEVICES=""
  transcription-api:
    build:
      context: .
      dockerfile: ${DOCKERFILE:-Dockerfile}  # Can use Dockerfile.pytorch for faster builds
    container_name: transcription-api
    environment:
      # Model configuration
      - MODEL_PATH=${MODEL_PATH:-large-v3}  # Options: tiny, base, small, medium, large, large-v2, large-v3

      # Model cache paths (shared with main project)
      - HF_HOME=/app/models
      - TORCH_HOME=/app/models
      - TRANSFORMERS_CACHE=/app/models

      # Server ports
      - GRPC_PORT=50051
      - WEBSOCKET_PORT=8765
      - ENABLE_WEBSOCKET=true

      # Performance tuning
      - OMP_NUM_THREADS=4
      - CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0}  # GPU 0 by default

    ports:
      - "50051:50051"  # gRPC port
      - "8765:8765"    # WebSocket port

    volumes:
      # Model cache - prevents re-downloading models
      - whisper-models:/app/models

      # Optional: Mount SimulStreaming if available
      # - ./SimulStreaming:/app/SimulStreaming

    restart: unless-stopped

    # Resource limits (adjust based on your system)
    deploy:
      resources:
        limits:
          cpus: '4'
          memory: 8G
        reservations:
          cpus: '2'
          memory: 4G
          # GPU support (requires nvidia-docker2 or Docker 19.03+)
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]

    # Health check (sync grpc channels need channel_ready_future)
    healthcheck:
      test: ["CMD", "python", "-c", "import grpc; channel = grpc.insecure_channel('localhost:50051'); grpc.channel_ready_future(channel).result(timeout=5)"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s

  # CPU-only service (for systems without GPU)
  transcription-api-cpu:
    profiles: ["cpu"]  # Only start with --profile cpu
    build:
      context: .
      dockerfile: ${DOCKERFILE:-Dockerfile.pytorch}
    container_name: transcription-api-cpu
    environment:
      - MODEL_PATH=${MODEL_PATH:-base}  # Smaller model for CPU
      - HF_HOME=/app/models
      - TORCH_HOME=/app/models
      - TRANSFORMERS_CACHE=/app/models
      - GRPC_PORT=50051
      - WEBSOCKET_PORT=8765
      - ENABLE_WEBSOCKET=true
      - CUDA_VISIBLE_DEVICES=  # No GPU
    ports:
      - "50051:50051"
      - "8765:8765"
    volumes:
      - whisper-models:/app/models
    deploy:
      resources:
        limits:
          cpus: '4'
          memory: 8G
        reservations:
          cpus: '2'
          memory: 4G
        # No GPU devices for CPU profile
    healthcheck:
      test: ["CMD", "python", "-c", "import grpc; channel = grpc.insecure_channel('localhost:50051'); grpc.channel_ready_future(channel).result(timeout=5)"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s

volumes:
  # Share models with the main transcription project
  # This references the volume from the parent project
  whisper-models:
    external: true
    name: real-time-transcriptions_whisper-models
entrypoint.sh (Normal file, 59 lines)
@@ -0,0 +1,59 @@
#!/bin/bash
set -e

# Download model if not already cached
# Whisper stores models as .pt files in the root of the cache directory
if [ ! -z "$MODEL_PATH" ]; then
    MODEL_FILE="/app/models/$MODEL_PATH.pt"

    # Check if model file exists
    if [ ! -f "$MODEL_FILE" ]; then
        echo "Model $MODEL_PATH not found at $MODEL_FILE, downloading..."
        python -c "
import whisper
import os
# Set all cache paths to use the shared volume
os.environ['TORCH_HOME'] = '/app/models'
os.environ['HF_HOME'] = '/app/models'
os.environ['TRANSFORMERS_CACHE'] = '/app/models'
os.environ['XDG_CACHE_HOME'] = '/app/models'
model_name = '$MODEL_PATH'
print(f'Downloading model {model_name}...')
model = whisper.load_model(model_name, download_root='/app/models')
print(f'Model {model_name} downloaded and cached successfully')
"
    else
        echo "Model $MODEL_PATH already cached at $MODEL_FILE"
        # Just verify it loads properly
        python -c "
import whisper
import os
os.environ['TORCH_HOME'] = '/app/models'
os.environ['XDG_CACHE_HOME'] = '/app/models'
model = whisper.load_model('$MODEL_PATH', download_root='/app/models')
print('Model $MODEL_PATH loaded successfully from cache')
"
    fi
fi

# Generate gRPC code if not already generated
if [ ! -f "/app/src/transcription_pb2.py" ]; then
    echo "Generating gRPC code from proto files..."
    python -m grpc_tools.protoc \
        -I/app/proto \
        --python_out=/app/src \
        --grpc_python_out=/app/src \
        /app/proto/transcription.proto

    # Fix imports in generated files (keep absolute import)
    # No need to modify - the generated import should work as-is
fi

# Start the transcription server
echo "Starting Transcription API Server..."
echo "gRPC Port: $GRPC_PORT"
echo "WebSocket Port: $WEBSOCKET_PORT (Enabled: $ENABLE_WEBSOCKET)"
echo "Model: $MODEL_PATH"

cd /app/src
exec python transcription_server.py
examples/rust-client/Cargo.toml (Normal file, 42 lines)
@@ -0,0 +1,42 @@
[package]
name = "transcription-client"
version = "0.1.0"
edition = "2021"

[dependencies]
# gRPC and Protocol Buffers
tonic = "0.10"
prost = "0.12"
tokio = { version = "1.35", features = ["full"] }
tokio-stream = "0.1"

# Audio handling
hound = "3.5"   # WAV file reading/writing
cpal = "0.15"   # Cross-platform audio capture
rodio = "0.17"  # Audio playback

# Utilities
anyhow = "1.0"
tracing = "0.1"
tracing-subscriber = "0.3"
clap = { version = "4.4", features = ["derive"] }
futures-util = "0.3"

[build-dependencies]
tonic-build = "0.10"

[[bin]]
name = "stream-transcribe"
path = "src/stream_transcribe.rs"

[[bin]]
name = "file-transcribe"
path = "src/file_transcribe.rs"

[[bin]]
name = "live-transcribe"
path = "src/live_transcribe.rs"

[[bin]]
name = "realtime-playback"
path = "src/realtime_playback.rs"
examples/rust-client/README.md (Normal file, 126 lines)
@@ -0,0 +1,126 @@
# Rust Transcription Client Examples

This directory contains Rust client examples for the Transcription API service.

## Available Clients

### 1. `file-transcribe` - File Transcription
Transcribes audio files either by sending the entire file at once or by streaming it in real-time chunks.

```bash
# Send entire file at once (fast, but no real-time feedback)
cargo run --bin file-transcribe -- audio.wav

# Stream file in chunks for real-time transcription (like YouTube)
cargo run --bin file-transcribe -- audio.wav --stream

# With VAD (Voice Activity Detection) to filter silence
cargo run --bin file-transcribe -- audio.wav --stream --vad

# Specify model and language
cargo run --bin file-transcribe -- audio.wav --stream --model large-v3 --language en
```

### 2. `realtime-playback` - Play Audio with Live Transcription
Plays audio through your speakers while showing real-time transcriptions, similar to YouTube's live captions.

```bash
# Basic usage - plays audio and shows transcriptions
cargo run --bin realtime-playback -- audio.wav

# With timestamps for each transcription
cargo run --bin realtime-playback -- audio.wav --timestamps

# With VAD to reduce noise transcriptions
cargo run --bin realtime-playback -- audio.wav --vad

# Using a specific model
cargo run --bin realtime-playback -- audio.wav --model large-v3
```

### 3. `stream-transcribe` - Stream WAV Files
Streams WAV files chunk by chunk for transcription.

```bash
# Stream without delays (fast processing)
cargo run --bin stream-transcribe -- audio.wav

# Simulate real-time streaming with delays
cargo run --bin stream-transcribe -- audio.wav --realtime
```

### 4. `live-transcribe` - Live Microphone Transcription
Captures audio from your microphone and transcribes it in real-time.

```bash
# Use default microphone
cargo run --bin live-transcribe

# Specify server and language
cargo run --bin live-transcribe -- --server http://localhost:50051 --language en
```

## Building

```bash
# Build all binaries
cargo build --release

# Build a specific binary
cargo build --release --bin realtime-playback
```

## Common Options

All clients support these common options:
- `--server <URL>` - gRPC server address (default: http://localhost:50051)
- `--language <code>` - Language code: en, es, fr, de, etc., or "auto" (default: auto)
- `--model <name>` - Model to use: tiny, base, small, medium, large-v3 (default: base)
- `--vad` - Enable Voice Activity Detection to filter silence

## Features

### Real-time Streaming
The `--stream` flag in `file-transcribe` and the `realtime-playback` binary both support real-time streaming, which means:
- Audio is sent in small chunks (0.5 second intervals)
- Transcriptions appear as the audio is being processed
- The experience is similar to YouTube's live captions
- Latency is lower than sending the entire file at once

### Voice Activity Detection (VAD)
When `--vad` is enabled, the service will:
- Filter out silence and background noise
- Reduce false transcriptions (like repeated "Thank you")
- Improve transcription quality for speech-only content

### Audio Playback
The `realtime-playback` binary uses the `rodio` library to:
- Play audio through your system's default audio output
- Synchronize playback with the transcription display
- Support multiple audio formats (WAV, MP3, FLAC, etc.)

## Requirements

- Rust 1.70 or later
- The Transcription API server running (usually on localhost:50051)
- For live transcription: a working microphone
- For playback: an audio output device (speakers/headphones)

## Troubleshooting

### "Connection refused" error
Make sure the Transcription API server is running:
```bash
cd ../../
docker compose up
```

### No audio playback
- Check your system's default audio output device
- Ensure the audio file format is supported (WAV, MP3, FLAC)
- Try a different audio file

### Poor transcription quality
- Use a larger model (e.g., `--model large-v3`)
- Enable VAD to filter noise (`--vad`)
- Ensure audio quality is good (16kHz or higher recommended)
examples/rust-client/build.rs (Normal file, 10 lines)
@@ -0,0 +1,10 @@
fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Compile protobuf files
    tonic_build::configure()
        .build_server(false) // We only need the client
        .compile(
            &["../../proto/transcription.proto"],
            &["../../proto"],
        )?;
    Ok(())
}
examples/rust-client/src/file_transcribe.rs (Normal file, 232 lines)
@@ -0,0 +1,232 @@
//! File transcription using gRPC
//!
//! This example shows how to transcribe an audio file.
//! Use the --stream flag for real-time streaming instead of sending the entire file.

use anyhow::Result;
use clap::Parser;
use futures_util::StreamExt;
use std::fs;
use std::time::Duration;
use tokio::sync::mpsc;
use tokio::time;
use tokio_stream::wrappers::ReceiverStream;
use tonic::transport::Channel;
use tracing::{debug, info};

// Import generated protobuf types
pub mod transcription {
    tonic::include_proto!("transcription");
}

use transcription::{
    transcription_service_client::TranscriptionServiceClient, AudioChunk, AudioConfig, AudioFile,
};

#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
struct Args {
    /// Audio file path
    file: String,

    /// gRPC server address
    #[arg(short, long, default_value = "http://localhost:50051")]
    server: String,

    /// Language code (e.g., "en", "es", "auto")
    #[arg(short, long, default_value = "auto")]
    language: String,

    /// Task: transcribe or translate
    #[arg(short, long, default_value = "transcribe")]
    task: String,

    /// Model to use
    #[arg(short, long, default_value = "base")]
    model: String,

    /// Stream the file in chunks for real-time transcription
    #[arg(long)]
    stream: bool,

    /// Enable VAD (Voice Activity Detection)
    #[arg(short, long)]
    vad: bool,
}

#[tokio::main]
async fn main() -> Result<()> {
    // Initialize logging
    tracing_subscriber::fmt::init();

    let args = Args::parse();

    info!("Reading audio file: {}", args.file);

    info!("Connecting to transcription service at {}", args.server);
    let mut client = TranscriptionServiceClient::connect(args.server.clone()).await?;

    if args.stream {
        // Stream mode - send file in chunks for real-time transcription
        stream_file(&mut client, &args).await?;
    } else {
        // Normal mode - send entire file at once
        transcribe_entire_file(&mut client, &args).await?;
    }

    Ok(())
}

async fn transcribe_entire_file(
    client: &mut TranscriptionServiceClient<Channel>,
    args: &Args,
) -> Result<()> {
    let audio_data = fs::read(&args.file)?;

    // Determine format from extension
    let format = match args.file.split('.').last() {
        Some("wav") => "wav",
        Some("mp3") => "mp3",
        Some("webm") => "webm",
        _ => "wav", // Default to WAV
    };

    let config = AudioConfig {
        language: args.language.clone(),
        task: args.task.clone(),
        model: args.model.clone(),
        sample_rate: 16000,
        vad_enabled: args.vad,
    };

    let request = AudioFile {
        audio_data,
        format: format.to_string(),
        config: Some(config),
    };

    info!("Sending entire file for transcription...");
    let response = client.transcribe_file(request).await?;
    let result = response.into_inner();

    println!("\n=== Transcription Results ===");
    println!("Language: {}", result.detected_language);
    println!("Duration: {:.2} seconds", result.duration_seconds);
    println!("\nFull Text:");
    println!("{}", result.full_text);

    if !result.segments.is_empty() {
        println!("\n=== Segments ===");
        for (i, segment) in result.segments.iter().enumerate() {
            println!(
                "[{:03}] {:.2}s - {:.2}s (conf: {:.2}): {}",
                i + 1,
                segment.start_time,
                segment.end_time,
                segment.confidence,
                segment.text
            );
        }
    }

    Ok(())
}

async fn stream_file(
    client: &mut TranscriptionServiceClient<Channel>,
    args: &Args,
) -> Result<()> {
    let audio_data = fs::read(&args.file)?;

    info!("Streaming file in real-time chunks...");

    // Create channel for audio chunks
    let (tx, rx) = mpsc::channel::<AudioChunk>(100);

    // Spawn task to send audio chunks
    let tx_clone = tx.clone();
    let language = args.language.clone();
    let task = args.task.clone();
    let model = args.model.clone();
    let vad = args.vad;

    tokio::spawn(async move {
        // Send configuration first
        let config = AudioConfig {
            language,
            task,
            model,
            sample_rate: 16000,
            vad_enabled: vad,
        };

        let config_chunk = AudioChunk {
            audio_data: vec![],
            session_id: "file-stream".to_string(),
            config: Some(config),
        };

        if tx_clone.send(config_chunk).await.is_err() {
            return;
        }

        // Assuming PCM16 audio at 16kHz
        // Send in 3 second chunks for better accuracy (96000 bytes = 48000 samples = 3 seconds)
        let chunk_size = 96000;

        for (idx, chunk) in audio_data.chunks(chunk_size).enumerate() {
            let audio_chunk = AudioChunk {
                audio_data: chunk.to_vec(),
                session_id: String::new(),
                config: None,
            };

            debug!("Sending chunk {} ({} bytes)", idx, chunk.len());

            if tx_clone.send(audio_chunk).await.is_err() {
                break;
            }

            // Simulate real-time streaming (3 seconds per chunk)
            time::sleep(Duration::from_secs(3)).await;
        }

        info!("Finished streaming all chunks");
    });

    // Drop the original sender so the request stream half-closes once the
    // spawned task finishes; otherwise the server never sees end-of-input
    drop(tx);

    // Create stream and start transcription
    let stream = ReceiverStream::new(rx);
    let response = client.stream_transcribe(stream).await?;
    let mut result_stream = response.into_inner();

    println!("\n=== Real-time Transcription ===");
    println!("Streaming and transcribing...\n");

    let mut full_transcript = String::new();

    // Process results
    while let Some(result) = result_stream.next().await {
        match result {
            Ok(transcription) => {
                println!(
                    "[{:06.2}s - {:06.2}s] {}",
                    transcription.start_time, transcription.end_time, transcription.text
                );

                if transcription.is_final {
                    full_transcript.push_str(&transcription.text);
                    full_transcript.push(' ');
                }
            }
            Err(e) => {
                eprintln!("Error: {}", e);
                break;
            }
        }
    }

    println!("\n=== Full Transcript ===");
    println!("{}", full_transcript.trim());

    Ok(())
}
examples/rust-client/src/live_transcribe.rs (Normal file, 188 lines)
@@ -0,0 +1,188 @@
//! Live microphone transcription using gRPC streaming
//!
//! This example shows how to capture audio from the microphone
//! and stream it to the transcription service in real-time.

use anyhow::Result;
use clap::Parser;
use cpal::traits::{DeviceTrait, HostTrait, StreamTrait};
use futures_util::StreamExt;
use std::sync::{Arc, Mutex};
use tokio::sync::mpsc as tokio_mpsc;
use tokio_stream::wrappers::ReceiverStream;
use tracing::{error, info};

// Import generated protobuf types
pub mod transcription {
    tonic::include_proto!("transcription");
}

use transcription::{
    transcription_service_client::TranscriptionServiceClient, AudioChunk, AudioConfig,
};

#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
struct Args {
    /// gRPC server address
    #[arg(short, long, default_value = "http://localhost:50051")]
    server: String,

    /// Language code (e.g., "en", "es", "auto")
    #[arg(short, long, default_value = "auto")]
    language: String,

    /// Task: transcribe or translate
    #[arg(short, long, default_value = "transcribe")]
    task: String,

    /// Model to use
    #[arg(short, long, default_value = "base")]
    model: String,

    /// Session ID
    #[arg(long)]
    session_id: Option<String>,
}

#[tokio::main]
async fn main() -> Result<()> {
    // Initialize logging
    tracing_subscriber::fmt::init();

    let args = Args::parse();

    info!("Connecting to transcription service at {}", args.server);
    let mut client = TranscriptionServiceClient::connect(args.server).await?;

    // Create channel for audio data
    let (audio_tx, audio_rx) = tokio_mpsc::channel::<AudioChunk>(100);

    // Start audio capture in a separate thread
    let audio_tx_clone = audio_tx.clone();
    std::thread::spawn(move || {
        if let Err(e) = capture_audio(audio_tx_clone) {
            error!("Audio capture error: {}", e);
        }
    });

    // Send initial configuration
    let session_id = args.session_id.unwrap_or_else(|| {
        format!(
            "rust-client-{}",
            std::time::SystemTime::now()
                .duration_since(std::time::UNIX_EPOCH)
                .unwrap()
                .as_secs()
        )
    });

    info!("Starting transcription session: {}", session_id);

    // Create the first chunk with configuration
    let config = AudioConfig {
        language: args.language.clone(),
        task: args.task.clone(),
        model: args.model.clone(),
        sample_rate: 16000,
        vad_enabled: false,
    };

    // Send a configuration chunk first
    let config_chunk = AudioChunk {
        audio_data: vec![],
        session_id: session_id.clone(),
        config: Some(config),
    };

    audio_tx.send(config_chunk).await?;

    // Create stream from receiver
    let audio_stream = ReceiverStream::new(audio_rx);

    // Start bidirectional streaming
    let response = client.stream_transcribe(audio_stream).await?;
    let mut stream = response.into_inner();

    info!("Listening... Press Ctrl+C to stop");

    // Process transcription results
    while let Some(result) = stream.next().await {
        match result {
            Ok(transcription) => {
                if transcription.is_final {
                    println!("\n[FINAL] {}", transcription.text);
                } else {
                    print!("\r[PARTIAL] {} ", transcription.text);
                    use std::io::{self, Write};
                    io::stdout().flush()?;
                }
            }
            Err(e) => {
                error!("Transcription error: {}", e);
                break;
            }
        }
    }

    Ok(())
}

/// Capture audio from the default microphone
fn capture_audio(tx: tokio_mpsc::Sender<AudioChunk>) -> Result<()> {
    let host = cpal::default_host();
    let device = host
        .default_input_device()
        .ok_or_else(|| anyhow::anyhow!("No input device available"))?;

    info!("Using audio device: {}", device.name()?);

    // Configure audio capture for 16kHz mono PCM16
    let config = cpal::StreamConfig {
        channels: 1,
        sample_rate: cpal::SampleRate(16000),
        buffer_size: cpal::BufferSize::Default,
    };

    // Buffer to accumulate audio samples
    let buffer = Arc::new(Mutex::new(Vec::new()));
    let buffer_clone = buffer.clone();

    // Create audio stream
    let stream = device.build_input_stream(
        &config,
        move |data: &[i16], _: &cpal::InputCallbackInfo| {
            let mut buf = buffer_clone.lock().unwrap();
            buf.extend_from_slice(data);

            // Send chunks of ~0.5 seconds (8000 samples at 16kHz)
            while buf.len() >= 8000 {
                let chunk: Vec<i16> = buf.drain(..8000).collect();

                // Convert i16 to bytes
                let bytes: Vec<u8> = chunk
                    .iter()
                    .flat_map(|&sample| sample.to_le_bytes())
                    .collect();

                // Send audio chunk
                let audio_chunk = AudioChunk {
                    audio_data: bytes,
                    session_id: String::new(), // Already set in config chunk
                    config: None,
                };

                // Use blocking send since we're in a non-async context
                if let Err(e) = tx.blocking_send(audio_chunk) {
                    error!("Failed to send audio chunk: {}", e);
                }
            }
        },
        move |err| {
            error!("Audio stream error: {}", err);
        },
        None,
    )?;

    stream.play()?;

    // Keep the stream alive
    std::thread::park();

    Ok(())
}
216
examples/rust-client/src/realtime_playback.rs
Normal file
216
examples/rust-client/src/realtime_playback.rs
Normal file
@@ -0,0 +1,216 @@
//! Real-time audio playback with synchronized transcription
//!
//! This example plays an audio file while streaming it for transcription,
//! showing transcriptions in real-time similar to YouTube.

use anyhow::Result;
use clap::Parser;
use futures_util::StreamExt;
use rodio::{Decoder, OutputStream, Source};
use std::fs::File;
use std::io::BufReader;
use std::time::{Duration, Instant};
use tokio::sync::mpsc;
use tokio::time;
use tokio_stream::wrappers::ReceiverStream;
use tracing::{info, debug};
use hound::WavReader;

// Import generated protobuf types
pub mod transcription {
    tonic::include_proto!("transcription");
}

use transcription::{
    transcription_service_client::TranscriptionServiceClient, AudioChunk, AudioConfig,
};

#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
struct Args {
    /// Audio file path (WAV, MP3, FLAC, etc.)
    file: String,

    /// gRPC server address
    #[arg(short, long, default_value = "http://localhost:50051")]
    server: String,

    /// Language code (e.g., "en", "es", "auto")
    #[arg(short, long, default_value = "auto")]
    language: String,

    /// Model to use
    #[arg(short, long, default_value = "base")]
    model: String,

    /// Enable VAD (Voice Activity Detection)
    #[arg(short, long)]
    vad: bool,

    /// Show timestamps
    #[arg(short = 't', long)]
    timestamps: bool,
}

#[tokio::main]
async fn main() -> Result<()> {
    // Initialize logging
    tracing_subscriber::fmt::init();

    let args = Args::parse();
    let file_path = args.file.clone();

    info!("Loading audio file: {}", file_path);

    // Start audio playback in a separate thread
    let (_stream, stream_handle) = OutputStream::try_default()?;
    let file = BufReader::new(File::open(&file_path)?);
    let source = Decoder::new(file)?;
    let sample_rate = source.sample_rate();
    let channels = source.channels();
    // Convert to f32 samples for playback
    let source = source.convert_samples::<f32>();

    info!("Audio format: {} Hz, {} channels", sample_rate, channels);

    // Also open the file for streaming to transcription service
    // We need to read the raw audio data for transcription
    let mut wav_reader = WavReader::open(&file_path)?;
    let wav_spec = wav_reader.spec();

    // Collect all samples for streaming
    let samples: Vec<i16> = wav_reader.samples::<i16>()
        .collect::<hound::Result<Vec<_>>>()?;

    info!("Connecting to transcription service at {}", args.server);
    let mut client = TranscriptionServiceClient::connect(args.server.clone()).await?;

    // Create channel for audio chunks
    let (tx, rx) = mpsc::channel::<AudioChunk>(100);

    // Calculate chunk duration for synchronization
    let chunk_samples = 16000 * 3; // 3 second chunks at 16kHz for better accuracy
    let chunk_duration = Duration::from_secs(3);

    // Start playback
    println!("\n🎵 Starting audio playback with real-time transcription...\n");
    println!("{}", "─".repeat(80));

    let start_time = Instant::now();

    // Play audio
    stream_handle.play_raw(source)?;

    // Spawn task to stream audio chunks to transcription service
    let tx_clone = tx.clone();
    let show_timestamps = args.timestamps;
    tokio::spawn(async move {
        // Send configuration first
        let config = AudioConfig {
            language: args.language.clone(),
            task: "transcribe".to_string(),
            model: args.model.clone(),
            sample_rate: 16000,
            vad_enabled: args.vad,
        };

        let config_chunk = AudioChunk {
            audio_data: vec![],
            session_id: "realtime-playback".to_string(),
            config: Some(config),
        };

        if tx_clone.send(config_chunk).await.is_err() {
            return;
        }

        // Stream audio chunks synchronized with playback
        for (chunk_idx, chunk) in samples.chunks(chunk_samples).enumerate() {
            let chunk_start = Instant::now();

            // Convert samples to bytes
            let bytes: Vec<u8> = chunk.iter()
                .flat_map(|&s| s.to_le_bytes())
                .collect();

            let audio_chunk = AudioChunk {
                audio_data: bytes,
                session_id: String::new(),
                config: None,
            };

            debug!("Sending chunk {} ({} samples)", chunk_idx, chunk.len());

            if tx_clone.send(audio_chunk).await.is_err() {
                break;
            }

            // Synchronize with playback timing
            let elapsed = chunk_start.elapsed();
            if elapsed < chunk_duration {
                time::sleep(chunk_duration - elapsed).await;
            }
        }

        info!("Finished streaming audio chunks");
    });

    // Create stream and start transcription
    let stream = ReceiverStream::new(rx);
    let response = client.stream_transcribe(stream).await?;
    let mut result_stream = response.into_inner();

    // Process transcription results
    let mut last_text = String::new();
    let mut current_line = String::new();

    while let Some(result) = result_stream.next().await {
        match result {
            Ok(transcription) => {
                let elapsed = start_time.elapsed().as_secs_f32();

                // Clear previous line if text has changed significantly
                if !transcription.text.is_empty() && transcription.text != last_text {
                    if show_timestamps {
                        // Show with timestamps
                        println!("[{:06.2}s] {}",
                            elapsed,
                            transcription.text);
                    } else {
                        // Update current line for continuous display
                        if transcription.is_final {
                            // Final transcription for this segment
                            println!("{}", transcription.text);
                            current_line.clear();
                        } else {
                            // Interim result - update in place
                            print!("\r{:<80}", transcription.text);
                            use std::io::{self, Write};
                            io::stdout().flush()?;
                            current_line = transcription.text.clone();
                        }
                    }

                    last_text = transcription.text.clone();
                }
            }
            Err(e) => {
                eprintln!("\nTranscription error: {}", e);
                break;
            }
        }
    }

    // Clear any remaining interim text
    if !current_line.is_empty() {
        println!();
    }

    println!("\n{}", "─".repeat(80));
    println!("✅ Playback and transcription complete!");

    // Keep the program alive until playback finishes
    time::sleep(Duration::from_secs(2)).await;

    Ok(())
}
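The pacing loop in the sender task above is the heart of this example: each 3-second chunk is sent, then the task sleeps off the rest of the 3-second budget so transcription stays roughly in step with playback. A minimal Python sketch of the same pacing idea (the `paced_chunks` helper and names are illustrative, not part of this commit; assumes mono PCM16 at 16 kHz):

import asyncio, time

SAMPLE_RATE = 16000
CHUNK_SECONDS = 3
CHUNK_BYTES = SAMPLE_RATE * CHUNK_SECONDS * 2  # PCM16 = 2 bytes per sample

async def paced_chunks(pcm: bytes):
    """Yield 3-second chunks no faster than real time."""
    for offset in range(0, len(pcm), CHUNK_BYTES):
        started = time.monotonic()
        yield pcm[offset:offset + CHUNK_BYTES]
        # Sleep off whatever remains of the 3-second budget
        remaining = CHUNK_SECONDS - (time.monotonic() - started)
        if remaining > 0:
            await asyncio.sleep(remaining)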
142
examples/rust-client/src/stream_transcribe.rs
Normal file
@@ -0,0 +1,142 @@
//! Stream WAV file for real-time transcription
//!
//! This example shows how to stream a WAV file chunk by chunk
//! to simulate real-time transcription.

use anyhow::Result;
use clap::Parser;
use futures_util::StreamExt;
use hound::WavReader;
use std::time::Duration;
use tokio::sync::mpsc;
use tokio::time;
use tokio_stream::wrappers::ReceiverStream;
use tracing::info;

// Import generated protobuf types
pub mod transcription {
    tonic::include_proto!("transcription");
}

use transcription::{
    transcription_service_client::TranscriptionServiceClient, AudioChunk, AudioConfig,
};

#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
struct Args {
    /// WAV file path
    file: String,

    /// gRPC server address
    #[arg(short, long, default_value = "http://localhost:50051")]
    server: String,

    /// Language code (e.g., "en", "es", "auto")
    #[arg(short, long, default_value = "auto")]
    language: String,

    /// Simulate real-time by adding delays
    #[arg(short, long)]
    realtime: bool,
}

#[tokio::main]
async fn main() -> Result<()> {
    // Initialize logging
    tracing_subscriber::fmt::init();

    let args = Args::parse();

    info!("Reading WAV file: {}", args.file);
    let mut reader = WavReader::open(&args.file)?;
    let spec = reader.spec();

    info!("WAV specs: {} Hz, {} channels, {} bits",
        spec.sample_rate, spec.channels, spec.bits_per_sample);

    // Collect samples
    let samples: Vec<i16> = reader.samples::<i16>()
        .collect::<hound::Result<Vec<_>>>()?;

    info!("Connecting to transcription service at {}", args.server);
    let mut client = TranscriptionServiceClient::connect(args.server).await?;

    // Create channel for audio chunks
    let (tx, rx) = mpsc::channel::<AudioChunk>(100);

    // Spawn task to send audio chunks
    let tx_clone = tx.clone();
    let realtime = args.realtime;
    tokio::spawn(async move {
        // Send configuration first
        let config = AudioConfig {
            language: args.language.clone(),
            task: "transcribe".to_string(),
            model: "base".to_string(),
            sample_rate: 16000,
            vad_enabled: false,
        };

        let config_chunk = AudioChunk {
            audio_data: vec![],
            session_id: "stream-test".to_string(),
            config: Some(config),
        };

        if tx_clone.send(config_chunk).await.is_err() {
            return;
        }

        // Send audio in chunks of 3 seconds for better accuracy (48000 samples at 16kHz)
        let chunk_size = 48000;
        for chunk in samples.chunks(chunk_size) {
            // Convert samples to bytes
            let bytes: Vec<u8> = chunk.iter()
                .flat_map(|&s| s.to_le_bytes())
                .collect();

            let audio_chunk = AudioChunk {
                audio_data: bytes,
                session_id: String::new(),
                config: None,
            };

            if tx_clone.send(audio_chunk).await.is_err() {
                break;
            }

            // Simulate real-time streaming
            if realtime {
                time::sleep(Duration::from_secs(3)).await;
            }
        }
    });

    // Create stream and start transcription
    let stream = ReceiverStream::new(rx);
    let response = client.stream_transcribe(stream).await?;
    let mut result_stream = response.into_inner();

    info!("Streaming audio and receiving transcriptions...");

    // Process results
    while let Some(result) = result_stream.next().await {
        match result {
            Ok(transcription) => {
                println!("[{:.2}s - {:.2}s] {}",
                    transcription.start_time,
                    transcription.end_time,
                    transcription.text);
            }
            Err(e) => {
                eprintln!("Error: {}", e);
                break;
            }
        }
    }

    Ok(())
}
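The `.flat_map(|&s| s.to_le_bytes())` step above serializes i16 samples as little-endian PCM16, which is the wire format both examples send. A rough Python equivalent using numpy (illustrative only, not part of this commit):

import numpy as np

samples = np.array([0, 1000, -1000, 32767], dtype=np.int16)
# '<i2' forces little-endian 16-bit ints, matching to_le_bytes() in Rust
pcm16_bytes = samples.astype('<i2').tobytes()
assert len(pcm16_bytes) == 2 * len(samples)  # 2 bytes per sample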
84
generate_proto.py
Normal file
@@ -0,0 +1,84 @@
#!/usr/bin/env python3
"""
Generate Python code from protobuf definitions
Run this before starting the service for the first time
"""

import os
import sys
import subprocess

def generate_proto():
    """Generate Python code from proto files"""
    proto_dir = "proto"
    output_dir = "src"

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Find all .proto files
    proto_files = [f for f in os.listdir(proto_dir) if f.endswith('.proto')]

    if not proto_files:
        print("No .proto files found in proto/ directory")
        return False

    for proto_file in proto_files:
        proto_path = os.path.join(proto_dir, proto_file)
        print(f"Generating code for {proto_file}...")

        # Generate Python code
        cmd = [
            sys.executable, "-m", "grpc_tools.protoc",
            f"-I{proto_dir}",
            f"--python_out={output_dir}",
            f"--grpc_python_out={output_dir}",
            proto_path
        ]

        try:
            subprocess.run(cmd, capture_output=True, text=True, check=True)
            print(f"✓ Generated {proto_file.replace('.proto', '_pb2.py')} and {proto_file.replace('.proto', '_pb2_grpc.py')}")
        except subprocess.CalledProcessError as e:
            print(f"✗ Failed to generate code for {proto_file}")
            print(f"  Error: {e.stderr}")
            return False

    # Fix imports in generated files
    print("Fixing imports in generated files...")
    grpc_file = os.path.join(output_dir, "transcription_pb2_grpc.py")
    if os.path.exists(grpc_file):
        with open(grpc_file, 'r') as f:
            content = f.read()

        # Fix relative import
        content = content.replace(
            "import transcription_pb2",
            "from . import transcription_pb2"
        )

        with open(grpc_file, 'w') as f:
            f.write(content)

        print("✓ Fixed imports")

    print("\nProtobuf generation complete!")
    print(f"Generated files are in {output_dir}/")
    return True

if __name__ == "__main__":
    # Check if grpcio-tools is installed
    try:
        import grpc_tools
    except ImportError:
        print("Error: grpcio-tools not installed")
        print("Run: pip install grpcio-tools")
        sys.exit(1)

    if generate_proto():
        print("\nYou can now start the service with:")
        print("  python src/transcription_server.py")
        print("Or with Docker:")
        print("  docker compose up")
    else:
        sys.exit(1)
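One caveat with the import rewrite above: transcription_server.py (later in this commit) puts src/ on sys.path and imports the generated modules top-level, and a bare `from . import transcription_pb2` raises ImportError outside a package. A guarded import tolerates both layouts (a sketch of the pattern, not what the script currently writes; the `transcription__pb2` alias mirrors grpcio's generated code):

try:
    # Works when src/ is imported as a package (e.g. `from src import ...`)
    from . import transcription_pb2 as transcription__pb2
except ImportError:
    # Works when src/ is on sys.path and modules are imported top-level
    import transcription_pb2 as transcription__pb2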
91
proto/transcription.proto
Normal file
@@ -0,0 +1,91 @@
syntax = "proto3";

package transcription;

// The transcription service provides real-time speech-to-text capabilities
service TranscriptionService {
  // Bidirectional streaming: send audio chunks, receive transcriptions
  rpc StreamTranscribe(stream AudioChunk) returns (stream TranscriptionResult);

  // Unary call for single audio file transcription
  rpc TranscribeFile(AudioFile) returns (TranscriptionResponse);

  // Get available models and languages
  rpc GetCapabilities(Empty) returns (Capabilities);

  // Health check
  rpc HealthCheck(Empty) returns (HealthStatus);
}

// Audio chunk for streaming
message AudioChunk {
  bytes audio_data = 1;   // PCM16 audio data (16-bit, 16kHz, mono)
  string session_id = 2;  // Optional session ID for tracking
  AudioConfig config = 3; // Optional config (only needed in first chunk)
}

// Audio configuration
message AudioConfig {
  string language = 1;    // Language code (e.g., "en", "es", "auto")
  string task = 2;        // "transcribe" or "translate"
  string model = 3;       // Model size: "tiny", "base", "small", "medium", "large-v3"
  int32 sample_rate = 4;  // Sample rate (default: 16000)
  bool vad_enabled = 5;   // Voice Activity Detection
}

// Transcription result for streaming
message TranscriptionResult {
  string text = 1;        // Transcribed text
  float start_time = 2;   // Start time in seconds
  float end_time = 3;     // End time in seconds
  bool is_final = 4;      // Is this a final result?
  float confidence = 5;   // Confidence score (0-1)
  string language = 6;    // Detected language
  string session_id = 7;  // Session ID for tracking
  int64 timestamp_ms = 8; // Server timestamp in milliseconds
}

// Complete audio file for transcription
message AudioFile {
  bytes audio_data = 1;   // Complete audio file data
  string format = 2;      // Format: "wav", "mp3", "webm", "raw_pcm16"
  AudioConfig config = 3; // Audio configuration
}

// Response for file transcription
message TranscriptionResponse {
  repeated TranscriptionSegment segments = 1;
  string full_text = 2;   // Complete transcription
  string detected_language = 3;
  float duration_seconds = 4;
}

// Transcription segment
message TranscriptionSegment {
  string text = 1;
  float start_time = 2;
  float end_time = 3;
  float confidence = 4;
}

// Service capabilities
message Capabilities {
  repeated string available_models = 1;
  repeated string supported_languages = 2;
  repeated string supported_formats = 3;
  int32 max_audio_length_seconds = 4;
  bool streaming_supported = 5;
  bool vad_supported = 6;
}

// Health status
message HealthStatus {
  bool healthy = 1;
  string status = 2;
  string model_loaded = 3;
  int64 uptime_seconds = 4;
  int32 active_sessions = 5;
}

// Empty message for requests without parameters
message Empty {}
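The streaming contract above is config-first: the initial AudioChunk carries the AudioConfig (and optionally a session_id), and later chunks carry only raw PCM16 bytes. A minimal synchronous Python client sketch against the generated stubs (assumes transcription_pb2/transcription_pb2_grpc were produced by generate_proto.py and a server is listening on localhost:50051; the helper names are illustrative):

import grpc
import transcription_pb2
import transcription_pb2_grpc

def chunks(pcm16: bytes, chunk_bytes: int = 16000 * 3 * 2):
    # First message: config only, no audio
    yield transcription_pb2.AudioChunk(
        session_id="example",
        config=transcription_pb2.AudioConfig(
            language="auto", task="transcribe", model="base",
            sample_rate=16000, vad_enabled=False,
        ),
    )
    # Then raw PCM16 audio, 3 seconds at a time
    for i in range(0, len(pcm16), chunk_bytes):
        yield transcription_pb2.AudioChunk(audio_data=pcm16[i:i + chunk_bytes])

def transcribe(pcm16: bytes):
    with grpc.insecure_channel("localhost:50051") as channel:
        stub = transcription_pb2_grpc.TranscriptionServiceStub(channel)
        for result in stub.StreamTranscribe(chunks(pcm16)):
            print(f"[{result.start_time:.2f}s - {result.end_time:.2f}s] {result.text}")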
25
requirements.txt
Normal file
@@ -0,0 +1,25 @@
# Core dependencies
grpcio==1.60.0
grpcio-tools==1.60.0
websockets==12.0
fastapi==0.109.0
uvicorn[standard]==0.27.0

# Audio processing
numpy==1.24.3
soundfile==0.12.1
librosa==0.10.1

# SimulStreaming/Whisper dependencies
torch>=2.0.0
transformers>=4.36.0
openai-whisper>=20231117

# Utilities
python-multipart==0.0.6
aiofiles==23.2.1
pydantic==2.5.3
python-dotenv==1.0.0

# Monitoring
prometheus-client==0.19.0
518
src/transcription_server.py
Normal file
@@ -0,0 +1,518 @@
#!/usr/bin/env python3
"""
Standalone Transcription Service with gRPC and WebSocket support
Optimized for real-time streaming transcription
"""

import os
import sys
import asyncio
import logging
import time
import json
import base64
from typing import Optional, AsyncIterator, Dict, List
from dataclasses import dataclass, asdict
from concurrent import futures
import threading
from datetime import datetime

# Add current directory to path for generated protobuf imports
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

import grpc
import numpy as np
import soundfile
import librosa
import torch

# Add SimulStreaming to path if available
simulstreaming_path = os.environ.get('SIMULSTREAMING_PATH', '/app/SimulStreaming')
if os.path.exists(simulstreaming_path):
    sys.path.insert(0, simulstreaming_path)
    USE_SIMULSTREAMING = True
    try:
        from simulstreaming_whisper import simulwhisper_args, simul_asr_factory
    except ImportError:
        USE_SIMULSTREAMING = False
        import whisper
else:
    USE_SIMULSTREAMING = False
    import whisper

# Import generated protobuf classes (produced by generate_proto.py)
from transcription_pb2 import (
    AudioChunk, AudioFile, TranscriptionResult, TranscriptionResponse,
    TranscriptionSegment, Capabilities, HealthStatus, Empty, AudioConfig
)
import transcription_pb2_grpc

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

SAMPLE_RATE = 16000
MAX_AUDIO_LENGTH = 30 * 60  # 30 minutes
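# Note on buffer math (added for clarity): at SAMPLE_RATE = 16000 with mono
# PCM16 audio, one second is 16000 samples * 2 bytes = 32,000 bytes, so the
# 3-second chunks used below are 96,000 bytes (48,000 samples).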
@dataclass
class TranscriptionSession:
    """Manages a single transcription session"""
    session_id: str
    config: AudioConfig
    audio_buffer: bytearray
    start_time: float
    last_activity: float
    transcriptions: List[dict]


class TranscriptionEngine:
    """Core transcription engine using Whisper or SimulStreaming"""

    def __init__(self, model_name: str = "large-v3"):
        self.model_name = model_name
        self.model = None
        self.processor = None
        self.online_processor = None
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.load_model()

    def load_model(self):
        """Load the transcription model"""
        if USE_SIMULSTREAMING:
            self._load_simulstreaming()
        else:
            self._load_whisper()

    def _load_simulstreaming(self):
        """Load SimulStreaming for real-time transcription"""
        global USE_SIMULSTREAMING
        try:
            import argparse
            parser = argparse.ArgumentParser()

            # Add SimulStreaming arguments
            simulwhisper_args(parser)

            args = parser.parse_args([
                '--model_path', self.model_name,
                '--lan', 'auto',
                '--task', 'transcribe',
                '--backend', 'whisper',
                '--min-chunk-size', '0.5',
                '--beams', '1',
            ])

            # Create processor
            self.processor, self.online_processor = simul_asr_factory(args)
            logger.info(f"Loaded SimulStreaming with model: {self.model_name}")

        except Exception as e:
            logger.error(f"Failed to load SimulStreaming: {e}")
            logger.info("Falling back to standard Whisper")
            # Without the `global` declaration above, this assignment would only
            # create a local variable and the fallback flag would never stick.
            USE_SIMULSTREAMING = False
            self._load_whisper()

    def _load_whisper(self):
        """Load standard Whisper model"""
        try:
            # whisper may not have been imported at module load time if
            # SimulStreaming was found; importing here covers the fallback path
            import whisper
            # Use the shared volume for model caching
            download_root = os.environ.get('TORCH_HOME', '/app/models')
            self.model = whisper.load_model(self.model_name, device=self.device, download_root=download_root)
            logger.info(f"Loaded Whisper model: {self.model_name} on {self.device} from {download_root}")
        except Exception as e:
            logger.error(f"Failed to load Whisper model: {e}")
            raise

    def is_speech(self, audio: np.ndarray, energy_threshold: float = 0.002, zero_crossing_threshold: int = 50) -> bool:
        """
        Simple Voice Activity Detection
        Returns True if the audio chunk likely contains speech
        """
        # Check if audio is too quiet (likely silence)
        energy = np.sqrt(np.mean(audio**2))
        if energy < energy_threshold:
            return False

        # Check zero crossing rate (helps distinguish speech from noise)
        zero_crossings = np.sum(np.abs(np.diff(np.sign(audio))) > 0)

        # Speech typically has moderate zero crossing rate
        # Pure noise tends to have very high zero crossing rate
        if zero_crossings > len(audio) * zero_crossing_threshold / SAMPLE_RATE:
            return False

        return True
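    # Worked example of the ZCR cutoff above (added for clarity): for a 3 s
    # chunk of 48,000 samples with the default zero_crossing_threshold of 50,
    # the limit is 48000 * 50 / 16000 = 150 zero crossings; chunks that cross
    # zero more often than that are treated as noise and skipped.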
    def transcribe_chunk(self, audio_data: bytes, language: str = "auto", vad_enabled: bool = True) -> Optional[dict]:
        """Transcribe a single audio chunk"""
        try:
            # Convert bytes to numpy array
            audio = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0

            # Check if audio contains speech (VAD) - only if enabled
            if vad_enabled:
                energy = np.sqrt(np.mean(audio**2))
                if not self.is_speech(audio):
                    logger.info(f"No speech detected in audio chunk (energy: {energy:.4f}), skipping transcription")
                    return None
                else:
                    logger.info(f"Speech detected in chunk (energy: {energy:.4f})")

            if USE_SIMULSTREAMING and self.online_processor:
                # Use SimulStreaming for real-time processing
                self.online_processor.insert_audio_chunk(audio)
                result = self.online_processor.process_iter()

                if result and result[0] is not None:
                    return {
                        'text': result[2],
                        'start_time': result[0],
                        'end_time': result[1],
                        'is_final': True,
                        'confidence': 0.95  # SimulStreaming doesn't provide confidence
                    }
            else:
                # Use standard Whisper
                if self.model:
                    # Pad audio to minimum length if needed
                    if len(audio) < SAMPLE_RATE:
                        audio = np.pad(audio, (0, SAMPLE_RATE - len(audio)))

                    # Use more conservative settings to reduce hallucinations
                    result = self.model.transcribe(
                        audio,
                        language=None if language == "auto" else language,
                        fp16=self.device == "cuda",
                        temperature=0.0,  # More deterministic, less hallucination
                        no_speech_threshold=0.6,  # Higher threshold for detecting non-speech
                        logprob_threshold=-1.0,  # Filter out low probability results
                        compression_ratio_threshold=2.4  # Filter out repetitive results
                    )

                    if result and result.get('text'):
                        text = result['text'].strip()

                        # Filter out common hallucinations
                        hallucination_phrases = [
                            "thank you", "thanks", "you", "uh", "um",
                            "thank you for watching", "please subscribe",
                            "bye", "bye-bye", ".", "...", ""
                        ]

                        # Check if the result is just a hallucination
                        text_lower = text.lower().strip()
                        if text_lower in hallucination_phrases:
                            logger.debug(f"Filtered out hallucination: {text}")
                            return None

                        # Check for repetitive text (another sign of hallucination)
                        words = text.lower().split()
                        if len(words) > 1 and len(set(words)) == 1:
                            logger.debug(f"Filtered out repetitive text: {text}")
                            return None

                        return {
                            'text': text,
                            'start_time': 0,
                            'end_time': len(audio) / SAMPLE_RATE,
                            'is_final': True,
                            'confidence': 0.9,
                            'language': result.get('language', language)
                        }

        except Exception as e:
            logger.error(f"Error transcribing chunk: {e}")

        return None
    def transcribe_file(self, audio_data: bytes, format: str, config: AudioConfig) -> dict:
        """Transcribe a complete audio file"""
        try:
            # Convert audio to numpy array based on format
            if format == "raw_pcm16":
                audio = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0
            else:
                # Use librosa for other formats
                import io
                audio, _ = librosa.load(io.BytesIO(audio_data), sr=SAMPLE_RATE)

            # Transcribe with Whisper
            if self.model:
                result = self.model.transcribe(
                    audio,
                    language=None if config.language == "auto" else config.language,
                    task=config.task or "transcribe",
                    fp16=self.device == "cuda"
                )

                segments = []
                for seg in result.get('segments', []):
                    segments.append({
                        'text': seg['text'].strip(),
                        'start_time': seg['start'],
                        'end_time': seg['end'],
                        'confidence': seg.get('avg_logprob', 0) + 1.0  # Convert to 0-1 range
                    })

                return {
                    'segments': segments,
                    'full_text': result['text'].strip(),
                    'detected_language': result.get('language', config.language),
                    'duration_seconds': len(audio) / SAMPLE_RATE
                }

        except Exception as e:
            logger.error(f"Error transcribing file: {e}")

        return {
            'segments': [],
            'full_text': '',
            'detected_language': 'unknown',
            'duration_seconds': 0
        }
class TranscriptionServicer(transcription_pb2_grpc.TranscriptionServiceServicer):
    """gRPC service implementation"""

    def __init__(self):
        self.engine = TranscriptionEngine()
        self.sessions: Dict[str, TranscriptionSession] = {}
        self.start_time = time.time()

    async def StreamTranscribe(self, request_iterator: AsyncIterator[AudioChunk],
                               context: grpc.aio.ServicerContext) -> AsyncIterator[TranscriptionResult]:
        """Bidirectional streaming transcription"""
        session_id = None
        config = None
        audio_buffer = bytearray()

        try:
            async for chunk in request_iterator:
                # Get session ID and config from first chunk
                if not session_id:
                    session_id = chunk.session_id or str(time.time())
                    # Proto3 submessages are never None in Python, so use
                    # HasField to detect whether the client actually sent one
                    if chunk.HasField('config'):
                        config = chunk.config
                    else:
                        config = AudioConfig(
                            language="auto",
                            task="transcribe"
                        )

                # Add audio to buffer
                audio_buffer.extend(chunk.audio_data)

                # Process when we have enough audio (3 seconds for better accuracy)
                min_bytes = int(SAMPLE_RATE * 3.0 * 2)  # 3 seconds of PCM16

                while len(audio_buffer) >= min_bytes:
                    # Extract chunk to process
                    audio_chunk = bytes(audio_buffer[:min_bytes])
                    audio_buffer = audio_buffer[min_bytes:]

                    # Transcribe
                    logger.debug(f"Processing audio chunk of {len(audio_chunk)} bytes")
                    result = self.engine.transcribe_chunk(
                        audio_chunk,
                        language=config.language,
                        vad_enabled=config.vad_enabled if config else False
                    )
                    logger.debug(f"Transcription result: {result}")

                    if result:
                        # Send transcription result
                        yield TranscriptionResult(
                            text=result['text'],
                            start_time=result['start_time'],
                            end_time=result['end_time'],
                            is_final=result['is_final'],
                            confidence=result.get('confidence', 0.9),
                            language=result.get('language', config.language),
                            session_id=session_id,
                            timestamp_ms=int(time.time() * 1000)
                        )

            # Process remaining audio
            if audio_buffer:
                result = self.engine.transcribe_chunk(
                    bytes(audio_buffer),
                    language=config.language,
                    vad_enabled=config.vad_enabled if config else False
                )

                if result:
                    yield TranscriptionResult(
                        text=result['text'],
                        start_time=result['start_time'],
                        end_time=result['end_time'],
                        is_final=True,
                        confidence=result.get('confidence', 0.9),
                        language=result.get('language', config.language),
                        session_id=session_id,
                        timestamp_ms=int(time.time() * 1000)
                    )

        except Exception as e:
            logger.error(f"Error in StreamTranscribe: {e}")
            # grpc.aio's abort is a coroutine and must be awaited
            await context.abort(grpc.StatusCode.INTERNAL, str(e))
    async def TranscribeFile(self, request: AudioFile, context: grpc.aio.ServicerContext) -> TranscriptionResponse:
        """Transcribe a complete audio file"""
        try:
            result = self.engine.transcribe_file(
                request.audio_data,
                request.format,
                request.config
            )

            segments = [
                TranscriptionSegment(
                    text=seg['text'],
                    start_time=seg['start_time'],
                    end_time=seg['end_time'],
                    confidence=seg['confidence']
                )
                for seg in result['segments']
            ]

            return TranscriptionResponse(
                segments=segments,
                full_text=result['full_text'],
                detected_language=result['detected_language'],
                duration_seconds=result['duration_seconds']
            )

        except Exception as e:
            logger.error(f"Error in TranscribeFile: {e}")
            await context.abort(grpc.StatusCode.INTERNAL, str(e))

    async def GetCapabilities(self, request: Empty, context: grpc.aio.ServicerContext) -> Capabilities:
        """Get service capabilities"""
        return Capabilities(
            available_models=["tiny", "base", "small", "medium", "large", "large-v2", "large-v3"],
            supported_languages=["auto", "en", "es", "fr", "de", "it", "pt", "ru", "zh", "ja", "ko"],
            supported_formats=["wav", "mp3", "webm", "raw_pcm16"],
            max_audio_length_seconds=MAX_AUDIO_LENGTH,
            streaming_supported=True,
            vad_supported=True  # Simple energy/ZCR VAD implemented in the engine
        )

    async def HealthCheck(self, request: Empty, context: grpc.aio.ServicerContext) -> HealthStatus:
        """Health check endpoint"""
        return HealthStatus(
            healthy=True,
            status="running",
            model_loaded=self.engine.model_name,
            uptime_seconds=int(time.time() - self.start_time),
            active_sessions=len(self.sessions)
        )
async def serve_grpc(port: int = 50051):
    """Start the gRPC server"""
    server = grpc.aio.server(
        futures.ThreadPoolExecutor(max_workers=10),
        options=[
            ('grpc.max_send_message_length', 100 * 1024 * 1024),  # 100MB
            ('grpc.max_receive_message_length', 100 * 1024 * 1024),
        ]
    )

    servicer = TranscriptionServicer()
    transcription_pb2_grpc.add_TranscriptionServiceServicer_to_server(servicer, server)

    server.add_insecure_port(f'[::]:{port}')
    await server.start()

    logger.info(f"gRPC server started on port {port}")
    await server.wait_for_termination()
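# WebSocket message protocol (summarized from the handler below):
#   client -> server: {"type": "audio", "data": "<base64 PCM16 @ 16 kHz>"}
#                     {"type": "stop"}
#   server -> client: {"type": "connected", "session_id": ...}
#                     {"type": "transcription", "text": ..., "is_final": ..., "timestamp": ...}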
# WebSocket support for compatibility
async def handle_websocket(websocket, path):
    """Handle WebSocket connections for compatibility"""
    import websockets

    engine = TranscriptionEngine()
    session_id = str(time.time())
    audio_buffer = bytearray()

    try:
        # Send connection confirmation
        await websocket.send(json.dumps({
            'type': 'connected',
            'session_id': session_id
        }))

        async for message in websocket:
            data = json.loads(message)

            if data['type'] == 'audio':
                # Decode base64 audio
                audio_data = base64.b64decode(data['data'])
                audio_buffer.extend(audio_data)

                # Process when we have enough audio
                min_bytes = int(SAMPLE_RATE * 0.5 * 2)

                while len(audio_buffer) >= min_bytes:
                    chunk = bytes(audio_buffer[:min_bytes])
                    audio_buffer = audio_buffer[min_bytes:]

                    result = engine.transcribe_chunk(chunk)

                    if result:
                        await websocket.send(json.dumps({
                            'type': 'transcription',
                            'text': result['text'],
                            'start_time': result['start_time'],
                            'end_time': result['end_time'],
                            'is_final': result['is_final'],
                            'timestamp': int(time.time() * 1000)
                        }))

            elif data['type'] == 'stop':
                # Process remaining audio
                if audio_buffer:
                    result = engine.transcribe_chunk(bytes(audio_buffer))
                    if result:
                        await websocket.send(json.dumps({
                            'type': 'transcription',
                            'text': result['text'],
                            'is_final': True,
                            'timestamp': int(time.time() * 1000)
                        }))
                break

    except websockets.exceptions.ConnectionClosed:
        logger.info(f"WebSocket connection closed: {session_id}")
    except Exception as e:
        logger.error(f"WebSocket error: {e}")
async def serve_websocket(port: int = 8765):
    """Start the WebSocket server"""
    import websockets

    logger.info(f"WebSocket server started on port {port}")
    async with websockets.serve(handle_websocket, "0.0.0.0", port):
        await asyncio.Future()  # Run forever


async def main():
    """Main entry point"""
    grpc_port = int(os.environ.get('GRPC_PORT', '50051'))
    ws_port = int(os.environ.get('WEBSOCKET_PORT', '8765'))
    enable_websocket = os.environ.get('ENABLE_WEBSOCKET', 'true').lower() == 'true'

    tasks = [serve_grpc(grpc_port)]

    if enable_websocket:
        tasks.append(serve_websocket(ws_port))

    await asyncio.gather(*tasks)


if __name__ == "__main__":
    asyncio.run(main())