// transcription-api/proto/transcription.proto

syntax = "proto3";

package transcription;

// TranscriptionService offers real-time speech-to-text through four RPCs:
// a streaming transcription call, a one-shot file transcription call, a
// capability query, and a health probe.
service TranscriptionService {
  // Bidirectional stream: the client sends AudioChunk messages and the
  // server replies with TranscriptionResult messages.
  rpc StreamTranscribe(stream AudioChunk)
      returns (stream TranscriptionResult);

  // Unary call: transcribes a single AudioFile and returns one
  // TranscriptionResponse.
  rpc TranscribeFile(AudioFile)
      returns (TranscriptionResponse);

  // Reports the available models and languages (see Capabilities).
  rpc GetCapabilities(Empty)
      returns (Capabilities);

  // Health check; takes no parameters and returns a HealthStatus.
  rpc HealthCheck(Empty)
      returns (HealthStatus);
}

// One piece of audio sent on the StreamTranscribe request stream.
message AudioChunk {
  // Audio payload, documented as PCM16 (16-bit, 16kHz, mono).
  // NOTE(review): AudioConfig.sample_rate suggests other rates may be
  // accepted — confirm whether 16kHz is a requirement or only the default.
  bytes audio_data = 1;

  // Session ID used for tracking; optional.
  string session_id = 2;

  // Stream configuration; optional, only needed in the first chunk.
  AudioConfig config = 3;
}

// Settings that describe the audio and select how it is transcribed.
// Carried by both AudioChunk and AudioFile.
message AudioConfig {
  // Language code, e.g. "en", "es", or "auto".
  string language = 1;

  // Operation to perform: "transcribe" or "translate".
  string task = 2;

  // Model size: "tiny", "base", "small", "medium", or "large-v3".
  string model = 3;

  // Sample rate of the audio (default: 16000).
  // NOTE(review): unit is presumably Hz — confirm.
  int32 sample_rate = 4;

  // Voice Activity Detection on/off switch.
  bool vad_enabled = 5;
}

// One result message emitted on the StreamTranscribe response stream.
message TranscriptionResult {
  // The transcribed text.
  string text = 1;

  // Start time, in seconds.
  // NOTE(review): reference point (stream start vs. chunk start) is not
  // stated here — confirm.
  float start_time = 2;

  // End time, in seconds.
  float end_time = 3;

  // True when this is a final result.
  bool is_final = 4;

  // Confidence score in the range 0-1.
  float confidence = 5;

  // Language that was detected.
  string language = 6;

  // Session ID used for tracking.
  string session_id = 7;

  // Server timestamp, in milliseconds.
  // NOTE(review): presumably milliseconds since the Unix epoch — confirm.
  int64 timestamp_ms = 8;
}

// A whole audio file submitted to TranscribeFile in a single request.
message AudioFile {
  // Contents of the complete audio file.
  bytes audio_data = 1;

  // File format: "wav", "mp3", "webm", or "raw_pcm16".
  string format = 2;

  // Audio configuration to apply to this file.
  AudioConfig config = 3;
}

// Result returned by TranscribeFile for one submitted audio file.
message TranscriptionResponse {
  // The transcription broken into segments.
  repeated TranscriptionSegment segments = 1;

  // The complete transcription as a single string.
  string full_text = 2;

  // Language detected for the audio.
  // NOTE(review): presumably the same code style as AudioConfig.language
  // (e.g. "en") — confirm.
  string detected_language = 3;

  // Duration in seconds.
  // NOTE(review): presumably the length of the input audio — confirm.
  float duration_seconds = 4;
}

// A single segment within a TranscriptionResponse.
message TranscriptionSegment {
  // Text of this segment.
  string text = 1;

  // Start time of the segment.
  // NOTE(review): presumably seconds, as in TranscriptionResult — confirm.
  float start_time = 2;

  // End time of the segment.
  // NOTE(review): presumably seconds, as in TranscriptionResult — confirm.
  float end_time = 3;

  // Confidence value for this segment.
  // NOTE(review): presumably 0-1, as in TranscriptionResult — confirm.
  float confidence = 4;
}

// What the service supports, as returned by GetCapabilities.
message Capabilities {
  // Models that are available.
  // NOTE(review): presumably valid values for AudioConfig.model — confirm.
  repeated string available_models = 1;

  // Languages that are supported.
  // NOTE(review): presumably valid values for AudioConfig.language —
  // confirm.
  repeated string supported_languages = 2;

  // Formats that are supported.
  // NOTE(review): presumably valid values for AudioFile.format — confirm.
  repeated string supported_formats = 3;

  // Maximum audio length, in seconds.
  int32 max_audio_length_seconds = 4;

  // Whether streaming is supported.
  bool streaming_supported = 5;

  // Whether VAD is supported.
  // NOTE(review): presumably Voice Activity Detection, as in
  // AudioConfig.vad_enabled — confirm.
  bool vad_supported = 6;
}

// Health report returned by the HealthCheck RPC.
message HealthStatus {
  // Overall healthy / not-healthy flag.
  bool healthy = 1;

  // Status string.
  // NOTE(review): set of possible values is not defined here — confirm.
  string status = 2;

  // The loaded model.
  // NOTE(review): presumably a model name as in AudioConfig.model — confirm.
  string model_loaded = 3;

  // Uptime, in seconds.
  int64 uptime_seconds = 4;

  // Number of active sessions.
  int32 active_sessions = 5;
}

// Field-less placeholder for requests that carry no parameters; it is the
// request type of GetCapabilities and HealthCheck.
message Empty {
}