// Mirror of https://github.com/aljazceru/transcription-api.git
// (synced 2025-12-16 23:14:18 +01:00).
// Original file: 91 lines, 2.8 KiB, Protocol Buffer.
syntax = "proto3";

package transcription;

// The transcription service provides real-time speech-to-text capabilities.
service TranscriptionService {
  // Bidirectional streaming: the client sends a stream of audio chunks and
  // receives a stream of transcription results.
  rpc StreamTranscribe(stream AudioChunk) returns (stream TranscriptionResult);

  // Unary call for single audio file transcription.
  rpc TranscribeFile(AudioFile) returns (TranscriptionResponse);

  // Get available models and languages.
  rpc GetCapabilities(Empty) returns (Capabilities);

  // Health check.
  rpc HealthCheck(Empty) returns (HealthStatus);
}

// Audio chunk for streaming. This is the request message streamed by the
// client to StreamTranscribe.
message AudioChunk {
  // PCM16 audio data (16-bit, 16kHz, mono).
  bytes audio_data = 1;

  // Optional session ID for tracking.
  string session_id = 2;

  // Optional config (only needed in first chunk).
  AudioConfig config = 3;
}

// Audio configuration. Embedded in both AudioChunk (streaming) and AudioFile
// (unary) requests.
message AudioConfig {
  // Language code (e.g., "en", "es", "auto").
  string language = 1;

  // Task to perform: "transcribe" or "translate".
  string task = 2;

  // Model size: "tiny", "base", "small", "medium", "large-v3".
  string model = 3;

  // Sample rate (default: 16000).
  // NOTE(review): presumably in Hz, and presumably an unset value (0) selects
  // the default - confirm against the server implementation.
  int32 sample_rate = 4;

  // Voice Activity Detection.
  // NOTE(review): presumably true enables VAD - confirm.
  bool vad_enabled = 5;
}

// Transcription result for streaming. This is the response message streamed
// back by StreamTranscribe.
message TranscriptionResult {
  // Transcribed text.
  string text = 1;

  // Start time in seconds.
  float start_time = 2;

  // End time in seconds.
  float end_time = 3;

  // Is this a final result?
  bool is_final = 4;

  // Confidence score (0-1).
  float confidence = 5;

  // Detected language.
  string language = 6;

  // Session ID for tracking.
  string session_id = 7;

  // Server timestamp in milliseconds.
  // NOTE(review): epoch/reference point is not stated here - confirm.
  int64 timestamp_ms = 8;
}

// Complete audio file for transcription. This is the request message of
// TranscribeFile.
message AudioFile {
  // Complete audio file data.
  bytes audio_data = 1;

  // Format: "wav", "mp3", "webm", "raw_pcm16".
  string format = 2;

  // Audio configuration.
  AudioConfig config = 3;
}

// Response for file transcription. Returned by TranscribeFile.
message TranscriptionResponse {
  // Individual transcription segments.
  // NOTE(review): ordering of segments is not specified here - confirm.
  repeated TranscriptionSegment segments = 1;

  // Complete transcription.
  string full_text = 2;

  // Detected language.
  // NOTE(review): presumably a language code like AudioConfig.language -
  // confirm.
  string detected_language = 3;

  // Duration in seconds.
  // NOTE(review): presumably the duration of the submitted audio - confirm.
  float duration_seconds = 4;
}

// Transcription segment. Used as the element type of
// TranscriptionResponse.segments.
message TranscriptionSegment {
  // Text of this segment.
  string text = 1;

  // Start time of the segment.
  // NOTE(review): presumably seconds, as in TranscriptionResult - confirm.
  float start_time = 2;

  // End time of the segment.
  // NOTE(review): presumably seconds, as in TranscriptionResult - confirm.
  float end_time = 3;

  // Confidence for this segment.
  // NOTE(review): presumably a 0-1 score, as in TranscriptionResult - confirm.
  float confidence = 4;
}

// Service capabilities. Returned by GetCapabilities.
message Capabilities {
  // Models available on the server.
  // NOTE(review): presumably the values accepted by AudioConfig.model -
  // confirm.
  repeated string available_models = 1;

  // Languages supported by the server.
  // NOTE(review): presumably the codes accepted by AudioConfig.language -
  // confirm.
  repeated string supported_languages = 2;

  // Audio formats supported by the server.
  // NOTE(review): presumably the values accepted by AudioFile.format -
  // confirm.
  repeated string supported_formats = 3;

  // Maximum audio length in seconds.
  int32 max_audio_length_seconds = 4;

  // Whether streaming is supported.
  bool streaming_supported = 5;

  // Whether Voice Activity Detection is supported.
  bool vad_supported = 6;
}

// Health status. Returned by HealthCheck.
message HealthStatus {
  // Whether the service reports itself as healthy.
  bool healthy = 1;

  // Status string.
  // NOTE(review): set of possible values is not defined here - confirm.
  string status = 2;

  // Loaded model.
  // NOTE(review): presumably the name of the currently loaded model, empty if
  // none - confirm.
  string model_loaded = 3;

  // Uptime in seconds.
  int64 uptime_seconds = 4;

  // Number of active sessions.
  int32 active_sessions = 5;
}

// Empty message for requests without parameters. Used as the request type of
// GetCapabilities and HealthCheck.
message Empty {}