initial commit

This commit is contained in:
2025-09-11 09:59:16 +02:00
commit ab17a8ac21
19 changed files with 2587 additions and 0 deletions

91
proto/transcription.proto Normal file
View File

@@ -0,0 +1,91 @@
// Wire contract for the speech-to-text transcription API: declares
// TranscriptionService and the messages its RPCs exchange.
syntax = "proto3";
// Every definition in this file belongs to the `transcription` package.
package transcription;
// TranscriptionService provides real-time (streaming) and whole-file
// speech-to-text, plus capability discovery and a health probe.
service TranscriptionService {
  // Bidirectional streaming: the client sends AudioChunk messages and
  // receives TranscriptionResult messages on the same call.
  rpc StreamTranscribe(stream AudioChunk)
      returns (stream TranscriptionResult);

  // Unary call: transcribes a single AudioFile and returns one
  // TranscriptionResponse.
  rpc TranscribeFile(AudioFile) returns (TranscriptionResponse);

  // Returns the available models and languages (see Capabilities).
  rpc GetCapabilities(Empty) returns (Capabilities);

  // Health check; returns the current HealthStatus.
  rpc HealthCheck(Empty) returns (HealthStatus);
}
// A single chunk of audio sent by the client on the StreamTranscribe
// request stream.
message AudioChunk {
  // PCM16 audio data (16-bit, 16kHz, mono).
  // NOTE(review): AudioConfig.sample_rate suggests rates other than 16kHz
  // may be accepted; confirm against the server implementation.
  bytes audio_data = 1;

  // Optional session ID for tracking.
  string session_id = 2;

  // Optional configuration; only needed in the first chunk.
  AudioConfig config = 3;
}
// Audio configuration: describes the input audio and selects how it is
// transcribed. Carried by both AudioChunk and AudioFile.
message AudioConfig {
  // Language code, e.g. "en", "es", or "auto".
  string language = 1;

  // Task to perform: "transcribe" or "translate".
  string task = 2;

  // Model size: "tiny", "base", "small", "medium", or "large-v3".
  string model = 3;

  // Sample rate (default: 16000).
  // NOTE(review): an unset proto3 int32 reads as 0, so the server
  // presumably maps 0 to 16000; unit presumably Hz. Confirm both.
  int32 sample_rate = 4;

  // Whether Voice Activity Detection is enabled.
  bool vad_enabled = 5;
}
// One transcription result emitted by the server on the StreamTranscribe
// response stream.
message TranscriptionResult {
  // Transcribed text.
  string text = 1;

  // Start time in seconds.
  // NOTE(review): reference point (stream start vs. chunk start) is not
  // stated here; confirm.
  float start_time = 2;

  // End time in seconds (same reference point as start_time, presumably).
  float end_time = 3;

  // True when this is a final result rather than an interim one.
  bool is_final = 4;

  // Confidence score in the range 0-1.
  float confidence = 5;

  // Detected language.
  string language = 6;

  // Session ID for tracking.
  string session_id = 7;

  // Server timestamp in milliseconds.
  // NOTE(review): presumably milliseconds since the Unix epoch; confirm.
  int64 timestamp_ms = 8;
}
// A complete audio file; the request message of TranscribeFile.
message AudioFile {
  // Complete audio file data.
  bytes audio_data = 1;

  // Container/encoding of audio_data: "wav", "mp3", "webm", or
  // "raw_pcm16".
  string format = 2;

  // Audio configuration to apply to this file.
  AudioConfig config = 3;
}
// Response for file transcription; returned by TranscribeFile.
message TranscriptionResponse {
  // Individual transcription segments that make up the result.
  repeated TranscriptionSegment segments = 1;

  // Complete transcription.
  string full_text = 2;

  // Language detected in the audio.
  // NOTE(review): presumably a language code like AudioConfig.language;
  // confirm.
  string detected_language = 3;

  // Duration in seconds.
  // NOTE(review): presumably the duration of the submitted audio; confirm.
  float duration_seconds = 4;
}
// One segment of a file transcription (an element of
// TranscriptionResponse.segments).
message TranscriptionSegment {
  // Text of this segment.
  string text = 1;

  // Start of the segment.
  // NOTE(review): presumably seconds, as in TranscriptionResult; confirm.
  float start_time = 2;

  // End of the segment (same unit as start_time, presumably).
  float end_time = 3;

  // Confidence for this segment.
  // NOTE(review): presumably a 0-1 score, as in TranscriptionResult;
  // confirm.
  float confidence = 4;
}
// Service capabilities; returned by GetCapabilities.
message Capabilities {
  // Models available on this server.
  // NOTE(review): presumably the values accepted by AudioConfig.model;
  // confirm.
  repeated string available_models = 1;

  // Languages supported by this server.
  // NOTE(review): presumably the codes accepted by AudioConfig.language;
  // confirm.
  repeated string supported_languages = 2;

  // Audio formats supported by this server.
  // NOTE(review): presumably the values accepted by AudioFile.format;
  // confirm.
  repeated string supported_formats = 3;

  // Maximum audio length, in seconds.
  int32 max_audio_length_seconds = 4;

  // Whether streaming is supported.
  bool streaming_supported = 5;

  // Whether Voice Activity Detection is supported.
  bool vad_supported = 6;
}
// Health status; returned by HealthCheck.
message HealthStatus {
  // Whether the service reports itself as healthy.
  bool healthy = 1;

  // Status string accompanying the healthy flag.
  // NOTE(review): set of possible values is not defined here; confirm.
  string status = 2;

  // The loaded model.
  // NOTE(review): presumably a model name as in AudioConfig.model, and
  // empty when none is loaded; confirm.
  string model_loaded = 3;

  // Uptime, in seconds.
  int64 uptime_seconds = 4;

  // Number of active sessions.
  int32 active_sessions = 5;
}
// Empty message for requests without parameters. Used as the request type
// of GetCapabilities and HealthCheck.
message Empty {}