mirror of https://github.com/aljazceru/transcription-api.git

live transcription update

@@ -29,7 +29,7 @@ struct Args {
     server: String,
 
     /// Language code (e.g., "en", "es", "auto")
-    #[arg(short, long, default_value = "auto")]
+    #[arg(short, long, default_value = "en")]
     language: String,
 
     /// Task: transcribe or translate
@@ -151,9 +151,9 @@ fn capture_audio(tx: tokio_mpsc::Sender<AudioChunk>) -> Result<()> {
             let mut buf = buffer_clone.lock().unwrap();
             buf.extend_from_slice(data);
 
-            // Send chunks of ~0.5 seconds (8000 samples at 16kHz)
-            while buf.len() >= 8000 {
-                let chunk: Vec<i16> = buf.drain(..8000).collect();
+            // Send chunks of ~3 seconds (48000 samples at 16kHz) for better accuracy
+            while buf.len() >= 48000 {
+                let chunk: Vec<i16> = buf.drain(..48000).collect();
 
                 // Convert i16 to bytes
                 let bytes: Vec<u8> = chunk.iter()
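
Note on the numbers in this hunk: at 16 kHz mono, 8,000 samples is 0.5 s and 48,000 samples is 3 s, so the client now ships ~3 s chunks, giving Whisper more acoustic context per request at the cost of latency. A minimal Python sketch of the same drain-and-serialize loop (the real implementation is the Rust closure above; the function name and list buffer here are illustrative):

# Sketch of the Rust chunking logic; constants mirror the diff,
# everything else is for illustration only.
import struct

SAMPLE_RATE = 16_000             # Hz, matches the client's capture rate
CHUNK_SAMPLES = 3 * SAMPLE_RATE  # 48,000 samples ~= 3 seconds of audio

def drain_chunks(buffer: list[int]) -> list[bytes]:
    """Pop complete ~3 s chunks off the front of the sample buffer
    and serialize each one as little-endian i16 bytes."""
    chunks = []
    while len(buffer) >= CHUNK_SAMPLES:
        samples = buffer[:CHUNK_SAMPLES]
        del buffer[:CHUNK_SAMPLES]
        chunks.append(struct.pack(f"<{CHUNK_SAMPLES}h", *samples))
    return chunks

# 4 s of silence yields one 3 s chunk; the remaining 1 s stays buffered
buf = [0] * (4 * SAMPLE_RATE)
out = drain_chunks(buf)
assert len(out) == 1 and len(buf) == SAMPLE_RATE
assert len(out[0]) == 2 * CHUNK_SAMPLES  # 2 bytes per i16 sample
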
@@ -159,14 +159,18 @@ class TranscriptionEngine:
         """Get the shared model instance from ModelManager"""
         return self.model_manager.get_model()
 
-    def is_speech(self, audio: np.ndarray, energy_threshold: float = 0.002, zero_crossing_threshold: int = 50) -> bool:
+    def is_speech(self, audio: np.ndarray, energy_threshold: float = 0.005, zero_crossing_threshold: int = 50) -> bool:
         """
-        Simple Voice Activity Detection
+        Enhanced Voice Activity Detection
         Returns True if the audio chunk likely contains speech
         """
         # Check if audio is too quiet (likely silence)
         energy = np.sqrt(np.mean(audio**2))
-        if energy < energy_threshold:
+        if energy < energy_threshold: # Increased threshold for better filtering
+            return False
+
+        # Check if audio is too loud (likely noise/distortion)
+        if energy > 0.95:
             return False
 
         # Check zero crossing rate (helps distinguish speech from noise)
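
The energy gate here is RMS amplitude, sqrt(mean(x²)): chunks quieter than 0.005 are treated as silence, and chunks near full scale (above 0.95) as clipping or noise. A standalone sketch, under the assumption that audio arrives as float32 normalized to [-1.0, 1.0] as Whisper expects:

# Sketch of the two energy gates added above; thresholds are from the diff.
import numpy as np

def passes_energy_gates(audio: np.ndarray,
                        energy_threshold: float = 0.005) -> bool:
    """RMS-energy gate: reject near-silence and clipped/overdriven audio."""
    energy = np.sqrt(np.mean(audio ** 2))  # root-mean-square amplitude
    if energy < energy_threshold:          # too quiet: likely silence
        return False
    if energy > 0.95:                      # near full scale: likely distortion
        return False
    return True

silence = np.zeros(16_000, dtype=np.float32)
tone = 0.1 * np.sin(2 * np.pi * 220 * np.arange(16_000) / 16_000).astype(np.float32)
print(passes_energy_gates(silence), passes_energy_gates(tone))  # False True
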
@@ -177,6 +181,10 @@ class TranscriptionEngine:
         if zero_crossings > len(audio) * zero_crossing_threshold / SAMPLE_RATE:
             return False
 
+        # Check for consistent silence (flat signal)
+        if np.std(audio) < 0.001:
+            return False
+
         return True
 
     def transcribe_chunk(self, audio_data: bytes, language: str = "auto", vad_enabled: bool = True) -> Optional[dict]:
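
Together with the hunk above, this completes the VAD: a zero-crossing-rate cap and a new flat-signal check. A hedged sketch; the diff does not show how zero_crossings is computed, so the np.diff(np.sign(...)) idiom below is an assumption, not the repo's exact expression:

# Sketch of the remaining two VAD checks; thresholds mirror the diff.
import numpy as np

SAMPLE_RATE = 16_000

def passes_structure_gates(audio: np.ndarray,
                           zero_crossing_threshold: int = 50) -> bool:
    # Count sign changes between neighboring samples (assumed idiom)
    zero_crossings = np.count_nonzero(np.diff(np.sign(audio)))
    # Reject if crossings per second exceed the threshold (noise-like)
    if zero_crossings > len(audio) * zero_crossing_threshold / SAMPLE_RATE:
        return False
    # Reject a flat signal: near-zero variance means constant input (new check)
    if np.std(audio) < 0.001:
        return False
    return True

flat = np.full(SAMPLE_RATE, 0.5, dtype=np.float32)  # constant DC offset
print(passes_structure_gates(flat))  # False: the flat-signal check fires
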
@@ -216,24 +224,37 @@ class TranscriptionEngine:
                 audio = np.pad(audio, (0, SAMPLE_RATE - len(audio)))
 
             # Use more conservative settings to reduce hallucinations
+            # Force English if specified to prevent language switching
+            forced_language = None if language == "auto" else language
+            if language == "en" or language == "english":
+                forced_language = "en"
+
             result = model.transcribe(
                 audio,
-                language=None if language == "auto" else language,
+                language=forced_language,
                 fp16=self.device == "cuda",
                 temperature=0.0, # More deterministic, less hallucination
-                no_speech_threshold=0.6, # Higher threshold for detecting non-speech
-                logprob_threshold=-1.0, # Filter out low probability results
-                compression_ratio_threshold=2.4 # Filter out repetitive results
+                no_speech_threshold=0.8, # Much higher threshold for detecting non-speech
+                logprob_threshold=-0.5, # Stricter filtering of low probability results
+                compression_ratio_threshold=2.0, # Stricter filtering of repetitive results
+                condition_on_previous_text=False, # Don't use previous text as context (reduces hallucination chains)
+                initial_prompt=None # Don't use initial prompt to avoid biasing
             )
 
             if result and result.get('text'):
                 text = result['text'].strip()
 
-                # Filter out common hallucinations
+                # Filter out common hallucinations (expanded list)
                 hallucination_phrases = [
                     "thank you", "thanks", "you", "uh", "um",
                     "thank you for watching", "please subscribe",
-                    "bye", "bye-bye", ".", "...", ""
+                    "bye", "bye-bye", ".", "...", "",
+                    "thank you for watching.", "thanks for watching",
+                    "please like and subscribe", "hit the bell",
+                    "see you next time", "goodbye", "the end",
+                    "[music]", "[applause]", "(music)", "(applause)",
+                    "foreign", "[foreign]", "aqui", "ici", "здесь",
+                    "谢谢", "ありがとう", "شكرا", "धन्यवाद"
                 ]
 
                 # Check if the result is just a hallucination
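
All of the keyword arguments here are real openai-whisper transcribe() options, so the tightened settings can be exercised in isolation. A sketch; the "base" model name and the silent input are placeholders, the kwargs are exactly those from the diff:

# End-to-end exercise of the stricter decode settings (openai-whisper).
import numpy as np
import whisper

model = whisper.load_model("base")              # model size is an assumption
audio = np.zeros(16_000, dtype=np.float32)      # stand-in for a real chunk

result = model.transcribe(
    audio,
    language="en",                     # forced_language when --language en
    fp16=False,                        # True only on CUDA
    temperature=0.0,                   # greedy decoding, fewer hallucinations
    no_speech_threshold=0.8,           # drop segments Whisper deems non-speech
    logprob_threshold=-0.5,            # drop low-confidence decodes
    compression_ratio_threshold=2.0,   # drop highly repetitive decodes
    condition_on_previous_text=False,  # decode each chunk independently
    initial_prompt=None,               # no prompt bias
)
print(result.get("text", ""))  # likely empty for silence
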
@@ -242,12 +263,33 @@ class TranscriptionEngine:
                 logger.debug(f"Filtered out hallucination: {text}")
                 return None
 
+            # Filter out very short text that's likely noise
+            if len(text_lower) <= 2 and text_lower not in ["ok", "no", "hi", "go"]:
+                logger.debug(f"Filtered out short text: {text}")
+                return None
+
             # Check for repetitive text (another sign of hallucination)
             words = text.lower().split()
             if len(words) > 1 and len(set(words)) == 1:
                 logger.debug(f"Filtered out repetitive text: {text}")
                 return None
 
+            # Filter out text that's mostly punctuation or special characters
+            alphanumeric_ratio = sum(c.isalnum() or c.isspace() for c in text) / max(len(text), 1)
+            if alphanumeric_ratio < 0.5:
+                logger.debug(f"Filtered out non-alphanumeric text: {text}")
+                return None
+
+            # Check if detected language matches expected (if English is forced)
+            detected_lang = result.get('language', '')
+            if forced_language == "en" and detected_lang and detected_lang != "en":
+                # Check if text contains mostly non-English characters
+                import re
+                non_english = re.findall(r'[^\x00-\x7F]+', text)
+                if len(''.join(non_english)) > len(text) * 0.3:
+                    logger.debug(f"Filtered out non-English text: {text} (detected: {detected_lang})")
+                    return None
+
             return {
                 'text': text,
                 'start_time': 0,
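
The two new character-level filters are easy to lift out and test on their own. A sketch; the regex and both ratios come straight from the diff, while the function wrappers and examples are illustrative:

# Standalone versions of the punctuation and non-English filters.
import re

def mostly_punctuation(text: str) -> bool:
    """True if fewer than half the characters are alphanumeric or spaces."""
    ratio = sum(c.isalnum() or c.isspace() for c in text) / max(len(text), 1)
    return ratio < 0.5

def looks_non_english(text: str) -> bool:
    """True if more than 30% of the characters fall outside ASCII."""
    non_english = re.findall(r'[^\x00-\x7F]+', text)
    return len(''.join(non_english)) > len(text) * 0.3

print(mostly_punctuation("...!!!"))     # True: pure punctuation
print(looks_non_english("hello there")) # False: pure ASCII
print(looks_non_english("привет мир"))  # True: mostly Cyrillic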