"""Realtime speech recognition with "Ende" stop-word recognition.

21.4.2026 mchris

HOWTO:
    1. Change the Python environment, e.g.
           source ~/tools/venv/ch/bin/activate
    2. Run the script:
           python whisperVAD.py

INSTALLATION:
    Difficulties - you have to find out the device_id of your microphone;
    type
        python -m sounddevice
    The default sample rate may differ from what the device states. You have
    to find out manually.

    You have to install all the libraries needed in your Python environment;
    adapt the path to your path. You can create a new Python environment with
        python -m venv ~/tools/venv/ch

    webrtcvad is needed for this script:
        sudo apt update
        sudo apt install build-essential python3-dev
        pip install webrtcvad
"""

import os
import queue
import string
import tempfile
from collections import deque

import numpy as np
import scipy.io.wavfile as wavfile
import sounddevice as sd
import webrtcvad
import whisper

# ---------------- CONFIG ----------------
# Adapt the following parameters to your microphone properties!
device_id = 7
samplerate = 16000
frame_duration = 30  # ms (must be 10, 20 or 30 for webrtcvad)
vad_mode = 2  # 0=very permissive, 3=very strict
# ----------------------------------------

# Number of most recent non-triggered frames kept, so that the audio just
# before the first speech frame is included in the utterance.
PRE_ROLL_FRAMES = 10
# An utterance ends once more than this many consecutive frames are
# non-speech (~300 ms with 30 ms frames).
MAX_SILENT_FRAMES = 10
# Utterances that are not longer than this are discarded without transcription.
MIN_UTTERANCE_SECONDS = 0.5
# Spoken word that terminates the program.
STOP_WORD = "Ende"

model = whisper.load_model("base")
vad = webrtcvad.Vad(vad_mode)

# Audio blocks handed over from the stream callback to the main loop.
q = queue.Queue()

# Samples per VAD frame.
frame_size = int(samplerate * frame_duration / 1000)


def callback(indata, frames, time, status):
    """Stream callback: print any status flag and enqueue the audio block.

    The block is copied and flattened to 1-D before it is put on ``q``.
    ``frames`` and ``time`` are part of the callback signature but unused.
    """
    if status:
        print(status)
    q.put(indata.copy().flatten())


def float_to_pcm16(audio):
    """Clip ``audio`` to [-1, 1] and scale it to an ``int16`` array."""
    audio = np.clip(audio, -1, 1)
    return (audio * 32767).astype(np.int16)


def _normalize_peak(pcm):
    """Return ``pcm`` as float32 scaled so its peak magnitude is 0.9.

    All-zero input is returned unscaled to avoid dividing by zero.
    """
    audio_f = pcm.astype(np.float32) / 32767
    peak = np.max(np.abs(audio_f))
    if peak > 0:
        audio_f = audio_f / peak * 0.9
    return audio_f


def _transcribe(audio_f):
    """Write ``audio_f`` to a temporary WAV file and return Whisper's text.

    The temporary file is removed afterwards, even if writing or
    transcription raises.
    """
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    # Close our handle first; the file is re-opened by path below.
    tmp.close()
    try:
        wavfile.write(tmp.name, samplerate,
                      (audio_f * 32767).astype(np.int16))
        result = model.transcribe(
            tmp.name,
            language="de",
            task="transcribe",  # (not translate!)
            temperature=0.0,
        )
    finally:
        os.remove(tmp.name)
    return result["text"]


def _is_stop_command(text):
    """Return True if ``text`` asks the program to terminate.

    True when the text contains ``"Ende."`` or when, ignoring surrounding
    whitespace/ASCII punctuation and letter case, the whole text is just
    the stop word (e.g. " Ende", "ende!", "ENDE.").
    """
    if f"{STOP_WORD}." in text:
        return True
    bare = text.strip(string.whitespace + string.punctuation)
    return bare.casefold() == STOP_WORD.casefold()


def main():
    """Listen on the microphone and print a transcript of each utterance.

    Runs until the stop word is recognised or Ctrl-C is pressed.
    """
    print(f"Using device {device_id} at {samplerate} Hz")

    with sd.InputStream(device=device_id,
                        channels=1,
                        samplerate=samplerate,
                        dtype="float32",
                        callback=callback,
                        blocksize=frame_size):
        print("Listening...")
        print('sage "Ende", um das Programm zu verlassen')

        ring_buffer = deque(maxlen=PRE_ROLL_FRAMES)
        triggered = False
        voiced_frames = []
        silence_counter = 0

        try:
            while True:
                chunk = q.get()

                # convert to 16-bit PCM for VAD
                pcm = float_to_pcm16(chunk)

                # split into VAD frames
                for start in range(0, len(pcm), frame_size):
                    frame = pcm[start:start + frame_size]
                    if len(frame) < frame_size:
                        # incomplete trailing frame: skipped
                        continue

                    is_speech = vad.is_speech(frame.tobytes(), samplerate)

                    if not triggered:
                        ring_buffer.append(frame)
                        if is_speech:
                            # start of speech: seed with the pre-roll frames
                            triggered = True
                            voiced_frames = list(ring_buffer)
                            ring_buffer.clear()
                            silence_counter = 0
                        continue

                    voiced_frames.append(frame)
                    silence_counter = 0 if is_speech else silence_counter + 1
                    if silence_counter <= MAX_SILENT_FRAMES:
                        continue

                    # end of speech
                    triggered = False
                    utterance = np.concatenate(voiced_frames).astype(np.int16)
                    voiced_frames = []

                    audio_f = _normalize_peak(utterance)

                    # minimum duration check
                    if len(audio_f) / samplerate <= MIN_UTTERANCE_SECONDS:
                        continue

                    txt = _transcribe(audio_f)
                    print("→", txt)

                    if _is_stop_command(txt):
                        print("Ende verstanden")
                        return

        except KeyboardInterrupt:
            print("Stopped.")


if __name__ == "__main__":
    main()