'''
Realtime Speech Recognition with "Ende" recognition

21.4.2026 mchris

HOWTO:
1. Activate the Python virtual environment,
e.g.
source ~/tools/venv/ch/bin/activate

2. run the script
python whisperVAD.py 

INSTALLATION:

Difficulties
- You have to find out the device_id of your microphone. To list the available devices, run:

python -m sounddevice

- The default sample rate may differ from what the device reports, so you have to determine the working value manually.

You have to install all the required libraries in your Python environment;
adapt the paths shown here to your own setup.
You can create a new python environment with

python -m venv ~/tools/venv/ch

The webrtcvad is needed for this script:

sudo apt update
sudo apt install build-essential python3-dev
pip install webrtcvad
'''

import collections
import os
import queue
import tempfile

import numpy as np
import scipy.io.wavfile as wavfile
import sounddevice as sd
import webrtcvad
import whisper

# ---------------- CONFIG ----------------
# Adapt the following parameters to your microphone's properties!
# device_id: input device index (list the devices with `python -m sounddevice`).
# samplerate: capture rate in Hz; also passed to the VAD and used for the WAV file.
device_id = 7
samplerate = 16000

frame_duration = 30  # ms (must be 10, 20 or 30 for webrtcvad)
vad_mode = 2         # 0=very permissive, 3=very strict

# ----------------------------------------

# Whisper model used for transcription and the voice-activity detector,
# both created once at start-up.
model = whisper.load_model("base")
vad = webrtcvad.Vad(vad_mode)

# Audio blocks are put here by the stream callback and read by the main loop.
q = queue.Queue()

# Number of samples per VAD frame (480 for 16000 Hz and 30 ms).
frame_size = int(samplerate * frame_duration / 1000)

def callback(indata, frames, time, status):
    """Queue one captured audio block for the main loop.

    Passed as the ``callback`` of ``sd.InputStream``.

    Args:
        indata: Captured audio block (array-like with a ``flatten`` method);
            it is flattened to one dimension before being queued.
        frames: Not used here.
        time: Not used here.
        status: Printed when truthy.
    """
    if status:
        print(status)
    # flatten() always returns a new 1-D array, so the queued data is already
    # independent of the input buffer; an extra copy() would only duplicate
    # the allocation on every audio block.
    q.put(indata.flatten())

def float_to_pcm16(audio):
    """Convert float samples to 16-bit PCM.

    Args:
        audio: Samples expected in the range [-1, 1]; values outside that
            range are clipped to it.

    Returns:
        An ``int16`` array holding the clipped samples scaled by 32767
        (the fractional part is dropped by the integer cast).
    """
    bounded = np.clip(audio, -1, 1)
    scaled = bounded * 32767
    return scaled.astype(np.int16)

def _transcribe_utterance(frames):
    """Peak-normalize the PCM frames of one utterance and transcribe them.

    The audio is written to a temporary WAV file whose path is handed to
    Whisper. The file is always removed afterwards so that repeated
    utterances do not accumulate in the temp directory.

    Args:
        frames: Non-empty list of int16 sample arrays belonging to one
            utterance.

    Returns:
        The ``"text"`` entry of the Whisper transcription result.
    """
    audio = np.concatenate(frames).astype(np.int16)

    # normalize to a peak of 0.9
    audio_f = audio.astype(np.float32) / 32767
    peak = np.max(np.abs(audio_f))
    if peak > 0:
        audio_f = audio_f / peak * 0.9

    # Only reserve a unique file name here; the handle is closed before the
    # WAV data is written so the path can be reopened by name.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        wav_path = tmp.name
    try:
        wavfile.write(wav_path, samplerate,
                      (audio_f * 32767).astype(np.int16))
        result = model.transcribe(
            wav_path,
            language="de",
            task="transcribe",   # (not translate!)
            temperature=0.0
        )
    finally:
        # delete=False means nobody else cleans this file up
        os.remove(wav_path)
    return result["text"]


print(f"Using device {device_id} at {samplerate} Hz")

# Main loop: read audio blocks from the queue, run the VAD on each frame,
# collect the frames of one utterance and transcribe it once the speaker
# pauses. The loop ends when the transcript contains "Ende." or on Ctrl+C.
with sd.InputStream(device=device_id,
                    channels=1,
                    samplerate=samplerate,
                    dtype='float32',
                    callback=callback,
                    blocksize=frame_size):

    print("Listening...")
    print('sage "Ende", um das Programm zu verlassen')

    # Pre-roll: the last 10 frames seen while idle, so the start of an
    # utterance is not cut off. The deque drops the oldest frame itself.
    ring_buffer = collections.deque(maxlen=10)
    triggered = False
    voiced_frames = []
    silence_counter = 0

    try:
        running = True
        while running:
            chunk = q.get()

            # convert to 16-bit PCM for VAD
            pcm = float_to_pcm16(chunk)

            # split into VAD frames
            for i in range(0, len(pcm), frame_size):
                frame = pcm[i:i + frame_size]

                # incomplete trailing frames are skipped
                if len(frame) < frame_size:
                    continue

                is_speech = vad.is_speech(frame.tobytes(), samplerate)

                if not triggered:
                    ring_buffer.append(frame)
                    if is_speech:
                        # start of speech: begin the utterance with the pre-roll
                        triggered = True
                        voiced_frames = list(ring_buffer)
                        ring_buffer.clear()
                        silence_counter = 0
                    continue

                voiced_frames.append(frame)
                silence_counter = 0 if is_speech else silence_counter + 1

                # still inside the utterance until more than 10 consecutive
                # non-speech frames were seen (~300 ms silence)
                if silence_counter <= 10:
                    continue

                # end of speech
                triggered = False
                utterance, voiced_frames = voiced_frames, []

                # minimum duration check (every frame has frame_size samples);
                # done before normalizing so short snippets cost nothing
                duration = len(utterance) * frame_size / samplerate
                if duration <= 0.5:
                    continue

                txt = _transcribe_utterance(utterance)
                print("→", txt)
                if "Ende." in txt:
                    running = False
                    print("Ende verstanden")
                    # stop right away instead of processing further frames
                    break

    except KeyboardInterrupt:
        print("Stopped.")