Source code for spokestack.vad.webrtc

"""
This module contains the WebRTC component for voice activity detection (VAD)
"""
import logging
from typing import Any

import numpy as np

from spokestack.context import SpeechContext
from spokestack.extensions.webrtc.vad import WebRtcVad

QUALITY = 0
LOW_BITRATE = 1
AGGRESSIVE = 2
VERY_AGGRESSIVE = 3

_LOG = logging.getLogger(__name__)


class VoiceActivityDetector:
    """This class detects the presence of voice in a frame of audio.

    Args:
        sample_rate (int): sample rate of the audio (Hz)
        frame_width (int): width of the audio frame: 10 or 20 (ms)
        vad_rise_delay (int): rising edge delay (ms)
        vad_fall_delay (int): falling edge delay (ms)
        mode (int): named constant that sets the vad aggressiveness mode
    """

    def __init__(
        self,
        sample_rate: int = 16000,
        frame_width: int = 20,
        vad_rise_delay: int = 0,
        vad_fall_delay: int = 0,
        mode: int = QUALITY,
        **kwargs: Any
    ) -> None:
        # validate sample rate
        self._sample_rate: int = sample_rate
        if self._sample_rate not in {8000, 16000, 32000}:
            raise ValueError("invalid_sample_rate")
        # validate frame width
        self._frame_width: int = frame_width
        if self._frame_width not in {10, 20}:
            raise ValueError("invalid_frame_width")
        # convert the edge delays from milliseconds into frame counts
        self._rise_length: int = vad_rise_delay // frame_width
        self._fall_length: int = vad_fall_delay // frame_width

        self._vad = WebRtcVad(sample_rate=sample_rate, mode=mode)

        # state of the current run of identical vad results
        self._run_value: int = 0
        self._run_length: int = 0

    def __call__(self, context: SpeechContext, frame: np.ndarray) -> None:
        """Processes a single frame of audio to determine if voice is present.

        Args:
            context (SpeechContext): state based information that needs to be
                shared between pieces of the pipeline
            frame (np.ndarray): single frame of PCM-16 audio from an input source
        """
        # validate dtype
        if not np.issubdtype(frame.dtype, np.signedinteger):
            raise TypeError("invalid_dtype")

        result: bool = self._vad.is_speech(frame)

        # track how many consecutive frames produced the same result
        raw = result > 0
        if raw == self._run_value:
            self._run_length += 1
        else:
            self._run_value = raw
            self._run_length = 1

        # only flip the speech flag once the run is long enough to cover
        # the configured rise or fall delay
        if self._run_value != context.is_speech:
            if self._run_value and self._run_length >= self._rise_length:
                context.is_speech = True
                _LOG.info("vad: true")
            if not self._run_value and self._run_length >= self._fall_length:
                context.is_speech = False
                _LOG.info("vad: false")

    def reset(self) -> None:
        """Resets the current state."""
        self._run_value = 0
        self._run_length = 0

    def close(self) -> None:
        """Close interface for use in the pipeline."""
        self.reset()
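The detector converts the millisecond delays into frame counts, so a brief usage sketch may help. The snippet below is illustrative only and is not part of the module: it assumes SpeechContext can be constructed without arguments and that the bundled WebRTC extension (spokestack.extensions.webrtc) is compiled, and it feeds a single 20 ms frame of silence through the detector.

    import numpy as np

    from spokestack.context import SpeechContext
    from spokestack.vad.webrtc import VoiceActivityDetector, VERY_AGGRESSIVE

    detector = VoiceActivityDetector(
        sample_rate=16000,      # must be 8000, 16000, or 32000 Hz
        frame_width=20,         # must be 10 or 20 ms
        vad_rise_delay=100,     # ~5 voiced frames before is_speech turns on
        vad_fall_delay=500,     # ~25 unvoiced frames before it turns off
        mode=VERY_AGGRESSIVE,
    )
    context = SpeechContext()                # assumed no-arg constructor
    frame = np.zeros(320, dtype=np.int16)    # 20 ms of silence at 16 kHz, PCM-16
    detector(context, frame)
    print(context.is_speech)                 # expected: False for a silent frame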
class VoiceActivityTrigger:
    """Voice Activity Detector trigger pipeline component."""

    def __init__(self) -> None:
        self._is_speech = False

    def __call__(self, context: SpeechContext, frame: np.ndarray) -> None:
        """Activates the speech context whenever speech is detected.

        Args:
            context (SpeechContext): state based information that needs to be
                shared between pieces of the pipeline
            frame (np.ndarray): single frame of PCM-16 audio from an input source
        """
        # activate the context only on the rising edge of is_speech
        if context.is_speech != self._is_speech:
            if context.is_speech:
                context.is_active = True
            self._is_speech = context.is_speech

    def close(self) -> None:
        """Close interface for use in the pipeline."""
        self.reset()

    def reset(self) -> None:
        """Resets the current state."""
        self._is_speech = False
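As a companion sketch under the same assumptions (SpeechContext built without arguments, WebRTC extension available), the trigger is placed after the detector in the pipeline; it sets context.is_active only on the rising edge of context.is_speech.

    import numpy as np

    from spokestack.context import SpeechContext
    from spokestack.vad.webrtc import VoiceActivityDetector, VoiceActivityTrigger

    detector = VoiceActivityDetector(sample_rate=16000, frame_width=20)
    trigger = VoiceActivityTrigger()
    context = SpeechContext()

    frame = np.zeros(320, dtype=np.int16)  # one 20 ms frame of PCM-16 audio
    detector(context, frame)   # updates context.is_speech
    trigger(context, frame)    # activates the context on a rising edge
    print(context.is_active)   # False here; True once the detector reports speech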