Source code for spokestack.vad.webrtc

"""
This module contains the WebRTC component for voice activity detection (VAD)
"""
import logging
from typing import Any

import numpy as np

from spokestack.context import SpeechContext
from spokestack.extensions.webrtc.vad import WebRtcVad

QUALITY = 0
LOW_BITRATE = 1
AGGRESSIVE = 2
VERY_AGGRESSIVE = 3

_LOG = logging.getLogger(__name__)


class VoiceActivityDetector:
    """This class detects the presence of voice in a frame of audio.

    Args:
        sample_rate (int): sample rate of the audio (Hz)
        frame_width (int): width of the audio frame: 10 or 20 (ms)
        vad_rise_delay (int): rising edge delay (ms)
        vad_fall_delay (int): falling edge delay (ms)
        mode (int): named constant that sets the vad aggressiveness mode
    """

    def __init__(
        self,
        sample_rate: int = 16000,
        frame_width: int = 20,
        vad_rise_delay: int = 0,
        vad_fall_delay: int = 0,
        mode: int = QUALITY,
        **kwargs: Any
    ) -> None:
        # validate sample rate
        self._sample_rate: int = sample_rate
        if self._sample_rate not in {8000, 16000, 32000}:
            raise ValueError("invalid_sample_rate")
        # validate frame width
        self._frame_width: int = frame_width
        if self._frame_width not in {10, 20}:
            raise ValueError("invalid_frame_width")
        # convert the edge delays from milliseconds into frame counts
        self._rise_length: int = vad_rise_delay // frame_width
        self._fall_length: int = vad_fall_delay // frame_width

        self._vad = WebRtcVad(sample_rate=sample_rate, mode=mode)

        # state of the current run of identical vad results
        self._run_value: int = 0
        self._run_length: int = 0

    def __call__(self, context: SpeechContext, frame: np.ndarray) -> None:
        """Processes a single frame of audio to determine if voice is present.

        Args:
            context (SpeechContext): state based information that needs to be
                shared between pieces of the pipeline
            frame (np.ndarray): single frame of PCM-16 audio from an input source
        """
        # validate dtype
        if not np.issubdtype(frame.dtype, np.signedinteger):
            raise TypeError("invalid_dtype")

        result: bool = self._vad.is_speech(frame)

        # track how many consecutive frames produced the same result
        raw = result > 0
        if raw == self._run_value:
            self._run_length += 1
        else:
            self._run_value = raw
            self._run_length = 1

        # only flip the speech flag once the run is long enough to cover
        # the configured rise or fall delay
        if self._run_value != context.is_speech:
            if self._run_value and self._run_length >= self._rise_length:
                context.is_speech = True
                _LOG.info("vad: true")
            if not self._run_value and self._run_length >= self._fall_length:
                context.is_speech = False
                _LOG.info("vad: false")

    def reset(self) -> None:
        """Resets the current state."""
        self._run_value = 0
        self._run_length = 0

    def close(self) -> None:
        """Close interface for use in the pipeline."""
        self.reset()
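The detector converts the millisecond delays into frame counts, so a brief usage sketch may help. The snippet below is illustrative only and is not part of the module: it assumes SpeechContext can be constructed without arguments and that the bundled WebRTC extension (spokestack.extensions.webrtc) is compiled, and it feeds a single 20 ms frame of silence through the detector.

    import numpy as np

    from spokestack.context import SpeechContext
    from spokestack.vad.webrtc import VoiceActivityDetector, VERY_AGGRESSIVE

    detector = VoiceActivityDetector(
        sample_rate=16000,      # must be 8000, 16000, or 32000 Hz
        frame_width=20,         # must be 10 or 20 ms
        vad_rise_delay=100,     # ~5 voiced frames before is_speech turns on
        vad_fall_delay=500,     # ~25 unvoiced frames before it turns off
        mode=VERY_AGGRESSIVE,
    )
    context = SpeechContext()                # assumed no-arg constructor
    frame = np.zeros(320, dtype=np.int16)    # 20 ms of silence at 16 kHz, PCM-16
    detector(context, frame)
    print(context.is_speech)                 # expected: False for a silent frame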
class VoiceActivityTrigger:
    """Voice Activity Detector trigger pipeline component."""

    def __init__(self) -> None:
        self._is_speech = False

    def __call__(self, context: SpeechContext, frame: np.ndarray) -> None:
        """Activates the speech context whenever speech is detected.

        Args:
            context (SpeechContext): state based information that needs to be
                shared between pieces of the pipeline
            frame (np.ndarray): single frame of PCM-16 audio from an input source
        """
        # activate the context only on the rising edge of is_speech
        if context.is_speech != self._is_speech:
            if context.is_speech:
                context.is_active = True
            self._is_speech = context.is_speech

    def close(self) -> None:
        """Close interface for use in the pipeline."""
        self.reset()

    def reset(self) -> None:
        """Resets the current state."""
        self._is_speech = False
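As a companion sketch under the same assumptions (SpeechContext built without arguments, WebRTC extension available), the trigger is placed after the detector in the pipeline; it sets context.is_active only on the rising edge of context.is_speech.

    import numpy as np

    from spokestack.context import SpeechContext
    from spokestack.vad.webrtc import VoiceActivityDetector, VoiceActivityTrigger

    detector = VoiceActivityDetector(sample_rate=16000, frame_width=20)
    trigger = VoiceActivityTrigger()
    context = SpeechContext()

    frame = np.zeros(320, dtype=np.int16)  # one 20 ms frame of PCM-16 audio
    detector(context, frame)   # updates context.is_speech
    trigger(context, frame)    # activates the context on a rising edge
    print(context.is_active)   # False here; True once the detector reports speech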