Source code for spokestack.asr.spokestack.speech_recognizer

"""
This module contains the recognizer for cloud-based ASR in
the speech pipeline.
"""
import logging
from typing import Any

import numpy as np

from spokestack.asr.spokestack.cloud_client import CloudClient
from spokestack.context import SpeechContext

_LOG = logging.getLogger(__name__)

class CloudSpeechRecognizer:
    """Speech recognizer for use in the speech pipeline

    Forwards audio frames to a CloudClient and copies the transcript and
    confidence of the first returned hypothesis onto the speech context,
    raising pipeline events as results arrive.

    Args:
        spokestack_id (str): identity under spokestack api credentials
        spokestack_secret (str): secret key from spokestack api credentials
        language (str): language recognized
        sample_rate (int): audio sample rate (Hz)
        frame_width (int): frame width of the audio (ms)
        idle_timeout (int): how long the connection may stay idle before it
            is closed. The value is divided by ``frame_width`` to obtain
            the number of idle iterations handed to the client, so it is
            presumably expressed in ms as well.
        **kwargs (Any): accepted but not used by this class.
    """

    def __init__(
        self,
        spokestack_id: str = "",
        spokestack_secret: str = "",
        language: str = "en",
        sample_rate: int = 16000,
        frame_width: int = 20,
        idle_timeout: int = 5000,
        **kwargs: Any,
    ) -> None:
        self._client: CloudClient = CloudClient(
            key_id=spokestack_id,
            key_secret=spokestack_secret,
            language=language,
            sample_rate=sample_rate,
            # The idle counter advances once per __call__ (i.e. per frame),
            # so the timeout is converted into a number of frames here.
            idle_timeout=int(idle_timeout / frame_width),
        )
        # Tracks whether this recognizer has an utterance in progress,
        # independently of the pipeline's own context.is_active flag.
        self._is_active = False

    def __call__(self, context: SpeechContext, frame: np.ndarray) -> None:
        """Entry point of the recognizer

        Exactly one of the following happens per call, checked in order:

        1. The context is active but the recognizer is not: connect,
           initialize the client and send the frame.
        2. The context is active and so is the recognizer: send the frame
           and process any response.
        3. The context is inactive but the recognizer is still active:
           end the current request.
        4. The client has not yet produced a final response: keep
           processing responses.
        5. The idle counter is below the idle timeout: increment it.
        6. Otherwise: disconnect the client.

        Args:
            context (SpeechContext): current state of the speech pipeline
            frame (np.ndarray): single frame of audio
        """
        if context.is_active and not self._is_active:
            self._begin()
            self._send(frame)
            _LOG.debug("ready for speech")
        elif context.is_active:
            self._send(frame)
            self._receive(context)
            _LOG.debug("begin speech")
        elif self._is_active:
            self._commit()
            _LOG.debug("end speech")
        elif not self._client.is_final:
            self._receive(context)
        elif self._client.idle_count < self._client.idle_timeout:
            self._client.idle_count += 1
        else:
            self._client.disconnect()

    def _begin(self) -> None:
        """Connects and initializes the client and marks the recognizer active."""
        self._client.connect()
        self._client.initialize()
        self._is_active = True
        self._client.idle_count = 0

    def _send(self, frame: np.ndarray) -> None:
        """Passes a single audio frame to the client."""
        self._client.send(frame)

    def _receive(self, context: SpeechContext) -> None:
        """Reads the client response and updates the context from it.

        Only the first hypothesis in the response is used. A non-empty
        transcript raises ``partial_recognize``; once the client reports a
        final response, ``recognize`` is raised for a non-empty transcript
        and ``timeout`` for an empty one.

        Args:
            context (SpeechContext): current state of the speech pipeline
        """
        self._client.receive()
        hypotheses = self._client.response.get("hypotheses")
        if hypotheses:
            hypothesis = hypotheses[0]
            context.transcript = hypothesis["transcript"]
            context.confidence = hypothesis["confidence"]
            if context.transcript:
                context.event("partial_recognize")

            # NOTE(review): this file was recovered from a whitespace-collapsed
            # listing; the final-response check is assumed to sit inside the
            # `if hypotheses:` branch -- confirm against the upstream source.
            if self._client.is_final:
                if context.transcript:
                    context.event("recognize")
                    _LOG.debug("recognize event")
                else:
                    context.event("timeout")
                    _LOG.debug("timeout event")

    def _commit(self) -> None:
        """Marks the recognizer inactive and ends the client request."""
        self._is_active = False
        self._client.end()

    def reset(self) -> None:
        """ resets client connection """
        self._client.idle_count = 0
        self._is_active = False
        self.close()

    def close(self) -> None:
        """ closes client connection """
        self._client.disconnect()