# Source code for spokestack.tts.clients.spokestack

"""
This module contains the Spokestack client for text to speech
"""
import base64
import hashlib
import hmac
import json
from typing import Any, Iterator

import requests

# Maps each supported synthesis mode to the name of the API query field
# that is requested for it and looked up again in the response payload.
_MODES = dict(
    ssml="synthesizeSsml",
    markdown="synthesizeMarkdown",
    text="synthesizeText",
)


class TextToSpeechClient:
    """Spokestack Text to Speech Client

    Builds a signed synthesis request, posts it to the Spokestack API and
    exposes the result either as a URL (:meth:`synthesize_url`) or as a
    stream of encoded audio bytes (:meth:`synthesize`).

    Args:
        key_id (str): identity from spokestack api credentials
        key_secret (str): secret key from spokestack api credentials
        url (str): spokestack api url
    """

    def __init__(
        self, key_id: str, key_secret: str, url: str = "https://api.spokestack.io/v1"
    ) -> None:
        self._key_id = key_id
        # hmac requires a bytes key, so the secret is encoded once up front.
        self._key = key_secret.encode("utf-8")
        self._url = url

    def synthesize(
        self,
        utterance: str,
        mode: str = "text",
        voice: str = "demo-male",
        profile: str = "default",
    ) -> Iterator[bytes]:
        """Converts the given utterance to speech.

        Text can be formatted as plain text (`mode="text"`),
        SSML (`mode="ssml"`), or Speech Markdown (`mode="markdown"`).

        This method also supports different formats for the synthesized
        audio via the `profile` argument. The supported profiles and
        their associated formats are:

        - `default`: 24kHz, 64kbps mono MP3
        - `alexa`: 24kHz, 48kbps mono MP3
        - `discord`: 48kHz, 64kbps stereo OPUS
        - `twilio`: 8kHz, 64kbps mono MP3

        Args:
            utterance (str): string that needs to be rendered as speech.
            mode (str): synthesis mode to use with utterance. text, ssml, markdown.
            voice (str): name of the tts voice.
            profile (str): name of the audio profile used to create the
                           resulting stream.

        Returns:
            (Iterator[bytes]): Encoded audio response in the form of a sequence of bytes

        Raises:
            ValueError: if `mode` is not one of the supported modes.
            TTSError: if the synthesis response contains errors.
            Exception: if either HTTP request returns a status other than 200.

        """
        audio_url = self.synthesize_url(utterance, mode, voice, profile)
        response = requests.get(audio_url, stream=True)

        if response.status_code != 200:
            reason = response.reason
            # A streamed response keeps its connection checked out until the
            # body is consumed or the response is closed; nobody will read
            # the body on this path, so release it before raising.
            response.close()
            raise Exception(reason)

        # chunk_size=None yields data in whatever sizes it arrives.
        return response.iter_content(chunk_size=None)

    def synthesize_url(
        self,
        utterance: str,
        mode: str = "text",
        voice: str = "demo-male",
        profile: str = "default",
    ) -> str:
        """Converts the given utterance to speech accessible by a URL.

        Text can be formatted as plain text (`mode="text"`),
        SSML (`mode="ssml"`), or Speech Markdown (`mode="markdown"`).

        This method also supports different formats for the synthesized
        audio via the `profile` argument. The supported profiles and
        their associated formats are:

        - `default`: 24kHz, 64kbps mono MP3
        - `alexa`: 24kHz, 48kbps mono MP3
        - `discord`: 48kHz, 64kbps stereo OPUS
        - `twilio`: 8kHz, 64kbps mono MP3

        Args:
            utterance (str): string that needs to be rendered as speech.
            mode (str): synthesis mode to use with utterance. text, ssml, markdown.
            voice (str): name of the tts voice.
            profile (str): name of the audio profile used to create the
                           resulting stream.

        Returns:
            (str): URL of the audio clip

        Raises:
            ValueError: if `mode` is not one of the supported modes.
            TTSError: if the response body contains an "errors" entry.
            Exception: if the HTTP request returns a status other than 200.

        """
        body = self._build_body(utterance, mode, voice, profile)
        # The request is authenticated with a base64-encoded HMAC-SHA256 of
        # the exact request body, keyed with the account secret.
        signature = base64.b64encode(
            hmac.new(self._key, body.encode("utf-8"), hashlib.sha256).digest()
        ).decode("utf-8")
        headers = {
            "Authorization": f"Spokestack {self._key_id}:{signature}",
            "Content-Type": "application/json",
        }
        response: Any = requests.post(self._url, headers=headers, data=body)

        if response.status_code != 200:
            raise Exception(response.reason)

        # From here on `response` is the decoded JSON payload.
        response = response.json()
        if "errors" in response:
            raise TTSError(response["errors"])

        # The result is nested under the field name that was queried.
        return response["data"][_MODES[mode]]["url"]

    @staticmethod
    def _build_body(message: str, mode: str, voice: str, profile: str) -> str:
        """Serializes the synthesis query and its variables to a JSON string.

        Args:
            message (str): text to synthesize, sent as the `mode` variable.
            mode (str): synthesis mode; must be a key of `_MODES`.
            voice (str): name of the tts voice.
            profile (str): audio profile name; sent upper-cased.

        Returns:
            (str): JSON document with "query" and "variables" entries.

        Raises:
            ValueError: if `mode` is not a key of `_MODES`.

        """
        if mode not in _MODES:
            raise ValueError("invalid_mode")

        # The mode doubles as the query variable name, so the same string is
        # used for the variable declaration, the field argument and the key
        # in "variables" below.
        query = f"""
        query PythonSynthesis(
          $voice: String!, ${mode}: String!, $profile: SynthesisProfile) {{
            {_MODES[mode]}(voice: $voice, {mode}: ${mode}, profile: $profile) {{url}}
        }}
        """
        return json.dumps(
            {
                "query": query,
                "variables": {
                    "voice": voice,
                    mode: message,
                    "profile": profile.upper(),
                },
            }
        )


class TTSError(Exception):
    """Text to speech error wrapper

    Collects the "message" of every entry in an API error list and stores
    the resulting list as the exception's single argument.

    Args:
        response (Any): iterable of error entries, each expected to be a
                        mapping with a "message" key.
    """

    def __init__(self, response: Any) -> None:
        # Fall back to the entry's string form when it has no "message", so
        # that building this exception never hides the underlying API error
        # behind a KeyError/TypeError.
        messages = [
            error["message"]
            if isinstance(error, dict) and "message" in error
            else str(error)
            for error in response
        ]
        super().__init__(messages)