Source code for spokestack.tts.clients.spokestack

"""
This module contains the Spokestack client for text to speech
"""
import base64
import hashlib
import hmac
import json
from typing import Any, Iterator

import requests

# Maps each supported synthesis mode name to the GraphQL field that is
# queried for it (and under which the API response nests the result).
_MODES = dict(
    ssml="synthesizeSsml",
    markdown="synthesizeMarkdown",
    text="synthesizeText",
)


class TextToSpeechClient:
    """Spokestack Text to Speech Client

    Args:
        key_id (str): identity from spokestack api credentials
        key_secret (str): secret key from spokestack api credentials
        url (str): spokestack api url
    """

    def __init__(
        self, key_id: str, key_secret: str, url: str = "https://api.spokestack.io/v1"
    ) -> None:
        self._key_id = key_id
        # The secret is kept as bytes because it is used as the HMAC key.
        self._key = key_secret.encode("utf-8")
        self._url = url

    def synthesize(
        self,
        utterance: str,
        mode: str = "text",
        voice: str = "demo-male",
        profile: str = "default",
    ) -> Iterator[bytes]:
        """Converts the given utterance to speech.

        Text can be formatted as plain text (`mode="text"`),
        SSML (`mode="ssml"`), or Speech Markdown (`mode="markdown"`).

        This method also supports different formats for the synthesized
        audio via the `profile` argument. The supported profiles and their
        associated formats are:

        - `default`: 24kHz, 64kbps mono MP3
        - `alexa`: 24kHz, 48kbps mono MP3
        - `discord`: 48kHz, 64kbps stereo OPUS
        - `twilio`: 8kHz, 64kbps mono MP3

        Args:
            utterance (str): string that needs to be rendered as speech.
            mode (str): synthesis mode to use with utterance.
                text, ssml, markdown.
            voice (str): name of the tts voice.
            profile (str): name of the audio profile used to create the
                resulting stream.

        Returns:
            (Iterator[bytes]): Encoded audio response in the form of a
                sequence of bytes

        Raises:
            ValueError: if `mode` is not one of the supported modes.
            TTSError: if the synthesis API response contains errors.
            Exception: if either HTTP request returns a non-200 status.
        """
        audio_url = self.synthesize_url(utterance, mode, voice, profile)
        response = requests.get(audio_url, stream=True)
        if response.status_code != 200:
            # A streamed response keeps its connection checked out until the
            # body is consumed or the response is closed, so release it
            # explicitly before reporting the failure.
            reason = response.reason
            response.close()
            raise Exception(reason)
        return response.iter_content(chunk_size=None)

    def synthesize_url(
        self,
        utterance: str,
        mode: str = "text",
        voice: str = "demo-male",
        profile: str = "default",
    ) -> str:
        """Converts the given utterance to speech accessible by a URL.

        Text can be formatted as plain text (`mode="text"`),
        SSML (`mode="ssml"`), or Speech Markdown (`mode="markdown"`).

        This method also supports different formats for the synthesized
        audio via the `profile` argument. The supported profiles and their
        associated formats are:

        - `default`: 24kHz, 64kbps mono MP3
        - `alexa`: 24kHz, 48kbps mono MP3
        - `discord`: 48kHz, 64kbps stereo OPUS
        - `twilio`: 8kHz, 64kbps mono MP3

        Args:
            utterance (str): string that needs to be rendered as speech.
            mode (str): synthesis mode to use with utterance.
                text, ssml, markdown.
            voice (str): name of the tts voice.
            profile (str): name of the audio profile used to create the
                resulting stream.

        Returns:
            URL of the audio clip

        Raises:
            ValueError: if `mode` is not one of the supported modes.
            TTSError: if the API response contains an "errors" entry.
            Exception: if the HTTP request returns a non-200 status.
        """
        body = self._build_body(utterance, mode, voice, profile)
        # The request is authenticated with a base64-encoded HMAC-SHA256
        # signature computed over the exact body that is sent.
        signature = base64.b64encode(
            hmac.new(self._key, body.encode("utf-8"), hashlib.sha256).digest()
        ).decode("utf-8")
        headers = {
            "Authorization": f"Spokestack {self._key_id}:{signature}",
            "Content-Type": "application/json",
        }
        response: Any = requests.post(self._url, headers=headers, data=body)

        if response.status_code != 200:
            raise Exception(response.reason)

        response = response.json()

        # A 200 response can still carry errors in its JSON payload.
        if "errors" in response:
            raise TTSError(response["errors"])

        return response["data"][_MODES[mode]]["url"]

    @staticmethod
    def _build_body(message: str, mode: str, voice: str, profile: str) -> str:
        """Builds the JSON-encoded GraphQL request body for a synthesis call.

        Args:
            message (str): utterance to synthesize.
            mode (str): synthesis mode; must be a key of `_MODES`.
            voice (str): name of the tts voice.
            profile (str): audio profile name; sent upper-cased.

        Returns:
            JSON string containing the query and its variables.

        Raises:
            ValueError: if `mode` is not a key of `_MODES`.
        """
        if mode not in _MODES:
            raise ValueError("invalid_mode")

        # The mode name doubles as the GraphQL variable/argument name, and
        # `_MODES[mode]` selects the field to query.
        query = f"""
        query PythonSynthesis(
          $voice: String!, ${mode}: String!, $profile: SynthesisProfile) {{
            {_MODES[mode]}(voice: $voice, {mode}: ${mode}, profile: $profile) {{url}}
        }}
        """
        return json.dumps(
            {
                "query": query,
                "variables": {
                    "voice": voice,
                    mode: message,
                    "profile": profile.upper(),
                },
            }
        )
class TTSError(Exception):
    """Text to speech error wrapper

    Collects the "message" value of every entry in the given error
    response and passes the resulting list to `Exception` as its single
    argument.

    Args:
        response (Any): iterable of error entries, each indexable by
            the key "message".
    """

    def __init__(self, response: Any) -> None:
        collected = []
        for entry in response:
            collected.append(entry["message"])
        super().__init__(collected)