Source code for spokestack.tts.lite

"""
Spokestack-Lite Speech Synthesizer

This module contains the SpeechSynthesizer class used to convert text to speech
using local TTS models trained on the Spokestack platform. A SpeechSynthesizer
instance can be passed to the TextToSpeechManager for playback.

Example:
    This example assumes that a TTS model was downloaded from the Spokestack
    platform and extracted to the :code:`model` directory. ::

        from spokestack.io.pyaudio import PyAudioOutput
        from spokestack.tts.manager import TextToSpeechManager, FORMAT_PCM16
        from spokestack.tts.lite import SpeechSynthesizer, BLOCK_LENGTH, SAMPLE_RATE

        tts = TextToSpeechManager(
            SpeechSynthesizer("./model"),
            PyAudioOutput(sample_rate=SAMPLE_RATE, frames_per_buffer=BLOCK_LENGTH),
            format_=FORMAT_PCM16)

        tts.synthesize("Hello world!")
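
    The blocks yielded by :code:`SpeechSynthesizer.synthesize` can also be
    captured directly, for example to store the audio instead of playing it.
    A minimal sketch (the output filename here is arbitrary) that writes the
    PCM-16 blocks to a WAV file with the standard library :code:`wave`
    module::

        import wave

        from spokestack.tts.lite import SpeechSynthesizer, SAMPLE_RATE

        synth = SpeechSynthesizer("./model")
        with wave.open("hello.wav", "wb") as out:
            out.setnchannels(1)            # mono audio
            out.setsampwidth(2)            # 16-bit samples
            out.setframerate(SAMPLE_RATE)  # 24 kHz
            for block in synth.synthesize("Hello world!"):
                out.writeframes(block.tobytes())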

"""

import importlib
import json
import os
import re
import typing as T
from collections import defaultdict

import numpy as np

from spokestack.models.tensorflow import TFLiteModel

# signal configuration
SAMPLE_RATE = 24000
HOP_LENGTH = 240
ENCODER_PAD = -2
BREAK_LENGTH = 0.1

# streaming/cross-fading configuration
FRAME_LENGTH = 63
FRAME_OVERLAP = 1
BLOCK_LENGTH = FRAME_LENGTH * HOP_LENGTH
BLOCK_OVERLAP = FRAME_OVERLAP * HOP_LENGTH
FADE_OUT = np.linspace(1, 0, BLOCK_OVERLAP, dtype=np.float32)
FADE_IN = FADE_OUT[::-1]
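
# Each synthesized block is BLOCK_LENGTH = FRAME_LENGTH * HOP_LENGTH samples
# (63 * 240 = 15120, about 0.63 seconds at 24kHz), and consecutive blocks
# share a BLOCK_OVERLAP-sample (10ms) seam. FADE_IN and FADE_OUT are
# complementary linear ramps that sum to 1 elementwise, so when the faded-out
# tail of one decoder output is mixed with the faded-in head of the next in
# SpeechSynthesizer.synthesize, the seam keeps unity gain.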


class SpeechSynthesizer:
    """Initialize a new lightweight speech synthesizer

    Args:
        model_path (str): Path to the extracted TTS model downloaded from the
            Spokestack platform
    """

    def __init__(self, model_path: str):
        # load NLP configuration
        self._lexicon = _load_lexicon(os.path.join(model_path, "lexicon.txt"))
        with open(os.path.join(model_path, "metadata.json")) as file:
            metadata = json.load(file)
        lang = metadata["language"]
        self._sym_to_id = {s: i for i, s in enumerate(metadata["alphabet"])}
        self._language: T.Any = importlib.import_module(f"spokestack.tts.lite.{lang}")
        self._nlp = self._language.nlp()

        # load the TTS models
        self._aligner = TFLiteModel(os.path.join(model_path, "align.tflite"))
        self._encoder = TFLiteModel(os.path.join(model_path, "encode.tflite"))
        self._decoder = TFLiteModel(os.path.join(model_path, "decode.tflite"))
        self._aligner_input_index = self._aligner.input_details[0]["index"]
        self._encoder_input_index = self._encoder.input_details[0]["index"]

    def synthesize(
        self, utterance: str, *_args: T.List, **_kwargs: T.Dict
    ) -> T.Iterator[np.array]:
        """Synthesize a text utterance to speech audio

        Args:
            utterance (str): The text string to synthesize

        Returns:
            Iterator[np.array]: A generator that yields a sequence of PCM-16
                numpy audio blocks for playback, storage, etc.
        """
        # segment sentences into a list of phoneme/grapheme lists
        for tokens in self._parse(utterance):
            # convert tokens to a vector of ids
            inputs = self._vectorize(tokens)

            # run the aligner model
            self._aligner.resize(self._aligner_input_index, inputs.shape)
            inputs = self._aligner(inputs)[0]

            # run the encoder model
            self._encoder.resize(self._encoder_input_index, inputs.shape)
            encoded = self._encoder(inputs)[0]

            # stream the decoder model and cross-fade the output audio
            overlap = np.zeros([BLOCK_OVERLAP], dtype=np.float32)
            for i in range(FRAME_OVERLAP, len(encoded), FRAME_LENGTH):
                # decode the current frame, padding as needed to fill the
                # decoder's input
                inputs = encoded[i - FRAME_OVERLAP : i + FRAME_LENGTH]
                inputs = np.pad(
                    inputs,
                    [(0, (FRAME_LENGTH + FRAME_OVERLAP) - len(inputs)), (0, 0)],
                    "constant",
                    constant_values=ENCODER_PAD,
                )
                outputs = self._decoder(inputs)[0]

                # fade in the new block, convert it to int16, and return it
                overlap += outputs[:BLOCK_OVERLAP] * FADE_IN
                block = np.hstack([overlap, outputs[BLOCK_OVERLAP:-BLOCK_OVERLAP]])
                yield (block * (2 ** 15 - 1)).astype(np.int16)

                # fade out the previous block for mixing with the next block
                overlap = outputs[-BLOCK_OVERLAP:] * FADE_OUT

            # add a break after each segment
            yield np.zeros([int(BREAK_LENGTH * SAMPLE_RATE)], dtype=np.int16)

    def _parse(self, text: str) -> T.Iterator[str]:
        # perform language-specific number conversions, abbreviation expansions, etc.
        text = self._language.clean(text)

        # escape characters used for phonetic substitution
        text = re.sub(r"{", "[", text)
        text = re.sub(r"}", "]", text)

        # segment and tokenize the text, and convert words to their phonetic
        # representations using the attached lexicon
        for sentence in self._nlp(text).sents:
            tokens = []
            for token in sentence:
                if token.pos_ in ["SYM", "PUNCT"]:
                    tokens.append(token.text_with_ws)
                else:
                    entry = self._lexicon.get(token.text.lower(), {})
                    ipa = entry.get(token.tag_, entry.get(None))
                    tokens.append(
                        f"{{{ipa}}}{token.whitespace_}" if ipa else token.text_with_ws
                    )
            yield re.sub(r"}\s+{", " ", "".join(tokens))

    def _vectorize(self, text: str) -> np.array:
        # start with bos token
        vector = [self._sym_to_id["^"]]
        while text:
            # check for curly braces and treat their contents as ipa
            matches = re.match(r"(.*?)\{(.+?)\}(.*)", text)

            # no ipa in this block, vectorize graphemes
            if not matches:
                vector.extend(self._vectorize_text(text))
                break

            # ipa found, vectorize leading text, then phones
            vector.extend(self._vectorize_text(matches.group(1)))
            vector.extend(self._vectorize_phones(matches.group(2)))
            text = matches.group(3)

        # append eos token
        vector.append(self._sym_to_id["~"])
        return np.array(vector, dtype=np.int32)

    def _vectorize_text(self, text: T.Union[str, T.List[str]]) -> T.List[int]:
        return [
            self._sym_to_id[c] for c in text if c in self._sym_to_id and c not in "_^~"
        ]

    def _vectorize_phones(self, phones: str) -> T.List[int]:
        return self._vectorize_text([f"@{c}" if c != " " else c for c in phones])
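
# _load_lexicon reads a tab-separated pronunciation lexicon: each line holds a
# word, one or more comma-separated IPA pronunciations (only the first is
# kept), and an optional part-of-speech tag. Entries are stored in a nested
# dict keyed by word and then by POS tag (None when no tag is given), which
# _parse uses to pick a tag-specific pronunciation, falling back to the
# untagged entry.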

def _load_lexicon(path: str) -> T.Dict[str, T.Dict[T.Optional[str], str]]:
    lexicon: T.Dict[str, T.Dict[T.Optional[str], str]] = defaultdict(dict)
    with open(path, "r") as file:
        for line in file:
            # parse the lexicon entry, discarding any alternative pronunciations
            parts = line.strip().split("\t")
            if len(parts) > 1:
                word = parts[0].lower()
                ipa = parts[1].split(",")[0].strip()
                pos = parts[2] if len(parts) > 2 else None
                lexicon[word][pos] = ipa
    return lexicon