Source code for spokestack.nlu.tflite

"""
This module contains the class for using TFLite NLU models. Here, an NLU model
is a TFLite model that takes an utterance and returns an intent along with
any slots associated with that intent.
"""
import json
import logging
import os
from importlib import import_module
from typing import Any, Dict, List, Tuple

import numpy as np
from tokenizers import BertWordPieceTokenizer

from spokestack import utils
from spokestack.models.tensorflow import TFLiteModel
from spokestack.nlu.result import Result

_LOG = logging.getLogger(__name__)


class TFLiteNLU:
    """Abstraction for using TFLite NLU models

    Args:
        model_dir (str): path to the model directory containing
            nlu.tflite, metadata.json, and vocab.txt
    """

    def __init__(self, model_dir: str) -> None:
        self._model = TFLiteModel(model_path=os.path.join(model_dir, "nlu.tflite"))
        self._metadata = utils.load_json(os.path.join(model_dir, "metadata.json"))
        self._tokenizer = BertWordPieceTokenizer(os.path.join(model_dir, "vocab.txt"))
        self._max_length = self._model.input_details[0]["shape"][-1]
        self._intent_decoder = {
            i: intent["name"] for i, intent in enumerate(self._metadata["intents"])
        }
        self._tag_decoder = {i: tag for i, tag in enumerate(self._metadata["tags"])}
        self._intent_meta = {
            intent.pop("name"): intent for intent in self._metadata["intents"]
        }
        self._slot_meta = {}
        for intent in self._intent_meta:
            for slot in self._intent_meta[intent]["slots"]:
                self._slot_meta[slot.pop("name")] = slot
        self._warm_up()

    def __call__(self, utterance: str) -> Result:
        """Classifies a string utterance into an intent and identifies any
        associated slots contained in the utterance. The slots get parsed
        based on type and then returned along with the intent and its
        associated confidence value.

        Args:
            utterance (str): string that needs to be understood

        Returns (Result): A class with properties for the identified intent,
            along with raw, parsed slots and model confidence in prediction
        """
        inputs, input_ids = self._encode(utterance)
        outputs = self._model(inputs)
        intent, tags, confidence = self._decode(outputs)
        # slice off special tokens: [CLS], [SEP]
        tags = tags[: len(input_ids) - 2]
        _LOG.debug(f"{tags}")
        input_ids = input_ids[1:-1]
        _LOG.debug(f"{input_ids}")
        # retrieve slots from the tagged positions and decode slots back
        # into original values
        slots = [
            (token_id, tag[2:])
            for token_id, tag in zip(input_ids, tags)
            if tag != "o"
        ]
        _LOG.debug(f"{slots}")
        slot_map: dict = {}
        for token, tag in slots:
            if tag in slot_map:
                slot_map[tag].append(token)
            else:
                slot_map[tag] = [token]
        for key, value in slot_map.items():
            slot_map[key] = self._tokenizer.decode(value)
        # attempt to resolve tagged tokens into slots and
        # collect the successful ones
        parsed_slots = {}
        for key in slot_map:
            parsed = self._parse_slots(self._slot_meta[key], slot_map[key])
            parsed_slots[key] = {
                "name": key,
                "parsed_value": parsed,
                "raw_value": slot_map[key],
            }
        _LOG.debug(f"parsed slots: {parsed_slots}")
        return Result(
            utterance=utterance,
            intent=intent,
            confidence=confidence,
            slots=parsed_slots,
        )

    def _warm_up(self) -> None:
        # make an array the same size as the inputs to warm the
        # model since first inference is always slower than subsequent
        warm = np.zeros(self._model.input_details[0]["shape"], dtype=np.int32)
        _ = self._model(warm)

    def _encode(self, utterance: str) -> Tuple[np.ndarray, List[int]]:
        inputs = self._tokenizer.encode(utterance)
        # get the non-padded/truncated token ids to match the
        # original utterance to the respective labels and
        # use the length to slice the results
        input_ids = inputs.ids
        # it's (max_length + 1) because the [CLS]
        # token gets appended inside the model
        # notice the slice [1:] when we convert to an array
        inputs.truncate(max_length=self._max_length + 1)
        inputs.pad(length=self._max_length + 1)
        inputs = np.array(inputs.ids[1:], np.int32)
        # add the batch dimension for the TFLite model
        inputs = np.expand_dims(inputs, 0)
        return inputs, input_ids
    def _decode(self, outputs: list) -> Tuple[str, List[str], float]:
        # to get the index of the highest probability, we apply argmax
        # to the posteriors, which allows the labels to be decoded with
        # an integer-to-string mapping; we derive the confidence from
        # the highest probability
        intent_posterior, tag_posterior = outputs
        intent, confidence = self._decode_intent(intent_posterior)
        tags = self._decode_tags(tag_posterior)
        _LOG.debug(f"decoded tags: {tags}")
        _LOG.debug(f"decoded intent: {intent}")
        _LOG.debug(f"confidence: {confidence}")
        return intent, tags, confidence

    def _decode_tags(self, posterior: np.ndarray) -> List[Any]:
        posterior = np.squeeze(posterior, 0)
        tags = np.argmax(posterior, -1)
        return [self._tag_decoder.get(tag) for tag in tags]

    def _decode_intent(self, posterior: np.ndarray) -> Any:
        posterior = np.squeeze(posterior, 0)
        intent = np.argmax(posterior, -1)
        return self._intent_decoder.get(intent), posterior[intent]

    def _parse_slots(self, slot_meta: Dict[str, Any], slots: str) -> Any:
        # the slot's declared type selects the parser module that resolves
        # the raw decoded text into a typed value
        slot_type = slot_meta["type"]
        parser = import_module(f"spokestack.nlu.parsers.{slot_type}")
        facets = json.loads(slot_meta["facets"])
        return parser.parse(facets, slots)  # type: ignore
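
# A minimal sketch (not part of the library) of the tag-grouping step inside
# __call__: tags are stripped of their two-character prefix (e.g. "b_", "i_")
# and the token ids for each slot are collected in order, exactly as the
# slot_map loop above does. The token ids and tag names here are hypothetical.
def _group_tagged_tokens_demo() -> Dict[str, List[int]]:
    input_ids = [2275, 1037, 25309, 2005, 2274, 2781]  # hypothetical ids
    tags = ["o", "o", "o", "o", "b_duration", "i_duration"]
    slot_map: Dict[str, List[int]] = {}
    for token_id, tag in zip(input_ids, tags):
        if tag != "o":  # "o" marks tokens outside any slot
            slot_map.setdefault(tag[2:], []).append(token_id)
    return slot_map  # -> {"duration": [2274, 2781]}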
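
# A usage sketch, assuming a model directory containing the nlu.tflite,
# metadata.json, and vocab.txt files described in the class docstring;
# the path and the utterance are hypothetical.
if __name__ == "__main__":
    nlu = TFLiteNLU("path/to/model_dir")
    result = nlu("set a timer for five minutes")
    # Result carries the fields passed to it in __call__ (see its docstring)
    print(result.intent, result.confidence)
    for name, slot in result.slots.items():
        print(f"{name}: {slot['raw_value']} -> {slot['parsed_value']}")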