prado_model.py

# Copyright 2020 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tensorflow graph creator for PRADO model."""

import collections
import functools

from typing import Mapping, Dict, Any
from absl import logging
import tensorflow.compat.v1 as tf

from prado import common_layer # import sequence_projection module
from tf_ops import sequence_string_projection_op as ssp # import sequence_projection module

_NGRAM_INFO = [
    {
        "name": "unigram",
        "padding": 0,
        "kernel_size": [1, 1],
        "mask": None
    },
    {
        "name": "bigram",
        "padding": 1,
        "kernel_size": [2, 1],
        "mask": None
    },
    {
        "name": "trigram",
        "padding": 2,
        "kernel_size": [3, 1],
        "mask": None
    },
    {
        "name": "bigramskip1",
        "padding": 2,
        "kernel_size": [3, 1],
        "mask": [[[[1]]], [[[0]]], [[[1]]]]
    },
    {
        "name": "bigramskip2",
        "padding": 3,
        "kernel_size": [4, 1],
        "mask": [[[[1]]], [[[0]]], [[[0]]], [[[1]]]]
    },
    {
        "name": "fourgram",
        "padding": 3,
        "kernel_size": [4, 1],
        "mask": None
    },
    {
        "name": "fivegram",
        "padding": 4,
        "kernel_size": [5, 1],
        "mask": None
    },
]


def _get_params(model_config, varname, default_value=None):
  value = model_config[varname] if varname in model_config else default_value
  logging.info("%s = %s", varname, value)
  return value


def create_projection(model_config, mode, inputs):
  """Create projection."""
  feature_size = _get_params(model_config, "feature_size")
  text_distortion_probability = _get_params(model_config,
                                            "text_distortion_probability", 0.0)
  max_seq_len = _get_params(model_config, "max_seq_len", 0)
  add_eos_tag = _get_params(model_config, "add_eos_tag")
  is_training = mode == tf.estimator.ModeKeys.TRAIN
  distortion_probability = text_distortion_probability if is_training else 0.0
  raw_string = tf.identity(inputs, "Input")
  features, _, seq_length = ssp.sequence_string_projection(
      input=raw_string,
      feature_size=feature_size,
      max_splits=max_seq_len - 1,
      distortion_probability=distortion_probability,
      split_on_space=True,
      add_eos_tag=add_eos_tag,
      vocabulary="")

  if mode != tf.estimator.ModeKeys.PREDICT and max_seq_len > 0:
    pad_value = [[0, 0], [0, max_seq_len - tf.shape(features)[1]], [0, 0]]
    features = tf.pad(features, pad_value)
    batch_size = inputs.get_shape().as_list()[0]
    features = tf.reshape(features,
                          [batch_size, max_seq_len, feature_size])
  return features, seq_length


def _fully_connected(pod_layers, tensor, num_features, mode, bsz, keep_prob):
  """Fully connected layer."""
  tensor_out = pod_layers.fully_connected(tensor, num_features)
  if mode == tf.estimator.ModeKeys.TRAIN:
    tensor_out = tf.nn.dropout(tensor_out, rate=(1 - keep_prob))
  return tf.reshape(tensor_out, [bsz, -1, 1, num_features])


def _get_convolutional_layer(pod_layers, head_type, channels, valid_step_mask,
                             tensor, invalid_value):
  """Get convolutional layer."""
  info = _NGRAM_INFO[head_type]
  pad = info["padding"]
  weight_mask = info["mask"]
  kernel_size = info["kernel_size"]
  paddings = [[0, 0], [0, pad], [0, 0], [0, 0]]
  # Padding before convolution and using 'valid' instead of 'same' padding
  # structure ensures that the convolution output is identical between
  # train/eval and inference models. It also ensures that they lineup
  # correctly with the valid_step_mask.
  tensor = tf.pad(tensor, paddings) if pad != 0 else tensor
  # Not using activation allows a bigram feature to de-emphasize a feature
  # that triggers positive for unigram for example. The output weights
  # should be allowed to be positve or negative for this to happen.
  tensor = pod_layers.convolution2d(
      tensor,
      kernel_size,
      channels,
      padding="VALID",
      weight_mask=weight_mask,
      activation=None)
  if valid_step_mask is not None:
    tensor = tensor * valid_step_mask + (1 - valid_step_mask) * invalid_value
  return tensor


def _get_predictions(pod_layers, head_type, keys, values, channels,
                     valid_step_mask):
  """Get predictions using one ngram head."""
  conv_layer = functools.partial(_get_convolutional_layer, pod_layers,
                                 head_type, channels, valid_step_mask)
  return conv_layer(keys, -100), conv_layer(values, 0)


def reduce_tensors(pod_layers, bsz, attention_logits, values):
  """Reduce information using attention."""
  channels = attention_logits.get_shape().as_list()[-1]
  attention_logits = tf.reshape(attention_logits, [bsz, -1, channels])
  values = tf.reshape(values, [bsz, -1, channels])

  with tf.variable_scope("attention_expected_value"):
    attention_logits = tf.identity(attention_logits, "attention_logits_in")
    values = tf.identity(values, "values_in")
    attention_logits = tf.transpose(attention_logits, [0, 2, 1])
    values = tf.transpose(values, [0, 2, 1])
    attention = tf.nn.softmax(attention_logits, axis=2)
    evalue = tf.reduce_sum(attention * values, axis=[2])
    evalue = tf.identity(evalue, "expected_value_out")
  return pod_layers.quantization(evalue)


def ngram_attention_args_v2(projection, seq_length, mode, num_classes,
                            model_args):
  """Implements an ngram attention network.

  Args:
    projection: Projection features from text.
    seq_length: Sequence length.
    mode: Model creation mode (train, eval or predict).
    num_classes: Number of classes to be predicted.
    model_args: A namedtuple containing all model arguments.

  Returns:
    A tensor corresponding to the logits of the graph.
  """

  pod_layers = common_layer.CommonLayers(
      mode, quantization_enabled=model_args.quantize)

  features = pod_layers.qrange_tanh(projection)
  bsz = features.get_shape().as_list()[0] or tf.shape(features)[0]

  # Regularizer just for the word embedding.
  pod_layers.set_regularizer_scale(model_args.embedding_regularizer_scale)
  values = _fully_connected(pod_layers, features, model_args.embedding_size,
                            mode, bsz, model_args.keep_prob)
  keys = _fully_connected(pod_layers, features, model_args.embedding_size, mode,
                          bsz, model_args.keep_prob)

  # Regularizer for the rest of the network.
  pod_layers.set_regularizer_scale(model_args.network_regularizer_scale)

  valid_step_mask = None
  if mode != tf.estimator.ModeKeys.PREDICT:
    valid_step_mask = pod_layers.zero_beyond_sequence_length(
        seq_length, features)
    valid_step_mask = tf.expand_dims(valid_step_mask, 3)
    # Mask out the sentence beyond valid sequence length for training graph.
    # This ensures that these values are all zeroed out. Without masking, the
    # fully connected layer before will make them take an arbitrary constant
    # value during training/eval in the minibatches. But these values won't
    # be present during inference as the inference is not batched.
    keys = valid_step_mask * keys
    values = valid_step_mask * values
    pod_layers.set_variable_length_moment_fn(seq_length, tf.shape(features)[1])

  multi_head_predictions = []
  for head_type, head in zip(model_args.head_types, model_args.heads):
    if not head:
      continue
    att_logits, att_values = _get_predictions(pod_layers, head_type, keys,
                                              values, head, valid_step_mask)
    multi_head_predictions.append(
        reduce_tensors(pod_layers, bsz, att_logits, att_values))
  multi_head_predictions = tf.concat(multi_head_predictions, axis=1)
  multi_head_predictions = pod_layers.quantization(multi_head_predictions)
  # Sequence dimension has been summed out, so we don't need special moment
  # function.
  pod_layers.set_moment_fn(None)

  output = multi_head_predictions

  # Add FC layers before the logits.
  for fc_layer_size in model_args.pre_logits_fc_layers:
    output = pod_layers.fully_connected(
        output, fc_layer_size, activation=tf.nn.relu)

  return pod_layers.fully_connected(output, num_classes, activation=None)


def create_encoder(model_config: Dict[str, Any], projection: tf.Tensor,
                   seq_length: tf.Tensor,
                   mode: tf.estimator.ModeKeys) -> Mapping[str, tf.Tensor]:
  """Implements a simple attention network for brand safety."""

  args = {}

  def _get_params(varname, default_value=None):
    value = model_config[varname] if varname in model_config else default_value
    logging.info("%s = %s", varname, value)
    args[varname] = value

  _get_params("labels")
  _get_params("quantize", True)
  _get_params("max_seq_len", 0)
  _get_params("max_seq_len_inference", 0)
  _get_params("split_on_space", True)
  _get_params("exclude_nonalphaspace_unicodes", False)
  _get_params("embedding_regularizer_scale", 35e-3)
  _get_params("embedding_size", 64)
  _get_params("heads", [0, 64, 64, 0, 0])
  _get_params("feature_size", 512)
  _get_params("network_regularizer_scale", 1e-4)
  _get_params("keep_prob", 0.5)
  _get_params("word_novelty_bits", 0)
  _get_params("doc_size_levels", 0)
  _get_params("pre_logits_fc_layers", [])
  args["head_types"] = list(range(len(args["heads"])))
  args["text_distortion_probability"] = 0.0
  if mode == tf.estimator.ModeKeys.TRAIN:
    _get_params("text_distortion_probability", 0.25)
  model_args = collections.namedtuple("ModelArgs", sorted(args))(**args)
  num_classes = len(model_args.labels)
  logits = ngram_attention_args_v2(
      projection=projection,
      seq_length=seq_length,
      mode=mode,
      num_classes=num_classes,
      model_args=model_args)
  outputs = {
      "logits":
          tf.identity(logits, "Logits"),
      "label_map":
          tf.constant(list(model_args.labels), tf.string, name="LabelMap")
  }
  return outputs