# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Transformer model helper methods."""

import math

import numpy as np
import tensorflow as tf

# Very low numbers to represent -infinity. We do not actually use -Inf, since we
# want to be able to multiply these values by zero to get zero. (-Inf * 0 = NaN)
_NEG_INF_FP32 = -1e9
_NEG_INF_FP16 = np.finfo(np.float16).min
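# For example, a true -Inf turns into NaN as soon as it meets a zero mask entry,
# while the finite stand-ins stay well-behaved:
#   float("-inf") * 0.0   # -> nan
#   _NEG_INF_FP32 * 0.0   # -> -0.0, which acts as zero in the bias arithmetic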


def get_position_encoding(length,
                          hidden_size,
                          min_timescale=1.0,
                          max_timescale=1.0e4):
  """Return positional encoding.

  Calculates the position encoding as a mix of sine and cosine functions with
  geometrically increasing wavelengths.
  Defined and formulized in Attention is All You Need, section 3.5.

  Args:
    length: Sequence length.
    hidden_size: Size of the
    min_timescale: Minimum scale that will be applied at each position
    max_timescale: Maximum scale that will be applied at each position

  Returns:
    Tensor with shape [length, hidden_size]
  """
  # We compute the positional encoding in float32 even if the model uses
  # float16, as many of the ops used, like log and exp, are numerically unstable
  # in float16.
  position = tf.cast(tf.range(length), tf.float32)
  num_timescales = hidden_size // 2
  log_timescale_increment = (
      math.log(float(max_timescale) / float(min_timescale)) /
      (tf.cast(num_timescales, tf.float32) - 1))
  inv_timescales = min_timescale * tf.exp(
      tf.cast(tf.range(num_timescales), tf.float32) * -log_timescale_increment)
  scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(inv_timescales, 0)
  signal = tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1)
  return signal
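

# Usage sketch (assumes TF2 eager execution; `token_embeddings` below is a
# hypothetical [batch, 128, 512] float tensor, not defined in this module):
#   pos_encoding = get_position_encoding(length=128, hidden_size=512)
#   pos_encoding.shape                        # TensorShape([128, 512])
#   inputs = token_embeddings + pos_encoding  # broadcast add over the batch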


def get_decoder_self_attention_bias(length, dtype=tf.float32):
  """Calculate bias for decoder that maintains model's autoregressive property.

  Creates a tensor that masks out locations that correspond to illegal
  connections, so prediction at position i cannot draw information from future
  positions.

  Args:
    length: int length of sequences in batch.
    dtype: The dtype of the return value.

  Returns:
    float tensor of shape [1, 1, length, length]
  """
  neg_inf = _NEG_INF_FP16 if dtype == tf.float16 else _NEG_INF_FP32
  with tf.name_scope("decoder_self_attention_bias"):
    valid_locs = tf.linalg.band_part(
        tf.ones([length, length], dtype=dtype), -1, 0)
    valid_locs = tf.reshape(valid_locs, [1, 1, length, length])
    decoder_bias = neg_inf * (1.0 - valid_locs)
  return decoder_bias
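

# Usage sketch (assumes TF2 eager execution):
#   bias = get_decoder_self_attention_bias(length=4)
#   bias.shape   # TensorShape([1, 1, 4, 4])
# Entries above the diagonal are ~-1e9 and everything else is 0, so adding
# `bias` to the attention logits blocks each position from attending to
# positions after it.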


def get_padding(x, padding_value=0, dtype=tf.float32):
  """Return float tensor representing the padding values in x.

  Args:
    x: int tensor with any shape
    padding_value: int value that marks padded positions in the input.
    dtype: The dtype of the return value.

  Returns:
    float tensor with same shape as x containing values 0 or 1.
      0 -> non-padding, 1 -> padding
  """
  with tf.name_scope("padding"):
    return tf.cast(tf.equal(x, padding_value), dtype)
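

# Usage sketch (assumes TF2 eager execution; the id values are made up):
#   x = tf.constant([[7, 5, 0, 0]])
#   get_padding(x)   # -> [[0., 0., 1., 1.]]; the trailing zeros are padding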


def get_padding_bias(x, padding_value=0, dtype=tf.float32):
  """Calculate bias tensor from padding values in tensor.

  Bias tensor that is added to the pre-softmax multi-headed attention logits,
  which has shape [batch_size, num_heads, length, length]. The tensor is zero at
  non-padding locations, and -1e9 (negative infinity) at padding locations.

  Args:
    x: int tensor with shape [batch_size, length]
    padding_value: int value that marks padded positions in the input.
    dtype: The dtype of the return value.

  Returns:
    Attention bias tensor of shape [batch_size, 1, 1, length].
  """
  with tf.name_scope("attention_bias"):
    padding = get_padding(x, padding_value, dtype)
    attention_bias = padding * _NEG_INF_FP32
    attention_bias = tf.expand_dims(
        tf.expand_dims(attention_bias, axis=1), axis=1)
  return attention_bias
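

# Minimal end-to-end sketch of how these helpers are typically combined; the
# batch of token ids below is hypothetical and only meant as a quick smoke test
# of the output shapes.
if __name__ == "__main__":
  token_ids = tf.constant([[11, 23, 5, 0, 0],
                           [42, 7, 0, 0, 0]])  # 0 marks padding
  seq_length = token_ids.shape[1]
  pos_encoding = get_position_encoding(seq_length, hidden_size=8)
  self_attn_bias = get_decoder_self_attention_bias(seq_length)
  pad_bias = get_padding_bias(token_ids)
  print("position encoding:", pos_encoding.shape)  # (5, 8)
  print("decoder bias:", self_attn_bias.shape)     # (1, 1, 5, 5)
  print("padding bias:", pad_bias.shape)           # (2, 1, 1, 5)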