xlnet passing first test

16b63617 · thomwolf · 32aabe8c · 16b63617 · 16b63617 · 16b63617
Commit 16b63617 authored Sep 10, 2019 by thomwolf
4 changed files
--- a/pytorch_transformers/modeling_tf_bert.py
+++ b/pytorch_transformers/modeling_tf_bert.py
@@ -218,8 +218,7 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
        embeddings = words_embeddings + position_embeddings + token_type_embeddings
        embeddings = self.LayerNorm(embeddings)
-        if training:
+        embeddings = self.dropout(embeddings, training=training)
-            embeddings = self.dropout(embeddings)
        return embeddings
    def _linear(self, inputs):
@@ -286,10 +285,9 @@ class TFBertSelfAttention(tf.keras.layers.Layer):
        # Normalize the attention scores to probabilities.
        attention_probs = tf.nn.softmax(attention_scores, axis=-1)
-        if training:
        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
-            attention_probs = self.dropout(attention_probs)
+        attention_probs = self.dropout(attention_probs, training=training)
        # Mask heads if we want to
        if head_mask is not None:
@@ -316,8 +314,7 @@ class TFBertSelfOutput(tf.keras.layers.Layer):
        hidden_states, input_tensor = inputs
        hidden_states = self.dense(hidden_states)
-        if training:
+        hidden_states = self.dropout(hidden_states, training=training)
-            hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states
@@ -366,8 +363,7 @@ class TFBertOutput(tf.keras.layers.Layer):
        hidden_states, input_tensor = inputs
        hidden_states = self.dense(hidden_states)
-        if training:
+        hidden_states = self.dropout(hidden_states, training=training)
-            hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states
@@ -871,8 +867,7 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel):
        pooled_output = outputs[1]
-        if training:
+        pooled_output = self.dropout(pooled_output, training=training)
-            pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
@@ -947,8 +942,7 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel):
        pooled_output = outputs[1]
-        if training:
+        pooled_output = self.dropout(pooled_output, training=training)
-            pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        reshaped_logits = tf.reshape(logits, (-1, num_choices))
@@ -995,8 +989,7 @@ class TFBertForTokenClassification(TFBertPreTrainedModel):
        sequence_output = outputs[0]
-        if training:
+        sequence_output = self.dropout(sequence_output, training=training)
-            sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)
        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here

--- a/pytorch_transformers/modeling_tf_gpt2.py
+++ b/pytorch_transformers/modeling_tf_gpt2.py
@@ -178,8 +178,7 @@ class TFAttention(tf.keras.layers.Layer):
            w = w + attention_mask
        w = tf.nn.softmax(w, axis=-1)
-        if training:
+        w = self.attn_dropout(w, training=training)
-            w = self.attn_dropout(w)
        # Mask heads if we want to
        if head_mask is not None:
@@ -221,8 +220,7 @@ class TFAttention(tf.keras.layers.Layer):
        a = self.merge_heads(a)
        a = self.c_proj(a)
-        if training:
+        a = self.resid_dropout(a, training=training)
-            a = self.resid_dropout(a)
        outputs = [a, present] + attn_outputs[1:]
        return outputs  # a, present, (attentions)
@@ -240,8 +238,7 @@ class TFMLP(tf.keras.layers.Layer):
    def call(self, x, training=False):
        h = self.act(self.c_fc(x))
        h2 = self.c_proj(h)
-        if training:
+        h2 = self.dropout(h2, training=training)
-            h2 = self.dropout(h2)
        return h2
@@ -368,8 +365,7 @@ class TFGPT2MainLayer(tf.keras.layers.Layer):
        else:
            token_type_embeds = 0
        hidden_states = inputs_embeds + position_embeds + token_type_embeds
-        if training:
+        hidden_states = self.drop(hidden_states, training=training)
-            hidden_states = self.drop(hidden_states)
        output_shape = input_shape + [shape_list(hidden_states)[-1]]

--- a/pytorch_transformers/modeling_tf_xlnet.py
+++ b/pytorch_transformers/modeling_tf_xlnet.py
@@ -145,7 +145,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
        self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='layer_norm')
        self.dropout = tf.keras.layers.Dropout(config.dropout)
-    def build(input_shape):
+    def build(self, input_shape):
        initializer = tf.random_normal_initializer(mean=0., stddev=self.initializer_range)
        self.q = self.add_weight(shape=(self.d_model, self.n_head, self.d_head),
                                 initializer=initializer,
@@ -221,10 +221,9 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
                attn_score = attn_score - 1e30 * attn_mask
        # attention probability
-        attn_prob = tf.softmax(attn_score, axis=1)
+        attn_prob = tf.nn.softmax(attn_score, axis=1)
-        if training:
+        attn_prob = self.dropout(attn_prob, training=training)
-            attn_prob = self.dropout(attn_prob)
        # Mask heads if we want to
        if head_mask is not None:
@@ -245,10 +244,9 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
        attn_out = tf.einsum('ibnd,hnd->ibh', attn_vec, self.o)
-        if training:
+        attn_out = self.dropout(attn_out, training=training)
-            attn_out = self.dropout(attn_out)
-        if residual:
+        if residual is not None:
            attn_out = attn_out + h
        output = self.layer_norm(attn_out)
@@ -288,7 +286,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
                attn_vec_h, attn_prob_h = attn_vec_h
            # post processing
-            output_h = self.post_attention([h, attn_vec_h], training=training)
+            output_h = self.post_attention([h, attn_vec_h, None], training=training)
            ##### g-stream
            # query-stream query head
@@ -314,7 +312,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
                    attn_vec_g, attn_prob_g = attn_vec_g
            # post processing
-            output_g = self.post_attention([g, attn_vec_g], training=training)
+            output_g = self.post_attention([g, attn_vec_g, None], training=training)
            if self.output_attentions:
                attn_prob = attn_prob_h, attn_prob_g
@@ -343,7 +341,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
                attn_vec, attn_prob = attn_vec
            # post processing
-            output_h = self.post_attention([h, attn_vec], training=training)
+            output_h = self.post_attention([h, attn_vec, None], training=training)
            output_g = None
        outputs = (output_h, output_g)
@@ -368,11 +366,9 @@ class TFXLNetFeedForward(tf.keras.layers.Layer):
        output = inp
        output = self.layer_1(output)
        output = self.activation_function(output)
-        if training:
+        output = self.dropout(output, training=training)
-            output = self.dropout(output)
        output = self.layer_2(output)
-        if training:
+        output = self.dropout(output, training=training)
-            output = self.dropout(output)
        output = self.layer_norm(output + inp)
        return output
@@ -413,12 +409,12 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
        self.initializer_range = config.initializer_range
        self.word_embedding = TFSharedEmbeddings(config.n_token, config.d_model, initializer_range=config.initializer_range, name='word_embedding')
-        self.layer = [XLNetLayer(config, name='layer_{}'.format(i)) for i in range(config.n_layer)]
+        self.layer = [TFXLNetLayer(config, name='layer_{}'.format(i)) for i in range(config.n_layer)]
        self.dropout = tf.keras.layers.Dropout(config.dropout)
-    def build(input_shape):
+    def build(self, input_shape):
        initializer = tf.random_normal_initializer(mean=0., stddev=self.initializer_range)
-        self.mask_emb = self.add_weight(shape=(1, 1, config.d_model),
+        self.mask_emb = self.add_weight(shape=(1, 1, self.d_model),
                                 initializer=initializer,
                                 trainable=True, name='mask_emb')
@@ -532,16 +528,39 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
        return pos_emb
    def call(self, inputs, training=False):
-        (input_ids, attention_mask, mems, perm_mask, target_mapping,
+        if not isinstance(inputs, (dict, tuple, list)):
-         token_type_ids, input_mask, head_mask) = inputs
+            input_ids = inputs
+            (attention_mask, mems, perm_mask, target_mapping,
+            token_type_ids, input_mask, head_mask) = None, None, None, None, None, None, None
+        elif isinstance(inputs, (tuple, list)):
+            input_ids = inputs[0]
+            attention_mask = inputs[1] if len(inputs) > 1 else None
+            mems = inputs[2] if len(inputs) > 2 else None
+            perm_mask = inputs[3] if len(inputs) > 3 else None
+            target_mapping = inputs[4] if len(inputs) > 4 else None
+            token_type_ids = inputs[5] if len(inputs) > 5 else None
+            input_mask = inputs[6] if len(inputs) > 6 else None
+            head_mask = inputs[7] if len(inputs) > 7 else None
+            assert len(inputs) <= 8, "Too many inputs."
+        else:
+            input_ids = inputs.get('input_ids')
+            attention_mask = inputs.get('attention_mask', None)
+            mems = inputs.get('mems', None)
+            perm_mask = inputs.get('perm_mask', None)
+            target_mapping = inputs.get('target_mapping', None)
+            token_type_ids = inputs.get('token_type_ids', None)
+            input_mask = inputs.get('input_mask', None)
+            head_mask = inputs.get('head_mask', None)
+            assert len(inputs) <= 8, "Too many inputs."
        # the original code for XLNet uses shapes [len, bsz] with the batch dimension at the end
        # but we want a unified interface in the library with the batch size on the first dimension
        # so we move here the first dimension (batch) to the end
-        input_ids = tf.transpose(input_ids, perm=(0, 1))
+        input_ids = tf.transpose(input_ids, perm=(1, 0))
-        token_type_ids = tf.transpose(token_type_ids, perm=(0, 1)) if token_type_ids is not None else None
+        token_type_ids = tf.transpose(token_type_ids, perm=(1, 0)) if token_type_ids is not None else None
-        input_mask = tf.transpose(input_mask, perm=(0, 1)) if input_mask is not None else None
+        input_mask = tf.transpose(input_mask, perm=(1, 0)) if input_mask is not None else None
-        attention_mask = tf.transpose(attention_mask, perm=(0, 1)) if attention_mask is not None else None
+        attention_mask = tf.transpose(attention_mask, perm=(1, 0)) if attention_mask is not None else None
        perm_mask = tf.transpose(perm_mask, perm=(1, 2, 0)) if perm_mask is not None else None
        target_mapping = tf.transpose(target_mapping, perm=(1, 2, 0)) if target_mapping is not None else None
@@ -597,15 +616,13 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
        ##### Word embeddings and prepare h & g hidden states
        word_emb_k = self.word_embedding(input_ids)
-        if training:
+        output_h = self.dropout(word_emb_k, training=training)
-            output_h = self.dropout(word_emb_k)
        if target_mapping is not None:
            word_emb_q = tf.tile(mask_emb, [tf.shape(target_mapping)[0], bsz, 1])
        # else:  # We removed the inp_q input which was same as target mapping
        #     inp_q_ext = inp_q[:, :, None]
        #     word_emb_q = inp_q_ext * self.mask_emb + (1 - inp_q_ext) * word_emb_k
-            if training:
+            output_g = self.dropout(word_emb_q, training=training)
-                output_g = self.dropout(word_emb_q)
        else:
            output_g = None
@@ -625,8 +642,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
        ##### Positional encoding
        pos_emb = self.relative_positional_encoding(qlen, klen, bsz=bsz, dtype=dtype_float)
-        if training:
+        pos_emb = self.dropout(pos_emb, training=training)
-            pos_emb = self.dropout(pos_emb)
        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
@@ -666,8 +682,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
        if self.output_hidden_states:
            hidden_states.append((output_h, output_g) if output_g is not None else output_h)
-        if training:
+        output = self.dropout(output_g if output_g is not None else output_h, training=training)
-            output = self.dropout(output_g if output_g is not None else output_h)
        # Prepare outputs, we transpose back here to shape [bsz, len, hidden_dim] (cf. beginning of forward() method)
        outputs = (tf.transpose(output, perm=(1, 0, 2)), new_mems)
@@ -805,7 +820,7 @@ class TFXLNetModel(TFXLNetPreTrainedModel):
    """
    def __init__(self, config, *inputs, **kwargs):
        super(TFXLNetModel, self).__init__(config, *inputs, **kwargs)
-        self.transformer = TFBertMainLayer(config, name='transformer')
+        self.transformer = TFXLNetMainLayer(config, name='transformer')
    def call(self, inputs, training=False):
        outputs = self.transformer(inputs, training=training)

--- a/pytorch_transformers/tests/modeling_tf_xlnet_test.py
+++ b/pytorch_transformers/tests/modeling_tf_xlnet_test.py
@@ -105,8 +105,8 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
            perm_mask_last = tf.ones((self.batch_size, self.seq_length + 1, 1), dtype=tf.float32)
            perm_mask = tf.concat([perm_mask, perm_mask_last], axis=-1)
            # perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token
-            target_mapping = tf.zeros((self.batch_size, 1, self.seq_length), dtype=torch.float32)
+            target_mapping = tf.zeros((self.batch_size, 1, self.seq_length), dtype=tf.float32)
-            target_mapping_last = tf.ones((self.batch_size, 1, 1), dtype=torch.float32)
+            target_mapping_last = tf.ones((self.batch_size, 1, 1), dtype=tf.float32)
            target_mapping = tf.concat([target_mapping, target_mapping_last], axis=-1)
            # target_mapping[:, 0, -1] = 1.0  # predict last token
@@ -145,18 +145,18 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
                target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels):
            model = TFXLNetModel(config)
-            inputs = {'input_ids': input_ids,
+            inputs = {'input_ids': input_ids_1,
                      'input_mask': input_mask,
-                      'token_type_ids': token_type_ids}
+                      'token_type_ids': segment_ids}
            _, _ = model(inputs)
-            inputs = [input_ids, input_mask]
+            inputs = [input_ids_1, input_mask]
            outputs, mems_1 = model(inputs)
            result = {
-                "mems_1": [mem.numpy() for m in mems_1],
+                "mems_1": [mem.numpy() for mem in mems_1],
                "outputs": outputs.numpy(),
            }