Commit f8276008 authored by thomwolf

update readme, file names, removing TF code, moving tests

parent 3c24e4be
@@ -16,17 +16,19 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import six
import unittest
import collections
import json
import random
import re
from tensorflow_code import modeling
import six
import tensorflow as tf
import torch
import modeling as modeling
class BertModelTest(tf.test.TestCase):
class BertModelTest(unittest.TestCase):
class BertModelTester(object):
def __init__(self,
@@ -68,18 +70,15 @@ class BertModelTest(tf.test.TestCase):
self.scope = scope
def create_model(self):
input_ids = BertModelTest.ids_tensor([self.batch_size, self.seq_length],
self.vocab_size)
input_ids = BertModelTest.ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
input_mask = None
if self.use_input_mask:
input_mask = BertModelTest.ids_tensor(
[self.batch_size, self.seq_length], vocab_size=2)
input_mask = BertModelTest.ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
token_type_ids = None
if self.use_token_type_ids:
token_type_ids = BertModelTest.ids_tensor(
[self.batch_size, self.seq_length], self.type_vocab_size)
token_type_ids = BertModelTest.ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
config = modeling.BertConfig(
vocab_size=self.vocab_size,
@@ -94,33 +93,23 @@ class BertModelTest(tf.test.TestCase):
type_vocab_size=self.type_vocab_size,
initializer_range=self.initializer_range)
model = modeling.BertModel(
config=config,
is_training=self.is_training,
input_ids=input_ids,
input_mask=input_mask,
token_type_ids=token_type_ids,
scope=self.scope)
model = modeling.BertModel(config=config)
all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
outputs = {
"embedding_output": model.get_embedding_output(),
"sequence_output": model.get_sequence_output(),
"pooled_output": model.get_pooled_output(),
"all_encoder_layers": model.get_all_encoder_layers(),
"sequence_output": all_encoder_layers[-1],
"pooled_output": pooled_output,
"all_encoder_layers": all_encoder_layers,
}
return outputs
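
For reference, a minimal sketch of the call pattern the PyTorch port now exercises in create_model() above. This is a sketch only, not part of the commit: it assumes modeling.py from this commit is importable, uses arbitrary example sizes, and assumes BertConfig arguments not shown keep their defaults.

import torch
import modeling

# build the model from a config; inputs are no longer constructor arguments
config = modeling.BertConfig(vocab_size=99, hidden_size=32, num_hidden_layers=5,
                             num_attention_heads=4, intermediate_size=37)
model = modeling.BertModel(config=config)

# the forward call returns the per-layer hidden states and the pooled output
input_ids = torch.randint(0, 99, (13, 7), dtype=torch.long)  # [batch, seq_length]
token_type_ids = torch.zeros_like(input_ids)
input_mask = torch.ones_like(input_ids)
all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
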
def check_output(self, result):
self.parent.assertAllEqual(
result["embedding_output"].shape,
[self.batch_size, self.seq_length, self.hidden_size])
self.parent.assertAllEqual(
result["sequence_output"].shape,
self.parent.assertListEqual(
list(result["sequence_output"].size()),
[self.batch_size, self.seq_length, self.hidden_size])
self.parent.assertAllEqual(result["pooled_output"].shape,
[self.batch_size, self.hidden_size])
self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size])
def test_default(self):
self.run_tester(BertModelTest.BertModelTester(self))
@@ -132,15 +121,11 @@ class BertModelTest(tf.test.TestCase):
self.assertEqual(obj["hidden_size"], 37)
def run_tester(self, tester):
with self.test_session() as sess:
ops = tester.create_model()
init_op = tf.group(tf.global_variables_initializer(),
tf.local_variables_initializer())
sess.run(init_op)
output_result = sess.run(ops)
tester.check_output(output_result)
output_result = tester.create_model()
tester.check_output(output_result)
self.assert_all_tensors_reachable(sess, [init_op, ops])
# TODO Find PyTorch equivalent of assert_all_tensors_reachable() if necessary
# self.assert_all_tensors_reachable(sess, [init_op, ops])
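
There is no direct PyTorch analogue of TF's graph-reachability check, since the autograd graph is built dynamically at forward time. One hypothetical substitute (an assumption, not part of this commit), given the model and pooled_output from the sketch above, is to verify that every parameter receives a gradient after a backward pass:

# hypothetical substitute, not in this commit: after backprop, any parameter
# whose .grad is still None was not reached from the loss
loss = pooled_output.sum()
loss.backward()
unreached = [name for name, p in model.named_parameters() if p.grad is None]
assert not unreached, "parameters not reached by backprop: %s" % unreached
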
@classmethod
def ids_tensor(cls, shape, vocab_size, rng=None, name=None):
@@ -156,7 +141,8 @@ class BertModelTest(tf.test.TestCase):
for _ in range(total_dims):
values.append(rng.randint(0, vocab_size - 1))
return tf.constant(value=values, dtype=tf.int32, shape=shape, name=name)
# TODO: fix - the returned tensors cause index-out-of-range errors when passed to the model
return torch.tensor(data=values, dtype=torch.int32)
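
One possible way to resolve the TODO above (an assumption, not part of the commit): PyTorch embedding lookups expect int64 (Long) indices, and the flat values list still needs to be reshaped to shape, which the dropped tf.constant(..., shape=shape) call used to do:

# hypothetical fix, not in this commit: use int64 indices (required by
# nn.Embedding) and restore the requested [batch_size, seq_length] shape
return torch.tensor(values, dtype=torch.long).view(shape)
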
def assert_all_tensors_reachable(self, sess, outputs):
"""Checks that all the tensors in the graph are reachable from outputs."""
@@ -272,4 +258,4 @@ class BertModelTest(tf.test.TestCase):
if __name__ == "__main__":
tf.test.main()
unittest.main()
@@ -20,7 +20,7 @@ import unittest
import torch
import optimization_pytorch as optimization
import optimization as optimization
class OptimizationTest(unittest.TestCase):
@@ -17,45 +17,44 @@ from __future__ import division
from __future__ import print_function
import os
import tempfile
import unittest
from tensorflow_code import tokenization
import tensorflow as tf
import tokenization as tokenization
class TokenizationTest(tf.test.TestCase):
class TokenizationTest(unittest.TestCase):
def test_full_tokenizer(self):
vocab_tokens = [
"[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
"##ing", ","
]
with tempfile.NamedTemporaryFile(delete=False) as vocab_writer:
with open("/tmp/bert_tokenizer_test.txt", "w") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
vocab_file = vocab_writer.name
tokenizer = tokenization.FullTokenizer(vocab_file)
os.unlink(vocab_file)
os.remove(vocab_file)
tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
self.assertAllEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
self.assertAllEqual(
self.assertListEqual(
tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
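
NamedTemporaryFile opens in binary mode by default, which rejects str writes under Python 3 and may be why the test switched to a hard-coded /tmp path. A hypothetical portable alternative (an assumption, not part of the commit) is to keep the temporary file but open it in text mode:

import os
import tempfile

# hypothetical portable variant, not in this commit: text-mode tempfile
with tempfile.NamedTemporaryFile(mode="w", delete=False) as vocab_writer:
    vocab_writer.write("".join(x + "\n" for x in vocab_tokens))
    vocab_file = vocab_writer.name

tokenizer = tokenization.FullTokenizer(vocab_file)
os.remove(vocab_file)
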
def test_basic_tokenizer_lower(self):
tokenizer = tokenization.BasicTokenizer(do_lower_case=True)
self.assertAllEqual(
self.assertListEqual(
tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "),
["hello", "!", "how", "are", "you", "?"])
self.assertAllEqual(tokenizer.tokenize(u"H\u00E9llo"), ["hello"])
self.assertListEqual(tokenizer.tokenize(u"H\u00E9llo"), ["hello"])
def test_basic_tokenizer_no_lower(self):
tokenizer = tokenization.BasicTokenizer(do_lower_case=False)
self.assertAllEqual(
self.assertListEqual(
tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "),
["HeLLo", "!", "how", "Are", "yoU", "?"])
@@ -70,13 +69,13 @@ class TokenizationTest(tf.test.TestCase):
vocab[token] = i
tokenizer = tokenization.WordpieceTokenizer(vocab=vocab)
self.assertAllEqual(tokenizer.tokenize(""), [])
self.assertListEqual(tokenizer.tokenize(""), [])
self.assertAllEqual(
self.assertListEqual(
tokenizer.tokenize("unwanted running"),
["un", "##want", "##ed", "runn", "##ing"])
self.assertAllEqual(
self.assertListEqual(
tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"])
def test_convert_tokens_to_ids(self):
@@ -89,7 +88,7 @@ class TokenizationTest(tf.test.TestCase):
for (i, token) in enumerate(vocab_tokens):
vocab[token] = i
self.assertAllEqual(
self.assertListEqual(
tokenization.convert_tokens_to_ids(
vocab, ["un", "##want", "##ed", "runn", "##ing"]), [7, 4, 5, 8, 9])
@@ -121,5 +120,5 @@ class TokenizationTest(tf.test.TestCase):
self.assertFalse(tokenization._is_punctuation(u" "))
if __name__ == "__main__":
tf.test.main()
if __name__ == '__main__':
unittest.main()
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import unittest
import tokenization_pytorch as tokenization
class TokenizationTest(unittest.TestCase):
def test_full_tokenizer(self):
vocab_tokens = [
"[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
"##ing", ","
]
with open("/tmp/bert_tokenizer_test.txt", "w") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
vocab_file = vocab_writer.name
tokenizer = tokenization.FullTokenizer(vocab_file)
os.remove(vocab_file)
tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
self.assertListEqual(
tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
def test_basic_tokenizer_lower(self):
tokenizer = tokenization.BasicTokenizer(do_lower_case=True)
self.assertListEqual(
tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "),
["hello", "!", "how", "are", "you", "?"])
self.assertListEqual(tokenizer.tokenize(u"H\u00E9llo"), ["hello"])
def test_basic_tokenizer_no_lower(self):
tokenizer = tokenization.BasicTokenizer(do_lower_case=False)
self.assertListEqual(
tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "),
["HeLLo", "!", "how", "Are", "yoU", "?"])
def test_wordpiece_tokenizer(self):
vocab_tokens = [
"[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
"##ing"
]
vocab = {}
for (i, token) in enumerate(vocab_tokens):
vocab[token] = i
tokenizer = tokenization.WordpieceTokenizer(vocab=vocab)
self.assertListEqual(tokenizer.tokenize(""), [])
self.assertListEqual(
tokenizer.tokenize("unwanted running"),
["un", "##want", "##ed", "runn", "##ing"])
self.assertListEqual(
tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"])
def test_convert_tokens_to_ids(self):
vocab_tokens = [
"[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
"##ing"
]
vocab = {}
for (i, token) in enumerate(vocab_tokens):
vocab[token] = i
self.assertListEqual(
tokenization.convert_tokens_to_ids(
vocab, ["un", "##want", "##ed", "runn", "##ing"]), [7, 4, 5, 8, 9])
def test_is_whitespace(self):
self.assertTrue(tokenization._is_whitespace(u" "))
self.assertTrue(tokenization._is_whitespace(u"\t"))
self.assertTrue(tokenization._is_whitespace(u"\r"))
self.assertTrue(tokenization._is_whitespace(u"\n"))
self.assertTrue(tokenization._is_whitespace(u"\u00A0"))
self.assertFalse(tokenization._is_whitespace(u"A"))
self.assertFalse(tokenization._is_whitespace(u"-"))
def test_is_control(self):
self.assertTrue(tokenization._is_control(u"\u0005"))
self.assertFalse(tokenization._is_control(u"A"))
self.assertFalse(tokenization._is_control(u" "))
self.assertFalse(tokenization._is_control(u"\t"))
self.assertFalse(tokenization._is_control(u"\r"))
def test_is_punctuation(self):
self.assertTrue(tokenization._is_punctuation(u"-"))
self.assertTrue(tokenization._is_punctuation(u"$"))
self.assertTrue(tokenization._is_punctuation(u"`"))
self.assertTrue(tokenization._is_punctuation(u"."))
self.assertFalse(tokenization._is_punctuation(u"A"))
self.assertFalse(tokenization._is_punctuation(u" "))
if __name__ == '__main__':
unittest.main()