# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
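"""Tests for the SentencePiece-based XLNet tokenizer (XLNetTokenizer)."""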
from __future__ import absolute_import, division, print_function, unicode_literals

import os
import sys
import shutil
import unittest
from io import open

import pytest

if sys.version_info[0] == 2:
    import cPickle as pickle
else:
    import pickle

from pytorch_pretrained_bert.tokenization_xlnet import (XLNetTokenizer,
                                                        PRETRAINED_VOCAB_ARCHIVE_MAP,
                                                        SPIECE_UNDERLINE)

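# Path to a small SentencePiece model shipped with the test fixtures.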
SAMPLE_VOCAB = os.path.join(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
    'samples/test_sentencepiece.model')

class XLNetTokenizationTest(unittest.TestCase):

    def test_full_tokenizer(self):
        tokenizer = XLNetTokenizer(SAMPLE_VOCAB)

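        # SentencePiece marks word-initial pieces with the U+2581
        # "lower one eighth block" prefix (SPIECE_UNDERLINE).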
        tokens = tokenizer.tokenize(u'This is a test')
        self.assertListEqual(tokens, [u'▁This', u'▁is', u'▁a', u'▁t', u'est'])

        self.assertListEqual(
            tokenizer.convert_tokens_to_ids(tokens), [285, 46, 10, 170, 382])

        vocab_path = u"/tmp/"
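        # Round-trip: save the vocabulary to disk and reload it with
        # from_pretrained, keeping accents so 'é' is preserved below.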
        vocab_file, special_tokens_file = tokenizer.save_vocabulary(vocab_path)
        tokenizer = XLNetTokenizer.from_pretrained(vocab_path,
                                                   keep_accents=True)

        tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.")
        self.assertListEqual(tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b',
                                      u'or', u'n', SPIECE_UNDERLINE + u'in', SPIECE_UNDERLINE + u'',
                                      u'9', u'2', u'0', u'0', u'0', u',', SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this',
                                      SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's', u'é', u'.'])
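
        # '9' and 'é' are absent from the sample vocab, so
        # convert_tokens_to_ids maps them to id 0.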
        ids = tokenizer.convert_tokens_to_ids(tokens)
        self.assertListEqual(
            ids, [8, 21, 84, 55, 24, 19, 7, 0,
                  602, 347, 347, 347, 3, 12, 66,
                  46, 72, 80, 6, 0, 4])

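        # Converting back replaces the out-of-vocabulary ids with '<unk>'.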
        back_tokens = tokenizer.convert_ids_to_tokens(ids)
        self.assertListEqual(back_tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b',
                                           u'or', u'n', SPIECE_UNDERLINE + u'in',
                                           SPIECE_UNDERLINE + u'', u'<unk>', u'2', u'0', u'0', u'0', u',',
                                           SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this',
                                           SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's',
                                           u'<unk>', u'.'])

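        # The tokenizer must survive a pickle round-trip without changing
        # its tokenization behaviour.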
        text = "Munich and Berlin are nice cities"
        filename = u"/tmp/tokenizer.bin"

        subwords = tokenizer.tokenize(text)

        with open(filename, "wb") as f:
            pickle.dump(tokenizer, f)

        with open(filename, "rb") as f:
            tokenizer_new = pickle.load(f)
        subwords_loaded = tokenizer_new.tokenize(text)

        self.assertListEqual(subwords, subwords_loaded)

        os.remove(filename)
        os.remove(vocab_file)
        os.remove(special_tokens_file)

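    # Marked slow: from_pretrained downloads a real pretrained vocabulary.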
    @pytest.mark.slow
    def test_tokenizer_from_pretrained(self):
        cache_dir = "/tmp/pytorch_pretrained_bert_test/"
        for model_name in list(PRETRAINED_VOCAB_ARCHIVE_MAP.keys())[:1]:
            tokenizer = XLNetTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
            shutil.rmtree(cache_dir)
            self.assertIsNotNone(tokenizer)

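    # With do_lower_case=True the tokenizer lowercases and strips accents
    # ('falsé' -> 'false', 'Héllo' -> 'hello').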
    def test_tokenizer_lower(self):
        tokenizer = XLNetTokenizer(SAMPLE_VOCAB, do_lower_case=True)
        tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.")
        self.assertListEqual(tokens, [SPIECE_UNDERLINE + u'', u'i', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b',
                                      u'or', u'n', SPIECE_UNDERLINE + u'in', SPIECE_UNDERLINE + u'',
                                      u'9', u'2', u'0', u'0', u'0', u',', SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this',
                                      SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u'se', u'.'])
        self.assertListEqual(tokenizer.tokenize(u"H\u00E9llo"), [u"▁he", u"ll", u"o"])

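    # With do_lower_case=False case is preserved, but accents are still
    # stripped ('falsé' -> 'false') since keep_accents is not set here.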
    def test_tokenizer_no_lower(self):
        tokenizer = XLNetTokenizer(SAMPLE_VOCAB, do_lower_case=False)
        tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.")
        self.assertListEqual(tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b', u'or',
                                      u'n', SPIECE_UNDERLINE + u'in', SPIECE_UNDERLINE + u'',
                                      u'9', u'2', u'0', u'0', u'0', u',', SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this',
                                      SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u'se', u'.'])


if __name__ == '__main__':
    unittest.main()