# coding=utf-8
# Copyright 2018 XXX Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Tokenization class for model XXX."""


import collections
import logging
import os

from .tokenization_utils import PreTrainedTokenizer

logger = logging.getLogger(__name__)

####################################################
# In this template, replace all the XXX (various casings) with your model name
####################################################

####################################################
# Mapping from the keyword arguments names of Tokenizer `__init__`
# to file names for serializing Tokenizer instances
####################################################
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}

####################################################
# Mapping from the keyword arguments names of Tokenizer `__init__`
# to pretrained vocabulary URL for all the model shortcut names.
####################################################
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "xxx-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-base-uncased-vocab.txt",
        "xxx-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-large-uncased-vocab.txt",
    }
}

####################################################
# Mapping from model shortcut names to max length of inputs
####################################################
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "xxx-base-uncased": 512,
    "xxx-large-uncased": 512,
}

####################################################
# Mapping from model shortcut names to a dictionary of additional
# keyword arguments for Tokenizer `__init__`.
# To be used for checkpoint specific configurations.
####################################################
PRETRAINED_INIT_CONFIGURATION = {
    "xxx-base-uncased": {"do_lower_case": True},
    "xxx-large-uncased": {"do_lower_case": True},
}


def load_vocab(vocab_file):
    """Loads a vocabulary file into a dictionary."""
    vocab = collections.OrderedDict()
    with open(vocab_file, "r", encoding="utf-8") as reader:
        tokens = reader.readlines()
    for index, token in enumerate(tokens):
        token = token.rstrip("\n")
        vocab[token] = index
    return vocab


class XxxTokenizer(PreTrainedTokenizer):
    r"""
    Constructs a XxxTokenizer.
    :class:`~transformers.XxxTokenizer` runs end-to-end tokenization: punctuation splitting + wordpiece.

    Args:
        vocab_file: Path to a one-wordpiece-per-line vocabulary file
        do_lower_case: Whether to lowercase the input. Only has an effect when do_basic_tokenize=True
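
    Example (illustrative sketch; ``xxx-base-uncased`` is the placeholder shortcut name defined above)::

        tokenizer = XxxTokenizer.from_pretrained("xxx-base-uncased")
        tokens = tokenizer.tokenize("Hello world!")  # list of wordpiece strings
        ids = tokenizer.convert_tokens_to_ids(tokens)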
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    def __init__(
        self,
        vocab_file,
        do_lower_case=True,
        unk_token="[UNK]",
        sep_token="[SEP]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        mask_token="[MASK]",
        **kwargs
    ):
        """Constructs a XxxTokenizer.

        Args:
            **vocab_file**: Path to a one-wordpiece-per-line vocabulary file
            **do_lower_case**: (`optional`) boolean (default True)
                Whether to lowercase the input.
                Only has an effect when do_basic_tokenize=True.
        """
        super().__init__(
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            **kwargs,
        )

        if not os.path.isfile(vocab_file):
            raise ValueError(
                "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
                "model use `tokenizer = XxxTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)
            )
        self.vocab = load_vocab(vocab_file)
        # Reverse mapping (id -> token) used by `_convert_id_to_token` below.
        self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])

    @property
    def vocab_size(self):
        return len(self.vocab)

    def _tokenize(self, text):
        """ Take as input a string and return a list of strings (tokens) for words/sub-words
        """
        split_tokens = []
        if self.do_basic_tokenize:
            for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens):
                for sub_token in self.wordpiece_tokenizer.tokenize(token):
                    split_tokens.append(sub_token)
        else:
            split_tokens = self.wordpiece_tokenizer.tokenize(text)
        return split_tokens

    def _convert_token_to_id(self, token):
        """ Converts a token (str) in an id using the vocab. """
        return self.vocab.get(token, self.vocab.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        return self.ids_to_tokens.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """ Converts a sequence of tokens (string) in a single string. """
        out_string = " ".join(tokens).replace(" ##", "").strip()
        return out_string

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks
        by concatenating and adding special tokens.
        A BERT sequence has the following format:
            single sequence: [CLS] X [SEP]
            pair of sequences: [CLS] A [SEP] B [SEP]
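
        For example (illustrative ids, assuming ``cls_token_id == 101`` and ``sep_token_id == 102``)::

            build_inputs_with_special_tokens([7, 8])      ->  [101, 7, 8, 102]
            build_inputs_with_special_tokens([7, 8], [9]) ->  [101, 7, 8, 102, 9, 102]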
        """
        if token_ids_1 is None:
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        return cls + token_ids_0 + sep + token_ids_1 + sep

    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer ``prepare_for_model`` method.

        Args:
            token_ids_0: list of ids (must not contain special tokens)
            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
                for sequence pairs
            already_has_special_tokens: (default False) Set to True if the token list is already formatted with
                special tokens for the model

        Returns:
            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
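
        Example (illustrative)::

            get_special_tokens_mask([7, 8], [9])  ->  [1, 0, 0, 1, 0, 1]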
        """

        if already_has_special_tokens:
            if token_ids_1 is not None:
                raise ValueError(
                    "You should not supply a second sequence if the provided sequence of "
                    "ids is already formated with special tokens for the model."
                )
            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))

        if token_ids_1 is not None:
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1]

    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
        A BERT sequence pair mask has the following format:
        0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence

        If token_ids_1 is None, only the first portion of the mask (0s) is returned.
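
        Example (illustrative)::

            create_token_type_ids_from_sequences([7, 8], [9])  ->  [0, 0, 0, 0, 1, 1]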
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

    def save_vocabulary(self, vocab_path):
        """Save the tokenizer vocabulary to a directory or file."""
        index = 0
        if os.path.isdir(vocab_path):
            vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["vocab_file"])
        else:
            vocab_file = vocab_path
        with open(vocab_file, "w", encoding="utf-8") as writer:
            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    logger.warning(
                        "Saving vocabulary to {}: vocabulary indices are not consecutive."
                        " Please check that the vocabulary is not corrupted!".format(vocab_file)
                    )
                    index = token_index
                writer.write(token + "\n")
                index += 1
        return (vocab_file,)