# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for DistilBERT."""

from __future__ import absolute_import, division, print_function, unicode_literals

import collections
import logging
import os
import unicodedata
from io import open

from .tokenization_bert import BertTokenizer

logger = logging.getLogger(__name__)

# Resource name -> on-disk filename for the tokenizer's vocabulary file.
VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'}

# Remote vocabulary files for each pretrained DistilBERT checkpoint.
# The URLs point at BERT vocabularies: DistilBERT shares its tokenizer
# vocabulary with the BERT models it was distilled from.
PRETRAINED_VOCAB_FILES_MAP = {
    'vocab_file':
    {
        'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
        'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
    }
}

# Maximum input length (number of positional embeddings) accepted by each
# pretrained checkpoint; used as the tokenizer's effective max_len cap.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    'distilbert-base-uncased': 512,
    'distilbert-base-uncased-distilled-squad': 512,
}


class DistilBertTokenizer(BertTokenizer):
    r"""
    Constructs a DistilBertTokenizer.

    :class:`~pytorch_transformers.DistilBertTokenizer` is identical to
    :class:`~pytorch_transformers.BertTokenizer` and runs end-to-end
    tokenization: punctuation splitting + wordpiece. It only overrides the
    class attributes that map pretrained DistilBERT checkpoint names to
    their vocabulary files and maximum input sizes.

    Args:
        vocab_file: Path to a one-wordpiece-per-line vocabulary file
        do_lower_case: Whether to lower case the input. Only has an effect when do_wordpiece_only=False
        do_basic_tokenize: Whether to do basic tokenization before wordpiece.
        max_len: An artificial maximum length to truncate tokenized sequences to; Effective maximum length is always the
            minimum of this value (if specified) and the underlying BERT model's sequence length.
        never_split: List of tokens which will never be split during tokenization. Only has an effect when
            do_wordpiece_only=False
    """

    # Checkpoint-lookup tables consumed by the base class's
    # ``from_pretrained`` machinery; all tokenization logic is inherited.
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES