# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for DistilBERT."""
from .tokenization_bert import BertTokenizer, BertTokenizerFast
from .utils import logging


# Module-level logger, namespaced to this module via the library's logging wrapper.
logger = logging.get_logger(__name__)
# Name of the vocabulary file bundled with every saved tokenizer.
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}

# Maps each public DistilBERT checkpoint name to the URL of its wordpiece
# vocabulary. Most checkpoints reuse a BERT vocabulary; only the German model
# ships a vocabulary of its own.
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        checkpoint: url
        for checkpoint, url in [
            ("distilbert-base-uncased", "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt"),
            ("distilbert-base-uncased-distilled-squad", "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt"),
            ("distilbert-base-cased", "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt"),
            ("distilbert-base-cased-distilled-squad", "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt"),
            ("distilbert-base-german-cased", "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-vocab.txt"),
            ("distilbert-base-multilingual-cased", "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt"),
        ]
    }
}

# Every checkpoint listed above uses 512 position embeddings, so 512 is the
# maximum model input length for each of them.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    checkpoint: 512 for checkpoint in PRETRAINED_VOCAB_FILES_MAP["vocab_file"]
}

# Tokenizer keyword arguments baked into each checkpoint: only the two
# "uncased" models lower-case their input.
PRETRAINED_INIT_CONFIGURATION = {
    "distilbert-base-uncased": {"do_lower_case": True},
    "distilbert-base-uncased-distilled-squad": {"do_lower_case": True},
    "distilbert-base-cased": {"do_lower_case": False},
    "distilbert-base-cased-distilled-squad": {"do_lower_case": False},
    "distilbert-base-german-cased": {"do_lower_case": False},
    "distilbert-base-multilingual-cased": {"do_lower_case": False},
}


class DistilBertTokenizer(BertTokenizer):
    r"""
    Constructs a DistilBertTokenizer.

    :class:`~transformers.DistilBertTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs
    end-to-end tokenization: punctuation splitting + wordpiece.

    Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
    parameters.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    model_input_names = ["attention_mask"]


class DistilBertTokenizerFast(BertTokenizerFast):
    r"""
    Constructs a "Fast" DistilBertTokenizer (backed by HuggingFace's `tokenizers` library).

    :class:`~transformers.DistilBertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs
    end-to-end tokenization: punctuation splitting + wordpiece.

    Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning
    parameters.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    model_input_names = ["attention_mask"]