Unverified Commit ba8c4d0a authored by Thomas Wolf, committed by GitHub

[Dependencies|tokenizers] Make both SentencePiece and Tokenizers optional dependencies (#7659)

* splitting fast and slow tokenizers [WIP]

* [WIP] splitting sentencepiece and tokenizers dependencies

* update dummy objects

* add name_or_path to models and tokenizers

* prefix added to file names

* prefix

* styling + quality

* splitting all the tokenizer files - sorting sentencepiece-based ones

* update tokenizers version up to 0.9.0

* remove hard dependency on sentencepiece 🎉

* and removed hard dependency on tokenizers 🎉



* update conversion script

* update missing models

* fixing tests

* move test_tokenization_fast to main tokenization tests - fix bugs

* bump up tokenizers

* fix bert_generation

* update and fix several tokenizers

* keep sentencepiece in deps for now

* fix funnel and deberta tests

* fix fsmt

* fix marian tests

* fix layoutlm

* fix squeezebert and gpt2

* fix T5 tokenization

* fix xlnet tests

* style

* fix mbart

* bump up tokenizers to 0.9.2

* fix model tests

* fix tf models

* fix seq2seq examples

* fix tests without sentencepiece

* fix slow => fast conversion without sentencepiece

* update auto and bert generation tests

* fix mbart tests

* fix auto and common test without tokenizers

* fix tests without tokenizers

* clean up and lighten tests when tokenizers + sentencepiece are both off

* style, quality and test fixes

* add sentencepiece to doc/examples reqs

* leave sentencepiece on for now

* style and quality, split herbert, and fix pegasus

* WIP Herbert fast

* add sample_text_no_unicode and fix herbert tokenization

* skip FSMT example test for now

* fix style

* fix fsmt in example tests

* update following Lysandre and Sylvain's comments

* Update src/transformers/testing_utils.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/testing_utils.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/tokenization_utils_base.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/tokenization_utils_base.py
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
parent c65863ce
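
The thrust of this PR is that both tokenizers and sentencepiece become optional extras. Below is a minimal, hedged sketch of how downstream code might guard on them; it assumes the availability helpers is_tokenizers_available and is_sentencepiece_available (referenced by the dummy-object commits above) live in transformers.file_utils.

from transformers.file_utils import is_sentencepiece_available, is_tokenizers_available

if is_tokenizers_available():
    # Rust-backed fast tokenizer: only needs the `tokenizers` package.
    from transformers import GPT2TokenizerFast as Gpt2Tok
else:
    # Pure-Python slow tokenizer: no extra dependency needed for GPT-2.
    from transformers import GPT2Tokenizer as Gpt2Tok

tokenizer = Gpt2Tok.from_pretrained("gpt2")

if not is_sentencepiece_available():
    # Slow SentencePiece-based tokenizers (T5, XLNet, Marian, ...) are unavailable;
    # their fast counterparts still work through `tokenizers` alone.
    print("sentencepiece is not installed: slow SentencePiece tokenizers cannot be used")
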
@@ -19,12 +19,11 @@ import json
import os
import warnings
from functools import lru_cache
from typing import Optional, Tuple
import regex as re
from .tokenization_utils import AddedToken, PreTrainedTokenizer
from .tokenization_utils_base import BatchEncoding
from .tokenization_utils_fast import PreTrainedTokenizerFast
from .utils import logging
@@ -256,22 +255,16 @@ class GPT2Tokenizer(PreTrainedTokenizer):
text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
return text
def save_vocabulary(self, save_directory):
"""
Save the vocabulary and special tokens file to a directory.
Args:
save_directory (:obj:`str`):
The directory in which to save the vocabulary.
Returns:
:obj:`Tuple(str)`: Paths to the files saved.
"""
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not os.path.isdir(save_directory):
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
return
vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"])
merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES["merges_file"])
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
merge_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
)
with open(vocab_file, "w", encoding="utf-8") as f:
f.write(json.dumps(self.encoder, ensure_ascii=False))
@@ -303,114 +296,3 @@ class GPT2Tokenizer(PreTrainedTokenizer):
if is_split_into_words or add_prefix_space:
text = " " + text
return (text, kwargs)
class GPT2TokenizerFast(PreTrainedTokenizerFast):
"""
Construct a "fast" GPT-2 tokenizer (backed by HuggingFace's `tokenizers` library). Based on byte-level
Byte-Pair-Encoding.
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
be encoded differently whether it is at the beginning of the sentence (without space) or not:
::
>>> from transformers import GPT2TokenizerFast
>>> tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
>>> tokenizer("Hello world")['input_ids']
[15496, 995]
>>> tokenizer(" Hello world")['input_ids']
[18435, 995]
You can get around that behavior by passing ``add_prefix_space=True`` when instantiating this tokenizer or when you
call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
.. note::
When used with ``is_split_into_words=True``, this tokenizer needs to be instantiated with
``add_prefix_space=True``.
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
methods. Users should refer to this superclass for more information regarding those methods.
Args:
vocab_file (:obj:`str`):
Path to the vocabulary file.
merges_file (:obj:`str`):
Path to the merges file.
errors (:obj:`str`, `optional`, defaults to :obj:`"replace"`):
Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode
<https://docs.python.org/3/library/stdtypes.html#bytes.decode>`__ for more information.
unk_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
bos_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
The beginning of sequence token.
eos_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
The end of sequence token.
add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to add an initial space to the input. This allows treating the leading word just like any
other word. (The GPT-2 tokenizer detects the beginning of words by the preceding space.)
trim_offsets (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not the post-processing step should trim offsets to avoid including whitespaces.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["attention_mask"]
slow_tokenizer_class = GPT2Tokenizer
def __init__(
self,
vocab_file,
merges_file,
unk_token="<|endoftext|>",
bos_token="<|endoftext|>",
eos_token="<|endoftext|>",
add_prefix_space=False,
**kwargs
):
super().__init__(
vocab_file,
merges_file,
unk_token=unk_token,
bos_token=bos_token,
eos_token=eos_token,
add_prefix_space=add_prefix_space,
**kwargs,
)
self.add_prefix_space = add_prefix_space
def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
if "is_pretokenized" in kwargs:
warnings.warn(
"`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.",
FutureWarning,
)
is_split_into_words = kwargs.pop("is_pretokenized")
is_split_into_words = kwargs.get("is_split_into_words", False)
assert self.add_prefix_space or not is_split_into_words, (
f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
"to use it with pretokenized inputs."
)
return super()._batch_encode_plus(*args, **kwargs)
def _encode_plus(self, *args, **kwargs) -> BatchEncoding:
if "is_pretokenized" in kwargs:
warnings.warn(
"`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.",
FutureWarning,
)
is_split_into_words = kwargs.pop("is_pretokenized")
else:
is_split_into_words = kwargs.get("is_split_into_words", False)
assert self.add_prefix_space or not is_split_into_words, (
f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
"to use it with pretokenized inputs."
)
return super()._encode_plus(*args, **kwargs)
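
A minimal usage sketch of the behaviour described by the GPT2TokenizerFast docstring and asserts above; the "gpt2" checkpoint name and the commented ids follow the doctest above, and the exact values depend on the downloaded vocabulary.

from transformers import GPT2TokenizerFast

tok = GPT2TokenizerFast.from_pretrained("gpt2")
# A leading space changes how the first word is encoded (see the doctest above).
print(tok("Hello world")["input_ids"])    # e.g. [15496, 995]
print(tok(" Hello world")["input_ids"])   # e.g. [18435, 995]

# Pretokenized input must be paired with add_prefix_space=True,
# otherwise the assert in _batch_encode_plus above is triggered.
tok_ws = GPT2TokenizerFast.from_pretrained("gpt2", add_prefix_space=True)
print(tok_ws(["Hello", "world"], is_split_into_words=True)["input_ids"])
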
# coding=utf-8
# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for OpenAI GPT."""
import json
import warnings
from typing import Optional, Tuple
from tokenizers import pre_tokenizers
from .tokenization_gpt2 import GPT2Tokenizer
from .tokenization_utils_base import BatchEncoding
from .tokenization_utils_fast import PreTrainedTokenizerFast
from .utils import logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json",
"gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-vocab.json",
"gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-vocab.json",
"gpt2-xl": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-vocab.json",
"distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-vocab.json",
},
"merges_file": {
"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt",
"gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-merges.txt",
"gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-merges.txt",
"gpt2-xl": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-merges.txt",
"distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-merges.txt",
},
"tokenizer_file": {
"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-tokenizer.json",
"gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-tokenizer.json",
"gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-tokenizer.json",
"gpt2-xl": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-tokenizer.json",
"distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-tokenizer.json",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"gpt2": 1024,
"gpt2-medium": 1024,
"gpt2-large": 1024,
"gpt2-xl": 1024,
"distilgpt2": 1024,
}
class GPT2TokenizerFast(PreTrainedTokenizerFast):
"""
Construct a "fast" GPT-2 tokenizer (backed by HuggingFace's `tokenizers` library). Based on byte-level
Byte-Pair-Encoding.
This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
be encoded differently whether it is at the beginning of the sentence (without space) or not:
::
>>> from transformers import GPT2TokenizerFast
>>> tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
>>> tokenizer("Hello world")['input_ids']
[15496, 995]
>>> tokenizer(" Hello world")['input_ids']
[18435, 995]
You can get around that behavior by passing ``add_prefix_space=True`` when instantiating this tokenizer or when you
call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
.. note::
When used with ``is_split_into_words=True``, this tokenizer needs to be instantiated with
``add_prefix_space=True``.
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
methods. Users should refer to this superclass for more information regarding those methods.
Args:
vocab_file (:obj:`str`):
Path to the vocabulary file.
merges_file (:obj:`str`):
Path to the merges file.
errors (:obj:`str`, `optional`, defaults to :obj:`"replace"`):
Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode
<https://docs.python.org/3/library/stdtypes.html#bytes.decode>`__ for more information.
unk_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
bos_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
The beginning of sequence token.
eos_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
The end of sequence token.
add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to add an initial space to the input. This allows treating the leading word just like any
other word. (The GPT-2 tokenizer detects the beginning of words by the preceding space.)
trim_offsets (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not the post-processing step should trim offsets to avoid including whitespaces.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["attention_mask"]
slow_tokenizer_class = GPT2Tokenizer
def __init__(
self,
vocab_file,
merges_file,
tokenizer_file=None,
unk_token="<|endoftext|>",
bos_token="<|endoftext|>",
eos_token="<|endoftext|>",
add_prefix_space=False,
**kwargs
):
super().__init__(
vocab_file,
merges_file,
tokenizer_file=tokenizer_file,
unk_token=unk_token,
bos_token=bos_token,
eos_token=eos_token,
add_prefix_space=add_prefix_space,
**kwargs,
)
pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type"))
pre_tok_state["add_prefix_space"] = add_prefix_space
self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)
self.add_prefix_space = add_prefix_space
def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
if "is_pretokenized" in kwargs:
warnings.warn(
"`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.",
FutureWarning,
)
is_split_into_words = kwargs.pop("is_pretokenized")
is_split_into_words = kwargs.get("is_split_into_words", False)
assert self.add_prefix_space or not is_split_into_words, (
f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
"to use it with pretokenized inputs."
)
return super()._batch_encode_plus(*args, **kwargs)
def _encode_plus(self, *args, **kwargs) -> BatchEncoding:
if "is_pretokenized" in kwargs:
warnings.warn(
"`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.",
FutureWarning,
)
is_split_into_words = kwargs.pop("is_pretokenized")
else:
is_split_into_words = kwargs.get("is_split_into_words", False)
assert self.add_prefix_space or not is_split_into_words, (
f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
"to use it with pretokenized inputs."
)
return super()._encode_plus(*args, **kwargs)
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
files = self._tokenizer.model.save(save_directory, name=filename_prefix)
return tuple(files)
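
The change repeated across the files in this diff is the new filename_prefix argument of save_vocabulary. A hedged sketch of what it yields; the exact file names and the returned tuple depend on the tokenizer class.

import tempfile
from transformers import GPT2Tokenizer

tok = GPT2Tokenizer.from_pretrained("gpt2")
with tempfile.TemporaryDirectory() as tmp:
    # Without a prefix the files are vocab.json / merges.txt;
    # with one they become "<prefix>-vocab.json" / "<prefix>-merges.txt".
    files = tok.save_vocabulary(tmp, filename_prefix="run1")
    print(files)  # e.g. ('<tmp>/run1-vocab.json', '<tmp>/run1-merges.txt')
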
@@ -13,10 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List, Optional
from .tokenization_bert import BasicTokenizer
from .tokenization_utils_fast import PreTrainedTokenizerFast
from .tokenization_xlm import XLMTokenizer
from .utils import logging
@@ -28,6 +25,14 @@ VOCAB_FILES_NAMES = {
"merges_file": "merges.txt",
}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {"allegro/herbert-base-cased": "https://cdn.huggingface.co/allegro/herbert-base-cased/vocab.json"},
"merges_file": {"allegro/herbert-base-cased": "https://cdn.huggingface.co/allegro/herbert-base-cased/merges.txt"},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"allegro/herbert-base-cased": 514}
PRETRAINED_INIT_CONFIGURATION = {}
class HerbertTokenizer(XLMTokenizer):
"""
@@ -45,6 +50,9 @@ class HerbertTokenizer(XLMTokenizer):
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
def __init__(self, **kwargs):
@@ -71,127 +79,3 @@ class HerbertTokenizer(XLMTokenizer):
split_tokens.extend([t for t in self.bpe(token).split(" ")])
return split_tokens
class HerbertTokenizerFast(PreTrainedTokenizerFast):
"""
Construct a "Fast" BPE tokenizer for HerBERT (backed by HuggingFace's `tokenizers` library).
Peculiarities:
- uses BERT's pre-tokenizer: BertPreTokenizer splits tokens on spaces, and also on punctuation.
Each occurrence of a punctuation character will be treated separately.
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the methods. Users
should refer to the superclass for more information regarding methods.
Args:
vocab_file (:obj:`str`):
Path to the vocabulary file.
merges_file (:obj:`str`):
Path to the merges file.
"""
vocab_files_names = VOCAB_FILES_NAMES
slow_tokenizer_class = HerbertTokenizer
def __init__(self, vocab_file, merges_file, **kwargs):
kwargs["cls_token"] = "<s>"
kwargs["unk_token"] = "<unk>"
kwargs["pad_token"] = "<pad>"
kwargs["mask_token"] = "<mask>"
kwargs["sep_token"] = "</s>"
super().__init__(
vocab_file,
merges_file,
**kwargs,
)
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequences for sequence classification tasks
by concatenating and adding special tokens.
A HerBERT sequence, like a BERT sequence, has the following format:
- single sequence: ``<s> X </s>``
- pair of sequences: ``<s> A </s> B </s>``
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
"""
cls = [self.cls_token_id]
sep = [self.sep_token_id]
if token_ids_1 is None:
return cls + token_ids_0 + sep
return cls + token_ids_0 + sep + token_ids_1 + sep
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` method.
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
if token_ids_1 is not None:
raise ValueError(
"You should not supply a second sequence if the provided sequence of "
"ids is already formated with special tokens for the model."
)
return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
if token_ids_1 is None:
return [1] + ([0] * len(token_ids_0)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task.
A HerBERT sequence pair mask, like a BERT one, has the following format:
::
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
sequence(s).
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
# coding=utf-8
# Copyright 2020 The Google AI Language Team Authors, Allegro.pl, Facebook Inc. and the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List, Optional, Tuple
from .tokenization_herbert import (
PRETRAINED_INIT_CONFIGURATION,
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES,
PRETRAINED_VOCAB_FILES_MAP,
HerbertTokenizer,
)
from .tokenization_utils_fast import PreTrainedTokenizerFast
from .utils import logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {
"vocab_file": "vocab.json",
"merges_file": "merges.txt",
}
class HerbertTokenizerFast(PreTrainedTokenizerFast):
"""
Construct a "Fast" BPE tokenizer for HerBERT (backed by HuggingFace's `tokenizers` library).
Peculiarities:
- uses BERT's pre-tokenizer: BertPreTokenizer splits tokens on spaces, and also on punctuation.
Each occurrence of a punctuation character will be treated separately.
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the methods. Users
should refer to the superclass for more information regarding methods.
Args:
vocab_file (:obj:`str`):
Path to the vocabulary file.
merges_file (:obj:`str`):
Path to the merges file.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
slow_tokenizer_class = HerbertTokenizer
def __init__(self, vocab_file, merges_file, tokenizer_file=None, **kwargs):
kwargs["cls_token"] = "<s>"
kwargs["unk_token"] = "<unk>"
kwargs["pad_token"] = "<pad>"
kwargs["mask_token"] = "<mask>"
kwargs["sep_token"] = "</s>"
super().__init__(
vocab_file,
merges_file,
tokenizer_file=tokenizer_file,
**kwargs,
)
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequences for sequence classification tasks
by concatenating and adding special tokens.
A HerBERT sequence, like a BERT sequence, has the following format:
- single sequence: ``<s> X </s>``
- pair of sequences: ``<s> A </s> B </s>``
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
"""
cls = [self.cls_token_id]
sep = [self.sep_token_id]
if token_ids_1 is None:
return cls + token_ids_0 + sep
return cls + token_ids_0 + sep + token_ids_1 + sep
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` method.
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
if token_ids_1 is not None:
raise ValueError(
"You should not supply a second sequence if the provided sequence of "
"ids is already formated with special tokens for the model."
)
return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
if token_ids_1 is None:
return [1] + ([0] * len(token_ids_0)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task.
A HerBERT sequence pair mask, like a BERT one, has the following format:
::
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
sequence(s).
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
files = self._tokenizer.model.save(save_directory, name=filename_prefix)
return tuple(files)
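
A self-contained illustration of the special-token layout implemented by the HerBERT helpers above; the token ids are hypothetical placeholders, not values from the real vocabulary.

# Hypothetical ids: 0 stands for <s> (cls), 2 for </s> (sep).
cls, sep = [0], [2]
ids_a, ids_b = [10, 11, 12], [20, 21]

single = cls + ids_a + sep              # <s> A </s>
pair = cls + ids_a + sep + ids_b + sep  # <s> A </s> B </s>
assert single == [0, 10, 11, 12, 2]
assert pair == [0, 10, 11, 12, 2, 20, 21, 2]

# Token type ids mirror create_token_type_ids_from_sequences above.
token_type_ids = len(cls + ids_a + sep) * [0] + len(ids_b + sep) * [1]
assert token_type_ids == [0, 0, 0, 0, 0, 1, 1, 1]
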
@@ -15,7 +15,7 @@
""" Tokenization class for model LayoutLM."""
from .tokenization_bert import BertTokenizer, BertTokenizerFast
from .tokenization_bert import BertTokenizer
from .utils import logging
@@ -58,21 +58,3 @@ class LayoutLMTokenizer(BertTokenizer):
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
class LayoutLMTokenizerFast(BertTokenizerFast):
r"""
Constructs a "Fast" LayoutLMTokenizer.
:class:`~transformers.LayoutLMTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs end-to-end
tokenization: punctuation splitting + wordpiece.
Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning
parameters.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
model_input_names = ["attention_mask"]
# coding=utf-8
# Copyright 2018 The Microsoft Research Asia LayoutLM Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Tokenization class for model LayoutLM."""
from .tokenization_bert_fast import BertTokenizerFast
from .tokenization_layoutlm import LayoutLMTokenizer
from .utils import logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"microsoft/layoutlm-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
"microsoft/layoutlm-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
},
"tokenizer_file": {
"microsoft/layoutlm-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-tokenizer.json",
"microsoft/layoutlm-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-tokenizer.json",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"microsoft/layoutlm-base-uncased": 512,
"microsoft/layoutlm-large-uncased": 512,
}
PRETRAINED_INIT_CONFIGURATION = {
"microsoft/layoutlm-base-uncased": {"do_lower_case": True},
"microsoft/layoutlm-large-uncased": {"do_lower_case": True},
}
class LayoutLMTokenizerFast(BertTokenizerFast):
r"""
Constructs a "Fast" LayoutLMTokenizer.
:class:`~transformers.LayoutLMTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs end-to-end
tokenization: punctuation splitting + wordpiece.
Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning
parameters.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
slow_tokenizer_class = LayoutLMTokenizer
@@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from .tokenization_roberta import RobertaTokenizer, RobertaTokenizerFast
from .tokenization_roberta import RobertaTokenizer
from .utils import logging
@@ -54,19 +54,3 @@ class LongformerTokenizer(RobertaTokenizer):
"vocab_file": {m: vocab_url for m in _all_longformer_models},
"merges_file": {m: merges_url for m in _all_longformer_models},
}
class LongformerTokenizerFast(RobertaTokenizerFast):
r"""
Construct a "fast" Longformer tokenizer (backed by HuggingFace's `tokenizers` library).
:class:`~transformers.LongformerTokenizerFast` is identical to :class:`~transformers.RobertaTokenizerFast`. Refer
to the superclass for usage examples and documentation concerning parameters.
"""
# merges and vocab same as Roberta
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_vocab_files_map = {
"vocab_file": {m: vocab_url for m in _all_longformer_models},
"merges_file": {m: merges_url for m in _all_longformer_models},
}
slow_tokenizer_class = LongformerTokenizer
# coding=utf-8
# Copyright 2020 The Allen Institute for AI team and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .tokenization_longformer import LongformerTokenizer
from .tokenization_roberta_fast import RobertaTokenizerFast
from .utils import logging
logger = logging.get_logger(__name__)
# vocab and merges same as roberta
vocab_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json"
merges_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt"
tokenizer_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-tokenizer.json"
_all_longformer_models = [
"allenai/longformer-base-4096",
"allenai/longformer-large-4096",
"allenai/longformer-large-4096-finetuned-triviaqa",
"allenai/longformer-base-4096-extra.pos.embd.only",
"allenai/longformer-large-4096-extra.pos.embd.only",
]
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"allenai/longformer-base-4096": 4096,
"allenai/longformer-large-4096": 4096,
"allenai/longformer-large-4096-finetuned-triviaqa": 4096,
"allenai/longformer-base-4096-extra.pos.embd.only": 4096,
"allenai/longformer-large-4096-extra.pos.embd.only": 4096,
}
class LongformerTokenizerFast(RobertaTokenizerFast):
r"""
Construct a "fast" Longformer tokenizer (backed by HuggingFace's `tokenizers` library).
:class:`~transformers.LongformerTokenizerFast` is identical to :class:`~transformers.RobertaTokenizerFast`. Refer
to the superclass for usage examples and documentation concerning parameters.
"""
# merges and vocab same as Roberta
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_vocab_files_map = {
"vocab_file": {m: vocab_url for m in _all_longformer_models},
"merges_file": {m: merges_url for m in _all_longformer_models},
"tokenizer_file": {m: tokenizer_url for m in _all_longformer_models},
}
slow_tokenizer_class = LongformerTokenizer
@@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from .tokenization_bert import BertTokenizer, BertTokenizerFast
from .tokenization_bert import BertTokenizer
####################################################
@@ -63,20 +63,3 @@ class LxmertTokenizer(BertTokenizer):
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
class LxmertTokenizerFast(BertTokenizerFast):
r"""
Construct a "fast" LXMERT tokenizer (backed by HuggingFace's `tokenizers` library).
:class:`~transformers.LxmertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs
end-to-end tokenization: punctuation splitting and wordpiece.
Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning
parameters.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
slow_tokenizer_class = LxmertTokenizer
# coding=utf-8
# Copyright 2020 The Google AI Team, Stanford University and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .tokenization_bert_fast import BertTokenizerFast
from .tokenization_lxmert import LxmertTokenizer
####################################################
# Mapping from the keyword arguments names of Tokenizer `__init__`
# to file names for serializing Tokenizer instances
####################################################
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}
####################################################
# Mapping from the keyword arguments names of Tokenizer `__init__`
# to pretrained vocabulary URL for all the model shortcut names.
####################################################
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"unc-nlp/lxmert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
},
"tokenizer_file": {
"unc-nlp/lxmert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-tokenizer.json",
},
}
####################################################
# Mapping from model shortcut names to max length of inputs
####################################################
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"unc-nlp/lxmert-base-uncased": 512,
}
####################################################
# Mapping from model shortcut names to a dictionary of additional
# keyword arguments for Tokenizer `__init__`.
# To be used for checkpoint specific configurations.
####################################################
PRETRAINED_INIT_CONFIGURATION = {
"unc-nlp/lxmert-base-uncased": {"do_lower_case": True},
}
class LxmertTokenizerFast(BertTokenizerFast):
r"""
Construct a "fast" LXMERT tokenizer (backed by HuggingFace's `tokenizers` library).
:class:`~transformers.LxmertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs
end-to-end tokenization: punctuation splitting and wordpiece.
Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning
parameters.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
slow_tokenizer_class = LxmertTokenizer
@@ -18,6 +18,19 @@ vocab_files_names = {
"vocab": "vocab.json",
"tokenizer_config_file": "tokenizer_config.json",
}
PRETRAINED_VOCAB_FILES_MAP = {
"source_spm": {"Helsinki-NLP/opus-mt-en-de": "https://cdn.huggingface.co/Helsinki-NLP/opus-mt-en-de/source.spm"},
"target_spm": {"Helsinki-NLP/opus-mt-en-de": "https://cdn.huggingface.co/Helsinki-NLP/opus-mt-en-de/target.spm"},
"vocab": {"Helsinki-NLP/opus-mt-en-de": "https://cdn.huggingface.co/Helsinki-NLP/opus-mt-en-de/vocab.json"},
"tokenizer_config_file": {
"Helsinki-NLP/opus-mt-en-de": "https://cdn.huggingface.co/Helsinki-NLP/opus-mt-en-de/tokenizer_config.json"
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"Helsinki-NLP/opus-mt-en-de": 512}
PRETRAINED_INIT_CONFIGURATION = {}
# Example URL https://s3.amazonaws.com/models.huggingface.co/bert/Helsinki-NLP/opus-mt-en-de/vocab.json
@@ -63,6 +76,9 @@ class MarianTokenizer(PreTrainedTokenizer):
"""
vocab_files_names = vocab_files_names
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["attention_mask"]
language_code_re = re.compile(">>.+<<") # type: re.Pattern
@@ -189,27 +205,22 @@ class MarianTokenizer(PreTrainedTokenizer):
def vocab_size(self) -> int:
return len(self.encoder)
def save_vocabulary(self, save_directory: str) -> Tuple[str]:
"""
Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory.
Args:
save_directory (:obj:`str`):
The directory in which to save the vocabulary.
Returns:
:obj:`Tuple(str)`: Paths to the files saved.
"""
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
save_dir = Path(save_directory)
assert save_dir.is_dir(), f"{save_directory} should be a directory"
save_json(self.encoder, save_dir / self.vocab_files_names["vocab"])
save_json(
self.encoder,
save_dir / ((filename_prefix + "-" if filename_prefix else "") + self.vocab_files_names["vocab"]),
)
for orig, f in zip(["source.spm", "target.spm"], self.spm_files):
dest_path = save_dir / Path(f).name
dest_path = save_dir / ((filename_prefix + "-" if filename_prefix else "") + Path(f).name)
if not dest_path.exists():
copyfile(f, save_dir / orig)
return tuple(save_dir / f for f in self.vocab_files_names)
return tuple(
save_dir / ((filename_prefix + "-" if filename_prefix else "") + f) for f in self.vocab_files_names
)
def get_vocab(self) -> Dict:
vocab = self.encoder.copy()
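The Marian save_vocabulary above uses the same prefix-building convention as the other tokenizers in this diff. A tiny sketch of that pattern in isolation; the helper name is made up for illustration.

from pathlib import Path

def prefixed_path(save_dir, filename, filename_prefix=None):
    # Mirrors the (prefix + "-" if prefix else "") + name pattern from the diff.
    return Path(save_dir) / ((filename_prefix + "-" if filename_prefix else "") + filename)

assert prefixed_path("out", "vocab.json").name == "vocab.json"
assert prefixed_path("out", "vocab.json", "ckpt1").name == "ckpt1-vocab.json"
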
@@ -13,7 +13,7 @@
# limitations under the License.
"""Tokenization classes for MobileBERT."""
from .tokenization_bert import BertTokenizer, BertTokenizerFast
from .tokenization_bert import BertTokenizer
from .utils import logging
@@ -27,7 +27,7 @@ PRETRAINED_VOCAB_FILES_MAP = {
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"mobilebert-uncased": 512}
PRETRAINED_INIT_CONFIGURATION = {}
@@ -48,21 +48,3 @@ class MobileBertTokenizer(BertTokenizer):
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
class MobileBertTokenizerFast(BertTokenizerFast):
r"""
Construct a "fast" MobileBERT tokenizer (backed by HuggingFace's `tokenizers` library).
:class:`~transformers.MobileBertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs
end-to-end tokenization: punctuation splitting and wordpiece.
Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning
parameters.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
slow_tokenizer_class = MobileBertTokenizer
# coding=utf-8
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for MobileBERT."""
from .tokenization_bert_fast import BertTokenizerFast
from .tokenization_mobilebert import MobileBertTokenizer
from .utils import logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"mobilebert-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/google/mobilebert-uncased/vocab.txt"
},
"tokenizer_file": {
"mobilebert-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/google/mobilebert-uncased/tokenizer.json"
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"mobilebert-uncased": 512}
PRETRAINED_INIT_CONFIGURATION = {}
class MobileBertTokenizerFast(BertTokenizerFast):
r"""
Construct a "fast" MobileBERT tokenizer (backed by HuggingFace's `tokenizers` library).
:class:`~transformers.MobileBertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs
end-to-end tokenization: punctuation splitting and wordpiece.
Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning
parameters.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
slow_tokenizer_class = MobileBertTokenizer
@@ -19,7 +19,7 @@
import os
import re
from shutil import copyfile
from typing import List, Optional
from typing import List, Optional, Tuple
from .tokenization_utils import PreTrainedTokenizer
from .utils import logging
@@ -311,22 +311,16 @@ class PhobertTokenizer(PreTrainedTokenizer):
out_string = " ".join(tokens).replace("@@ ", "").strip()
return out_string
def save_vocabulary(self, save_directory):
"""
Save the vocabulary and special tokens file to a directory.
Args:
save_directory (:obj:`str`):
The directory in which to save the vocabulary.
Returns:
:obj:`Tuple(str)`: Paths to the files saved.
"""
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not os.path.isdir(save_directory):
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
return
out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"])
out_merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES["merges_file"])
out_vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
out_merge_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
)
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
copyfile(self.vocab_file, out_vocab_file)