Unverified commit 51397336, authored by Suraj Patil and committed by GitHub
Browse files

LongformerTokenizerFast (#4547)

parent c9c385c5
...@@ -139,7 +139,7 @@ from .tokenization_distilbert import DistilBertTokenizer, DistilBertTokenizerFas ...@@ -139,7 +139,7 @@ from .tokenization_distilbert import DistilBertTokenizer, DistilBertTokenizerFas
from .tokenization_electra import ElectraTokenizer, ElectraTokenizerFast from .tokenization_electra import ElectraTokenizer, ElectraTokenizerFast
from .tokenization_flaubert import FlaubertTokenizer from .tokenization_flaubert import FlaubertTokenizer
from .tokenization_gpt2 import GPT2Tokenizer, GPT2TokenizerFast from .tokenization_gpt2 import GPT2Tokenizer, GPT2TokenizerFast
from .tokenization_longformer import LongformerTokenizer from .tokenization_longformer import LongformerTokenizer, LongformerTokenizerFast
from .tokenization_openai import OpenAIGPTTokenizer, OpenAIGPTTokenizerFast from .tokenization_openai import OpenAIGPTTokenizer, OpenAIGPTTokenizerFast
from .tokenization_reformer import ReformerTokenizer from .tokenization_reformer import ReformerTokenizer
from .tokenization_roberta import RobertaTokenizer, RobertaTokenizerFast from .tokenization_roberta import RobertaTokenizer, RobertaTokenizerFast
......
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
import logging import logging
from .tokenization_roberta import RobertaTokenizer from .tokenization_roberta import RobertaTokenizer, RobertaTokenizerFast
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -40,3 +40,12 @@ class LongformerTokenizer(RobertaTokenizer): ...@@ -40,3 +40,12 @@ class LongformerTokenizer(RobertaTokenizer):
"vocab_file": {m: vocab_url for m in _all_longformer_models}, "vocab_file": {m: vocab_url for m in _all_longformer_models},
"merges_file": {m: merges_url for m in _all_longformer_models}, "merges_file": {m: merges_url for m in _all_longformer_models},
} }
class LongformerTokenizerFast(RobertaTokenizerFast):
    """Fast (Rust-backed) tokenizer for Longformer.

    Longformer uses the same BPE vocabulary and merges files as RoBERTa,
    so this subclass only overrides the pretrained-model URL maps and the
    maximum input sizes; all tokenization logic comes from
    ``RobertaTokenizerFast``.
    """

    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    # Every Longformer checkpoint points at the shared RoBERTa vocab/merges.
    pretrained_vocab_files_map = {
        "vocab_file": dict.fromkeys(_all_longformer_models, vocab_url),
        "merges_file": dict.fromkeys(_all_longformer_models, merges_url),
    }
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment