Commit fb6c70a9 authored by Louis MARTIN, committed by Julien Chaumond

Update tokenization_camembert.py with urls

parent e44b939e
@@ -11,7 +11,7 @@
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
-# limitations under the License.
+# limitations under the License
 """ Tokenization classes for Camembert model."""
 from __future__ import (absolute_import, division, print_function,
                         unicode_literals)
@@ -20,6 +20,19 @@
 import sentencepiece as spm
 from transformers.tokenization_utils import PreTrainedTokenizer
 
+VOCAB_FILES_NAMES = {'vocab_file': 'sentencepiece.bpe.model'}
+
+PRETRAINED_VOCAB_FILES_MAP = {
+    'vocab_file':
+    {
+        'camembert-base': "https://dl.fbaipublicfiles.com/camembert/camembert-base-v0-sentencepiece.bpe.model",
+    }
+}
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+    'camembert-base': None,
+}
+
 class CamembertTokenizer(PreTrainedTokenizer):
     """
     Adapted from RobertaTokenizer and XLNetTokenizer
@@ -27,7 +40,9 @@ class CamembertTokenizer(PreTrainedTokenizer):
     - requires `SentencePiece <https://github.com/google/sentencepiece>`_
     """
-    vocab_files_names = {'vocab_file': None}
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
 
     def __init__(self, vocab_file, bos_token="<s>", eos_token="</s>", sep_token="</s>",
                  cls_token="<s>", unk_token="<unk>", pad_token='<pad>', mask_token='<mask>', **kwargs):