"pytorch_pretrained_bert/modeling_bert.py" did not exist on "c8ea286048517d9072397d77f4de21b8483a4531"
Unverified Commit 146c5212 authored by Lysandre Debut's avatar Lysandre Debut Committed by GitHub
Browse files

Merge branch 'master' into add_models_special_tokens_to_specific_configs

parents f5b50c6b b623ddc0
......@@ -935,7 +935,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
def get_output_embeddings(self):
return self.lm_loss
def prepare_inputs_for_generation(self, input_ids, **model_kwargs):
def prepare_inputs_for_generation(self, input_ids, past, **model_kwargs):
# Add dummy token at the end (no attention on this one)
effective_batch_size = input_ids.shape[0]
......@@ -958,8 +958,8 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
inputs = {"input_ids": input_ids, "perm_mask": perm_mask, "target_mapping": target_mapping}
# if past is defined in model kwargs then use it for faster decoding
if "past" in model_kwargs and model_kwargs["past"]:
inputs["mems"] = model_kwargs["past"]
if past:
inputs["mems"] = past
return inputs
......@@ -1264,8 +1264,10 @@ class XLNetForTokenClassification(XLNetPreTrainedModel):
# Only keep active parts of the loss
if attention_mask is not None:
active_loss = attention_mask.view(-1) == 1
active_logits = logits.view(-1, self.num_labels)[active_loss]
active_labels = labels.view(-1)[active_loss]
active_logits = logits.view(-1, self.num_labels)
active_labels = torch.where(
active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
)
loss = loss_fct(active_logits, active_labels)
else:
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
......
......@@ -28,6 +28,7 @@ from typing import Dict, List, Optional, Tuple, Union
import numpy as np
from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, AutoConfig
from .configuration_bart import BartConfig
from .configuration_distilbert import DistilBertConfig
from .configuration_roberta import RobertaConfig
from .configuration_utils import PretrainedConfig
......@@ -279,6 +280,9 @@ class _ScikitCompat(ABC):
class Pipeline(_ScikitCompat):
"""
The Pipeline class is the class from which all pipelines inherit. Refer to this class for methods shared across
different pipelines.
Base class implementing pipelined operations.
Pipeline workflow is defined as a sequence of the following operations:
Input -> Tokenization -> Model Inference -> Post-Processing (Task dependent) -> Output
......@@ -292,39 +296,49 @@ class Pipeline(_ScikitCompat):
pickle format.
Arguments:
**model**: ``(str, PretrainedModel, TFPretrainedModel)``:
Reference to the model to use through this pipeline.
**tokenizer**: ``(str, PreTrainedTokenizer)``:
Reference to the tokenizer to use through this pipeline.
**args_parser**: ``ArgumentHandler``:
model (:obj:`str` or :obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`, `optional`, defaults to :obj:`None`):
The model that will be used by the pipeline to make predictions. This can be :obj:`None`, a string
checkpoint identifier or an actual pre-trained model inheriting from
:class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
TensorFlow.
If :obj:`None`, the default of the pipeline will be loaded.
tokenizer (:obj:`str` or :obj:`~transformers.PreTrainedTokenizer`, `optional`, defaults to :obj:`None`):
The tokenizer that will be used by the pipeline to encode data for the model. This can be :obj:`None`,
a string checkpoint identifier or an actual pre-trained tokenizer inheriting from
:class:`~transformers.PreTrainedTokenizer`.
If :obj:`None`, the default of the pipeline will be loaded.
modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
Model card attributed to the model for this pipeline.
framework (:obj:`str`, `optional`, defaults to :obj:`None`):
The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
installed.
If no framework is specified, will default to the one currently installed. If no framework is specified
and both frameworks are installed, will default to PyTorch.
args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
Reference to the object in charge of parsing supplied pipeline parameters.
**device**: ``int``:
device (:obj:`int`, `optional`, defaults to :obj:`-1`):
Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model
on the associated CUDA device id.
**binary_output** ``bool`` (default: False):
binary_output (:obj:`bool`, `optional`, defaults to :obj:`False`):
Flag indicating whether the output of the pipeline should be in a binary format (i.e. pickle) or raw text.
Return:
:obj:`List` or :obj:`Dict`:
Pipeline returns list or dictionary depending on:
- Does the user provided multiple sample
- The pipeline expose multiple fields in the output object
Examples:
nlp = pipeline('ner')
nlp = pipeline('ner', model='...', config='...', tokenizer='...')
nlp = NerPipeline(model='...', config='...', tokenizer='...')
nlp = QuestionAnsweringPipeline(model=AutoModel.from_pretrained('...'), tokenizer='...')
- Whether the user supplied multiple samples
- Whether the pipeline exposes multiple fields in the output object
"""
default_input_names = None
task = None
def __init__(
self,
model,
model: Optional = None,
tokenizer: PreTrainedTokenizer = None,
modelcard: Optional[ModelCard] = None,
framework: Optional[str] = None,
......@@ -336,6 +350,8 @@ class Pipeline(_ScikitCompat):
if framework is None:
framework = get_framework()
model, tokenizer = self.get_defaults(model, tokenizer, framework)
self.model = model
self.tokenizer = tokenizer
self.modelcard = modelcard
......@@ -412,7 +428,7 @@ class Pipeline(_ScikitCompat):
"""
args = ["input_ids", "attention_mask"]
if not isinstance(self.model.config, (DistilBertConfig, XLMConfig, RobertaConfig)):
if not isinstance(self.model.config, (DistilBertConfig, XLMConfig, RobertaConfig, BartConfig)):
args += ["token_type_ids"]
# PR #1548 (CLI) There is an issue with attention_mask
......@@ -467,15 +483,74 @@ class Pipeline(_ScikitCompat):
else:
return predictions.numpy()
def get_defaults(self, model, tokenizer, framework):
    """
    Replace a missing model and/or tokenizer with the defaults registered for this pipeline's task.

    Arguments:
        model:
            The model supplied by the caller, or :obj:`None` to load the task's default model for ``framework``.
        tokenizer:
            The tokenizer supplied by the caller, or :obj:`None` to load the task's default tokenizer.
        framework (:obj:`str`):
            Either "pt" or "tf". Only consulted when ``model`` is :obj:`None`.

    Return:
        :obj:`tuple`: ``(model, tokenizer)`` where every :obj:`None` entry has been replaced by its default.

    Raises:
        ValueError: If ``model`` is :obj:`None` and ``framework`` is neither "tf" nor "pt".
    """
    # Nothing to resolve: return before touching SUPPORTED_TASKS so that pipelines without a
    # registered task (``task = None``) still work when both objects are passed explicitly.
    if model is not None and tokenizer is not None:
        return model, tokenizer

    task_defaults = SUPPORTED_TASKS[self.task]

    if model is None:
        if framework not in ("tf", "pt"):
            raise ValueError("Provided framework should be either 'tf' for TensorFlow or 'pt' for PyTorch.")
        # The model class and the default checkpoint name are both keyed by the framework identifier.
        model = task_defaults[framework].from_pretrained(task_defaults["default"]["model"][framework])

    if tokenizer is None:
        default_tokenizer = task_defaults["default"]["tokenizer"]
        if isinstance(default_tokenizer, tuple):
            # For tuple we have (tokenizer name, {kwargs})
            tokenizer = AutoTokenizer.from_pretrained(default_tokenizer[0], **default_tokenizer[1])
        else:
            tokenizer = AutoTokenizer.from_pretrained(default_tokenizer)

    return model, tokenizer
class FeatureExtractionPipeline(Pipeline):
"""
Feature extraction pipeline using Model head.
Feature extraction pipeline using Model head. This pipeline extracts the hidden states from the base transformer,
which can be used as features in downstream tasks.
This feature extraction pipeline can currently be loaded from the :func:`~transformers.pipeline` method using
the following task identifier(s):
- "feature-extraction", for extracting features of a sequence.
All models may be used for this pipeline. See a list of all models, including community-contributed models on
`huggingface.co/models <https://huggingface.co/models>`__.
Arguments:
model (:obj:`str` or :obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`, `optional`, defaults to :obj:`None`):
The model that will be used by the pipeline to make predictions. This can be :obj:`None`, a string
checkpoint identifier or an actual pre-trained model inheriting from
:class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
TensorFlow.
If :obj:`None`, the default of the pipeline will be loaded.
tokenizer (:obj:`str` or :obj:`~transformers.PreTrainedTokenizer`, `optional`, defaults to :obj:`None`):
The tokenizer that will be used by the pipeline to encode data for the model. This can be :obj:`None`,
a string checkpoint identifier or an actual pre-trained tokenizer inheriting from
:class:`~transformers.PreTrainedTokenizer`.
If :obj:`None`, the default of the pipeline will be loaded.
modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
Model card attributed to the model for this pipeline.
framework (:obj:`str`, `optional`, defaults to :obj:`None`):
The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
installed.
If no framework is specified, will default to the one currently installed. If no framework is specified
and both frameworks are installed, will default to PyTorch.
args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
Reference to the object in charge of parsing supplied pipeline parameters.
device (:obj:`int`, `optional`, defaults to :obj:`-1`):
Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model
on the associated CUDA device id.
"""
task = "feature-extraction"
def __init__(
self,
model,
model: Optional = None,
tokenizer: PreTrainedTokenizer = None,
modelcard: Optional[ModelCard] = None,
framework: Optional[str] = None,
......@@ -498,9 +573,49 @@ class FeatureExtractionPipeline(Pipeline):
class TextClassificationPipeline(Pipeline):
"""
Text classification pipeline using ModelForTextClassification head.
Text classification pipeline using ModelForSequenceClassification head. See the
`sequence classification usage <../usage.html#sequence-classification>`__ examples for more information.
This text classification pipeline can currently be loaded from the :func:`~transformers.pipeline` method using
the following task identifier(s):
- "sentiment-analysis", for classifying sequences according to positive or negative sentiments.
The models that this pipeline can use are models that have been fine-tuned on a sequence classification task.
See the list of available community models fine-tuned on such a task on
`huggingface.co/models <https://huggingface.co/models?search=&filter=text-classification>`__.
Arguments:
model (:obj:`str` or :obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`, `optional`, defaults to :obj:`None`):
The model that will be used by the pipeline to make predictions. This can be :obj:`None`, a string
checkpoint identifier or an actual pre-trained model inheriting from
:class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
TensorFlow.
If :obj:`None`, the default of the pipeline will be loaded.
tokenizer (:obj:`str` or :obj:`~transformers.PreTrainedTokenizer`, `optional`, defaults to :obj:`None`):
The tokenizer that will be used by the pipeline to encode data for the model. This can be :obj:`None`,
a string checkpoint identifier or an actual pre-trained tokenizer inheriting from
:class:`~transformers.PreTrainedTokenizer`.
If :obj:`None`, the default of the pipeline will be loaded.
modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
Model card attributed to the model for this pipeline.
framework (:obj:`str`, `optional`, defaults to :obj:`None`):
The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
installed.
If no framework is specified, will default to the one currently installed. If no framework is specified
and both frameworks are installed, will default to PyTorch.
args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
Reference to the object in charge of parsing supplied pipeline parameters.
device (:obj:`int`, `optional`, defaults to :obj:`-1`):
Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model
on the associated CUDA device id.
"""
task = "sentiment-analysis"
def __call__(self, *args, **kwargs):
outputs = super().__call__(*args, **kwargs)
scores = np.exp(outputs) / np.exp(outputs).sum(-1)
......@@ -509,12 +624,53 @@ class TextClassificationPipeline(Pipeline):
class FillMaskPipeline(Pipeline):
"""
Masked language modeling prediction pipeline using ModelWithLMHead head.
Masked language modeling prediction pipeline using ModelWithLMHead head. See the
`masked language modeling usage <../usage.html#masked-language-modeling>`__ examples for more information.
This mask filling pipeline can currently be loaded from the :func:`~transformers.pipeline` method using
the following task identifier(s):
- "fill-mask", for predicting masked tokens in a sequence.
The models that this pipeline can use are models that have been trained with a masked language modeling objective,
which includes the bi-directional models in the library.
See the list of available community models on
`huggingface.co/models <https://huggingface.co/models?search=&filter=lm-head>`__.
Arguments:
model (:obj:`str` or :obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`, `optional`, defaults to :obj:`None`):
The model that will be used by the pipeline to make predictions. This can be :obj:`None`, a string
checkpoint identifier or an actual pre-trained model inheriting from
:class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
TensorFlow.
If :obj:`None`, the default of the pipeline will be loaded.
tokenizer (:obj:`str` or :obj:`~transformers.PreTrainedTokenizer`, `optional`, defaults to :obj:`None`):
The tokenizer that will be used by the pipeline to encode data for the model. This can be :obj:`None`,
a string checkpoint identifier or an actual pre-trained tokenizer inheriting from
:class:`~transformers.PreTrainedTokenizer`.
If :obj:`None`, the default of the pipeline will be loaded.
modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
Model card attributed to the model for this pipeline.
framework (:obj:`str`, `optional`, defaults to :obj:`None`):
The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
installed.
If no framework is specified, will default to the one currently installed. If no framework is specified
and both frameworks are installed, will default to PyTorch.
args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
Reference to the object in charge of parsing supplied pipeline parameters.
device (:obj:`int`, `optional`, defaults to :obj:`-1`):
Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model
on the associated CUDA device id.
"""
task = "fill-mask"
def __init__(
self,
model,
model: Optional = None,
tokenizer: PreTrainedTokenizer = None,
modelcard: Optional[ModelCard] = None,
framework: Optional[str] = None,
......@@ -574,14 +730,57 @@ class FillMaskPipeline(Pipeline):
class NerPipeline(Pipeline):
"""
Named Entity Recognition pipeline using ModelForTokenClassification head.
Named Entity Recognition pipeline using ModelForTokenClassification head. See the
`named entity recognition usage <../usage.html#named-entity-recognition>`__ examples for more information.
This token recognition pipeline can currently be loaded from the :func:`~transformers.pipeline` method using
the following task identifier(s):
- "ner", for predicting the classes of tokens in a sequence: person, organisation, location or miscellaneous.
The models that this pipeline can use are models that have been fine-tuned on a token classification task.
See the list of available community models fine-tuned on such a task on
`huggingface.co/models <https://huggingface.co/models?search=&filter=token-classification>`__.
Arguments:
model (:obj:`str` or :obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`, `optional`, defaults to :obj:`None`):
The model that will be used by the pipeline to make predictions. This can be :obj:`None`, a string
checkpoint identifier or an actual pre-trained model inheriting from
:class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
TensorFlow.
If :obj:`None`, the default of the pipeline will be loaded.
tokenizer (:obj:`str` or :obj:`~transformers.PreTrainedTokenizer`, `optional`, defaults to :obj:`None`):
The tokenizer that will be used by the pipeline to encode data for the model. This can be :obj:`None`,
a string checkpoint identifier or an actual pre-trained tokenizer inheriting from
:class:`~transformers.PreTrainedTokenizer`.
If :obj:`None`, the default of the pipeline will be loaded.
modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
Model card attributed to the model for this pipeline.
framework (:obj:`str`, `optional`, defaults to :obj:`None`):
The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
installed.
If no framework is specified, will default to the one currently installed. If no framework is specified
and both frameworks are installed, will default to PyTorch.
args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
Reference to the object in charge of parsing supplied pipeline parameters.
device (:obj:`int`, `optional`, defaults to :obj:`-1`):
Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model
on the associated CUDA device id.
Example::
from transformers import pipeline
"""
default_input_names = "sequences"
task = "ner"
def __init__(
self,
model,
model: Optional = None,
tokenizer: PreTrainedTokenizer = None,
modelcard: Optional[ModelCard] = None,
framework: Optional[str] = None,
......@@ -636,7 +835,7 @@ class NerPipeline(Pipeline):
if self.model.config.id2label[label_idx] not in self.ignore_labels:
answer += [
{
"word": self.tokenizer.decode([int(input_ids[idx])]),
"word": self.tokenizer.convert_ids_to_tokens(int(input_ids[idx])),
"score": score[idx][label_idx].item(),
"entity": self.model.config.id2label[label_idx],
}
......@@ -716,15 +915,54 @@ class QuestionAnsweringArgumentHandler(ArgumentHandler):
class QuestionAnsweringPipeline(Pipeline):
"""
Question Answering pipeline using ModelForQuestionAnswering head.
Question Answering pipeline using ModelForQuestionAnswering head. See the
`question answering usage <../usage.html#question-answering>`__ examples for more information.
This question answering pipeline can currently be loaded from the :func:`~transformers.pipeline` method using
the following task identifier(s):
- "question-answering", for answering questions given a context.
The models that this pipeline can use are models that have been fine-tuned on a question answering task.
See the list of available community models fine-tuned on such a task on
`huggingface.co/models <https://huggingface.co/models?search=&filter=question-answering>`__.
Arguments:
model (:obj:`str` or :obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`, `optional`, defaults to :obj:`None`):
The model that will be used by the pipeline to make predictions. This can be :obj:`None`, a string
checkpoint identifier or an actual pre-trained model inheriting from
:class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
TensorFlow.
If :obj:`None`, the default of the pipeline will be loaded.
tokenizer (:obj:`str` or :obj:`~transformers.PreTrainedTokenizer`, `optional`, defaults to :obj:`None`):
The tokenizer that will be used by the pipeline to encode data for the model. This can be :obj:`None`,
a string checkpoint identifier or an actual pre-trained tokenizer inheriting from
:class:`~transformers.PreTrainedTokenizer`.
If :obj:`None`, the default of the pipeline will be loaded.
modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
Model card attributed to the model for this pipeline.
framework (:obj:`str`, `optional`, defaults to :obj:`None`):
The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
installed.
If no framework is specified, will default to the one currently installed. If no framework is specified
and both frameworks are installed, will default to PyTorch.
args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
Reference to the object in charge of parsing supplied pipeline parameters.
device (:obj:`int`, `optional`, defaults to :obj:`-1`):
Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model
on the associated CUDA device id.
"""
default_input_names = "question,context"
task = "question-answering"
def __init__(
self,
model,
tokenizer: Optional[PreTrainedTokenizer],
model: Optional = None,
tokenizer: Optional[PreTrainedTokenizer] = None,
modelcard: Optional[ModelCard] = None,
framework: Optional[str] = None,
device: int = -1,
......@@ -1003,23 +1241,77 @@ def pipeline(
model: Optional = None,
config: Optional[Union[str, PretrainedConfig]] = None,
tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None,
modelcard: Optional[Union[str, ModelCard]] = None,
framework: Optional[str] = None,
**kwargs
) -> Pipeline:
"""
Utility factory method to build a pipeline.
Pipelines are made of:
A Tokenizer instance in charge of mapping raw textual input to token
A Model instance
Some (optional) post processing for enhancing model's output
Examples:
- A Tokenizer instance in charge of mapping raw textual input to tokens
- A Model instance
- Some (optional) post processing for enhancing model's output
Args:
task (:obj:`str`):
The task defining which pipeline will be returned. Currently accepted tasks are:
- "feature-extraction": will return a :class:`~transformers.FeatureExtractionPipeline`
- "sentiment-analysis": will return a :class:`~transformers.TextClassificationPipeline`
- "ner": will return a :class:`~transformers.NerPipeline`
- "question-answering": will return a :class:`~transformers.QuestionAnsweringPipeline`
- "fill-mask": will return a :class:`~transformers.FillMaskPipeline`
model (:obj:`str` or :obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`, `optional`, defaults to :obj:`None`):
The model that will be used by the pipeline to make predictions. This can be :obj:`None`, a string
checkpoint identifier or an actual pre-trained model inheriting from
:class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
TensorFlow.
If :obj:`None`, the default of the pipeline will be loaded.
config (:obj:`str` or :obj:`~transformers.PretrainedConfig`, `optional`, defaults to :obj:`None`):
The configuration that will be used by the pipeline to instantiate the model. This can be :obj:`None`,
a string checkpoint identifier or an actual pre-trained model configuration inheriting from
:class:`~transformers.PretrainedConfig`.
If :obj:`None`, the default of the pipeline will be loaded.
tokenizer (:obj:`str` or :obj:`~transformers.PreTrainedTokenizer`, `optional`, defaults to :obj:`None`):
The tokenizer that will be used by the pipeline to encode data for the model. This can be :obj:`None`,
a string checkpoint identifier or an actual pre-trained tokenizer inheriting from
:class:`~transformers.PreTrainedTokenizer`.
If :obj:`None`, the default of the pipeline will be loaded.
framework (:obj:`str`, `optional`, defaults to :obj:`None`):
The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
installed.
If no framework is specified, will default to the one currently installed. If no framework is specified
and both frameworks are installed, will default to PyTorch.
Returns:
:class:`~transformers.Pipeline`: Class inheriting from :class:`~transformers.Pipeline`, according to
the task.
Examples::
from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer
# Sentiment analysis pipeline
pipeline('sentiment-analysis')
# Question answering pipeline, specifying the checkpoint identifier
pipeline('question-answering', model='distilbert-base-cased-distilled-squad', tokenizer='bert-base-cased')
pipeline('ner', model=AutoModel.from_pretrained(...), tokenizer=AutoTokenizer.from_pretrained(...)
pipeline('ner', model='dbmdz/bert-large-cased-finetuned-conll03-english', tokenizer='bert-base-cased')
pipeline('ner', model='https://...pytorch-model.bin', config='https://...config.json', tokenizer='bert-base-cased')
# Named entity recognition pipeline, passing in a specific model and tokenizer
model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
pipeline('ner', model=model, tokenizer=tokenizer)
# Named entity recognition pipeline, passing a model and configuration with a HTTPS URL.
model_url = "https://s3.amazonaws.com/models.huggingface.co/bert/dbmdz/bert-large-cased-finetuned-conll03-english/pytorch_model.bin"
config_url = "https://s3.amazonaws.com/models.huggingface.co/bert/dbmdz/bert-large-cased-finetuned-conll03-english/config.json"
pipeline('ner', model=model_url, config=config_url, tokenizer='bert-base-cased')
"""
# Retrieve the task
if task not in SUPPORTED_TASKS:
......@@ -1048,13 +1340,12 @@ def pipeline(
"Please provided a PretrainedTokenizer class or a path/url/shortcut name to a pretrained tokenizer."
)
modelcard = None
# Try to infer modelcard from model or config name (if provided as str)
if modelcard is None:
# Try to fallback on one of the provided string for model or config (will replace the suffix)
if isinstance(model, str):
modelcard = model
elif isinstance(config, str):
modelcard = config
if isinstance(model, str):
modelcard = model
elif isinstance(config, str):
modelcard = config
# Instantiate tokenizer if needed
if isinstance(tokenizer, (str, tuple)):
......
......@@ -19,6 +19,7 @@ import logging
import os
import unicodedata
from shutil import copyfile
from typing import List, Optional
from .tokenization_utils import PreTrainedTokenizer
......@@ -55,9 +56,55 @@ SPIECE_UNDERLINE = "▁"
class AlbertTokenizer(PreTrainedTokenizer):
"""
SentencePiece based tokenizer. Peculiarities:
- requires `SentencePiece <https://github.com/google/sentencepiece>`_
Constructs an ALBERT tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
should refer to the superclass for more information regarding methods.
Args:
vocab_file (:obj:`string`):
`SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a .spm extension) that
contains the vocabulary necessary to instantiate a tokenizer.
do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to lowercase the input when tokenizing.
remove_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to strip the text when tokenizing (removing excess spaces before and after the string).
keep_accents (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether to keep accents when tokenizing.
bos_token (:obj:`string`, `optional`, defaults to "[CLS]"):
The beginning of sequence token that was used during pre-training. Can be used as a sequence classifier token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the beginning
of sequence. The token used is the :obj:`cls_token`.
eos_token (:obj:`string`, `optional`, defaults to "[SEP]"):
The end of sequence token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the end
of sequence. The token used is the :obj:`sep_token`.
unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
sep_token (:obj:`string`, `optional`, defaults to "[SEP]"):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
for sequence classification or for a text and a question for question answering.
It is also used as the last token of a sequence built with special tokens.
pad_token (:obj:`string`, `optional`, defaults to "<pad>"):
The token used for padding, for example when batching sequences of different lengths.
cls_token (:obj:`string`, `optional`, defaults to "[CLS]"):
The classifier token which is used when doing sequence classification (classification of the whole
sequence instead of per-token classification). It is the first token of the sequence when built with
special tokens.
mask_token (:obj:`string`, `optional`, defaults to "[MASK]"):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
Attributes:
sp_model (:obj:`SentencePieceProcessor`):
The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
"""
vocab_files_names = VOCAB_FILES_NAMES
......@@ -185,17 +232,28 @@ class AlbertTokenizer(PreTrainedTokenizer):
return self.sp_model.IdToPiece(index)
def convert_tokens_to_string(self, tokens):
    """
    Join a sequence of sub-word tokens (strings) back into a single string.

    The tokens are concatenated, every ``SPIECE_UNDERLINE`` marker is replaced by a regular space and
    leading/trailing whitespace is stripped from the result.
    """
    joined = "".join(tokens)
    return joined.replace(SPIECE_UNDERLINE, " ").strip()
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequences for sequence classification tasks
by concatenating and adding special tokens.
An ALBERT sequence has the following format:
single sequence: [CLS] X [SEP]
pair of sequences: [CLS] A [SEP] B [SEP]
- single sequence: ``[CLS] X [SEP]``
- pair of sequences: ``[CLS] A [SEP] B [SEP]``
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs to which the special tokens will be added
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
......@@ -203,27 +261,30 @@ class AlbertTokenizer(PreTrainedTokenizer):
return cls + token_ids_0 + sep
return cls + token_ids_0 + sep + token_ids_1 + sep
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
Args:
token_ids_0: list of ids (must not contain special tokens)
token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
for sequence pairs
already_has_special_tokens: (default False) Set to True if the token list is already formated with
special tokens for the model
token_ids_0 (:obj:`List[int]`):
List of ids.
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Set to True if the token list is already formatted with special tokens for the model
Returns:
A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
if token_ids_1 is not None:
raise ValueError(
"You should not supply a second sequence if the provided sequence of "
"ids is already formated with special tokens for the model."
"ids is already formatted with special tokens for the model."
)
return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
......@@ -231,14 +292,29 @@ class AlbertTokenizer(PreTrainedTokenizer):
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1]
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
An ALBERT sequence pair mask has the following format:
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1
| first sequence | second sequence
if token_ids_1 is None, only returns the first portion of the mask (0's).
::
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
if token_ids_1 is None, only returns the first portion of the mask (0s).
Args:
token_ids_0 (:obj:`List[int]`):
List of ids.
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
sequence(s).
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
......@@ -248,8 +324,15 @@ class AlbertTokenizer(PreTrainedTokenizer):
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
def save_vocabulary(self, save_directory):
""" Save the sentencepiece vocabulary (copy original file) and special tokens file
to a directory.
"""
Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory.
Args:
save_directory (:obj:`str`):
The directory in which to save the vocabulary.
Returns:
:obj:`Tuple(str)`: Paths to the files saved.
"""
if not os.path.isdir(save_directory):
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
......
......@@ -19,11 +19,7 @@ from .tokenization_roberta import RobertaTokenizer
# vocab and merges same as roberta
vocab_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json"
merges_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt"
_all_bart_models = [
"bart-large",
"bart-large-mnli",
# "bart-large-cnn"
]
_all_bart_models = ["bart-large", "bart-large-mnli", "bart-large-cnn"]
class BartTokenizer(RobertaTokenizer):
......
......@@ -19,6 +19,7 @@ import collections
import logging
import os
import unicodedata
from typing import List, Optional
from tokenizers import BertWordPieceTokenizer
......@@ -117,17 +118,41 @@ def whitespace_tokenize(text):
class BertTokenizer(PreTrainedTokenizer):
r"""
Constructs a BertTokenizer.
:class:`~transformers.BertTokenizer` runs end-to-end tokenization: punctuation splitting + wordpiece
Constructs a BERT tokenizer. Based on WordPiece.
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
should refer to the superclass for more information regarding methods.
Args:
vocab_file: Path to a one-wordpiece-per-line vocabulary file
do_lower_case: Whether to lower case the input. Only has an effect when do_basic_tokenize=True
do_basic_tokenize: Whether to do basic tokenization before wordpiece.
max_len: An artificial maximum length to truncate tokenized sequences to; Effective maximum length is always the
minimum of this value (if specified) and the underlying BERT model's sequence length.
never_split: List of tokens which will never be split during tokenization. Only has an effect when
do_basic_tokenize=True
vocab_file (:obj:`string`):
File containing the vocabulary.
do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to lowercase the input when tokenizing.
do_basic_tokenize (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to do basic tokenization before WordPiece.
never_split (:obj:`List[str]`, `optional`):
List of tokens which will never be split during tokenization. Only has an effect when
:obj:`do_basic_tokenize=True`
unk_token (:obj:`string`, `optional`, defaults to "[UNK]"):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
sep_token (:obj:`string`, `optional`, defaults to "[SEP]"):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
for sequence classification or for a text and a question for question answering.
It is also used as the last token of a sequence built with special tokens.
pad_token (:obj:`string`, `optional`, defaults to "[PAD]"):
The token used for padding, for example when batching sequences of different lengths.
cls_token (:obj:`string`, `optional`, defaults to "[CLS]"):
The classifier token which is used when doing sequence classification (classification of the whole
sequence instead of per-token classification). It is the first token of the sequence when built with
special tokens.
mask_token (:obj:`string`, `optional`, defaults to "[MASK]"):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to tokenize Chinese characters.
This should likely be deactivated for Japanese:
see: https://github.com/huggingface/transformers/issues/328
"""
vocab_files_names = VOCAB_FILES_NAMES
......@@ -149,23 +174,6 @@ class BertTokenizer(PreTrainedTokenizer):
tokenize_chinese_chars=True,
**kwargs
):
"""Constructs a BertTokenizer.
Args:
**vocab_file**: Path to a one-wordpiece-per-line vocabulary file
**do_lower_case**: (`optional`) boolean (default True)
Whether to lower case the input
Only has an effect when do_basic_tokenize=True
**do_basic_tokenize**: (`optional`) boolean (default True)
Whether to do basic tokenization before wordpiece.
**never_split**: (`optional`) list of string
List of tokens which will never be split during tokenization.
Only has an effect when do_basic_tokenize=True
**tokenize_chinese_chars**: (`optional`) boolean (default True)
Whether to tokenize Chinese characters.
This should likely be deactivated for Japanese:
see: https://github.com/huggingface/pytorch-pretrained-BERT/issues/328
"""
super().__init__(
unk_token=unk_token,
sep_token=sep_token,
......@@ -221,13 +229,25 @@ class BertTokenizer(PreTrainedTokenizer):
out_string = " ".join(tokens).replace(" ##", "").strip()
return out_string
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequences for sequence classification tasks
by concatenating and adding special tokens.
A BERT sequence has the following format:
single sequence: [CLS] X [SEP]
pair of sequences: [CLS] A [SEP] B [SEP]
- single sequence: ``[CLS] X [SEP]``
- pair of sequences: ``[CLS] A [SEP] B [SEP]``
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs to which the special tokens will be added
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
"""
if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
......@@ -235,20 +255,23 @@ class BertTokenizer(PreTrainedTokenizer):
sep = [self.sep_token_id]
return cls + token_ids_0 + sep + token_ids_1 + sep
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
Args:
token_ids_0: list of ids (must not contain special tokens)
token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
for sequence pairs
already_has_special_tokens: (default False) Set to True if the token list is already formated with
special tokens for the model
token_ids_0 (:obj:`List[int]`):
List of ids.
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Set to True if the token list is already formatted with special tokens for the model
Returns:
A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
......@@ -263,14 +286,29 @@ class BertTokenizer(PreTrainedTokenizer):
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1]
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
A BERT sequence pair mask has the following format:
0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
| first sequence | second sequence
::
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
if token_ids_1 is None, only returns the first portion of the mask (0's).
Args:
token_ids_0 (:obj:`List[int]`):
List of ids.
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
sequence(s).
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
......@@ -279,7 +317,16 @@ class BertTokenizer(PreTrainedTokenizer):
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
def save_vocabulary(self, vocab_path):
"""Save the tokenizer vocabulary to a directory or file."""
"""
Save the tokenizer vocabulary to a directory or file.
Args:
vocab_path (:obj:`str`):
The directory or file path in which to save the vocabulary.
Returns:
:obj:`Tuple(str)`: Paths to the files saved.
"""
index = 0
if os.path.isdir(vocab_path):
vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["vocab_file"])
......
......@@ -18,6 +18,7 @@
import logging
import os
from shutil import copyfile
from typing import List, Optional
import sentencepiece as spm
......@@ -53,7 +54,50 @@ class CamembertTokenizer(PreTrainedTokenizer):
Adapted from RobertaTokenizer and XLNetTokenizer
SentencePiece based tokenizer. Peculiarities:
- requires `SentencePiece <https://github.com/google/sentencepiece>`_
- requires `SentencePiece <https://github.com/google/sentencepiece>`_
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
should refer to the superclass for more information regarding methods.
Args:
vocab_file (:obj:`str`):
Path to the vocabulary file.
bos_token (:obj:`string`, `optional`, defaults to "<s>"):
The beginning of sequence token that was used during pre-training. Can be used as a sequence classifier token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the beginning
of sequence. The token used is the :obj:`cls_token`.
eos_token (:obj:`string`, `optional`, defaults to "</s>"):
The end of sequence token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the end
of sequence. The token used is the :obj:`sep_token`.
sep_token (:obj:`string`, `optional`, defaults to "</s>"):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
for sequence classification or for a text and a question for question answering.
It is also used as the last token of a sequence built with special tokens.
cls_token (:obj:`string`, `optional`, defaults to "<s>"):
The classifier token which is used when doing sequence classification (classification of the whole
sequence instead of per-token classification). It is the first token of the sequence when built with
special tokens.
unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (:obj:`string`, `optional`, defaults to "<pad>"):
The token used for padding, for example when batching sequences of different lengths.
mask_token (:obj:`string`, `optional`, defaults to "<mask>"):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
Additional special tokens used by the tokenizer.
Attributes:
sp_model (:obj:`SentencePieceProcessor`):
The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
"""
vocab_files_names = VOCAB_FILES_NAMES
......@@ -97,34 +141,50 @@ class CamembertTokenizer(PreTrainedTokenizer):
self.fairseq_tokens_to_ids["<mask>"] = len(self.sp_model) + len(self.fairseq_tokens_to_ids)
self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequences for sequence classification tasks
by concatenating and adding special tokens.
A RoBERTa sequence has the following format:
single sequence: <s> X </s>
pair of sequences: <s> A </s></s> B </s>
A CamemBERT sequence has the following format:
- single sequence: ``<s> X </s>``
- pair of sequences: ``<s> A </s></s> B </s>``
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs to which the special tokens will be added
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
"""
if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
cls = [self.cls_token_id]
sep = [self.sep_token_id]
return cls + token_ids_0 + sep + sep + token_ids_1 + sep
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
Args:
token_ids_0: list of ids (must not contain special tokens)
token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
for sequence pairs
already_has_special_tokens: (default False) Set to True if the token list is already formated with
special tokens for the model
token_ids_0 (:obj:`List[int]`):
List of ids.
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Set to True if the token list is already formatted with special tokens for the model
Returns:
A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
if token_ids_1 is not None:
......@@ -138,14 +198,29 @@ class CamembertTokenizer(PreTrainedTokenizer):
return [1] + ([0] * len(token_ids_0)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
A RoBERTa sequence pair mask has the following format:
0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
| first sequence | second sequence
A CamemBERT sequence pair mask has the following format:
::
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | | second sequence |
if token_ids_1 is None, only returns the first portion of the mask (0s).
if token_ids_1 is None, only returns the first portion of the mask (0's).
Args:
token_ids_0 (:obj:`List[int]`):
List of ids.
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
sequence(s).
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
......@@ -200,8 +275,15 @@ class CamembertTokenizer(PreTrainedTokenizer):
return out_string
def save_vocabulary(self, save_directory):
""" Save the sentencepiece vocabulary (copy original file) and special tokens file
to a directory.
"""
Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory.
Args:
save_directory (:obj:`str`):
The directory in which to save the vocabulary.
Returns:
:obj:`Tuple(str)`: Paths to the files saved.
"""
if not os.path.isdir(save_directory):
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
......
......@@ -116,8 +116,21 @@ def get_pairs(word):
class CTRLTokenizer(PreTrainedTokenizer):
"""
CTRL BPE tokenizer. Peculiarities:
- Byte-Pair-Encoding
Constructs a CTRL tokenizer. Peculiarities:
- Byte-Pair-Encoding
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
should refer to the superclass for more information regarding methods.
Args:
vocab_file (:obj:`str`):
Path to the vocabulary file.
merges_file (:obj:`str`):
Path to the merges file.
unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
"""
vocab_files_names = VOCAB_FILES_NAMES
......@@ -219,7 +232,16 @@ class CTRLTokenizer(PreTrainedTokenizer):
return out_string
def save_vocabulary(self, save_directory):
"""Save the tokenizer vocabulary and merge files to a directory."""
"""
Save the vocabulary and special tokens file to a directory.
Args:
save_directory (:obj:`str`):
The directory in which to save the vocabulary.
Returns:
:obj:`Tuple(str)`: Paths to the files saved.
"""
if not os.path.isdir(save_directory):
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
return
......
......@@ -58,16 +58,11 @@ PRETRAINED_INIT_CONFIGURATION = {
class DistilBertTokenizer(BertTokenizer):
r"""
Constructs a DistilBertTokenizer.
:class:`~transformers.DistilBertTokenizer` is identical to BertTokenizer and runs end-to-end tokenization: punctuation splitting + wordpiece
Args:
vocab_file: Path to a one-wordpiece-per-line vocabulary file
do_lower_case: Whether to lower case the input. Only has an effect when do_basic_tokenize=True
do_basic_tokenize: Whether to do basic tokenization before wordpiece.
max_len: An artificial maximum length to truncate tokenized sequences to; Effective maximum length is always the
minimum of this value (if specified) and the underlying BERT model's sequence length.
never_split: List of tokens which will never be split during tokenization. Only has an effect when
do_basic_tokenize=True
:class:`~transformers.DistilBertTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end
tokenization: punctuation splitting + wordpiece.
Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
parameters.
"""
vocab_files_names = VOCAB_FILES_NAMES
......
......@@ -80,14 +80,14 @@ class FlaubertTokenizer(XLMTokenizer):
"""
BPE tokenizer for Flaubert
- Moses preprocessing & tokenization
- Normalize all inputs text
- argument ``special_tokens`` and function ``set_special_tokens``, can be used to add additional symbols \
(ex: "__classify__") to a vocabulary
- `do_lowercase` controle lower casing (automatically set for pretrained vocabularies)
- Moses preprocessing & tokenization
- Normalize all inputs text
- argument ``special_tokens`` and function ``set_special_tokens``, can be used to add additional symbols \
(ex: "__classify__") to a vocabulary
- `do_lowercase` controls lower casing (automatically set for pretrained vocabularies)
This tokenizer inherits from :class:`~transformers.XLMTokenizer`. Please check the superclass for usage examples
and documentation regarding arguments.
"""
vocab_files_names = VOCAB_FILES_NAMES
......
......@@ -101,11 +101,35 @@ def get_pairs(word):
class GPT2Tokenizer(PreTrainedTokenizer):
"""
GPT-2 BPE tokenizer. Peculiarities:
- Byte-level Byte-Pair-Encoding
- Requires a space to start the input string => the encoding and tokenize methods should be called with the
``add_prefix_space`` flag set to ``True``.
Otherwise, this tokenizer's ``encode``, ``decode``, and ``tokenize`` methods will not conserve
the spaces at the beginning of a string: `tokenizer.decode(tokenizer.encode(" Hello")) = "Hello"`
- Byte-level Byte-Pair-Encoding
- Requires a space to start the input string => the encoding methods should be called with the
``add_prefix_space`` flag set to ``True``.
Otherwise, this tokenizer's ``encode`` and ``decode`` methods will not conserve
the absence of a space at the beginning of a string:
::
tokenizer.decode(tokenizer.encode("Hello")) = " Hello"
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
should refer to the superclass for more information regarding methods.
Args:
vocab_file (:obj:`str`):
Path to the vocabulary file.
merges_file (:obj:`str`):
Path to the merges file.
errors (:obj:`str`, `optional`, defaults to "replace"):
Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode
<https://docs.python.org/3/library/stdtypes.html#bytes.decode>`__ for more information.
unk_token (:obj:`string`, `optional`, defaults to `<|endoftext|>`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
bos_token (:obj:`string`, `optional`, defaults to `<|endoftext|>`):
The beginning of sequence token.
eos_token (:obj:`string`, `optional`, defaults to `<|endoftext|>`):
The end of sequence token.
"""
vocab_files_names = VOCAB_FILES_NAMES
......@@ -219,7 +243,16 @@ class GPT2Tokenizer(PreTrainedTokenizer):
return text
def save_vocabulary(self, save_directory):
"""Save the tokenizer vocabulary and merge files to a directory."""
"""
Save the vocabulary and special tokens file to a directory.
Args:
save_directory (:obj:`str`):
The directory in which to save the vocabulary.
Returns:
:obj:`Tuple(str)`: Paths to the files saved.
"""
if not os.path.isdir(save_directory):
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
return
......
......@@ -82,8 +82,21 @@ def text_standardize(text):
class OpenAIGPTTokenizer(PreTrainedTokenizer):
"""
BPE tokenizer. Peculiarities:
- lower case all inputs
- uses SpaCy tokenizer and ftfy for pre-BPE tokenization if they are installed, fallback to BERT's BasicTokenizer if not.
- lower case all inputs
- uses SpaCy tokenizer and ftfy for pre-BPE tokenization if they are installed, fallback to BERT's BasicTokenizer if not.
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
should refer to the superclass for more information regarding methods.
Args:
vocab_file (:obj:`str`):
Path to the vocabulary file.
merges_file (:obj:`str`):
Path to the merges file.
unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
"""
vocab_files_names = VOCAB_FILES_NAMES
......@@ -201,7 +214,16 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
return out_string
def save_vocabulary(self, save_directory):
"""Save the tokenizer vocabulary and merge files to a directory."""
"""
Save the vocabulary and special tokens file to a directory.
Args:
save_directory (:obj:`str`):
The directory in which to save the vocabulary.
Returns:
:obj:`Tuple(str)`: Paths to the files saved.
"""
if not os.path.isdir(save_directory):
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
return
......
......@@ -16,6 +16,7 @@
import logging
from typing import List, Optional
from tokenizers.processors import RobertaProcessing
......@@ -60,12 +61,59 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
class RobertaTokenizer(GPT2Tokenizer):
"""
RoBERTa BPE tokenizer, derived from the GPT-2 tokenizer. Peculiarities:
- Byte-level Byte-Pair-Encoding
- Requires a space to start the input string => the encoding methods should be called with the
``add_prefix_space`` flag set to ``True``.
Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve
the absence of a space at the beginning of a string: `tokenizer.decode(tokenizer.encode("Hello")) = " Hello"`
Constructs a RoBERTa BPE tokenizer, derived from the GPT-2 tokenizer. Peculiarities:
- Byte-level Byte-Pair-Encoding
- Requires a space to start the input string => the encoding methods should be called with the
``add_prefix_space`` flag set to ``True``.
Otherwise, this tokenizer's ``encode`` and ``decode`` methods will not conserve
the absence of a space at the beginning of a string:
::
tokenizer.decode(tokenizer.encode("Hello")) = " Hello"
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
should refer to the superclass for more information regarding methods.
Args:
vocab_file (:obj:`str`):
Path to the vocabulary file.
merges_file (:obj:`str`):
Path to the merges file.
errors (:obj:`str`, `optional`, defaults to "replace"):
Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode
<https://docs.python.org/3/library/stdtypes.html#bytes.decode>`__ for more information.
bos_token (:obj:`string`, `optional`, defaults to "<s>"):
The beginning of sequence token that was used during pre-training. Can be used as a sequence classifier token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the beginning
of sequence. The token used is the :obj:`cls_token`.
eos_token (:obj:`string`, `optional`, defaults to "</s>"):
The end of sequence token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the end
of sequence. The token used is the :obj:`sep_token`.
sep_token (:obj:`string`, `optional`, defaults to "</s>"):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
for sequence classification or for a text and a question for question answering.
It is also used as the last token of a sequence built with special tokens.
cls_token (:obj:`string`, `optional`, defaults to "<s>"):
The classifier token which is used when doing sequence classification (classification of the whole
sequence instead of per-token classification). It is the first token of the sequence when built with
special tokens.
unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (:obj:`string`, `optional`, defaults to "<pad>"):
The token used for padding, for example when batching sequences of different lengths.
mask_token (:obj:`string`, `optional`, defaults to "<mask>"):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
"""
vocab_files_names = VOCAB_FILES_NAMES
......@@ -102,13 +150,25 @@ class RobertaTokenizer(GPT2Tokenizer):
self.max_len_single_sentence = self.max_len - 2 # take into account special tokens
self.max_len_sentences_pair = self.max_len - 4 # take into account special tokens
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequences for sequence classification tasks
by concatenating and adding special tokens.
A RoBERTa sequence has the following format:
single sequence: <s> X </s>
pair of sequences: <s> A </s></s> B </s>
- single sequence: ``<s> X </s>``
- pair of sequences: ``<s> A </s></s> B </s>``
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs to which the special tokens will be added
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
"""
if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
......@@ -116,20 +176,23 @@ class RobertaTokenizer(GPT2Tokenizer):
sep = [self.sep_token_id]
return cls + token_ids_0 + sep + sep + token_ids_1 + sep
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
Args:
token_ids_0: list of ids (must not contain special tokens)
token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
for sequence pairs
already_has_special_tokens: (default False) Set to True if the token list is already formated with
special tokens for the model
token_ids_0 (:obj:`List[int]`):
List of ids.
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Set to True if the token list is already formatted with special tokens for the model
Returns:
A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
if token_ids_1 is not None:
......@@ -143,12 +206,22 @@ class RobertaTokenizer(GPT2Tokenizer):
return [1] + ([0] * len(token_ids_0)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
RoBERTa does not make use of token type ids, therefore a list of zeros is returned.
if token_ids_1 is None, only returns the first portion of the mask (0's).
Args:
token_ids_0 (:obj:`List[int]`):
List of ids.
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of zeros.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
......
......@@ -72,6 +72,9 @@ CORPUS_NAME = "corpus.bin"
class TransfoXLTokenizer(PreTrainedTokenizer):
"""
Transformer-XL tokenizer adapted from Vocab class in https://github.com/kimiyoung/transformer-xl
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
should refer to the superclass for more information regarding methods.
"""
vocab_files_names = VOCAB_FILES_NAMES
......@@ -189,7 +192,16 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
raise ValueError("No <unkown> token in vocabulary")
def save_vocabulary(self, vocab_path):
"""Save the tokenizer vocabulary to a directory or file."""
"""
Save the vocabulary and special tokens file to a directory.
Args:
vocab_path (:obj:`str`):
The directory in which to save the vocabulary.
Returns:
:obj:`Tuple(str)`: Paths to the files saved.
"""
logger.warning(
"Please note you will not be able to load the save vocabulary in"
......
......@@ -1012,6 +1012,12 @@ class PreTrainedTokenizer(object):
"https://github.com/huggingface/transformers/pull/2674"
)
# Throw an error if we cannot pad because there is no padding token
if pad_to_max_length and self.pad_token_id is None:
raise ValueError(
"Unable to set proper padding strategy as the tokenizer does not have a padding token. In this case please set the `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via the function add_special_tokens if you want to use a padding strategy"
)
first_ids = get_input_ids(text)
second_ids = get_input_ids(text_pair) if text_pair is not None else None
......@@ -1115,6 +1121,12 @@ class PreTrainedTokenizer(object):
"Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers."
)
# Throw an error if we cannot pad because there is no padding token
if pad_to_max_length and self.pad_token_id is None:
raise ValueError(
"Unable to set proper padding strategy as the tokenizer does not have a padding token. In this case please set the `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via the function add_special_tokens if you want to use a padding strategy"
)
if return_offsets_mapping:
raise NotImplementedError(
"return_offset_mapping is not available when using Python tokenizers."
......@@ -1126,8 +1138,7 @@ class PreTrainedTokenizer(object):
input_ids = []
for ids_or_pair_ids in batch_text_or_text_pairs:
if isinstance(ids_or_pair_ids, (list, tuple)):
assert len(ids_or_pair_ids) == 2
if isinstance(ids_or_pair_ids, (list, tuple)) and len(ids_or_pair_ids) == 2:
ids, pair_ids = ids_or_pair_ids
else:
ids, pair_ids = ids_or_pair_ids, None
......@@ -1789,7 +1800,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizer):
# Throw an error if we cannot pad because there is no padding token
if pad_to_max_length and self.pad_token_id is None:
raise ValueError("Unable to set proper padding strategy as the tokenizer does have padding token")
raise ValueError("Unable to set proper padding strategy as the tokenizer does not have a padding token")
# Set the truncation and padding strategy and restore the initial configuration
with truncate_and_pad(
......
......@@ -21,6 +21,7 @@ import os
import re
import sys
import unicodedata
from typing import List, Optional
import sacremoses as sm
......@@ -530,20 +531,59 @@ class XLMTokenizer(PreTrainedTokenizer):
"""
BPE tokenizer for XLM
- Moses preprocessing & tokenization for most supported languages
- Language specific tokenization for Chinese (Jieba), Japanese (KyTea) and Thai (PyThaiNLP)
- (optionally) lower case & normalize all inputs text
- argument ``special_tokens`` and function ``set_special_tokens``, can be used to add additional symbols \
(ex: "__classify__") to a vocabulary
- `lang2id` attribute maps the languages supported by the model with their ids if provided (automatically set for pretrained vocabularies)
- `id2lang` attributes does reverse mapping if provided (automatically set for pretrained vocabularies)
- `do_lowercase_and_remove_accent` controle lower casing and accent (automatically set for pretrained vocabularies)
- Moses preprocessing & tokenization for most supported languages
- Language specific tokenization for Chinese (Jieba), Japanese (KyTea) and Thai (PyThaiNLP)
- (optionally) lower case & normalize all inputs text
- argument ``special_tokens`` and function ``set_special_tokens``, can be used to add additional symbols \
(ex: "__classify__") to a vocabulary
- `lang2id` attribute maps the languages supported by the model with their ids if provided (automatically set for pretrained vocabularies)
- `id2lang` attribute does the reverse mapping if provided (automatically set for pretrained vocabularies)
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
should refer to the superclass for more information regarding methods.
Args:
vocab_file (:obj:`string`):
Vocabulary file.
merges_file (:obj:`string`):
Merges file.
do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to lowercase the input when tokenizing.
remove_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to strip the text when tokenizing (removing excess spaces before and after the string).
keep_accents (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether to keep accents when tokenizing.
unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
bos_token (:obj:`string`, `optional`, defaults to "<s>"):
The beginning of sequence token that was used during pre-training. Can be used as a sequence classifier token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the beginning
of sequence. The token used is the :obj:`cls_token`.
sep_token (:obj:`string`, `optional`, defaults to "</s>"):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
for sequence classification or for a text and a question for question answering.
It is also used as the last token of a sequence built with special tokens.
pad_token (:obj:`string`, `optional`, defaults to "<pad>"):
The token used for padding, for example when batching sequences of different lengths.
cls_token (:obj:`string`, `optional`, defaults to "</s>"):
The classifier token which is used when doing sequence classification (classification of the whole
sequence instead of per-token classification). It is the first token of the sequence when built with
special tokens.
mask_token (:obj:`string`, `optional`, defaults to "<special1>"):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<special0>","<special1>","<special2>","<special3>","<special4>","<special5>","<special6>","<special7>","<special8>","<special9>"]`):
List of additional special tokens.
lang2id (:obj:`Dict[str, int]`, `optional`, defaults to :obj:`None`):
Dictionary mapping languages string identifiers to their IDs.
id2lang (:obj:`Dict[int, str]`, `optional`, defaults to :obj:`None`):
Dictionary mapping language IDs to their string identifiers.
do_lowercase_and_remove_accent (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to lowercase and remove accents when tokenizing.
"""
vocab_files_names = VOCAB_FILES_NAMES
......@@ -812,13 +852,26 @@ class XLMTokenizer(PreTrainedTokenizer):
out_string = "".join(tokens).replace("</w>", " ").strip()
return out_string
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequences for sequence classification tasks
by concatenating and adding special tokens.
An XLM sequence has the following format:
single sequence: <s> X </s>
pair of sequences: <s> A </s> B </s>
- single sequence: ``<s> X </s>``
- pair of sequences: ``<s> A </s> B </s>``
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs to which the special tokens will be added
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
"""
if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
......@@ -826,20 +879,23 @@ class XLMTokenizer(PreTrainedTokenizer):
cls = [self.cls_token_id]
return cls + token_ids_0 + sep + token_ids_1 + sep
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
Args:
token_ids_0: list of ids (must not contain special tokens)
token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
for sequence pairs
already_has_special_tokens: (default False) Set to True if the token list is already formated with
special tokens for the model
token_ids_0 (:obj:`List[int]`):
List of ids.
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Set to True if the token list is already formatted with special tokens for the model
Returns:
A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
......@@ -854,14 +910,29 @@ class XLMTokenizer(PreTrainedTokenizer):
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1]
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
An XLM sequence pair mask has the following format:
0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
| first sequence | second sequence
if token_ids_1 is None, only returns the first portion of the mask (0's).
::
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
if token_ids_1 is None, only returns the first portion of the mask (0s).
Args:
token_ids_0 (:obj:`List[int]`):
List of ids.
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
sequence(s).
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
......@@ -870,7 +941,16 @@ class XLMTokenizer(PreTrainedTokenizer):
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
def save_vocabulary(self, save_directory):
"""Save the tokenizer vocabulary and merge files to a directory."""
"""
Save the vocabulary and special tokens file to a directory.
Args:
save_directory (:obj:`str`):
The directory in which to save the vocabulary.
Returns:
:obj:`Tuple(str)`: Paths to the files saved.
"""
if not os.path.isdir(save_directory):
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
return
......
......@@ -18,6 +18,7 @@
import logging
import os
from shutil import copyfile
from typing import List, Optional
from transformers.tokenization_utils import PreTrainedTokenizer
......@@ -54,7 +55,50 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
Adapted from RobertaTokenizer and XLNetTokenizer
SentencePiece based tokenizer. Peculiarities:
- requires `SentencePiece <https://github.com/google/sentencepiece>`_
- requires `SentencePiece <https://github.com/google/sentencepiece>`_
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
should refer to the superclass for more information regarding methods.
Args:
vocab_file (:obj:`str`):
Path to the vocabulary file.
bos_token (:obj:`string`, `optional`, defaults to "<s>"):
The beginning of sequence token that was used during pre-training. Can be used as a sequence classifier token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the beginning
of sequence. The token used is the :obj:`cls_token`.
eos_token (:obj:`string`, `optional`, defaults to "</s>"):
The end of sequence token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the end
of sequence. The token used is the :obj:`sep_token`.
sep_token (:obj:`string`, `optional`, defaults to "</s>"):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
for sequence classification or for a text and a question for question answering.
It is also used as the last token of a sequence built with special tokens.
cls_token (:obj:`string`, `optional`, defaults to "<s>"):
The classifier token which is used when doing sequence classification (classification of the whole
sequence instead of per-token classification). It is the first token of the sequence when built with
special tokens.
unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (:obj:`string`, `optional`, defaults to "<pad>"):
The token used for padding, for example when batching sequences of different lengths.
mask_token (:obj:`string`, `optional`, defaults to "<mask>"):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
Additional special tokens used by the tokenizer.
Attributes:
sp_model (:obj:`SentencePieceProcessor`):
The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
"""
vocab_files_names = VOCAB_FILES_NAMES
......@@ -132,35 +176,52 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
self.sp_model = spm.SentencePieceProcessor()
self.sp_model.Load(self.vocab_file)
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequences for sequence classification tasks
by concatenating and adding special tokens.
A RoBERTa sequence has the following format:
single sequence: <s> X </s>
pair of sequences: <s> A </s></s> B </s>
An XLM-R sequence has the following format:
- single sequence: ``<s> X </s>``
- pair of sequences: ``<s> A </s></s> B </s>``
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs to which the special tokens will be added
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
"""
if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
cls = [self.cls_token_id]
sep = [self.sep_token_id]
return cls + token_ids_0 + sep + sep + token_ids_1 + sep
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
Args:
token_ids_0: list of ids (must not contain special tokens)
token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
for sequence pairs
already_has_special_tokens: (default False) Set to True if the token list is already formated with
special tokens for the model
token_ids_0 (:obj:`List[int]`):
List of ids.
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Set to True if the token list is already formatted with special tokens for the model
Returns:
A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
if token_ids_1 is not None:
raise ValueError(
......@@ -173,12 +234,24 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
return [1] + ([0] * len(token_ids_0)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
RoBERTa does not make use of token type ids, therefore a list of zeros is returned.
if token_ids_1 is None, only returns the first portion of the mask (0's).
XLM-R does not make use of token type ids, therefore a list of zeros is returned.
Args:
token_ids_0 (:obj:`List[int]`):
List of ids.
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of zeros.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
......@@ -216,8 +289,15 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
return out_string
def save_vocabulary(self, save_directory):
""" Save the sentencepiece vocabulary (copy original file) and special tokens file
to a directory.
"""
Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory.
Args:
save_directory (:obj:`str`):
The directory in which to save the vocabulary.
Returns:
:obj:`Tuple(str)`: Paths to the files saved.
"""
if not os.path.isdir(save_directory):
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
......
......@@ -19,6 +19,7 @@ import logging
import os
import unicodedata
from shutil import copyfile
from typing import List, Optional
from .tokenization_utils import PreTrainedTokenizer
......@@ -51,9 +52,57 @@ SEG_ID_PAD = 4
class XLNetTokenizer(PreTrainedTokenizer):
"""
SentencePiece based tokenizer. Peculiarities:
- requires `SentencePiece <https://github.com/google/sentencepiece>`_
Constructs an XLNet tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
should refer to the superclass for more information regarding methods.
Args:
vocab_file (:obj:`string`):
`SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a .spm extension) that
contains the vocabulary necessary to instantiate a tokenizer.
do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to lowercase the input when tokenizing.
remove_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to strip the text when tokenizing (removing excess spaces before and after the string).
keep_accents (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether to keep accents when tokenizing.
bos_token (:obj:`string`, `optional`, defaults to "<s>"):
The beginning of sequence token that was used during pre-training. Can be used as a sequence classifier token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the beginning
of sequence. The token used is the :obj:`cls_token`.
eos_token (:obj:`string`, `optional`, defaults to "</s>"):
The end of sequence token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the end
of sequence. The token used is the :obj:`sep_token`.
unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
sep_token (:obj:`string`, `optional`, defaults to "<sep>"):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
for sequence classification or for a text and a question for question answering.
It is also used as the last token of a sequence built with special tokens.
pad_token (:obj:`string`, `optional`, defaults to "<pad>"):
The token used for padding, for example when batching sequences of different lengths.
cls_token (:obj:`string`, `optional`, defaults to "<cls>"):
The classifier token which is used when doing sequence classification (classification of the whole
sequence instead of per-token classification). It is the first token of the sequence when built with
special tokens.
mask_token (:obj:`string`, `optional`, defaults to "<mask>"):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<eop>", "<eod>"]`):
Additional special tokens used by the tokenizer.
Attributes:
sp_model (:obj:`SentencePieceProcessor`):
The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
"""
vocab_files_names = VOCAB_FILES_NAMES
......@@ -189,13 +238,25 @@ class XLNetTokenizer(PreTrainedTokenizer):
out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
return out_string
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequences for sequence classification tasks
by concatenating and adding special tokens.
An XLNet sequence has the following format:
single sequence: X <sep> <cls>
pair of sequences: A <sep> B <sep> <cls>
- single sequence: ``X <sep> <cls>``
- pair of sequences: ``A <sep> B <sep> <cls>``
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs to which the special tokens will be added
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
"""
sep = [self.sep_token_id]
cls = [self.cls_token_id]
......@@ -203,20 +264,23 @@ class XLNetTokenizer(PreTrainedTokenizer):
return token_ids_0 + sep + cls
return token_ids_0 + sep + token_ids_1 + sep + cls
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
Args:
token_ids_0: list of ids (must not contain special tokens)
token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
for sequence pairs
already_has_special_tokens: (default False) Set to True if the token list is already formated with
special tokens for the model
token_ids_0 (:obj:`List[int]`):
List of ids.
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Set to True if the token list is already formatted with special tokens for the model
Returns:
A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
......@@ -231,7 +295,9 @@ class XLNetTokenizer(PreTrainedTokenizer):
return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1, 1]
return ([0] * len(token_ids_0)) + [1, 1]
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
An XLNet sequence pair mask has the following format:
......@@ -239,6 +305,16 @@ class XLNetTokenizer(PreTrainedTokenizer):
| first sequence | second sequence | CLS segment ID
if token_ids_1 is None, only returns the first portion of the mask (0's).
Args:
token_ids_0 (:obj:`List[int]`):
List of ids.
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
sequence(s).
"""
sep = [self.sep_token_id]
cls_segment_id = [2]
......@@ -248,8 +324,15 @@ class XLNetTokenizer(PreTrainedTokenizer):
return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + cls_segment_id
def save_vocabulary(self, save_directory):
""" Save the sentencepiece vocabulary (copy original file) and special tokens file
to a directory.
"""
Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory.
Args:
save_directory (:obj:`str`):
The directory in which to save the vocabulary.
Returns:
:obj:`Tuple(str)`: Paths to the files saved.
"""
if not os.path.isdir(save_directory):
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
......
......@@ -594,7 +594,7 @@ def main():
# Setup CUDA, GPU & distributed training
if args.local_rank == -1 or args.no_cuda:
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
args.n_gpu = torch.cuda.device_count()
args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
else: # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
torch.cuda.set_device(args.local_rank)
device = torch.device("cuda", args.local_rank)
......
......@@ -57,8 +57,18 @@ class ConfigTester(object):
self.parent.assertEqual(config_second.to_dict(), config_first.to_dict())
def create_and_test_config_with_num_labels(self):
config = self.config_class(**self.inputs_dict, num_labels=5)
self.parent.assertEqual(len(config.id2label), 5)
self.parent.assertEqual(len(config.label2id), 5)
config.num_labels = 3
self.parent.assertEqual(len(config.id2label), 3)
self.parent.assertEqual(len(config.label2id), 3)
def run_common_tests(self):
self.create_and_test_config_common_properties()
self.create_and_test_config_to_json_string()
self.create_and_test_config_to_json_file()
self.create_and_test_config_from_and_save_pretrained()
self.create_and_test_config_with_num_labels()
......@@ -78,6 +78,7 @@ class TestCodeExamples(unittest.TestCase):
for file in files:
# Open all files
print("Testing", file, end=" ")
with open(os.path.join(directory, file)) as f:
# Retrieve examples
examples = get_examples_from_file(f)
......@@ -99,7 +100,7 @@ class TestCodeExamples(unittest.TestCase):
joined_examples.append(example)
joined_examples_index += 1
print("Testing", file, str(len(joined_examples)) + "/" + str(len(joined_examples)))
print(str(len(joined_examples)) + "/" + str(len(joined_examples)))
# Execute sub tests with every example.
for index, code_example in enumerate(joined_examples):
......@@ -114,7 +115,8 @@ class TestCodeExamples(unittest.TestCase):
def test_main_doc_examples(self):
doc_directory = "docs/source"
self.analyze_directory(doc_directory)
ignore_files = ["favicon.ico"]
self.analyze_directory(doc_directory, ignore_files=ignore_files)
def test_modeling_examples(self):
transformers_directory = "src/transformers"
......@@ -125,5 +127,7 @@ class TestCodeExamples(unittest.TestCase):
"modeling_tf_auto.py",
"modeling_utils.py",
"modeling_tf_t5.py",
"modeling_bart.py",
"modeling_tf_utils.py",
]
self.analyze_directory(transformers_directory, identifier=modeling_files, ignore_files=ignore_files)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment