Unverified Commit 146c5212 authored by Lysandre Debut, committed by GitHub

Merge branch 'master' into add_models_special_tokens_to_specific_configs

parents f5b50c6b b623ddc0
@@ -935,7 +935,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
     def get_output_embeddings(self):
         return self.lm_loss

-    def prepare_inputs_for_generation(self, input_ids, **model_kwargs):
+    def prepare_inputs_for_generation(self, input_ids, past, **model_kwargs):
         # Add dummy token at the end (no attention on this one)
         effective_batch_size = input_ids.shape[0]
@@ -958,8 +958,8 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
         inputs = {"input_ids": input_ids, "perm_mask": perm_mask, "target_mapping": target_mapping}

         # if past is defined in model kwargs then use it for faster decoding
-        if "past" in model_kwargs and model_kwargs["past"]:
-            inputs["mems"] = model_kwargs["past"]
+        if past:
+            inputs["mems"] = past

         return inputs
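For orientation only (this sketch is not part of the diff): the new `past` argument carries previously cached XLNet memories, which end up under the `mems` key of the prepared inputs. The checkpoint name and tensor shapes below are illustrative assumptions.

    import torch
    from transformers import XLNetLMHeadModel, XLNetTokenizer

    tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
    model = XLNetLMHeadModel.from_pretrained("xlnet-base-cased")

    input_ids = tokenizer.encode("Hello", return_tensors="pt")
    dummy_mems = [torch.zeros(3, 1, model.config.d_model)]  # stand-in for previously cached memories
    inputs = model.prepare_inputs_for_generation(input_ids, past=dummy_mems)
    assert "mems" in inputs  # the cache is forwarded so decoding can reuse it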
@@ -1264,8 +1264,10 @@ class XLNetForTokenClassification(XLNetPreTrainedModel):
             # Only keep active parts of the loss
             if attention_mask is not None:
                 active_loss = attention_mask.view(-1) == 1
-                active_logits = logits.view(-1, self.num_labels)[active_loss]
-                active_labels = labels.view(-1)[active_loss]
+                active_logits = logits.view(-1, self.num_labels)
+                active_labels = torch.where(
+                    active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
+                )
                 loss = loss_fct(active_logits, active_labels)
             else:
                 loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
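Both masking strategies give the same loss: the old one indexes out the active positions, the new one feeds everything and marks inactive positions with `ignore_index`, which `CrossEntropyLoss` skips. A self-contained check with made-up shapes (illustrative only, not part of the diff):

    import torch
    from torch.nn import CrossEntropyLoss

    loss_fct = CrossEntropyLoss()
    logits = torch.randn(2, 4, 3)                    # (batch, seq_len, num_labels)
    labels = torch.randint(0, 3, (2, 4))
    attention_mask = torch.tensor([[1, 1, 0, 0], [1, 1, 1, 0]])

    active_loss = attention_mask.view(-1) == 1
    # Old behaviour: keep only the active positions.
    old_loss = loss_fct(logits.view(-1, 3)[active_loss], labels.view(-1)[active_loss])
    # New behaviour: keep everything, set inactive labels to ignore_index.
    active_labels = torch.where(
        active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
    )
    new_loss = loss_fct(logits.view(-1, 3), active_labels)
    assert torch.allclose(old_loss, new_loss)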
...
@@ -28,6 +28,7 @@ from typing import Dict, List, Optional, Tuple, Union
 import numpy as np

 from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, AutoConfig
+from .configuration_bart import BartConfig
 from .configuration_distilbert import DistilBertConfig
 from .configuration_roberta import RobertaConfig
 from .configuration_utils import PretrainedConfig
@@ -279,6 +280,9 @@ class _ScikitCompat(ABC):
 class Pipeline(_ScikitCompat):
     """
+    The Pipeline class is the class from which all pipelines inherit. Refer to this class for methods shared across
+    different pipelines.
+
     Base class implementing pipelined operations.

     Pipeline workflow is defined as a sequence of the following operations:

         Input -> Tokenization -> Model Inference -> Post-Processing (Task dependent) -> Output
@@ -292,39 +296,49 @@ class Pipeline(_ScikitCompat):
     pickle format.

     Arguments:
-        **model**: ``(str, PretrainedModel, TFPretrainedModel)``:
-            Reference to the model to use through this pipeline.
-        **tokenizer**: ``(str, PreTrainedTokenizer)``:
-            Reference to the tokenizer to use through this pipeline.
-        **args_parser**: ``ArgumentHandler``:
+        model (:obj:`str` or :obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`, `optional`, defaults to :obj:`None`):
+            The model that will be used by the pipeline to make predictions. This can be :obj:`None`, a string
+            checkpoint identifier or an actual pre-trained model inheriting from
+            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
+            TensorFlow.
+            If :obj:`None`, the default of the pipeline will be loaded.
+        tokenizer (:obj:`str` or :obj:`~transformers.PreTrainedTokenizer`, `optional`, defaults to :obj:`None`):
+            The tokenizer that will be used by the pipeline to encode data for the model. This can be :obj:`None`,
+            a string checkpoint identifier or an actual pre-trained tokenizer inheriting from
+            :class:`~transformers.PreTrainedTokenizer`.
+            If :obj:`None`, the default of the pipeline will be loaded.
+        modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
+            Model card attributed to the model for this pipeline.
+        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
+            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
+            installed.
+            If no framework is specified, will default to the one currently installed. If no framework is specified
+            and both frameworks are installed, will default to PyTorch.
+        args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
             Reference to the object in charge of parsing supplied pipeline parameters.
-        **device**: ``int``:
+        device (:obj:`int`, `optional`, defaults to :obj:`-1`):
             Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model
             on the associated CUDA device id.
-        **binary_output** ``bool`` (default: False):
+        binary_output (:obj:`bool`, `optional`, defaults to :obj:`False`):
             Flag indicating if the output the pipeline should happen in a binary format (i.e. pickle) or as raw text.

     Return:
+        :obj:`List` or :obj:`Dict`:
         Pipeline returns list or dictionary depending on:
-        - Does the user provided multiple sample
-        - The pipeline expose multiple fields in the output object
+        - Whether the user supplied multiple samples
+        - Whether the pipeline exposes multiple fields in the output object
-    Examples:
-        nlp = pipeline('ner')
-        nlp = pipeline('ner', model='...', config='...', tokenizer='...')
-        nlp = NerPipeline(model='...', config='...', tokenizer='...')
-        nlp = QuestionAnsweringPipeline(model=AutoModel.from_pretrained('...'), tokenizer='...')
     """
     default_input_names = None
+    task = None

     def __init__(
         self,
-        model,
+        model: Optional = None,
         tokenizer: PreTrainedTokenizer = None,
         modelcard: Optional[ModelCard] = None,
         framework: Optional[str] = None,
@@ -336,6 +350,8 @@ class Pipeline(_ScikitCompat):
         if framework is None:
             framework = get_framework()

+        model, tokenizer = self.get_defaults(model, tokenizer, framework)
+
         self.model = model
         self.tokenizer = tokenizer
         self.modelcard = modelcard
@@ -412,7 +428,7 @@ class Pipeline(_ScikitCompat):
         """
         args = ["input_ids", "attention_mask"]

-        if not isinstance(self.model.config, (DistilBertConfig, XLMConfig, RobertaConfig)):
+        if not isinstance(self.model.config, (DistilBertConfig, XLMConfig, RobertaConfig, BartConfig)):
             args += ["token_type_ids"]

         # PR #1548 (CLI) There is an issue with attention_mask
@@ -467,15 +483,74 @@ class Pipeline(_ScikitCompat):
         else:
             return predictions.numpy()
+    def get_defaults(self, model, tokenizer, framework):
+        task_defaults = SUPPORTED_TASKS[self.task]
+        if model is None:
+            if framework == "tf":
+                model = task_defaults["tf"].from_pretrained(task_defaults["default"]["model"]["tf"])
+            elif framework == "pt":
+                model = task_defaults["pt"].from_pretrained(task_defaults["default"]["model"]["pt"])
+            else:
+                raise ValueError("Provided framework should be either 'tf' for TensorFlow or 'pt' for PyTorch.")
+
+        if tokenizer is None:
+            default_tokenizer = task_defaults["default"]["tokenizer"]
+            if isinstance(default_tokenizer, tuple):
+                # For tuple we have (tokenizer name, {kwargs})
+                tokenizer = AutoTokenizer.from_pretrained(default_tokenizer[0], **default_tokenizer[1])
+            else:
+                tokenizer = AutoTokenizer.from_pretrained(default_tokenizer)
+
+        return model, tokenizer
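With `task` set on each subclass and `get_defaults` called from `__init__`, a pipeline can be built without naming a model or tokenizer at all. A minimal usage sketch (illustrative only; it downloads the default checkpoint for the task, and the printed score is a made-up placeholder):

    from transformers import pipeline

    nlp = pipeline("sentiment-analysis")      # model and tokenizer fall back to the task defaults
    print(nlp("This is great!"))              # e.g. [{'label': 'POSITIVE', 'score': 0.999}]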
 class FeatureExtractionPipeline(Pipeline):
     """
-    Feature extraction pipeline using Model head.
+    Feature extraction pipeline using Model head. This pipeline extracts the hidden states from the base transformer,
+    which can be used as features in a downstream tasks.
+
+    This feature extraction pipeline can currently be loaded from the :func:`~transformers.pipeline` method using
+    the following task identifier(s):
+
+    - "feature-extraction", for extracting features of a sequence.
+
+    All models may be used for this pipeline. See a list of all models, including community-contributed models on
+    `huggingface.co/models <https://huggingface.co/models>`__.
+
+    Arguments:
+        model (:obj:`str` or :obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`, `optional`, defaults to :obj:`None`):
+            The model that will be used by the pipeline to make predictions. This can be :obj:`None`, a string
+            checkpoint identifier or an actual pre-trained model inheriting from
+            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
+            TensorFlow.
+            If :obj:`None`, the default of the pipeline will be loaded.
+        tokenizer (:obj:`str` or :obj:`~transformers.PreTrainedTokenizer`, `optional`, defaults to :obj:`None`):
+            The tokenizer that will be used by the pipeline to encode data for the model. This can be :obj:`None`,
+            a string checkpoint identifier or an actual pre-trained tokenizer inheriting from
+            :class:`~transformers.PreTrainedTokenizer`.
+            If :obj:`None`, the default of the pipeline will be loaded.
+        modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
+            Model card attributed to the model for this pipeline.
+        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
+            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
+            installed.
+            If no framework is specified, will default to the one currently installed. If no framework is specified
+            and both frameworks are installed, will default to PyTorch.
+        args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
+            Reference to the object in charge of parsing supplied pipeline parameters.
+        device (:obj:`int`, `optional`, defaults to :obj:`-1`):
+            Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model
+            on the associated CUDA device id.
     """

+    task = "feature-extraction"
+
     def __init__(
         self,
-        model,
+        model: Optional = None,
         tokenizer: PreTrainedTokenizer = None,
         modelcard: Optional[ModelCard] = None,
         framework: Optional[str] = None,
@@ -498,9 +573,49 @@ class FeatureExtractionPipeline(Pipeline):
 class TextClassificationPipeline(Pipeline):
     """
-    Text classification pipeline using ModelForTextClassification head.
+    Text classification pipeline using ModelForSequenceClassification head. See the
+    `sequence classification usage <../usage.html#sequence-classification>`__ examples for more information.
+
+    This text classification pipeline can currently be loaded from the :func:`~transformers.pipeline` method using
+    the following task identifier(s):
+
+    - "sentiment-analysis", for classifying sequences according to positive or negative sentiments.
+
+    The models that this pipeline can use are models that have been fine-tuned on a sequence classification task.
+    See the list of available community models fine-tuned on such a task on
+    `huggingface.co/models <https://huggingface.co/models?search=&filter=text-classification>`__.
+
+    Arguments:
+        model (:obj:`str` or :obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`, `optional`, defaults to :obj:`None`):
+            The model that will be used by the pipeline to make predictions. This can be :obj:`None`, a string
+            checkpoint identifier or an actual pre-trained model inheriting from
+            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
+            TensorFlow.
+            If :obj:`None`, the default of the pipeline will be loaded.
+        tokenizer (:obj:`str` or :obj:`~transformers.PreTrainedTokenizer`, `optional`, defaults to :obj:`None`):
+            The tokenizer that will be used by the pipeline to encode data for the model. This can be :obj:`None`,
+            a string checkpoint identifier or an actual pre-trained tokenizer inheriting from
+            :class:`~transformers.PreTrainedTokenizer`.
+            If :obj:`None`, the default of the pipeline will be loaded.
+        modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
+            Model card attributed to the model for this pipeline.
+        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
+            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
+            installed.
+            If no framework is specified, will default to the one currently installed. If no framework is specified
+            and both frameworks are installed, will default to PyTorch.
+        args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
+            Reference to the object in charge of parsing supplied pipeline parameters.
+        device (:obj:`int`, `optional`, defaults to :obj:`-1`):
+            Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model
+            on the associated CUDA device id.
     """

+    task = "sentiment-analysis"
+
     def __call__(self, *args, **kwargs):
         outputs = super().__call__(*args, **kwargs)
         scores = np.exp(outputs) / np.exp(outputs).sum(-1)
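The last line above normalizes the logits with a softmax. A standalone sketch of the same computation (illustrative only; the maximum is subtracted first, a standard numerical-stability trick that does not change the result):

    import numpy as np

    logits = np.array([[1.2, -0.3, 0.5]])                    # made-up logits, shape (1, num_labels)
    shifted = np.exp(logits - logits.max(-1, keepdims=True))
    scores = shifted / shifted.sum(-1, keepdims=True)
    assert np.isclose(scores.sum(), 1.0)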
@@ -509,12 +624,53 @@ class TextClassificationPipeline(Pipeline):
 class FillMaskPipeline(Pipeline):
     """
-    Masked language modeling prediction pipeline using ModelWithLMHead head.
+    Masked language modeling prediction pipeline using ModelWithLMHead head. See the
+    `masked language modeling usage <../usage.html#masked-language-modeling>`__ examples for more information.
+
+    This mask filling pipeline can currently be loaded from the :func:`~transformers.pipeline` method using
+    the following task identifier(s):
+
+    - "fill-mask", for predicting masked tokens in a sequence.
+
+    The models that this pipeline can use are models that have been trained with a masked language modeling objective,
+    which includes the bi-directional models in the library.
+    See the list of available community models on
+    `huggingface.co/models <https://huggingface.co/models?search=&filter=lm-head>`__.
+
+    Arguments:
+        model (:obj:`str` or :obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`, `optional`, defaults to :obj:`None`):
+            The model that will be used by the pipeline to make predictions. This can be :obj:`None`, a string
+            checkpoint identifier or an actual pre-trained model inheriting from
+            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
+            TensorFlow.
+            If :obj:`None`, the default of the pipeline will be loaded.
+        tokenizer (:obj:`str` or :obj:`~transformers.PreTrainedTokenizer`, `optional`, defaults to :obj:`None`):
+            The tokenizer that will be used by the pipeline to encode data for the model. This can be :obj:`None`,
+            a string checkpoint identifier or an actual pre-trained tokenizer inheriting from
+            :class:`~transformers.PreTrainedTokenizer`.
+            If :obj:`None`, the default of the pipeline will be loaded.
+        modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
+            Model card attributed to the model for this pipeline.
+        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
+            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
+            installed.
+            If no framework is specified, will default to the one currently installed. If no framework is specified
+            and both frameworks are installed, will default to PyTorch.
+        args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
+            Reference to the object in charge of parsing supplied pipeline parameters.
+        device (:obj:`int`, `optional`, defaults to :obj:`-1`):
+            Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model
+            on the associated CUDA device id.
     """

+    task = "fill-mask"
+
     def __init__(
         self,
-        model,
+        model: Optional = None,
         tokenizer: PreTrainedTokenizer = None,
         modelcard: Optional[ModelCard] = None,
         framework: Optional[str] = None,
@@ -574,14 +730,57 @@ class FillMaskPipeline(Pipeline):
 class NerPipeline(Pipeline):
     """
-    Named Entity Recognition pipeline using ModelForTokenClassification head.
+    Named Entity Recognition pipeline using ModelForTokenClassification head. See the
+    `named entity recognition usage <../usage.html#named-entity-recognition>`__ examples for more information.
+
+    This token recognition pipeline can currently be loaded from the :func:`~transformers.pipeline` method using
+    the following task identifier(s):
+
+    - "ner", for predicting the classes of tokens in a sequence: person, organisation, location or miscellaneous.
+
+    The models that this pipeline can use are models that have been fine-tuned on a token classification task.
+    See the list of available community models fine-tuned on such a task on
+    `huggingface.co/models <https://huggingface.co/models?search=&filter=token-classification>`__.
+
+    Arguments:
+        model (:obj:`str` or :obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`, `optional`, defaults to :obj:`None`):
+            The model that will be used by the pipeline to make predictions. This can be :obj:`None`, a string
+            checkpoint identifier or an actual pre-trained model inheriting from
+            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
+            TensorFlow.
+            If :obj:`None`, the default of the pipeline will be loaded.
+        tokenizer (:obj:`str` or :obj:`~transformers.PreTrainedTokenizer`, `optional`, defaults to :obj:`None`):
+            The tokenizer that will be used by the pipeline to encode data for the model. This can be :obj:`None`,
+            a string checkpoint identifier or an actual pre-trained tokenizer inheriting from
+            :class:`~transformers.PreTrainedTokenizer`.
+            If :obj:`None`, the default of the pipeline will be loaded.
+        modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
+            Model card attributed to the model for this pipeline.
+        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
+            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
+            installed.
+            If no framework is specified, will default to the one currently installed. If no framework is specified
+            and both frameworks are installed, will default to PyTorch.
+        args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
+            Reference to the object in charge of parsing supplied pipeline parameters.
+        device (:obj:`int`, `optional`, defaults to :obj:`-1`):
+            Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model
+            on the associated CUDA device id.
+
+    Example::
+
+        from transformers import pi
     """

     default_input_names = "sequences"
+    task = "ner"

     def __init__(
         self,
-        model,
+        model: Optional = None,
         tokenizer: PreTrainedTokenizer = None,
         modelcard: Optional[ModelCard] = None,
         framework: Optional[str] = None,
@@ -636,7 +835,7 @@ class NerPipeline(Pipeline):
                 if self.model.config.id2label[label_idx] not in self.ignore_labels:
                     answer += [
                         {
-                            "word": self.tokenizer.decode([int(input_ids[idx])]),
+                            "word": self.tokenizer.convert_ids_to_tokens(int(input_ids[idx])),
                             "score": score[idx][label_idx].item(),
                             "entity": self.model.config.id2label[label_idx],
                         }
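Illustrative only (the scores and token strings below are assumptions, not part of the diff): with `convert_ids_to_tokens`, the `word` field now holds the raw wordpiece token, so sub-words keep their `##` prefix instead of being decoded back to plain text.

    from transformers import pipeline

    nlp = pipeline("ner")
    print(nlp("Hugging Face is based in New York City"))
    # e.g. [{'word': 'Hu', 'score': 0.99, 'entity': 'I-ORG'},
    #       {'word': '##gging', 'score': 0.98, 'entity': 'I-ORG'}, ...]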
@@ -716,15 +915,54 @@ class QuestionAnsweringArgumentHandler(ArgumentHandler):
 class QuestionAnsweringPipeline(Pipeline):
     """
-    Question Answering pipeline using ModelForQuestionAnswering head.
+    Question Answering pipeline using ModelForQuestionAnswering head. See the
+    `question answering usage <../usage.html#question-answering>`__ examples for more information.
+
+    This question answering can currently be loaded from the :func:`~transformers.pipeline` method using
+    the following task identifier(s):
+
+    - "question-answering", for answering questions given a context.
+
+    The models that this pipeline can use are models that have been fine-tuned on a question answering task.
+    See the list of available community models fine-tuned on such a task on
+    `huggingface.co/models <https://huggingface.co/models?search=&filter=question-answering>`__.
+
+    Arguments:
+        model (:obj:`str` or :obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`, `optional`, defaults to :obj:`None`):
+            The model that will be used by the pipeline to make predictions. This can be :obj:`None`, a string
+            checkpoint identifier or an actual pre-trained model inheriting from
+            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
+            TensorFlow.
+            If :obj:`None`, the default of the pipeline will be loaded.
+        tokenizer (:obj:`str` or :obj:`~transformers.PreTrainedTokenizer`, `optional`, defaults to :obj:`None`):
+            The tokenizer that will be used by the pipeline to encode data for the model. This can be :obj:`None`,
+            a string checkpoint identifier or an actual pre-trained tokenizer inheriting from
+            :class:`~transformers.PreTrainedTokenizer`.
+            If :obj:`None`, the default of the pipeline will be loaded.
+        modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`, defaults to :obj:`None`):
+            Model card attributed to the model for this pipeline.
+        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
+            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
+            installed.
+            If no framework is specified, will default to the one currently installed. If no framework is specified
+            and both frameworks are installed, will default to PyTorch.
+        args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`, defaults to :obj:`None`):
+            Reference to the object in charge of parsing supplied pipeline parameters.
+        device (:obj:`int`, `optional`, defaults to :obj:`-1`):
+            Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model
+            on the associated CUDA device id.
     """

     default_input_names = "question,context"
+    task = "question-answering"

     def __init__(
         self,
-        model,
-        tokenizer: Optional[PreTrainedTokenizer],
+        model: Optional = None,
+        tokenizer: Optional[PreTrainedTokenizer] = None,
         modelcard: Optional[ModelCard] = None,
         framework: Optional[str] = None,
         device: int = -1,
@@ -1003,23 +1241,77 @@ def pipeline(
     model: Optional = None,
     config: Optional[Union[str, PretrainedConfig]] = None,
     tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None,
-    modelcard: Optional[Union[str, ModelCard]] = None,
     framework: Optional[str] = None,
     **kwargs
 ) -> Pipeline:
     """
     Utility factory method to build a pipeline.

     Pipeline are made of:
-        A Tokenizer instance in charge of mapping raw textual input to token
-        A Model instance
-        Some (optional) post processing for enhancing model's output
+
+        - A Tokenizer instance in charge of mapping raw textual input to token
+        - A Model instance
+        - Some (optional) post processing for enhancing model's output
+
+    Args:
+        task (:obj:`str`):
+            The task defining which pipeline will be returned. Currently accepted tasks are:
+
+            - "feature-extraction": will return a :class:`~transformers.FeatureExtractionPipeline`
+            - "sentiment-analysis": will return a :class:`~transformers.TextClassificationPipeline`
+            - "ner": will return a :class:`~transformers.NerPipeline`
+            - "question-answering": will return a :class:`~transformers.QuestionAnsweringPipeline`
+            - "fill-mask": will return a :class:`~transformers.FillMaskPipeline`
+        model (:obj:`str` or :obj:`~transformers.PreTrainedModel` or :obj:`~transformers.TFPreTrainedModel`, `optional`, defaults to :obj:`None`):
+            The model that will be used by the pipeline to make predictions. This can be :obj:`None`, a string
+            checkpoint identifier or an actual pre-trained model inheriting from
+            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
+            TensorFlow.
+            If :obj:`None`, the default of the pipeline will be loaded.
+        config (:obj:`str` or :obj:`~transformers.PretrainedConfig`, `optional`, defaults to :obj:`None`):
+            The configuration that will be used by the pipeline to instantiate the model. This can be :obj:`None`,
+            a string checkpoint identifier or an actual pre-trained model configuration inheriting from
+            :class:`~transformers.PretrainedConfig`.
+            If :obj:`None`, the default of the pipeline will be loaded.
+        tokenizer (:obj:`str` or :obj:`~transformers.PreTrainedTokenizer`, `optional`, defaults to :obj:`None`):
+            The tokenizer that will be used by the pipeline to encode data for the model. This can be :obj:`None`,
+            a string checkpoint identifier or an actual pre-trained tokenizer inheriting from
+            :class:`~transformers.PreTrainedTokenizer`.
+            If :obj:`None`, the default of the pipeline will be loaded.
+        framework (:obj:`str`, `optional`, defaults to :obj:`None`):
+            The framework to use, either "pt" for PyTorch or "tf" for TensorFlow. The specified framework must be
+            installed.
+            If no framework is specified, will default to the one currently installed. If no framework is specified
+            and both frameworks are installed, will default to PyTorch.
+
+    Returns:
+        :class:`~transformers.Pipeline`: Class inheriting from :class:`~transformers.Pipeline`, according to
+        the task.

-    Examples:
+    Examples::
+
+        from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer
+
+        # Sentiment analysis pipeline
         pipeline('sentiment-analysis')
+
+        # Question answering pipeline, specifying the checkpoint identifier
         pipeline('question-answering', model='distilbert-base-cased-distilled-squad', tokenizer='bert-base-cased')
-        pipeline('ner', model=AutoModel.from_pretrained(...), tokenizer=AutoTokenizer.from_pretrained(...)
-        pipeline('ner', model='dbmdz/bert-large-cased-finetuned-conll03-english', tokenizer='bert-base-cased')
-        pipeline('ner', model='https://...pytorch-model.bin', config='https://...config.json', tokenizer='bert-base-cased')
+
+        # Named entity recognition pipeline, passing in a specific model and tokenizer
+        model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
+        tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+        pipeline('ner', model=model, tokenizer=tokenizer)
+
+        # Named entity recognition pipeline, passing a model and configuration with a HTTPS URL.
+        model_url = "https://s3.amazonaws.com/models.huggingface.co/bert/dbmdz/bert-large-cased-finetuned-conll03-english/pytorch_model.bin"
+        config_url = "https://s3.amazonaws.com/models.huggingface.co/bert/dbmdz/bert-large-cased-finetuned-conll03-english/config.json"
+        pipeline('ner', model=model_url, config=config_url, tokenizer='bert-base-cased')
     """
     # Retrieve the task
     if task not in SUPPORTED_TASKS:
@@ -1048,13 +1340,12 @@ def pipeline(
             "Please provided a PretrainedTokenizer class or a path/url/shortcut name to a pretrained tokenizer."
         )

+    modelcard = None
     # Try to infer modelcard from model or config name (if provided as str)
-    if modelcard is None:
-        # Try to fallback on one of the provided string for model or config (will replace the suffix)
-        if isinstance(model, str):
-            modelcard = model
-        elif isinstance(config, str):
-            modelcard = config
+    if isinstance(model, str):
+        modelcard = model
+    elif isinstance(config, str):
+        modelcard = config

     # Instantiate tokenizer if needed
     if isinstance(tokenizer, (str, tuple)):
...
@@ -19,6 +19,7 @@ import logging
 import os
 import unicodedata
 from shutil import copyfile
+from typing import List, Optional

 from .tokenization_utils import PreTrainedTokenizer
@@ -55,9 +56,55 @@ SPIECE_UNDERLINE = "▁"
 class AlbertTokenizer(PreTrainedTokenizer):
     """
-    SentencePiece based tokenizer. Peculiarities:
-        - requires `SentencePiece <https://github.com/google/sentencepiece>`_
+    Constructs an ALBERT tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__
+
+    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
+    should refer to the superclass for more information regarding methods.
+
+    Args:
+        vocab_file (:obj:`string`):
+            `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a .spm extension) that
+            contains the vocabulary necessary to instantiate a tokenizer.
+        do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether to lowercase the input when tokenizing.
+        remove_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether to strip the text when tokenizing (removing excess spaces before and after the string).
+        keep_accents (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Whether to keep accents when tokenizing.
+        bos_token (:obj:`string`, `optional`, defaults to "[CLS]"):
+            The beginning of sequence token that was used during pre-training. Can be used a sequence classifier token.
+            .. note::
+                When building a sequence using special tokens, this is not the token that is used for the beginning
+                of sequence. The token used is the :obj:`cls_token`.
+        eos_token (:obj:`string`, `optional`, defaults to "[SEP]"):
+            The end of sequence token.
+            .. note::
+                When building a sequence using special tokens, this is not the token that is used for the end
+                of sequence. The token used is the :obj:`sep_token`.
+        unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        sep_token (:obj:`string`, `optional`, defaults to "[SEP]"):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
+            for sequence classification or for a text and a question for question answering.
+            It is also used as the last token of a sequence built with special tokens.
+        pad_token (:obj:`string`, `optional`, defaults to "<pad>"):
+            The token used for padding, for example when batching sequences of different lengths.
+        cls_token (:obj:`string`, `optional`, defaults to "[CLS]"):
+            The classifier token which is used when doing sequence classification (classification of the whole
+            sequence instead of per-token classification). It is the first token of the sequence when built with
+            special tokens.
+        mask_token (:obj:`string`, `optional`, defaults to "[MASK]"):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+
+    Attributes:
+        sp_model (:obj:`SentencePieceProcessor`):
+            The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
     """

     vocab_files_names = VOCAB_FILES_NAMES
@@ -185,17 +232,28 @@ class AlbertTokenizer(PreTrainedTokenizer):
         return self.sp_model.IdToPiece(index)

     def convert_tokens_to_string(self, tokens):
-        """Converts a sequence of tokens (strings for sub-words) in a single string."""
         out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
         return out_string

-    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
         """
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks
         by concatenating and adding special tokens.
         An ALBERT sequence has the following format:
-            single sequence: [CLS] X [SEP]
-            pair of sequences: [CLS] A [SEP] B [SEP]
+
+        - single sequence: ``[CLS] X [SEP]``
+        - pair of sequences: ``[CLS] A [SEP] B [SEP]``
+
+        Args:
+            token_ids_0 (:obj:`List[int]`):
+                List of IDs to which the special tokens will be added
+            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
         """
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
@@ -203,27 +261,30 @@ class AlbertTokenizer(PreTrainedTokenizer):
             return cls + token_ids_0 + sep
         return cls + token_ids_0 + sep + token_ids_1 + sep
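A short usage sketch of the method above (illustrative only; the `albert-base-v2` checkpoint and the input strings are assumptions):

    from transformers import AlbertTokenizer

    tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
    ids_a = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("Hello world"))
    ids_b = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("How are you?"))

    single = tokenizer.build_inputs_with_special_tokens(ids_a)
    pair = tokenizer.build_inputs_with_special_tokens(ids_a, ids_b)
    # single == [cls_id] + ids_a + [sep_id]
    # pair   == [cls_id] + ids_a + [sep_id] + ids_b + [sep_id]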
-    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
+    def get_special_tokens_mask(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
+    ) -> List[int]:
         """
         Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
         special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.

         Args:
-            token_ids_0: list of ids (must not contain special tokens)
-            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
-                for sequence pairs
-            already_has_special_tokens: (default False) Set to True if the token list is already formated with
-                special tokens for the model
+            token_ids_0 (:obj:`List[int]`):
+                List of ids.
+            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                Set to True if the token list is already formatted with special tokens for the model

         Returns:
-            A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
+            :obj:`List[int]`: A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
         """
         if already_has_special_tokens:
             if token_ids_1 is not None:
                 raise ValueError(
                     "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formated with special tokens for the model."
+                    "ids is already formatted with special tokens for the model."
                 )
             return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
@@ -231,14 +292,29 @@ class AlbertTokenizer(PreTrainedTokenizer):
             return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
         return [1] + ([0] * len(token_ids_0)) + [1]
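Continuing the AlbertTokenizer sketch shown earlier (illustrative only, with `ids_a` and `ids_b` being plain ID lists without special tokens), the mask flags exactly the positions that `build_inputs_with_special_tokens` adds:

    mask = tokenizer.get_special_tokens_mask(ids_a, ids_b)
    # == [1] + [0] * len(ids_a) + [1] + [0] * len(ids_b) + [1]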
-    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
         """
         Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
         An ALBERT sequence pair mask has the following format:
-            0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1
-            | first sequence | second sequence
-        if token_ids_1 is None, only returns the first portion of the mask (0's).
+
+        ::
+
+            0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+            | first sequence | second sequence |
+
+        if token_ids_1 is None, only returns the first portion of the mask (0s).
+
+        Args:
+            token_ids_0 (:obj:`List[int]`):
+                List of ids.
+            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+            sequence(s).
         """
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
@@ -248,8 +324,15 @@ class AlbertTokenizer(PreTrainedTokenizer):
         return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
     def save_vocabulary(self, save_directory):
-        """ Save the sentencepiece vocabulary (copy original file) and special tokens file
-            to a directory.
+        """
+        Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory.
+
+        Args:
+            save_directory (:obj:`str`):
+                The directory in which to save the vocabulary.
+
+        Returns:
+            :obj:`Tuple(str)`: Paths to the files saved.
         """
         if not os.path.isdir(save_directory):
             logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
...
@@ -19,11 +19,7 @@ from .tokenization_roberta import RobertaTokenizer
 # vocab and merges same as roberta
 vocab_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json"
 merges_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt"
-_all_bart_models = [
-    "bart-large",
-    "bart-large-mnli",
-    # "bart-large-cnn"
-]
+_all_bart_models = ["bart-large", "bart-large-mnli", "bart-large-cnn"]


 class BartTokenizer(RobertaTokenizer):
...
@@ -19,6 +19,7 @@ import collections
 import logging
 import os
 import unicodedata
+from typing import List, Optional

 from tokenizers import BertWordPieceTokenizer
@@ -117,17 +118,41 @@ def whitespace_tokenize(text):
 class BertTokenizer(PreTrainedTokenizer):
     r"""
-    Constructs a BertTokenizer.
-    :class:`~transformers.BertTokenizer` runs end-to-end tokenization: punctuation splitting + wordpiece
+    Constructs a BERT tokenizer. Based on WordPiece.
+
+    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
+    should refer to the superclass for more information regarding methods.

     Args:
-        vocab_file: Path to a one-wordpiece-per-line vocabulary file
-        do_lower_case: Whether to lower case the input. Only has an effect when do_basic_tokenize=True
-        do_basic_tokenize: Whether to do basic tokenization before wordpiece.
-        max_len: An artificial maximum length to truncate tokenized sequences to; Effective maximum length is always the
-            minimum of this value (if specified) and the underlying BERT model's sequence length.
-        never_split: List of tokens which will never be split during tokenization. Only has an effect when
-            do_basic_tokenize=True
+        vocab_file (:obj:`string`):
+            File containing the vocabulary.
+        do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether to lowercase the input when tokenizing.
+        do_basic_tokenize (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether to do basic tokenization before WordPiece.
+        never_split (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            List of tokens which will never be split during tokenization. Only has an effect when
+            :obj:`do_basic_tokenize=True`
+        unk_token (:obj:`string`, `optional`, defaults to "[UNK]"):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        sep_token (:obj:`string`, `optional`, defaults to "[SEP]"):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
+            for sequence classification or for a text and a question for question answering.
+            It is also used as the last token of a sequence built with special tokens.
+        pad_token (:obj:`string`, `optional`, defaults to "[PAD]"):
+            The token used for padding, for example when batching sequences of different lengths.
+        cls_token (:obj:`string`, `optional`, defaults to "[CLS]"):
+            The classifier token which is used when doing sequence classification (classification of the whole
+            sequence instead of per-token classification). It is the first token of the sequence when built with
+            special tokens.
+        mask_token (:obj:`string`, `optional`, defaults to "[MASK]"):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+        tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether to tokenize Chinese characters.
+            This should likely be deactivated for Japanese:
+            see: https://github.com/huggingface/transformers/issues/328
     """

     vocab_files_names = VOCAB_FILES_NAMES
@@ -149,23 +174,6 @@ class BertTokenizer(PreTrainedTokenizer):
         tokenize_chinese_chars=True,
         **kwargs
     ):
-        """Constructs a BertTokenizer.
-
-        Args:
-            **vocab_file**: Path to a one-wordpiece-per-line vocabulary file
-            **do_lower_case**: (`optional`) boolean (default True)
-                Whether to lower case the input
-                Only has an effect when do_basic_tokenize=True
-            **do_basic_tokenize**: (`optional`) boolean (default True)
-                Whether to do basic tokenization before wordpiece.
-            **never_split**: (`optional`) list of string
-                List of tokens which will never be split during tokenization.
-                Only has an effect when do_basic_tokenize=True
-            **tokenize_chinese_chars**: (`optional`) boolean (default True)
-                Whether to tokenize Chinese characters.
-                This should likely be deactivated for Japanese:
-                see: https://github.com/huggingface/pytorch-pretrained-BERT/issues/328
-        """
         super().__init__(
             unk_token=unk_token,
             sep_token=sep_token,
@@ -221,13 +229,25 @@ class BertTokenizer(PreTrainedTokenizer):
         out_string = " ".join(tokens).replace(" ##", "").strip()
         return out_string

-    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
         """
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks
         by concatenating and adding special tokens.
         A BERT sequence has the following format:
-            single sequence: [CLS] X [SEP]
-            pair of sequences: [CLS] A [SEP] B [SEP]
+
+        - single sequence: ``[CLS] X [SEP]``
+        - pair of sequences: ``[CLS] A [SEP] B [SEP]``
+
+        Args:
+            token_ids_0 (:obj:`List[int]`):
+                List of IDs to which the special tokens will be added
+            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
         """
         if token_ids_1 is None:
             return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
@@ -235,20 +255,23 @@ class BertTokenizer(PreTrainedTokenizer):
         sep = [self.sep_token_id]
         return cls + token_ids_0 + sep + token_ids_1 + sep
-    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
+    def get_special_tokens_mask(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
+    ) -> List[int]:
         """
         Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
         special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.

         Args:
-            token_ids_0: list of ids (must not contain special tokens)
-            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
-                for sequence pairs
-            already_has_special_tokens: (default False) Set to True if the token list is already formated with
-                special tokens for the model
+            token_ids_0 (:obj:`List[int]`):
+                List of ids.
+            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                Set to True if the token list is already formatted with special tokens for the model

         Returns:
-            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+            :obj:`List[int]`: A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
         """
         if already_has_special_tokens:
@@ -263,14 +286,29 @@ class BertTokenizer(PreTrainedTokenizer):
             return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
         return [1] + ([0] * len(token_ids_0)) + [1]
-    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
         """
         Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
         A BERT sequence pair mask has the following format:
-            0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
-            | first sequence | second sequence
+
+        ::
+
+            0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+            | first sequence | second sequence |
+
         if token_ids_1 is None, only returns the first portion of the mask (0's).
+
+        Args:
+            token_ids_0 (:obj:`List[int]`):
+                List of ids.
+            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+            sequence(s).
         """
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
@@ -279,7 +317,16 @@ class BertTokenizer(PreTrainedTokenizer):
         return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
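A short usage sketch (illustrative only; the checkpoint and sentences are assumptions): the segment mask pairs 0s with the first sequence and its [CLS]/[SEP] tokens, and 1s with the second sequence and its trailing [SEP].

    from transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    ids_a = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("How old are you?"))
    ids_b = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("I am six."))

    token_type_ids = tokenizer.create_token_type_ids_from_sequences(ids_a, ids_b)
    # == [0] * (len(ids_a) + 2) + [1] * (len(ids_b) + 1)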
     def save_vocabulary(self, vocab_path):
-        """Save the tokenizer vocabulary to a directory or file."""
+        """
+        Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory.
+
+        Args:
+            vocab_path (:obj:`str`):
+                The directory in which to save the vocabulary.
+
+        Returns:
+            :obj:`Tuple(str)`: Paths to the files saved.
+        """
         index = 0
         if os.path.isdir(vocab_path):
             vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["vocab_file"])
...
@@ -18,6 +18,7 @@
 import logging
 import os
 from shutil import copyfile
+from typing import List, Optional

 import sentencepiece as spm
@@ -53,7 +54,50 @@ class CamembertTokenizer(PreTrainedTokenizer):
     Adapted from RobertaTokenizer and XLNetTokenizer
     SentencePiece based tokenizer. Peculiarities:

         - requires `SentencePiece <https://github.com/google/sentencepiece>`_
+
+    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
+    should refer to the superclass for more information regarding methods.
+
+    Args:
+        vocab_file (:obj:`str`):
+            Path to the vocabulary file.
+        bos_token (:obj:`string`, `optional`, defaults to "<s>"):
+            The beginning of sequence token that was used during pre-training. Can be used a sequence classifier token.
+            .. note::
+                When building a sequence using special tokens, this is not the token that is used for the beginning
+                of sequence. The token used is the :obj:`cls_token`.
+        eos_token (:obj:`string`, `optional`, defaults to "</s>"):
+            The end of sequence token.
+            .. note::
+                When building a sequence using special tokens, this is not the token that is used for the end
+                of sequence. The token used is the :obj:`sep_token`.
+        sep_token (:obj:`string`, `optional`, defaults to "</s>"):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
+            for sequence classification or for a text and a question for question answering.
+            It is also used as the last token of a sequence built with special tokens.
+        cls_token (:obj:`string`, `optional`, defaults to "<s>"):
+            The classifier token which is used when doing sequence classification (classification of the whole
+            sequence instead of per-token classification). It is the first token of the sequence when built with
+            special tokens.
+        unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        pad_token (:obj:`string`, `optional`, defaults to "<pad>"):
+            The token used for padding, for example when batching sequences of different lengths.
+        mask_token (:obj:`string`, `optional`, defaults to "<mask>"):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+        additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
+            Additional special tokens used by the tokenizer.
+
+    Attributes:
+        sp_model (:obj:`SentencePieceProcessor`):
+            The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
     """

     vocab_files_names = VOCAB_FILES_NAMES
@@ -97,34 +141,50 @@ class CamembertTokenizer(PreTrainedTokenizer):
         self.fairseq_tokens_to_ids["<mask>"] = len(self.sp_model) + len(self.fairseq_tokens_to_ids)
         self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}

-    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
         """
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks
         by concatenating and adding special tokens.
-        A RoBERTa sequence has the following format:
-            single sequence: <s> X </s>
-            pair of sequences: <s> A </s></s> B </s>
+        A CamemBERT sequence has the following format:
+
+        - single sequence: ``<s> X </s>``
+        - pair of sequences: ``<s> A </s></s> B </s>``
+
+        Args:
+            token_ids_0 (:obj:`List[int]`):
+                List of IDs to which the special tokens will be added
+            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
         """
         if token_ids_1 is None:
             return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
         cls = [self.cls_token_id]
         sep = [self.sep_token_id]
         return cls + token_ids_0 + sep + sep + token_ids_1 + sep
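Note the double separator between the two segments, which differs from BERT-style pairs. A tiny sketch (illustrative only; the `camembert-base` checkpoint and input strings are assumptions):

    from transformers import CamembertTokenizer

    tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
    ids_a = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("Bonjour"))
    ids_b = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("le monde"))

    pair = tokenizer.build_inputs_with_special_tokens(ids_a, ids_b)
    # == [cls_id] + ids_a + [sep_id, sep_id] + ids_b + [sep_id]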
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
""" """
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
Args: Args:
token_ids_0: list of ids (must not contain special tokens) token_ids_0 (:obj:`List[int]`):
token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids List of ids.
for sequence pairs token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
already_has_special_tokens: (default False) Set to True if the token list is already formated with Optional second list of IDs for sequence pairs.
special tokens for the model already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Set to True if the token list is already formatted with special tokens for the model
Returns: Returns:
A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
""" """
if already_has_special_tokens: if already_has_special_tokens:
if token_ids_1 is not None: if token_ids_1 is not None:
...@@ -138,14 +198,29 @@ class CamembertTokenizer(PreTrainedTokenizer): ...@@ -138,14 +198,29 @@ class CamembertTokenizer(PreTrainedTokenizer):
return [1] + ([0] * len(token_ids_0)) + [1] return [1] + ([0] * len(token_ids_0)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
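Continuing the same hypothetical ID lists, the special tokens mask for the pair case marks each special-token position with 1 and each sequence token with 0::

    ids_a, ids_b = [10, 11, 12], [20, 21]
    mask = [1] + [0] * len(ids_a) + [1, 1] + [0] * len(ids_b) + [1]
    # -> [1, 0, 0, 0, 1, 1, 0, 0, 1], lining up with <s> A </s></s> B </s>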
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
""" """
Creates a mask from the two sequences passed to be used in a sequence-pair classification task. Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
A RoBERTa sequence pair mask has the following format: A CamemBERT sequence pair mask has the following format:
0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
| first sequence | second sequence ::
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | | second sequence |
if token_ids_1 is None, only returns the first portion of the mask (0s).
if token_ids_1 is None, only returns the first portion of the mask (0's). Args:
token_ids_0 (:obj:`List[int]`):
List of ids.
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
sequence(s).
""" """
sep = [self.sep_token_id] sep = [self.sep_token_id]
cls = [self.cls_token_id] cls = [self.cls_token_id]
...@@ -200,8 +275,15 @@ class CamembertTokenizer(PreTrainedTokenizer): ...@@ -200,8 +275,15 @@ class CamembertTokenizer(PreTrainedTokenizer):
return out_string return out_string
def save_vocabulary(self, save_directory): def save_vocabulary(self, save_directory):
""" Save the sentencepiece vocabulary (copy original file) and special tokens file """
to a directory. Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory.
Args:
save_directory (:obj:`str`):
The directory in which to save the vocabulary.
Returns:
:obj:`Tuple(str)`: Paths to the files saved.
""" """
if not os.path.isdir(save_directory): if not os.path.isdir(save_directory):
logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
......
...@@ -116,8 +116,21 @@ def get_pairs(word): ...@@ -116,8 +116,21 @@ def get_pairs(word):
class CTRLTokenizer(PreTrainedTokenizer): class CTRLTokenizer(PreTrainedTokenizer):
""" """
CTRL BPE tokenizer. Peculiarities: Constructs a CTRL tokenizer. Peculiarities:
- Byte-Pair-Encoding
- Byte-Pair-Encoding
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
should refer to the superclass for more information regarding methods.
Args:
vocab_file (:obj:`str`):
Path to the vocabulary file.
merges_file (:obj:`str`):
Path to the merges file.
unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES
...@@ -219,7 +232,16 @@ class CTRLTokenizer(PreTrainedTokenizer): ...@@ -219,7 +232,16 @@ class CTRLTokenizer(PreTrainedTokenizer):
return out_string return out_string
def save_vocabulary(self, save_directory): def save_vocabulary(self, save_directory):
"""Save the tokenizer vocabulary and merge files to a directory.""" """
Save the vocabulary and special tokens file to a directory.
Args:
save_directory (:obj:`str`):
The directory in which to save the vocabulary.
Returns:
:obj:`Tuple(str)`: Paths to the files saved.
"""
if not os.path.isdir(save_directory): if not os.path.isdir(save_directory):
logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
return return
......
...@@ -58,16 +58,11 @@ PRETRAINED_INIT_CONFIGURATION = { ...@@ -58,16 +58,11 @@ PRETRAINED_INIT_CONFIGURATION = {
class DistilBertTokenizer(BertTokenizer): class DistilBertTokenizer(BertTokenizer):
r""" r"""
Constructs a DistilBertTokenizer. Constructs a DistilBertTokenizer.
:class:`~transformers.DistilBertTokenizer` is identical to BertTokenizer and runs end-to-end tokenization: punctuation splitting + wordpiece :class:`~transformers.DistilBertTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end
tokenization: punctuation splitting + wordpiece.
Args:
vocab_file: Path to a one-wordpiece-per-line vocabulary file Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
do_lower_case: Whether to lower case the input. Only has an effect when do_basic_tokenize=True parameters.
do_basic_tokenize: Whether to do basic tokenization before wordpiece.
max_len: An artificial maximum length to truncate tokenized sequences to; Effective maximum length is always the
minimum of this value (if specified) and the underlying BERT model's sequence length.
never_split: List of tokens which will never be split during tokenization. Only has an effect when
do_basic_tokenize=True
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES
......
...@@ -80,14 +80,14 @@ class FlaubertTokenizer(XLMTokenizer): ...@@ -80,14 +80,14 @@ class FlaubertTokenizer(XLMTokenizer):
""" """
BPE tokenizer for Flaubert BPE tokenizer for Flaubert
- Moses preprocessing & tokenization - Moses preprocessing & tokenization
- Normalize all inputs text
- Normalize all inputs text - argument ``special_tokens`` and function ``set_special_tokens``, can be used to add additional symbols \
(ex: "__classify__") to a vocabulary
- argument ``special_tokens`` and function ``set_special_tokens``, can be used to add additional symbols \ - `do_lowercase` controls lower casing (automatically set for pretrained vocabularies)
(ex: "__classify__") to a vocabulary
This tokenizer inherits from :class:`~transformers.XLMTokenizer`. Please check the superclass for usage examples
- `do_lowercase` controls lower casing (automatically set for pretrained vocabularies) and documentation regarding arguments.
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES
......
...@@ -101,11 +101,35 @@ def get_pairs(word): ...@@ -101,11 +101,35 @@ def get_pairs(word):
class GPT2Tokenizer(PreTrainedTokenizer): class GPT2Tokenizer(PreTrainedTokenizer):
""" """
GPT-2 BPE tokenizer. Peculiarities: GPT-2 BPE tokenizer. Peculiarities:
- Byte-level Byte-Pair-Encoding
- Requires a space to start the input string => the encoding and tokenize methods should be called with the - Byte-level Byte-Pair-Encoding
``add_prefix_space`` flag set to ``True``. - Requires a space to start the input string => the encoding methods should be called with the
Otherwise, this tokenizer's ``encode``, ``decode``, and ``tokenize`` methods will not conserve ``add_prefix_space`` flag set to ``True``.
the spaces at the beginning of a string: `tokenizer.decode(tokenizer.encode(" Hello")) = "Hello"` Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve
the absence of a space at the beginning of a string:
::
tokenizer.decode(tokenizer.encode("Hello")) = " Hello"
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
should refer to the superclass for more information regarding methods.
Args:
vocab_file (:obj:`str`):
Path to the vocabulary file.
merges_file (:obj:`str`):
Path to the merges file.
errors (:obj:`str`, `optional`, defaults to "replace"):
Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode
<https://docs.python.org/3/library/stdtypes.html#bytes.decode>`__ for more information.
unk_token (:obj:`string`, `optional`, defaults to `<|endoftext|>`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
bos_token (:obj:`string`, `optional`, defaults to `<|endoftext|>`):
The beginning of sequence token.
eos_token (:obj:`string`, `optional`, defaults to `<|endoftext|>`):
The end of sequence token.
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES
...@@ -219,7 +243,16 @@ class GPT2Tokenizer(PreTrainedTokenizer): ...@@ -219,7 +243,16 @@ class GPT2Tokenizer(PreTrainedTokenizer):
return text return text
def save_vocabulary(self, save_directory): def save_vocabulary(self, save_directory):
"""Save the tokenizer vocabulary and merge files to a directory.""" """
Save the vocabulary and special tokens file to a directory.
Args:
save_directory (:obj:`str`):
The directory in which to save the vocabulary.
Returns:
:obj:`Tuple(str)`: Paths to the files saved.
"""
if not os.path.isdir(save_directory): if not os.path.isdir(save_directory):
logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
return return
......
...@@ -82,8 +82,21 @@ def text_standardize(text): ...@@ -82,8 +82,21 @@ def text_standardize(text):
class OpenAIGPTTokenizer(PreTrainedTokenizer): class OpenAIGPTTokenizer(PreTrainedTokenizer):
""" """
BPE tokenizer. Peculiarities: BPE tokenizer. Peculiarities:
- lower case all inputs
- uses SpaCy tokenizer and ftfy for pre-BPE tokenization if they are installed, fallback to BERT's BasicTokenizer if not. - lower case all inputs
- uses SpaCy tokenizer and ftfy for pre-BPE tokenization if they are installed, fallback to BERT's BasicTokenizer if not.
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
should refer to the superclass for more information regarding methods.
Args:
vocab_file (:obj:`str`):
Path to the vocabulary file.
merges_file (:obj:`str`):
Path to the merges file.
unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES
...@@ -201,7 +214,16 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer): ...@@ -201,7 +214,16 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
return out_string return out_string
def save_vocabulary(self, save_directory): def save_vocabulary(self, save_directory):
"""Save the tokenizer vocabulary and merge files to a directory.""" """
Save the vocabulary and special tokens file to a directory.
Args:
save_directory (:obj:`str`):
The directory in which to save the vocabulary.
Returns:
:obj:`Tuple(str)`: Paths to the files saved.
"""
if not os.path.isdir(save_directory): if not os.path.isdir(save_directory):
logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
return return
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
import logging import logging
from typing import List, Optional
from tokenizers.processors import RobertaProcessing from tokenizers.processors import RobertaProcessing
...@@ -60,12 +61,59 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { ...@@ -60,12 +61,59 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
class RobertaTokenizer(GPT2Tokenizer): class RobertaTokenizer(GPT2Tokenizer):
""" """
RoBERTa BPE tokenizer, derived from the GPT-2 tokenizer. Peculiarities: Constructs a RoBERTa BPE tokenizer, derived from the GPT-2 tokenizer. Peculiarities:
- Byte-level Byte-Pair-Encoding
- Requires a space to start the input string => the encoding methods should be called with the - Byte-level Byte-Pair-Encoding
``add_prefix_space`` flag set to ``True``. - Requires a space to start the input string => the encoding methods should be called with the
Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve ``add_prefix_space`` flag set to ``True``.
the absence of a space at the beginning of a string: `tokenizer.decode(tokenizer.encode("Hello")) = " Hello"` Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve
the absence of a space at the beginning of a string:
::
tokenizer.decode(tokenizer.encode("Hello")) = " Hello"
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
should refer to the superclass for more information regarding methods.
Args:
vocab_file (:obj:`str`):
Path to the vocabulary file.
merges_file (:obj:`str`):
Path to the merges file.
errors (:obj:`str`, `optional`, defaults to "replace"):
Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode
<https://docs.python.org/3/library/stdtypes.html#bytes.decode>`__ for more information.
bos_token (:obj:`string`, `optional`, defaults to "<s>"):
The beginning of sequence token that was used during pre-training. Can be used as a sequence classifier token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the beginning
of sequence. The token used is the :obj:`cls_token`.
eos_token (:obj:`string`, `optional`, defaults to "</s>"):
The end of sequence token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the end
of sequence. The token used is the :obj:`sep_token`.
sep_token (:obj:`string`, `optional`, defaults to "</s>"):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
for sequence classification or for a text and a question for question answering.
It is also used as the last token of a sequence built with special tokens.
cls_token (:obj:`string`, `optional`, defaults to "<s>"):
The classifier token which is used when doing sequence classification (classification of the whole
sequence instead of per-token classification). It is the first token of the sequence when built with
special tokens.
unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (:obj:`string`, `optional`, defaults to "<pad>"):
The token used for padding, for example when batching sequences of different lengths.
mask_token (:obj:`string`, `optional`, defaults to "<mask>"):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES
...@@ -102,13 +150,25 @@ class RobertaTokenizer(GPT2Tokenizer): ...@@ -102,13 +150,25 @@ class RobertaTokenizer(GPT2Tokenizer):
self.max_len_single_sentence = self.max_len - 2 # take into account special tokens self.max_len_single_sentence = self.max_len - 2 # take into account special tokens
self.max_len_sentences_pair = self.max_len - 4 # take into account special tokens self.max_len_sentences_pair = self.max_len - 4 # take into account special tokens
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
""" """
Build model inputs from a sequence or a pair of sequence for sequence classification tasks Build model inputs from a sequence or a pair of sequence for sequence classification tasks
by concatenating and adding special tokens. by concatenating and adding special tokens.
A RoBERTa sequence has the following format: A RoBERTa sequence has the following format:
single sequence: <s> X </s>
pair of sequences: <s> A </s></s> B </s> - single sequence: ``<s> X </s>``
- pair of sequences: ``<s> A </s></s> B </s>``
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs to which the special tokens will be added
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
""" """
if token_ids_1 is None: if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
...@@ -116,20 +176,23 @@ class RobertaTokenizer(GPT2Tokenizer): ...@@ -116,20 +176,23 @@ class RobertaTokenizer(GPT2Tokenizer):
sep = [self.sep_token_id] sep = [self.sep_token_id]
return cls + token_ids_0 + sep + sep + token_ids_1 + sep return cls + token_ids_0 + sep + sep + token_ids_1 + sep
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
""" """
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
Args: Args:
token_ids_0: list of ids (must not contain special tokens) token_ids_0 (:obj:`List[int]`):
token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids List of ids.
for sequence pairs token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
already_has_special_tokens: (default False) Set to True if the token list is already formated with Optional second list of IDs for sequence pairs.
special tokens for the model already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Set to True if the token list is already formatted with special tokens for the model
Returns: Returns:
A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
""" """
if already_has_special_tokens: if already_has_special_tokens:
if token_ids_1 is not None: if token_ids_1 is not None:
...@@ -143,12 +206,22 @@ class RobertaTokenizer(GPT2Tokenizer): ...@@ -143,12 +206,22 @@ class RobertaTokenizer(GPT2Tokenizer):
return [1] + ([0] * len(token_ids_0)) + [1] return [1] + ([0] * len(token_ids_0)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
""" """
Creates a mask from the two sequences passed to be used in a sequence-pair classification task. Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
RoBERTa does not make use of token type ids, therefore a list of zeros is returned. RoBERTa does not make use of token type ids, therefore a list of zeros is returned.
if token_ids_1 is None, only returns the first portion of the mask (0's). Args:
token_ids_0 (:obj:`List[int]`):
List of ids.
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of zeros.
""" """
sep = [self.sep_token_id] sep = [self.sep_token_id]
cls = [self.cls_token_id] cls = [self.cls_token_id]
......
...@@ -72,6 +72,9 @@ CORPUS_NAME = "corpus.bin" ...@@ -72,6 +72,9 @@ CORPUS_NAME = "corpus.bin"
class TransfoXLTokenizer(PreTrainedTokenizer): class TransfoXLTokenizer(PreTrainedTokenizer):
""" """
Transformer-XL tokenizer adapted from Vocab class in https://github.com/kimiyoung/transformer-xl Transformer-XL tokenizer adapted from Vocab class in https://github.com/kimiyoung/transformer-xl
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
should refer to the superclass for more information regarding methods.
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES
...@@ -189,7 +192,16 @@ class TransfoXLTokenizer(PreTrainedTokenizer): ...@@ -189,7 +192,16 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
raise ValueError("No <unkown> token in vocabulary") raise ValueError("No <unkown> token in vocabulary")
def save_vocabulary(self, vocab_path): def save_vocabulary(self, vocab_path):
"""Save the tokenizer vocabulary to a directory or file.""" """
Save the vocabulary and special tokens file to a directory.
Args:
vocab_path (:obj:`str`):
The directory in which to save the vocabulary.
Returns:
:obj:`Tuple(str)`: Paths to the files saved.
"""
logger.warning( logger.warning(
"Please note you will not be able to load the save vocabulary in" "Please note you will not be able to load the save vocabulary in"
......
...@@ -1012,6 +1012,12 @@ class PreTrainedTokenizer(object): ...@@ -1012,6 +1012,12 @@ class PreTrainedTokenizer(object):
"https://github.com/huggingface/transformers/pull/2674" "https://github.com/huggingface/transformers/pull/2674"
) )
# Throw an error if we can pad because there is no padding token
if pad_to_max_length and self.pad_token_id is None:
raise ValueError(
"Unable to set proper padding strategy as the tokenizer does not have a padding token. In this case please set the `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via the function add_special_tokens if you want to use a padding strategy"
)
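For a tokenizer that ships without a padding token (GPT-2, for instance), a minimal sketch of the workaround suggested by this error::

    from transformers import GPT2Tokenizer

    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    # Either reuse an existing special token as the pad token...
    tokenizer.pad_token = tokenizer.eos_token
    # ...or register a brand new one (the "[PAD]" string is only an example):
    # tokenizer.add_special_tokens({"pad_token": "[PAD]"})
    encoded = tokenizer.encode_plus("a short sentence", max_length=16, pad_to_max_length=True)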
first_ids = get_input_ids(text) first_ids = get_input_ids(text)
second_ids = get_input_ids(text_pair) if text_pair is not None else None second_ids = get_input_ids(text_pair) if text_pair is not None else None
...@@ -1115,6 +1121,12 @@ class PreTrainedTokenizer(object): ...@@ -1115,6 +1121,12 @@ class PreTrainedTokenizer(object):
"Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers." "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers."
) )
# Throw an error if we can pad because there is no padding token
if pad_to_max_length and self.pad_token_id is None:
raise ValueError(
"Unable to set proper padding strategy as the tokenizer does not have a padding token. In this case please set the `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via the function add_special_tokens if you want to use a padding strategy"
)
if return_offsets_mapping: if return_offsets_mapping:
raise NotImplementedError( raise NotImplementedError(
"return_offset_mapping is not available when using Python tokenizers." "return_offset_mapping is not available when using Python tokenizers."
...@@ -1126,8 +1138,7 @@ class PreTrainedTokenizer(object): ...@@ -1126,8 +1138,7 @@ class PreTrainedTokenizer(object):
input_ids = [] input_ids = []
for ids_or_pair_ids in batch_text_or_text_pairs: for ids_or_pair_ids in batch_text_or_text_pairs:
if isinstance(ids_or_pair_ids, (list, tuple)): if isinstance(ids_or_pair_ids, (list, tuple)) and len(ids_or_pair_ids) == 2:
assert len(ids_or_pair_ids) == 2
ids, pair_ids = ids_or_pair_ids ids, pair_ids = ids_or_pair_ids
else: else:
ids, pair_ids = ids_or_pair_ids, None ids, pair_ids = ids_or_pair_ids, None
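With the relaxed check above, ``batch_encode_plus`` accepts either plain texts or ``(text, text_pair)`` 2-tuples; a hedged sketch of both input shapes, assuming ``tokenizer`` is any instantiated tokenizer::

    single_batch = ["first sentence", "second sentence"]
    pair_batch = [("a question", "its context"), ("another question", "more context")]

    encoded_singles = tokenizer.batch_encode_plus(single_batch)
    encoded_pairs = tokenizer.batch_encode_plus(pair_batch)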
...@@ -1789,7 +1800,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizer): ...@@ -1789,7 +1800,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizer):
# Throw an error if we can pad because there is no padding token # Throw an error if we can pad because there is no padding token
if pad_to_max_length and self.pad_token_id is None: if pad_to_max_length and self.pad_token_id is None:
raise ValueError("Unable to set proper padding strategy as the tokenizer does have padding token") raise ValueError("Unable to set proper padding strategy as the tokenizer does not have a padding token")
# Set the truncation and padding strategy and restore the initial configuration # Set the truncation and padding strategy and restore the initial configuration
with truncate_and_pad( with truncate_and_pad(
......
...@@ -21,6 +21,7 @@ import os ...@@ -21,6 +21,7 @@ import os
import re import re
import sys import sys
import unicodedata import unicodedata
from typing import List, Optional
import sacremoses as sm import sacremoses as sm
...@@ -530,20 +531,59 @@ class XLMTokenizer(PreTrainedTokenizer): ...@@ -530,20 +531,59 @@ class XLMTokenizer(PreTrainedTokenizer):
""" """
BPE tokenizer for XLM BPE tokenizer for XLM
- Moses preprocessing & tokenization for most supported languages - Moses preprocessing & tokenization for most supported languages
- Language specific tokenization for Chinese (Jieba), Japanese (KyTea) and Thai (PyThaiNLP)
- Language specific tokenization for Chinese (Jieba), Japanese (KyTea) and Thai (PyThaiNLP) - (optionally) lower case & normalize all inputs text
- argument ``special_tokens`` and function ``set_special_tokens``, can be used to add additional symbols \
- (optionally) lower case & normalize all inputs text (ex: "__classify__") to a vocabulary
- `lang2id` attribute maps the languages supported by the model with their ids if provided (automatically set for pretrained vocabularies)
- argument ``special_tokens`` and function ``set_special_tokens``, can be used to add additional symbols \ - `id2lang` attributes does reverse mapping if provided (automatically set for pretrained vocabularies)
(ex: "__classify__") to a vocabulary
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
- `lang2id` attribute maps the languages supported by the model with their ids if provided (automatically set for pretrained vocabularies) should refer to the superclass for more information regarding methods.
- `id2lang` attributes does reverse mapping if provided (automatically set for pretrained vocabularies) Args:
vocab_file (:obj:`string`):
- `do_lowercase_and_remove_accent` controls lower casing and accent removal (automatically set for pretrained vocabularies) Vocabulary file.
merges_file (:obj:`string`):
Merges file.
do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to lowercase the input when tokenizing.
remove_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to strip the text when tokenizing (removing excess spaces before and after the string).
keep_accents (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether to keep accents when tokenizing.
unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
bos_token (:obj:`string`, `optional`, defaults to "<s>"):
The beginning of sequence token that was used during pre-training. Can be used as a sequence classifier token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the beginning
of sequence. The token used is the :obj:`cls_token`.
sep_token (:obj:`string`, `optional`, defaults to "</s>"):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
for sequence classification or for a text and a question for question answering.
It is also used as the last token of a sequence built with special tokens.
pad_token (:obj:`string`, `optional`, defaults to "<pad>"):
The token used for padding, for example when batching sequences of different lengths.
cls_token (:obj:`string`, `optional`, defaults to "</s>"):
The classifier token which is used when doing sequence classification (classification of the whole
sequence instead of per-token classification). It is the first token of the sequence when built with
special tokens.
mask_token (:obj:`string`, `optional`, defaults to "<special1>"):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<special0>","<special1>","<special2>","<special3>","<special4>","<special5>","<special6>","<special7>","<special8>","<special9>"]`):
List of additional special tokens.
lang2id (:obj:`Dict[str, int]`, `optional`, defaults to :obj:`None`):
Dictionary mapping languages string identifiers to their IDs.
id2lang (:obj:`Dict[int, str]`, `optional`, defaults to :obj:`None`):
Dictionary mapping language IDs to their string identifiers.
do_lowercase_and_remove_accent (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to lowercase and remove accents when tokenizing.
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES
...@@ -812,13 +852,26 @@ class XLMTokenizer(PreTrainedTokenizer): ...@@ -812,13 +852,26 @@ class XLMTokenizer(PreTrainedTokenizer):
out_string = "".join(tokens).replace("</w>", " ").strip() out_string = "".join(tokens).replace("</w>", " ").strip()
return out_string return out_string
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
""" """
Build model inputs from a sequence or a pair of sequence for sequence classification tasks Build model inputs from a sequence or a pair of sequence for sequence classification tasks
by concatenating and adding special tokens. by concatenating and adding special tokens.
A XLM sequence has the following format: A XLM sequence has the following format:
single sequence: <s> X </s>
pair of sequences: <s> A </s> B </s> - single sequence: ``<s> X </s>``
- pair of sequences: ``<s> A </s> B </s>``
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs to which the special tokens will be added
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
""" """
if token_ids_1 is None: if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
...@@ -826,20 +879,23 @@ class XLMTokenizer(PreTrainedTokenizer): ...@@ -826,20 +879,23 @@ class XLMTokenizer(PreTrainedTokenizer):
cls = [self.cls_token_id] cls = [self.cls_token_id]
return cls + token_ids_0 + sep + token_ids_1 + sep return cls + token_ids_0 + sep + token_ids_1 + sep
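Unlike the RoBERTa/CamemBERT layout above, XLM uses a single separator between the two sequences; a minimal sketch with made-up IDs::

    cls_id, sep_id = 0, 1                 # hypothetical <s> and </s> IDs
    ids_a, ids_b = [10, 11], [20, 21]

    single = [cls_id] + ids_a + [sep_id]                     # <s> X </s>
    pair = [cls_id] + ids_a + [sep_id] + ids_b + [sep_id]    # <s> A </s> B </s>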
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
""" """
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
Args: Args:
token_ids_0: list of ids (must not contain special tokens) token_ids_0 (:obj:`List[int]`):
token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids List of ids.
for sequence pairs token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
already_has_special_tokens: (default False) Set to True if the token list is already formated with Optional second list of IDs for sequence pairs.
special tokens for the model already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Set to True if the token list is already formatted with special tokens for the model
Returns: Returns:
A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
""" """
if already_has_special_tokens: if already_has_special_tokens:
...@@ -854,14 +910,29 @@ class XLMTokenizer(PreTrainedTokenizer): ...@@ -854,14 +910,29 @@ class XLMTokenizer(PreTrainedTokenizer):
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1] return [1] + ([0] * len(token_ids_0)) + [1]
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
""" """
Creates a mask from the two sequences passed to be used in a sequence-pair classification task. Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
An XLM sequence pair mask has the following format: An XLM sequence pair mask has the following format:
0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
| first sequence | second sequence
if token_ids_1 is None, only returns the first portion of the mask (0's). ::
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
if token_ids_1 is None, only returns the first portion of the mask (0s).
Args:
token_ids_0 (:obj:`List[int]`):
List of ids.
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
sequence(s).
""" """
sep = [self.sep_token_id] sep = [self.sep_token_id]
cls = [self.cls_token_id] cls = [self.cls_token_id]
...@@ -870,7 +941,16 @@ class XLMTokenizer(PreTrainedTokenizer): ...@@ -870,7 +941,16 @@ class XLMTokenizer(PreTrainedTokenizer):
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
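Continuing the made-up IDs from the previous sketch, the XLM token type IDs line up with that layout as follows::

    ids_a, ids_b = [10, 11], [20, 21]
    token_type_ids = (1 + len(ids_a) + 1) * [0] + (len(ids_b) + 1) * [1]
    # -> [0, 0, 0, 0, 1, 1, 1] for <s> A </s> B </s>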
def save_vocabulary(self, save_directory): def save_vocabulary(self, save_directory):
"""Save the tokenizer vocabulary and merge files to a directory.""" """
Save the vocabulary and special tokens file to a directory.
Args:
save_directory (:obj:`str`):
The directory in which to save the vocabulary.
Returns:
:obj:`Tuple(str)`: Paths to the files saved.
"""
if not os.path.isdir(save_directory): if not os.path.isdir(save_directory):
logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
return return
......
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
import logging import logging
import os import os
from shutil import copyfile from shutil import copyfile
from typing import List, Optional
from transformers.tokenization_utils import PreTrainedTokenizer from transformers.tokenization_utils import PreTrainedTokenizer
...@@ -54,7 +55,50 @@ class XLMRobertaTokenizer(PreTrainedTokenizer): ...@@ -54,7 +55,50 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
Adapted from RobertaTokenizer and XLNetTokenizer Adapted from RobertaTokenizer and XLNetTokenizer
SentencePiece based tokenizer. Peculiarities: SentencePiece based tokenizer. Peculiarities:
- requires `SentencePiece <https://github.com/google/sentencepiece>`_ - requires `SentencePiece <https://github.com/google/sentencepiece>`_
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
should refer to the superclass for more information regarding methods.
Args:
vocab_file (:obj:`str`):
Path to the vocabulary file.
bos_token (:obj:`string`, `optional`, defaults to "<s>"):
The beginning of sequence token that was used during pre-training. Can be used as a sequence classifier token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the beginning
of sequence. The token used is the :obj:`cls_token`.
eos_token (:obj:`string`, `optional`, defaults to "</s>"):
The end of sequence token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the end
of sequence. The token used is the :obj:`sep_token`.
sep_token (:obj:`string`, `optional`, defaults to "</s>"):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
for sequence classification or for a text and a question for question answering.
It is also used as the last token of a sequence built with special tokens.
cls_token (:obj:`string`, `optional`, defaults to "<s>"):
The classifier token which is used when doing sequence classification (classification of the whole
sequence instead of per-token classification). It is the first token of the sequence when built with
special tokens.
unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (:obj:`string`, `optional`, defaults to "<pad>"):
The token used for padding, for example when batching sequences of different lengths.
mask_token (:obj:`string`, `optional`, defaults to "<mask>"):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
Additional special tokens used by the tokenizer.
Attributes:
sp_model (:obj:`SentencePieceProcessor`):
The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES
...@@ -132,35 +176,52 @@ class XLMRobertaTokenizer(PreTrainedTokenizer): ...@@ -132,35 +176,52 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
self.sp_model = spm.SentencePieceProcessor() self.sp_model = spm.SentencePieceProcessor()
self.sp_model.Load(self.vocab_file) self.sp_model.Load(self.vocab_file)
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
""" """
Build model inputs from a sequence or a pair of sequence for sequence classification tasks Build model inputs from a sequence or a pair of sequence for sequence classification tasks
by concatenating and adding special tokens. by concatenating and adding special tokens.
A RoBERTa sequence has the following format: A XLM-R sequence has the following format:
single sequence: <s> X </s>
pair of sequences: <s> A </s></s> B </s> - single sequence: ``<s> X </s>``
- pair of sequences: ``<s> A </s></s> B </s>``
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs to which the special tokens will be added
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
""" """
if token_ids_1 is None: if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
cls = [self.cls_token_id] cls = [self.cls_token_id]
sep = [self.sep_token_id] sep = [self.sep_token_id]
return cls + token_ids_0 + sep + sep + token_ids_1 + sep return cls + token_ids_0 + sep + sep + token_ids_1 + sep
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
""" """
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
Args: Args:
token_ids_0: list of ids (must not contain special tokens) token_ids_0 (:obj:`List[int]`):
token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids List of ids.
for sequence pairs token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
already_has_special_tokens: (default False) Set to True if the token list is already formated with Optional second list of IDs for sequence pairs.
special tokens for the model already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Set to True if the token list is already formatted with special tokens for the model
Returns: Returns:
A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
""" """
if already_has_special_tokens: if already_has_special_tokens:
if token_ids_1 is not None: if token_ids_1 is not None:
raise ValueError( raise ValueError(
...@@ -173,12 +234,24 @@ class XLMRobertaTokenizer(PreTrainedTokenizer): ...@@ -173,12 +234,24 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
return [1] + ([0] * len(token_ids_0)) + [1] return [1] + ([0] * len(token_ids_0)) + [1]
return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
""" """
Creates a mask from the two sequences passed to be used in a sequence-pair classification task. Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
RoBERTa does not make use of token type ids, therefore a list of zeros is returned. XLM-R does not make use of token type ids, therefore a list of zeros is returned.
if token_ids_1 is None, only returns the first portion of the mask (0's).
Args:
token_ids_0 (:obj:`List[int]`):
List of ids.
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of zeros.
""" """
sep = [self.sep_token_id] sep = [self.sep_token_id]
cls = [self.cls_token_id] cls = [self.cls_token_id]
...@@ -216,8 +289,15 @@ class XLMRobertaTokenizer(PreTrainedTokenizer): ...@@ -216,8 +289,15 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
return out_string return out_string
def save_vocabulary(self, save_directory): def save_vocabulary(self, save_directory):
""" Save the sentencepiece vocabulary (copy original file) and special tokens file """
to a directory. Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory.
Args:
save_directory (:obj:`str`):
The directory in which to save the vocabulary.
Returns:
:obj:`Tuple(str)`: Paths to the files saved.
""" """
if not os.path.isdir(save_directory): if not os.path.isdir(save_directory):
logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
......
...@@ -19,6 +19,7 @@ import logging ...@@ -19,6 +19,7 @@ import logging
import os import os
import unicodedata import unicodedata
from shutil import copyfile from shutil import copyfile
from typing import List, Optional
from .tokenization_utils import PreTrainedTokenizer from .tokenization_utils import PreTrainedTokenizer
...@@ -51,9 +52,57 @@ SEG_ID_PAD = 4 ...@@ -51,9 +52,57 @@ SEG_ID_PAD = 4
class XLNetTokenizer(PreTrainedTokenizer): class XLNetTokenizer(PreTrainedTokenizer):
""" """
SentencePiece based tokenizer. Peculiarities: Constructs an XLNet tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__.
- requires `SentencePiece <https://github.com/google/sentencepiece>`_ This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
should refer to the superclass for more information regarding methods.
Args:
vocab_file (:obj:`string`):
`SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a .spm extension) that
contains the vocabulary necessary to instantiate a tokenizer.
do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to lowercase the input when tokenizing.
remove_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to strip the text when tokenizing (removing excess spaces before and after the string).
keep_accents (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether to keep accents when tokenizing.
bos_token (:obj:`string`, `optional`, defaults to "<s>"):
The beginning of sequence token that was used during pre-training. Can be used as a sequence classifier token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the beginning
of sequence. The token used is the :obj:`cls_token`.
eos_token (:obj:`string`, `optional`, defaults to "</s>"):
The end of sequence token.
.. note::
When building a sequence using special tokens, this is not the token that is used for the end
of sequence. The token used is the :obj:`sep_token`.
unk_token (:obj:`string`, `optional`, defaults to "<unk>"):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
sep_token (:obj:`string`, `optional`, defaults to "<sep>"):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
for sequence classification or for a text and a question for question answering.
It is also used as the last token of a sequence built with special tokens.
pad_token (:obj:`string`, `optional`, defaults to "<pad>"):
The token used for padding, for example when batching sequences of different lengths.
cls_token (:obj:`string`, `optional`, defaults to "<cls>"):
The classifier token which is used when doing sequence classification (classification of the whole
sequence instead of per-token classification). It is the first token of the sequence when built with
special tokens.
mask_token (:obj:`string`, `optional`, defaults to "<mask>"):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<eop>", "<eod>"]`):
Additional special tokens used by the tokenizer.
Attributes:
sp_model (:obj:`SentencePieceProcessor`):
The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES
...@@ -189,13 +238,25 @@ class XLNetTokenizer(PreTrainedTokenizer): ...@@ -189,13 +238,25 @@ class XLNetTokenizer(PreTrainedTokenizer):
out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip() out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
return out_string return out_string
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
""" """
Build model inputs from a sequence or a pair of sequence for sequence classification tasks Build model inputs from a sequence or a pair of sequence for sequence classification tasks
by concatenating and adding special tokens. by concatenating and adding special tokens.
An XLNet sequence has the following format: An XLNet sequence has the following format:
single sequence: X <sep> <cls>
pair of sequences: A <sep> B <sep> <cls> - single sequence: ``X <sep> <cls>``
- pair of sequences: ``A <sep> B <sep> <cls>``
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs to which the special tokens will be added
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
""" """
sep = [self.sep_token_id] sep = [self.sep_token_id]
cls = [self.cls_token_id] cls = [self.cls_token_id]
...@@ -203,20 +264,23 @@ class XLNetTokenizer(PreTrainedTokenizer): ...@@ -203,20 +264,23 @@ class XLNetTokenizer(PreTrainedTokenizer):
return token_ids_0 + sep + cls return token_ids_0 + sep + cls
return token_ids_0 + sep + token_ids_1 + sep + cls return token_ids_0 + sep + token_ids_1 + sep + cls
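Unlike the tokenizers above, XLNet appends its classifier token at the end of the sequence; a minimal sketch with hypothetical IDs::

    sep_id, cls_id = 4, 3                 # hypothetical <sep> and <cls> IDs
    ids_a, ids_b = [10, 11], [20, 21]

    single = ids_a + [sep_id, cls_id]                         # X <sep> <cls>
    pair = ids_a + [sep_id] + ids_b + [sep_id, cls_id]        # A <sep> B <sep> <cls>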
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
""" """
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
Args: Args:
token_ids_0: list of ids (must not contain special tokens) token_ids_0 (:obj:`List[int]`):
token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids List of ids.
for sequence pairs token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
already_has_special_tokens: (default False) Set to True if the token list is already formated with Optional second list of IDs for sequence pairs.
special tokens for the model already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
Set to True if the token list is already formatted with special tokens for the model
Returns: Returns:
A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
""" """
if already_has_special_tokens: if already_has_special_tokens:
...@@ -231,7 +295,9 @@ class XLNetTokenizer(PreTrainedTokenizer): ...@@ -231,7 +295,9 @@ class XLNetTokenizer(PreTrainedTokenizer):
return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1, 1] return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1, 1]
return ([0] * len(token_ids_0)) + [1, 1] return ([0] * len(token_ids_0)) + [1, 1]
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
""" """
Creates a mask from the two sequences passed to be used in a sequence-pair classification task. Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
An XLNet sequence pair mask has the following format: An XLNet sequence pair mask has the following format:
...@@ -239,6 +305,16 @@ class XLNetTokenizer(PreTrainedTokenizer): ...@@ -239,6 +305,16 @@ class XLNetTokenizer(PreTrainedTokenizer):
| first sequence | second sequence | CLS segment ID | first sequence | second sequence | CLS segment ID
if token_ids_1 is None, only returns the first portion of the mask (0's). if token_ids_1 is None, only returns the first portion of the mask (0's).
Args:
token_ids_0 (:obj:`List[int]`):
List of ids.
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
sequence(s).
""" """
sep = [self.sep_token_id] sep = [self.sep_token_id]
cls_segment_id = [2] cls_segment_id = [2]
...@@ -248,8 +324,15 @@ class XLNetTokenizer(PreTrainedTokenizer): ...@@ -248,8 +324,15 @@ class XLNetTokenizer(PreTrainedTokenizer):
return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + cls_segment_id return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + cls_segment_id
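Using the same hypothetical ID lists, the XLNet token type IDs end with the dedicated CLS segment ID (2)::

    ids_a, ids_b = [10, 11], [20, 21]
    token_type_ids = (len(ids_a) + 1) * [0] + (len(ids_b) + 1) * [1] + [2]
    # -> [0, 0, 0, 1, 1, 1, 2] for A <sep> B <sep> <cls>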
def save_vocabulary(self, save_directory): def save_vocabulary(self, save_directory):
""" Save the sentencepiece vocabulary (copy original file) and special tokens file """
to a directory. Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory.
Args:
save_directory (:obj:`str`):
The directory in which to save the vocabulary.
Returns:
:obj:`Tuple(str)`: Paths to the files saved.
""" """
if not os.path.isdir(save_directory): if not os.path.isdir(save_directory):
logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
......
...@@ -594,7 +594,7 @@ def main(): ...@@ -594,7 +594,7 @@ def main():
# Setup CUDA, GPU & distributed training # Setup CUDA, GPU & distributed training
if args.local_rank == -1 or args.no_cuda: if args.local_rank == -1 or args.no_cuda:
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
args.n_gpu = torch.cuda.device_count() args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
else: # Initializes the distributed backend which will take care of synchronizing nodes/GPUs else: # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
torch.cuda.set_device(args.local_rank) torch.cuda.set_device(args.local_rank)
device = torch.device("cuda", args.local_rank) device = torch.device("cuda", args.local_rank)
......
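A hedged sketch of the corrected device setup above in context, assuming an ``args`` namespace with ``local_rank`` and ``no_cuda`` attributes as in the example scripts::

    import torch

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        # Report 0 GPUs when CUDA is explicitly disabled, even on a CUDA machine.
        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    else:
        # Distributed training: one process per GPU (rest of the setup unchanged).
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)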
...@@ -57,8 +57,18 @@ class ConfigTester(object): ...@@ -57,8 +57,18 @@ class ConfigTester(object):
self.parent.assertEqual(config_second.to_dict(), config_first.to_dict()) self.parent.assertEqual(config_second.to_dict(), config_first.to_dict())
def create_and_test_config_with_num_labels(self):
config = self.config_class(**self.inputs_dict, num_labels=5)
self.parent.assertEqual(len(config.id2label), 5)
self.parent.assertEqual(len(config.label2id), 5)
config.num_labels = 3
self.parent.assertEqual(len(config.id2label), 3)
self.parent.assertEqual(len(config.label2id), 3)
def run_common_tests(self): def run_common_tests(self):
self.create_and_test_config_common_properties() self.create_and_test_config_common_properties()
self.create_and_test_config_to_json_string() self.create_and_test_config_to_json_string()
self.create_and_test_config_to_json_file() self.create_and_test_config_to_json_file()
self.create_and_test_config_from_and_save_pretrained() self.create_and_test_config_from_and_save_pretrained()
self.create_and_test_config_with_num_labels()
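The new ``create_and_test_config_with_num_labels`` check exercises the fact that a config keeps ``id2label`` and ``label2id`` in sync with ``num_labels``; a hedged sketch of that behaviour with a concrete config class (``BertConfig`` is chosen here purely for illustration)::

    from transformers import BertConfig

    config = BertConfig(num_labels=5)
    assert len(config.id2label) == len(config.label2id) == 5

    config.num_labels = 3   # updating the property regenerates both mappings
    assert len(config.id2label) == len(config.label2id) == 3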
...@@ -78,6 +78,7 @@ class TestCodeExamples(unittest.TestCase): ...@@ -78,6 +78,7 @@ class TestCodeExamples(unittest.TestCase):
for file in files: for file in files:
# Open all files # Open all files
print("Testing", file, end=" ")
with open(os.path.join(directory, file)) as f: with open(os.path.join(directory, file)) as f:
# Retrieve examples # Retrieve examples
examples = get_examples_from_file(f) examples = get_examples_from_file(f)
...@@ -99,7 +100,7 @@ class TestCodeExamples(unittest.TestCase): ...@@ -99,7 +100,7 @@ class TestCodeExamples(unittest.TestCase):
joined_examples.append(example) joined_examples.append(example)
joined_examples_index += 1 joined_examples_index += 1
print("Testing", file, str(len(joined_examples)) + "/" + str(len(joined_examples))) print(str(len(joined_examples)) + "/" + str(len(joined_examples)))
# Execute sub tests with every example. # Execute sub tests with every example.
for index, code_example in enumerate(joined_examples): for index, code_example in enumerate(joined_examples):
...@@ -114,7 +115,8 @@ class TestCodeExamples(unittest.TestCase): ...@@ -114,7 +115,8 @@ class TestCodeExamples(unittest.TestCase):
def test_main_doc_examples(self): def test_main_doc_examples(self):
doc_directory = "docs/source" doc_directory = "docs/source"
self.analyze_directory(doc_directory) ignore_files = ["favicon.ico"]
self.analyze_directory(doc_directory, ignore_files=ignore_files)
def test_modeling_examples(self): def test_modeling_examples(self):
transformers_directory = "src/transformers" transformers_directory = "src/transformers"
...@@ -125,5 +127,7 @@ class TestCodeExamples(unittest.TestCase): ...@@ -125,5 +127,7 @@ class TestCodeExamples(unittest.TestCase):
"modeling_tf_auto.py", "modeling_tf_auto.py",
"modeling_utils.py", "modeling_utils.py",
"modeling_tf_t5.py", "modeling_tf_t5.py",
"modeling_bart.py",
"modeling_tf_utils.py",
] ]
self.analyze_directory(transformers_directory, identifier=modeling_files, ignore_files=ignore_files) self.analyze_directory(transformers_directory, identifier=modeling_files, ignore_files=ignore_files)