Unverified commit 9fa836a7, authored by Julien Chaumond, committed by GitHub

fill_mask helper (#2576)

* fill_mask helper

* [poc] FillMaskPipeline

* Revert "[poc] FillMaskPipeline"

This reverts commit 67eeea55b0f97b46c2b828de0f4ee97d87338335.

* Revert "fill_mask helper"

This reverts commit cacc17b884e14bb6b07989110ffe884ad9e36eaa.

* README: clarify that Pipelines can also do text-classification

cf. question at the AI&ML meetup last week, @mfuntowicz

* Fix test: test feature-extraction pipeline

* Test tweaks

* Slight refactor of existing pipeline (in preparation of new FillMaskPipeline)

* Extraneous doc

* More robust way of doing this

@mfuntowicz as we don't rely on the model name anymore (see AutoConfig)

* Also add RobertaConfig as a quickfix for wrong token_type_ids

* cs

* [BIG] FillMaskPipeline
parent b43cb09a
@@ -521,8 +521,9 @@ You can create `Pipeline` objects for the following down-stream tasks:
 - `feature-extraction`: Generates a tensor representation for the input sequence
 - `ner`: Generates named entity mapping for each word in the input sequence.
 - `sentiment-analysis`: Gives the polarity (positive / negative) of the whole input sequence.
-- `question-answering`: Provided some context and a question refering to the context, it will extract the answer to the question
-in the context.
+- `text-classification`: Initialize a `TextClassificationPipeline` directly, or see `sentiment-analysis` for an example.
+- `question-answering`: Provided some context and a question referring to the context, it will extract the answer to the question in the context.
+- `fill-mask`: Takes an input sequence containing a masked token (e.g. `<mask>`) and returns a list of the most probable filled sequences, with their probabilities.

 ```python
 from transformers import pipeline
......
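For orientation, a minimal usage sketch of the new task (not part of the diff); the completions and scores shown are abridged from the expectations in the tests added below, so treat them as illustrative:

```python
from transformers import pipeline

# Same checkpoint this PR registers as the task default.
nlp = pipeline("fill-mask", model="distilroberta-base", tokenizer="distilroberta-base", topk=2)

# One dict per returned completion of the single <mask> token.
print(nlp("The largest city in France is <mask>"))
# [{"sequence": "<s>The largest city in France is Paris</s>", "score": 0.272, "token": 2201},
#  {"sequence": "<s>The largest city in France is Lyon</s>", "score": 0.198, "token": 12790}]
```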
@@ -93,6 +93,7 @@ from .modeling_tf_pytorch_utils import (
 from .pipelines import (
     CsvPipelineDataFormat,
     FeatureExtractionPipeline,
+    FillMaskPipeline,
     JsonPipelineDataFormat,
     NerPipeline,
     PipedPipelineDataFormat,
......
@@ -28,7 +28,10 @@ from typing import Dict, List, Optional, Tuple, Union
 import numpy as np

 from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, AutoConfig
+from .configuration_distilbert import DistilBertConfig
+from .configuration_roberta import RobertaConfig
 from .configuration_utils import PretrainedConfig
+from .configuration_xlm import XLMConfig
 from .data import SquadExample, squad_convert_examples_to_features
 from .file_utils import is_tf_available, is_torch_available
 from .modelcard import ModelCard
@@ -44,6 +47,7 @@ if is_tf_available():
         TFAutoModelForSequenceClassification,
         TFAutoModelForQuestionAnswering,
         TFAutoModelForTokenClassification,
+        TFAutoModelWithLMHead,
     )

 if is_torch_available():
@@ -53,6 +57,7 @@ if is_torch_available():
         AutoModelForSequenceClassification,
         AutoModelForQuestionAnswering,
         AutoModelForTokenClassification,
+        AutoModelWithLMHead,
     )
@@ -64,7 +69,7 @@ def get_framework(model=None):
     If both frameworks are installed and no specific model is provided, defaults to using PyTorch.
     """
     if is_tf_available() and is_torch_available() and model is not None and not isinstance(model, str):
-        # Both framework are available but the use supplied a model class instance.
+        # Both frameworks are available but the user supplied a model class instance.
         # Try to guess which framework to use from the model classname
         framework = "tf" if model.__class__.__name__.startswith("TF") else "pt"
     elif not is_tf_available() and not is_torch_available():
@@ -364,7 +369,6 @@ class Pipeline(_ScikitCompat):
     def predict(self, X):
         """
         Scikit / Keras interface to transformers' pipelines. This method will forward to __call__().
-        Se
         """
         return self(X=X)
@@ -406,9 +410,8 @@ class Pipeline(_ScikitCompat):
             dict holding all the required parameters for model's forward
         """
         args = ["input_ids", "attention_mask"]
-        model_type = type(self.model).__name__.lower()
-        if "distilbert" not in model_type and "xlm" not in model_type:
+        if not isinstance(self.model.config, (DistilBertConfig, XLMConfig, RobertaConfig)):
             args += ["token_type_ids"]

         # PR #1548 (CLI) There is an issue with attention_mask
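The commit message calls this "a more robust way": dispatch on the config class resolved by `AutoConfig` instead of substring-matching the model class name. A sketch of the failure mode it fixes, using the `distilroberta-base` checkpoint this PR defaults to:

```python
from transformers import AutoConfig, RobertaConfig

# The old name check ("distilbert"/"xlm" in the model class name) missed
# RoBERTa checkpoints entirely, so they were fed token_type_ids they don't
# use. AutoConfig resolves "distilroberta-base" to a RobertaConfig, which
# the isinstance() check above catches regardless of the class name.
config = AutoConfig.from_pretrained("distilroberta-base")
assert isinstance(config, RobertaConfig)
```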
@@ -420,7 +423,10 @@ class Pipeline(_ScikitCompat):
         else:
             return {k: [feature[k] for feature in features] for k in args}

-    def __call__(self, *texts, **kwargs):
+    def _parse_and_tokenize(self, *texts, **kwargs):
+        """
+        Parse arguments and tokenize
+        """
         # Parse arguments
         inputs = self._args_parser(*texts, **kwargs)
         inputs = self.tokenizer.batch_encode_plus(
@@ -429,13 +435,19 @@ class Pipeline(_ScikitCompat):
         # Filter out features not available on specific models
         inputs = self.inputs_for_model(inputs)

+        return inputs
+
+    def __call__(self, *texts, **kwargs):
+        inputs = self._parse_and_tokenize(*texts, **kwargs)
         return self._forward(inputs)

-    def _forward(self, inputs):
+    def _forward(self, inputs, return_tensors=False):
         """
         Internal framework specific forward dispatching.

         Args:
             inputs: dict holding all the keyworded arguments required by the model forward method.
+            return_tensors: Whether to return native framework (pt/tf) tensors rather than numpy array.

         Returns:
             Numpy array
         """
@@ -449,6 +461,9 @@ class Pipeline(_ScikitCompat):
                 inputs = self.ensure_tensor_on_device(**inputs)
                 predictions = self.model(**inputs)[0].cpu()

-        return predictions.numpy()
+        if return_tensors:
+            return predictions
+        else:
+            return predictions.numpy()
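This split is what `FillMaskPipeline` (below) relies on: a subclass can reuse the shared parsing/tokenization and then post-process raw tensors instead of numpy arrays. A hypothetical minimal subclass to show the pattern, PyTorch path only (the class itself is not part of the PR):

```python
from transformers.pipelines import Pipeline

class ProbaPipeline(Pipeline):
    # Hypothetical subclass: reuse the shared preprocessing, ask _forward
    # for native tensors, then post-process freely.
    def __call__(self, *texts, **kwargs):
        inputs = self._parse_and_tokenize(*texts, **kwargs)
        logits = self._forward(inputs, return_tensors=True)  # torch.Tensor
        return logits.softmax(dim=-1)
```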
@@ -491,6 +506,71 @@ class TextClassificationPipeline(Pipeline):
         return [{"label": self.model.config.id2label[item.argmax()], "score": item.max()} for item in scores]


+class FillMaskPipeline(Pipeline):
+    """
+    Masked language modeling prediction pipeline using ModelWithLMHead head.
+    """
+
+    def __init__(
+        self,
+        model,
+        tokenizer: PreTrainedTokenizer = None,
+        modelcard: ModelCard = None,
+        framework: Optional[str] = None,
+        args_parser: ArgumentHandler = None,
+        device: int = -1,
+        topk=5,
+    ):
+        super().__init__(
+            model=model,
+            tokenizer=tokenizer,
+            modelcard=modelcard,
+            framework=framework,
+            args_parser=args_parser,
+            device=device,
+            binary_output=True,
+        )
+
+        self.topk = topk
+
+    def __call__(self, *args, **kwargs):
+        inputs = self._parse_and_tokenize(*args, **kwargs)
+        outputs = self._forward(inputs, return_tensors=True)
+
+        results = []
+        batch_size = outputs.shape[0] if self.framework == "tf" else outputs.size(0)
+
+        for i in range(batch_size):
+            input_ids = inputs["input_ids"][i]
+            result = []
+
+            if self.framework == "tf":
+                masked_index = tf.where(input_ids == self.tokenizer.mask_token_id).numpy().item()
+                logits = outputs[i, masked_index, :]
+                probs = tf.nn.softmax(logits)
+                topk = tf.math.top_k(probs, k=self.topk)
+                values, predictions = topk.values.numpy(), topk.indices.numpy()
+            else:
+                masked_index = (input_ids == self.tokenizer.mask_token_id).nonzero().item()
+                logits = outputs[i, masked_index, :]
+                probs = logits.softmax(dim=0)
+                values, predictions = probs.topk(self.topk)
+
+            for v, p in zip(values.tolist(), predictions.tolist()):
+                tokens = input_ids.numpy()
+                tokens[masked_index] = p
+                # Filter padding out:
+                tokens = tokens[np.where(tokens != self.tokenizer.pad_token_id)]
+                result.append({"sequence": self.tokenizer.decode(tokens), "score": v, "token": p})
+
+            # Append
+            results += [result]
+
+        if len(results) == 1:
+            return results[0]
+        return results
+
+
 class NerPipeline(Pipeline):
     """
     Named Entity Recognition pipeline using ModelForTokenClassification head.
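Stripped of batching and the TF branch, the core of `FillMaskPipeline.__call__` reduces to a few tensor operations. A standalone PyTorch sketch under the PR's default checkpoint (scores and ranks will vary with the model):

```python
from transformers import AutoModelWithLMHead, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")
model = AutoModelWithLMHead.from_pretrained("distilroberta-base")

input_ids = tokenizer.encode("The largest city in France is <mask>", return_tensors="pt")
logits = model(input_ids)[0]  # shape: (1, sequence_length, vocab_size)

# Locate the single <mask>, softmax over the vocabulary at that position, top-k.
masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
probs = logits[0, masked_index].softmax(dim=0)
values, predictions = probs.topk(5)
for score, token in zip(values.tolist(), predictions.tolist()):
    print(tokenizer.decode([token]).strip(), score)
```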
@@ -523,7 +603,8 @@ class NerPipeline(Pipeline):
         self.ignore_labels = ignore_labels

     def __call__(self, *texts, **kwargs):
-        inputs, answers = self._args_parser(*texts, **kwargs), []
+        inputs = self._args_parser(*texts, **kwargs)
+        answers = []
         for sentence in inputs:

             # Manage correct placement of the tensors
@@ -903,6 +984,16 @@ SUPPORTED_TASKS = {
             "tokenizer": "distilbert-base-uncased",
         },
     },
+    "fill-mask": {
+        "impl": FillMaskPipeline,
+        "tf": TFAutoModelWithLMHead if is_tf_available() else None,
+        "pt": AutoModelWithLMHead if is_torch_available() else None,
+        "default": {
+            "model": {"pt": "distilroberta-base", "tf": "distilroberta-base"},
+            "config": None,
+            "tokenizer": "distilroberta-base",
+        },
+    },
 }
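With this entry in place, the task name alone is enough for the `pipeline()` factory; when no model is passed it falls back to the "default" block above. A sketch, assuming the default weights are downloadable:

```python
from transformers import pipeline

nlp = pipeline("fill-mask")  # loads distilroberta-base model + tokenizer
print(type(nlp).__name__)    # FillMaskPipeline
print(nlp.topk)              # 5, the constructor default
```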
......
 import unittest
-from typing import Iterable
+from typing import Iterable, List, Optional

 from transformers import pipeline
+from transformers.pipelines import Pipeline

 from .utils import require_tf, require_torch
@@ -62,9 +63,25 @@ TEXT_CLASSIF_FINETUNED_MODELS = {
     )
 }

+FILL_MASK_FINETUNED_MODELS = {
+    ("distilroberta-base", "distilroberta-base", None),
+}
+
+TF_FILL_MASK_FINETUNED_MODELS = {
+    ("distilroberta-base", "distilroberta-base", None),
+}
+

 class MonoColumnInputTestCase(unittest.TestCase):
-    def _test_mono_column_pipeline(self, nlp, valid_inputs: list, invalid_inputs: list, output_keys: Iterable[str]):
+    def _test_mono_column_pipeline(
+        self,
+        nlp: Pipeline,
+        valid_inputs: List,
+        invalid_inputs: List,
+        output_keys: Iterable[str],
+        expected_multi_result: Optional[List] = None,
+        expected_check_keys: Optional[List[str]] = None,
+    ):
         self.assertIsNotNone(nlp)

         mono_result = nlp(valid_inputs[0])
@@ -81,6 +98,13 @@ class MonoColumnInputTestCase(unittest.TestCase):
         self.assertIsInstance(multi_result, list)
         self.assertIsInstance(multi_result[0], (dict, list))

+        if expected_multi_result is not None:
+            for result, expect in zip(multi_result, expected_multi_result):
+                for key in expected_check_keys or []:
+                    self.assertEqual(
+                        set([o[key] for o in result]), set([o[key] for o in expect]),
+                    )
+
         if isinstance(multi_result[0], list):
             multi_result = multi_result[0]
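Note the new assertion compares the *set* of values for each checked key, so the ordering of the top-k results does not matter, and keys that are not checked (such as the floating-point scores) are ignored. Schematically:

```python
result = [{"sequence": "Paris", "score": 0.272}, {"sequence": "Lyon", "score": 0.198}]
expect = [{"sequence": "Lyon"}, {"sequence": "Paris"}]  # different order, no scores
assert set(o["sequence"] for o in result) == set(o["sequence"] for o in expect)
```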
@@ -110,7 +134,7 @@ class MonoColumnInputTestCase(unittest.TestCase):
     @require_torch
     def test_sentiment_analysis(self):
-        mandatory_keys = {"label"}
+        mandatory_keys = {"label", "score"}
         valid_inputs = ["HuggingFace is solving NLP one commit at a time.", "HuggingFace is based in New-York & Paris"]
         invalid_inputs = [None]
         for tokenizer, model, config in TEXT_CLASSIF_FINETUNED_MODELS:
@@ -119,7 +143,7 @@ class MonoColumnInputTestCase(unittest.TestCase):
     @require_tf
     def test_tf_sentiment_analysis(self):
-        mandatory_keys = {"label"}
+        mandatory_keys = {"label", "score"}
         valid_inputs = ["HuggingFace is solving NLP one commit at a time.", "HuggingFace is based in New-York & Paris"]
         invalid_inputs = [None]
         for tokenizer, model, config in TF_TEXT_CLASSIF_FINETUNED_MODELS:
@@ -127,21 +151,87 @@ class MonoColumnInputTestCase(unittest.TestCase):
             self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, mandatory_keys)

     @require_torch
-    def test_features_extraction(self):
+    def test_feature_extraction(self):
         valid_inputs = ["HuggingFace is solving NLP one commit at a time.", "HuggingFace is based in New-York & Paris"]
         invalid_inputs = [None]
         for tokenizer, model, config in FEATURE_EXTRACT_FINETUNED_MODELS:
-            nlp = pipeline(task="sentiment-analysis", model=model, config=config, tokenizer=tokenizer)
+            nlp = pipeline(task="feature-extraction", model=model, config=config, tokenizer=tokenizer)
             self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, {})

     @require_tf
-    def test_tf_features_extraction(self):
+    def test_tf_feature_extraction(self):
         valid_inputs = ["HuggingFace is solving NLP one commit at a time.", "HuggingFace is based in New-York & Paris"]
         invalid_inputs = [None]
         for tokenizer, model, config in TF_FEATURE_EXTRACT_FINETUNED_MODELS:
-            nlp = pipeline(task="sentiment-analysis", model=model, config=config, tokenizer=tokenizer)
+            nlp = pipeline(task="feature-extraction", model=model, config=config, tokenizer=tokenizer)
             self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, {})

+    @require_torch
+    def test_fill_mask(self):
+        mandatory_keys = {"sequence", "score", "token"}
+        valid_inputs = [
+            "My name is <mask>",
+            "The largest city in France is <mask>",
+        ]
+        invalid_inputs = [None]
+        expected_multi_result = [
+            [
+                {"score": 0.008698059245944023, "sequence": "<s>My name is John</s>", "token": 610},
+                {"score": 0.007750614080578089, "sequence": "<s>My name is Chris</s>", "token": 1573},
+            ],
+            [
+                {"score": 0.2721288502216339, "sequence": "<s>The largest city in France is Paris</s>", "token": 2201},
+                {
+                    "score": 0.19764970242977142,
+                    "sequence": "<s>The largest city in France is Lyon</s>",
+                    "token": 12790,
+                },
+            ],
+        ]
+        for tokenizer, model, config in FILL_MASK_FINETUNED_MODELS:
+            nlp = pipeline(task="fill-mask", model=model, config=config, tokenizer=tokenizer, topk=2)
+            self._test_mono_column_pipeline(
+                nlp,
+                valid_inputs,
+                invalid_inputs,
+                mandatory_keys,
+                expected_multi_result=expected_multi_result,
+                expected_check_keys=["sequence"],
+            )
+
+    @require_tf
+    def test_tf_fill_mask(self):
+        mandatory_keys = {"sequence", "score", "token"}
+        valid_inputs = [
+            "My name is <mask>",
+            "The largest city in France is <mask>",
+        ]
+        invalid_inputs = [None]
+        expected_multi_result = [
+            [
+                {"score": 0.008698059245944023, "sequence": "<s>My name is John</s>", "token": 610},
+                {"score": 0.007750614080578089, "sequence": "<s>My name is Chris</s>", "token": 1573},
+            ],
+            [
+                {"score": 0.2721288502216339, "sequence": "<s>The largest city in France is Paris</s>", "token": 2201},
+                {
+                    "score": 0.19764970242977142,
+                    "sequence": "<s>The largest city in France is Lyon</s>",
+                    "token": 12790,
+                },
+            ],
+        ]
+        for tokenizer, model, config in TF_FILL_MASK_FINETUNED_MODELS:
+            nlp = pipeline(task="fill-mask", model=model, config=config, tokenizer=tokenizer, topk=2)
+            self._test_mono_column_pipeline(
+                nlp,
+                valid_inputs,
+                invalid_inputs,
+                mandatory_keys,
+                expected_multi_result=expected_multi_result,
+                expected_check_keys=["sequence"],
+            )
+

 class MultiColumnInputTestCase(unittest.TestCase):
     def _test_multicolumn_pipeline(self, nlp, valid_inputs: list, invalid_inputs: list, output_keys: Iterable[str]):
......