Unverified commit d58926ab, authored by Nicolas Patry and committed by GitHub

Moving fill-mask pipeline to new testing scheme (#12943)

* Fill mask pipelines test updates.

* Model eval !!

* Adding slow test with actual values.

* Making all tests pass (skipping quite a bit.)

* Doc styling.

* Better doc cleanup.

* Making an explicit test with no pad token tokenizer.

* Typo.
parent a04d4bf2
@@ -748,6 +748,8 @@ class Pipeline(_ScikitCompat):
         Parse arguments and tokenize
         """
         # Parse arguments
+        if self.tokenizer.pad_token is None:
+            padding = False
         inputs = self.tokenizer(
             inputs,
             add_special_tokens=add_special_tokens,
......
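The two lines added to `Pipeline._parse_and_tokenize` guard batched tokenization: padding requires a pad token, so it is disabled whenever the tokenizer does not define one. A minimal standalone sketch of the same guard, using `gpt2`, whose tokenizer ships without a pad token:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # no pad token defined

padding = True
if tokenizer.pad_token is None:
    # Padding a batch needs a pad token; fall back to unpadded encoding.
    padding = False

# With padding disabled, the encodings may have different lengths.
encoded = tokenizer(["Hello world", "Hi"], padding=padding)
print([len(ids) for ids in encoded["input_ids"]])
```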
-from typing import TYPE_CHECKING, Optional, Union
+from typing import TYPE_CHECKING, Dict, List, Optional, Union

 import numpy as np
@@ -9,6 +9,8 @@ from ..utils import logging
 from .base import PIPELINE_INIT_ARGS, ArgumentHandler, Pipeline, PipelineException

+GenericTensor = Union[List["GenericTensor"], "torch.Tensor", "tf.Tensor"]
+
 if TYPE_CHECKING:
     from ..modeling_tf_utils import TFPreTrainedModel
     from ..modeling_utils import PreTrainedModel
@@ -30,7 +32,13 @@ logger = logging.get_logger(__name__)
 @add_end_docstrings(
     PIPELINE_INIT_ARGS,
     r"""
-        top_k (:obj:`int`, defaults to 5): The number of predictions to return.
+        top_k (:obj:`int`, defaults to 5):
+            The number of predictions to return.
+        targets (:obj:`str` or :obj:`List[str]`, `optional`):
+            When passed, the model will limit the scores to the passed targets instead of looking up in the whole
+            vocab. If the provided targets are not in the model vocab, they will be tokenized and the first resulting
+            token will be used (with a warning, and that might be slower).
     """,
 )
 class FillMaskPipeline(Pipeline):
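A usage sketch of the new `targets` argument documented above; the checkpoint is illustrative (any masked-LM model with a mask token works):

```python
from transformers import pipeline

fill_mask = pipeline("fill-mask", model="distilroberta-base")

# Score only the given candidates instead of the whole vocabulary.
# A target missing from the vocab falls back to its first sub-token,
# with a warning.
print(fill_mask("The capital of France is <mask>.", targets=["Paris", "London"]))
```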
@@ -59,6 +67,7 @@ class FillMaskPipeline(Pipeline):
         args_parser: ArgumentHandler = None,
         device: int = -1,
         top_k=5,
+        targets=None,
         task: str = "",
     ):
         super().__init__(
@@ -74,8 +83,23 @@
         self.check_model_type(TF_MODEL_WITH_LM_HEAD_MAPPING if self.framework == "tf" else MODEL_FOR_MASKED_LM_MAPPING)
         self.top_k = top_k
+        self.targets = targets
+        if self.tokenizer.mask_token_id is None:
+            raise PipelineException(
+                "fill-mask", self.model.base_model_prefix, "The tokenizer does not define a `mask_token`."
+            )

-    def ensure_exactly_one_mask_token(self, masked_index: np.ndarray):
+    def get_masked_index(self, input_ids: GenericTensor) -> np.ndarray:
+        if self.framework == "tf":
+            masked_index = tf.where(input_ids == self.tokenizer.mask_token_id).numpy()
+        elif self.framework == "pt":
+            masked_index = torch.nonzero(input_ids == self.tokenizer.mask_token_id, as_tuple=False)
+        else:
+            raise ValueError("Unsupported framework")
+        return masked_index
+
+    def _ensure_exactly_one_mask_token(self, input_ids: GenericTensor) -> np.ndarray:
+        masked_index = self.get_masked_index(input_ids)
         numel = np.prod(masked_index.shape)
         if numel > 1:
             raise PipelineException(
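`get_masked_index` factors the framework-specific mask lookup into a single helper, and `_ensure_exactly_one_mask_token` now takes raw `input_ids` instead of a precomputed index. A framework-free sketch of the same lookup in plain numpy (ids illustrative; 103 is `[MASK]` for `bert-base-uncased`):

```python
import numpy as np

mask_token_id = 103  # [MASK] in bert-base-uncased; illustrative
input_ids = np.array([101, 2023, 103, 2003, 102])

# Analogous to the tf.where / torch.nonzero branches above.
masked_index = np.nonzero(input_ids == mask_token_id)[0]
print(masked_index)  # [2] -> exactly one mask, so validation passes
```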
@@ -90,7 +114,25 @@
                 f"No mask_token ({self.tokenizer.mask_token}) found on the input",
             )

-    def __call__(self, *args, targets=None, top_k: Optional[int] = None, **kwargs):
+    def ensure_exactly_one_mask_token(self, model_inputs: GenericTensor):
+        if isinstance(model_inputs, list):
+            for model_input in model_inputs:
+                self._ensure_exactly_one_mask_token(model_input["input_ids"][0])
+        else:
+            for input_ids in model_inputs["input_ids"]:
+                self._ensure_exactly_one_mask_token(input_ids)
+
+    def get_model_inputs(self, inputs, *args, **kwargs) -> Dict:
+        if isinstance(inputs, list) and self.tokenizer.pad_token is None:
+            model_inputs = []
+            for input_ in inputs:
+                model_input = self._parse_and_tokenize(input_, padding=False, *args, **kwargs)
+                model_inputs.append(model_input)
+        else:
+            model_inputs = self._parse_and_tokenize(inputs, *args, **kwargs)
+        return model_inputs
+
+    def __call__(self, inputs, *args, targets=None, top_k: Optional[int] = None, **kwargs):
         """
         Fill the masked token in the text(s) given as inputs.
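Two things now happen before any forward pass: `get_model_inputs` tokenizes list inputs one sample at a time when the tokenizer cannot pad, and `ensure_exactly_one_mask_token` validates every sample up front. A sketch of both outside the pipeline (checkpoint illustrative; the pad token is cleared by hand to force the per-sample path):

```python
from transformers import AutoTokenizer, pipeline

tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")
tokenizer.pad_token = None  # force the no-pad-token path, for illustration

inputs = ["Hello <mask>.", "The <mask> is blue today."]

if isinstance(inputs, list) and tokenizer.pad_token is None:
    # One forward-ready encoding per sample; no padding required.
    model_inputs = [tokenizer(text, return_tensors="pt") for text in inputs]
else:
    model_inputs = tokenizer(inputs, padding=True, return_tensors="pt")

# Validation happens before the model runs; both calls below would raise
# a PipelineException:
fill_mask = pipeline("fill-mask", model="distilroberta-base")
# fill_mask("No mask here.")             # no mask token in the input
# fill_mask("<mask> and <mask> twice.")  # more than one mask token
```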
@@ -112,16 +154,27 @@
             - **token** (:obj:`int`) -- The predicted token id (to replace the masked one).
             - **token_str** (:obj:`str`) -- The predicted token (to replace the masked one).
         """
-        inputs = self._parse_and_tokenize(*args, **kwargs)
-        outputs = self._forward(inputs, return_tensors=True)
+        model_inputs = self.get_model_inputs(inputs, *args, **kwargs)
+        self.ensure_exactly_one_mask_token(model_inputs)
+
+        if isinstance(model_inputs, list):
+            outputs = []
+            for model_input in model_inputs:
+                output = self._forward(model_input, return_tensors=True)
+                outputs.append(output)
+            batch_size = len(model_inputs)
+        else:
+            outputs = self._forward(model_inputs, return_tensors=True)
+            batch_size = outputs.shape[0] if self.framework == "tf" else outputs.size(0)

         # top_k must be defined
         if top_k is None:
             top_k = self.top_k

         results = []
-        batch_size = outputs.shape[0] if self.framework == "tf" else outputs.size(0)
+        if targets is None and self.targets is not None:
+            targets = self.targets

         if targets is not None:
             if isinstance(targets, str):
                 targets = [targets]
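A sketch of the two forward paths in the new `__call__`: per-sample calls when the inputs could not be padded into a single batch, one batched call otherwise (PyTorch shown, checkpoint illustrative):

```python
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer

name = "distilroberta-base"  # illustrative
tokenizer = AutoTokenizer.from_pretrained(name)
model = AutoModelForMaskedLM.from_pretrained(name)
model.eval()  # inference mode, per the "Model eval !!" commit above

texts = ["Hello <mask>.", "The <mask> is blue."]
model_inputs = [tokenizer(t, return_tensors="pt") for t in texts]

with torch.no_grad():
    if isinstance(model_inputs, list):
        # Per-sample forwards; sequence lengths may differ.
        outputs = [model(**mi).logits for mi in model_inputs]
        batch_size = len(model_inputs)
    else:
        outputs = model(**model_inputs).logits
        batch_size = outputs.size(0)
```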
@@ -167,16 +220,21 @@
             top_k = target_ids.shape[0]

         for i in range(batch_size):
-            input_ids = inputs["input_ids"][i]
+            if isinstance(model_inputs, list):
+                input_ids = model_inputs[i]["input_ids"][0]
+            else:
+                input_ids = model_inputs["input_ids"][i]
             result = []

             if self.framework == "tf":
                 masked_index = tf.where(input_ids == self.tokenizer.mask_token_id).numpy()

-                # Fill mask pipeline supports only one ${mask_token} per sample
-                self.ensure_exactly_one_mask_token(masked_index)
-
-                logits = outputs[i, masked_index.item(), :]
+                if isinstance(outputs, list):
+                    logits = outputs[i][0, masked_index.item(), :]
+                else:
+                    logits = outputs[i, masked_index.item(), :]
                 probs = tf.nn.softmax(logits)
                 if targets is not None:
                     probs = tf.gather_nd(probs, tf.reshape(target_ids, (-1, 1)))
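`target_ids` holds the vocabulary ids of the requested targets; per the docstring, a target that is not a single vocabulary entry is tokenized and only its first sub-token is scored, with a warning. A sketch of that mapping (the word is chosen to be multi-token; actual splits vary by tokenizer):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

target = "antidisestablishmentarianism"  # illustrative multi-token word
ids = tokenizer(target, add_special_tokens=False)["input_ids"]
target_id = ids[0]  # only the first sub-token would be scored
print(tokenizer.convert_ids_to_tokens(ids), "->", target_id)
```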
@@ -185,11 +243,12 @@
                 values, predictions = topk.values.numpy(), topk.indices.numpy()
             else:
                 masked_index = torch.nonzero(input_ids == self.tokenizer.mask_token_id, as_tuple=False)

-                # Fill mask pipeline supports only one ${mask_token} per sample
-                self.ensure_exactly_one_mask_token(masked_index.numpy())
-
-                logits = outputs[i, masked_index.item(), :]
+                if isinstance(outputs, list):
+                    logits = outputs[i][0, masked_index.item(), :]
+                else:
+                    logits = outputs[i, masked_index.item(), :]
                 probs = logits.softmax(dim=0)
                 if targets is not None:
                     probs = probs[..., target_ids]
......
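In the PyTorch branch, `probs[..., target_ids]` restricts scoring to the candidate ids before top-k. The same two lines in isolation (sizes and ids illustrative):

```python
import torch

vocab_size = 30522                       # bert-base-uncased size; illustrative
logits = torch.randn(vocab_size)         # one logit row at the masked position
target_ids = torch.tensor([2003, 2001])  # two candidate ids; illustrative

probs = logits.softmax(dim=0)
probs = probs[..., target_ids]           # keep only the candidates' scores
values, predictions = probs.topk(min(2, probs.numel()))
print(values, predictions)               # predictions index into target_ids
```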
@@ -189,6 +189,7 @@ class ReformerModelTester:
     def get_pipeline_config(self):
         config = self.get_config()
         config.vocab_size = 100
+        config.is_decoder = False
         return config

     def create_and_check_reformer_model(self, config, input_ids, input_mask, choice_labels):
......
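Setting `is_decoder = False` matters because fill-mask needs bidirectional attention: a causal (decoder) Reformer cannot attend to tokens after the mask position. A sketch of the overridden fields on a default config (the tester's other tiny-model fields are omitted):

```python
from transformers import ReformerConfig

config = ReformerConfig()
config.vocab_size = 100    # tiny vocab keeps the pipeline test fast
config.is_decoder = False  # masked-LM tests need non-causal attention
```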
@@ -74,10 +74,10 @@ def get_tiny_config_from_class(configuration_class):
 @lru_cache(maxsize=100)
 def get_tiny_tokenizer_from_checkpoint(checkpoint):
     tokenizer = AutoTokenizer.from_pretrained(checkpoint)
-    logger.warning("Training new from iterator ...")
+    logger.info("Training new from iterator ...")
     vocabulary = string.ascii_letters + string.digits + " "
     tokenizer = tokenizer.train_new_from_iterator(vocabulary, vocab_size=len(vocabulary), show_progress=False)
-    logger.warning("Trained.")
+    logger.info("Trained.")
     return tokenizer
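Standalone, the helper above retrains a fast tokenizer on a throwaway alphabet so every generated pipeline test gets a tiny vocabulary quickly:

```python
import string
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # any fast tokenizer works
vocabulary = string.ascii_letters + string.digits + " "
tiny = tokenizer.train_new_from_iterator(
    vocabulary, vocab_size=len(vocabulary), show_progress=False
)
print(len(tiny))  # vocabulary size of the retrained tokenizer
```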
@@ -109,9 +109,7 @@ class PipelineTestCaseMeta(type):
                 # Some test tokenizer contain broken vocabs or custom PreTokenizer, so we
                 # provide some default tokenizer and hope for the best.
                 except:  # noqa: E722
-                    logger.warning(f"Tokenizer cannot be created from checkpoint {checkpoint}")
-                    tokenizer = get_tiny_tokenizer_from_checkpoint("gpt2")
-                    tokenizer.model_max_length = model.config.max_position_embeddings
+                    self.skipTest(f"Ignoring {ModelClass}, cannot create a simple tokenizer")
                 self.run_pipeline_test(model, tokenizer)
             return test
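The old fallback quietly substituted a gpt2-based tokenizer when a checkpoint's tokenizer could not be built; the new code skips the generated test instead. The pattern in isolation (`build_tokenizer` is a hypothetical stand-in for the checkpoint-specific setup):

```python
import unittest

def build_tokenizer(checkpoint):
    # Hypothetical stand-in; imagine a checkpoint with a broken vocab.
    raise ValueError(f"broken vocab in {checkpoint}")

class ExamplePipelineTest(unittest.TestCase):
    def test_pipeline(self):
        try:
            tokenizer = build_tokenizer("some-checkpoint")
        except Exception:  # e.g. broken vocab or custom PreTokenizer
            self.skipTest("cannot create a simple tokenizer")
        # ... the real test would exercise the pipeline here ...

if __name__ == "__main__":
    unittest.main()
```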
......