"git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "e05baad86119efcbf1e2dae64eef0c29f589b07b"
Commit 941b4442 authored by Lysandre

Temporarily revert the `fill-mask` improvements.

parent 4bdff2cd
@@ -98,9 +98,9 @@ class FillMaskPipeline(Pipeline):
             args (:obj:`str` or :obj:`List[str]`):
                 One or several texts (or one list of prompts) with masked tokens.
             targets (:obj:`str` or :obj:`List[str]`, `optional`):
-                When passed, the model will limit the scores to the passed targets instead of looking up in the whole
-                vocab. If the provided targets are not in the model vocab, they will be tokenized and the first
-                resulting token will be used (with a warning, and that might be slower).
+                When passed, the model will return the scores for the passed token or tokens rather than the top k
+                predictions in the entire vocabulary. If the provided targets are not in the model vocab, they will be
+                tokenized and the first resulting token will be used (with a warning).
             top_k (:obj:`int`, `optional`):
                 When passed, overrides the number of predictions to return.
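Note: to make the documented `targets`/`top_k` behavior concrete, here is a minimal usage sketch. The checkpoint name and the printed result fields are assumptions, not part of this diff.

# Minimal usage sketch; "distilroberta-base" and the "token_str"/"score"
# result keys are assumptions based on the standard fill-mask output format.
from transformers import pipeline

unmasker = pipeline(task="fill-mask", model="distilroberta-base")

# With targets, scores are restricted to the given tokens instead of the
# whole vocabulary.
for result in unmasker("My name is <mask>", targets=[" Patrick", " Clara"]):
    print(result["token_str"], result["score"])

# Without targets, top_k controls how many vocabulary-wide predictions return.
print(len(unmasker("My name is <mask>", top_k=2)))  # -> 2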
@@ -115,56 +115,25 @@ class FillMaskPipeline(Pipeline):
         inputs = self._parse_and_tokenize(*args, **kwargs)
         outputs = self._forward(inputs, return_tensors=True)
 
-        # top_k must be defined
-        if top_k is None:
-            top_k = self.top_k
-
         results = []
         batch_size = outputs.shape[0] if self.framework == "tf" else outputs.size(0)
 
         if targets is not None:
-            if len(targets) == 0 or len(targets[0]) == 0:
-                raise ValueError("At least one target must be provided when passed.")
             if isinstance(targets, str):
                 targets = [targets]
 
-            try:
-                vocab = self.tokenizer.get_vocab()
-            except Exception:
-                vocab = {}
-            target_ids = []
+            targets_proc = []
             for target in targets:
-                id_ = vocab.get(target, None)
-                if id_ is None:
-                    input_ids = self.tokenizer(
-                        target,
-                        add_special_tokens=False,
-                        return_attention_mask=False,
-                        return_token_type_ids=False,
-                        max_length=1,
-                        truncation=True,
-                    )["input_ids"]
-                    if len(input_ids) == 0:
-                        logger.warning(
-                            f"The specified target token `{target}` does not exist in the model vocabulary. "
-                            f"We cannot replace it with anything meaningful, ignoring it"
-                        )
-                        continue
-                    id_ = input_ids[0]
-                    # XXX: If users encounter this pass
-                    # it becomes pretty slow, so let's make sure
-                    # The warning enables them to fix the input to
-                    # get faster performance.
-                    logger.warning(
-                        f"The specified target token `{target}` does not exist in the model vocabulary. "
-                        f"Replacing with `{self.tokenizer.convert_ids_to_tokens(id_)}`."
-                    )
-                target_ids.append(id_)
-            target_ids = list(set(target_ids))
-            if len(target_ids) == 0:
-                raise ValueError("At least one target must be provided when passed.")
-            target_ids = np.array(target_ids)
-            # Cap top_k if there are targets
-            if top_k > target_ids.shape[0]:
-                top_k = target_ids.shape[0]
+                target_enc = self.tokenizer.tokenize(target)
+                if len(target_enc) > 1 or target_enc[0] == self.tokenizer.unk_token:
+                    logger.warning(
+                        f"The specified target token `{target}` does not exist in the model vocabulary. "
+                        f"Replacing with `{target_enc[0]}`."
+                    )
+                targets_proc.append(target_enc[0])
+            target_inds = np.array(self.tokenizer.convert_tokens_to_ids(targets_proc))
 
         for i in range(batch_size):
             input_ids = inputs["input_ids"][i]
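Note: the hunk above swaps two target-resolution strategies. The removed code looks each target up in the vocabulary first and only falls back to tokenization; the restored code always tokenizes and keeps the first sub-token. A standalone sketch of the vocabulary-first approach (not the pipeline's API; the checkpoint name is an assumption):

import logging

from transformers import AutoTokenizer

logger = logging.getLogger(__name__)
tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")

def resolve_target_id(target: str) -> int:
    # Prefer a direct vocabulary hit; this is exact and needs no tokenization.
    id_ = tokenizer.get_vocab().get(target)
    if id_ is None:
        # Fall back to tokenizing and keeping only the first sub-token,
        # mirroring the warning path in the removed pipeline code.
        sub_tokens = tokenizer.tokenize(target)
        id_ = tokenizer.convert_tokens_to_ids(sub_tokens)[0]
        logger.warning("`%s` is not a single vocabulary token; using `%s`.", target, sub_tokens[0])
    return id_

print(resolve_target_id("ĠPatrick"))  # likely a direct hit in BPE token form
print(resolve_target_id(" Patrick"))  # resolved through the tokenizer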
@@ -178,11 +147,14 @@ class FillMaskPipeline(Pipeline):
                 logits = outputs[i, masked_index.item(), :]
                 probs = tf.nn.softmax(logits)
-                if targets is not None:
-                    probs = tf.gather_nd(probs, tf.reshape(target_ids, (-1, 1)))
-
-                topk = tf.math.top_k(probs, k=top_k)
-                values, predictions = topk.values.numpy(), topk.indices.numpy()
+                if targets is None:
+                    topk = tf.math.top_k(probs, k=top_k if top_k is not None else self.top_k)
+                    values, predictions = topk.values.numpy(), topk.indices.numpy()
+                else:
+                    values = tf.gather_nd(probs, tf.reshape(target_inds, (-1, 1)))
+                    sort_inds = tf.reverse(tf.argsort(values), [0])
+                    values = tf.gather_nd(values, tf.reshape(sort_inds, (-1, 1))).numpy()
+                    predictions = target_inds[sort_inds.numpy()]
             else:
                 masked_index = torch.nonzero(input_ids == self.tokenizer.mask_token_id, as_tuple=False)
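Note: the restored TensorFlow branch gathers the softmax scores at the target ids and re-sorts them in descending order. A self-contained sketch of that gather-and-sort pattern on made-up data:

import numpy as np
import tensorflow as tf

probs = tf.constant([0.05, 0.40, 0.15, 0.30, 0.10])  # softmax over a toy "vocab"
target_inds = np.array([0, 3, 4])                    # ids scoring is restricted to

values = tf.gather_nd(probs, tf.reshape(target_inds, (-1, 1)))
sort_inds = tf.reverse(tf.argsort(values), [0])      # descending by probability
values = tf.gather_nd(values, tf.reshape(sort_inds, (-1, 1))).numpy()
predictions = target_inds[sort_inds.numpy()]

print(predictions, values)  # -> [3 4 0] [0.3 0.1 0.05]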
@@ -191,11 +163,13 @@ class FillMaskPipeline(Pipeline):
                 logits = outputs[i, masked_index.item(), :]
                 probs = logits.softmax(dim=0)
-                if targets is not None:
-                    probs = probs[..., target_ids]
-
-                values, predictions = probs.topk(top_k)
+                if targets is None:
+                    values, predictions = probs.topk(top_k if top_k is not None else self.top_k)
+                else:
+                    values = probs[..., target_inds]
+                    sort_inds = list(reversed(values.argsort(dim=-1)))
+                    values = values[..., sort_inds]
+                    predictions = target_inds[sort_inds]
 
             for v, p in zip(values.tolist(), predictions.tolist()):
                 tokens = input_ids.numpy()
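Note: the restored PyTorch branch does the same restrict-then-sort with fancy indexing; `topk` over the gathered values is an equivalent way to express the descending sort. A sketch on the same made-up data:

import torch

probs = torch.tensor([0.05, 0.40, 0.15, 0.30, 0.10])
target_inds = torch.tensor([0, 3, 4])

values = probs[..., target_inds]               # restrict scores to the targets
values, order = values.topk(len(target_inds))  # sort descending by probability
predictions = target_inds[order]

print(predictions.tolist())  # -> [3, 4, 0], scores in descending order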
......
@@ -78,8 +78,7 @@ class FillMaskPipelineTests(MonoInputPipelineCommonMixin, unittest.TestCase):
     @require_torch
     def test_torch_fill_mask_with_targets(self):
         valid_inputs = ["My name is <mask>"]
-        # ' Sam' will yield a warning but work
-        valid_targets = [[" Teven", "ĠPatrick", "ĠClara"], ["ĠSam"], [" Sam"]]
+        valid_targets = [[" Teven", " Patrick", " Clara"], [" Sam"]]
         invalid_targets = [[], [""], ""]
         for model_name in self.small_models:
             unmasker = pipeline(task="fill-mask", model=model_name, tokenizer=model_name, framework="pt")
@@ -90,34 +89,10 @@ class FillMaskPipelineTests(MonoInputPipelineCommonMixin, unittest.TestCase):
             for targets in invalid_targets:
                 self.assertRaises(ValueError, unmasker, valid_inputs, targets=targets)
 
-    @require_torch
-    def test_torch_fill_mask_with_targets_and_topk(self):
-        model_name = self.small_models[0]
-        unmasker = pipeline(task="fill-mask", model=model_name, tokenizer=model_name, framework="pt")
-
-        targets = [" Teven", "ĠPatrick", "ĠClara"]
-        top_k = 2
-        outputs = unmasker("My name is <mask>", targets=targets, top_k=top_k)
-        self.assertEqual(len(outputs), 2)
-
-    @require_torch
-    def test_torch_fill_mask_with_duplicate_targets_and_topk(self):
-        model_name = self.small_models[0]
-        unmasker = pipeline(task="fill-mask", model=model_name, tokenizer=model_name, framework="pt")
-
-        # String duplicates + id duplicates
-        targets = [" Teven", "ĠPatrick", "ĠClara", "ĠClara", " Clara"]
-        top_k = 10
-        outputs = unmasker("My name is <mask>", targets=targets, top_k=top_k)
-
-        # The target list contains duplicates, so we can't output more
-        # than them
-        self.assertEqual(len(outputs), 3)
-
     @require_tf
     def test_tf_fill_mask_with_targets(self):
         valid_inputs = ["My name is <mask>"]
-        # ' Sam' will yield a warning but work
-        valid_targets = [[" Teven", "ĠPatrick", "ĠClara"], ["ĠSam"], [" Sam"]]
+        valid_targets = [[" Teven", " Patrick", " Clara"], [" Sam"]]
         invalid_targets = [[], [""], ""]
         for model_name in self.small_models:
             unmasker = pipeline(task="fill-mask", model=model_name, tokenizer=model_name, framework="tf")
@@ -136,7 +111,7 @@ class FillMaskPipelineTests(MonoInputPipelineCommonMixin, unittest.TestCase):
             "My name is <mask>",
             "The largest city in France is <mask>",
         ]
-        valid_targets = ["ĠPatrick", "ĠClara"]
+        valid_targets = [" Patrick", " Clara"]
         for model_name in self.large_models:
             unmasker = pipeline(
                 task="fill-mask",
@@ -209,7 +184,7 @@ class FillMaskPipelineTests(MonoInputPipelineCommonMixin, unittest.TestCase):
             "My name is <mask>",
             "The largest city in France is <mask>",
         ]
-        valid_targets = ["ĠPatrick", "ĠClara"]
+        valid_targets = [" Patrick", " Clara"]
         for model_name in self.large_models:
             unmasker = pipeline(task="fill-mask", model=model_name, tokenizer=model_name, framework="tf", top_k=2)
......