"src/git@developer.sourcefind.cn:gaoqiong/migraphx.git" did not exist on "888e25fbd0029f1c753e8466f736088e1cc46868"
Unverified Commit 29b536a7 authored by Ceyda Cinarel's avatar Ceyda Cinarel Committed by GitHub
Browse files

[WIP] Ner pipeline grouped_entities fixes (#5970)



* Bug fix: NER pipeline shouldn't group separate entities of same type

* style fix

* [Bug Fix] Shouldn't group entities that are both 'B' even if they are same type
	(B-type1 B-type1) != (B-type1 I-type1)
[Bug Fix] add an option `ignore_subwords` to ignore subsequent ##wordpieces in predictions. Because some models train on only the first token of a word and not on the subsequent wordpieces (BERT NER default). So it makes sense doing the same thing at inference time.
	The simplest fix is to just group the subwords with the first wordpiece.
	[TODO] how to handle ignored scores? just set them to 0 and calculate zero invariant mean ?
	[TODO] handle different wordpiece_prefix ## ? possible approaches:
		get it from tokenizer? but currently most tokenizers dont have a wordpiece_prefix property?
		have an _is_subword(token)
[Feature add] added option to `skip_special_tokens`. Cause It was harder to remove them after grouping.
[Additional Changes] remove B/I prefix on returned grouped_entities
[Feature Request/TODO] Return indexes?
[Bug TODO]  can't use fast tokenizer with grouped_entities ('BertTokenizerFast' object has no attribute 'convert_tokens_to_string')

* use offset_mapping to fix [UNK] token problem

* ignore score for subwords

* modify ner_pipeline test

* modify ner_pipeline test

* modify ner_pipeline test

* ner_pipeline change ignore_subwords default to true

* add ner_pipeline ignore_subword=False test case

* fix offset_mapping index

* fix style again duh

* change is_subword and convert_tokens_to_string logic

* merge tests with new test structure

* change test names

* remove old tests

* ner tests for fast tokenizer

* fast tokenizers have convert_tokens_to_string

* Fix the incorrect merge
Co-authored-by: default avatarCeyda Cinarel <snu-ceyda@users.noreply.github.com>
Co-authored-by: default avatarLysandre Debut <lysandre@huggingface.co>
Co-authored-by: default avatarLysandre <lysandre.debut@reseau.eseo.fr>
parent 1bb4bba5
......@@ -1324,6 +1324,29 @@ class FillMaskPipeline(Pipeline):
return results
class TokenClassificationArgumentHandler(ArgumentHandler):
"""
Handles arguments for token classification.
"""
def __call__(self, *args, **kwargs):
if args is not None and len(args) > 0:
if isinstance(args, str):
inputs = [args]
else:
inputs = args
batch_size = len(inputs)
offset_mapping = kwargs.get("offset_mapping", None)
if offset_mapping:
if isinstance(offset_mapping, list) and isinstance(offset_mapping[0], tuple):
offset_mapping = [offset_mapping]
if len(offset_mapping) != batch_size:
raise ("offset_mapping should have the same batch size as the input")
return inputs, offset_mapping
@add_end_docstrings(
PIPELINE_INIT_ARGS,
r"""
......@@ -1361,13 +1384,14 @@ class TokenClassificationPipeline(Pipeline):
ignore_labels=["O"],
task: str = "",
grouped_entities: bool = False,
ignore_subwords: bool = True,
):
super().__init__(
model=model,
tokenizer=tokenizer,
modelcard=modelcard,
framework=framework,
args_parser=args_parser,
args_parser=TokenClassificationArgumentHandler(),
device=device,
binary_output=binary_output,
task=task,
......@@ -1382,6 +1406,7 @@ class TokenClassificationPipeline(Pipeline):
self._basic_tokenizer = BasicTokenizer(do_lower_case=False)
self.ignore_labels = ignore_labels
self.grouped_entities = grouped_entities
self.ignore_subwords = ignore_subwords
def __call__(self, inputs: Union[str, List[str]], **kwargs):
"""
......@@ -1402,10 +1427,15 @@ class TokenClassificationPipeline(Pipeline):
- **index** (:obj:`int`, only present when ``self.grouped_entities=False``) -- The index of the
corresponding token in the sentence.
"""
if isinstance(inputs, str):
inputs = [inputs]
offset_mappings = kwargs.get("offset_mappings")
answers = []
for sentence in inputs:
for i, sentence in enumerate(inputs):
# Manage correct placement of the tensors
with self.device_placement():
......@@ -1415,7 +1445,18 @@ class TokenClassificationPipeline(Pipeline):
return_attention_mask=False,
return_tensors=self.framework,
truncation=True,
)
return_special_tokens_mask=True,
return_offsets_mapping=self.tokenizer.is_fast,
)
if self.tokenizer.is_fast:
offset_mapping = tokens["offset_mapping"].cpu().numpy()[0]
del tokens["offset_mapping"]
elif offset_mappings:
offset_mapping = offset_mappings[i]
else:
raise Exception("To decode [UNK] tokens use a fast tokenizer or provide offset_mapping parameter")
special_tokens_mask = tokens["special_tokens_mask"].cpu().numpy()[0]
del tokens["special_tokens_mask"]
# Forward
if self.framework == "tf":
......@@ -1432,24 +1473,35 @@ class TokenClassificationPipeline(Pipeline):
entities = []
# Filter to labels not in `self.ignore_labels`
# Filter special_tokens
filtered_labels_idx = [
(idx, label_idx)
for idx, label_idx in enumerate(labels_idx)
if self.model.config.id2label[label_idx] not in self.ignore_labels
if (self.model.config.id2label[label_idx] not in self.ignore_labels) and not special_tokens_mask[idx]
]
for idx, label_idx in filtered_labels_idx:
start_ind, end_ind = offset_mapping[idx]
word_ref = sentence[start_ind:end_ind]
word = self.tokenizer.convert_ids_to_tokens([int(input_ids[idx])])[0]
is_subword = len(word_ref) != len(word)
if int(input_ids[idx]) == self.tokenizer.unk_token_id:
word = word_ref
is_subword = False
entity = {
"word": self.tokenizer.convert_ids_to_tokens(int(input_ids[idx])),
"word": word,
"score": score[idx][label_idx].item(),
"entity": self.model.config.id2label[label_idx],
"index": idx,
}
if self.grouped_entities and self.ignore_subwords:
entity["is_subword"] = is_subword
entities += [entity]
# Append grouped entities
if self.grouped_entities:
answers += [self.group_entities(entities)]
# Append ungrouped entities
......@@ -1468,8 +1520,8 @@ class TokenClassificationPipeline(Pipeline):
entities (:obj:`dict`): The entities predicted by the pipeline.
"""
# Get the first entity in the entity group
entity = entities[0]["entity"]
scores = np.mean([entity["score"] for entity in entities])
entity = entities[0]["entity"].split("-")[-1]
scores = np.nanmean([entity["score"] for entity in entities])
tokens = [entity["word"] for entity in entities]
entity_group = {
......@@ -1494,7 +1546,9 @@ class TokenClassificationPipeline(Pipeline):
last_idx = entities[-1]["index"]
for entity in entities:
is_last_idx = entity["index"] == last_idx
is_subword = self.ignore_subwords and entity["is_subword"]
if not entity_group_disagg:
entity_group_disagg += [entity]
if is_last_idx:
......@@ -1503,10 +1557,19 @@ class TokenClassificationPipeline(Pipeline):
# If the current entity is similar and adjacent to the previous entity, append it to the disaggregated entity group
# The split is meant to account for the "B" and "I" suffixes
# Shouldn't merge if both entities are B-type
if (
(
entity["entity"].split("-")[-1] == entity_group_disagg[-1]["entity"].split("-")[-1]
and entity["entity"].split("-")[0] != "B"
)
and entity["index"] == entity_group_disagg[-1]["index"] + 1
):
) or is_subword:
# Modify subword type to be previous_type
if is_subword:
entity["entity"] = entity_group_disagg[-1]["entity"].split("-")[-1]
entity["score"] = np.nan # set ignored scores to nan and use np.nanmean
entity_group_disagg += [entity]
# Group the entities at the last entity
if is_last_idx:
......
import unittest
from transformers import pipeline
from transformers import AutoTokenizer, pipeline
from transformers.pipelines import Pipeline
from transformers.testing_utils import require_tf
from transformers.testing_utils import require_tf, require_torch
from .test_pipelines_common import CustomInputPipelineCommonMixin
......@@ -19,38 +19,54 @@ class NerPipelineTests(CustomInputPipelineCommonMixin, unittest.TestCase):
def _test_pipeline(self, nlp: Pipeline):
output_keys = {"entity", "word", "score"}
if nlp.grouped_entities:
output_keys = {"entity_group", "word", "score"}
ungrouped_ner_inputs = [
[
{"entity": "B-PER", "index": 1, "score": 0.9994944930076599, "word": "Cons"},
{"entity": "B-PER", "index": 2, "score": 0.8025449514389038, "word": "##uelo"},
{"entity": "I-PER", "index": 3, "score": 0.9993102550506592, "word": "Ara"},
{"entity": "I-PER", "index": 4, "score": 0.9993743896484375, "word": "##új"},
{"entity": "I-PER", "index": 5, "score": 0.9992871880531311, "word": "##o"},
{"entity": "I-PER", "index": 6, "score": 0.9993029236793518, "word": "No"},
{"entity": "I-PER", "index": 7, "score": 0.9981776475906372, "word": "##guera"},
{"entity": "B-PER", "index": 15, "score": 0.9998136162757874, "word": "Andrés"},
{"entity": "I-PER", "index": 16, "score": 0.999740719795227, "word": "Pas"},
{"entity": "I-PER", "index": 17, "score": 0.9997414350509644, "word": "##tran"},
{"entity": "I-PER", "index": 18, "score": 0.9996136426925659, "word": "##a"},
{"entity": "B-ORG", "index": 28, "score": 0.9989739060401917, "word": "Far"},
{"entity": "I-ORG", "index": 29, "score": 0.7188422083854675, "word": "##c"},
{"entity": "B-PER", "index": 1, "score": 0.9994944930076599, "is_subword": False, "word": "Cons"},
{"entity": "B-PER", "index": 2, "score": 0.8025449514389038, "is_subword": True, "word": "##uelo"},
{"entity": "I-PER", "index": 3, "score": 0.9993102550506592, "is_subword": False, "word": "Ara"},
{"entity": "I-PER", "index": 4, "score": 0.9993743896484375, "is_subword": True, "word": "##új"},
{"entity": "I-PER", "index": 5, "score": 0.9992871880531311, "is_subword": True, "word": "##o"},
{"entity": "I-PER", "index": 6, "score": 0.9993029236793518, "is_subword": False, "word": "No"},
{"entity": "I-PER", "index": 7, "score": 0.9981776475906372, "is_subword": True, "word": "##guera"},
{"entity": "B-PER", "index": 15, "score": 0.9998136162757874, "is_subword": False, "word": "Andrés"},
{"entity": "I-PER", "index": 16, "score": 0.999740719795227, "is_subword": False, "word": "Pas"},
{"entity": "I-PER", "index": 17, "score": 0.9997414350509644, "is_subword": True, "word": "##tran"},
{"entity": "I-PER", "index": 18, "score": 0.9996136426925659, "is_subword": True, "word": "##a"},
{"entity": "B-ORG", "index": 28, "score": 0.9989739060401917, "is_subword": False, "word": "Far"},
{"entity": "I-ORG", "index": 29, "score": 0.7188422083854675, "is_subword": True, "word": "##c"},
],
[
{"entity": "I-PER", "index": 1, "score": 0.9968166351318359, "word": "En"},
{"entity": "I-PER", "index": 2, "score": 0.9957635998725891, "word": "##zo"},
{"entity": "I-ORG", "index": 7, "score": 0.9986497163772583, "word": "UN"},
{"entity": "I-PER", "index": 1, "score": 0.9968166351318359, "is_subword": False, "word": "En"},
{"entity": "I-PER", "index": 2, "score": 0.9957635998725891, "is_subword": True, "word": "##zo"},
{"entity": "I-ORG", "index": 7, "score": 0.9986497163772583, "is_subword": False, "word": "UN"},
],
]
expected_grouped_ner_results = [
[
{"entity_group": "B-PER", "score": 0.9710702640669686, "word": "Consuelo Araújo Noguera"},
{"entity_group": "B-PER", "score": 0.9997273534536362, "word": "Andrés Pastrana"},
{"entity_group": "B-ORG", "score": 0.8589080572128296, "word": "Farc"},
{"entity_group": "PER", "score": 0.999369223912557, "word": "Consuelo Araújo Noguera"},
{"entity_group": "PER", "score": 0.9997771680355072, "word": "Andrés Pastrana"},
{"entity_group": "ORG", "score": 0.9989739060401917, "word": "Farc"},
],
[
{"entity_group": "PER", "score": 0.9968166351318359, "word": "Enzo"},
{"entity_group": "ORG", "score": 0.9986497163772583, "word": "UN"},
],
]
expected_grouped_ner_results_w_subword = [
[
{"entity_group": "PER", "score": 0.9994944930076599, "word": "Cons"},
{"entity_group": "PER", "score": 0.9663328925768534, "word": "##uelo Araújo Noguera"},
{"entity_group": "PER", "score": 0.9997273534536362, "word": "Andrés Pastrana"},
{"entity_group": "ORG", "score": 0.8589080572128296, "word": "Farc"},
],
[
{"entity_group": "I-PER", "score": 0.9962901175022125, "word": "Enzo"},
{"entity_group": "I-ORG", "score": 0.9986497163772583, "word": "UN"},
{"entity_group": "PER", "score": 0.9962901175022125, "word": "Enzo"},
{"entity_group": "ORG", "score": 0.9986497163772583, "word": "UN"},
],
]
......@@ -77,12 +93,80 @@ class NerPipelineTests(CustomInputPipelineCommonMixin, unittest.TestCase):
for key in output_keys:
self.assertIn(key, result)
if nlp.grouped_entities:
if nlp.ignore_subwords:
for ungrouped_input, grouped_result in zip(ungrouped_ner_inputs, expected_grouped_ner_results):
self.assertEqual(nlp.group_entities(ungrouped_input), grouped_result)
else:
for ungrouped_input, grouped_result in zip(
ungrouped_ner_inputs, expected_grouped_ner_results_w_subword
):
self.assertEqual(nlp.group_entities(ungrouped_input), grouped_result)
@require_tf
def test_tf_only(self):
model_name = "Narsil/small" # This model only has a TensorFlow version
# We test that if we don't specificy framework='tf', it gets detected automatically
nlp = pipeline(task="ner", model=model_name, tokenizer=model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
nlp = pipeline(task="ner", model=model_name, tokenizer=tokenizer)
self._test_pipeline(nlp)
# offset=tokenizer(VALID_INPUTS[0],return_offsets_mapping=True)['offset_mapping']
# pipeline_running_kwargs = {"offset_mapping"} # Additional kwargs to run the pipeline with
@require_tf
def test_tf_defaults(self):
for model_name in self.small_models:
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
nlp = pipeline(task="ner", model=model_name, tokenizer=tokenizer, framework="tf")
self._test_pipeline(nlp)
@require_tf
def test_tf_small(self):
for model_name in self.small_models:
print(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
nlp = pipeline(
task="ner",
model=model_name,
tokenizer=tokenizer,
framework="tf",
grouped_entities=True,
ignore_subwords=True,
)
self._test_pipeline(nlp)
for model_name in self.small_models:
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
nlp = pipeline(
task="ner",
model=model_name,
tokenizer=tokenizer,
framework="tf",
grouped_entities=True,
ignore_subwords=False,
)
self._test_pipeline(nlp)
@require_torch
def test_pt_defaults(self):
for model_name in self.small_models:
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
nlp = pipeline(task="ner", model=model_name, tokenizer=tokenizer)
self._test_pipeline(nlp)
@require_torch
def test_torch_small(self):
for model_name in self.small_models:
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
nlp = pipeline(
task="ner", model=model_name, tokenizer=tokenizer, grouped_entities=True, ignore_subwords=True
)
self._test_pipeline(nlp)
for model_name in self.small_models:
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
nlp = pipeline(
task="ner", model=model_name, tokenizer=tokenizer, grouped_entities=True, ignore_subwords=False
)
self._test_pipeline(nlp)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment