Unverified commit 042f4203, authored by David and committed by GitHub
Browse files

Update pipeline word heuristic to work with whitespace in token offsets (#18402)

* Update pipeline word heuristic to work with whitespace in token offsets

This change checks the input string for whitespace at either the
character preceding the token or the first character of the token.
This works both with tokenizers that return offsets excluding the
whitespace between words and with those whose offsets include it.

fixes #18111

starting

* Use smaller model, ensure expected tokenization

* Re-run CI (please squash)
parent c382ed8a
......@@ -291,7 +291,7 @@ class TokenClassificationPipeline(Pipeline):
AggregationStrategy.MAX,
}:
warnings.warn("Tokenizer does not support real words, using fallback heuristic", UserWarning)
is_subword = sentence[start_ind - 1 : start_ind] != " " if start_ind > 0 else False
is_subword = start_ind > 0 and " " not in sentence[start_ind - 1 : start_ind + 1]
if int(input_ids[idx]) == self.tokenizer.unk_token_id:
word = word_ref
......
......@@ -535,6 +535,20 @@ class TokenClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTest
[{"entity_group": "PER", "score": 0.35, "word": "Ramazotti", "start": 0, "end": 13}],
)
@require_torch
@slow
def test_aggregation_strategy_offsets_with_leading_space(self):
    """Run the "max" aggregation strategy on a DeBERTa NER checkpoint.

    The expected groups keep a leading space in their `word` field, so this
    test pins the grouping produced when words are reported that way.
    """
    text = "We're from New York"
    checkpoint = "brandon25/deberta-base-finetuned-ner"
    expected_groups = [
        {"entity_group": "O", "score": 1.0, "word": " We're from", "start": 0, "end": 10},
        {"entity_group": "LOC", "score": 1.0, "word": " New York", "start": 10, "end": 19},
    ]

    # An empty `ignore_labels` list is passed; the expected output includes the "O" group.
    tagger = pipeline("ner", model=checkpoint, ignore_labels=[], aggregation_strategy="max")
    predictions = nested_simplify(tagger(text))

    self.assertEqual(predictions, expected_groups)
@require_torch
def test_gather_pre_entities(self):
model_name = "sshleifer/tiny-dbmdz-bert-large-cased-finetuned-conll03-english"
......@@ -580,6 +594,41 @@ class TokenClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTest
],
)
@require_torch
def test_word_heuristic_leading_space(self):
    """Check the `is_subword` flags that `gather_pre_entities` assigns.

    The sentence is tokenized with a tiny DeBERTa-v2 checkpoint and the raw
    tokenizer outputs are fed straight into `gather_pre_entities`; the test
    then pins both the tokenization and the per-token `is_subword` value.
    """
    checkpoint = "hf-internal-testing/tiny-random-deberta-v2"
    fast_tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=True)
    ner_pipeline = pipeline(task="ner", model=checkpoint, tokenizer=fast_tokenizer, framework="pt")

    text = "I play the theremin"
    encoding = fast_tokenizer(
        text,
        return_attention_mask=False,
        return_tensors="pt",
        return_special_tokens_mask=True,
        return_offsets_mapping=True,
    )

    # Only one sentence was encoded, so take the first (and only) row of each output.
    offsets = encoding.pop("offset_mapping").cpu().numpy()[0]
    special_mask = encoding.pop("special_tokens_mask").cpu().numpy()[0]
    ids = encoding["input_ids"].numpy()[0]

    # One dummy score row per token; the values are irrelevant for the heuristic.
    dummy_scores = np.array([[1, 0]] * len(ids))

    pre_entities = ner_pipeline.gather_pre_entities(
        text,
        ids,
        dummy_scores,
        offsets,
        special_mask,
        aggregation_strategy=AggregationStrategy.FIRST,
    )

    # Ensure expected tokenization and correct is_subword values.
    observed = []
    for pre_entity in pre_entities:
        observed.append((pre_entity["word"], pre_entity["is_subword"]))

    self.assertEqual(
        observed,
        [("▁I", False), ("▁play", False), ("▁the", False), ("▁there", False), ("min", True)],
    )
@require_tf
def test_tf_only(self):
model_name = "hf-internal-testing/tiny-random-bert-tf-only" # This model only has a TensorFlow version
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment