Unverified Commit 3ec8171b authored by Ceyda Cinarel's avatar Ceyda Cinarel Committed by GitHub
Browse files

Bug fix: token classification pipeline when passing offset_mapping (#22034)

Fix the pipeline for slow tokenizers when offset_mapping is passed.
parent 1cbac686
...@@ -304,7 +304,9 @@ class TokenClassificationPipeline(Pipeline): ...@@ -304,7 +304,9 @@ class TokenClassificationPipeline(Pipeline):
start_ind = start_ind.item() start_ind = start_ind.item()
end_ind = end_ind.item() end_ind = end_ind.item()
word_ref = sentence[start_ind:end_ind] word_ref = sentence[start_ind:end_ind]
if getattr(self.tokenizer._tokenizer.model, "continuing_subword_prefix", None): if getattr(self.tokenizer, "_tokenizer", None) and getattr(
self.tokenizer._tokenizer.model, "continuing_subword_prefix", None
):
# This is a BPE, word aware tokenizer, there is a correct way # This is a BPE, word aware tokenizer, there is a correct way
# to fuse tokens # to fuse tokens
is_subword = len(word) != len(word_ref) is_subword = len(word) != len(word_ref)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment