Unverified Commit d447c460 authored by Patrick Deutschmann, committed by GitHub
Browse files

Speed up TF token classification postprocessing by converting complete tensors to numpy (#19976)



* Speed up TF postprocessing by converting to numpy before

* Fix bug that was triggered when offset_mapping was None
Co-authored-by: Patrick Deutschmann <patrick.deutschmann@dedalus.com>
parent 06886d5a
......@@ -238,6 +238,10 @@ class TokenClassificationPipeline(Pipeline):
shifted_exp = np.exp(logits - maxes)
scores = shifted_exp / shifted_exp.sum(axis=-1, keepdims=True)
if self.framework == "tf":
input_ids = input_ids.numpy()
offset_mapping = offset_mapping.numpy() if offset_mapping is not None else None
pre_entities = self.gather_pre_entities(
sentence, input_ids, scores, offset_mapping, special_tokens_mask, aggregation_strategy
)
......@@ -276,9 +280,6 @@ class TokenClassificationPipeline(Pipeline):
if self.framework == "pt":
start_ind = start_ind.item()
end_ind = end_ind.item()
else:
start_ind = int(start_ind.numpy())
end_ind = int(end_ind.numpy())
word_ref = sentence[start_ind:end_ind]
if getattr(self.tokenizer._tokenizer.model, "continuing_subword_prefix", None):
# This is a BPE, word aware tokenizer, there is a correct way
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment