"git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "2d506ea4c4980a4cab43c2940d9836ddfd629524"
Unverified commit 6793f0cf, authored by Stephan Tulkens and committed by GitHub
Browse files

Fix bug in slow tokenizer conversion, make it a lot faster (#24266)



* Make conversion faster, fix None vs 0 bug

* Add second sort for consistency

* Update src/transformers/convert_slow_tokenizer.py
Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>

---------
Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
parent 1609a436
@@ -54,12 +54,15 @@ class SentencePieceExtractor:
         # Merges
         merges = []
-        for piece_l in vocab.keys():
-            for piece_r in vocab.keys():
-                merge = f"{piece_l}{piece_r}"
-                piece_score = vocab_scores.get(merge, None)
-                if piece_score:
-                    merges += [(piece_l, piece_r, piece_score)]
+        for merge, piece_score in vocab_scores.items():
+            local = []
+            for index in range(1, len(merge)):
+                piece_l, piece_r = merge[:index], merge[index:]
+                if piece_l in vocab and piece_r in vocab:
+                    local.append((piece_l, piece_r, piece_score))
+            local = sorted(local, key=lambda x: (vocab[x[0]], vocab[x[1]]))
+            merges.extend(local)
         merges = sorted(merges, key=lambda val: val[2], reverse=reverse)
         merges = [(val[0], val[1]) for val in merges]
         return vocab, merges
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment