Unverified Commit e3f39a29 authored by Jabin Huang, committed by GitHub
Browse files

fix ids_to_tokens naming error in tokenizer of deberta v2 (#12412)


Co-authored-by: Jipeng Huang <jihuan@microsoft.com>
parent 81332868
......@@ -288,7 +288,7 @@ class SPMTokenizer:
# <s> 1+1
# </s> 2+1
self.vocab = {spm.IdToPiece(i): i for i in range(bpe_vocab_size)}
self.id_to_tokens = [spm.IdToPiece(i) for i in range(bpe_vocab_size)]
self.ids_to_tokens = [spm.IdToPiece(i) for i in range(bpe_vocab_size)]
# self.vocab['[PAD]'] = 0
# self.vocab['[CLS]'] = 1
# self.vocab['[SEP]'] = 2
......@@ -351,7 +351,7 @@ class SPMTokenizer:
self.special_tokens.append(token)
if token not in self.vocab:
self.vocab[token] = len(self.vocab) - 1
self.id_to_tokens.append(token)
self.ids_to_tokens.append(token)
return self.id(token)
def part_of_whole_word(self, token, is_bos=False):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment