"docs/source/vscode:/vscode.git/clone" did not exist on "afc45b13ca40f56268e5f135aab2487377fc536b"
Unverified Commit e3f39a29 authored by Jabin Huang, committed by GitHub

fix ids_to_tokens naming error in tokenizer of deberta v2 (#12412)


Co-authored-by: Jipeng Huang <jihuan@microsoft.com>
parent 81332868
@@ -288,7 +288,7 @@ class SPMTokenizer:
         # <s> 1+1
         # </s> 2+1
         self.vocab = {spm.IdToPiece(i): i for i in range(bpe_vocab_size)}
-        self.id_to_tokens = [spm.IdToPiece(i) for i in range(bpe_vocab_size)]
+        self.ids_to_tokens = [spm.IdToPiece(i) for i in range(bpe_vocab_size)]
         # self.vocab['[PAD]'] = 0
         # self.vocab['[CLS]'] = 1
         # self.vocab['[SEP]'] = 2
@@ -351,7 +351,7 @@ class SPMTokenizer:
             self.special_tokens.append(token)
             if token not in self.vocab:
                 self.vocab[token] = len(self.vocab) - 1
-                self.id_to_tokens.append(token)
+                self.ids_to_tokens.append(token)
         return self.id(token)

     def part_of_whole_word(self, token, is_bos=False):
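For reference, here is a minimal sketch of the two lookup structures the tokenizer builds from its SentencePiece model, and of the round trip the renamed list supports. The model file name is hypothetical and the assertion is illustrative only, not part of the patch; the spm calls (Load, GetPieceSize, IdToPiece) are the standard sentencepiece API that the diff above also uses.

import sentencepiece as sp

spm = sp.SentencePieceProcessor()
spm.Load("spiece.model")  # hypothetical SentencePiece model file

bpe_vocab_size = spm.GetPieceSize()

# token string -> id, as built in __init__ in the first hunk above
vocab = {spm.IdToPiece(i): i for i in range(bpe_vocab_size)}

# id -> token string; this is the list the commit renames from
# id_to_tokens to ids_to_tokens
ids_to_tokens = [spm.IdToPiece(i) for i in range(bpe_vocab_size)]

# Round trip: every id maps back to the piece that maps to that id.
assert all(vocab[ids_to_tokens[i]] == i for i in range(bpe_vocab_size))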