        token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))  # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
        # token = ''.join(self.byte_encoder[ord(b)] for b in token)  # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
        # else:
        # token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))  # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
        # bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' '))
        return split_tokens
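
# A minimal sketch (not part of this diff) of the GPT-2-style module-level helper the
# byte_encoder in the commented-out lines above is assumed to come from: every byte
# value gets a printable unicode character, so BPE never has to treat raw spaces or
# control bytes specially.
def bytes_to_unicode():
    """Map each byte value (0-255) to a unique printable unicode character."""
    bs = (list(range(ord('!'), ord('~') + 1))
          + list(range(ord('¡'), ord('¬') + 1))
          + list(range(ord('®'), ord('ÿ') + 1)))
    cs = bs[:]
    n = 0
    for b in range(2 ** 8):
        if b not in bs:            # bytes with no printable form of their own...
            bs.append(b)
            cs.append(2 ** 8 + n)  # ...are shifted into a fresh printable range
            n += 1
    return dict(zip(bs, (chr(c) for c in cs)))

# For example, bytes_to_unicode()[ord(' ')] == 'Ġ': spaces survive BPE as a visible
# character instead of colliding with the space-delimited merge format.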
    def _convert_token_to_id(self, token):
        """ Converts a token (str/unicode) into an id using the vocab. """
        ...
        ...
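    # Hedged sketch (an assumption, not the elided body above): the usual HF lookup
    # pattern matching this docstring resolves the token through self.encoder and
    # falls back to the unk token's id for out-of-vocabulary strings:
    #
    #     return self.encoder.get(token, self.encoder.get(self.unk_token))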
@@ -203,13 +201,12 @@ class CTRLTokenizer(PreTrainedTokenizer):
    def _convert_id_to_token(self, index):
        """Converts an index (integer) into a token (string/unicode) using the vocab."""
-        return self.decoder.get(index)
+        return self.decoder.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """ Converts a sequence of tokens (string) into a single string. """