@@ -178,7 +197,7 @@ class GPT2Tokenizer(PreTrainedTokenizer):
break
else:
pairs=get_pairs(word)
word=' '.join(word)
word=" ".join(word)
self.cache[token]=word
returnword
...
...
@@ -189,15 +208,19 @@ class GPT2Tokenizer(PreTrainedTokenizer):
Begin the sentence with at least one space to get invariance to word order in GPT-2 (and RoBERTa) tokenizers.
"""
ifadd_prefix_space:
text=' '+text
text=" "+text
bpe_tokens=[]
fortokeninre.findall(self.pat,text):
ifsys.version_info[0]==2:
token=''.join(self.byte_encoder[ord(b)]forbintoken)# Maps all our bytes to unicode strings, avoiding controle tokens of the BPE (spaces in our case)
token="".join(
self.byte_encoder[ord(b)]forbintoken
)# Maps all our bytes to unicode strings, avoiding controle tokens of the BPE (spaces in our case)
else:
token=''.join(self.byte_encoder[b]forbintoken.encode('utf-8'))# Maps all our bytes to unicode strings, avoiding controle tokens of the BPE (spaces in our case)
@@ -69,14 +71,22 @@ class PreTrainedTokenizer(object):
- ``additional_special_tokens``: (`Optional`) list: a list of additional special tokens. Adding all special tokens here ensure they won't be split by the tokenization process. Will be associated to ``self.additional_special_tokens`` and ``self.additional_special_tokens_ids``
logger.error("Make sure you install KyTea (https://github.com/neubig/kytea) and it's python wrapper (https://github.com/chezou/Mykytea-python) with the following steps")
logger.error(
"Make sure you install KyTea (https://github.com/neubig/kytea) and it's python wrapper (https://github.com/chezou/Mykytea-python) with the following steps"
)
logger.error("1. git clone git@github.com:neubig/kytea.git && cd kytea")