Commit 4a49c225 authored by Catalin Voss

Warn instead of raising in BERT and GPT-2 tokenizers as well, to allow for pre-caching of tokens

parent e99bc87e
@@ -101,7 +101,7 @@ class BertTokenizer(object):
         for token in tokens:
             ids.append(self.vocab[token])
         if len(ids) > self.max_len:
-            raise ValueError(
+            logger.warning(
                 "Token indices sequence length is longer than the specified maximum "
                 " sequence length for this BERT model ({} > {}). Running this"
                 " sequence through BERT will result in indexing errors".format(len(ids), self.max_len)
...
@@ -193,7 +193,7 @@ class GPT2Tokenizer(object):
             token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
             bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
         if len(bpe_tokens) > self.max_len:
-            raise ValueError(
+            logger.warning(
                 "Token indices sequence length is longer than the specified maximum "
                 " sequence length for this OpenAI GPT-2 model ({} > {}). Running this"
                 " sequence through the model will result in indexing errors".format(len(bpe_tokens), self.max_len)
...
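Because over-length inputs now only trigger a warning instead of a ValueError, token ids for long documents can be produced and cached ahead of time and truncated later when building model inputs. Below is a minimal sketch of that workflow, assuming the pytorch_pretrained_bert package and the bert-base-uncased vocabulary are available locally; the example text and the 512-token truncation are illustrative, not part of this commit.

import logging

from pytorch_pretrained_bert import BertTokenizer

# Surface the tokenizer's logger.warning output on the console.
logging.basicConfig(level=logging.WARNING)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')  # max_len defaults to 512

# A document longer than the model's maximum sequence length.
long_text = "pre-caching tokens for a very long document " * 200
tokens = tokenizer.tokenize(long_text)

# Before this commit, convert_tokens_to_ids raised ValueError for sequences
# over max_len; now it only logs a warning, so the full id sequence can be
# cached in one pass over the corpus.
ids = tokenizer.convert_tokens_to_ids(tokens)

# Truncate to the model window only when constructing actual model inputs.
model_input_ids = ids[:512]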