Commit 4a49c225 authored by Catalin Voss

Warn instead of raising in BERT and GPT-2 tokenizers as well, to allow for pre-caching of tokens

parent e99bc87e
@@ -101,7 +101,7 @@ class BertTokenizer(object):
         for token in tokens:
             ids.append(self.vocab[token])
         if len(ids) > self.max_len:
-            raise ValueError(
+            logger.warning(
                 "Token indices sequence length is longer than the specified maximum "
                 " sequence length for this BERT model ({} > {}). Running this"
                 " sequence through BERT will result in indexing errors".format(len(ids), self.max_len)
...
@@ -193,7 +193,7 @@ class GPT2Tokenizer(object):
             token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
             bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
         if len(bpe_tokens) > self.max_len:
-            raise ValueError(
+            logger.warning(
                 "Token indices sequence length is longer than the specified maximum "
                 " sequence length for this OpenAI GPT-2 model ({} > {}). Running this"
                 " sequence through the model will result in indexing errors".format(len(bpe_tokens), self.max_len)
...
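Because over-length inputs now only trigger a warning instead of a ValueError, token ids for long documents can be produced and cached ahead of time and truncated later when building model inputs. Below is a minimal sketch of that workflow, assuming the pytorch_pretrained_bert package and the bert-base-uncased vocabulary are available locally; the example text and the 512-token truncation are illustrative, not part of this commit.

import logging

from pytorch_pretrained_bert import BertTokenizer

# Surface the tokenizer's logger.warning output on the console.
logging.basicConfig(level=logging.WARNING)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')  # max_len defaults to 512

# A document longer than the model's maximum sequence length.
long_text = "pre-caching tokens for a very long document " * 200
tokens = tokenizer.tokenize(long_text)

# Before this commit, convert_tokens_to_ids raised ValueError for sequences
# over max_len; now it only logs a warning, so the full id sequence can be
# cached in one pass over the corpus.
ids = tokenizer.convert_tokens_to_ids(tokens)

# Truncate to the model window only when constructing actual model inputs.
model_input_ids = ids[:512]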