update docstring of BERT tokenizer to reflect do_wordpiece_only

4d1ad832 · John Hewitt · e14c6b52 · 4d1ad832
Commit 4d1ad832 authored Feb 27, 2019 by John Hewitt
Show whitespace changes
Inline Side-by-side

Showing with 10 additions and 2 deletions

pytorch_pretrained_bert/tokenization.py pytorch_pretrained_bert/tokenization.py +10 -2

No files found.
--- a/pytorch_pretrained_bert/tokenization.py
+++ b/pytorch_pretrained_bert/tokenization.py
@@ -79,8 +79,16 @@ class BertTokenizer(object):
        """Constructs a BertTokenizer.

        Args:
-          do_lower_case: Whether to lower case the input.
-          do_wordpiece_only: Whether to do basic tokenization before wordpiece.
+          vocab_file: Path to a one-wordpiece-per-line vocabulary file
+          do_lower_case: Whether to lower case the input.
+                         Only has an effect when do_basic_tokenize=True.
+          do_basic_tokenize: Whether to do basic tokenization before wordpiece.
+          max_len: An artificial maximum length to truncate tokenized sequences to.
+                         Effective maximum length is always the minimum of this
+                         value (if specified) and the underlying BERT model's
+                         sequence length.
+          never_split: List of tokens which will never be split during tokenization.
+                         Only has an effect when do_basic_tokenize=True.
        """
        if not os.path.isfile(vocab_file):
            raise ValueError(