"examples/git@developer.sourcefind.cn:hehl2/torchaudio.git" did not exist on "f75497308bfe59e68ba657d61d41a0c88705d7f9"
Commit a60ae1a5 authored by LysandreJik

Docstrings best practice shown in the BERT documentation.
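The practice being applied: parameter documentation moves out of the `__init__` docstring and into a class-level raw docstring, so documentation tools render it on the class itself and backslash escapes in `:class:`-style cross-references survive. A minimal before/after sketch of the pattern (the `ExampleTokenizer*` names here are illustrative, not part of the commit):

# Before: docs buried in the constructor, invisible on the class page.
class ExampleTokenizerBefore(object):
    def __init__(self, vocab_file, do_lower_case=True):
        """Constructs a tokenizer.
        Args:
            vocab_file: Path to a one-wordpiece-per-line vocabulary file
            do_lower_case: Whether to lower case the input
        """
        self.vocab_file = vocab_file
        self.do_lower_case = do_lower_case

# After: a class-level raw docstring (r"""...""") documenting the same
# constructor arguments, as the diff below does for BertTokenizer.
class ExampleTokenizerAfter(object):
    r"""
    Constructs a tokenizer.

    Args:
        vocab_file: Path to a one-wordpiece-per-line vocabulary file
        do_lower_case: Whether to lower case the input
    """
    def __init__(self, vocab_file, do_lower_case=True):
        self.vocab_file = vocab_file
        self.do_lower_case = do_lower_case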

parent 64fd9863
@@ -182,7 +182,8 @@ SCHEDULES = {
 class BertAdam(Optimizer):
     """Implements BERT version of Adam algorithm with weight decay fix.
-    Params:
+
+    Parameters:
         lr: learning rate
         warmup: portion of t_total for the warmup, -1 means no warmup. Default: -1
         t_total: total number of training steps for the learning
...
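For context, a short usage sketch of the optimizer whose docstring is edited above. The keyword arguments follow the Parameters block shown in the hunk; the top-level import, the 5e-5 learning rate, and the linear model standing in for a BERT model are assumptions, not taken from this commit:

import torch
from pytorch_pretrained_bert import BertAdam  # assumed top-level export

model = torch.nn.Linear(768, 2)   # placeholder for a BERT-based model
num_train_steps = 1000

optimizer = BertAdam(model.parameters(),
                     lr=5e-5,                  # learning rate (see Parameters above)
                     warmup=0.1,               # first 10% of t_total used for warmup
                     t_total=num_train_steps)  # total number of training steps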
@@ -84,24 +84,22 @@ def whitespace_tokenize(text):
 class BertTokenizer(object):
-    """Runs end-to-end tokenization: punctuation splitting + wordpiece"""
+    r"""
+    Constructs a BertTokenizer.
+    :class:`~pytorch_pretrained_bert.BertTokenizer` runs end-to-end tokenization: punctuation splitting + wordpiece
+
+    Args:
+        vocab_file: Path to a one-wordpiece-per-line vocabulary file
+        do_lower_case: Whether to lower case the input. Only has an effect when do_wordpiece_only=False
+        do_basic_tokenize: Whether to do basic tokenization before wordpiece.
+        max_len: An artificial maximum length to truncate tokenized sequences to; Effective maximum length is always the
+            minimum of this value (if specified) and the underlying BERT model's sequence length.
+        never_split: List of tokens which will never be split during tokenization. Only has an effect when
+            do_wordpiece_only=False
+    """
     def __init__(self, vocab_file, do_lower_case=True, max_len=None, do_basic_tokenize=True,
                  never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")):
-        """Constructs a BertTokenizer.
-        Args:
-            vocab_file: Path to a one-wordpiece-per-line vocabulary file
-            do_lower_case: Whether to lower case the input
-                Only has an effect when do_wordpiece_only=False
-            do_basic_tokenize: Whether to do basic tokenization before wordpiece.
-            max_len: An artificial maximum length to truncate tokenized sequences to;
-                Effective maximum length is always the minimum of this
-                value (if specified) and the underlying BERT model's
-                sequence length.
-            never_split: List of tokens which will never be split during tokenization.
-                Only has an effect when do_wordpiece_only=False
-        """
         if not os.path.isfile(vocab_file):
             raise ValueError(
                 "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
...
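Similarly, a sketch of constructing the tokenizer with the arguments the new class docstring documents. The top-level import is an assumption, and the vocabulary path is a placeholder; as the context lines above show, the constructor raises a ValueError if the file does not exist:

from pytorch_pretrained_bert import BertTokenizer  # assumed top-level export

tokenizer = BertTokenizer("bert-base-uncased-vocab.txt",  # placeholder: one wordpiece per line
                          do_lower_case=True,  # only has an effect when do_wordpiece_only=False
                          max_len=512)         # capped by the model's sequence length
tokens = tokenizer.tokenize("Docstrings belong at the class level.")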