@@ -76,6 +78,8 @@ class PreTrainedTokenizer(object):
"pad_token","cls_token","mask_token",
"additional_special_tokens"]
padding_side="right"
@property
defbos_token(self):
""" Beginning of sentence token (string). Log an error if used while not having been set. """
...
...
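As a quick illustration of how these string-valued special-token properties behave once a tokenizer is loaded (a minimal sketch, assuming the transformers package and the bert-base-uncased checkpoint; this is not part of the diff itself):

from transformers import BertTokenizer

# Load a pretrained tokenizer; the special-token attributes listed above
# (pad_token, cls_token, mask_token, ...) are filled in from its saved config.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

print(tokenizer.cls_token)   # "[CLS]"
print(tokenizer.pad_token)   # "[PAD]"

# BERT defines no beginning-of-sentence token, so bos_token logs an error
# and evaluates to None instead of raising.
print(tokenizer.bos_token)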
@@ -189,6 +193,11 @@ class PreTrainedTokenizer(object):
""" Id of the padding token in the vocabulary. Log an error if used while not having been set. """
returnself.convert_tokens_to_ids(self.pad_token)
@property
defpad_token_type_id(self):
""" Id of the padding token type in the vocabulary."""
returnself._pad_token_type_id
@property
defcls_token_id(self):
""" Id of the classification token in the vocabulary. E.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model. Log an error if used while not having been set. """
...
...
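The id-valued counterparts resolve through convert_tokens_to_ids, so the exact numbers depend on the checkpoint's vocabulary (continuing the sketch above; the values shown in comments are the ones bert-base-uncased happens to use):

print(tokenizer.pad_token_id)       # 0
print(tokenizer.cls_token_id)       # 101
print(tokenizer.pad_token_type_id)  # 0 (token type id used for padding)

# Typical use: manually padding a list of token ids to a fixed length.
ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("Hello world"))
ids = ids + [tokenizer.pad_token_id] * (8 - len(ids))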
@@ -212,10 +221,14 @@ class PreTrainedTokenizer(object):
Original source: https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e
Note: for legal reasons, we are unable to host MRPC.
You can either use the version hosted by the SentEval team, which is already tokenized,
or you can download the original data from https://download.microsoft.com/download/D/4/6/D46FF87A-F6B9-4252-AA8B-3604ED519838/MSRParaphraseCorpus.msi and extract the data from it manually.
For Windows users, you can run the .msi file. For Mac and Linux users, consider an external library such as 'cabextract' (see below for an example).
You should then rename and place specific files in a folder (see below for an example).
1/30/19: It looks like SentEval is no longer hosting their extracted and tokenized MRPC data, so you'll need to download the data from the original source for now.
2/11/19: It looks like SentEval actually *is* hosting the extracted data. Hooray!
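One way to script the download-and-extract step from Python (a hedged sketch, not the example the note refers to: it assumes cabextract is installed on the system, and the target file names mentioned in the comments are the plain-text train/test files the GLUE download helper is usually set up to expect):

import subprocess
import urllib.request

MSI_URL = ("https://download.microsoft.com/download/D/4/6/"
           "D46FF87A-F6B9-4252-AA8B-3604ED519838/MSRParaphraseCorpus.msi")

urllib.request.urlretrieve(MSI_URL, "MSRParaphraseCorpus.msi")

# cabextract is an external tool (install it via your package manager);
# it unpacks the .msi contents into the MRPC/ directory.
subprocess.run(["cabextract", "MSRParaphraseCorpus.msi", "-d", "MRPC"], check=True)

# The extracted files carry opaque names; rename them to
# msr_paraphrase_train.txt and msr_paraphrase_test.txt inside MRPC/
# before pointing the GLUE download script at that folder.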
  description: The maximum total input sequence length after tokenization. Sequences longer than this will be truncated, sequences shorter will be padded.
  type: integer
  default: 128
- name: per_gpu_train_batch_size
  pass-as: --per_gpu_train_batch_size={v}
  description: Batch size per GPU/CPU for training.
  type: integer
  default: 8
- name: per_gpu_eval_batch_size
  pass-as: --per_gpu_eval_batch_size={v}
  description: Batch size per GPU/CPU for evaluation.
  type: integer
  default: 8
- name: gradient_accumulation_steps
  pass-as: --gradient_accumulation_steps={v}
  description: Number of update steps to accumulate before performing a backward/update pass.
  type: integer
  default: 1
- name: learning_rate
  pass-as: --learning_rate={v}
  description: The initial learning rate for Adam.
  type: float
  default: 0.00005
- name: adam_epsilon
  pass-as: --adam_epsilon={v}
  description: Epsilon for the Adam optimizer.
  type: float
  default: 0.00000001
- name: max_grad_norm
  pass-as: --max_grad_norm={v}
  description: Max gradient norm.
  type: float
  default: 1.0
- name: num_train_epochs
  pass-as: --num_train_epochs={v}
  description: Total number of training epochs to perform.
  type: integer
  default: 3
- name: max_steps
  pass-as: --max_steps={v}
  description: If > 0, total number of training steps to perform. Overrides num_train_epochs.
  type: integer
  default: -1
- name: warmup_steps
  pass-as: --warmup_steps={v}
  description: Linear warmup over warmup_steps.
  type: integer
  default: -1
- name: logging_steps
  pass-as: --logging_steps={v}
  description: Log every X update steps.
  type: integer
  default: 25
- name: save_steps
  pass-as: --save_steps={v}
  description: Save a checkpoint every X update steps.
  type: integer
  default: -1
- name: output_dir
  pass-as: --output_dir={v}
  type: string
  default: /valohai/outputs
- name: evaluate_during_training
  description: Run evaluation during training at each logging step.
  type: flag
  default: true
- name: do_lower_case
  description: Set this flag if you are using an uncased model.