Commit d94c6b01 authored by thomwolf

fix training schedules in examples to match new API

parent c36cca07
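
For reference, a minimal sketch of the usage pattern the example scripts move to in the hunks below: the bare warmup_linear helper is replaced by a WarmupLinearSchedule object that is built once before the training loop and queried in the fp16 branch. The call arguments mirror the ones in the diff; the numeric values are illustrative placeholders, not values taken from the commit.

# Minimal sketch, assuming the pytorch_pretrained_bert class-based schedule API
# imported in the hunks below. All numeric values are placeholders, not part of
# this commit.
from pytorch_pretrained_bert.optimization import WarmupLinearSchedule

learning_rate = 3e-5                   # placeholder base learning rate
warmup_proportion = 0.1                # placeholder warmup fraction
num_train_optimization_steps = 1000    # placeholder total number of optimizer updates

# New API: construct the schedule once, before the training loop ...
warmup_linear = WarmupLinearSchedule(warmup=warmup_proportion,
                                     t_total=num_train_optimization_steps)

# ... then, in the fp16 branch of the loop, scale the base learning rate by the
# schedule value (the arguments mirror the call used in the hunks below); in the
# example scripts the result is written into optimizer.param_groups before
# optimizer.step().
global_step = 100                      # placeholder current optimizer step
lr_this_step = learning_rate * warmup_linear.get_lr(
    global_step / num_train_optimization_steps, warmup_proportion)
print(lr_this_step)
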
@@ -14,7 +14,7 @@ from tqdm import tqdm
 from pytorch_pretrained_bert.modeling import BertForPreTraining
 from pytorch_pretrained_bert.tokenization import BertTokenizer
-from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear
+from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule

 InputFeatures = namedtuple("InputFeatures", "input_ids input_mask segment_ids lm_label_ids is_next")

@@ -268,7 +268,8 @@ def main():
             optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
         else:
             optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
+        warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
+                                             t_total=num_train_optimization_steps)
     else:
         optimizer = BertAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,

@@ -314,8 +315,8 @@ def main():
             if args.fp16:
                 # modify learning rate with special warm up BERT uses
                 # if args.fp16 is False, BertAdam is used that handles this automatically
-                lr_this_step = args.learning_rate * warmup_linear(global_step/num_train_optimization_steps,
-                                                                  args.warmup_proportion)
+                lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step/num_train_optimization_steps,
+                                                                         args.warmup_proportion)
                 for param_group in optimizer.param_groups:
                     param_group['lr'] = lr_this_step
             optimizer.step()

@@ -31,7 +31,7 @@ from tqdm import tqdm, trange
 from pytorch_pretrained_bert.modeling import BertForPreTraining
 from pytorch_pretrained_bert.tokenization import BertTokenizer
-from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear
+from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule

 logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                     datefmt='%m/%d/%Y %H:%M:%S',

@@ -556,6 +556,8 @@ def main():
             optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
         else:
             optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
+        warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
+                                             t_total=num_train_optimization_steps)
     else:
         optimizer = BertAdam(optimizer_grouped_parameters,

@@ -601,7 +603,8 @@ def main():
             if args.fp16:
                 # modify learning rate with special warm up BERT uses
                 # if args.fp16 is False, BertAdam is used that handles this automatically
-                lr_this_step = args.learning_rate * warmup_linear(global_step/num_train_optimization_steps, args.warmup_proportion)
+                lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step/num_train_optimization_steps,
+                                                                         args.warmup_proportion)
                 for param_group in optimizer.param_groups:
                     param_group['lr'] = lr_this_step
             optimizer.step()

@@ -38,7 +38,7 @@ from sklearn.metrics import matthews_corrcoef, f1_score
 from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE, WEIGHTS_NAME, CONFIG_NAME
 from pytorch_pretrained_bert.modeling import BertForSequenceClassification, BertConfig
 from pytorch_pretrained_bert.tokenization import BertTokenizer
-from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear
+from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule

 logger = logging.getLogger(__name__)

@@ -784,6 +784,8 @@ def main():
             optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
         else:
             optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
+        warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
+                                             t_total=num_train_optimization_steps)
     else:
         optimizer = BertAdam(optimizer_grouped_parameters,

@@ -852,7 +854,8 @@ def main():
             if args.fp16:
                 # modify learning rate with special warm up BERT uses
                 # if args.fp16 is False, BertAdam is used that handles this automatically
-                lr_this_step = args.learning_rate * warmup_linear(global_step/num_train_optimization_steps, args.warmup_proportion)
+                lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step/num_train_optimization_steps,
+                                                                         args.warmup_proportion)
                 for param_group in optimizer.param_groups:
                     param_group['lr'] = lr_this_step
             optimizer.step()

@@ -36,7 +36,7 @@ from tqdm import tqdm, trange
 from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE, WEIGHTS_NAME, CONFIG_NAME
 from pytorch_pretrained_bert.modeling import BertForQuestionAnswering, BertConfig
-from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear
+from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule
 from pytorch_pretrained_bert.tokenization import (BasicTokenizer,
                                                   BertTokenizer,
                                                   whitespace_tokenize)

@@ -949,6 +949,8 @@ def main():
             optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
         else:
             optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
+        warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
+                                             t_total=num_train_optimization_steps)
     else:
         optimizer = BertAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,

@@ -1013,7 +1015,8 @@ def main():
             if args.fp16:
                 # modify learning rate with special warm up BERT uses
                 # if args.fp16 is False, BertAdam is used and handles this automatically
-                lr_this_step = args.learning_rate * warmup_linear(global_step/num_train_optimization_steps, args.warmup_proportion)
+                lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step/num_train_optimization_steps,
+                                                                         args.warmup_proportion)
                 for param_group in optimizer.param_groups:
                     param_group['lr'] = lr_this_step
             optimizer.step()

@@ -34,7 +34,7 @@ from tqdm import tqdm, trange
 from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE, WEIGHTS_NAME, CONFIG_NAME
 from pytorch_pretrained_bert.modeling import BertForMultipleChoice, BertConfig
-from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear
+from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule
 from pytorch_pretrained_bert.tokenization import BertTokenizer

 logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',

@@ -411,6 +411,8 @@ def main():
             optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
         else:
             optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
+        warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
+                                             t_total=num_train_optimization_steps)
     else:
         optimizer = BertAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,

@@ -464,7 +466,8 @@ def main():
             if args.fp16:
                 # modify learning rate with special warm up BERT uses
                 # if args.fp16 is False, BertAdam is used that handles this automatically
-                lr_this_step = args.learning_rate * warmup_linear(global_step/num_train_optimization_steps, args.warmup_proportion)
+                lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step/num_train_optimization_steps,
+                                                                         args.warmup_proportion)
                 for param_group in optimizer.param_groups:
                     param_group['lr'] = lr_this_step
             optimizer.step()