Unverified commit 0198399d authored by Thomas Wolf, committed by GitHub

Merge pull request #570 from MottoX/fix-1

Create optimizer only when args.do_train is True
parents 50fa92c0 74dbba64
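
The change is the same in every touched example script: the optimizer setup block moves inside an `if args.do_train:` guard, so evaluation-only runs never build an optimizer they would not use. Below is a minimal, self-contained sketch of that pattern; the toy `torch.nn.Linear` model and plain `torch.optim.Adam` are stand-ins for illustration only, not the scripts' actual BertAdam/FusedAdam setup.

import argparse

import torch


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--do_train", action="store_true")
    parser.add_argument("--do_eval", action="store_true")
    parser.add_argument("--learning_rate", type=float, default=5e-5)
    args = parser.parse_args()

    model = torch.nn.Linear(4, 2)  # stand-in for the BERT/GPT model in the real scripts

    # The pattern applied by this merge: only build the optimizer when training is requested.
    optimizer = None
    if args.do_train:
        optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)
        # ... training loop would go here and use `optimizer` ...

    if args.do_eval:
        model.eval()
        # ... evaluation runs without ever touching `optimizer` ...


if __name__ == "__main__":
    main()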
@@ -534,36 +534,37 @@ def main():
         model = torch.nn.DataParallel(model)
 
     # Prepare optimizer
-    param_optimizer = list(model.named_parameters())
-    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
-    optimizer_grouped_parameters = [
-        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
-        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
-        ]
-    if args.fp16:
-        try:
-            from apex.optimizers import FP16_Optimizer
-            from apex.optimizers import FusedAdam
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
-
-        optimizer = FusedAdam(optimizer_grouped_parameters,
-                              lr=args.learning_rate,
-                              bias_correction=False,
-                              max_grad_norm=1.0)
-        if args.loss_scale == 0:
-            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
-        else:
-            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
-        warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
-                                             t_total=num_train_optimization_steps)
-    else:
-        optimizer = BertAdam(optimizer_grouped_parameters,
-                             lr=args.learning_rate,
-                             warmup=args.warmup_proportion,
-                             t_total=num_train_optimization_steps)
+    if args.do_train:
+        param_optimizer = list(model.named_parameters())
+        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
+        optimizer_grouped_parameters = [
+            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
+            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+            ]
+        if args.fp16:
+            try:
+                from apex.optimizers import FP16_Optimizer
+                from apex.optimizers import FusedAdam
+            except ImportError:
+                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
+
+            optimizer = FusedAdam(optimizer_grouped_parameters,
+                                  lr=args.learning_rate,
+                                  bias_correction=False,
+                                  max_grad_norm=1.0)
+            if args.loss_scale == 0:
+                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
+            else:
+                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
+            warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
+                                                 t_total=num_train_optimization_steps)
+        else:
+            optimizer = BertAdam(optimizer_grouped_parameters,
+                                 lr=args.learning_rate,
+                                 warmup=args.warmup_proportion,
+                                 t_total=num_train_optimization_steps)
 
     global_step = 0
     if args.do_train:
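
For readers skimming the hunks above and below: the `no_decay` grouping keeps L2 weight decay on ordinary weights but turns it off for biases and LayerNorm parameters. Here is a small self-contained sketch of how the name filter behaves; the `TinyBlock` module is hypothetical, named so its parameters mimic BERT's `LayerNorm.*` naming, and plain `torch.optim.Adam` stands in for `BertAdam`/`FusedAdam`.

import torch


class TinyBlock(torch.nn.Module):
    """Toy module whose parameter names mimic BERT's ('dense.*', 'LayerNorm.*')."""
    def __init__(self):
        super().__init__()
        self.dense = torch.nn.Linear(8, 8)
        self.LayerNorm = torch.nn.LayerNorm(8)


model = TinyBlock()
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
# 'dense.weight' lands in the decayed group (0.01); 'dense.bias', 'LayerNorm.weight',
# and 'LayerNorm.bias' land in the no-decay group (0.0).
optimizer = torch.optim.Adam(optimizer_grouped_parameters, lr=5e-5)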
@@ -763,35 +763,36 @@ def main():
         model = torch.nn.DataParallel(model)
 
     # Prepare optimizer
-    param_optimizer = list(model.named_parameters())
-    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
-    optimizer_grouped_parameters = [
-        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
-        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
-        ]
-    if args.fp16:
-        try:
-            from apex.optimizers import FP16_Optimizer
-            from apex.optimizers import FusedAdam
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
-
-        optimizer = FusedAdam(optimizer_grouped_parameters,
-                              lr=args.learning_rate,
-                              bias_correction=False,
-                              max_grad_norm=1.0)
-        if args.loss_scale == 0:
-            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
-        else:
-            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
-        warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
-                                             t_total=num_train_optimization_steps)
-    else:
-        optimizer = BertAdam(optimizer_grouped_parameters,
-                             lr=args.learning_rate,
-                             warmup=args.warmup_proportion,
-                             t_total=num_train_optimization_steps)
+    if args.do_train:
+        param_optimizer = list(model.named_parameters())
+        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
+        optimizer_grouped_parameters = [
+            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
+            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+            ]
+        if args.fp16:
+            try:
+                from apex.optimizers import FP16_Optimizer
+                from apex.optimizers import FusedAdam
+            except ImportError:
+                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
+
+            optimizer = FusedAdam(optimizer_grouped_parameters,
+                                  lr=args.learning_rate,
+                                  bias_correction=False,
+                                  max_grad_norm=1.0)
+            if args.loss_scale == 0:
+                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
+            else:
+                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
+            warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
+                                                 t_total=num_train_optimization_steps)
+        else:
+            optimizer = BertAdam(optimizer_grouped_parameters,
+                                 lr=args.learning_rate,
+                                 warmup=args.warmup_proportion,
+                                 t_total=num_train_optimization_steps)
 
     global_step = 0
     nb_tr_steps = 0
@@ -183,19 +183,20 @@ def main():
     eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
 
     # Prepare optimizer
-    param_optimizer = list(model.named_parameters())
-    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
-    optimizer_grouped_parameters = [
-        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
-        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
-        ]
-    num_train_optimization_steps = len(train_data) * args.num_train_epochs // args.train_batch_size
-    optimizer = OpenAIAdam(optimizer_grouped_parameters,
-                           lr=args.learning_rate,
-                           warmup=args.warmup_proportion,
-                           max_grad_norm=args.max_grad_norm,
-                           weight_decay=args.weight_decay,
-                           t_total=num_train_optimization_steps)
+    if args.do_train:
+        param_optimizer = list(model.named_parameters())
+        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
+        optimizer_grouped_parameters = [
+            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
+            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+            ]
+        num_train_optimization_steps = len(train_data) * args.num_train_epochs // args.train_batch_size
+        optimizer = OpenAIAdam(optimizer_grouped_parameters,
+                               lr=args.learning_rate,
+                               warmup=args.warmup_proportion,
+                               max_grad_norm=args.max_grad_norm,
+                               weight_decay=args.weight_decay,
+                               t_total=num_train_optimization_steps)
 
     if args.do_train:
         nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
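
In this hunk, `num_train_optimization_steps` also moves inside the guard, since it is only consumed as `t_total` for the optimizer's warmup schedule. A quick worked example of the formula, using illustrative numbers rather than the script's defaults:

# Illustrative numbers only, not defaults from the script.
len_train_data = 10000        # number of training examples
num_train_epochs = 3
train_batch_size = 8

# Same formula as in the diff: total optimizer steps over all epochs.
num_train_optimization_steps = len_train_data * num_train_epochs // train_batch_size
print(num_train_optimization_steps)  # 3750, passed to OpenAIAdam as t_total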
@@ -922,40 +922,41 @@ def main():
         model = torch.nn.DataParallel(model)
 
     # Prepare optimizer
-    param_optimizer = list(model.named_parameters())
-
-    # hack to remove pooler, which is not used
-    # thus it produce None grad that break apex
-    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
-
-    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
-    optimizer_grouped_parameters = [
-        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
-        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
-        ]
-    if args.fp16:
-        try:
-            from apex.optimizers import FP16_Optimizer
-            from apex.optimizers import FusedAdam
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
-
-        optimizer = FusedAdam(optimizer_grouped_parameters,
-                              lr=args.learning_rate,
-                              bias_correction=False,
-                              max_grad_norm=1.0)
-        if args.loss_scale == 0:
-            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
-        else:
-            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
-        warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
-                                             t_total=num_train_optimization_steps)
-    else:
-        optimizer = BertAdam(optimizer_grouped_parameters,
-                             lr=args.learning_rate,
-                             warmup=args.warmup_proportion,
-                             t_total=num_train_optimization_steps)
+    if args.do_train:
+        param_optimizer = list(model.named_parameters())
+
+        # hack to remove pooler, which is not used
+        # thus it produce None grad that break apex
+        param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
+
+        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
+        optimizer_grouped_parameters = [
+            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
+            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+            ]
+        if args.fp16:
+            try:
+                from apex.optimizers import FP16_Optimizer
+                from apex.optimizers import FusedAdam
+            except ImportError:
+                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
+
+            optimizer = FusedAdam(optimizer_grouped_parameters,
+                                  lr=args.learning_rate,
+                                  bias_correction=False,
+                                  max_grad_norm=1.0)
+            if args.loss_scale == 0:
+                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
+            else:
+                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
+            warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
+                                                 t_total=num_train_optimization_steps)
+        else:
+            optimizer = BertAdam(optimizer_grouped_parameters,
+                                 lr=args.learning_rate,
+                                 warmup=args.warmup_proportion,
+                                 t_total=num_train_optimization_steps)
 
     global_step = 0
     if args.do_train:
@@ -385,39 +385,40 @@ def main():
         model = torch.nn.DataParallel(model)
 
     # Prepare optimizer
-    param_optimizer = list(model.named_parameters())
-
-    # hack to remove pooler, which is not used
-    # thus it produce None grad that break apex
-    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
-
-    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
-    optimizer_grouped_parameters = [
-        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
-        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
-        ]
-    if args.fp16:
-        try:
-            from apex.optimizers import FP16_Optimizer
-            from apex.optimizers import FusedAdam
-        except ImportError:
-            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
-
-        optimizer = FusedAdam(optimizer_grouped_parameters,
-                              lr=args.learning_rate,
-                              bias_correction=False,
-                              max_grad_norm=1.0)
-        if args.loss_scale == 0:
-            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
-        else:
-            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
-        warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
-                                             t_total=num_train_optimization_steps)
-    else:
-        optimizer = BertAdam(optimizer_grouped_parameters,
-                             lr=args.learning_rate,
-                             warmup=args.warmup_proportion,
-                             t_total=num_train_optimization_steps)
+    if args.do_train:
+        param_optimizer = list(model.named_parameters())
+
+        # hack to remove pooler, which is not used
+        # thus it produce None grad that break apex
+        param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
+
+        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
+        optimizer_grouped_parameters = [
+            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
+            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+            ]
+        if args.fp16:
+            try:
+                from apex.optimizers import FP16_Optimizer
+                from apex.optimizers import FusedAdam
+            except ImportError:
+                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
+
+            optimizer = FusedAdam(optimizer_grouped_parameters,
+                                  lr=args.learning_rate,
+                                  bias_correction=False,
+                                  max_grad_norm=1.0)
+            if args.loss_scale == 0:
+                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
+            else:
+                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
+            warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
+                                                 t_total=num_train_optimization_steps)
+        else:
+            optimizer = BertAdam(optimizer_grouped_parameters,
+                                 lr=args.learning_rate,
+                                 warmup=args.warmup_proportion,
+                                 t_total=num_train_optimization_steps)
 
     global_step = 0
     if args.do_train: