"ml/backend/git@developer.sourcefind.cn:OpenDAS/ollama.git" did not exist on "ed443a03930a10bec6182c55091f0880baa1e620"
Commit 5f04aa00 authored by thomwolf

option to perform optimization and keep the optimizer averages on CPU

parent 9e95cd8c
@@ -719,7 +719,6 @@ def main():
     parser.add_argument("--max_answer_length", default=30, type=int,
                         help="The maximum length of an answer that can be generated. This is needed because the start "
                              "and end predictions are not conditioned on one another.")
     parser.add_argument("--verbose_logging", default=False, action='store_true',
                         help="If true, all of the warnings related to data processing will be printed. "
                              "A number of warnings are expected for a normal SQuAD evaluation.")
@@ -727,10 +726,6 @@ def main():
                         default=False,
                         action='store_true',
                         help="Whether not to use CUDA when available")
-    parser.add_argument("--local_rank",
-                        type=int,
-                        default=-1,
-                        help="local_rank for distributed training on gpus")
     parser.add_argument('--seed',
                         type=int,
                         default=42,
@@ -738,7 +733,16 @@ def main():
     parser.add_argument('--gradient_accumulation_steps',
                         type=int,
                         default=1,
-                        help="Number of updates steps to accumualte before performing a backward/update pass.")
+                        help="Number of updates steps to accumulate before performing a backward/update pass.")
+    parser.add_argument("--local_rank",
+                        type=int,
+                        default=-1,
+                        help="local_rank for distributed training on gpus")
+    parser.add_argument('--optimize_on_cpu',
+                        default=False,
+                        action='store_true',
+                        help="Whether to perform optimization and keep the optimizer averages on CPU")
     args = parser.parse_args()
@@ -802,25 +806,26 @@ def main():
     model = BertForQuestionAnswering(bert_config)
     if args.init_checkpoint is not None:
         model.bert.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'))
-    model.to(device)
-    if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
-                                                           output_device=args.local_rank)
-    elif n_gpu > 1:
-        model = torch.nn.DataParallel(model)
+    if not args.optimize_on_cpu:
+        model.to(device)
     no_decay = ['bias', 'gamma', 'beta']
     optimizer_parameters = [
         {'params': [p for n, p in model.named_parameters() if n not in no_decay], 'weight_decay_rate': 0.01},
         {'params': [p for n, p in model.named_parameters() if n in no_decay], 'weight_decay_rate': 0.0}
         ]
     optimizer = BERTAdam(optimizer_parameters,
                          lr=args.learning_rate,
                          warmup=args.warmup_proportion,
                          t_total=num_train_steps)
+    model.to(device)
+    if args.local_rank != -1:
+        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
+                                                           output_device=args.local_rank)
+    elif n_gpu > 1:
+        model = torch.nn.DataParallel(model)
     global_step = 0
     if args.do_train:
         train_features = convert_examples_to_features(
@@ -862,8 +867,12 @@ def main():
                 loss = loss / args.gradient_accumulation_steps
                 loss.backward()
                 if (step + 1) % args.gradient_accumulation_steps == 0:
+                    if args.optimize_on_cpu:
+                        model.to('cpu')
                     optimizer.step()    # We have accumulated enought gradients
                     model.zero_grad()
+                    if args.optimize_on_cpu:
+                        model.to(device)
                     global_step += 1
     if args.do_predict:
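
To make the pattern easier to follow outside the full training script, here is a minimal, self-contained sketch of the same idea. It is an illustration only, assuming a toy nn.Linear model, torch.optim.Adam standing in for the repository's BERTAdam, and invented names such as toy_batches; optimize_on_cpu is a plain variable here rather than the parsed --optimize_on_cpu flag.

import torch
import torch.nn as nn

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
optimize_on_cpu = True            # stands in for the new --optimize_on_cpu flag
gradient_accumulation_steps = 2

model = nn.Linear(10, 2)          # stand-in for BertForQuestionAnswering

# As in the diff: only move the parameters to the GPU before building the
# optimizer when we are NOT optimizing on CPU.
if not optimize_on_cpu:
    model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=3e-5)
model.to(device)                  # forward/backward always run on the device

loss_fn = nn.CrossEntropyLoss()
toy_batches = [(torch.randn(4, 10), torch.randint(0, 2, (4,))) for _ in range(8)]

for step, (x, y) in enumerate(toy_batches):
    x, y = x.to(device), y.to(device)
    loss = loss_fn(model(x), y) / gradient_accumulation_steps
    loss.backward()
    if (step + 1) % gradient_accumulation_steps == 0:
        if optimize_on_cpu:
            model.to('cpu')       # parameters and their .grad move to CPU together
        optimizer.step()          # Adam state is created and updated on the CPU tensors
        model.zero_grad()
        if optimize_on_cpu:
            model.to(device)      # updated weights go back for the next forward pass

The memory saving comes from where Adam's running averages live: they are allocated lazily on whatever device the parameters occupy at the first step(), so stepping while the model sits on CPU keeps those two extra weight-sized buffers in host RAM, at the price of copying the model across the bus twice per update.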