Commit 5d2e13a6 authored by Vijay Korthikanti

Reverting pre-allocation of optimizer states; it does not seem to help with fragmentation

parent cfd2e216
@@ -91,18 +91,6 @@ def get_megatron_optimizer(model,
                          weight_decay=args.weight_decay,
                          betas=(args.adam_beta1, args.adam_beta2),
                          eps=args.adam_eps)
-        # preallocating state tensors to avoid fragmentation
-        for param_group in optimizer.param_groups:
-            for i, param in enumerate(param_group['params']):
-                if param.requires_grad:
-                    state = optimizer.state[param]
-                    if len(state) == 0:
-                        # Exponential moving average of gradient values
-                        state['exp_avg'] = torch.zeros_like(param.data, dtype=torch.float)
-                        # Exponential moving average of squared gradient values
-                        state['exp_avg_sq'] = torch.zeros_like(param.data, dtype=torch.float)
     elif args.optimizer == 'sgd':
         optimizer = SGD(param_groups,
                         lr=args.lr,
...
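For reference, the block removed above eagerly created Adam's exp_avg and exp_avg_sq buffers immediately after the optimizer was constructed, so that the large state allocations happen up front rather than inside the first step(). Below is a minimal, self-contained sketch of that pattern; the toy Linear model and the use of torch.optim.Adam (rather than the Adam variant Megatron actually constructs) are stand-ins for illustration only, not part of the original code.

import torch

# Stand-in model; Megatron builds its param_groups from the real model instead.
model = torch.nn.Linear(1024, 1024)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3,
                             betas=(0.9, 0.999), eps=1e-8)

# Eagerly allocate the optimizer state tensors (the pattern being reverted).
# Note: this only illustrates the allocation step; plain torch.optim.Adam also
# keeps additional per-parameter state (e.g. 'step'), so the sketch does not
# go on to call optimizer.step() with this pre-populated state.
for param_group in optimizer.param_groups:
    for param in param_group['params']:
        if param.requires_grad:
            state = optimizer.state[param]
            if len(state) == 0:
                # Exponential moving average of gradient values
                state['exp_avg'] = torch.zeros_like(param.data, dtype=torch.float)
                # Exponential moving average of squared gradient values
                state['exp_avg_sq'] = torch.zeros_like(param.data, dtype=torch.float)

Per the commit message, this up-front allocation did not measurably reduce allocator fragmentation in practice, which is why the block is being removed.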