"docs/vscode:/vscode.git/clone" did not exist on "4941dd23f368c17cd202155fc7f443b54b95f3b9"
Commit c88bc979 authored by Lawrence McAfee

updated FP32Optimizer for latest changes.

parent 7ac342b7
@@ -156,7 +156,8 @@ def get_megatron_optimizer(model,
     # else Float32Optimizer
     # return opt_ty(optimizer, args.clip_grad,
     # <<<
-    return Float32Optimizer(optimizer, args.clip_grad,
+    return FP32Optimizer(optimizer, args.clip_grad,
                          args.log_num_zeros_in_grad,
                          params_have_main_grad,
-                         args.use_contiguous_buffers_in_local_ddp)
+                         args.use_contiguous_buffers_in_local_ddp,
+                         model)
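For reference, a minimal sketch of what the updated construction looks like once this hunk is applied. The argument order follows the diff above; the helper function, the import path, and the reading of `model` as the list of DDP-wrapped model chunks are illustrative assumptions, not part of the commit.

```python
# Hedged sketch (not the actual get_megatron_optimizer body): constructing
# FP32Optimizer with its new trailing `model` argument, as shown in the hunk.
# The helper name and the import path are assumptions.
from megatron.optimizer.optimizer import FP32Optimizer  # import path assumed

def build_fp32_optimizer(optimizer, args, params_have_main_grad, model):
    # `model` is forwarded so the optimizer can reach the DDP grad buffers.
    return FP32Optimizer(optimizer, args.clip_grad,
                         args.log_num_zeros_in_grad,
                         params_have_main_grad,
                         args.use_contiguous_buffers_in_local_ddp,
                         model)
```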
@@ -75,7 +75,8 @@ class MegatronOptimizer(ABC):
     def __init__(self, optimizer, clip_grad,
                  log_num_zeros_in_grad,
                  params_have_main_grad,
-                 use_contiguous_buffers_in_local_ddp):
+                 use_contiguous_buffers_in_local_ddp,
+                 models):

         """Input optimizer is the base optimizer for example Adam."""
         self.optimizer = optimizer
@@ -86,6 +87,10 @@ class MegatronOptimizer(ABC):
         self.params_have_main_grad = params_have_main_grad
         self.use_contiguous_buffers_in_local_ddp = use_contiguous_buffers_in_local_ddp

+        # 'models' are retained for access to the contiguous grad buffers.
+        # (see distributed optimizer)
+        self.models = models
+
         if self.use_contiguous_buffers_in_local_ddp:
             assert self.params_have_main_grad, \
                 "use of contiguous buffer requires that params have main grad"
@@ -260,11 +265,9 @@ class MixedPrecisionOptimizer(MegatronOptimizer):

         super().__init__(
             optimizer, clip_grad, log_num_zeros_in_grad,
-            params_have_main_grad, use_contiguous_buffers_in_local_ddp)
+            params_have_main_grad, use_contiguous_buffers_in_local_ddp,
+            models)

-        # >>>
-        self.models = models
-        # <<<
         self.bf16 = bf16
         self.grad_scaler = grad_scaler
         # None grad scaler is only supported for bf16.
@@ -382,8 +385,6 @@ class MixedPrecisionOptimizer(MegatronOptimizer):

     @torch.no_grad()
     def step(self, args, timers, ITERATION):
-        # timers = get_timers()
-
         # >>>
         # self.debug_model(ITERATION, "before copy grad.", 0)
         # self.debug_main(ITERATION, "before copy grad.", 0)
@@ -608,16 +609,6 @@ class Float16OptimizerWithFloat16Params(MixedPrecisionOptimizer):
                 if not self.use_contiguous_buffers_in_local_ddp:
                     model_param.main_grad = None

-        # >>>
-        # if ITERATION == DEBUG_ITERATION:
-        #     pax(0, {
-        #         "** branch **" : "** main. **",
-        #         "ITERATION" : ITERATION,
-        #         "model grads" :
-        #         [ p.main_grad for m in self.models for p in m.parameters() ],
-        #     })
-        # <<<
-
     def _copy_main_params_to_model_params(self, ITERATION):
         # Only needed for the float16 params.
@@ -679,11 +670,13 @@ class FP32Optimizer(MegatronOptimizer):

     def __init__(self, optimizer, clip_grad,
                  log_num_zeros_in_grad,
                  params_have_main_grad,
-                 use_contiguous_buffers_in_local_ddp):
+                 use_contiguous_buffers_in_local_ddp,
+                 models):

         super(FP32Optimizer, self).__init__(
             optimizer, clip_grad, log_num_zeros_in_grad,
-            params_have_main_grad, use_contiguous_buffers_in_local_ddp)
+            params_have_main_grad, use_contiguous_buffers_in_local_ddp,
+            models)

         self._scale = torch.cuda.FloatTensor([1.0])
@@ -700,7 +693,7 @@ class FP32Optimizer(MegatronOptimizer):

     @torch.no_grad()
-    def step(self, args, timers):
+    def step(self, args, timers, ITERATION):
         """Clip gradients (if needed) and step the base optimizer.
         Always return successful since there is no overflow."""
@@ -719,7 +712,7 @@ class FP32Optimizer(MegatronOptimizer):

         # Clip gradients.
         grad_norm = None
         if self.clip_grad > 0.0:
-            grad_norm = self.clip_grad_norm(self.clip_grad)
+            grad_norm = self.clip_grad_norm(self.clip_grad, ITERATION)

         # count the zeros in the grads
         num_zeros_in_grad = self.count_zeros() if \
......
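Assembling the FP32Optimizer hunks above, the step path now threads the training iteration down to gradient clipping. A minimal sketch follows, using only the lines the diff shows plus an assumed return convention; the class name is a stand-in, not the real class.

```python
import torch

class FP32OptimizerSketch:  # stand-in for FP32Optimizer, for illustration only
    @torch.no_grad()
    def step(self, args, timers, ITERATION):
        """Clip gradients (if needed) and step the base optimizer.
        Always return successful since there is no overflow."""

        # Clip gradients; ITERATION is now forwarded so clip_grad_norm can
        # key any per-iteration debugging.
        grad_norm = None
        if self.clip_grad > 0.0:
            grad_norm = self.clip_grad_norm(self.clip_grad, ITERATION)

        # Count the zeros in the grads (the diff shows only the start of
        # this expression; the `else None` fallback is assumed).
        num_zeros_in_grad = self.count_zeros() if \
            self.log_num_zeros_in_grad else None

        # Step the underlying optimizer (e.g. Adam); the return triple is an
        # assumption about the surrounding code, not shown in this diff.
        self.optimizer.step()
        return True, grad_norm, num_zeros_in_grad
```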