Commit e858d338 authored by Lawrence McAfee

Moved 'get_main_grads_for_grad_norm()' into the base optimizer class for fp16/fp32 sharing.

parent 11581195
@@ -93,6 +93,7 @@ class MegatronOptimizer(ABC):
         assert self.params_have_main_grad, \
             "use of contiguous buffer requires that params have main grad"

     def get_parameters(self):
         params = []
         for param_group in self.optimizer.param_groups:
@@ -100,9 +101,25 @@ class MegatronOptimizer(ABC):
                 params.append(param)
         return params

-    @abstractmethod
     def get_main_grads_for_grad_norm(self):
-        pass
+
+        # Filter parameters based on:
+        #   - grad should not be none
+        #   - parameter should not be shared
+        #   - should not be a replica due to tensor model parallelism
+        params = self.get_parameters()
+        grads_for_norm = []
+        for param in params:
+            grad = param.grad
+            grad_not_none = grad is not None
+            is_not_shared = param_is_not_shared(param)
+            is_not_tp_duplicate = param_is_not_tensor_parallel_duplicate(param)
+            if grad_not_none and is_not_shared and is_not_tp_duplicate:
+                grads_for_norm.append(grad)
+
+        return grads_for_norm

     def get_model_parallel_group(self):
         '''Default returned here, but the distributed optimizer overrides this.'''
@@ -544,36 +561,6 @@ class Float16OptimizerWithFloat16Params(MixedPrecisionOptimizer):
         _zero_grad_group_helper(group, set_to_none)

-    def get_main_grads_for_grad_norm(self):
-
-        # Filter parameters based on:
-        #   - grad should not be none
-        #   - parameter should not be shared
-        #   - should not be a replica due to tensor model parallelism
-        params = self.get_parameters()
-        # grads = []
-        grads_for_norm = []
-        for param in params:
-            grad = param.grad
-            grad_not_none = grad is not None
-            is_not_shared = param_is_not_shared(param)
-            is_not_tp_duplicate = param_is_not_tensor_parallel_duplicate(param)
-            # if grad_not_none:
-            #     grad = param.grad.detach()
-            # if grad_not_none:
-            #     # Make sure the grads are in fp32
-            #     assert param.grad.type() == 'torch.cuda.FloatTensor'
-            #     grads.append(grad)
-            if grad_not_none and is_not_shared and is_not_tp_duplicate:
-                grads_for_norm.append(grad)
-            # pax(0, {"grads_for_norm": [
-            #     str(tuple(g.shape))
-            #     for g in grads_for_norm
-            # ]})
-
-        return grads_for_norm

     def _collect_main_grad_data_for_unscaling(self):
         main_grads = []
...
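
For context, a minimal sketch of how the filtered grads_for_norm list is typically consumed when computing a gradient norm across tensor-parallel ranks: each rank sums the squared norms of its local, non-duplicated grads, then the partial sums are all-reduced over the model-parallel group. The helper name compute_global_grad_norm and the model_parallel_group argument are illustrative assumptions, not Megatron-LM's actual clip_grads code.

import torch

def compute_global_grad_norm(grads_for_norm, model_parallel_group=None):
    # Hypothetical helper, not Megatron-LM's implementation.
    # grads_for_norm is the output of get_main_grads_for_grad_norm(): grads of
    # unshared, non-tensor-parallel-duplicate params, so nothing is counted twice.
    device = grads_for_norm[0].device if grads_for_norm else torch.device('cpu')
    total_norm_sq = torch.zeros(1, dtype=torch.float32, device=device)
    for grad in grads_for_norm:
        total_norm_sq += grad.detach().float().norm(2) ** 2
    # Sum the squared norms across model-parallel ranks so every rank ends up
    # with the same global norm (each rank only holds its shard of the params).
    if model_parallel_group is not None and torch.distributed.is_initialized():
        torch.distributed.all_reduce(total_norm_sq,
                                     op=torch.distributed.ReduceOp.SUM,
                                     group=model_parallel_group)
    return total_norm_sq.sqrt().item()

With get_main_grads_for_grad_norm() hoisted into MegatronOptimizer, the fp16 and fp32 optimizer subclasses can feed this same filtering step into their gradient-norm path instead of each carrying its own copy, which is what the commit message means by fp16/fp32 sharing.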