Commit 82491e4b authored by Lawrence McAfee

more cleanup; timing tests vs. 'dupe' dist-opt.

parent 91f3579e
@@ -227,7 +227,7 @@ class DistributedOptimizer(MixedPrecisionOptimizer):
                 shard_fp32_from_float16_params_this_group.append(shard_main_param)

             # fp32 params.
-            elif param.type() == 'torch.cuda.FloatTensor':
+            elif model_param.type() == 'torch.cuda.FloatTensor':
                 shard_model_param = model_param.view(-1) \
                     [param_range.start:param_range.end]
                 full_fp32_params_this_group.append(model_param)
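The one-token change above matters because this `elif` runs inside a loop over `model_param`; testing a `param` variable left over from an enclosing scope could route an fp32 parameter down the wrong branch. A minimal sketch of the sharding pattern this branch implements, not the Megatron-LM source: `Range` and `shard_fp32_param` are hypothetical names for illustration, and a CPU float tensor stands in for the CUDA one.

```python
import torch

class Range:
    """Hypothetical stand-in for Megatron's per-rank param-range bookkeeping."""
    def __init__(self, start, end):
        self.start = start
        self.end = end

def shard_fp32_param(model_param, param_range):
    # Dispatch on the loop variable's type -- the bug fixed above was
    # testing a stale `param` instead of `model_param` here.
    # (On GPU the string would be 'torch.cuda.FloatTensor'.)
    if model_param.type() == 'torch.FloatTensor':
        # Flatten the parameter, then keep only the contiguous slice
        # of elements owned by this rank.
        return model_param.view(-1)[param_range.start:param_range.end]
    raise TypeError(f'unexpected param type {model_param.type()}')

# Example: a 4x4 fp32 parameter split across 2 ranks; rank 0 owns [0, 8).
p = torch.nn.Parameter(torch.randn(4, 4))
shard = shard_fp32_param(p, Range(0, 8))
assert shard.numel() == 8
```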
@@ -556,7 +556,6 @@ class Float16OptimizerWithFloat16Params(MixedPrecisionOptimizer):
         self.optimizer.load_state_dict(self.optimizer.state_dict())

-    # >>>
     def zero_grad(self, set_to_none=True):
         """We only need to zero the model related parameters, i.e.,
         float16_groups & fp32_from_fp32_groups. We additionally zero
@@ -569,7 +568,6 @@ class Float16OptimizerWithFloat16Params(MixedPrecisionOptimizer):
             _zero_grad_group_helper(group, set_to_none)
         for group in self.fp32_from_fp32_groups:
             _zero_grad_group_helper(group, set_to_none)
-    # <<<

     def _collect_main_grad_data_for_unscaling(self):
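The `zero_grad` shown above delegates to `_zero_grad_group_helper`, whose body sits outside this hunk. A minimal sketch of the pattern such a helper typically follows, assumed rather than taken from this diff: either drop each grad tensor outright (freeing its memory) when `set_to_none` is requested, or detach it from the autograd graph and zero it in place.

```python
import torch

def _zero_grad_group_helper(group, set_to_none):
    # Sketch of the common zero-grad pattern; not necessarily the exact
    # Megatron-LM implementation.
    for param in group:
        if param.grad is not None:
            if set_to_none:
                # Dropping the tensor frees its memory immediately.
                param.grad = None
            else:
                # Detach from the autograd graph, then zero in place.
                if param.grad.grad_fn is not None:
                    param.grad.detach_()
                else:
                    param.grad.requires_grad_(False)
                param.grad.zero_()

# Example usage on a toy parameter group.
w = torch.nn.Parameter(torch.ones(3))
w.grad = torch.ones(3)
_zero_grad_group_helper([w], set_to_none=True)
assert w.grad is None
```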