Commit 9b96c824 authored by Kexin Yu

seg fault debugging

parent 92186863
@@ -108,7 +108,8 @@ class FusedLAMB(torch.optim.Optimizer):
         g_all_32, g_all_16 = [], []
         for group in self.param_groups:
             for p in group['params']:
-                if p.grad is not None:
+                if p.grad is None:
+                    continue
                 if p.dtype == torch.float32:
                     g_all_32.append(p.grad.data)
                 elif p.dtype == torch.float16:
@@ -116,6 +117,7 @@ class FusedLAMB(torch.optim.Optimizer):
                 else:
                     raise RuntimeError('FusedLAMB only support fp16 and fp32.')
+        print("====after collect")
         # compute grad norm for two lists
         g_norm_32, _ = multi_tensor_applier(self.multi_tensor_l2norm,
                                             self._dummy_overflow_buf,
@@ -124,9 +126,13 @@ class FusedLAMB(torch.optim.Optimizer):
                                             self._dummy_overflow_buf,
                                             [g_all_16], False)
+        print("====after multi_tensor_l2norm")
         # blend two grad norms to get global grad norm
         global_grad_norm = math.sqrt(g_norm_32 * g_norm_32 + g_norm_16 * g_norm_16)
         max_grad_norm = self.defaults['max_grad_norm']
+        print("====global_grad_norm:", global_grad_norm)
+        print("====max_grad_norm:", max_grad_norm)
         for group in self.param_groups:
             bias_correction = 1 if group['bias_correction'] else 0
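For context, the collect-and-blend logic that the debug prints bracket reduces to an L2 norm over all gradients, split by dtype and recombined as sqrt(norm_fp32^2 + norm_fp16^2). The sketch below is a minimal pure-PyTorch stand-in, not part of this commit: the fake parameters and the l2norm helper are illustrative replacements for the optimizer's param groups and multi_tensor_l2norm, but the None-grad skip and the norm blend mirror the lines above.

    import math
    import torch

    # Fake "parameters": one fp32 tensor with a gradient, one fp16 tensor whose grad is None.
    p32 = torch.nn.Parameter(torch.zeros(4))
    p32.grad = torch.full((4,), 2.0)
    p16 = torch.nn.Parameter(torch.zeros(4, dtype=torch.float16))
    params = [p32, p16]

    # Collect grads per dtype, skipping params without gradients (the commit's early-continue fix).
    g_all_32, g_all_16 = [], []
    for p in params:
        if p.grad is None:
            continue
        if p.dtype == torch.float32:
            g_all_32.append(p.grad.data)
        elif p.dtype == torch.float16:
            g_all_16.append(p.grad.data)

    def l2norm(grads):
        # Stand-in for multi_tensor_l2norm: L2 norm over every element in the list of grads.
        if not grads:
            return 0.0
        return torch.cat([g.reshape(-1) for g in grads]).norm().item()

    # Blend the per-dtype norms into one global gradient norm.
    g_norm_32, g_norm_16 = l2norm(g_all_32), l2norm(g_all_16)
    global_grad_norm = math.sqrt(g_norm_32 * g_norm_32 + g_norm_16 * g_norm_16)
    print("global_grad_norm:", global_grad_norm)  # 4.0 = sqrt(4 * 2.0**2) for the fake grads above

Writing the guard as an early continue keeps the dtype branches at loop level, so a parameter with no gradient never reaches the dtype check or the fused norm kernels, which is presumably the behaviour being verified here while chasing the seg fault.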