OpenDAS / Megatron-LM / Commits

Commit 087cbff1, authored Mar 29, 2022 by Lawrence McAfee
removed debug_base/main/model methods.
Parent: be8de1b3
Showing 3 changed files with 0 additions and 78 deletions:

megatron/optimizer/clip_grads.py         +0 -5
megatron/optimizer/distrib_optimizer.py  +0 -12
megatron/optimizer/optimizer.py          +0 -61
megatron/optimizer/clip_grads.py

@@ -99,11 +99,6 @@ def clip_grad_norm_fp32(parameters, grads_for_norm,
                                          group=model_parallel_group)
         total_norm = total_norm.item() ** (1.0 / norm_type)

-    # >>>
-    # from lutil import pax, tp, print_seq
-    # print_seq("norm : grad %s, total %s." % (grad_norm.item(), total_norm))
-    # <<<
     # Scale.
     clip_coeff = max_norm / (total_norm + 1.0e-6)
     if clip_coeff < 1.0:
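The context lines around the removed debug print show the core of clip_grad_norm_fp32: the accumulated norm becomes total_norm, and the gradients are rescaled by max_norm / (total_norm + 1.0e-6) whenever that coefficient drops below 1.0. Below is a minimal single-rank sketch of that clipping step; the function name clip_grads_sketch and the plain list-of-tensors interface are illustrative, not Megatron-LM API, and the model-parallel all-reduce from the real code is omitted.

import torch

def clip_grads_sketch(grads, max_norm, norm_type=2.0):
    """Illustrative, single-rank version of the clipping math shown above."""
    # Accumulate the p-norm of every gradient tensor.
    total_norm = sum(torch.norm(g, norm_type) ** norm_type for g in grads)
    total_norm = float(total_norm) ** (1.0 / norm_type)

    # Scale (mirrors the context lines above).
    clip_coeff = max_norm / (total_norm + 1.0e-6)
    if clip_coeff < 1.0:
        for g in grads:
            g.mul_(clip_coeff)
    return total_norm

A typical call would be clip_grads_sketch([p.grad for p in model.parameters() if p.grad is not None], max_norm=1.0).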
megatron/optimizer/distrib_optimizer.py

@@ -27,10 +27,6 @@ from megatron.mpu.layers import param_is_not_tensor_parallel_duplicate
 from .optimizer import MixedPrecisionOptimizer, _zero_grad_group_helper

-# >>>
-from lutil import pax, tp, print_seq
-# <<<

 class Range:

@@ -363,14 +359,6 @@ class DistributedOptimizer(MixedPrecisionOptimizer):
         fp32_from_float16_groups as a memory optimization to reduce
         fragmentation; in the case of set_to_none==True, the space
         used by this field can be safely deallocated at this point."""
-        # >>>
-        # params = [ p for g in self.shard_fp32_groups for p in g ]
-        # pax(0, {
-        #     "shard_fp32_groups" : self.shard_fp32_groups,
-        #     "params" : params,
-        #     "grads" : [ p.grad for p in params ],
-        # })
-        # <<<
         for groups in (
             self.full_float16_groups,
             self.full_fp32_groups,
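The docstring context above refers to the set_to_none path of zero_grad, and the first hunk shows _zero_grad_group_helper being imported from .optimizer (its signature and docstring appear in the next file's diff). As a rough sketch of what such a helper does, with the body below an assumption based on that docstring rather than the exact Megatron-LM implementation:

def _zero_grad_group_helper(group, set_to_none):
    """Zero out the gradient for a group of parameters.

    Sketch only: the set_to_none branch releases the gradient storage
    (which is what lets the space mentioned in the docstring above be
    deallocated), while the other branch zeroes the buffer in place.
    """
    for param in group:
        if param.grad is None:
            continue
        if set_to_none:
            param.grad = None        # drop the buffer so it can be freed
        else:
            param.grad.detach_()     # detach from any autograd graph
            param.grad.zero_()       # keep the buffer, zero its contents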
megatron/optimizer/optimizer.py

@@ -33,10 +33,6 @@ from megatron.utils import unwrap_model
 from .clip_grads import clip_grad_norm_fp32, count_zeros_fp32

-# >>>
-from lutil import pax, tp, print_seq
-# <<<

 def _zero_grad_group_helper(group, set_to_none):
     """Zero out the gradient for a group of parameters.

@@ -349,63 +345,6 @@ class MixedPrecisionOptimizer(MegatronOptimizer):
         return found_inf_flag

-    # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
-    # @classmethod
-    # def debug_base(cls, ITERATION, key, value):
-    #     from megatron import get_args
-    #     args = get_args()
-    #     my_rank = torch.distributed.get_rank()
-    #     DEBUG_ITERATION = ITERATION
-    #     if ITERATION != DEBUG_ITERATION:
-    #         return
-    #     for r in range(torch.distributed.get_world_size()):
-    #         if my_rank == r:
-    #             # prefix = " + "
-    #             prefix = ""
-    #             print("%sbr/%s; [r%d, i%d]; %s, %.12e" % (prefix, "fix " if args.use_distributed_optimizer else "main", my_rank, ITERATION, key, value))
-    #         torch.distributed.barrier()
-    #     torch.distributed.barrier()
-    #     # if my_rank == 0:
-    #     #     raise Exception("debug.")
-    #     # else:
-    #     #     exit(0)
-    #     exit(0)
-    # def debug_model(self, ITERATION, key, use_grad):
-    #     use_grad = bool(use_grad)
-    #     tensors = [
-    #         (p.main_grad.float() if use_grad else p.float())
-    #         for m in self.models for p in m.parameters()
-    #     ]
-    #     count = sum(t.nelement() for t in tensors)
-    #     return self.debug_base(
-    #         ITERATION,
-    #         "model/%s, %s [count %d]" % (
-    #             "grad" if use_grad else "param",
-    #             key,
-    #             count,
-    #         ),
-    #         # sum(torch.sum(torch.abs(t)) for t in tensors).item() / count,
-    #         sum(torch.sum(torch.abs(t)) for t in tensors),
-    #     )
-    # def debug_main(self, ITERATION, key, use_grad):
-    #     use_grad = bool(use_grad)
-    #     tensors = [
-    #         p.grad if use_grad else p
-    #         for g in self.optimizer.param_groups
-    #         for p in g["params"]
-    #     ]
-    #     tensors = [ t.float() for t in tensors ]
-    #     count = sum(t.nelement() for t in tensors)
-    #     return self.debug_base(
-    #         ITERATION,
-    #         "main/%s, %s [count %d]" % (
-    #             "grad" if use_grad else "param",
-    #             key,
-    #             count,
-    #         ),
-    #         sum(torch.sum(torch.abs(t)) for t in tensors),
-    #     )
-    # <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

     @torch.no_grad()
     def step(self, args, timers):
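The removed block implemented rank-ordered debug printing: debug_model and debug_main reduce the model or main parameters/gradients to a single sum of absolute values, and debug_base prints that checksum from one rank at a time, using torch.distributed.barrier() to serialize the output. A stripped-down sketch of just that print-in-rank-order pattern follows; the function name print_in_rank_order is illustrative, and it assumes torch.distributed has already been initialized.

import torch

def print_in_rank_order(message):
    """Print `message` from every rank, one rank at a time.

    Mirrors the barrier-per-rank loop in the removed debug_base helper.
    """
    my_rank = torch.distributed.get_rank()
    for r in range(torch.distributed.get_world_size()):
        if my_rank == r:
            print("[r%d] %s" % (my_rank, message))
        # All ranks wait here, so rank r finishes before rank r + 1 prints.
        torch.distributed.barrier()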