Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
ColossalAI
Commits
dbd96fe9
Unverified
Commit
dbd96fe9
authored
Apr 11, 2022
by
HELSON
Committed by
GitHub
Apr 11, 2022
Browse files
[zero] check whether gradients have inf and nan in gpu (#712)
parent
715b86ea
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
90 additions
and
11 deletions
+90
-11
colossalai/zero/sharded_model/sharded_model_v2.py
colossalai/zero/sharded_model/sharded_model_v2.py
+8
-0
colossalai/zero/sharded_optim/sharded_optim_v2.py
colossalai/zero/sharded_optim/sharded_optim_v2.py
+5
-11
tests/test_zero_data_parallel/test_found_inf.py
tests/test_zero_data_parallel/test_found_inf.py
+77
-0
No files found.
colossalai/zero/sharded_model/sharded_model_v2.py
View file @
dbd96fe9
...
@@ -148,6 +148,9 @@ class ShardedModelV2(nn.Module):
...
@@ -148,6 +148,9 @@ class ShardedModelV2(nn.Module):
self
.
_cuda_margin_space
=
0
self
.
_cuda_margin_space
=
0
self
.
reuse_fp16_shard
=
reuse_fp16_shard
self
.
reuse_fp16_shard
=
reuse_fp16_shard
# record whether gradients have inf or nan
self
.
overflow_counter
=
0
def
adjust_stateful_tensor_layout
(
self
)
->
None
:
def
adjust_stateful_tensor_layout
(
self
)
->
None
:
self
.
_stateful_tensor_mgr
.
adjust_layout
()
self
.
_stateful_tensor_mgr
.
adjust_layout
()
...
@@ -345,6 +348,11 @@ class ShardedModelV2(nn.Module):
...
@@ -345,6 +348,11 @@ class ShardedModelV2(nn.Module):
# FIXME(ver217): refactor the below line when impl eviction policy
# FIXME(ver217): refactor the below line when impl eviction policy
def
_save_grad
(
self
,
param
:
Parameter
,
grad
:
torch
.
Tensor
):
def
_save_grad
(
self
,
param
:
Parameter
,
grad
:
torch
.
Tensor
):
# record whether we have overflow
self
.
overflow_counter
+=
torch
.
isinf
(
grad
).
any
().
item
()
self
.
overflow_counter
+=
torch
.
isnan
(
grad
).
any
().
item
()
# move gradient to cpu
# move gradient to cpu
if
param
.
colo_attr
.
offload_grad
:
if
param
.
colo_attr
.
offload_grad
:
colo_model_data_move_to_cpu
(
grad
)
colo_model_data_move_to_cpu
(
grad
)
...
...
colossalai/zero/sharded_optim/sharded_optim_v2.py
View file @
dbd96fe9
...
@@ -118,7 +118,7 @@ class ShardedOptimizerV2(ColossalaiOptimizer):
...
@@ -118,7 +118,7 @@ class ShardedOptimizerV2(ColossalaiOptimizer):
growth_interval
=
growth_interval
,
growth_interval
=
growth_interval
,
hysteresis
=
hysteresis
,
hysteresis
=
hysteresis
,
max_scale
=
max_scale
)
max_scale
=
max_scale
)
self
.
_found_overflow
:
Tensor
=
torch
.
Floa
tTensor
([
0
]).
to
(
torch
.
cuda
.
current_device
())
self
.
_found_overflow
:
Tensor
=
torch
.
In
tTensor
([
0
]).
to
(
torch
.
cuda
.
current_device
())
self
.
_logger
=
get_dist_logger
(
"ShardedOptimizerV2"
)
self
.
_logger
=
get_dist_logger
(
"ShardedOptimizerV2"
)
# Store fp32 param shards
# Store fp32 param shards
...
@@ -210,20 +210,13 @@ class ShardedOptimizerV2(ColossalaiOptimizer):
...
@@ -210,20 +210,13 @@ class ShardedOptimizerV2(ColossalaiOptimizer):
def
_check_overflow
(
self
):
def
_check_overflow
(
self
):
# clear previous overflow record
# clear previous overflow record
self
.
_found_overflow
.
fill_
(
0.0
)
self
.
_found_overflow
.
fill_
(
self
.
model
.
overflow_counter
)
# check for overflow
for
group
in
self
.
optim
.
param_groups
:
for
p
in
group
[
'params'
]:
if
has_inf_or_nan
(
p
.
grad
):
self
.
_found_overflow
.
fill_
(
1.0
)
break
# all-reduce across dp group
# all-reduce across dp group
dist
.
all_reduce
(
self
.
_found_overflow
,
op
=
dist
.
ReduceOp
.
MAX
,
group
=
self
.
dp_process_group
)
dist
.
all_reduce
(
self
.
_found_overflow
,
group
=
self
.
dp_process_group
)
# all-reduce over model parallel group
# all-reduce over model parallel group
dist
.
all_reduce
(
self
.
_found_overflow
,
op
=
dist
.
ReduceOp
.
MAX
,
group
=
self
.
mp_process_group
)
dist
.
all_reduce
(
self
.
_found_overflow
,
group
=
self
.
mp_process_group
)
return
self
.
_found_overflow
.
item
()
>
0
return
self
.
_found_overflow
.
item
()
>
0
...
@@ -259,6 +252,7 @@ class ShardedOptimizerV2(ColossalaiOptimizer):
...
@@ -259,6 +252,7 @@ class ShardedOptimizerV2(ColossalaiOptimizer):
else
:
else
:
# release saved gradient
# release saved gradient
p
.
colo_attr
.
saved_grad
.
set_null
()
p
.
colo_attr
.
saved_grad
.
set_null
()
self
.
model
.
overflow_counter
=
0
# set overflow counter to zero
def
sync_grad
(
self
):
def
sync_grad
(
self
):
pass
pass
...
...
tests/test_zero_data_parallel/test_found_inf.py
0 → 100644
View file @
dbd96fe9
from
functools
import
partial
import
colossalai
from
colossalai.utils.cuda
import
get_current_device
import
pytest
import
torch
import
torch.multiprocessing
as
mp
from
colossalai.nn.optimizer
import
HybridAdam
from
colossalai.testing
import
parameterize
,
rerun_on_exception
from
colossalai.utils
import
free_port
from
colossalai.zero.init_ctx
import
ZeroInitContext
from
colossalai.zero.shard_utils
import
BucketTensorShardStrategy
from
colossalai.zero.sharded_model
import
ShardedModelV2
from
colossalai.zero.sharded_optim
import
ShardedOptimizerV2
from
colossalai.zero.sharded_optim._utils
import
has_inf_or_nan
from
tests.components_to_test.registry
import
non_distributed_component_funcs
from
tests.test_zero_data_parallel.test_sharded_optim_v2
import
_run_step
from
common
import
CONFIG
@parameterize("cpu_offload", [True, False])
@parameterize("shard_strategy_class", [BucketTensorShardStrategy])
@parameterize("gpu_margin_mem_ratio", [0.0, 0.7])
def _run_test_found_inf(cpu_offload, shard_strategy_class, gpu_margin_mem_ratio):
    """Train a sharded ZeRO model for two steps and verify overflow tracking.

    For each test model this:
      1. builds the model under ``ZeroInitContext`` (on CPU or the current
         CUDA device depending on ``cpu_offload``),
      2. wraps it in ``ShardedModelV2`` / ``ShardedOptimizerV2``,
      3. asserts ``overflow_counter`` is zero before every step (the optimizer
         is expected to reset it after each step), and
      4. asserts no sharded parameter payload contains inf/nan after a step.

    Args:
        cpu_offload (bool): offload params/grads to CPU when True.
        shard_strategy_class: strategy class used to shard parameters.
        gpu_margin_mem_ratio (float): ratio of free GPU memory the optimizer
            may use; > 0 also enables the memory tracer.
    """
    test_models = ['repeated_computed_layers']
    shard_strategy = shard_strategy_class()
    for model_name in test_models:
        get_components_func = non_distributed_component_funcs.get_callable(model_name)
        # optimizer_class is unused here: the test always uses HybridAdam.
        model_builder, train_dataloader, _, _optimizer_class, criterion = get_components_func()

        # NOTE: was f'cpu:0' — the f-prefix was redundant (no placeholders).
        with ZeroInitContext(target_device=torch.device('cpu:0')
                             if cpu_offload else torch.device(f'cuda:{get_current_device()}'),
                             shard_strategy=shard_strategy,
                             shard_param=True):
            zero_model = model_builder(checkpoint=True)
        zero_model = ShardedModelV2(
            zero_model,
            shard_strategy,
            offload_config=dict(device='cpu') if cpu_offload else None,
            use_memory_tracer=gpu_margin_mem_ratio > 0.0,
            reuse_fp16_shard=True,
        )

        sharded_optim = HybridAdam(zero_model.parameters(), lr=1e-3)
        sharded_optim = ShardedOptimizerV2(zero_model,
                                           sharded_optim,
                                           cpu_offload=cpu_offload,
                                           gpu_margin_mem_ratio=gpu_margin_mem_ratio)

        for i, (data, label) in enumerate(train_dataloader):
            # Two steps are enough to exercise the counter reset path.
            if i > 1:
                break
            assert zero_model.overflow_counter == 0
            data, label = data.cuda(), label.cuda()
            _run_step(zero_model, sharded_optim, data, label, criterion, False)
            for param in zero_model.parameters():
                assert not has_inf_or_nan(param.colo_attr.sharded_data_tensor.payload)
def _run_dist(rank, world_size, port):
    """Per-process entry point: initialize the distributed context, then run
    the overflow-detection test on this rank.

    Args:
        rank (int): rank of this worker within the process group.
        world_size (int): total number of spawned workers.
        port (int): TCP port for the NCCL rendezvous on localhost.
    """
    colossalai.launch(
        config=CONFIG,
        rank=rank,
        world_size=world_size,
        host='localhost',
        port=port,
        backend='nccl',
    )
    _run_test_found_inf()
# use_cpuadam = True can be used with cpu_offload = False
@pytest.mark.dist
@pytest.mark.parametrize("world_size", [1, 2])
@rerun_on_exception(exception_type=mp.ProcessRaisedException, pattern=".*Address already in use.*")
def test_found_inf(world_size):
    """Spawn ``world_size`` workers on a free port and run the distributed
    inf/nan overflow test; retried automatically on port collisions."""
    port = free_port()
    worker = partial(_run_dist, world_size=world_size, port=port)
    mp.spawn(worker, nprocs=world_size)
# Allow running this test directly without pytest (2-GPU configuration).
if __name__ == '__main__':
    test_found_inf(2)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment