OpenDAS / ColossalAI, commit 17e73e62 (unverified)

Authored Apr 03, 2022 by HELSON; committed by GitHub on Apr 03, 2022
Parent: 0aab5230

[hotfix] fix bugs for unsharded parameters when restore data (#664)

1 changed file with 11 additions and 5 deletions:

colossalai/zero/sharded_optim/sharded_optim_v2.py (+11, -5)
@@ -132,7 +132,7 @@ class ShardedOptimizerV2(ColossalaiOptimizer):

        # Store fp32 param shards
        self._register_master_weight()

        self._logger.debug(f"After init ShardedOptimizerV2 consumes {self.get_memory_usage()[0] / 1e6} MB CUDA Memory!",
                           ranks=[0])

        self._use_memory_tracer = self.model.use_memory_tracer
@@ -185,13 +185,13 @@ class ShardedOptimizerV2(ColossalaiOptimizer):

        self._point_param_fp16_to_master_param()

        self._logger.debug(f"Before step ShardedOptimizerV2 consumes {self.get_memory_usage()[0] / 1e6} MB CUDA Memory, {self.get_memory_usage()[1] / 1e6} MB CUDA Memory!",
                           ranks=[0])

        ret = self.optim.step(*args, **kwargs)

        self._logger.debug(f"After step ShardedOptimizerV2 consumes {self.get_memory_usage()[0] / 1e6} MB CUDA Memory, {self.get_memory_usage()[1] / 1e6} MB CUDA Memory!",
                           ranks=[0])

        self._copy_master_param_to_param_fp16()
        return ret
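The lines above show how step() brackets the wrapped optimizer: fp16 params are pointed at their fp32 master shards before the step and copied back to fp16 afterwards. The following is a minimal sketch of that pattern in plain PyTorch; it is an assumed simplification for illustration (the master dict and the fabricated gradient are not ColossalAI code, and sharding is omitted):

    import torch

    # one fp16 working parameter and its fp32 master copy (stand-ins for a sharded setup)
    p = torch.nn.Parameter(torch.randn(4, dtype=torch.float16))
    master = {p: p.data.float()}
    optim = torch.optim.SGD([p], lr=0.1)

    p.data = master[p]           # point the param at the fp32 master (cf. _point_param_fp16_to_master_param)
    p.grad = torch.ones_like(p)  # a real gradient would come from backward(); fabricated for the sketch
    optim.step()                 # the wrapped optimizer updates the shared fp32 storage in place
    p.data = master[p].half()    # cast the updated master back to fp16 (cf. _copy_master_param_to_param_fp16)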
@@ -264,8 +264,14 @@ class ShardedOptimizerV2(ColossalaiOptimizer):

                reuse_fp16_shard = p.colo_attr.saved_grad.data_ptr() == p.colo_attr.sharded_data_tensor.data_ptr()
                p.colo_attr.saved_grad.set_null()
                if recover_data and reuse_fp16_shard:
-                   p.colo_attr.sharded_data_tensor.reset_payload(
-                       colo_model_tensor_clone(self.master_params[p].payload.half(), torch.cuda.current_device()))
+                   # We should write like this to trigger ForceFP32Paramter's half method
+                   p.data = self.master_params[p].payload
+                   p.colo_attr.sharded_data_tensor.reset_payload(
+                       colo_model_tensor_clone(p.half(), torch.cuda.current_device()))
+
+                   if not p.colo_attr.param_is_sharded:
+                       # FIXME(hhc): add hook for unsharded parameters
+                       p.data = p.colo_attr.sharded_data_tensor.payload

    def sync_grad(self):
        pass
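The added comment explains the reordering: p.data is first pointed at the fp32 master payload, and half() is then called on the parameter object itself, so a Parameter subclass that overrides half() (the ForceFP32Paramter mentioned in the comment) can intercept the cast; calling .half() directly on the master payload, as the removed line did, would always use plain Tensor.half(). A minimal sketch of that dispatch difference, with ForceFP32Param as a hypothetical stand-in rather than ColossalAI's class:

    import torch
    import torch.nn as nn

    class ForceFP32Param(nn.Parameter):
        # hypothetical override: keep this parameter in fp32 even when half() is requested
        def half(self):
            return self

    p = ForceFP32Param(torch.randn(4))
    fp32_payload = torch.randn(4)        # plays the role of self.master_params[p].payload

    p.data = fp32_payload                # as in the added line: point p at the fp32 payload
    print(p.half().dtype)                # torch.float32: the subclass override ran
    print(fp32_payload.half().dtype)     # torch.float16: plain Tensor.half(), no override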
@@ -281,7 +287,7 @@ class ShardedOptimizerV2(ColossalaiOptimizer):

                    # As we only store param shard, we shard it here
                    self.shard_strategy.shard([p.colo_attr.sharded_data_tensor], self.dp_process_group)
                self.master_params[p] = StatefulTensor(
-                   cast_tensor_to_fp32(p.colo_attr.sharded_data_tensor.payload).to(self.device))
+                   cast_tensor_to_fp32(p.colo_attr.sharded_data_tensor.payload.to(self.device)))
                if not is_param_sharded and not self.keep_unshard:
                    # In this branch, there's no need to shard param
                    # So we gather here
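The one-line change moves .to(self.device) inside the cast, so the fp16 shard is transferred to self.device first and upcast there, instead of being upcast on its current device and then moved. Both orders give the same fp32 result; when self.device differs from the shard's device, the new order also avoids allocating the fp32 intermediate on the shard's device, which is a plausible but unstated motivation. A small sketch in plain PyTorch, where .to(torch.float32) is assumed to stand in for cast_tensor_to_fp32 and the payload and device values are illustrative:

    import torch

    payload = torch.randn(1024, dtype=torch.float16)   # stand-in for p.colo_attr.sharded_data_tensor.payload
    device = torch.device('cpu')                       # stand-in for self.device

    old_style = payload.to(torch.float32).to(device)   # upcast first, then move (removed line)
    new_style = payload.to(device).to(torch.float32)   # move the fp16 shard first, then upcast (added line)

    assert old_style.dtype == new_style.dtype == torch.float32
    assert torch.equal(old_style, new_style)           # same values either way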