OpenDAS / ColossalAI · Commits

Commit b31daed4 (unverified)
Authored Apr 02, 2022 by HELSON; committed by GitHub on Apr 02, 2022

fix bugs in CPU adam (#633)

* add cpu adam counter for all cpu adam
* fixed updating error in adam kernel
Parent: 1e2557e8

Changes: 6 changed files with 25 additions (+25) and 13 deletions (-13)
colossalai/kernel/cuda_native/csrc/cpu_adam.cpp    +1  -1
colossalai/nn/optimizer/__init__.py                +3  -1
colossalai/nn/optimizer/cpu_adam.py                +4  -5
colossalai/nn/optimizer/hybrid_adam.py             +2  -3
colossalai/nn/optimizer/utils.py                   +14 -0
tests/test_moe/test_moe_zero_optim.py              +1  -3
colossalai/kernel/cuda_native/csrc/cpu_adam.cpp

@@ -493,7 +493,7 @@ int adam_step(int optimizer_id,
               grads_ptr,
               exp_avg_ptr,
               exp_avg_sq_ptr,
-              params_c.size(0),
+              params_c.numel(),
               (params.options().dtype() == at::kHalf),
               (grads.options().dtype() == at::kHalf),
               loss_scale);
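The one-line kernel fix above passes the parameter's total element count instead of its first-dimension size. As an illustrative sketch (not part of the commit), the distinction matters for any multi-dimensional parameter: size(0) counts only the leading dimension, while numel() counts every element the Adam update has to touch.

    import torch

    # Hypothetical 2-D parameter, e.g. the weight matrix of a linear layer.
    param = torch.zeros(128, 256)

    print(param.size(0))   # 128   -> first dimension only; used as an element
                           #          count it would cover just a slice of the tensor
    print(param.numel())   # 32768 -> 128 * 256, the full number of elements
                           #          the CPU Adam kernel must update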
colossalai/nn/optimizer/__init__.py

+from .utils import CPU_ADAM_CNT
 from .colossalai_optimizer import ColossalaiOptimizer
 from .fused_adam import FusedAdam
 from .fused_lamb import FusedLAMB
@@ -7,4 +8,5 @@ from .lars import Lars
 from .cpu_adam import CPUAdam
 from .hybrid_adam import HybridAdam

-__all__ = ['ColossalaiOptimizer', 'FusedLAMB', 'FusedAdam', 'FusedSGD', 'Lamb', 'Lars', 'CPUAdam', 'HybridAdam']
+__all__ = ['ColossalaiOptimizer', 'FusedLAMB', 'FusedAdam', 'FusedSGD', 'Lamb', 'Lars', 'CPUAdam', 'HybridAdam',
+           'CPU_ADAM_CNT']
colossalai/nn/optimizer/cpu_adam.py

@@ -2,6 +2,7 @@ import math
 import torch
 from colossalai.registry import OPTIMIZERS
+from colossalai.nn.optimizer import CPU_ADAM_CNT

 @OPTIMIZERS.register_module
@@ -51,7 +52,6 @@ class CPUAdam(torch.optim.Optimizer):
     https://openreview.net/forum?id=ryQu7f-RZ
     """
-    optimizer_id = 0
     # Number of fp32 shards for per parameter
     # Param weight, grad, momentum and variance
     num_fp32_shards_per_param = 4
@@ -68,8 +68,7 @@ class CPUAdam(torch.optim.Optimizer):
         default_args = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, bias_correction=bias_correction)
         super(CPUAdam, self).__init__(model_params, default_args)
-        self.opt_id = CPUAdam.optimizer_id
-        CPUAdam.optimizer_id = CPUAdam.optimizer_id + 1
+        self.opt_id = CPU_ADAM_CNT()
         self.adamw_mode = adamw_mode
         try:
             import cpu_adam
@@ -152,8 +151,8 @@ class CPUAdam(torch.optim.Optimizer):
                 assert state['exp_avg'].device.type == 'cuda', "exp_avg should stay on cuda"
                 assert state['exp_avg_sq'].device.type == 'cuda', "exp_avg should stay on cuda"

-                bias_correction1 = 1 - beta1**state['step']
-                bias_correction2 = 1 - beta2**state['step']
+                bias_correction1 = 1 - beta1**state['step']
+                bias_correction2 = 1 - beta2**state['step']

                 # adam on cuda
                 self.torch_adam_update(p.data, p.grad.data, state['exp_avg'], state['exp_avg_sq'], group['lr'],
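The constructor change above (mirrored for HybridAdam below) replaces the per-class optimizer_id counter with the shared CPU_ADAM_CNT. With separate class-level counters, a CPUAdam and a HybridAdam created in the same process would both receive id 0 and could clash in the C++ extension's per-optimizer state. A minimal sketch of that collision, using stand-in classes rather than the real optimizers:

    class CPUAdamOld:
        optimizer_id = 0

        def __init__(self):
            # old scheme: each class counts only its own instances
            self.opt_id = CPUAdamOld.optimizer_id
            CPUAdamOld.optimizer_id += 1

    class HybridAdamOld:
        optimizer_id = 0

        def __init__(self):
            self.opt_id = HybridAdamOld.optimizer_id
            HybridAdamOld.optimizer_id += 1

    # The first instance of each class gets the same id.
    print(CPUAdamOld().opt_id, HybridAdamOld().opt_id)   # 0 0  -> collision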
colossalai/nn/optimizer/hybrid_adam.py

@@ -2,6 +2,7 @@ import torch
 from colossalai.utils import multi_tensor_applier
 from colossalai.registry import OPTIMIZERS
+from colossalai.nn.optimizer import CPU_ADAM_CNT

 @OPTIMIZERS.register_module
@@ -50,7 +51,6 @@ class HybridAdam(torch.optim.Optimizer):
     https://openreview.net/forum?id=ryQu7f-RZ
     """
-    optimizer_id = 0
     # Number of fp32 shards for per parameter
     # Param weight, grad, momentum and variance
     num_fp32_shards_per_param = 4
@@ -67,8 +67,7 @@ class HybridAdam(torch.optim.Optimizer):
         default_args = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, bias_correction=bias_correction)
         super(HybridAdam, self).__init__(model_params, default_args)
-        self.opt_id = HybridAdam.optimizer_id
-        HybridAdam.optimizer_id = HybridAdam.optimizer_id + 1
+        self.opt_id = CPU_ADAM_CNT()
         self.adamw_mode = adamw_mode
         try:
             import cpu_adam
colossalai/nn/optimizer/utils.py (new file, mode 100644)

+class CpuAdamCounter(object):
+    """Used to record the total number of CPU Adam.
+    We must use it to avoid hybrid cpu adam and cpu adam using the same id.
+    """
+
+    def __init__(self):
+        self.number = 0
+
+    def __call__(self):
+        self.number += 1
+        return self.number - 1
+
+
+CPU_ADAM_CNT = CpuAdamCounter()
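Because the __init__.py change also exports the counter, both optimizer classes draw ids from the same object. A short usage sketch, assuming ColossalAI at this commit is importable:

    from colossalai.nn.optimizer import CPU_ADAM_CNT

    # Each call returns the next free id regardless of which optimizer asks,
    # so CPUAdam and HybridAdam instances can no longer share an opt_id.
    print(CPU_ADAM_CNT())   # 0
    print(CPU_ADAM_CNT())   # 1
    print(CPU_ADAM_CNT())   # 2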
tests/test_moe/test_moe_zero_optim.py

 from functools import partial

 import colossalai
-from colossalai.utils.cuda import get_current_device
 import pytest
 import torch
 import torch.multiprocessing as mp
@@ -51,11 +50,10 @@ def _run_step(model, optimizer, data, label, criterion, grad_handler):
 @parameterize("use_cpuadam", [True, False])
 @parameterize("shard_strategy_class", [TensorShardStrategy, BucketTensorShardStrategy])
 def _run_test_sharded_optim_v2(cpu_offload, shard_strategy_class, use_cpuadam, gpu_margin_mem_ratio=0.0):
-    MOE_CONTEXT.reset_loss()
     shard_strategy = shard_strategy_class()
     if use_cpuadam and cpu_offload is False:
         return
+    MOE_CONTEXT.reset_loss()
     get_components_func = non_distributed_component_funcs.get_callable('no_leaf_module')
     _, train_dataloader, _, optimizer_class, criterion = get_components_func()