OpenDAS / ColossalAI · Commits

Commit e619a651 (unverified)
Authored Apr 01, 2022 by ver217; committed via GitHub on Apr 01, 2022

polish optimizer docstring (#619)
Parent: 8432dc70
Showing 5 changed files with 66 additions and 83 deletions (+66 −83).
colossalai/nn/optimizer/cpu_adam.py      +2   −2
colossalai/nn/optimizer/fused_adam.py    +5   −5
colossalai/nn/optimizer/fused_lamb.py    +1   −1
colossalai/nn/optimizer/fused_sgd.py     +46  −63
colossalai/nn/optimizer/hybrid_adam.py   +12  −12
colossalai/nn/optimizer/cpu_adam.py

@@ -44,8 +44,8 @@ class CPUAdam(torch.optim.Optimizer):
             True for decoupled weight decay(also known as AdamW) (default: True)
         simd_log (boolean, optional): whether to show if you are using SIMD to
             accelerate. (default: False)
-    .. _Adam: A Method for Stochastic Optimization:
+    .. _Adam\: A Method for Stochastic Optimization:
         https://arxiv.org/abs/1412.6980
     .. _On the Convergence of Adam and Beyond:
         https://openreview.net/forum?id=ryQu7f-RZ
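For context, a minimal usage sketch of CPUAdam under the documented defaults (``adamw_mode=True`` gives decoupled, AdamW-style weight decay). The model, the hyperparameter values, and the assumption that the CPU extension is built via ``pip install .`` are illustrative, not part of this commit:

import torch
from colossalai.nn.optimizer import CPUAdam

# Illustrative model; CPUAdam keeps the optimizer states and the update on the CPU.
model = torch.nn.Linear(16, 4)

# adamw_mode=True (the documented default) applies decoupled weight decay, i.e. AdamW.
optimizer = CPUAdam(model.parameters(), lr=1e-3, weight_decay=1e-2, adamw_mode=True)

loss = model(torch.randn(8, 16)).sum()
loss.backward()
optimizer.step()
optimizer.zero_grad()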
colossalai/nn/optimizer/fused_adam.py

@@ -41,7 +41,7 @@ class FusedAdam(torch.optim.Optimizer):
         set_grad_none (bool, optional): whether set grad to None when zero_grad()
             method is called. (default: True)
-    .. _Adam: A Method for Stochastic Optimization:
+    .. _Adam\: A Method for Stochastic Optimization:
         https://arxiv.org/abs/1412.6980
     .. _On the Convergence of Adam and Beyond:
         https://openreview.net/forum?id=ryQu7f-RZ

@@ -128,14 +128,14 @@ class FusedAdam(torch.optim.Optimizer):
                if p.dtype not in [torch.float16, torch.float32]:
                    raise RuntimeError('FusedAdam only support fp16 and fp32.')

                g_l.append(p.grad.data)
                p_l.append(p.data)
                m_l.append(state['exp_avg'])
                v_l.append(state['exp_avg_sq'])

            multi_tensor_applier(self.multi_tensor_adam, self._dummy_overflow_buf,
                                 [g_l, p_l, m_l, v_l], group['lr'], beta1, beta2,
                                 group['eps'], group['step'], self.adamw_mode,
                                 bias_correction, group['weight_decay'])

        return loss

(The second hunk only re-wraps the ``multi_tensor_applier`` call; its arguments are unchanged, so it is shown once above.)
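For context, a minimal usage sketch of FusedAdam. Per the check above it only accepts fp16/fp32 parameters, and it needs the CUDA extension built via ``pip install .``; the model and hyperparameter values are illustrative assumptions:

import torch
from colossalai.nn.optimizer import FusedAdam

# FusedAdam batches every parameter of a group into a single fused CUDA kernel
# launch through multi_tensor_applier, so the model must live on a CUDA device.
model = torch.nn.Linear(16, 4).cuda()
optimizer = FusedAdam(model.parameters(), lr=1e-3, betas=(0.9, 0.999), weight_decay=1e-2)

loss = model(torch.randn(8, 16, device='cuda')).sum()
loss.backward()
optimizer.step()
optimizer.zero_grad()  # sets grads to None when set_grad_none=True (the documented default)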
colossalai/nn/optimizer/fused_lamb.py

@@ -10,7 +10,7 @@ class FusedLAMB(torch.optim.Optimizer):
    """Implements LAMB algorithm.

    Currently GPU-only. Requires ColossalAI to be installed via
-    ``pip install -v --no-cache-dir --global-option="--cuda_ext" ./``.
+    ``pip install .``.

    This version of fused LAMB implements 2 fusions.
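A usage sketch, assuming FusedLAMB follows the same apex-style constructor as the other fused optimizers in this commit (a parameter iterable plus an ``lr`` keyword) and that the CUDA extension is built; the values are illustrative:

import torch
from colossalai.nn.optimizer import FusedLAMB

# GPU-only: FusedLAMB relies on the fused CUDA kernels built by `pip install .`.
model = torch.nn.Linear(16, 4).cuda()
optimizer = FusedLAMB(model.parameters(), lr=1e-3)

loss = model(torch.randn(8, 16, device='cuda')).sum()
loss.backward()
optimizer.step()
optimizer.zero_grad()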
colossalai/nn/optimizer/fused_sgd.py

(Several hunks in this file only re-wrap long statements; since their content is identical on both sides of the diff, each such statement is shown once below.)

@@ -11,7 +11,7 @@ class FusedSGD(Optimizer):
    r"""Implements stochastic gradient descent (optionally with momentum).

    Currently GPU-only. Requires ColossalAI to be installed via
-    ``pip install -v --no-cache-dir --global-option="--cuda_ext" ./``.
+    ``pip install .``.

    This version of fused SGD implements 2 fusions.
@@ -57,8 +57,13 @@ class FusedSGD(Optimizer):
        The Nesterov version is analogously modified.
    """

    def __init__(self,
                 params,
                 lr=required,
                 momentum=0,
                 dampening=0,
                 weight_decay=0,
                 nesterov=False,
                 wd_after_momentum=False,
                 materialize_master_grads=True,
                 set_grad_none=False):
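For context, a minimal sketch of constructing and stepping FusedSGD with the keyword arguments from the signature above; the model, the values, and the assumption of a CUDA device with the built extension are illustrative:

import torch
from colossalai.nn.optimizer import FusedSGD

model = torch.nn.Linear(16, 4).cuda()

# Keyword arguments mirror torch.optim.SGD, plus the fused-specific flags above.
optimizer = FusedSGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=1e-4, nesterov=True)

loss = model(torch.randn(8, 16, device='cuda')).sum()
loss.backward()
optimizer.step()
optimizer.zero_grad()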
@@ -67,14 +72,11 @@ class FusedSGD(Optimizer):
        if momentum < 0.0:
            raise ValueError("Invalid momentum value: {}".format(momentum))
        if weight_decay < 0.0:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))

        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
                        weight_decay=weight_decay, nesterov=nesterov)
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")
        super(FusedSGD, self).__init__(params, defaults)

        self.wd_after_momentum = wd_after_momentum
@@ -86,8 +88,9 @@ class FusedSGD(Optimizer):
        if multi_tensor_applier.available:
            import colossal_C
            # Skip buffer
            self._dummy_overflow_buf = torch.tensor([0],
                                                    dtype=torch.int,
                                                    device=self.param_groups[0]["params"][0].device)
            self.multi_tensor_sgd = colossal_C.multi_tensor_sgd
        else:
            raise RuntimeError('FusedSGD requires cuda extensions')
@@ -133,8 +136,7 @@ class FusedSGD(Optimizer):
        if closure is not None:
            loss = closure()

        explicit_master_params = (hasattr(self, "_amp_stash")
                                  and hasattr(self._amp_stash, "fp32_from_fp16_groups"))

        for gid, group in enumerate(self.param_groups):
            weight_decay = group['weight_decay']
@@ -154,71 +156,52 @@ class FusedSGD(Optimizer):
            if explicit_master_params:
                stash = self._amp_stash

                fp32_params = [p for p in stash.fp32_from_fp32_groups[gid] if p.grad is not None]
                fp32_grads = [p.grad for p in stash.fp32_from_fp32_groups[gid] if p.grad is not None]
                fp32_momentums, first_runs[1] = self.get_momentums(fp32_params)

                if self.materialize_master_grads:
                    fp16_model_params = [
                        p for i, p in enumerate(stash.fp16_groups[gid])
                        if stash.fp32_from_fp16_groups[gid][i].grad is not None
                    ]
                    fp32_from_fp16_grads = [p.grad for p in stash.fp32_from_fp16_groups[gid] if p.grad is not None]
                    fp32_from_fp16_params = [p for p in stash.fp32_from_fp16_groups[gid] if p.grad is not None]
                    fp32_from_fp16_momentums, first_runs[0] = self.get_momentums(fp32_from_fp16_params)

                    fp16_set = [fp32_from_fp16_grads, fp32_from_fp16_params,
                                fp32_from_fp16_momentums, fp16_model_params]
                else:
                    fp16_model_params = [p for p in stash.fp16_groups[gid] if p.grad is not None]
                    fp16_model_grads = [p.grad for p in stash.fp16_groups[gid] if p.grad is not None]
                    fp32_from_fp16_params = [
                        p for i, p in enumerate(stash.fp32_from_fp16_groups[gid])
                        if stash.fp16_groups[gid][i].grad is not None
                    ]
                    fp32_from_fp16_momentums, first_runs[0] = self.get_momentums(fp32_from_fp16_params)

                    fp16_set = [fp16_model_grads, fp32_from_fp16_params,
                                fp32_from_fp16_momentums, fp16_model_params]

                launch_sets = [fp16_set, [fp32_grads, fp32_params, fp32_momentums]]
            else:
                fp16_params = [p for p in group['params'] if (p.dtype == torch.float16 and p.grad is not None)]
                fp16_grads = [p.grad for p in group['params'] if (p.dtype == torch.float16 and p.grad is not None)]
                fp16_momentums, first_runs[0] = self.get_momentums(fp16_params)

                fp32_params = [p for p in group['params'] if (p.dtype == torch.float32 and p.grad is not None)]
                fp32_grads = [p.grad for p in group['params'] if (p.dtype == torch.float32 and p.grad is not None)]
                fp32_momentums, first_runs[1] = self.get_momentums(fp32_params)

                launch_sets = [[fp16_grads, fp16_params, fp16_momentums],
                               [fp32_grads, fp32_params, fp32_momentums]]

            for s, (launch_set, first_run) in enumerate(zip(launch_sets, first_runs)):
                assert len(launch_set[0]) == len(launch_set[1])
                assert len(launch_set[0]) == len(launch_set[2])

                if len(launch_set[0]) > 0:
                    multi_tensor_applier(self.multi_tensor_sgd, self._dummy_overflow_buf,
                                         launch_set, weight_decay, momentum, dampening,
                                         group['lr'], nesterov, first_run,
                                         self.wd_after_momentum, 1.0 / self.most_recent_scale)

        self.most_recent_scale = 1.0
        self.scale_set_by_backward = False
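The step logic above buckets parameters by dtype (fp16 vs fp32) and launches one fused kernel per non-empty bucket. A minimal sketch of the situation it handles, with illustrative tensors and assuming a CUDA device with the built extension:

import torch
from colossalai.nn.optimizer import FusedSGD

# One fp16 parameter and one fp32 parameter in the same group; FusedSGD's step()
# gathers each dtype into its own launch set and updates both via fused kernels.
p16 = torch.nn.Parameter(torch.randn(4, device='cuda', dtype=torch.float16))
p32 = torch.nn.Parameter(torch.randn(4, device='cuda', dtype=torch.float32))

optimizer = FusedSGD([p16, p32], lr=0.1, momentum=0.9)

(p16.float().sum() + p32.sum()).backward()  # grads keep each parameter's dtype
optimizer.step()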
colossalai/nn/optimizer/hybrid_adam.py

import torch
from colossalai.utils import multi_tensor_applier
from colossalai.registry import OPTIMIZERS

@@ -14,13 +13,14 @@ class HybridAdam(torch.optim.Optimizer):
    * Parameters on CPU and gradients on CPU is allowed.
    * Parameters on GPU and gradients on GPU is allowed.
    * Parameters on GPU and gradients on CPU is **not** allowed.

    Requires ColossalAI to be installed via ``pip install .``

    This version of Hybrid Adam is an hybrid of CPUAdam and FusedAdam.

    * For parameters updating on CPU, it uses CPUAdam.
    * For parameters updating on GPU, it uses FusedAdam.
    * Hybird precision calculation of fp16 and fp32 is supported, eg fp32 parameters and fp16 gradients.

    :class:`colossalai.nn.optimizer.HybridAdam` may be used as a drop-in replacement for ``torch.optim.AdamW``,
    or ``torch.optim.Adam`` with ``adamw_mode=False``
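Per the docstring above, HybridAdam is a drop-in replacement for ``torch.optim.AdamW`` (or for ``torch.optim.Adam`` with ``adamw_mode=False``). A minimal sketch with illustrative hyperparameters, assuming the extensions are built via ``pip install .``:

import torch
from colossalai.nn.optimizer import HybridAdam

model = torch.nn.Linear(16, 4).cuda()

# Drop-in for torch.optim.AdamW; pass adamw_mode=False for plain Adam behaviour.
optimizer = HybridAdam(model.parameters(), lr=1e-3, weight_decay=1e-2)

loss = model(torch.randn(8, 16, device='cuda')).sum()
loss.backward()
optimizer.step()
optimizer.zero_grad()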
@@ -43,8 +43,8 @@ class HybridAdam(torch.optim.Optimizer):
            True for decoupled weight decay(also known as AdamW) (default: True)
        simd_log (boolean, optional): whether to show if you are using SIMD to
            accelerate. (default: False)
-    .. _Adam: A Method for Stochastic Optimization:
+    .. _Adam\: A Method for Stochastic Optimization:
        https://arxiv.org/abs/1412.6980
    .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ
@@ -75,7 +75,7 @@ class HybridAdam(torch.optim.Optimizer):
            import colossal_C
        except ImportError:
            raise ImportError('Please install colossalai from source code to use HybridAdam')

        self.cpu_adam_op = cpu_adam
        self.cpu_adam_op.create_adam(self.opt_id, lr, betas[0], betas[1], eps,
                                     weight_decay, adamw_mode, simd_log)
@@ -131,14 +131,14 @@ class HybridAdam(torch.optim.Optimizer):
                    g_l.append(p.grad.data)
                    p_l.append(p.data)
                    m_l.append(state['exp_avg'])
                    v_l.append(state['exp_avg_sq'])
                else:
                    raise RuntimeError

            if len(g_l) > 0:
                adamw_mode = 1 if self.adamw_mode else 0
                bias_correction = 1 if group['bias_correction'] else 0
                multi_tensor_applier(self.gpu_adam_op, self._dummy_overflow_buf,
                                     [g_l, p_l, m_l, v_l], group['lr'],
                                     group['betas'][0], group['betas'][1], group['eps'],
                                     group_step, adamw_mode, bias_correction,
                                     group['weight_decay'])

        return loss

(As in fused_adam.py, the ``multi_tensor_applier`` call in this hunk is only re-wrapped; its arguments are unchanged.)
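Per the docstring, HybridAdam routes each parameter by device: CPU-resident parameters (with CPU gradients) go through ``cpu_adam_op``, while GPU-resident parameters are gathered into the ``g_l``/``p_l``/``m_l``/``v_l`` lists above and updated by the fused GPU kernel. A minimal sketch of such a mixed placement, with illustrative tensors and the built extensions assumed:

import torch
from colossalai.nn.optimizer import HybridAdam

# One parameter (and its gradient) on the CPU, one on the GPU -- both placements
# are allowed; HybridAdam picks the CPU path or the fused GPU path per parameter.
p_cpu = torch.nn.Parameter(torch.randn(4))
p_gpu = torch.nn.Parameter(torch.randn(4, device='cuda'))

optimizer = HybridAdam([p_cpu, p_gpu], lr=1e-3)

(p_cpu.sum() + p_gpu.sum().cpu()).backward()  # each grad stays on its parameter's device
optimizer.step()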