OpenDAS / ColossalAI · Commits

Commit c7221cb2 (unverified)
[hotfix] adapt ProcessGroup and Optimizer to ColoTensor (#1388)

Authored Jul 29, 2022 by HELSON; committed by GitHub on Jul 29, 2022
Parent: ad678921

Showing 7 changed files with 20 additions and 16 deletions (+20 -16)
colossalai/nn/optimizer/cpu_adam.py      +2 -2
colossalai/nn/optimizer/fused_adam.py    +2 -2
colossalai/nn/optimizer/fused_lamb.py    +2 -2
colossalai/nn/optimizer/fused_sgd.py     +1 -1
colossalai/nn/optimizer/hybrid_adam.py   +2 -2
colossalai/nn/optimizer/lamb.py          +2 -2
colossalai/tensor/process_group.py       +9 -5
colossalai/nn/optimizer/cpu_adam.py

@@ -143,9 +143,9 @@ class CPUAdam(NVMeOptimizer):
                     state['step'] = 0
                     # gradient momentums
-                    state['exp_avg'] = torch.zeros_like(p.data, dtype=torch.float, device=target_device)
+                    state['exp_avg'] = torch.zeros_like(p, dtype=torch.float, device=target_device)
                     # gradient variances
-                    state['exp_avg_sq'] = torch.zeros_like(p.data, dtype=torch.float, device=target_device)
+                    state['exp_avg_sq'] = torch.zeros_like(p, dtype=torch.float, device=target_device)
                     self._post_state_init(p)
                 state['step'] += 1
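The same `.data` → parameter change recurs in every optimizer file below. As a minimal standalone sketch of why it matters (not from the repository; `MyTensor` is a toy subclass standing in for ColoTensor, and the behavior described holds on recent PyTorch versions): reading `.data` hands `torch.zeros_like` a plain `torch.Tensor`, so the new state buffer loses the subclass, while passing the parameter itself goes through `__torch_function__` dispatch and keeps the subclass type.

    import torch

    class MyTensor(torch.Tensor):
        """Toy tensor subclass standing in for ColoTensor (hypothetical)."""
        pass

    p = torch.ones(2, 2).as_subclass(MyTensor)

    # .data bypasses __torch_function__ dispatch and yields a plain torch.Tensor,
    # so the freshly allocated optimizer state drops the subclass (and with it any
    # metadata a real ColoTensor carries, such as its process-group spec).
    print(type(torch.zeros_like(p.data)))   # <class 'torch.Tensor'>

    # Passing the parameter itself is dispatched through __torch_function__,
    # so the state buffer keeps the subclass type.
    print(type(torch.zeros_like(p)))        # <class '__main__.MyTensor'>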
colossalai/nn/optimizer/fused_adam.py

@@ -122,9 +122,9 @@ class FusedAdam(torch.optim.Optimizer):
                 # State initialization
                 if len(state) == 0:
                     # Exponential moving average of gradient values
-                    state['exp_avg'] = torch.zeros_like(p.data)
+                    state['exp_avg'] = torch.zeros_like(p)
                     # Exponential moving average of squared gradient values
-                    state['exp_avg_sq'] = torch.zeros_like(p.data)
+                    state['exp_avg_sq'] = torch.zeros_like(p)
                 if p.dtype not in [torch.float16, torch.float32]:
                     raise RuntimeError('FusedAdam only support fp16 and fp32.')
colossalai/nn/optimizer/fused_lamb.py

@@ -162,9 +162,9 @@ class FusedLAMB(torch.optim.Optimizer):
                 # State initialization
                 if len(state) == 0:
                     # Exponential moving average of gradient values
-                    state['exp_avg'] = torch.zeros_like(p.data)
+                    state['exp_avg'] = torch.zeros_like(p)
                     # Exponential moving average of gradient values
-                    state['exp_avg_sq'] = torch.zeros_like(p.data)
+                    state['exp_avg_sq'] = torch.zeros_like(p)
                 if p.dtype == torch.float16:
                     g_16.append(p.grad.data)
colossalai/nn/optimizer/fused_sgd.py

@@ -104,7 +104,7 @@ class FusedSGD(Optimizer):
                     # momentum application can be skipped in the main kernel.
                     if 'momentum_buffer' not in param_state:
                         first_run = True
-                        buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)
+                        buf = param_state['momentum_buffer'] = torch.zeros_like(p)
                         momentums.append(buf)
                     else:
                         first_run = False
colossalai/nn/optimizer/hybrid_adam.py

@@ -116,9 +116,9 @@ class HybridAdam(NVMeOptimizer):
                     state['step'] = 0
                     # gradient momentums
-                    state['exp_avg'] = torch.zeros_like(p.data, dtype=torch.float, device=target_device)
+                    state['exp_avg'] = torch.zeros_like(p, dtype=torch.float, device=target_device)
                     # gradient variances
-                    state['exp_avg_sq'] = torch.zeros_like(p.data, dtype=torch.float, device=target_device)
+                    state['exp_avg_sq'] = torch.zeros_like(p, dtype=torch.float, device=target_device)
                     self._post_state_init(p)
                 state['step'] += 1
colossalai/nn/optimizer/lamb.py

@@ -67,9 +67,9 @@ class Lamb(Optimizer):
                 if len(state) == 0:
                     state['step'] = 0
                     # Exponential moving average of gradient values
-                    state['exp_avg'] = torch.zeros_like(p.data)
+                    state['exp_avg'] = torch.zeros_like(p)
                     # Exponential moving average of squared gradient values
-                    state['exp_avg_sq'] = torch.zeros_like(p.data)
+                    state['exp_avg_sq'] = torch.zeros_like(p)
                 exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                 beta1, beta2 = group['betas']
colossalai/tensor/process_group.py

@@ -22,7 +22,6 @@ class PyTorchProcessGroupDict(metaclass=SingletonMeta):
             self.logger = get_dist_logger('ProcessGroup')
             self.logger.info(f'NCCL initialize ProcessGroup on {rank_list}', ranks=[0])
             self.dict[pg_key] = torch.distributed.new_group(ranks=rank_list, backend=backend)
         return self.dict[pg_key]

@@ -104,10 +103,15 @@ class ProcessGroup:
     def set_cpu_groups(self):
         if self.has_cpu_groups:
             return
-        PYTORCHPGDICT_.get(self._tp_rank_list, 'gloo')
-        PYTORCHPGDICT_.get(self._dp_rank_list, 'gloo')
+        # self.logger.info(
+        #     f'{self._rank} Gloo initialize TP group on {self._tp_rank_list}, DP group on {self._dp_rank_list}')
+        for i in range(self._dp_degree):
+            i_tp_list = [self._rank_list[i * self._tp_degree + j] for j in range(self._tp_degree)]
+            PYTORCHPGDICT_.get(i_tp_list, 'gloo')
+        for j in range(self._tp_degree):
+            j_dp_list = [self._rank_list[i * self._tp_degree + j] for i in range(self._dp_degree)]
+            PYTORCHPGDICT_.get(j_dp_list, 'gloo')
         self._has_cpu_groups = True

     @property
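Not part of the diff: a small worked example (hypothetical ranks and degrees) of the rank lists the two new loops in set_cpu_groups build. With rank_list = [0, 1, 2, 3], dp_degree = 2, and tp_degree = 2, the first loop yields one group per tensor-parallel row and the second one group per data-parallel column:

    rank_list = [0, 1, 2, 3]      # hypothetical global ranks
    dp_degree, tp_degree = 2, 2   # 2-way data parallel x 2-way tensor parallel

    # Same indexing as the new set_cpu_groups: TP groups are contiguous chunks ...
    tp_groups = [[rank_list[i * tp_degree + j] for j in range(tp_degree)]
                 for i in range(dp_degree)]
    # ... and DP groups stride across them.
    dp_groups = [[rank_list[i * tp_degree + j] for i in range(dp_degree)]
                 for j in range(tp_degree)]

    print(tp_groups)   # [[0, 1], [2, 3]]
    print(dp_groups)   # [[0, 2], [1, 3]]

Each of these lists is what PYTORCHPGDICT_.get(..., 'gloo') is now called with, so a CPU (Gloo) group exists for every TP row and every DP column instead of only for the caller's own _tp_rank_list and _dp_rank_list.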