Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
ColossalAI
Commits
404ecbdc
Commit
404ecbdc
authored
Oct 28, 2021
by
zbian
Browse files
Migrated project
parent
2ebaefc5
Changes
409
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1319 additions
and
0 deletions
+1319
-0
colossalai/context/process_group_initializer/initializer_1d.py
...salai/context/process_group_initializer/initializer_1d.py
+44
-0
colossalai/context/process_group_initializer/initializer_2d.py
...salai/context/process_group_initializer/initializer_2d.py
+123
-0
colossalai/context/process_group_initializer/initializer_2p5d.py
...lai/context/process_group_initializer/initializer_2p5d.py
+255
-0
colossalai/context/process_group_initializer/initializer_3d.py
...salai/context/process_group_initializer/initializer_3d.py
+172
-0
colossalai/context/process_group_initializer/initializer_data.py
...lai/context/process_group_initializer/initializer_data.py
+41
-0
colossalai/context/process_group_initializer/initializer_pipeline.py
...context/process_group_initializer/initializer_pipeline.py
+63
-0
colossalai/context/process_group_initializer/initializer_sequence.py
...context/process_group_initializer/initializer_sequence.py
+27
-0
colossalai/context/process_group_initializer/initializer_tensor.py
...i/context/process_group_initializer/initializer_tensor.py
+41
-0
colossalai/context/process_group_initializer/process_group_initializer.py
...xt/process_group_initializer/process_group_initializer.py
+30
-0
colossalai/context/random/__init__.py
colossalai/context/random/__init__.py
+8
-0
colossalai/context/random/_helper.py
colossalai/context/random/_helper.py
+144
-0
colossalai/context/random/seed_manager.py
colossalai/context/random/seed_manager.py
+74
-0
colossalai/core.py
colossalai/core.py
+16
-0
colossalai/engine/__init__.py
colossalai/engine/__init__.py
+7
-0
colossalai/engine/_base_engine.py
colossalai/engine/_base_engine.py
+170
-0
colossalai/engine/amp_type.py
colossalai/engine/amp_type.py
+10
-0
colossalai/engine/gradient_handler/__init__.py
colossalai/engine/gradient_handler/__init__.py
+5
-0
colossalai/engine/gradient_handler/_base_gradient_handler.py
colossalai/engine/gradient_handler/_base_gradient_handler.py
+25
-0
colossalai/engine/gradient_handler/_data_parallel_gradient_handler.py
...ngine/gradient_handler/_data_parallel_gradient_handler.py
+48
-0
colossalai/engine/gradient_handler/_zero_gradient_handler.py
colossalai/engine/gradient_handler/_zero_gradient_handler.py
+16
-0
No files found.
colossalai/context/process_group_initializer/initializer_1d.py
0 → 100644
View file @
404ecbdc
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import
torch.distributed
as
dist
from
colossalai.context
import
Config
from
colossalai.core
import
global_context
as
gpc
from
colossalai.registry
import
DIST_GROUP_INITIALIZER
from
.process_group_initializer
import
ProcessGroupInitializer
from
..parallel_mode
import
ParallelMode
@DIST_GROUP_INITIALIZER.register_module
class Initializer_1D(ProcessGroupInitializer):
    """A ProcessGroupInitializer for 1D tensor parallelism."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # number of independent 1D tensor-parallel groups across the job
        self.num_group = self.world_size // self.tensor_parallel_size

    def init_dist_group(self):
        """Initialize 1D tensor parallel groups, and assign local_ranks and groups to each gpu.

        :return: (local_rank, group_world_size, process_group, ranks_in_group, mode)
        :rtype: tuple
        """
        local_rank = None
        ranks_in_group = None
        process_group = None
        group_world_size = None
        mode = ParallelMode.PARALLEL_1D

        for group_idx in range(self.num_group):
            start = group_idx * self.tensor_parallel_size
            ranks = list(range(start, start + self.tensor_parallel_size))
            # every rank must participate in every new_group call
            group = dist.new_group(ranks)

            if self.rank in ranks:
                local_rank = ranks.index(self.rank)
                group_world_size = len(ranks)
                process_group = group
                ranks_in_group = ranks

        return local_rank, group_world_size, process_group, ranks_in_group, mode
colossalai/context/process_group_initializer/initializer_2d.py
0 → 100644
View file @
404ecbdc
import
math
import
os
import
torch.distributed
as
dist
from
colossalai.constants
import
SUMMA_DIM
from
colossalai.registry
import
DIST_GROUP_INITIALIZER
from
.process_group_initializer
import
ProcessGroupInitializer
from
..parallel_mode
import
ParallelMode
def _check_summa_env_var(summa_dim):
    """Ensure the SUMMA_DIM environment variable agrees with ``summa_dim``.

    If the variable is already set, assert it matches the requested value;
    otherwise export it so that all later consumers observe a consistent dim.

    :param summa_dim: side length of the 2D (SUMMA) process grid
    :type summa_dim: int
    :raises AssertionError: if SUMMA_DIM is set and differs from ``summa_dim``
    """
    # check environment variable for SUMMA
    env_summa_dim = os.environ.get(SUMMA_DIM, None)

    if env_summa_dim:
        # fix: message previously said "this initialized"
        assert int(env_summa_dim) == summa_dim, \
            'SUMMA_DIM has been set in the current environment and ' \
            'does not match with the value passed to this initializer'
    else:
        os.environ[SUMMA_DIM] = str(summa_dim)
class Initializer_2D_Row(ProcessGroupInitializer):
    """2d tensor parallel initialization among rows."""

    def __init__(self, num_group, summa_dim, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.num_group = num_group
        self.summa_dim = summa_dim

    def init_dist_group(self):
        """Initialize 2D tensor row parallel groups, and assign local_ranks and groups to each gpu.

        :return: 2D tensor row parallelism's information
        :rtype: tuple(local_rank, group_world_size, process_group, ranks_in_group, mode)
        """
        local_rank = None
        ranks_in_group = None
        process_group = None
        group_world_size = None
        mode = ParallelMode.PARALLEL_2D_ROW

        for block_idx in range(self.num_group):
            for row in range(self.summa_dim):
                # consecutive ranks within one row of the SUMMA grid
                base = block_idx * self.tensor_parallel_size + row * self.summa_dim
                ranks = [base + col for col in range(self.summa_dim)]
                group = dist.new_group(ranks)

                if self.rank in ranks:
                    local_rank = ranks.index(self.rank)
                    group_world_size = len(ranks)
                    process_group = group
                    ranks_in_group = ranks

        return local_rank, group_world_size, process_group, ranks_in_group, mode
class Initializer_2D_Col(ProcessGroupInitializer):
    """2d tensor parallel initialization among cols."""

    def __init__(self, num_group, summa_dim, *args, **kwargs):
        super(Initializer_2D_Col, self).__init__(*args, **kwargs)
        self.num_group = num_group
        self.summa_dim = summa_dim

    def init_dist_group(self):
        """Initialize 2D tensor col parallel groups, and assign local_ranks and groups to each gpu.

        (fix: docstring previously said "row parallel groups")

        :return: 2D tensor col parallelism's information
        :rtype: tuple(local_rank, group_world_size, process_group, ranks_in_group, mode)
        """
        local_rank = None
        ranks_in_group = None
        process_group = None
        group_world_size = None
        mode = ParallelMode.PARALLEL_2D_COL

        for i in range(self.num_group):
            for j in range(self.summa_dim):
                # ranks spaced summa_dim apart form one column of the SUMMA grid
                ranks = [i * self.tensor_parallel_size + j + k * self.summa_dim
                         for k in range(self.summa_dim)]
                group = dist.new_group(ranks)

                if self.rank in ranks:
                    local_rank = ranks.index(self.rank)
                    group_world_size = len(ranks)
                    process_group = group
                    ranks_in_group = ranks

        return local_rank, group_world_size, process_group, ranks_in_group, mode
@DIST_GROUP_INITIALIZER.register_module
class Initializer_2D(ProcessGroupInitializer):
    """Serve as the single entry point to 2D parallel initialization."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.num_group = self.world_size // self.tensor_parallel_size
        self.summa_dim = int(math.sqrt(self.tensor_parallel_size))

        assert self.tensor_parallel_size == self.summa_dim ** 2, \
            "2D summa dim should equal to tensor parallel size ^ 0.5"
        _check_summa_env_var(self.summa_dim)

        # delegate row/col group creation to the two sub-initializers
        self.col_initializer = Initializer_2D_Col(
            self.num_group, self.summa_dim, *args, **kwargs)
        self.row_initializer = Initializer_2D_Row(
            self.num_group, self.summa_dim, *args, **kwargs)

    def init_dist_group(self):
        """Initialize 2D tensor row and col parallel groups, and assign local_ranks and groups to each gpu.

        :return: 2D tensor parallelism's information
        :rtype: list of tuples (local_rank, group_world_size, process_group, ranks_in_group, mode)
        """
        # row groups are created before col groups — order matters for
        # collective new_group calls across ranks
        return [
            self.row_initializer.init_dist_group(),
            self.col_initializer.init_dist_group(),
        ]
colossalai/context/process_group_initializer/initializer_2p5d.py
0 → 100644
View file @
404ecbdc
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import
math
import
os
import
torch.distributed
as
dist
from
colossalai.constants
import
TESSERACT_DIM
,
TESSERACT_DEP
from
colossalai.context
import
Config
from
colossalai.core
import
global_context
as
gpc
from
colossalai.registry
import
DIST_GROUP_INITIALIZER
from
.process_group_initializer
import
ProcessGroupInitializer
from
..parallel_mode
import
ParallelMode
def _check_tesseract_env_var(tesseract_dim: int, tesseract_dep: int):
    """Ensure the TESSERACT_DIM / TESSERACT_DEP env vars agree with the arguments.

    If both variables are already set, assert they match; otherwise export
    both so later consumers observe consistent values.

    :param tesseract_dim: side length of the 2.5D process grid
    :param tesseract_dep: depth of the 2.5D process grid
    :raises AssertionError: if an existing env var differs from the argument
    """
    # check environment variable for TESSERACT
    env_tesseract_dim = os.environ.get(TESSERACT_DIM, None)
    env_tesseract_dep = os.environ.get(TESSERACT_DEP, None)

    if env_tesseract_dim and env_tesseract_dep:
        # fix: messages previously said "this initialized"
        assert int(env_tesseract_dim) == tesseract_dim, \
            'TESSERACT_DIM has been set in the current environment and ' \
            'does not match with the value passed to this initializer'
        assert int(env_tesseract_dep) == tesseract_dep, \
            'TESSERACT_DEP has been set in the current environment and ' \
            'does not match with the value passed to this initializer'
    else:
        os.environ[TESSERACT_DIM] = str(tesseract_dim)
        os.environ[TESSERACT_DEP] = str(tesseract_dep)
# i row j col k dep
class Initializer_2p5D_ROW(ProcessGroupInitializer):
    """2p5d tensor parallel initialization among rows."""

    def __init__(self, tesseract_dim: int, tesseract_dep: int, *args):
        super().__init__(*args)
        self.tensor_parallel_size = gpc.tensor_parallel_size
        self.num_group = self.world_size // self.tensor_parallel_size
        self.tesseract_dep = tesseract_dep
        self.tesseract_dim = tesseract_dim
        assert self.tensor_parallel_size == self.tesseract_dim ** 2 * self.tesseract_dep, \
            "Tensor parallel size should be depth * dim ** 2 in 2.5D parallel"

    def init_dist_group(self):
        """Initialize 2p5D tensor row parallel groups, and assign local_ranks and groups to each gpu.

        :return: 2p5D tensor row parallelism's information
        :rtype: tuple(local_rank, group_world_size, process_group, ranks_in_group, mode)
        """
        local_rank = None
        ranks_in_group = None
        process_group = None
        group_world_size = None
        mode = ParallelMode.PARALLEL_2P5D_ROW

        dim, dep = self.tesseract_dim, self.tesseract_dep
        for h in range(self.num_group):
            offset = h * self.tensor_parallel_size
            for j in range(dim):
                for k in range(dep):
                    # ranks sharing (col=j, dep=k) — i sweeps the row axis
                    ranks = [offset + i + dim * (j + dim * k) for i in range(dim)]
                    group = dist.new_group(ranks)

                    if self.rank in ranks:
                        local_rank = ranks.index(self.rank)
                        group_world_size = len(ranks)
                        process_group = group
                        ranks_in_group = ranks

        return local_rank, group_world_size, process_group, ranks_in_group, mode
class Initializer_2p5D_Col(ProcessGroupInitializer):
    """2p5d tensor parallel initialization among cols."""

    def __init__(self, tesseract_dim: int, tesseract_dep: int, *args):
        super().__init__(*args)
        self.tensor_parallel_size = gpc.tensor_parallel_size
        self.num_group = self.world_size // self.tensor_parallel_size
        self.tesseract_dep = tesseract_dep
        self.tesseract_dim = tesseract_dim
        assert self.tensor_parallel_size == self.tesseract_dim ** 2 * self.tesseract_dep, \
            "Tensor parallel size should be depth * dim ** 2 in 2.5D parallel"

    def init_dist_group(self):
        """Initialize 2p5D tensor col parallel groups, and assign local_ranks and groups to each gpu.

        :return: 2p5D tensor col parallelism's information
        :rtype: tuple(local_rank, group_world_size, process_group, ranks_in_group, mode)
        """
        local_rank = None
        ranks_in_group = None
        process_group = None
        group_world_size = None
        mode = ParallelMode.PARALLEL_2P5D_COL

        dim, dep = self.tesseract_dim, self.tesseract_dep
        for h in range(self.num_group):
            offset = h * self.tensor_parallel_size
            for i in range(dim):
                for k in range(dep):
                    # ranks sharing (row=i, dep=k) — j sweeps the column axis
                    ranks = [offset + i + dim * (j + dim * k) for j in range(dim)]
                    group = dist.new_group(ranks)

                    if self.rank in ranks:
                        local_rank = ranks.index(self.rank)
                        group_world_size = len(ranks)
                        process_group = group
                        ranks_in_group = ranks

        return local_rank, group_world_size, process_group, ranks_in_group, mode
class Initializer_2p5D_Dep(ProcessGroupInitializer):
    """2p5D tensor parallel initialization among depths."""

    def __init__(self, tesseract_dim: int, tesseract_dep: int, *args):
        super().__init__(*args)
        self.tensor_parallel_size = gpc.tensor_parallel_size
        self.num_group = self.world_size // self.tensor_parallel_size
        self.tesseract_dep = tesseract_dep
        self.tesseract_dim = tesseract_dim
        assert self.tensor_parallel_size == self.tesseract_dim ** 2 * self.tesseract_dep, \
            "Tensor parallel size should be depth * dim ** 2 in 2.5D parallel"

    def init_dist_group(self):
        """Initialize 2p5D tensor depth parallel groups, and assign local_ranks and groups to each gpu.

        :return: 2p5D tensor depth parallelism's information
        :rtype: tuple(local_rank, group_world_size, process_group, ranks_in_group, mode)
        """
        local_rank = None
        ranks_in_group = None
        process_group = None
        group_world_size = None
        mode = ParallelMode.PARALLEL_2P5D_DEP

        dim, dep = self.tesseract_dim, self.tesseract_dep
        for h in range(self.num_group):
            offset = h * self.tensor_parallel_size
            for i in range(dim):
                for j in range(dim):
                    # ranks sharing (row=i, col=j) — k sweeps the depth axis
                    ranks = [offset + i + dim * (j + dim * k) for k in range(dep)]
                    group = dist.new_group(ranks)

                    if self.rank in ranks:
                        local_rank = ranks.index(self.rank)
                        group_world_size = len(ranks)
                        process_group = group
                        ranks_in_group = ranks

        return local_rank, group_world_size, process_group, ranks_in_group, mode
# i row j col k dep
class Initializer_2p5D_XZ(ProcessGroupInitializer):
    """2p5d tensor parallel initialization among cols times dep."""

    def __init__(self, tesseract_dim: int, tesseract_dep: int, *args):
        super().__init__(*args)
        self.tensor_parallel_size = gpc.tensor_parallel_size
        self.num_group = self.world_size // self.tensor_parallel_size
        self.tesseract_dep = tesseract_dep
        self.tesseract_dim = tesseract_dim
        assert self.tensor_parallel_size == self.tesseract_dim ** 2 * self.tesseract_dep, \
            "Tensor parallel size should be depth * dim ** 2 in 2.5D parallel"

    def init_dist_group(self):
        """Initialize 2p5D tensor colXdepth parallel groups, and assign local_ranks and groups to each gpu.

        :return: 2p5D tensor colXdepth parallelism's information
        :rtype: tuple(local_rank, group_world_size, process_group, ranks_in_group, mode)
        """
        local_rank = None
        ranks_in_group = None
        process_group = None
        group_world_size = None
        mode = ParallelMode.PARALLEL_2P5D_XZ

        dim, dep = self.tesseract_dim, self.tesseract_dep
        for h in range(self.num_group):
            offset = h * self.tensor_parallel_size
            for i in range(dim):
                # fixed row i — j and k together sweep the col x depth plane
                # (k outer, j inner, matching the original rank ordering)
                ranks = [offset + i + dim * (j + dim * k)
                         for k in range(dep)
                         for j in range(dim)]
                group = dist.new_group(ranks)

                if self.rank in ranks:
                    local_rank = ranks.index(self.rank)
                    group_world_size = len(ranks)
                    process_group = group
                    ranks_in_group = ranks

        return local_rank, group_world_size, process_group, ranks_in_group, mode
@DIST_GROUP_INITIALIZER.register_module
class Initializer_2p5D(ProcessGroupInitializer):
    """Serve as the single entry point to Tesseract parallel initialization."""

    # NOTE(review): parameter name `pipeline_parlalel_size` is misspelled
    # upstream; it is kept as-is for interface compatibility.
    def __init__(self,
                 rank: int,
                 world_size: int,
                 config: Config,
                 data_parallel_size: int,
                 pipeline_parlalel_size: int,
                 tensor_parallel_size: int,
                 depth: int):
        args = (rank, world_size, config, data_parallel_size,
                pipeline_parlalel_size, tensor_parallel_size)
        super().__init__(*args)

        self.num_group = self.world_size // self.tensor_parallel_size
        self.tesseract_dim = int(math.sqrt(self.tensor_parallel_size / depth))
        self.tesseract_dep = depth

        assert self.tensor_parallel_size == self.tesseract_dim ** 2 * self.tesseract_dep, \
            "2.5D tesseract dim should equal to (tensor parallel size / tesseract dep) ^ 0.5"
        _check_tesseract_env_var(self.tesseract_dim, self.tesseract_dep)

        # one sub-initializer per 2.5D communication pattern
        self.col_initializer = Initializer_2p5D_Col(self.tesseract_dim, self.tesseract_dep, *args)
        self.row_initializer = Initializer_2p5D_ROW(self.tesseract_dim, self.tesseract_dep, *args)
        self.dep_initializer = Initializer_2p5D_Dep(self.tesseract_dim, self.tesseract_dep, *args)
        self.xz_initializer = Initializer_2p5D_XZ(self.tesseract_dim, self.tesseract_dep, *args)

    def init_dist_group(self):
        """Initialize 2p5D tensor row, col, depth, and colXdepth parallel groups, and assign local_ranks and groups to each gpu.

        :return: Whole 2p5D tensor parallelism's information
        :rtype: list of tuples (local_rank, group_world_size, process_group, ranks_in_group, mode)
        """
        # creation order (col, row, dep, xz) must be identical on every rank
        return [
            self.col_initializer.init_dist_group(),
            self.row_initializer.init_dist_group(),
            self.dep_initializer.init_dist_group(),
            self.xz_initializer.init_dist_group(),
        ]
colossalai/context/process_group_initializer/initializer_3d.py
0 → 100644
View file @
404ecbdc
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import
math
import
os
import
torch.distributed
as
dist
from
colossalai.constants
import
DEPTH_3D
from
colossalai.registry
import
DIST_GROUP_INITIALIZER
from
..parallel_mode
import
ParallelMode
from
.process_group_initializer
import
ProcessGroupInitializer
def _check_depth_env_var(depth):
    """Ensure the DEPTH_3D environment variable agrees with ``depth``.

    If the variable is already set, assert it matches; otherwise export it
    so that all later consumers observe a consistent depth.

    :param depth: cube-root side length of the 3D process grid
    :type depth: int
    :raises AssertionError: if DEPTH_3D is set and differs from ``depth``
    """
    # check environment variable for 3D depth
    env_depth = os.environ.get(DEPTH_3D, None)

    if env_depth:
        # fix: message previously referred to SUMMA_DIM and said "this initialized"
        assert int(env_depth) == depth, \
            'DEPTH_3D has been set in the current environment and ' \
            'does not match with the value passed to this initializer'
    else:
        os.environ[DEPTH_3D] = str(depth)
class Initializer_3D_Input(ProcessGroupInitializer):
    """3D tensor parallel initialization among input.

    (fix: docstring previously said "2D")
    """

    def __init__(self, num_group: int, depth: int, *args):
        super().__init__(*args)
        self.num_group = num_group
        self.depth = depth

    def init_dist_group(self):
        """Initialize 3D tensor parallel groups among input, and assign local_ranks and groups to each gpu.

        :return: 3D tensor parallelism's information among input
        :rtype: tuple(local_rank, group_world_size, process_group, ranks_in_group, mode)
        """
        local_rank = None
        ranks_in_group = None
        process_group = None
        group_world_size = None
        mode = ParallelMode.PARALLEL_3D_INPUT

        for h in range(self.num_group):
            for i in range(self.depth):
                for k in range(self.depth):
                    # fixed (i, k) — j sweeps the input axis of the cube
                    ranks = [
                        h * self.depth ** 3 + i + self.depth *
                        (j + self.depth * k) for j in range(self.depth)
                    ]
                    group = dist.new_group(ranks)

                    if self.rank in ranks:
                        local_rank = ranks.index(self.rank)
                        group_world_size = len(ranks)
                        process_group = group
                        ranks_in_group = ranks

        return local_rank, group_world_size, process_group, ranks_in_group, mode
class Initializer_3D_Weight(ProcessGroupInitializer):
    """3D tensor parallel initialization among weight."""

    def __init__(self, num_group: int, depth: int, *args):
        super().__init__(*args)
        self.num_group = num_group
        self.depth = depth

    def init_dist_group(self):
        """Initialize 3D tensor parallel groups among weight, and assign local_ranks and groups to each gpu.

        :return: 3D tensor parallelism's information among weight
        :rtype: tuple(local_rank, group_world_size, process_group, ranks_in_group, mode)
        """
        local_rank = None
        ranks_in_group = None
        process_group = None
        group_world_size = None
        mode = ParallelMode.PARALLEL_3D_WEIGHT

        d = self.depth
        for h in range(self.num_group):
            cube_offset = h * d ** 3
            for k in range(d):
                for j in range(d):
                    # fixed (j, k) — i sweeps the weight axis of the cube
                    ranks = [cube_offset + i + d * (j + d * k) for i in range(d)]
                    group = dist.new_group(ranks)

                    if self.rank in ranks:
                        local_rank = ranks.index(self.rank)
                        group_world_size = len(ranks)
                        process_group = group
                        ranks_in_group = ranks

        return local_rank, group_world_size, process_group, ranks_in_group, mode
class Initializer_3D_Output(ProcessGroupInitializer):
    """3D tensor parallel initialization among output.

    (fix: docstring previously said "2D ... among weight")
    """

    def __init__(self, num_group: int, depth: int, *args):
        super().__init__(*args)
        self.num_group = num_group
        self.depth = depth

    def init_dist_group(self):
        """Initialize 3D tensor parallel groups among output, and assign local_ranks and groups to each gpu.

        :return: 3D tensor parallelism's information among output
        :rtype: tuple(local_rank, group_world_size, process_group, ranks_in_group, mode)
        """
        local_rank = None
        ranks_in_group = None
        process_group = None
        group_world_size = None
        mode = ParallelMode.PARALLEL_3D_OUTPUT

        for h in range(self.num_group):
            for i in range(self.depth):
                for j in range(self.depth):
                    # fixed (i, j) — k sweeps the output axis of the cube
                    ranks = [
                        h * self.depth ** 3 + i + self.depth *
                        (j + self.depth * k) for k in range(self.depth)
                    ]
                    group = dist.new_group(ranks)

                    if self.rank in ranks:
                        local_rank = ranks.index(self.rank)
                        group_world_size = len(ranks)
                        process_group = group
                        ranks_in_group = ranks

        return local_rank, group_world_size, process_group, ranks_in_group, mode
@DIST_GROUP_INITIALIZER.register_module
class Initializer_3D(ProcessGroupInitializer):
    """Serve as the single entry point to 3D parallel initialization."""

    def __init__(self, *args):
        super().__init__(*args)
        self.num_group = self.world_size // self.tensor_parallel_size
        # round() guards against floating-point error in the cube root
        self.depth = round(math.pow(self.tensor_parallel_size, 1 / 3))

        # fix: message previously read "if not cube root"
        assert self.tensor_parallel_size == self.depth ** 3, \
            f'3D depth ({self.depth}) is not cube root of tensor parallel size ({self.tensor_parallel_size})'
        _check_depth_env_var(self.depth)

        # one sub-initializer per 3D communication pattern
        self.input_initializer = Initializer_3D_Input(self.num_group, self.depth, *args)
        self.weight_initializer = Initializer_3D_Weight(self.num_group, self.depth, *args)
        self.output_initializer = Initializer_3D_Output(self.num_group, self.depth, *args)

    def init_dist_group(self):
        """Initialize 3D tensor parallel groups, and assign local_ranks and groups to each gpu.

        :return: 3D tensor parallelism's information
        :rtype: list of tuples (local_rank, group_world_size, process_group, ranks_in_group, mode)
        """
        # creation order (input, weight, output) must be identical on every rank
        return [
            self.input_initializer.init_dist_group(),
            self.weight_initializer.init_dist_group(),
            self.output_initializer.init_dist_group(),
        ]
colossalai/context/process_group_initializer/initializer_data.py
0 → 100644
View file @
404ecbdc
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
from
torch
import
distributed
as
dist
from
colossalai.registry
import
DIST_GROUP_INITIALIZER
from
.process_group_initializer
import
ProcessGroupInitializer
from
..parallel_mode
import
ParallelMode
@DIST_GROUP_INITIALIZER.register_module
class Initializer_Data(ProcessGroupInitializer):
    """A ProcessGroupInitializer for data parallelism."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # number of distinct data-parallel groups across the job
        self.num_data_parallel_group = self.world_size // self.data_parallel_size

    def init_dist_group(self):
        """Initialize data parallel groups, and assign local_ranks and groups to each gpu.

        :return: data parallelism's information
        :rtype: tuple (local_rank, group_world_size, process_group, ranks_in_group, mode)
        """
        local_rank = None
        ranks_in_group = None
        process_group = None
        group_world_size = None
        mode = ParallelMode.DATA

        stride = self.num_data_parallel_group
        for base in range(stride):
            # data-parallel peers are spaced `stride` ranks apart
            ranks = [base + replica * stride for replica in range(self.data_parallel_size)]
            group = dist.new_group(ranks)

            if self.rank in ranks:
                local_rank = ranks.index(self.rank)
                group_world_size = len(ranks)
                process_group = group
                ranks_in_group = ranks

        return local_rank, group_world_size, process_group, ranks_in_group, mode
colossalai/context/process_group_initializer/initializer_pipeline.py
0 → 100644
View file @
404ecbdc
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
from
torch
import
distributed
as
dist
from
colossalai.registry
import
DIST_GROUP_INITIALIZER
from
.process_group_initializer
import
ProcessGroupInitializer
from
..parallel_mode
import
ParallelMode
@DIST_GROUP_INITIALIZER.register_module
class Initializer_Pipeline(ProcessGroupInitializer):
    """A ProcessGroupInitializer for pipeline parallelism.

    Builds, for every pipeline, the full stage group (ParallelMode.PIPELINE)
    plus one two-rank group per adjacent stage pair, registered as
    PIPELINE_NEXT on the sender side and PIPELINE_PREV on the receiver side.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # ranks per data-parallel replica
        self.data_group_size = self.world_size // self.data_parallel_size
        # number of distinct pipelines inside one data-parallel replica
        self.pipeline_stage_size = self.data_group_size // self.pipeline_parallel_size

    def init_dist_group(self):
        """Initialize pipeline parallel groups, and assign local_ranks and groups to each gpu.

        :return: list of tuples (local_rank, group_world_size, process_group,
            ranks_in_group, mode); a rank may appear in PIPELINE plus
            PIPELINE_NEXT/PIPELINE_PREV entries
        :rtype: list
        """
        dist_settings = list()
        for i in range(self.data_parallel_size):
            for j in range(self.pipeline_stage_size):
                # stages of one pipeline: stride pipeline_stage_size within replica i
                pipe_ranks = list(
                    range(i * self.data_group_size + j,
                          (i + 1) * self.data_group_size,
                          self.pipeline_stage_size))
                pipe_group_size = len(pipe_ranks)
                # every rank must join every new_group call (collective)
                pipe_group = dist.new_group(pipe_ranks)
                if self.rank in pipe_ranks:
                    local_rank = pipe_ranks.index(self.rank)
                    group_world_size = pipe_group_size
                    process_group = pipe_group
                    ranks_in_group = pipe_ranks
                    dist_settings.append(
                        tuple((local_rank, group_world_size,
                               process_group, ranks_in_group,
                               ParallelMode.PIPELINE)))
                # pairwise groups linking stage k to stage k+1 (ring: the
                # last stage wraps to the first via the modulo)
                for k in range(pipe_group_size):
                    first = pipe_ranks[k]
                    second = pipe_ranks[(k + 1) % pipe_group_size]
                    ranks = [first, second]
                    group = dist.new_group(ranks)
                    if self.rank == first:
                        # this rank sends forward to `second`
                        local_rank = 0
                        group_world_size = 2
                        process_group = group
                        ranks_in_group = ranks
                        dist_settings.append(
                            tuple((local_rank, group_world_size,
                                   process_group, ranks_in_group,
                                   ParallelMode.PIPELINE_NEXT)))
                    elif self.rank == second:
                        # this rank receives from `first`
                        local_rank = 1
                        group_world_size = 2
                        process_group = group
                        ranks_in_group = ranks
                        dist_settings.append(
                            tuple((local_rank, group_world_size,
                                   process_group, ranks_in_group,
                                   ParallelMode.PIPELINE_PREV)))
        return dist_settings
colossalai/context/process_group_initializer/initializer_sequence.py
0 → 100644
View file @
404ecbdc
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
from
colossalai.registry
import
DIST_GROUP_INITIALIZER
from
.initializer_tensor
import
Initializer_Tensor
from
.process_group_initializer
import
ProcessGroupInitializer
from
..parallel_mode
import
ParallelMode
@DIST_GROUP_INITIALIZER.register_module
class Initializer_Sequence(ProcessGroupInitializer):
    """A ProcessGroupInitializer for sequence parallelism."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # sequence parallelism reuses the tensor-parallel group layout
        self._initializer = Initializer_Tensor(*args, **kwargs)

    def init_dist_group(self):
        """Initialize sequence parallel groups by delegating to the tensor
        initializer and relabeling the mode as SEQUENCE.

        :return: (local_rank, group_world_size, process_group, ranks_in_group, mode)
        :rtype: tuple
        """
        local_rank, group_world_size, process_group, ranks_in_group, _ = \
            self._initializer.init_dist_group()
        return (local_rank, group_world_size, process_group,
                ranks_in_group, ParallelMode.SEQUENCE)
colossalai/context/process_group_initializer/initializer_tensor.py
0 → 100644
View file @
404ecbdc
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import
torch.distributed
as
dist
from
colossalai.registry
import
DIST_GROUP_INITIALIZER
from
.process_group_initializer
import
ProcessGroupInitializer
from
..parallel_mode
import
ParallelMode
@DIST_GROUP_INITIALIZER.register_module
class Initializer_Tensor(ProcessGroupInitializer):
    """A ProcessGroupInitializer for tensor parallelism."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # number of distinct tensor-parallel groups across the job
        self.num_tensor_parallel_group = self.world_size // self.tensor_parallel_size

    def init_dist_group(self):
        """Initialize tensor parallel groups, and assign local_ranks and groups to each gpu.

        :return: tensor parallelism's information
        :rtype: tuple(local_rank, group_world_size, process_group, ranks_in_group, mode)
        """
        local_rank = None
        ranks_in_group = None
        process_group = None
        group_world_size = None
        mode = ParallelMode.TENSOR

        for group_idx in range(self.num_tensor_parallel_group):
            start = group_idx * self.tensor_parallel_size
            ranks = list(range(start, start + self.tensor_parallel_size))
            # every rank must participate in every new_group call
            group = dist.new_group(ranks)

            if self.rank in ranks:
                local_rank = ranks.index(self.rank)
                group_world_size = len(ranks)
                process_group = group
                ranks_in_group = ranks

        return local_rank, group_world_size, process_group, ranks_in_group, mode
colossalai/context/process_group_initializer/process_group_initializer.py
0 → 100644
View file @
404ecbdc
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
from
abc
import
ABC
,
abstractmethod
from
colossalai.context
import
Config
class ProcessGroupInitializer(ABC):
    """An object, knowing the parallelism configuration, that initializes parallel groups.

    Subclasses implement :meth:`init_dist_group` to build the process groups
    for one parallel mode.
    """

    # NOTE(review): `pipeline_parlalel_size` is misspelled upstream; the name
    # is kept for interface compatibility (it is stored under the correctly
    # spelled attribute `pipeline_parallel_size`).
    def __init__(self,
                 rank: int,
                 world_size: int,
                 config: Config,
                 data_parallel_size: int,
                 pipeline_parlalel_size: int,
                 tensor_parallel_size: int):
        super().__init__()
        self.rank = rank
        self.world_size = world_size
        self.data_parallel_size = data_parallel_size
        self.config = config
        self.pipeline_parallel_size = pipeline_parlalel_size
        self.tensor_parallel_size = tensor_parallel_size

    @abstractmethod
    def init_dist_group(self):
        """Create the process group(s) for this parallel mode."""
        pass
colossalai/context/random/__init__.py
0 → 100644
View file @
404ecbdc
from
._helper
import
(
seed
,
set_mode
,
with_seed
,
add_seed
,
get_seeds
,
get_states
,
get_current_mode
,
set_seed_states
,
sync_states
)
__all__
=
[
'seed'
,
'set_mode'
,
'with_seed'
,
'add_seed'
,
'get_seeds'
,
'get_states'
,
'get_current_mode'
,
'set_seed_states'
,
'sync_states'
]
colossalai/context/random/_helper.py
0 → 100644
View file @
404ecbdc
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import
functools
from
contextlib
import
contextmanager
import
torch.cuda
from
torch
import
Tensor
from
.seed_manager
import
SeedManager
from
..parallel_mode
import
ParallelMode
_SEED_MANAGER
=
SeedManager
()
def get_seeds():
    """Returns the seeds of the seed manager.

    :return: The seeds of the seed manager
    :rtype: dict
    """
    return _SEED_MANAGER.seeds
def get_states(copy=False):
    """Returns the seed states of the seed manager.

    :param copy: if True, return cloned state tensors instead of the live ones
    :return: The seed states of the seed manager
    :rtype: dict
    """
    states = _SEED_MANAGER.seed_states
    if not copy:
        return states
    # clone each state tensor so callers cannot mutate the manager's copies
    return {parallel_mode: state.clone() for parallel_mode, state in states.items()}
def get_current_mode():
    """Returns the current parallel mode of the seed manager.

    :return: the currently active mode
    """
    return _SEED_MANAGER.current_mode
def add_seed(parallel_mode: ParallelMode, seed: int):
    """Adds a seed to the seed manager for `parallel_mode`.

    :param parallel_mode: The chosen parallel mode
    :type parallel_mode: :class:`colossalai.context.ParallelMode`
    :param seed: The seed to be added
    :type seed: int
    :raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance of
        :class:`colossalai.context.ParallelMode` or the seed for `parallel_mode` has been added
    """
    _SEED_MANAGER.add_seed(parallel_mode, seed)
def set_mode(parallel_mode: ParallelMode):
    """Sets the current mode of the seed manager.

    :param parallel_mode: The chosen parallel mode
    :type parallel_mode: :class:`colossalai.context.ParallelMode`
    """
    _SEED_MANAGER.set_mode(parallel_mode)
def set_seed_states(parallel_mode: ParallelMode, state: Tensor):
    """Sets the state of the seed manager for `parallel_mode`.

    :param parallel_mode: The chosen parallel mode
    :type parallel_mode: :class:`colossalai.context.ParallelMode`
    :param state: the state to be set
    :type state: :class:`torch.Tensor`
    :raises AssertionError: Raises an AssertionError if `parallel_mode` is not found in the seed manager
    """
    _SEED_MANAGER.set_state(parallel_mode, state)
def sync_states():
    """Snapshot the current CUDA RNG state into the seed manager's entry for
    the currently active mode."""
    mode = get_current_mode()
    state = torch.cuda.get_rng_state()
    set_seed_states(mode, state)
@contextmanager
def seed(parallel_mode: ParallelMode):
    """A context manager that temporarily switches the RNG mode.

    Examples::

        with seed(ParallelMode.DATA):
            output = F.dropout(input)
    """
    try:
        # remember the mode active on entry, then switch to the requested one
        previous_mode = _SEED_MANAGER.current_mode
        _SEED_MANAGER.set_mode(parallel_mode)
        yield
    finally:
        # always restore the mode that was active on entry
        _SEED_MANAGER.set_mode(previous_mode)
def with_seed(func, parallel_mode: ParallelMode):
    """
    A function wrapper which executes the function with a specified seed.

    :param func: The function to run under the RNG state of `parallel_mode`
    :param parallel_mode: The parallel mode whose seed state is active while
        `func` runs
    :type parallel_mode: :class:`colossalai.context.ParallelMode`
    :return: The wrapped function

    Examples::

        def forward(input):
            return F.dropout(input)

        wrapped_forward = with_seed(forward, ParallelMode.DATA)
        out = wrapped_forward(input)
    """
    # NOTE: the previous docstring showed `@with_seed(ParallelMode.DATA)` as a
    # decorator, which would raise TypeError since both arguments are required.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # switch to the RNG state of the requested parallel mode
        current_mode = _SEED_MANAGER.current_mode
        _SEED_MANAGER.set_mode(parallel_mode)

        # exec func under that seed
        out = func(*args, **kwargs)

        # recover the previous RNG state
        _SEED_MANAGER.set_mode(current_mode)
        return out

    return wrapper
colossalai/context/random/seed_manager.py
0 → 100644
View file @
404ecbdc
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import
torch
from
torch
import
Tensor
from
colossalai.context.parallel_mode
import
ParallelMode
class SeedManager:
    """This class is a manager of all random seeds involved in the system.

    It keeps one saved CUDA RNG state per :class:`ParallelMode`, so different
    parallel modes can draw from independent, reproducible random streams.
    """

    def __init__(self):
        # The ParallelMode whose RNG state is currently loaded into CUDA
        # (None before the first set_mode call).
        self._current_mode = None
        # parallel_mode -> integer seed registered via add_seed
        self._seeds = dict()
        # parallel_mode -> saved CUDA RNG state tensor
        self._seed_states = dict()

    @property
    def current_mode(self):
        return self._current_mode

    @property
    def seeds(self):
        return self._seeds

    @property
    def seed_states(self):
        return self._seed_states

    def set_state(self, parallel_mode: ParallelMode, state: Tensor):
        """Sets the state of the seed manager for `parallel_mode`.

        :param parallel_mode: The chosen parallel mode
        :type parallel_mode: :class:`colossalai.context.ParallelMode`
        :param state: the state to be set
        :type state: :class:`torch.Tensor`
        :raises AssertionError: Raises an AssertionError if `parallel_mode` is not found in the seed manager
        """
        assert parallel_mode in self._seed_states, f'Parallel mode {parallel_mode} is not found in the seed manager'
        self._seed_states[parallel_mode] = state

    def set_mode(self, parallel_mode: ParallelMode):
        """Sets the current mode of the seed manager.

        :param parallel_mode: The chosen parallel mode
        :type parallel_mode: :class:`colossalai.context.ParallelMode`
        """
        if self.current_mode:
            # save the current state for current mode before switching away
            self._seed_states[self._current_mode] = torch.cuda.get_rng_state()

        # set the new state for new mode; raises KeyError if parallel_mode
        # was never registered via add_seed / set_state
        self._current_mode = parallel_mode
        torch.cuda.set_rng_state(self._seed_states[parallel_mode])

    def add_seed(self, parallel_mode: ParallelMode, seed: int):
        """Adds a seed to the seed manager for `parallel_mode`.

        :param parallel_mode: The chosen parallel mode
        :type parallel_mode: :class:`colossalai.context.ParallelMode`
        :param seed: The seed to be added
        :type seed: int
        :raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance of
            :class:`colossalai.context.ParallelMode` or the seed for `parallel_mode` has been added
        """
        assert isinstance(parallel_mode, ParallelMode), 'A valid ParallelMode must be provided'
        assert parallel_mode not in self._seed_states, f'The seed for {parallel_mode} has been added'
        # Build a fresh seeded RNG state without disturbing the caller's
        # current stream: snapshot, seed, capture the seeded state, restore.
        current_state = torch.cuda.get_rng_state()
        torch.cuda.manual_seed(seed)
        self._seed_states[parallel_mode] = torch.cuda.get_rng_state()
        self._seeds[parallel_mode] = seed
        torch.cuda.set_rng_state(current_state)
colossalai/core.py
0 → 100644
View file @
404ecbdc
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
from
colossalai.context
import
ParallelContext
# Process-wide ParallelContext singleton; the rest of the project imports it
# as `from colossalai.core import global_context as gpc`.
global_context = ParallelContext()
def set_global_context(context: ParallelContext):
    """Replace the module-level global parallel context with a given
    :class:`ParallelContext`.

    :param context: Parallel context to install as the global parallel context
    :type context: ParallelContext
    """
    global global_context
    global_context = context
colossalai/engine/__init__.py
0 → 100644
View file @
404ecbdc
from
.amp_type
import
AMP_TYPE
from
._base_engine
import
Engine
from
.gradient_handler
import
*
from
.schedule
import
*
__all__
=
[
'Engine'
]
colossalai/engine/_base_engine.py
0 → 100644
View file @
404ecbdc
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
from
typing
import
Optional
from
colossalai.builder
import
build_gradient_handler
from
colossalai.context
import
ParallelMode
from
colossalai.core
import
global_context
as
gpc
from
colossalai.logging
import
get_global_dist_logger
from
colossalai.nn
import
(
ZeroRedundancyOptimizer_Level_2
,
ZeroRedundancyOptimizer_Level_3
)
from
torch.nn
import
Module
from
torch.nn.modules.loss
import
_Loss
from
torch.optim
import
Optimizer
from
torch.optim.lr_scheduler
import
_LRScheduler
from
torch.utils.data
import
DataLoader
from
.schedule
import
BaseSchedule
,
NoPipelineSchedule
class Engine:
    """Basic engine class for training and evaluation. It runs a specific process method
    :meth:`step` which is based on the given :attr:`schedule` over each batch of a dataset.

    :param train_dataloader: Dataloader in training
    :param test_dataloader: Dataloader in evaluation
    :param model: The neural network model
    :param criterion: Criterion for calculating loss
    :param optimizer: Optimizer for updating the parameters
    :param lr_scheduler: Learning rate scheduler adjusting learning rate during the training or evaluation
    :param schedule: Running schedule in :meth:`step`
    :type train_dataloader: DataLoader, optional
    :type test_dataloader: DataLoader, optional
    :type model: Module
    :type criterion: _Loss, optional
    :type optimizer: Optimizer, optional
    :type lr_scheduler: _LRScheduler, optional
    :type schedule: BaseSchedule, optional
    """

    def __init__(self,
                 train_dataloader: Optional[DataLoader] = None,
                 test_dataloader: Optional[DataLoader] = None,
                 model: Module = None,
                 criterion: _Loss = None,
                 optimizer: Optimizer = None,
                 lr_scheduler: Optional[_LRScheduler] = None,
                 schedule: BaseSchedule = None):
        self.train_dataloader = train_dataloader
        self.test_dataloader = test_dataloader
        assert model is not None, "Engine requires a model"
        self.model = model
        self.criterion = criterion
        self.optimizer = optimizer
        self.lr_scheduler = lr_scheduler
        # fall back to the non-pipeline schedule when none is given
        self.schedule = schedule if schedule is not None \
            else NoPipelineSchedule()
        self._logger = get_global_dist_logger()

        # build gradient handler; precedence: explicit config entry, then
        # automatic ZeRO detection, then plain data-parallel detection
        self._gradient_handlers = []
        gradient_handler_cfg = []
        if hasattr(gpc.config, 'gradient_handler'):
            assert isinstance(gpc.config.gradient_handler, list), \
                f'argument gradient_handler_cfg expected type list, ' \
                f'but got type {type(gpc.config.gradient_handler)}'
            gradient_handler_cfg = gpc.config.gradient_handler
        elif isinstance(self.optimizer, (ZeroRedundancyOptimizer_Level_2,
                                         ZeroRedundancyOptimizer_Level_3)):
            # ZeRO optimizers carry their own all-reduce logic
            gradient_handler_cfg = [dict(type='ZeROGradientHandler')]
            self._logger.info(
                "Training with zero is detected, ZeROGradientHandler is automatically "
                "added even though not specified in the configuration",
                ranks=[0])
        elif gpc.is_initialized(ParallelMode.DATA) and gpc.get_world_size(
                ParallelMode.DATA) > 1:
            gradient_handler_cfg = [dict(type='DataParallelGradientHandler')]
            self._logger.info(
                "Data parallel training is detected, DataParallelGradientHandler is automatically "
                "added even though not specified in the configuration",
                ranks=[0])
        if len(gradient_handler_cfg) == 0:
            self._logger.warning(
                "No gradient handler is set up, please make sure you do not need "
                "to all-reduce the gradients after a training step.",
                ranks=[0])
        for cfg in gradient_handler_cfg:
            handler = build_gradient_handler(cfg, self.model, self.optimizer)
            self._gradient_handlers.append(handler)

        # hand all training components to the schedule
        self.schedule.initialize(self.train_dataloader, self.model,
                                 self.criterion, self.optimizer,
                                 self.lr_scheduler)
        # True while in evaluation mode (no backward pass / optimizer step)
        self.forward_only = False

    def handle_gradient(self):
        """Handles all-reduce operations of gradients across different parallel groups.
        """
        for handler in self._gradient_handlers:
            handler.handle_gradient()

    def set_dataloader(self, data: DataLoader, train: bool = True):
        """Sets dataloader in training or evaluation.

        :param data: Dataloader to be set
        :param train: Set training dataloader if True, otherwise evaluation dataloader
        :type data: DataLoader
        :type train: bool
        """
        if train:
            self.train_dataloader = data
        else:
            self.test_dataloader = data

    def get_model(self):
        """Returns the neural network model in the engine.
        """
        return self.model

    def get_optimizer(self):
        """Returns the optimizer in the engine.
        """
        return self.optimizer

    def get_lr_scheduler(self):
        """Returns the learning rate scheduler in the engine.
        """
        return self.lr_scheduler

    def train(self):
        """Sets the model to training mode.
        """
        self.forward_only = False
        self.schedule.train(dataloader=self.train_dataloader, mode=True)

    def eval(self):
        """Sets the model to evaluation mode.
        """
        self.forward_only = True
        self.schedule.train(dataloader=self.test_dataloader, mode=False)

    def is_train(self):
        """Returns True if it is in training, otherwise False.
        """
        return not self.forward_only

    def get_lr(self):
        """Gets current learning rate.
        """
        return self.schedule.get_lr()

    def step(self, return_loss=True):
        """A running step based on the schedule. Usually, it runs a training or
        evaluation over a batch of dataset.

        :param return_loss: loss will be returned if True
        :type return_loss: bool
        :return: (output, label, loss)
        """
        self.schedule.zero_grad(forward_only=self.forward_only)

        output, label, loss = self.schedule.forward_backward_step(
            forward_only=self.forward_only, return_loss=return_loss)

        if not self.forward_only:
            # all reduce gradients before the optimizer step
            self.handle_gradient()
            self.schedule.step()

        return output, label, loss
colossalai/engine/amp_type.py
0 → 100644
View file @
404ecbdc
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
from
enum
import
Enum
class AMP_TYPE(Enum):
    """Enumeration of the automatic mixed precision (AMP) backends that can
    be selected in the configuration.
    """
    APEX = 'apex'
    TORCH = 'torch'
    PARALLEL = 'parallel'
colossalai/engine/gradient_handler/__init__.py
0 → 100644
View file @
404ecbdc
from
._base_gradient_handler
import
BaseGradientHandler
from
._data_parallel_gradient_handler
import
DataParallelGradientHandler
from
._zero_gradient_handler
import
ZeROGradientHandler
__all__
=
[
'BaseGradientHandler'
,
'DataParallelGradientHandler'
,
'ZeROGradientHandler'
]
colossalai/engine/gradient_handler/_base_gradient_handler.py
0 → 100644
View file @
404ecbdc
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
from
abc
import
ABC
,
abstractmethod
class BaseGradientHandler(ABC):
    """Abstract helper that accumulates (e.g. all-reduces) gradients across
    parallel groups before the optimization step.

    :param model: Model where the gradients accumulate
    :param optimizer: Optimizer for updating the parameters
    :type model: Module
    :type optimizer: Optimizer
    """

    def __init__(self, model, optimizer):
        # kept on the instance for subclasses to use in handle_gradient
        self._model = model
        self._optimizer = optimizer

    @abstractmethod
    def handle_gradient(self):
        """Accumulate gradients across parallel groups. Subclasses either
        implement their own strategy or reuse one of the pre-defined ones.
        """
        pass
colossalai/engine/gradient_handler/_data_parallel_gradient_handler.py
0 → 100644
View file @
404ecbdc
#!/usr/bin/env python
import
torch.distributed
as
dist
from
torch._utils
import
_flatten_dense_tensors
,
_unflatten_dense_tensors
from
colossalai.core
import
global_context
as
gpc
from
colossalai.registry
import
GRADIENT_HANDLER
from
._base_gradient_handler
import
BaseGradientHandler
from
...context.parallel_mode
import
ParallelMode
@GRADIENT_HANDLER.register_module
class DataParallelGradientHandler(BaseGradientHandler):
    """A helper class to handle all-reduce operations in a data parallel group.

    A all-reduce collective communication will be operated in
    :func:`handle_gradient` among a data parallel group.
    For better performance, it bucketizes the gradients of all parameters that are
    the same type to improve the efficiency of communication.
    """

    def handle_gradient(self):
        """A method running a all-reduce operation in a data parallel group.
        """
        # TODO: add memory buffer
        if gpc.data_parallel_size > 1:
            # bucketize and all-reduce
            buckets = {}
            # Pack the buckets: group parameters by tensor type so each bucket
            # can be flattened into one contiguous tensor per collective call.
            for param in self._model.parameters():
                if param.requires_grad and param.grad is not None:
                    tp = param.data.type()
                    if tp not in buckets:
                        buckets[tp] = []
                    buckets[tp].append(param)
                    # NOTE(review): stores a reference to the gradient on the
                    # parameter itself; presumably consumed elsewhere — confirm
                    param.main_grad = param.grad
            # For each bucket, all-reduce and copy all-reduced grads.
            for tp in buckets:
                bucket = buckets[tp]
                grads = [param.grad.data for param in bucket]
                coalesced = _flatten_dense_tensors(grads)
                # all_reduce defaults to SUM, so pre-dividing by the data
                # parallel world size yields the mean gradient
                coalesced /= gpc.get_world_size(ParallelMode.DATA)
                dist.all_reduce(coalesced, group=gpc.get_group(ParallelMode.DATA))
                # scatter the averaged values back into the original grad tensors
                for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
                    buf.copy_(synced)
colossalai/engine/gradient_handler/_zero_gradient_handler.py
0 → 100644
View file @
404ecbdc
from
colossalai.registry
import
GRADIENT_HANDLER
from
._base_gradient_handler
import
BaseGradientHandler
@GRADIENT_HANDLER.register_module
class ZeROGradientHandler(BaseGradientHandler):
    """Gradient handler specialized for ZeRO-optimized training.

    The all-reduce collective over the data parallel group is performed in
    :func:`handle_gradient`, delegated entirely to the ZeRO optimizer.
    """

    def handle_gradient(self):
        """Run the optimizer's own all-reduce of gradients over the data parallel group."""
        self._optimizer.allreduce_gradients()
Prev
1
2
3
4
5
6
…
21
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment