Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
ColossalAI
Commits
404ecbdc
"tests/git@developer.sourcefind.cn:OpenDAS/colossalai.git" did not exist on "839847b7d78bce6af5dfe58d27b5ce2c74a3619b"
Commit
404ecbdc
authored
Oct 28, 2021
by
zbian
Browse files
Migrated project
parent
2ebaefc5
Changes
409
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1319 additions
and
0 deletions
+1319
-0
colossalai/context/process_group_initializer/initializer_1d.py
...salai/context/process_group_initializer/initializer_1d.py
+44
-0
colossalai/context/process_group_initializer/initializer_2d.py
...salai/context/process_group_initializer/initializer_2d.py
+123
-0
colossalai/context/process_group_initializer/initializer_2p5d.py
...lai/context/process_group_initializer/initializer_2p5d.py
+255
-0
colossalai/context/process_group_initializer/initializer_3d.py
...salai/context/process_group_initializer/initializer_3d.py
+172
-0
colossalai/context/process_group_initializer/initializer_data.py
...lai/context/process_group_initializer/initializer_data.py
+41
-0
colossalai/context/process_group_initializer/initializer_pipeline.py
...context/process_group_initializer/initializer_pipeline.py
+63
-0
colossalai/context/process_group_initializer/initializer_sequence.py
...context/process_group_initializer/initializer_sequence.py
+27
-0
colossalai/context/process_group_initializer/initializer_tensor.py
...i/context/process_group_initializer/initializer_tensor.py
+41
-0
colossalai/context/process_group_initializer/process_group_initializer.py
...xt/process_group_initializer/process_group_initializer.py
+30
-0
colossalai/context/random/__init__.py
colossalai/context/random/__init__.py
+8
-0
colossalai/context/random/_helper.py
colossalai/context/random/_helper.py
+144
-0
colossalai/context/random/seed_manager.py
colossalai/context/random/seed_manager.py
+74
-0
colossalai/core.py
colossalai/core.py
+16
-0
colossalai/engine/__init__.py
colossalai/engine/__init__.py
+7
-0
colossalai/engine/_base_engine.py
colossalai/engine/_base_engine.py
+170
-0
colossalai/engine/amp_type.py
colossalai/engine/amp_type.py
+10
-0
colossalai/engine/gradient_handler/__init__.py
colossalai/engine/gradient_handler/__init__.py
+5
-0
colossalai/engine/gradient_handler/_base_gradient_handler.py
colossalai/engine/gradient_handler/_base_gradient_handler.py
+25
-0
colossalai/engine/gradient_handler/_data_parallel_gradient_handler.py
...ngine/gradient_handler/_data_parallel_gradient_handler.py
+48
-0
colossalai/engine/gradient_handler/_zero_gradient_handler.py
colossalai/engine/gradient_handler/_zero_gradient_handler.py
+16
-0
No files found.
colossalai/context/process_group_initializer/initializer_1d.py
0 → 100644
View file @
404ecbdc
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import
torch.distributed
as
dist
from
colossalai.context
import
Config
from
colossalai.core
import
global_context
as
gpc
from
colossalai.registry
import
DIST_GROUP_INITIALIZER
from
.process_group_initializer
import
ProcessGroupInitializer
from
..parallel_mode
import
ParallelMode
@DIST_GROUP_INITIALIZER.register_module
class Initializer_1D(ProcessGroupInitializer):
    """A ProcessGroupInitializer for 1D tensor parallelism."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # number of independent 1D tensor-parallel groups across the whole job
        self.num_group = self.world_size // self.tensor_parallel_size

    def init_dist_group(self):
        """Initialize 1D tensor parallel groups, and assign local_ranks and groups to each gpu.

        :return: (local_rank, group_world_size, process_group, ranks_in_group, mode)
        :rtype: tuple
        """
        local_rank = group_world_size = process_group = ranks_in_group = None
        mode = ParallelMode.PARALLEL_1D

        # dist.new_group is a collective call: every rank must iterate over every group.
        for group_idx in range(self.num_group):
            base = group_idx * self.tensor_parallel_size
            ranks = [base + offset for offset in range(self.tensor_parallel_size)]
            group = dist.new_group(ranks)

            if self.rank in ranks:
                local_rank = ranks.index(self.rank)
                group_world_size = len(ranks)
                process_group = group
                ranks_in_group = ranks

        return local_rank, group_world_size, process_group, ranks_in_group, mode
colossalai/context/process_group_initializer/initializer_2d.py
0 → 100644
View file @
404ecbdc
import
math
import
os
import
torch.distributed
as
dist
from
colossalai.constants
import
SUMMA_DIM
from
colossalai.registry
import
DIST_GROUP_INITIALIZER
from
.process_group_initializer
import
ProcessGroupInitializer
from
..parallel_mode
import
ParallelMode
def _check_summa_env_var(summa_dim):
    """Validate or export the SUMMA_DIM environment variable.

    If SUMMA_DIM is already set, it must match ``summa_dim``; otherwise it is
    exported so that other components (and child processes) observe the same
    grid dimension.

    :param summa_dim: side length of the 2D SUMMA process grid
    :raises AssertionError: if SUMMA_DIM is set to a different value
    """
    # check environment variable for SUMMA
    env_summa_dim = os.environ.get(SUMMA_DIM, None)

    if env_summa_dim:
        # fix: original message read "this initialized" — typo for "initializer"
        assert int(env_summa_dim) == summa_dim, \
            'SUMMA_DIM has been set in the current environment and ' \
            'does not match with the value passed to this initializer'
    else:
        os.environ[SUMMA_DIM] = str(summa_dim)
class Initializer_2D_Row(ProcessGroupInitializer):
    """2d tensor parallel initialization among rows."""

    def __init__(self, num_group, summa_dim, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.num_group = num_group    # number of full 2D tensor-parallel groups
        self.summa_dim = summa_dim    # side length of the SUMMA process grid

    def init_dist_group(self):
        """Initialize 2D tensor row parallel groups, and assign local_ranks and groups to each gpu.

        :return: 2D tensor row parallelism's information
        :rtype: tuple(local_rank, group_world_size, process_group, ranks_in_group, mode)
        """
        local_rank = group_world_size = process_group = ranks_in_group = None
        mode = ParallelMode.PARALLEL_2D_ROW

        # collective: every rank participates in every dist.new_group call
        for group_idx in range(self.num_group):
            for row in range(self.summa_dim):
                base = group_idx * self.tensor_parallel_size + row * self.summa_dim
                ranks = [base + col for col in range(self.summa_dim)]
                group = dist.new_group(ranks)

                if self.rank in ranks:
                    local_rank = ranks.index(self.rank)
                    group_world_size = len(ranks)
                    process_group = group
                    ranks_in_group = ranks

        return local_rank, group_world_size, process_group, ranks_in_group, mode
class Initializer_2D_Col(ProcessGroupInitializer):
    """2d tensor parallel initialization among cols."""

    def __init__(self, num_group, summa_dim, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.num_group = num_group    # number of full 2D tensor-parallel groups
        self.summa_dim = summa_dim    # side length of the SUMMA process grid

    def init_dist_group(self):
        """Initialize 2D tensor col parallel groups, and assign local_ranks and groups to each gpu.

        :return: 2D tensor col parallelism's information
        :rtype: tuple(local_rank, group_world_size, process_group, ranks_in_group, mode)
        """
        local_rank = group_world_size = process_group = ranks_in_group = None
        mode = ParallelMode.PARALLEL_2D_COL

        # collective: every rank participates in every dist.new_group call
        for group_idx in range(self.num_group):
            for col in range(self.summa_dim):
                base = group_idx * self.tensor_parallel_size + col
                # members of one column: stride summa_dim through the grid
                ranks = [base + row * self.summa_dim for row in range(self.summa_dim)]
                group = dist.new_group(ranks)

                if self.rank in ranks:
                    local_rank = ranks.index(self.rank)
                    group_world_size = len(ranks)
                    process_group = group
                    ranks_in_group = ranks

        return local_rank, group_world_size, process_group, ranks_in_group, mode
@DIST_GROUP_INITIALIZER.register_module
class Initializer_2D(ProcessGroupInitializer):
    """Serve as the single entry point to 2D parallel initialization."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.num_group = self.world_size // self.tensor_parallel_size
        # the 2D grid is square: summa_dim ** 2 == tensor_parallel_size
        self.summa_dim = int(math.sqrt(self.tensor_parallel_size))

        assert self.tensor_parallel_size == self.summa_dim ** 2, \
            "2D summa dim should equal to tensor parallel size ^ 0.5"
        _check_summa_env_var(self.summa_dim)

        self.col_initializer = Initializer_2D_Col(
            self.num_group, self.summa_dim, *args, **kwargs)
        self.row_initializer = Initializer_2D_Row(
            self.num_group, self.summa_dim, *args, **kwargs)

    def init_dist_group(self):
        """Initialize 2D tensor row and col parallel groups, and assign local_ranks and groups to each gpu.

        :return: 2D tensor parallelism's information
        :rtype: list of tuples (local_rank, group_world_size, process_group, ranks_in_group, mode)
        """
        # row groups first, then col groups — order is part of the contract
        return [
            self.row_initializer.init_dist_group(),
            self.col_initializer.init_dist_group(),
        ]
colossalai/context/process_group_initializer/initializer_2p5d.py
0 → 100644
View file @
404ecbdc
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import
math
import
os
import
torch.distributed
as
dist
from
colossalai.constants
import
TESSERACT_DIM
,
TESSERACT_DEP
from
colossalai.context
import
Config
from
colossalai.core
import
global_context
as
gpc
from
colossalai.registry
import
DIST_GROUP_INITIALIZER
from
.process_group_initializer
import
ProcessGroupInitializer
from
..parallel_mode
import
ParallelMode
def _check_tesseract_env_var(tesseract_dim: int, tesseract_dep: int):
    """Validate or export the TESSERACT_DIM / TESSERACT_DEP environment variables.

    If both variables are already set they must match the values passed in;
    otherwise both are exported for other components to read.

    :param tesseract_dim: side length of the 2.5D process grid
    :param tesseract_dep: depth of the 2.5D process grid
    :raises AssertionError: if an already-set variable disagrees
    """
    # check environment variable for TESSERACT
    env_tesseract_dim = os.environ.get(TESSERACT_DIM, None)
    env_tesseract_dep = os.environ.get(TESSERACT_DEP, None)

    if env_tesseract_dim and env_tesseract_dep:
        # fix: messages read "this initialized" — typo for "initializer"
        assert int(env_tesseract_dim) == tesseract_dim, \
            'TESSERACT_DIM has been set in the current environment and ' \
            'does not match with the value passed to this initializer'
        assert int(env_tesseract_dep) == tesseract_dep, \
            'TESSERACT_DEP has been set in the current environment and ' \
            'does not match with the value passed to this initializer'
    else:
        # NOTE(review): if exactly one of the two variables is set, this branch
        # silently overwrites it — preserved as-is; confirm this is intended.
        os.environ[TESSERACT_DIM] = str(tesseract_dim)
        os.environ[TESSERACT_DEP] = str(tesseract_dep)
# rank layout convention for 2.5D: i indexes rows, j columns, k depth
class Initializer_2p5D_ROW(ProcessGroupInitializer):
    """2p5d tensor parallel initialization among rows."""

    def __init__(self, tesseract_dim: int, tesseract_dep: int, *args):
        super().__init__(*args)
        # re-read from the global context, overriding the value set by the base class
        self.tensor_parallel_size = gpc.tensor_parallel_size
        self.num_group = self.world_size // self.tensor_parallel_size
        self.tesseract_dep = tesseract_dep
        self.tesseract_dim = tesseract_dim
        assert self.tensor_parallel_size == self.tesseract_dim ** 2 * self.tesseract_dep, \
            "Tensor parallel size should be depth * dim ** 2 in 2.5D parallel"

    def init_dist_group(self):
        """Initialize 2p5D tensor row parallel groups, and assign local_ranks and groups to each gpu.

        :return: 2p5D tensor row parallelism's information
        :rtype: tuple(local_rank, group_world_size, process_group, ranks_in_group, mode)
        """
        local_rank = group_world_size = process_group = ranks_in_group = None
        mode = ParallelMode.PARALLEL_2P5D_ROW

        # collective: every rank walks every (group, col, dep) combination
        for grp in range(self.num_group):
            for j in range(self.tesseract_dim):
                for k in range(self.tesseract_dep):
                    base = grp * self.tensor_parallel_size
                    ranks = [base + i + self.tesseract_dim * (j + self.tesseract_dim * k)
                             for i in range(self.tesseract_dim)]
                    group = dist.new_group(ranks)

                    if self.rank in ranks:
                        local_rank = ranks.index(self.rank)
                        group_world_size = len(ranks)
                        process_group = group
                        ranks_in_group = ranks

        return local_rank, group_world_size, process_group, ranks_in_group, mode
class Initializer_2p5D_Col(ProcessGroupInitializer):
    """2p5d tensor parallel initialization among cols."""

    def __init__(self, tesseract_dim: int, tesseract_dep: int, *args):
        super().__init__(*args)
        # re-read from the global context, overriding the value set by the base class
        self.tensor_parallel_size = gpc.tensor_parallel_size
        self.num_group = self.world_size // self.tensor_parallel_size
        self.tesseract_dep = tesseract_dep
        self.tesseract_dim = tesseract_dim
        assert self.tensor_parallel_size == self.tesseract_dim ** 2 * self.tesseract_dep, \
            "Tensor parallel size should be depth * dim ** 2 in 2.5D parallel"

    def init_dist_group(self):
        """Initialize 2p5D tensor col parallel groups, and assign local_ranks and groups to each gpu.

        :return: 2p5D tensor col parallelism's information
        :rtype: tuple(local_rank, group_world_size, process_group, ranks_in_group, mode)
        """
        local_rank = group_world_size = process_group = ranks_in_group = None
        mode = ParallelMode.PARALLEL_2P5D_COL

        # collective: every rank walks every (group, row, dep) combination
        for grp in range(self.num_group):
            for i in range(self.tesseract_dim):
                for k in range(self.tesseract_dep):
                    base = grp * self.tensor_parallel_size
                    ranks = [base + i + self.tesseract_dim * (j + self.tesseract_dim * k)
                             for j in range(self.tesseract_dim)]
                    group = dist.new_group(ranks)

                    if self.rank in ranks:
                        local_rank = ranks.index(self.rank)
                        group_world_size = len(ranks)
                        process_group = group
                        ranks_in_group = ranks

        return local_rank, group_world_size, process_group, ranks_in_group, mode
class Initializer_2p5D_Dep(ProcessGroupInitializer):
    """2p5D tensor parallel initialization among depths."""

    def __init__(self, tesseract_dim: int, tesseract_dep: int, *args):
        super().__init__(*args)
        # re-read from the global context, overriding the value set by the base class
        self.tensor_parallel_size = gpc.tensor_parallel_size
        self.num_group = self.world_size // self.tensor_parallel_size
        self.tesseract_dep = tesseract_dep
        self.tesseract_dim = tesseract_dim
        assert self.tensor_parallel_size == self.tesseract_dim ** 2 * self.tesseract_dep, \
            "Tensor parallel size should be depth * dim ** 2 in 2.5D parallel"

    def init_dist_group(self):
        """Initialize 2p5D tensor depth parallel groups, and assign local_ranks and groups to each gpu.

        :return: 2p5D tensor depth parallelism's information
        :rtype: tuple(local_rank, group_world_size, process_group, ranks_in_group, mode)
        """
        local_rank = group_world_size = process_group = ranks_in_group = None
        mode = ParallelMode.PARALLEL_2P5D_DEP

        # collective: every rank walks every (group, row, col) combination
        for grp in range(self.num_group):
            for i in range(self.tesseract_dim):
                for j in range(self.tesseract_dim):
                    base = grp * self.tensor_parallel_size
                    ranks = [base + i + self.tesseract_dim * (j + self.tesseract_dim * k)
                             for k in range(self.tesseract_dep)]
                    group = dist.new_group(ranks)

                    if self.rank in ranks:
                        local_rank = ranks.index(self.rank)
                        group_world_size = len(ranks)
                        process_group = group
                        ranks_in_group = ranks

        return local_rank, group_world_size, process_group, ranks_in_group, mode
# rank layout convention for 2.5D: i indexes rows, j columns, k depth
class Initializer_2p5D_XZ(ProcessGroupInitializer):
    """2p5d tensor parallel initialization among cols times dep."""

    def __init__(self, tesseract_dim: int, tesseract_dep: int, *args):
        super().__init__(*args)
        # re-read from the global context, overriding the value set by the base class
        self.tensor_parallel_size = gpc.tensor_parallel_size
        self.num_group = self.world_size // self.tensor_parallel_size
        self.tesseract_dep = tesseract_dep
        self.tesseract_dim = tesseract_dim
        assert self.tensor_parallel_size == self.tesseract_dim ** 2 * self.tesseract_dep, \
            "Tensor parallel size should be depth * dim ** 2 in 2.5D parallel"

    def init_dist_group(self):
        """Initialize 2p5D tensor colXdepth parallel groups, and assign local_ranks and groups to each gpu.

        :return: 2p5D tensor colXdepth parallelism's information
        :rtype: tuple(local_rank, group_world_size, process_group, ranks_in_group, mode)
        """
        local_rank = group_world_size = process_group = ranks_in_group = None
        mode = ParallelMode.PARALLEL_2P5D_XZ

        # collective: each group fuses all (col, dep) positions for a fixed row;
        # iteration order (k outer, j inner) fixes the rank indexing inside the group
        for grp in range(self.num_group):
            for i in range(self.tesseract_dim):
                base = grp * self.tensor_parallel_size
                ranks = [base + i + self.tesseract_dim * (j + self.tesseract_dim * k)
                         for k in range(self.tesseract_dep)
                         for j in range(self.tesseract_dim)]
                group = dist.new_group(ranks)

                if self.rank in ranks:
                    local_rank = ranks.index(self.rank)
                    group_world_size = len(ranks)
                    process_group = group
                    ranks_in_group = ranks

        return local_rank, group_world_size, process_group, ranks_in_group, mode
@DIST_GROUP_INITIALIZER.register_module
class Initializer_2p5D(ProcessGroupInitializer):
    """Serve as the single entry point to Tesseract parallel initialization."""

    def __init__(self,
                 rank: int,
                 world_size: int,
                 config: Config,
                 data_parallel_size: int,
                 pipeline_parlalel_size: int,
                 tensor_parallel_size: int,
                 depth: int):
        # NOTE(review): 'pipeline_parlalel_size' keeps the (misspelled) parameter
        # name used across the package so keyword callers are not broken.
        args = (rank, world_size, config, data_parallel_size,
                pipeline_parlalel_size, tensor_parallel_size)
        super().__init__(*args)
        self.num_group = self.world_size // self.tensor_parallel_size
        # dim ** 2 * depth == tensor_parallel_size for a valid tesseract
        self.tesseract_dim = int(math.sqrt(self.tensor_parallel_size / depth))
        self.tesseract_dep = depth

        assert self.tensor_parallel_size == self.tesseract_dim ** 2 * self.tesseract_dep, \
            "2.5D tesseract dim should equal to (tensor parallel size / tesseract dep) ^ 0.5"
        _check_tesseract_env_var(self.tesseract_dim, self.tesseract_dep)

        self.col_initializer = Initializer_2p5D_Col(self.tesseract_dim, self.tesseract_dep, *args)
        self.row_initializer = Initializer_2p5D_ROW(self.tesseract_dim, self.tesseract_dep, *args)
        self.dep_initializer = Initializer_2p5D_Dep(self.tesseract_dim, self.tesseract_dep, *args)
        self.xz_initializer = Initializer_2p5D_XZ(self.tesseract_dim, self.tesseract_dep, *args)

    def init_dist_group(self):
        """Initialize 2p5D tensor row, col, depth, and colXdepth parallel groups, and assign local_ranks and groups to each gpu.

        :return: Whole 2p5D tensor parallelism's information
        :rtype: list of tuples (local_rank, group_world_size, process_group, ranks_in_group, mode)
        """
        # col, row, dep, xz — order is part of the contract
        return [
            self.col_initializer.init_dist_group(),
            self.row_initializer.init_dist_group(),
            self.dep_initializer.init_dist_group(),
            self.xz_initializer.init_dist_group(),
        ]
colossalai/context/process_group_initializer/initializer_3d.py
0 → 100644
View file @
404ecbdc
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import
math
import
os
import
torch.distributed
as
dist
from
colossalai.constants
import
DEPTH_3D
from
colossalai.registry
import
DIST_GROUP_INITIALIZER
from
..parallel_mode
import
ParallelMode
from
.process_group_initializer
import
ProcessGroupInitializer
def _check_depth_env_var(depth):
    """Validate or export the DEPTH_3D environment variable.

    If DEPTH_3D is already set it must match ``depth``; otherwise it is
    exported for other components to read.

    :param depth: cube-root depth of the 3D process grid
    :raises AssertionError: if DEPTH_3D is set to a different value
    """
    # check environment variable for the 3D parallel depth
    # (fix: comment and message previously said "SUMMA"/"SUMMA_DIM",
    # copy-pasted from the 2D initializer, although DEPTH_3D is checked)
    env_depth = os.environ.get(DEPTH_3D, None)

    if env_depth:
        assert int(env_depth) == depth, \
            'DEPTH_3D has been set in the current environment and ' \
            'does not match with the value passed to this initializer'
    else:
        os.environ[DEPTH_3D] = str(depth)
class Initializer_3D_Input(ProcessGroupInitializer):
    """3D tensor parallel initialization among input."""

    def __init__(self, num_group: int, depth: int, *args):
        super().__init__(*args)
        self.num_group = num_group    # number of full 3D tensor-parallel groups
        self.depth = depth            # side length of the 3D process cube

    def init_dist_group(self):
        """Initialize 3D tensor parallel groups among input, and assign local_ranks and groups to each gpu.

        :return: 3D tensor parallelism's information among input
        :rtype: tuple(local_rank, group_world_size, process_group, ranks_in_group, mode)
        """
        local_rank = group_world_size = process_group = ranks_in_group = None
        mode = ParallelMode.PARALLEL_3D_INPUT

        # collective: every rank walks every (group, i, k) combination
        for grp in range(self.num_group):
            for i in range(self.depth):
                for k in range(self.depth):
                    base = grp * self.depth ** 3
                    ranks = [base + i + self.depth * (j + self.depth * k)
                             for j in range(self.depth)]
                    group = dist.new_group(ranks)

                    if self.rank in ranks:
                        local_rank = ranks.index(self.rank)
                        group_world_size = len(ranks)
                        process_group = group
                        ranks_in_group = ranks

        return local_rank, group_world_size, process_group, ranks_in_group, mode
class Initializer_3D_Weight(ProcessGroupInitializer):
    """3D tensor parallel initialization among weight."""

    def __init__(self, num_group: int, depth: int, *args):
        super().__init__(*args)
        self.num_group = num_group    # number of full 3D tensor-parallel groups
        self.depth = depth            # side length of the 3D process cube

    def init_dist_group(self):
        """Initialize 3D tensor parallel groups among weight, and assign local_ranks and groups to each gpu.

        :return: 3D tensor parallelism's information among weight
        :rtype: tuple(local_rank, group_world_size, process_group, ranks_in_group, mode)
        """
        local_rank = group_world_size = process_group = ranks_in_group = None
        mode = ParallelMode.PARALLEL_3D_WEIGHT

        # collective: every rank walks every (group, k, j) combination
        for grp in range(self.num_group):
            for k in range(self.depth):
                for j in range(self.depth):
                    base = grp * self.depth ** 3
                    ranks = [base + i + self.depth * (j + self.depth * k)
                             for i in range(self.depth)]
                    group = dist.new_group(ranks)

                    if self.rank in ranks:
                        local_rank = ranks.index(self.rank)
                        group_world_size = len(ranks)
                        process_group = group
                        ranks_in_group = ranks

        return local_rank, group_world_size, process_group, ranks_in_group, mode
class Initializer_3D_Output(ProcessGroupInitializer):
    """3D tensor parallel initialization among output."""

    def __init__(self, num_group: int, depth: int, *args):
        super().__init__(*args)
        self.num_group = num_group    # number of full 3D tensor-parallel groups
        self.depth = depth            # side length of the 3D process cube

    def init_dist_group(self):
        """Initialize 3D tensor parallel groups among output, and assign local_ranks and groups to each gpu.

        :return: 3D tensor parallelism's information among output
        :rtype: tuple(local_rank, group_world_size, process_group, ranks_in_group, mode)
        """
        local_rank = group_world_size = process_group = ranks_in_group = None
        mode = ParallelMode.PARALLEL_3D_OUTPUT

        # collective: every rank walks every (group, i, j) combination
        for grp in range(self.num_group):
            for i in range(self.depth):
                for j in range(self.depth):
                    base = grp * self.depth ** 3
                    ranks = [base + i + self.depth * (j + self.depth * k)
                             for k in range(self.depth)]
                    group = dist.new_group(ranks)

                    if self.rank in ranks:
                        local_rank = ranks.index(self.rank)
                        group_world_size = len(ranks)
                        process_group = group
                        ranks_in_group = ranks

        return local_rank, group_world_size, process_group, ranks_in_group, mode
@DIST_GROUP_INITIALIZER.register_module
class Initializer_3D(ProcessGroupInitializer):
    """Serve as the single entry point to 3D parallel initialization."""

    def __init__(self, *args):
        super().__init__(*args)
        self.num_group = self.world_size // self.tensor_parallel_size
        # round() guards against floating-point error in the cube root
        self.depth = round(math.pow(self.tensor_parallel_size, 1 / 3))

        # fix: assertion message previously read "if not cube root"
        assert self.tensor_parallel_size == self.depth ** 3, \
            f'3D depth ({self.depth}) is not cube root of tensor parallel size ({self.tensor_parallel_size})'
        _check_depth_env_var(self.depth)

        self.input_initializer = Initializer_3D_Input(self.num_group, self.depth, *args)
        self.weight_initializer = Initializer_3D_Weight(self.num_group, self.depth, *args)
        self.output_initializer = Initializer_3D_Output(self.num_group, self.depth, *args)

    def init_dist_group(self):
        """Initialize 3D tensor parallel groups, and assign local_ranks and groups to each gpu.

        :return: 3D tensor parallelism's information
        :rtype: list of tuples (local_rank, group_world_size, process_group, ranks_in_group, mode)
        """
        # input, weight, output — order is part of the contract
        return [
            self.input_initializer.init_dist_group(),
            self.weight_initializer.init_dist_group(),
            self.output_initializer.init_dist_group(),
        ]
colossalai/context/process_group_initializer/initializer_data.py
0 → 100644
View file @
404ecbdc
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
from
torch
import
distributed
as
dist
from
colossalai.registry
import
DIST_GROUP_INITIALIZER
from
.process_group_initializer
import
ProcessGroupInitializer
from
..parallel_mode
import
ParallelMode
@DIST_GROUP_INITIALIZER.register_module
class Initializer_Data(ProcessGroupInitializer):
    """A ProcessGroupInitializer for data parallelism."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # one data-parallel group per position inside a model replica
        self.num_data_parallel_group = self.world_size // self.data_parallel_size

    def init_dist_group(self):
        """Initialize data parallel groups, and assign local_ranks and groups to each gpu.

        :return: data parallelism's information
        :rtype: tuple (local_rank, group_world_size, process_group, ranks_in_group, mode)
        """
        local_rank = group_world_size = process_group = ranks_in_group = None
        mode = ParallelMode.DATA

        # collective: every rank participates in every dist.new_group call
        for slot in range(self.num_data_parallel_group):
            # same slot across all replicas, spaced num_data_parallel_group apart
            ranks = [slot + replica * self.num_data_parallel_group
                     for replica in range(self.data_parallel_size)]
            group = dist.new_group(ranks)

            if self.rank in ranks:
                local_rank = ranks.index(self.rank)
                group_world_size = len(ranks)
                process_group = group
                ranks_in_group = ranks

        return local_rank, group_world_size, process_group, ranks_in_group, mode
colossalai/context/process_group_initializer/initializer_pipeline.py
0 → 100644
View file @
404ecbdc
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
from
torch
import
distributed
as
dist
from
colossalai.registry
import
DIST_GROUP_INITIALIZER
from
.process_group_initializer
import
ProcessGroupInitializer
from
..parallel_mode
import
ParallelMode
@DIST_GROUP_INITIALIZER.register_module
class Initializer_Pipeline(ProcessGroupInitializer):
    """A ProcessGroupInitializer for pipeline parallelism.

    Builds, per pipeline: the full stage group (PIPELINE) and the pairwise
    groups linking consecutive stages (PIPELINE_NEXT / PIPELINE_PREV).
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # number of ranks inside one data-parallel replica
        self.data_group_size = self.world_size // self.data_parallel_size
        # number of distinct pipelines inside one data group
        self.pipeline_stage_size = self.data_group_size // self.pipeline_parallel_size

    def init_dist_group(self):
        """Initialize pipeline parallel groups, and assign local_ranks and groups to each gpu.

        :return: list of tuples
            (local_rank, group_world_size, process_group, ranks_in_group, mode),
            one entry per pipeline mode this rank belongs to
        :rtype: list
        """
        dist_settings = list()
        for i in range(self.data_parallel_size):
            for j in range(self.pipeline_stage_size):
                # ranks of one pipeline: stride pipeline_stage_size through the data group
                pipe_ranks = list(
                    range(i * self.data_group_size + j,
                          (i + 1) * self.data_group_size,
                          self.pipeline_stage_size))
                pipe_group_size = len(pipe_ranks)
                # dist.new_group is collective: every rank must create every group
                pipe_group = dist.new_group(pipe_ranks)
                if self.rank in pipe_ranks:
                    local_rank = pipe_ranks.index(self.rank)
                    group_world_size = pipe_group_size
                    process_group = pipe_group
                    ranks_in_group = pipe_ranks
                    dist_settings.append(
                        tuple((local_rank, group_world_size,
                               process_group, ranks_in_group,
                               ParallelMode.PIPELINE)))
                # pairwise groups between consecutive stages; the modulo makes
                # the chain wrap around, linking last stage back to first
                for k in range(pipe_group_size):
                    first = pipe_ranks[k]
                    second = pipe_ranks[(k + 1) % pipe_group_size]
                    ranks = [first, second]
                    group = dist.new_group(ranks)
                    if self.rank == first:
                        # this rank is the sender side of the pair
                        local_rank = 0
                        group_world_size = 2
                        process_group = group
                        ranks_in_group = ranks
                        dist_settings.append(
                            tuple((local_rank, group_world_size,
                                   process_group, ranks_in_group,
                                   ParallelMode.PIPELINE_NEXT)))
                    elif self.rank == second:
                        # this rank is the receiver side of the pair
                        local_rank = 1
                        group_world_size = 2
                        process_group = group
                        ranks_in_group = ranks
                        dist_settings.append(
                            tuple((local_rank, group_world_size,
                                   process_group, ranks_in_group,
                                   ParallelMode.PIPELINE_PREV)))
        return dist_settings
colossalai/context/process_group_initializer/initializer_sequence.py
0 → 100644
View file @
404ecbdc
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
from
colossalai.registry
import
DIST_GROUP_INITIALIZER
from
.initializer_tensor
import
Initializer_Tensor
from
.process_group_initializer
import
ProcessGroupInitializer
from
..parallel_mode
import
ParallelMode
@DIST_GROUP_INITIALIZER.register_module
class Initializer_Sequence(ProcessGroupInitializer):
    """A ProcessGroupInitializer for sequence parallelism."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # sequence parallelism reuses the tensor-parallel group layout
        self._initializer = Initializer_Tensor(*args, **kwargs)

    def init_dist_group(self):
        """Delegate group creation to the tensor-parallel initializer,
        relabelling the resulting mode as SEQUENCE.

        :return: (local_rank, group_world_size, process_group, ranks_in_group, mode)
        :rtype: tuple
        """
        local_rank, group_world_size, process_group, ranks_in_group, _ = \
            self._initializer.init_dist_group()
        return (local_rank, group_world_size, process_group,
                ranks_in_group, ParallelMode.SEQUENCE)
colossalai/context/process_group_initializer/initializer_tensor.py
0 → 100644
View file @
404ecbdc
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import
torch.distributed
as
dist
from
colossalai.registry
import
DIST_GROUP_INITIALIZER
from
.process_group_initializer
import
ProcessGroupInitializer
from
..parallel_mode
import
ParallelMode
@DIST_GROUP_INITIALIZER.register_module
class Initializer_Tensor(ProcessGroupInitializer):
    """A ProcessGroupInitializer for tensor parallelism."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # number of independent tensor-parallel groups across the whole job
        self.num_tensor_parallel_group = self.world_size // self.tensor_parallel_size

    def init_dist_group(self):
        """Initialize tensor parallel groups, and assign local_ranks and groups to each gpu.

        :return: tensor parallelism's information
        :rtype: tuple(local_rank, group_world_size, process_group, ranks_in_group, mode)
        """
        local_rank = group_world_size = process_group = ranks_in_group = None
        mode = ParallelMode.TENSOR

        # collective: every rank participates in every dist.new_group call
        for group_idx in range(self.num_tensor_parallel_group):
            base = group_idx * self.tensor_parallel_size
            ranks = [base + offset for offset in range(self.tensor_parallel_size)]
            group = dist.new_group(ranks)

            if self.rank in ranks:
                local_rank = ranks.index(self.rank)
                group_world_size = len(ranks)
                process_group = group
                ranks_in_group = ranks

        return local_rank, group_world_size, process_group, ranks_in_group, mode
colossalai/context/process_group_initializer/process_group_initializer.py
0 → 100644
View file @
404ecbdc
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
from
abc
import
ABC
,
abstractmethod
from
colossalai.context
import
Config
class ProcessGroupInitializer(ABC):
    '''An object, knowing the parallelism configuration, that initializes parallel groups.

    Subclasses implement :meth:`init_dist_group` and return one tuple
    (local_rank, group_world_size, process_group, ranks_in_group, mode)
    per group created — or a list of such tuples.

    :param rank: global rank of the current process
    :param world_size: total number of processes
    :param config: the parallelism configuration object
    :param data_parallel_size: size of a data-parallel group
    :param pipeline_parlalel_size: size of a pipeline-parallel group
        (NOTE(review): parameter name is misspelled but kept — renaming
        would break keyword callers; the attribute is stored under the
        correctly spelled ``pipeline_parallel_size``)
    :param tensor_parallel_size: size of a tensor-parallel group
    '''
    def __init__(self,
                 rank: int,
                 world_size: int,
                 config: Config,
                 data_parallel_size: int,
                 pipeline_parlalel_size: int,
                 tensor_parallel_size: int):
        self.rank = rank
        self.world_size = world_size
        self.data_parallel_size = data_parallel_size
        self.config = config
        self.pipeline_parallel_size = pipeline_parlalel_size
        self.tensor_parallel_size = tensor_parallel_size
        super().__init__()

    @abstractmethod
    def init_dist_group(self):
        '''Create the process groups for this parallel mode (abstract).'''
        pass
colossalai/context/random/__init__.py
0 → 100644
View file @
404ecbdc
from
._helper
import
(
seed
,
set_mode
,
with_seed
,
add_seed
,
get_seeds
,
get_states
,
get_current_mode
,
set_seed_states
,
sync_states
)
# Public API of colossalai.context.random — re-exported from ._helper.
__all__ = [
    'seed', 'set_mode', 'with_seed', 'add_seed', 'get_seeds',
    'get_states', 'get_current_mode', 'set_seed_states', 'sync_states'
]
colossalai/context/random/_helper.py
0 → 100644
View file @
404ecbdc
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import
functools
from
contextlib
import
contextmanager
import
torch.cuda
from
torch
import
Tensor
from
.seed_manager
import
SeedManager
from
..parallel_mode
import
ParallelMode
# Module-level singleton through which every helper below manipulates RNG state.
_SEED_MANAGER = SeedManager()
def get_seeds():
    """Returns the seeds of the seed manager.

    :return: The seeds of the seed manager
    :rtype: dict
    """
    return _SEED_MANAGER.seeds
def get_states(copy=False):
    """Returns the seed states of the seed manager.

    :param copy: if True, return cloned copies of the states instead of the
        live dict held by the manager
    :return: The seed states of the seed manager
    :rtype: dict
    """
    states = _SEED_MANAGER.seed_states
    if not copy:
        return states
    # clone each state tensor so callers cannot mutate the manager's copies
    return {parallel_mode: state.clone() for parallel_mode, state in states.items()}
def get_current_mode():
    """Returns the current mode of the seed manager.

    :return: The current mode of the seed manager.
    :rtype: :class:`torch.ByteTensor`
    """
    # NOTE(review): the declared rtype looks wrong — SeedManager.current_mode
    # appears to hold a ParallelMode, not a tensor; confirm against SeedManager.
    return _SEED_MANAGER.current_mode
def add_seed(parallel_mode: ParallelMode, seed: int):
    """Adds a seed to the seed manager for `parallel_mode`.

    :param parallel_mode: The chosen parallel mode
    :type parallel_mode: :class:`colossalai.context.ParallelMode`
    :param seed: The seed to be added
    :type seed: int
    :raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance of
        :class:`colossalai.context.ParallelMode` or the seed for `parallel_mode` has been added
    """
    _SEED_MANAGER.add_seed(parallel_mode, seed)
def set_mode(parallel_mode: ParallelMode):
    """Sets the current mode of the seed manager.

    :param parallel_mode: The chosen parallel mode
    :type parallel_mode: :class:`colossalai.context.ParallelMode`
    """
    _SEED_MANAGER.set_mode(parallel_mode)
def set_seed_states(parallel_mode: ParallelMode, state: Tensor):
    """Sets the state of the seed manager for `parallel_mode`.

    :param parallel_mode: The chosen parallel mode
    :type parallel_mode: :class:`colossalai.context.ParallelMode`
    :param state: the state to be set
    :type state: :class:`torch.Tensor`
    :raises AssertionError: Raises an AssertionError if `parallel_mode` is not found in the seed manager
    """
    _SEED_MANAGER.set_state(parallel_mode, state)
def sync_states():
    """Capture the current CUDA RNG state and store it under the current mode."""
    mode = get_current_mode()
    set_seed_states(mode, torch.cuda.get_rng_state())
@contextmanager
def seed(parallel_mode: ParallelMode):
    """A context for seed switch: the given mode's seed is active inside the
    `with` block and the previous mode is restored on exit, even on error.

    Examples::

        with seed(ParallelMode.DATA):
            output = F.dropout(input)
    """
    previous_mode = _SEED_MANAGER.current_mode
    try:
        # switch to the requested mode for the duration of the block;
        # set_mode returns None, so the `as` target (if any) is None
        yield _SEED_MANAGER.set_mode(parallel_mode)
    finally:
        # always restore the mode that was active on entry
        _SEED_MANAGER.set_mode(previous_mode)
def with_seed(func, parallel_mode: ParallelMode):
    """
    A function wrapper which executes the function with a specified seed.

    :param func: The function to be wrapped
    :param parallel_mode: The parallel mode whose seed is active while
        `func` runs
    :type parallel_mode: :class:`colossalai.context.ParallelMode`
    :return: The wrapped function

    Examples::

        def forward(input):
            return F.dropout(input)

        wrapped_forward = with_seed(forward, ParallelMode.DATA)
        out = wrapped_forward(input)
    """

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # switch to the target seed mode
        current_mode = _SEED_MANAGER.current_mode
        _SEED_MANAGER.set_mode(parallel_mode)
        try:
            # exec func under the target mode
            out = func(*args, **kwargs)
        finally:
            # recover the previous mode even if `func` raises, so an
            # exception cannot leave the global RNG mode switched
            _SEED_MANAGER.set_mode(current_mode)
        return out

    return wrapper
colossalai/context/random/seed_manager.py
0 → 100644
View file @
404ecbdc
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import
torch
from
torch
import
Tensor
from
colossalai.context.parallel_mode
import
ParallelMode
class SeedManager:
    """This class is a manager of all random seeds involved in the system.

    It keeps one saved CUDA RNG state per :class:`ParallelMode` so each
    parallel group draws random numbers from its own reproducible stream,
    and swaps the device RNG state when the active mode changes.
    """

    def __init__(self):
        # the parallel mode whose RNG state is currently loaded on the
        # device; None until set_mode is first called
        self._current_mode = None
        # parallel mode -> integer seed registered via add_seed
        self._seeds = dict()
        # parallel mode -> last saved CUDA RNG state tensor
        self._seed_states = dict()

    @property
    def current_mode(self):
        # currently active parallel mode (read-only view)
        return self._current_mode

    @property
    def seeds(self):
        # mapping of parallel mode to the seed it was initialised with
        return self._seeds

    @property
    def seed_states(self):
        # mapping of parallel mode to its saved CUDA RNG state
        return self._seed_states

    def set_state(self, parallel_mode: ParallelMode, state: Tensor):
        """Sets the state of the seed manager for `parallel_mode`.

        :param parallel_mode: The chosen parallel mode
        :type parallel_mode: :class:`colossalai.context.ParallelMode`
        :param state: the state to be set
        :type state: :class:`torch.Tensor`
        :raises AssertionError: Raises an AssertionError if `parallel_mode` is not found in the seed manager
        """
        assert parallel_mode in self._seed_states, f'Parallel mode {parallel_mode} is not found in the seed manager'
        self._seed_states[parallel_mode] = state

    def set_mode(self, parallel_mode: ParallelMode):
        """Sets the current mode of the seed manager.

        Saves the device RNG state under the outgoing mode, then loads the
        stored state of `parallel_mode` onto the device.

        :param parallel_mode: The chosen parallel mode
        :type parallel_mode: :class:`colossalai.context.ParallelMode`
        """
        if self.current_mode:
            # save the current state for current mode; skipped on the very
            # first call, when no mode has been set yet
            self._seed_states[self._current_mode] = torch.cuda.get_rng_state()

        # set the new state for new mode
        self._current_mode = parallel_mode
        torch.cuda.set_rng_state(self._seed_states[parallel_mode])

    def add_seed(self, parallel_mode: ParallelMode, seed: int):
        """Adds a seed to the seed manager for `parallel_mode`.

        Seeds the device RNG, captures the resulting state for later use by
        :meth:`set_mode`, and then restores the RNG state that was active
        before this call, so registering a seed has no visible side effect.

        :param parallel_mode: The chosen parallel mode
        :type parallel_mode: :class:`colossalai.context.ParallelMode`
        :param seed: The seed to be added
        :type seed: int
        :raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance of
            :class:`colossalai.context.ParallelMode` or the seed for `parallel_mode` has been added
        """
        assert isinstance(parallel_mode, ParallelMode), 'A valid ParallelMode must be provided'
        assert parallel_mode not in self._seed_states, f'The seed for {parallel_mode} has been added'
        # remember the live device state so it can be restored afterwards
        current_state = torch.cuda.get_rng_state()
        torch.cuda.manual_seed(seed)
        # snapshot the freshly seeded state for this mode
        self._seed_states[parallel_mode] = torch.cuda.get_rng_state()
        self._seeds[parallel_mode] = seed
        # leave the device RNG exactly as it was before this call
        torch.cuda.set_rng_state(current_state)
colossalai/core.py
0 → 100644
View file @
404ecbdc
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
from
colossalai.context
import
ParallelContext
# Process-global ParallelContext singleton; other modules access it as
# `from colossalai.core import global_context as gpc`.
global_context = ParallelContext()
def set_global_context(context: ParallelContext):
    '''Reset global context to be identical to a given :class:`ParallelContext`.

    :param context: Parallel context to generate our global parallel context.
    :type context: ParallelContext
    '''
    global global_context
    global_context = context
colossalai/engine/__init__.py
0 → 100644
View file @
404ecbdc
from
.amp_type
import
AMP_TYPE
from
._base_engine
import
Engine
from
.gradient_handler
import
*
from
.schedule
import
*
__all__
=
[
'Engine'
]
colossalai/engine/_base_engine.py
0 → 100644
View file @
404ecbdc
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
from
typing
import
Optional
from
colossalai.builder
import
build_gradient_handler
from
colossalai.context
import
ParallelMode
from
colossalai.core
import
global_context
as
gpc
from
colossalai.logging
import
get_global_dist_logger
from
colossalai.nn
import
(
ZeroRedundancyOptimizer_Level_2
,
ZeroRedundancyOptimizer_Level_3
)
from
torch.nn
import
Module
from
torch.nn.modules.loss
import
_Loss
from
torch.optim
import
Optimizer
from
torch.optim.lr_scheduler
import
_LRScheduler
from
torch.utils.data
import
DataLoader
from
.schedule
import
BaseSchedule
,
NoPipelineSchedule
class Engine:
    """Basic engine class for training and evaluation. It runs a specific process method
    :meth:`step` which is based on the given :attr:`schedule` over each batch of a dataset.

    :param train_dataloader: Dataloader in training
    :param test_dataloader: Dataloader in evaluation
    :param model: The neural network model
    :param criterion: Criterion for calculating loss
    :param optimizer: Optimizer for updating the parameters
    :param lr_scheduler: Learning rate scheduler adjusting learning rate during the training or evaluation
    :param schedule: Running schedule in :meth:`step`
    :type train_dataloader: DataLoader, optional
    :type test_dataloader: DataLoader, optional
    :type model: Module
    :type criterion: _Loss, optional
    :type optimizer: Optimizer, optional
    :type lr_scheduler: _LRScheduler, optional
    :type schedule: BaseSchedule, optional
    """

    def __init__(self,
                 train_dataloader: Optional[DataLoader] = None,
                 test_dataloader: Optional[DataLoader] = None,
                 model: Module = None,
                 criterion: _Loss = None,
                 optimizer: Optimizer = None,
                 lr_scheduler: Optional[_LRScheduler] = None,
                 schedule: BaseSchedule = None):
        self.train_dataloader = train_dataloader
        self.test_dataloader = test_dataloader
        assert model is not None, "Engine requires a model"
        self.model = model
        self.criterion = criterion
        self.optimizer = optimizer
        self.lr_scheduler = lr_scheduler
        # fall back to a non-pipelined schedule when none is supplied
        self.schedule = schedule if schedule is not None \
            else NoPipelineSchedule()
        self._logger = get_global_dist_logger()

        # build gradient handler: an explicit config entry wins; otherwise a
        # handler is inferred from the optimizer type or the data-parallel size
        self._gradient_handlers = []
        gradient_handler_cfg = []

        if hasattr(gpc.config, 'gradient_handler'):
            # user-provided handler configuration takes precedence
            assert isinstance(gpc.config.gradient_handler, list), \
                f'argument gradient_handler_cfg expected type list, ' \
                f'but got type {type(gpc.config.gradient_handler)}'
            gradient_handler_cfg = gpc.config.gradient_handler
        elif isinstance(self.optimizer, (ZeroRedundancyOptimizer_Level_2,
                                         ZeroRedundancyOptimizer_Level_3)):
            # ZeRO optimizers require their dedicated gradient handler
            gradient_handler_cfg = [dict(type='ZeROGradientHandler')]
            self._logger.info(
                "Training with zero is detected, ZeROGradientHandler is automatically "
                "added even though not specified in the configuration",
                ranks=[0])
        elif gpc.is_initialized(ParallelMode.DATA) and gpc.get_world_size(ParallelMode.DATA) > 1:
            # plain data parallelism: all-reduce gradients across the DATA group
            gradient_handler_cfg = [dict(type='DataParallelGradientHandler')]
            self._logger.info(
                "Data parallel training is detected, DataParallelGradientHandler is automatically "
                "added even though not specified in the configuration",
                ranks=[0])
        if len(gradient_handler_cfg) == 0:
            self._logger.warning(
                "No gradient handler is set up, please make sure you do not need "
                "to all-reduce the gradients after a training step.",
                ranks=[0])
        for cfg in gradient_handler_cfg:
            handler = build_gradient_handler(cfg, self.model, self.optimizer)
            self._gradient_handlers.append(handler)

        self.schedule.initialize(self.train_dataloader, self.model,
                                 self.criterion, self.optimizer,
                                 self.lr_scheduler)
        # forward_only == True means evaluation mode (no backward / no update)
        self.forward_only = False

    def handle_gradient(self):
        """Handles all-reduce operations of gradients across different parallel groups.
        """
        for handler in self._gradient_handlers:
            handler.handle_gradient()

    def set_dataloader(self, data: DataLoader, train: bool = True):
        """Sets dataloader in training or evaluation.

        :param data: Dataloader to be set
        :param train: Set training dataloader if True, otherwise evaluation dataloader
        :type data: DataLoader
        :type train: bool
        """
        if train:
            self.train_dataloader = data
        else:
            self.test_dataloader = data

    def get_model(self):
        """Returns the neural network model in the engine.
        """
        return self.model

    def get_optimizer(self):
        """Returns the optimizer in the engine.
        """
        return self.optimizer

    def get_lr_scheduler(self):
        """Returns the learning rate scheduler in the engine.
        """
        return self.lr_scheduler

    def train(self):
        """Sets the engine (and its schedule) to training mode.
        """
        self.forward_only = False
        self.schedule.train(dataloader=self.train_dataloader, mode=True)

    def eval(self):
        """Sets the engine (and its schedule) to evaluation mode.
        """
        self.forward_only = True
        self.schedule.train(dataloader=self.test_dataloader, mode=False)

    def is_train(self):
        """Returns True if it is in training, otherwise False.
        """
        return not self.forward_only

    def get_lr(self):
        """Gets current learning rate.
        """
        return self.schedule.get_lr()

    def step(self, return_loss=True):
        """A running step based on the schedule. Usually, it runs a training or
        evaluation over a batch of dataset.

        :param return_loss: loss will be returned if True
        :type return_loss: bool
        :return: (output, label, loss)
        """
        self.schedule.zero_grad(forward_only=self.forward_only)

        output, label, loss = self.schedule.forward_backward_step(
            forward_only=self.forward_only, return_loss=return_loss)

        if not self.forward_only:
            # all reduce gradients before the optimizer update
            self.handle_gradient()
            self.schedule.step()

        return output, label, loss
colossalai/engine/amp_type.py
0 → 100644
View file @
404ecbdc
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
from
enum
import
Enum
class AMP_TYPE(Enum):
    """Enumeration of the supported automatic mixed precision (AMP) backends."""
    APEX = 'apex'
    TORCH = 'torch'
    PARALLEL = 'parallel'
colossalai/engine/gradient_handler/__init__.py
0 → 100644
View file @
404ecbdc
from
._base_gradient_handler
import
BaseGradientHandler
from
._data_parallel_gradient_handler
import
DataParallelGradientHandler
from
._zero_gradient_handler
import
ZeROGradientHandler
__all__
=
[
'BaseGradientHandler'
,
'DataParallelGradientHandler'
,
'ZeROGradientHandler'
]
colossalai/engine/gradient_handler/_base_gradient_handler.py
0 → 100644
View file @
404ecbdc
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
from
abc
import
ABC
,
abstractmethod
class BaseGradientHandler(ABC):
    """A basic helper class to handle all-reduce operations of gradients across different parallel groups
    before optimization.

    :param model: Model where the gradients accumulate
    :param optimizer: Optimizer for updating the parameters
    :type model: Module
    :type optimizer: Optimizer
    """

    def __init__(self, model, optimizer):
        # kept for subclasses, which read gradients from the model's
        # parameters and/or delegate the reduction to the optimizer
        self._model = model
        self._optimizer = optimizer

    @abstractmethod
    def handle_gradient(self):
        """A method to accumulate gradients across different parallel groups. Users should
        write their own functions or just use the functions in pre-defined subclasses.
        """
        pass
colossalai/engine/gradient_handler/_data_parallel_gradient_handler.py
0 → 100644
View file @
404ecbdc
#!/usr/bin/env python
import
torch.distributed
as
dist
from
torch._utils
import
_flatten_dense_tensors
,
_unflatten_dense_tensors
from
colossalai.core
import
global_context
as
gpc
from
colossalai.registry
import
GRADIENT_HANDLER
from
._base_gradient_handler
import
BaseGradientHandler
from
...context.parallel_mode
import
ParallelMode
@GRADIENT_HANDLER.register_module
class DataParallelGradientHandler(BaseGradientHandler):
    """A helper class to handle all-reduce operations in a data parallel group.
    A all-reduce collective communication will be operated in
    :func:`handle_gradient` among a data parallel group.
    For better performance, it bucketizes the gradients of all parameters that are
    the same type to improve the efficiency of communication.
    """

    def handle_gradient(self):
        """A method running a all-reduce operation in a data parallel group.
        """
        # TODO: add memory buffer
        if gpc.data_parallel_size > 1:
            # bucketize and all-reduce
            buckets = {}
            # Pack the buckets: group parameters by tensor type so each
            # bucket can later be flattened into one contiguous tensor.
            for param in self._model.parameters():
                if param.requires_grad and param.grad is not None:
                    tp = param.data.type()
                    if tp not in buckets:
                        buckets[tp] = []
                    buckets[tp].append(param)
                    # NOTE(review): aliases the gradient as `main_grad`;
                    # presumably consumed by mixed-precision code elsewhere
                    # — confirm before relying on it
                    param.main_grad = param.grad

            # For each bucket, all-reduce and copy all-reduced grads.
            for tp in buckets:
                bucket = buckets[tp]
                grads = [param.grad.data for param in bucket]
                # flatten to a single tensor: one collective call per bucket
                coalesced = _flatten_dense_tensors(grads)
                # pre-divide so the summed all-reduce yields the mean gradient
                coalesced /= gpc.get_world_size(ParallelMode.DATA)
                dist.all_reduce(coalesced, group=gpc.get_group(ParallelMode.DATA))
                # scatter the reduced values back into the original grads
                for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
                    buf.copy_(synced)
colossalai/engine/gradient_handler/_zero_gradient_handler.py
0 → 100644
View file @
404ecbdc
from
colossalai.registry
import
GRADIENT_HANDLER
from
._base_gradient_handler
import
BaseGradientHandler
@GRADIENT_HANDLER.register_module
class ZeROGradientHandler(BaseGradientHandler):
    """A helper class to handle all-reduce operations in a data parallel group.
    A all-reduce collective communication will be operated in
    :func:`handle_gradient` among a data parallel group.
    This class is specialized with ZeRO optimization.
    """

    def handle_gradient(self):
        """A method running a all-reduce operation in a data parallel group.
        """
        # the ZeRO optimizer owns the gradient partitions, so the
        # reduction is delegated to it rather than done here
        self._optimizer.allreduce_gradients()
Prev
1
2
3
4
5
6
…
21
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment