OpenDAS / ColossalAI · Commits

Commit 56ff1921 (Unverified)
[NFC] polish colossalai/context/moe_context.py code style (#2693)
Authored Feb 14, 2023 by LuGY; committed by GitHub on Feb 14, 2023
Parent: 1712da28

Showing 1 changed file with 129 additions and 129 deletions:
colossalai/context/moe_context.py (+129 / −129)
colossalai/context/moe_context.py @ 56ff1921
Removed lines (−129): the previous revision of the file. As this is a non-functional (NFC) code-style commit, the removed lines carry the same content as the added lines below; the visible difference is the import ordering, which previously was:

import torch
import torch.distributed as dist
from colossalai.context.parallel_mode import ParallelMode
from colossalai.context.singleton_meta import SingletonMeta
from colossalai.tensor import ProcessGroup
from typing import Tuple
Added lines (+129): the file as of this commit:

from typing import Tuple

import torch
import torch.distributed as dist

from colossalai.context.parallel_mode import ParallelMode
from colossalai.context.singleton_meta import SingletonMeta
from colossalai.tensor import ProcessGroup


def _check_sanity():
    from colossalai.core import global_context as gpc
    if gpc.tensor_parallel_size > 1 or gpc.pipeline_parallel_size > 1:
        raise NotImplementedError("MoE is not compatible with tensor or "
                                  "pipeline parallelism at present.")


class MoeParallelInfo:
    """MoE parallelism information, storing parallel sizes and groups."""

    def __init__(self, ep_size: int, dp_size: int):
        _check_sanity()
        self.ep_size = ep_size
        self.dp_size = dp_size
        self.pg = ProcessGroup(tp_degree=ep_size, dp_degree=dp_size)
        self.ep_group = self.pg.tp_process_group()
        self.dp_group = self.pg.dp_process_group()


class MoeContext(metaclass=SingletonMeta):
    """MoE parallel context manager. This class manages the different
    parallel groups in the MoE context and the MoE loss in training.
    """

    def __init__(self):
        self.world_size = 1
        # Users may want to set the maximum expert parallel size smaller than the world size,
        # since very low bandwidth across nodes may constrain the performance of MoE.
        # A maximum expert parallel size naturally implies a minimum data parallel size.
        self.max_ep_size = 1
        self.min_dp_size = 1
        self.aux_loss = None
        self.use_kernel_optim = True

        self.has_setup = False
        self._parallel_info_dict = dict()

    @property
    def parallel_info_dict(self):
        return self._parallel_info_dict

    @property
    def is_initialized(self):
        return self.has_setup

    def setup(self, seed: int, use_kernel_optim: bool = True):
        assert not self.is_initialized, "MoE distributed context shouldn't be set up again"
        _check_sanity()
        assert torch.cuda.is_available(), "MoE requires CUDA to be available"

        self.world_size = dist.get_world_size()

        from colossalai.core import global_context as gpc
        self.max_ep_size = gpc.config.get('max_ep_size', self.world_size)
        assert self.world_size % self.max_ep_size == 0, \
            "Maximum expert parallel size must be a factor of the number of GPUs"
        self.min_dp_size = self.world_size // self.max_ep_size

        # Enabling kernel optimization may raise errors in some cases;
        # users can disable kernel optimization manually.
        self.use_kernel_optim = use_kernel_optim

        from .random import moe_set_seed
        moe_set_seed(seed)
        self.has_setup = True

    def get_info(self, num_experts: int) -> Tuple[int, MoeParallelInfo]:
        """Calculate the data parallel group and expert parallel group.

        Parameters
        ----------
        num_experts : int
            The number of experts.

        Returns
        -------
        int, MoeParallelInfo
            The number of local experts and the MoeParallelInfo of the current ep_size.
        """

        gt_flag = num_experts % self.max_ep_size == 0    # num_experts is a multiple of max_ep_size
        lt_flag = self.max_ep_size % num_experts == 0    # max_ep_size is a multiple of num_experts
        assert gt_flag or lt_flag, "Automatic expert placement requires the expert number" \
                                   " to be a multiple of the ep size, or vice versa."

        # If the number of experts is greater than the maximum expert parallel size (a.k.a. ep_size),
        # there are multiple experts on each GPU and each GPU holds different experts,
        # so its data parallel size is 1.
        # Otherwise, there is only one expert on each GPU
        # and the data parallel size has to be calculated.
        dp_size = 1 if gt_flag else self.max_ep_size // num_experts
        ep_size = self.max_ep_size // dp_size

        # Calculate the number of experts on each GPU
        num_local_experts = 1 if lt_flag else num_experts // self.max_ep_size

        # Don't forget to multiply by the minimum data parallel size
        dp_size *= self.min_dp_size
        if ep_size not in self.parallel_info_dict:
            self.parallel_info_dict[ep_size] = MoeParallelInfo(ep_size, dp_size)

        return num_local_experts, self.parallel_info_dict[ep_size]

    def set_kernel_not_use(self):
        self.use_kernel_optim = False

    def reset_loss(self):
        self.aux_loss = 0

    def add_loss(self, loss):
        self.aux_loss += loss

    def get_loss(self):
        return self.aux_loss


MOE_CONTEXT = MoeContext()
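For readers skimming the diff, the expert-placement arithmetic in get_info is easiest to follow with concrete numbers. The following standalone sketch (not part of the commit) reproduces that arithmetic without any ColossalAI dependency; the helper name place_experts and the example sizes are hypothetical.

# Illustrative sketch: the placement arithmetic from MoeContext.get_info,
# reproduced standalone with hypothetical sizes.
def place_experts(num_experts: int, max_ep_size: int, min_dp_size: int):
    gt_flag = num_experts % max_ep_size == 0    # at least one expert per expert-parallel rank
    lt_flag = max_ep_size % num_experts == 0    # one expert spans several expert-parallel ranks
    assert gt_flag or lt_flag, "num_experts and max_ep_size must divide one another"

    dp_size = 1 if gt_flag else max_ep_size // num_experts
    ep_size = max_ep_size // dp_size
    num_local_experts = 1 if lt_flag else num_experts // max_ep_size
    dp_size *= min_dp_size                      # fold in the minimum data parallel size
    return num_local_experts, ep_size, dp_size

# Example: 16 GPUs with max_ep_size = 8, hence min_dp_size = 2.
print(place_experts(num_experts=4, max_ep_size=8, min_dp_size=2))     # -> (1, 4, 4)
print(place_experts(num_experts=16, max_ep_size=8, min_dp_size=2))    # -> (2, 8, 2)

In both cases ep_size * dp_size equals the world size of 16: with 4 experts each GPU holds one expert and each expert has 4 data-parallel replicas, while with 16 experts each GPU holds 2 experts and each expert has 2 replicas.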
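For completeness, a hedged usage sketch of the MOE_CONTEXT singleton exported at the bottom of the file. It assumes a ColossalAI launch has already initialized torch.distributed, that CUDA is available, and that gpc.config optionally carries 'max_ep_size'; the expert count of 8 and the seed are arbitrary, and every method called here is defined in the file above.

# Usage sketch under the assumptions stated above.
import torch

from colossalai.context.moe_context import MOE_CONTEXT

MOE_CONTEXT.setup(seed=42)                        # records world/ep sizes, seeds the MoE RNG; asserts if repeated
num_local_experts, info = MOE_CONTEXT.get_info(num_experts=8)
print(num_local_experts, info.ep_size, info.dp_size)

MOE_CONTEXT.reset_loss()                          # clear the auxiliary (router balance) loss accumulator
MOE_CONTEXT.add_loss(torch.tensor(0.1))           # MoE layers would report their balance loss like this
aux_loss = MOE_CONTEXT.get_loss()                 # the trainer folds this into the training objective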