OpenDAS / FastMoE / Commits

Commit d155925d, authored Mar 22, 2021 by Jiezhong Qiu

    Merge remote-tracking branch 'origin/master' into checkpoint

    Conflicts:
        fmoe/transformer.py

Parents: f49a9ec4, 98b4b0be

Showing 14 changed files with 109 additions and 20 deletions (+109, -20).
.github/ISSUE_TEMPLATE/bug_report.md       +31   -0
.github/ISSUE_TEMPLATE/feature_request.md  +20   -0
.gitignore                                  +1   -0
cuda/moe.cpp                               +16   -2
doc/release-note.md                        +16   -0
fmoe/distributed.py                         +2   -1
fmoe/functions.py                           +2   -1
fmoe/gates.py                               +2   -1
fmoe/layers.py                              +4   -2
fmoe/megatron.py                            +5   -4
fmoe/transformer.py                         +2   -4
setup.py                                    +1   -1
tests/benchmark_mlp.py                      +2   -2
tests/test.sh                               +5   -2
.github/ISSUE_TEMPLATE/bug_report.md (new file, mode 100644)

+---
+name: Bug report
+about: Create a report to help us improve
+title: ''
+labels: ''
+assignees: ''
+---
+
+**Describe the bug**
+A clear and concise description of what the bug is.
+
+**To Reproduce**
+Steps to reproduce the behavior:
+1. Compile with "..."
+2. Run "..." with "..." processes on "..." nodes
+
+**Expected behavior**
+A clear and concise description of what you expected to happen.
+
+**Logs**
+If applicable, add logs to help explain your problem.
+
+**Platform**
+- Device: [e.g. NVIDIA V100]
+- OS: [e.g. Debian 10.2 buster]
+- CUDA version: [e.g. 11.1]
+- NCCL version: [e.g. 2.7.8-1]
+
+**Additional context**
+Add any other context about the problem here.
.github/ISSUE_TEMPLATE/feature_request.md (new file, mode 100644)

+---
+name: Feature request
+about: Suggest an idea for this project
+title: ''
+labels: ''
+assignees: ''
+---
+
+**Is your feature request related to a problem? Please describe.**
+A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
+
+**Describe the solution you'd like**
+A clear and concise description of what you want to happen.
+
+**Describe alternatives you've considered**
+A clear and concise description of any alternative solutions or features you've considered.
+
+**Additional context**
+Add any other context or screenshots about the feature request here.
.gitignore

@@ -10,3 +10,4 @@ a.out
 build
 *swp
 logs
+dist
cuda/moe.cpp

@@ -117,15 +117,29 @@ public:
     ncclComm_t getcomm(at::Device dev) {
         auto key = std::to_string(dev.index());
 #ifdef ENABLE_NCCL_P2P_SUPPORT
-        auto v = getNCCLComm(key, {dev}, c10d::OpType::ALLTOALL);
+        ncclUniqueId ncclID;
+        int rank = getRank();
+        if (rank == 0) {
+            ncclGetUniqueId(&ncclID);
+        }
+        broadcastUniqueNCCLID(&ncclID,
+                c10d::OpType::SEND,
+                "fastmoe_nccl_comm",
+                rank);
+        ncclComm_t comm;
+        ncclCommInitRank(&comm, getSize(), ncclID, rank);
+        return comm;
 #else
         auto v = getNCCLComm(key, {dev});
-#endif
         if (v.size() == 0) {
             std::cerr << "PyTorch has nothing\n";
             return 0;
         }
+        int count;
+        ncclCommCount(v[0]->getNcclComm(), &count);
+        std::cerr << "PyTorch has " << v.size() << " comms, comm 0 size " << count << "\n";
         return v[0]->getNcclComm();
+#endif
     }
 };
 ...
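In short, when ENABLE_NCCL_P2P_SUPPORT is defined the new branch stops borrowing whatever communicator PyTorch has cached and builds a dedicated one: rank 0 generates an NCCL unique ID, broadcastUniqueNCCLID distributes it, and ncclCommInitRank joins every rank into the fresh communicator. A rough Python-side analogue of "give MoE traffic its own communicator" is a dedicated torch.distributed process group; this is an illustrative sketch, not FastMoE's actual API:

    # Hypothetical sketch: a separate process group plays the same role on the
    # Python side as the hand-built ncclComm_t above.
    import torch.distributed as dist

    dist.init_process_group(backend="nccl")      # default group for regular traffic
    moe_group = dist.new_group(backend="nccl")   # dedicated communicator for MoE exchange
    # collectives for expert exchange would then pass group=moe_group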
doc/release-note.md

+## v0.1.2
+### Compilation
+- Remove dependency on the CUDA examples repository.
+### Distributed
+- Fix a bug related to PyTorch v1.8.0. FastMoE can now operate on multiple
+  GPUs on multiple nodes with PyTorch v1.8.0.
+### Misc
+- Fix tons of typos.
+- Format the code.
+
 ## v0.1.1
 ### Distributed
 ...
fmoe/distributed.py

@@ -47,7 +47,8 @@ class DistributedGroupedDataParallel(nn.Module):
         else:
             self.comms["world"] = world_group

         def allreduce_params(no_scale=False, reduce_after=False, fp32_allreduce=False):
             groups = dict()
             for p in self.module.parameters():
                 if not p.requires_grad or p.grad is None:
 ...
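The visible lines of allreduce_params show the start of a bucketing pass over the module's parameters. A minimal single-process sketch of that pattern; grouping by dtype is an assumption for illustration, and only the skip condition is taken from the diff:

    # Sketch: bucket gradients so one all-reduce can be issued per bucket
    # instead of one per tensor (the bucketing key here is assumed).
    import torch
    import torch.nn as nn

    module = nn.Sequential(nn.Linear(4, 4), nn.Linear(4, 4))
    module(torch.randn(2, 4)).sum().backward()      # populate .grad

    groups = dict()
    for p in module.parameters():
        if not p.requires_grad or p.grad is None:   # same skip as in the diff
            continue
        groups.setdefault(p.dtype, []).append(p.grad)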
fmoe/functions.py

@@ -40,7 +40,8 @@ def moe_prepare_forward(gate, num_expert, world_size, comm=None):
         )
     else:
         global_expert_count = local_expert_count
     fwd_expert_count = global_expert_count.view(world_size, num_expert).sum(dim=0)
     fwd_batch_size = int(fwd_expert_count.sum().item())
     return (
         pos,
 ...
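The context lines here are worth unpacking: global_expert_count is a flat tensor of per-(worker, expert) token counts, so reshaping it to (world_size, num_expert) and summing over dim 0 gives how many tokens each local expert will receive. A small worked example with assumed values:

    # world_size=2, num_expert=2; counts are [w0e0, w0e1, w1e0, w1e1].
    import torch

    global_expert_count = torch.tensor([3, 1, 2, 4])
    fwd_expert_count = global_expert_count.view(2, 2).sum(dim=0)
    print(fwd_expert_count)                         # tensor([5, 5])
    fwd_batch_size = int(fwd_expert_count.sum().item())
    print(fwd_batch_size)                           # 10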
fmoe/gates.py

@@ -23,7 +23,8 @@ class ZeroGate(nn.Module):
         idx = torch.zeros(
             inp.shape[0] * self.top_k, dtype=torch.int64, device=inp.device
         )
         score = torch.ones(inp.shape[0] * self.top_k, device=inp.device) / self.top_k
         return idx, score.reshape(-1, 1, self.top_k)
 ...
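ZeroGate routes every token to expert 0 with a uniform score of 1/top_k, which makes the output shapes easy to check by hand. A toy run with assumed sizes:

    # batch=4, top_k=2: idx has one entry per (token, k) slot, all expert 0.
    import torch

    batch, top_k = 4, 2
    idx = torch.zeros(batch * top_k, dtype=torch.int64)
    score = torch.ones(batch * top_k) / top_k
    print(idx.shape)                                # torch.Size([8])
    print(score.reshape(-1, 1, top_k).shape)        # torch.Size([4, 1, 2])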
fmoe/layers.py

@@ -114,7 +114,8 @@ def _fmoe_general_global_forward(inp, gate, expert_fn, num_expert, world_size):
         fwd_batch_size,
     ) = moe_prepare_forward(gate, num_expert, world_size)
     x = MOEScatter.apply(
         inp, pos, local_expert_count, global_expert_count, fwd_batch_size, world_size
     )
     x = expert_fn(x, fwd_expert_count)
     x = MOEGather.apply(
 ...

@@ -165,7 +166,8 @@ class FMoE(nn.Module):
         self.top_k = top_k
         self.gate = gate(d_model, num_expert, world_size, top_k)
         if expert is not None:
             self.experts = nn.ModuleList([expert(d_model) for _ in range(num_expert)])
             self.experts_fused = False
         else:
             self.experts_fused = True
 ...
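The first hunk shows the three-phase core of _fmoe_general_global_forward: scatter tokens into expert-contiguous order (possibly across workers), run expert_fn on each contiguous slice, then gather the results back to the original token order. A single-worker, top_k=1 analogue using plain tensor ops (the real MOEScatter/MOEGather are custom autograd functions that also communicate across workers):

    import torch

    inp = torch.randn(6, 8)                         # 6 tokens, d_model=8
    gate = torch.tensor([1, 0, 1, 1, 0, 0])         # expert id per token
    pos = torch.argsort(gate)                       # group tokens by expert
    x = inp[pos]                                    # "scatter"
    counts = torch.bincount(gate, minlength=2)      # tokens per expert: [3, 3]
    x = x * 2                                       # stand-in for the expert slices
    out = torch.empty_like(inp)
    out[pos] = x                                    # "gather" back to input order
    assert torch.allclose(out, inp * 2)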
fmoe/megatron.py

@@ -45,7 +45,7 @@ def _megatron_init_method(self, rng, sigma):
     device = self.weight.device
     dtype = self.weight.dtype
     weight = rng.normal(loc=0.0, scale=sigma, size=tuple(self.weight.size()))
-    self.weight.data = torch.tensor(weight, dtype=dtype, device=device)
+    self.weight.data = torch.from_numpy(weight).to(dtype=dtype, device=device)

     if self.bias is not None:
         # Always initialize bias to zero.
 ...

@@ -64,13 +64,13 @@ def _random_init_weight(self, rng):
     device = self.weight.device
     dtype = self.weight.dtype
     weight = rng.uniform(-bound, bound, size=tuple(self.weight.size()))
-    self.weight.data = torch.tensor(weight, dtype=dtype, device=device)
+    self.weight.data = torch.from_numpy(weight).to(dtype=dtype, device=device)

     if self.bias is not None:
         fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.weight[0])
         bound = 1 / math.sqrt(fan_in)
         bias = rng.uniform(-bound, bound, size=tuple(self.bias.size()))
-        self.bias.data = torch.tensor(bias, dtype=dtype, device=device)
+        self.bias.data = torch.from_numpy(bias).to(dtype=dtype, device=device)

 class MegatronMLP(FMoETransformerMLP):
 ...

@@ -81,7 +81,8 @@ class MegatronMLP(FMoETransformerMLP):
     def __init__(self, args, group):
         assert (
             args.seq_length * args.micro_batch_size % args.tensor_model_parallel_size
             == 0
         ), "Batch size x sequence length should be multiple of mp size"
         if not args.distributed_experts:
 ...
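Both weight-init hunks make the same substitution: torch.tensor(ndarray, ...) becomes torch.from_numpy(ndarray).to(...). The diff doesn't state the motivation; a plausible reading is that from_numpy wraps the NumPy buffer without copying, so the .to() call performs the single required dtype/device conversion. A minimal before/after sketch:

    import numpy as np
    import torch

    rng = np.random.default_rng(0)
    weight = rng.normal(loc=0.0, scale=0.02, size=(4, 4))    # float64 ndarray
    old = torch.tensor(weight, dtype=torch.float32)          # copies eagerly
    new = torch.from_numpy(weight).to(dtype=torch.float32)   # wrap, then convert
    assert torch.equal(old, new)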
fmoe/transformer.py

@@ -15,10 +15,8 @@ class _Expert(nn.Module):
     def __init__(self, num_expert, d_model, d_hidden, activation, rank=0):
         super().__init__()
         self.htoh4 = FMoELinear(num_expert, d_model, d_hidden,
                                 bias=True, rank=rank)
         self.h4toh = FMoELinear(num_expert, d_hidden, d_model, bias=True, rank=rank)
         self.activation = activation

     def forward(self, inp, fwd_expert_count):
 ...
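For orientation, _Expert is the standard transformer feed-forward block (d_model → d_hidden → d_model), with both projections implemented by FMoELinear so that all experts are evaluated in one batched kernel. A plain-PyTorch stand-in for what a single expert computes, assuming the usual FFN ordering implied by the constructor above:

    import torch.nn as nn

    class ToyExpert(nn.Module):
        # One expert's FFN; FMoELinear evaluates num_expert of these at once.
        def __init__(self, d_model, d_hidden, activation=nn.GELU()):
            super().__init__()
            self.htoh4 = nn.Linear(d_model, d_hidden)
            self.h4toh = nn.Linear(d_hidden, d_model)
            self.activation = activation

        def forward(self, inp):
            return self.h4toh(self.activation(self.htoh4(inp)))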
setup.py

@@ -14,7 +14,7 @@ if os.environ.get('USE_NCCL', '0') == '1':
 if __name__ == '__main__':
     setuptools.setup(
         name='fastmoe',
-        version='0.1.1',
+        version='0.1.2',
         description='An efficient Mixture-of-Experts system for PyTorch',
         author='Jiaao He, Jiezhong Qiu and Aohan Zeng',
         author_email='hja20@mails.tsinghua.edu.cn',
 ...
tests/benchmark_mlp.py

+import torch
+import torch.nn as nn
 from fmoe import FMoETransformerMLP
 from fmoe.gates import NaiveGate
 from moe import BruteForceMoELinear
-import torch
-import torch.nn as nn
 import time
 import sys
 import os
 ...
tests/test.sh

@@ -25,7 +25,10 @@ PYTHON_VERSION=$($PYTHON_EXEC --version)
 PYTHON_REVISION=${PYTHON_VERSION:7:3}
 SCRIPT_PATH=$(dirname $(dirname $(realpath $0)))
+source ~/scripts/torch.env
 export PYTHONPATH=$SCRIPT_PATH:$SCRIPT_PATH/build/lib.linux-x86_64-$PYTHON_REVISION:$PYTHONPATH
-export LD_LIBRARY_PATH=/home/laekov/.local/lib/python$PYTHON_REVISION/site-packages/torch/lib:$LD_LIBRARY_PATH
-exec $PYTHON_EXEC $@ 2>logs/$RANK.log
+core0=$(expr $OMPI_COMM_WORLD_LOCAL_RANK \* 4)
+cores=$core0-$(expr $core0 + 3)
+exec numactl -C $cores $PYTHON_EXEC $@ 2>logs/$RANK.log
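The new exec line pins each MPI rank to a block of four cores via numactl: local rank r gets cores 4r through 4r+3. The arithmetic, restated in Python for clarity (four cores per process is what the script hardcodes):

    def core_range(local_rank: int, cores_per_proc: int = 4) -> str:
        # Mirrors core0=$(expr $OMPI_COMM_WORLD_LOCAL_RANK \* 4) and
        # cores=$core0-$(expr $core0 + 3) from the script.
        core0 = local_rank * cores_per_proc
        return f"{core0}-{core0 + cores_per_proc - 1}"

    assert core_range(0) == "0-3"
    assert core_range(2) == "8-11"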