OpenDAS / FastMoE · Commits

Commit 01d9b418, authored Feb 01, 2021 by Rick Ho

    split more tests

Parent: 22e1eb45

Showing 7 changed files with 224 additions and 7 deletions (+224, -7)
fmoe/functions.py          +0   -0
fmoe/layers.py             +1   -1
tests/test.sh              +6   -6
tests/test_all.sh          +26  -0
tests/test_dp.py           +33  -0
tests/test_numerical.py    +79  -0
tests/test_performance.py  +79  -0
fmoe/fmoe_functions.py → fmoe/functions.py (file moved)
fmoe/layers.py

-from .fmoe_functions import *
+from .functions import *
 import torch.nn as nn
 import torch.nn.functional as F
...
tests/dev_test.sh → tests/test.sh

...
@@ -2,6 +2,10 @@
 if [ ! -z $OMPI_COMM_WORLD_LOCAL_RANK ]
 then
     export CUDA_VISIBLE_DEVICES=$OMPI_COMM_WORLD_LOCAL_RANK
 fi
+if [ -z $MASTER_PORT ]
+then
+    export MASTER_ADDR=localhost
+    export MASTER_PORT=36666
+fi
...
@@ -18,9 +22,5 @@ mkdir -p logs
 SCRIPT_PATH=$(dirname $(dirname $(realpath $0)))
 export PYTHONPATH=$SCRIPT_PATH:$SCRIPT_PATH/build/lib.linux-x86_64-3.7:$PYTHONPATH
 export LD_LIBRARY_PATH=/home/laekov/.local/lib/python3.7/site-packages/torch/lib:$LD_LIBRARY_PATH
-if [ -z $1 ]
-then
-    python3 tests/moe_test.py 2>logs/$RANK.log
-else
-    python3 $@ 2>logs/$RANK.log
-fi
+exec python3 $@ 2>logs/$RANK.log
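With this change the script becomes a generic launcher: it picks a GPU from OMPI_COMM_WORLD_LOCAL_RANK, fills in MASTER_ADDR/MASTER_PORT defaults, sets up PYTHONPATH and LD_LIBRARY_PATH, and execs whatever Python test it is handed. A usage sketch, assuming the repository root as the working directory (the same way tests/test_all.sh below drives it):

    tests/test.sh tests/test_dp.py
    mpirun -n 2 tests/test.sh tests/test_numerical.py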
tests/test_all.sh (new file, mode 100755)
#!/bin/bash

runtest () {
    echo Testing $@
    $@
    if [ $? = 0 ]
    then
        echo '----------------- Passed'
    else
        echo '----------------- Failed'
        exit
    fi
}

if [ ! -z $1 ]
then
    runtest $@
    exit
fi

TEST_SCRIPT=$(dirname $(realpath $0))/test.sh

runtest $TEST_SCRIPT tests/test_numerical.py
runtest mpirun -n 2 $TEST_SCRIPT tests/test_numerical.py
runtest $TEST_SCRIPT tests/test_dp.py
runtest $TEST_SCRIPT tests/test_performance.py
runtest $TEST_SCRIPT mpirun -n 2 tests/test_performance.py
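runtest wraps each command with a pass/fail banner and aborts the suite on the first failure; called with no arguments the script walks the list above, and called with arguments it forwards them to runtest so a single case can be exercised in isolation. A usage sketch, assuming the repository root as the working directory and at least two GPUs for the mpirun cases:

    tests/test_all.sh                                    # run the whole suite
    tests/test_all.sh tests/test.sh tests/test_dp.py     # run one test through the same wrapper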
tests/test_dp.py (new file, mode 100644)
from moe import FMoE as MOELayer
from moe import BruteForceMoE as MOELayer_raw
import torch
from torch import nn
import sys
import os

n_devices = int(os.environ.get('N_GPUS', '2'))

def test_dp():
    torch.manual_seed(42)
    torch.cuda.manual_seed(42)
    batch_size = 6
    num_expert = 4
    in_feat = 2
    out_feat = 3

    inp = torch.rand(batch_size, in_feat).cuda()
    gate = torch.randint(low=0, high=num_expert,
            size=(batch_size,), requires_grad=False).cuda()

    print("data parallel of our MoE model")
    moe = MOELayer(num_expert, in_feat, out_feat).cuda()
    moe_dp = torch.nn.DataParallel(moe, device_ids=list(range(n_devices)))
    for i in range(5):
        output = moe_dp(inp, gate)
    print('Successful')

if __name__ == '__main__':
    test_dp()
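For comparison, the older combined test (removed from tests/test_numerical.py below) first pushed a plain nn.Linear through DataParallel before trying the MoE layer. A minimal sketch of that baseline pattern, assuming at least two visible GPUs, shows what the MoE test above relies on: DataParallel replicates the module on each listed device and splits every positional tensor argument (here both inp and gate) along dimension 0.

    import torch
    from torch import nn

    n_devices = 2                      # assumption: at least two visible GPUs
    inp = torch.rand(6, 2).cuda()      # batch of 6, feature size 2, on device 0

    linear = nn.Linear(2, 2).cuda()
    # DataParallel scatters inp into per-device chunks, runs a replica of
    # `linear` on each device, and gathers the outputs back onto device 0.
    linear_dp = nn.DataParallel(linear, device_ids=list(range(n_devices)))
    out = linear_dp(inp)
    print(out.shape)                   # torch.Size([6, 2])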
tests/moe_test.py → tests/test_numerical.py
...
@@ -2,96 +2,24 @@ from moe import FMoE as MOELayer
 from moe import BruteForceMoE as MOELayer_raw
 import torch
 from torch import nn
 import time
 import sys
 import os
-dev_name_default = 'cuda:0'
-
-def perf():
-    torch.manual_seed(42 + torch.distributed.get_rank())
-    torch.cuda.manual_seed(42 + torch.distributed.get_rank())
-    if len(sys.argv) == 6:
-        batch_size = int(sys.argv[2])
-        in_feat = int(sys.argv[3])
-        out_feat = int(sys.argv[4])
-        num_expert = int(sys.argv[5])
-    else:
-        batch_size = 4096
-        in_feat = 1024
-        out_feat = 4096
-        num_expert = 4
-    if torch.distributed.get_rank() == 0:
-        print('Performance test case bs {} {}x{} ne {}'.format(batch_size, in_feat, out_feat, num_expert))
-    if torch.distributed.get_world_size() > 1:
-        dev_name = 'cuda'
-    else:
-        dev_name = dev_name_default
-    inp = torch.rand(batch_size, in_feat).cuda(dev_name)
-    gate = torch.randint(low=0, high=num_expert * torch.distributed.get_world_size(),
-            size=(batch_size,), requires_grad=False).int().cuda(dev_name)
-    moe = MOELayer(num_expert, in_feat, out_feat, world_size).cuda(dev_name)
-    moe.train()
-    o = moe(inp, gate)
-    o = moe(inp, gate)
-    o = moe(inp, gate)
-    o = moe(inp, gate)
-    n_runs = 16
-    tott = 0.
-    backt = 0.
-    maxt = 0.
-    sqtot = 0.
-    for i in range(n_runs):
-        gate = torch.randint(low=0, high=num_expert * torch.distributed.get_world_size(),
-                size=(batch_size,), requires_grad=False).int().cuda(dev_name)
-        ts = time.time()
-        o = moe(inp, gate)
-        te = time.time()
-        loss = o.sum()
-        bts = time.time()
-        loss.backward()
-        bte = time.time()
-        tott += te - ts
-        sqtot += (te - ts) ** 2
-        maxt = max(maxt, te - ts)
-        backt = bte - bts
-    gflops = 2e-9 * n_runs * in_feat * out_feat * batch_size / tott
-    print('Time mean/max/stdev/back {:.3f} {:.3f} {:.3f} {:.3f} ms, {:.3f} GFLOPs'.format(
-        tott * 1e3 / n_runs, maxt * 1e3,
-        (sqtot / n_runs - (tott / n_runs) ** 2) * 1e3 / n_runs,
-        backt * 1e3 / n_runs, gflops))
-
-def test_module(moe, linear, inp, gate):
-    linear.zero_grad()
-    moe.zero_grad()
-    x = (linear(inp))
-    output = moe(x, gate)
-    y = output.mean()
-    y.backward()
-    return output, moe.weight.grad, linear.weight.grad, linear.bias.grad
 
 rank = None
 world_size = None
 
-def test():
+def test_moe():
+    def test_module(moe, linear, inp, gate):
+        linear.zero_grad()
+        moe.zero_grad()
+        x = (linear(inp))
+        output = moe(x, gate)
+        y = output.mean()
+        y.backward()
+        return output, moe.weight.grad, linear.weight.grad, linear.bias.grad
     torch.manual_seed(42 + rank)
     torch.cuda.manual_seed(42 + rank)
     batch_size = 4
...
@@ -138,30 +66,6 @@ def test():
     return
 
-def test_dp():
-    torch.manual_seed(42)
-    torch.cuda.manual_seed(42)
-    batch_size = 6
-    num_expert = 4
-    in_feat = 2
-    out_feat = 3
-    inp = torch.rand(batch_size, in_feat).cuda()
-    gate = torch.randint(low=0, high=num_expert,
-            size=(batch_size,), requires_grad=False).int().cuda()
-    print("data parallel of a nn.Linear model")
-    linear = nn.Linear(in_feat, in_feat).cuda()
-    linear_dp = torch.nn.DataParallel(linear, device_ids=[0, 1, 2])
-    output = linear_dp(inp)
-    print("successful!")
-    print("data parallel of our MoE model")
-    moe = MOELayer(num_expert, in_feat, out_feat).cuda()
-    moe_dp = torch.nn.DataParallel(moe, device_ids=[0, 1, 2])
-    for i in range(5):
-        output = moe_dp(inp, gate)
 
 if __name__ == '__main__':
     os.environ['RANK'] = os.environ.get('OMPI_COMM_WORLD_RANK', '0')
     os.environ['WORLD_SIZE'] = os.environ.get('OMPI_COMM_WORLD_SIZE', '1')
...
@@ -172,14 +76,4 @@ if __name__ == '__main__':
     else:
         rank = 0
         world_size = 1
-    if len(sys.argv) >= 2:
-        task = sys.argv[1]
-        print('Specificed task {}'.format(task))
-        if task == 'correctness':
-            test()
-        elif task == 'dp':
-            test_dp()
-        elif task == 'performance':
-            perf()
-        else:
-            test()
+    test_moe()
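The context collapsed above ("...") presumably mirrors the distributed bootstrap that tests/test_performance.py (next file) spells out in full. A minimal sketch of that plumbing, for reference; the OMPI variable mapping is taken from the diff, and the env:// defaults of init_process_group are standard PyTorch behavior:

    # mpirun exports OMPI_COMM_WORLD_RANK / OMPI_COMM_WORLD_SIZE; the tests map them
    # to the RANK / WORLD_SIZE variables torch.distributed expects, while test.sh
    # supplies MASTER_ADDR / MASTER_PORT defaults.
    import os
    import torch

    os.environ['RANK'] = os.environ.get('OMPI_COMM_WORLD_RANK', '0')
    os.environ['WORLD_SIZE'] = os.environ.get('OMPI_COMM_WORLD_SIZE', '1')
    os.environ.setdefault('MASTER_ADDR', 'localhost')
    os.environ.setdefault('MASTER_PORT', '36666')

    if int(os.environ['WORLD_SIZE']) > 1:
        # the default env:// init method reads the four variables set above
        torch.distributed.init_process_group(backend='nccl')
        rank = torch.distributed.get_rank()
    else:
        rank = 0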
tests/test_performance.py (new file, mode 100644)
from moe import FMoE as MOELayer
import torch
import time
import sys
import os

rank = None
world_size = None
dev_name_default = 'cuda:0'

def test_performance(batch_size, in_feat, out_feat, num_expert):
    torch.manual_seed(42 + rank)
    torch.cuda.manual_seed(42 + rank)
    if rank == 0:
        print('Performance test case bs {} {}x{} ne {}x{}'.format(
            batch_size, in_feat, out_feat, world_size, num_expert))
    if world_size > 1:
        dev_name = 'cuda'
    else:
        dev_name = dev_name_default

    inp = torch.rand(batch_size, in_feat).cuda(dev_name)
    gate = torch.randint(low=0, high=num_expert * world_size,
            size=(batch_size,), requires_grad=False).int().cuda(dev_name)
    moe = MOELayer(num_expert, in_feat, out_feat, world_size).cuda(dev_name)
    moe.train()

    # warm up
    for _ in range(4):
        _ = moe(inp, gate)

    n_runs = 16
    tott = 0.
    backt = 0.
    maxt = 0.
    sqtot = 0.
    for i in range(n_runs):
        gate = torch.randint(low=0, high=num_expert * world_size,
                size=(batch_size,), requires_grad=False).int().cuda(dev_name)
        ts = time.time()
        o = moe(inp, gate)
        te = time.time()

        loss = o.sum()

        bts = time.time()
        loss.backward()
        bte = time.time()

        tott += te - ts
        sqtot += (te - ts) ** 2
        maxt = max(maxt, te - ts)
        backt = bte - bts

    gflops = 2e-9 * n_runs * in_feat * out_feat * batch_size / tott
    print('Time mean/max/stdev/back {:.3f} {:.3f} {:.3f} {:.3f} ms, {:.3f} GFLOPs'.format(
        tott * 1e3 / n_runs, maxt * 1e3,
        (sqtot / n_runs - (tott / n_runs) ** 2) * 1e3 / n_runs,
        backt * 1e3 / n_runs, gflops))

if __name__ == '__main__':
    os.environ['RANK'] = os.environ.get('OMPI_COMM_WORLD_RANK', '0')
    os.environ['WORLD_SIZE'] = os.environ.get('OMPI_COMM_WORLD_SIZE', '1')
    if int(os.environ['WORLD_SIZE']) > 1:
        torch.distributed.init_process_group(backend='nccl')
        rank = torch.distributed.get_rank()
        world_size = torch.distributed.get_world_size()
    else:
        rank = 0
        world_size = 1
    test_performance(4096, 1024, 4096, 8)
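The GFLOPs figure printed at the end counts two floating-point operations (a multiply and an add) per (token, input feature, output feature) triple over all timed runs. A quick check of the arithmetic for the default test_performance(4096, 1024, 4096, 8) case, with a placeholder standing in for the measured forward time:

    # 2 FLOPs per multiply-accumulate in the expert matmul
    batch_size, in_feat, out_feat = 4096, 1024, 4096
    n_runs = 16

    flops_per_forward = 2 * batch_size * in_feat * out_feat   # ~3.44e10 FLOPs
    tott = 0.5                                                 # placeholder: total forward time in seconds
    gflops = 1e-9 * n_runs * flops_per_forward / tott          # same as 2e-9 * n_runs * in_feat * out_feat * batch_size / tott
    print(gflops)                                              # ~1100 GFLOPs for this placeholder timing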