Commit 01d9b418 authored by Rick Ho

split more tests

parent 22e1eb45
from .fmoe_functions import *
from .functions import *
import torch.nn as nn
import torch.nn.functional as F
@@ -2,6 +2,10 @@
if [ ! -z $OMPI_COMM_WORLD_LOCAL_RANK ]
then
    export CUDA_VISIBLE_DEVICES=$OMPI_COMM_WORLD_LOCAL_RANK
fi
if [ -z $MASTER_PORT ]
then
    export MASTER_ADDR=localhost
    export MASTER_PORT=36666
fi
@@ -18,9 +22,5 @@ mkdir -p logs
SCRIPT_PATH=$(dirname $(dirname $(realpath $0)))
export PYTHONPATH=$SCRIPT_PATH:$SCRIPT_PATH/build/lib.linux-x86_64-3.7:$PYTHONPATH
export LD_LIBRARY_PATH=/home/laekov/.local/lib/python3.7/site-packages/torch/lib:$LD_LIBRARY_PATH
if [ -z $1 ]
then
    python3 tests/moe_test.py 2>logs/$RANK.log
else
    python3 $@ 2>logs/$RANK.log
fi
exec python3 $@ 2>logs/$RANK.log
#!/bin/bash
runtest() {
    echo Testing $@
    $@
    if [ $? = 0 ]
    then
        echo '----------------- Passed'
    else
        echo '----------------- Failed'
        # exit non-zero so the failure is visible to the caller
        exit 1
    fi
}
if [ ! -z $1 ]
then
    runtest $@
    exit
fi
TEST_SCRIPT=$(dirname $(realpath $0))/test.sh
runtest $TEST_SCRIPT tests/test_numerical.py
runtest mpirun -n 2 $TEST_SCRIPT tests/test_numerical.py
runtest $TEST_SCRIPT tests/test_dp.py
runtest $TEST_SCRIPT tests/test_performance.py
runtest mpirun -n 2 $TEST_SCRIPT tests/test_performance.py
from moe import FMoE as MOELayer
from moe import BruteForceMoE as MOELayer_raw
import torch
from torch import nn
import sys
import os
n_devices = int(os.environ.get('N_GPUS', '2'))
def test_dp():
    torch.manual_seed(42)
    torch.cuda.manual_seed(42)
    batch_size = 6
    num_expert = 4
    in_feat = 2
    out_feat = 3
    inp = torch.rand(batch_size, in_feat).cuda()
    gate = torch.randint(low=0, high=num_expert, size=(batch_size, ),
            requires_grad=False).cuda()
    print("data parallel of our MoE model")
    moe = MOELayer(num_expert, in_feat, out_feat).cuda()
    moe_dp = torch.nn.DataParallel(moe, device_ids=list(range(n_devices)))
    for i in range(5):
        output = moe_dp(inp, gate)
    print('Successful')


if __name__ == '__main__':
    test_dp()
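
The replica count above is read from the N_GPUS environment variable (default 2), so the test adapts to however many devices the runner exposes. Purely as an illustration, and not part of the commit, one could extend test_dp with a shape check like the sketch below; it reuses MOELayer and n_devices from the file above and assumes the layer maps (batch_size, in_feat) inputs to (batch_size, out_feat) outputs, as the constructor arguments suggest.

def check_dp_output_shape():
    # hypothetical helper, reusing names defined in the test file above
    batch_size, num_expert, in_feat, out_feat = 6, 4, 2, 3
    inp = torch.rand(batch_size, in_feat).cuda()
    gate = torch.randint(low=0, high=num_expert, size=(batch_size, )).cuda()
    moe = MOELayer(num_expert, in_feat, out_feat).cuda()
    moe_dp = torch.nn.DataParallel(moe, device_ids=list(range(n_devices)))
    out = moe_dp(inp, gate)
    # DataParallel gathers the per-device outputs back into the full batch
    assert out.shape == (batch_size, out_feat)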
@@ -2,96 +2,24 @@ from moe import FMoE as MOELayer
from moe import BruteForceMoE as MOELayer_raw
import torch
from torch import nn
import time
import sys
import os
dev_name_default = 'cuda:0'
def perf():
    torch.manual_seed(42 + torch.distributed.get_rank())
    torch.cuda.manual_seed(42 + torch.distributed.get_rank())
    if len(sys.argv) == 6:
        batch_size = int(sys.argv[2])
        in_feat = int(sys.argv[3])
        out_feat = int(sys.argv[4])
        num_expert = int(sys.argv[5])
    else:
        batch_size = 4096
        in_feat = 1024
        out_feat = 4096
        num_expert = 4
    if torch.distributed.get_rank() == 0:
        print('Performance test case bs {} {}x{} ne {}'.format(batch_size,
            in_feat, out_feat, num_expert))
    if torch.distributed.get_world_size() > 1:
        dev_name = 'cuda'
    else:
        dev_name = dev_name_default
    inp = torch.rand(batch_size, in_feat).cuda(dev_name)
    gate = torch.randint(low=0,
            high=num_expert * torch.distributed.get_world_size(),
            size=(batch_size, ), requires_grad=False).int().cuda(dev_name)
    moe = MOELayer(num_expert, in_feat, out_feat, world_size).cuda(dev_name)
    moe.train()
    o = moe(inp, gate)
    o = moe(inp, gate)
    o = moe(inp, gate)
    o = moe(inp, gate)
    n_runs = 16
    tott = 0.
    backt = 0.
    maxt = 0.
    sqtot = 0.
    for i in range(n_runs):
        gate = torch.randint(low=0,
                high=num_expert * torch.distributed.get_world_size(),
                size=(batch_size, ), requires_grad=False).int().cuda(dev_name)
        ts = time.time()
        o = moe(inp, gate)
        te = time.time()
        loss = o.sum()
        bts = time.time()
        loss.backward()
        bte = time.time()
        tott += te - ts
        sqtot += (te - ts)**2
        maxt = max(maxt, te - ts)
        backt = bte - bts
    gflops = 2e-9 * n_runs * in_feat * out_feat * batch_size / tott
    print('Time mean/max/stdev/back {:.3f} {:.3f} {:.3f} {:.3f} ms, {:.3f} GFLOPs'.format(
        tott * 1e3 / n_runs, maxt * 1e3,
        (sqtot / n_runs - (tott / n_runs)**2) * 1e3 / n_runs,
        backt * 1e3 / n_runs, gflops))


def test_module(moe, linear, inp, gate):
    linear.zero_grad()
    moe.zero_grad()
    x = (linear(inp))
    output = moe(x, gate)
    y = output.mean()
    y.backward()
    return output, moe.weight.grad, linear.weight.grad, linear.bias.grad
rank = None
world_size = None
def test():
def test_moe():
    def test_module(moe, linear, inp, gate):
        linear.zero_grad()
        moe.zero_grad()
        x = (linear(inp))
        output = moe(x, gate)
        y = output.mean()
        y.backward()
        return output, moe.weight.grad, linear.weight.grad, linear.bias.grad
    torch.manual_seed(42 + rank)
    torch.cuda.manual_seed(42 + rank)
    batch_size = 4
@@ -138,30 +66,6 @@ def test():
    return
def test_dp():
    torch.manual_seed(42)
    torch.cuda.manual_seed(42)
    batch_size = 6
    num_expert = 4
    in_feat = 2
    out_feat = 3
    inp = torch.rand(batch_size, in_feat).cuda()
    gate = torch.randint(low=0, high=num_expert, size=(batch_size, ), requires_grad=False).int().cuda()
    print("data parallel of a nn.Linear model")
    linear = nn.Linear(in_feat, in_feat).cuda()
    linear_dp = torch.nn.DataParallel(linear, device_ids=[0, 1, 2])
    output = linear_dp(inp)
    print("successful!")
    print("data parallel of our MoE model")
    moe = MOELayer(num_expert, in_feat, out_feat).cuda()
    moe_dp = torch.nn.DataParallel(moe, device_ids=[0, 1, 2])
    for i in range(5):
        output = moe_dp(inp, gate)


if __name__ == '__main__':
    os.environ['RANK'] = os.environ.get('OMPI_COMM_WORLD_RANK', '0')
    os.environ['WORLD_SIZE'] = os.environ.get('OMPI_COMM_WORLD_SIZE', '1')
@@ -172,14 +76,4 @@ if __name__ == '__main__':
    else:
        rank = 0
        world_size = 1
    if len(sys.argv) >= 2:
        task = sys.argv[1]
        print('Specified task {}'.format(task))
        if task == 'correctness':
            test()
        elif task == 'dp':
            test_dp()
        elif task == 'performance':
            perf()
    else:
        test()
    test_moe()
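
Most of the body of test_moe is elided by this hunk; judging from the test_module helper above, which returns the MoE output plus the MoE and linear gradients, the hidden part presumably compares FMoE against the BruteForceMoE reference. The sketch below is a hypothetical helper showing what such a comparison could look like; it is an assumption about the elided code, not part of the commit.

def compare_against_reference(fast_out, raw_out, fast_grad, raw_grad, atol=1e-5):
    # hypothetical check: forward outputs and weight gradients should agree
    assert torch.allclose(fast_out, raw_out, atol=atol), 'forward mismatch'
    assert torch.allclose(fast_grad, raw_grad, atol=atol), 'gradient mismatch'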
from moe import FMoE as MOELayer
import torch
import time
import sys
import os
rank = None
world_size = None
dev_name_default = 'cuda:0'
def test_performance(batch_size, in_feat, out_feat, num_expert):
    torch.manual_seed(42 + rank)
    torch.cuda.manual_seed(42 + rank)
    if rank == 0:
        print('Performance test case bs {} {}x{} ne {}x{}'.format(
            batch_size, in_feat, out_feat, world_size, num_expert))
    if world_size > 1:
        dev_name = 'cuda'
    else:
        dev_name = dev_name_default
    inp = torch.rand(batch_size, in_feat).cuda(dev_name)
    gate = torch.randint(low=0,
            high=num_expert * world_size,
            size=(batch_size, ), requires_grad=False).int().cuda(dev_name)
    moe = MOELayer(num_expert, in_feat, out_feat, world_size).cuda(dev_name)
    moe.train()
    # warm up
    for _ in range(4):
        _ = moe(inp, gate)
    n_runs = 16
    tott = 0.
    backt = 0.
    maxt = 0.
    sqtot = 0.
    for i in range(n_runs):
        gate = torch.randint(low=0,
                high=num_expert * world_size,
                size=(batch_size, ), requires_grad=False).int().cuda(dev_name)
        ts = time.time()
        o = moe(inp, gate)
        te = time.time()
        loss = o.sum()
        bts = time.time()
        loss.backward()
        bte = time.time()
        tott += te - ts
        sqtot += (te - ts)**2
        maxt = max(maxt, te - ts)
        backt += bte - bts  # accumulate backward time across runs
    gflops = 2e-9 * n_runs * in_feat * out_feat * batch_size / tott
    # population standard deviation of the per-run forward time, in ms
    stdev_ms = (sqtot / n_runs - (tott / n_runs) ** 2) ** 0.5 * 1e3
    print('Time mean/max/stdev/back {:.3f} {:.3f} {:.3f} {:.3f} ms, {:.3f} GFLOPs'.format(
        tott * 1e3 / n_runs, maxt * 1e3, stdev_ms,
        backt * 1e3 / n_runs, gflops))
if __name__ == '__main__':
    os.environ['RANK'] = os.environ.get('OMPI_COMM_WORLD_RANK', '0')
    os.environ['WORLD_SIZE'] = os.environ.get('OMPI_COMM_WORLD_SIZE', '1')
    if int(os.environ['WORLD_SIZE']) > 1:
        torch.distributed.init_process_group(backend='nccl')
        rank = torch.distributed.get_rank()
        world_size = torch.distributed.get_world_size()
    else:
        rank = 0
        world_size = 1
    test_performance(4096, 1024, 4096, 8)
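
For reference, the GFLOPs figure above counts 2 * in_feat * out_feat floating-point operations per input row per forward pass, and the summary is built from running sums. The same numbers can be computed from recorded per-iteration timings; the sketch below is a hypothetical helper, not part of the commit, and assumes per-run forward and backward wall-clock times (in seconds) have been collected into lists.

import statistics

def summarize_times(fwd_times, bwd_times, in_feat, out_feat, batch_size):
    # mean/max/population-stdev of the forward time and mean backward time, in ms
    n = len(fwd_times)
    mean_ms = 1e3 * sum(fwd_times) / n
    max_ms = 1e3 * max(fwd_times)
    stdev_ms = 1e3 * statistics.pstdev(fwd_times)
    back_ms = 1e3 * sum(bwd_times) / n
    # 2 * in_feat * out_feat FLOPs per row, per forward pass, over n runs
    gflops = 2e-9 * in_feat * out_feat * batch_size * n / sum(fwd_times)
    print('Time mean/max/stdev/back {:.3f} {:.3f} {:.3f} {:.3f} ms, {:.3f} GFLOPs'
          .format(mean_ms, max_ms, stdev_ms, back_ms, gflops))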