Commit 01d9b418 authored by Rick Ho

split more tests

parent 22e1eb45
from .fmoe_functions import *
from .functions import *
import torch.nn as nn
import torch.nn.functional as F
@@ -2,6 +2,10 @@
if [ ! -z $OMPI_COMM_WORLD_LOCAL_RANK ]
then
    export CUDA_VISIBLE_DEVICES=$OMPI_COMM_WORLD_LOCAL_RANK
fi
if [ -z $MASTER_PORT ]
then
    export MASTER_ADDR=localhost
    export MASTER_PORT=36666
fi
@@ -18,9 +22,5 @@ mkdir -p logs
SCRIPT_PATH=$(dirname $(dirname $(realpath $0)))
export PYTHONPATH=$SCRIPT_PATH:$SCRIPT_PATH/build/lib.linux-x86_64-3.7:$PYTHONPATH
export LD_LIBRARY_PATH=/home/laekov/.local/lib/python3.7/site-packages/torch/lib:$LD_LIBRARY_PATH
if [ -z $1 ]
then
    python3 tests/moe_test.py 2>logs/$RANK.log
else
    python3 $@ 2>logs/$RANK.log
fi
exec python3 $@ 2>logs/$RANK.log
#!/bin/bash
runtest() {
    echo Testing $@
    $@
    if [ $? = 0 ]
    then
        echo '----------------- Passed'
    else
        echo '----------------- Failed'
        # exit non-zero so the failure is visible to the caller
        exit 1
    fi
}
if [ ! -z $1 ]
then
    runtest $@
    exit
fi
TEST_SCRIPT=$(dirname $(realpath $0))/test.sh
runtest $TEST_SCRIPT tests/test_numerical.py
runtest mpirun -n 2 $TEST_SCRIPT tests/test_numerical.py
runtest $TEST_SCRIPT tests/test_dp.py
runtest $TEST_SCRIPT tests/test_performance.py
runtest mpirun -n 2 $TEST_SCRIPT tests/test_performance.py
from moe import FMoE as MOELayer
from moe import BruteForceMoE as MOELayer_raw
import torch
from torch import nn
import sys
import os
n_devices = int(os.environ.get('N_GPUS', '2'))
def test_dp():
    torch.manual_seed(42)
    torch.cuda.manual_seed(42)
    batch_size = 6
    num_expert = 4
    in_feat = 2
    out_feat = 3
    inp = torch.rand(batch_size, in_feat).cuda()
    gate = torch.randint(low=0, high=num_expert, size=(batch_size, ),
            requires_grad=False).cuda()
    print("data parallel of our MoE model")
    moe = MOELayer(num_expert, in_feat, out_feat).cuda()
    moe_dp = torch.nn.DataParallel(moe, device_ids=list(range(n_devices)))
    for i in range(5):
        output = moe_dp(inp, gate)
    print('Successful')


if __name__ == '__main__':
    test_dp()
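
The replica count above is read from the N_GPUS environment variable (default 2), so the test adapts to however many devices the runner exposes. Purely as an illustration, and not part of the commit, one could extend test_dp with a shape check like the sketch below; it reuses MOELayer and n_devices from the file above and assumes the layer maps (batch_size, in_feat) inputs to (batch_size, out_feat) outputs, as the constructor arguments suggest.

def check_dp_output_shape():
    # hypothetical helper, reusing names defined in the test file above
    batch_size, num_expert, in_feat, out_feat = 6, 4, 2, 3
    inp = torch.rand(batch_size, in_feat).cuda()
    gate = torch.randint(low=0, high=num_expert, size=(batch_size, )).cuda()
    moe = MOELayer(num_expert, in_feat, out_feat).cuda()
    moe_dp = torch.nn.DataParallel(moe, device_ids=list(range(n_devices)))
    out = moe_dp(inp, gate)
    # DataParallel gathers the per-device outputs back into the full batch
    assert out.shape == (batch_size, out_feat)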
@@ -2,96 +2,24 @@ from moe import FMoE as MOELayer
from moe import BruteForceMoE as MOELayer_raw
import torch
from torch import nn
import time
import sys
import os
dev_name_default = 'cuda:0'
def perf():
    torch.manual_seed(42 + torch.distributed.get_rank())
    torch.cuda.manual_seed(42 + torch.distributed.get_rank())
    if len(sys.argv) == 6:
        batch_size = int(sys.argv[2])
        in_feat = int(sys.argv[3])
        out_feat = int(sys.argv[4])
        num_expert = int(sys.argv[5])
    else:
        batch_size = 4096
        in_feat = 1024
        out_feat = 4096
        num_expert = 4
    if torch.distributed.get_rank() == 0:
        print('Performance test case bs {} {}x{} ne {}'.format(batch_size,
            in_feat, out_feat, num_expert))
    if torch.distributed.get_world_size() > 1:
        dev_name = 'cuda'
    else:
        dev_name = dev_name_default
    inp = torch.rand(batch_size, in_feat).cuda(dev_name)
    gate = torch.randint(low=0,
            high=num_expert * torch.distributed.get_world_size(),
            size=(batch_size, ), requires_grad=False).int().cuda(dev_name)
    moe = MOELayer(num_expert, in_feat, out_feat, world_size).cuda(dev_name)
    moe.train()
    o = moe(inp, gate)
    o = moe(inp, gate)
    o = moe(inp, gate)
    o = moe(inp, gate)
    n_runs = 16
    tott = 0.
    backt = 0.
    maxt = 0.
    sqtot = 0.
    for i in range(n_runs):
        gate = torch.randint(low=0,
                high=num_expert * torch.distributed.get_world_size(),
                size=(batch_size, ), requires_grad=False).int().cuda(dev_name)
        ts = time.time()
        o = moe(inp, gate)
        te = time.time()
        loss = o.sum()
        bts = time.time()
        loss.backward()
        bte = time.time()
        tott += te - ts
        sqtot += (te - ts)**2
        maxt = max(maxt, te - ts)
        backt = bte - bts
    gflops = 2e-9 * n_runs * in_feat * out_feat * batch_size / tott
    print('Time mean/max/stdev/back {:.3f} {:.3f} {:.3f} {:.3f} ms, {:.3f} GFLOPs'.format(
        tott * 1e3 / n_runs, maxt * 1e3,
        (sqtot / n_runs - (tott / n_runs)**2) * 1e3 / n_runs,
        backt * 1e3 / n_runs, gflops))


def test_module(moe, linear, inp, gate):
    linear.zero_grad()
    moe.zero_grad()
    x = (linear(inp))
    output = moe(x, gate)
    y = output.mean()
    y.backward()
    return output, moe.weight.grad, linear.weight.grad, linear.bias.grad
rank = None
world_size = None
def test():
def test_moe():
    def test_module(moe, linear, inp, gate):
        linear.zero_grad()
        moe.zero_grad()
        x = (linear(inp))
        output = moe(x, gate)
        y = output.mean()
        y.backward()
        return output, moe.weight.grad, linear.weight.grad, linear.bias.grad
    torch.manual_seed(42 + rank)
    torch.cuda.manual_seed(42 + rank)
    batch_size = 4
@@ -138,30 +66,6 @@ def test():
    return
def test_dp():
    torch.manual_seed(42)
    torch.cuda.manual_seed(42)
    batch_size = 6
    num_expert = 4
    in_feat = 2
    out_feat = 3
    inp = torch.rand(batch_size, in_feat).cuda()
    gate = torch.randint(low=0, high=num_expert, size=(batch_size, ), requires_grad=False).int().cuda()
    print("data parallel of a nn.Linear model")
    linear = nn.Linear(in_feat, in_feat).cuda()
    linear_dp = torch.nn.DataParallel(linear, device_ids=[0, 1, 2])
    output = linear_dp(inp)
    print("successful!")
    print("data parallel of our MoE model")
    moe = MOELayer(num_expert, in_feat, out_feat).cuda()
    moe_dp = torch.nn.DataParallel(moe, device_ids=[0, 1, 2])
    for i in range(5):
        output = moe_dp(inp, gate)


if __name__ == '__main__':
    os.environ['RANK'] = os.environ.get('OMPI_COMM_WORLD_RANK', '0')
    os.environ['WORLD_SIZE'] = os.environ.get('OMPI_COMM_WORLD_SIZE', '1')
@@ -172,14 +76,4 @@ if __name__ == '__main__':
    else:
        rank = 0
        world_size = 1
    if len(sys.argv) >= 2:
        task = sys.argv[1]
        print('Specified task {}'.format(task))
        if task == 'correctness':
            test()
        elif task == 'dp':
            test_dp()
        elif task == 'performance':
            perf()
    else:
        test()
    test_moe()
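
Most of the body of test_moe is elided by this hunk; judging from the test_module helper above, which returns the MoE output plus the MoE and linear gradients, the hidden part presumably compares FMoE against the BruteForceMoE reference. The sketch below is a hypothetical helper showing what such a comparison could look like; it is an assumption about the elided code, not part of the commit.

def compare_against_reference(fast_out, raw_out, fast_grad, raw_grad, atol=1e-5):
    # hypothetical check: forward outputs and weight gradients should agree
    assert torch.allclose(fast_out, raw_out, atol=atol), 'forward mismatch'
    assert torch.allclose(fast_grad, raw_grad, atol=atol), 'gradient mismatch'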
from moe import FMoE as MOELayer
import torch
import time
import sys
import os
rank = None
world_size = None
dev_name_default = 'cuda:0'
def test_performance(batch_size, in_feat, out_feat, num_expert):
    torch.manual_seed(42 + rank)
    torch.cuda.manual_seed(42 + rank)
    if rank == 0:
        print('Performance test case bs {} {}x{} ne {}x{}'.format(
            batch_size, in_feat, out_feat, world_size, num_expert))
    if world_size > 1:
        dev_name = 'cuda'
    else:
        dev_name = dev_name_default
    inp = torch.rand(batch_size, in_feat).cuda(dev_name)
    gate = torch.randint(low=0,
            high=num_expert * world_size,
            size=(batch_size, ), requires_grad=False).int().cuda(dev_name)
    moe = MOELayer(num_expert, in_feat, out_feat, world_size).cuda(dev_name)
    moe.train()
    # warm up
    for _ in range(4):
        _ = moe(inp, gate)
    n_runs = 16
    tott = 0.
    backt = 0.
    maxt = 0.
    sqtot = 0.
    for i in range(n_runs):
        gate = torch.randint(low=0,
                high=num_expert * world_size,
                size=(batch_size, ), requires_grad=False).int().cuda(dev_name)
        ts = time.time()
        o = moe(inp, gate)
        te = time.time()
        loss = o.sum()
        bts = time.time()
        loss.backward()
        bte = time.time()
        tott += te - ts
        sqtot += (te - ts)**2
        maxt = max(maxt, te - ts)
        backt += bte - bts  # accumulate backward time across runs
    gflops = 2e-9 * n_runs * in_feat * out_feat * batch_size / tott
    # population standard deviation of the per-run forward time, in ms
    stdev_ms = (sqtot / n_runs - (tott / n_runs) ** 2) ** 0.5 * 1e3
    print('Time mean/max/stdev/back {:.3f} {:.3f} {:.3f} {:.3f} ms, {:.3f} GFLOPs'.format(
        tott * 1e3 / n_runs, maxt * 1e3, stdev_ms,
        backt * 1e3 / n_runs, gflops))
if __name__ == '__main__':
    os.environ['RANK'] = os.environ.get('OMPI_COMM_WORLD_RANK', '0')
    os.environ['WORLD_SIZE'] = os.environ.get('OMPI_COMM_WORLD_SIZE', '1')
    if int(os.environ['WORLD_SIZE']) > 1:
        torch.distributed.init_process_group(backend='nccl')
        rank = torch.distributed.get_rank()
        world_size = torch.distributed.get_world_size()
    else:
        rank = 0
        world_size = 1
    test_performance(4096, 1024, 4096, 8)
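
For reference, the GFLOPs figure above counts 2 * in_feat * out_feat floating-point operations per input row per forward pass, and the summary is built from running sums. The same numbers can be computed from recorded per-iteration timings; the sketch below is a hypothetical helper, not part of the commit, and assumes per-run forward and backward wall-clock times (in seconds) have been collected into lists.

import statistics

def summarize_times(fwd_times, bwd_times, in_feat, out_feat, batch_size):
    # mean/max/population-stdev of the forward time and mean backward time, in ms
    n = len(fwd_times)
    mean_ms = 1e3 * sum(fwd_times) / n
    max_ms = 1e3 * max(fwd_times)
    stdev_ms = 1e3 * statistics.pstdev(fwd_times)
    back_ms = 1e3 * sum(bwd_times) / n
    # 2 * in_feat * out_feat FLOPs per row, per forward pass, over n runs
    gflops = 2e-9 * in_feat * out_feat * batch_size * n / sum(fwd_times)
    print('Time mean/max/stdev/back {:.3f} {:.3f} {:.3f} {:.3f} ms, {:.3f} GFLOPs'
          .format(mean_ms, max_ms, stdev_ms, back_ms, gflops))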