Commit 864a4522 authored by Rick Ho

multi-gpu forward pass test

parent 069cf01a
@@ -27,15 +27,17 @@ class MOELayer(nn.Module):
 class MOELayer_raw(nn.Module):
-    def __init__(self, num_expert=32, in_feat=1024, out_feat=1024):
+    def __init__(self, num_expert=32, in_feat=1024, out_feat=1024,
+            world_size=0):
         super(MOELayer_raw, self).__init__()
         self.num_expert = num_expert
         self.in_feat = in_feat
         self.out_feat = out_feat
         self.weight = nn.Parameter(
-            torch.Tensor(num_expert, out_feat, in_feat))
+            torch.Tensor(num_expert * world_size, out_feat, in_feat))
         self.reset_parameters()

     def reset_parameters(self):
         for i in range(self.num_expert):
             linear = nn.Linear(in_features=self.in_feat,
...
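With this change, the single-process reference layer allocates one weight tensor covering every worker's experts, so the experts of worker w occupy rows w * num_expert through (w + 1) * num_expert - 1 (matching the dim-0 concatenation of the all_gather further down). A minimal sketch of that indexing; the expert_weight helper is hypothetical, not part of the commit:

    import torch

    num_expert, world_size = 32, 2
    in_feat = out_feat = 1024

    # One tensor holding every worker's experts, as in MOELayer_raw above.
    weight = torch.Tensor(num_expert * world_size, out_feat, in_feat)

    def expert_weight(w, e):
        # Hypothetical helper: expert e owned by worker w.
        return weight[w * num_expert + e]

    assert expert_weight(1, 0).shape == (out_feat, in_feat)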
@@ -155,6 +155,7 @@ void moe_cuda_global_scatter_impl(
         NCCL_SAFE_CALL(ncclGroupEnd());
     }
     delete [] expert_ptr;
+    smgr->sync(1);
 }

 std::vector<torch::Tensor> moe_cuda_global_scatter(
@@ -224,6 +225,7 @@ void moe_cuda_global_gather_impl(
         NCCL_SAFE_CALL(ncclGroupEnd());
     }
     delete [] expert_ptr;
+    smgr->sync(1);
 }

 std::vector<torch::Tensor> moe_cuda_global_gather(
@@ -238,7 +240,7 @@ std::vector<torch::Tensor> moe_cuda_global_gather(
     AT_DISPATCH_FLOATING_TYPES(output_buf.scalar_type(),
             "moe_cuda_global_gather", ([&] {
-        moe_cuda_global_scatter_impl<scalar_t>(
+        moe_cuda_global_gather_impl<scalar_t>(
             output_buf.data_ptr<scalar_t>(),
             local_expert_count.data_ptr<int>(),
             global_expert_count.data_ptr<int>(),
...
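Two things happen in this file: the new smgr->sync(1) calls make global scatter/gather block until the NCCL send/recv operations issued inside the ncclGroupStart()/ncclGroupEnd() section have completed before the functions return, and the gather dispatcher is fixed to call moe_cuda_global_gather_impl instead of the scatter impl. For intuition, the exchange global_scatter performs can be approximated in plain PyTorch with torch.distributed.all_to_all_single; this is a rough analogue under assumed count layouts (rows sorted by worker-major global expert id), not the extension's actual code:

    import torch
    import torch.distributed as dist

    def global_scatter_sketch(local_input_buf, local_expert_count, global_expert_count):
        # local_expert_count: rows this worker sends, grouped per destination expert;
        # global_expert_count: rows this worker receives, grouped per source.
        world_size = dist.get_world_size()
        in_splits = local_expert_count.view(world_size, -1).sum(dim=1).tolist()
        out_splits = global_expert_count.view(world_size, -1).sum(dim=1).tolist()
        output = local_input_buf.new_empty(sum(out_splits), local_input_buf.shape[1])
        # One fused all-to-all in place of the per-expert ncclSend/ncclRecv pairs.
        dist.all_to_all_single(output, local_input_buf, out_splits, in_splits)
        return output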
@@ -67,6 +67,7 @@ def test_module(moe, linear, inp, gate):
     moe.zero_grad()
     x = (linear(inp))
     output = moe(x, gate)
+    print('ooutput', torch.distributed.get_rank(), output)
     y = output.mean()
     y.backward()
     return output, moe.weight.grad, linear.weight.grad, linear.bias.grad
@@ -86,8 +87,14 @@ def test():
         moe = MOELayer(num_expert, in_feat, out_feat, world_size).cuda()
     else:
         moe = MOELayer(num_expert, in_feat, out_feat).cuda()
-    moe_raw = MOELayer_raw(num_expert, in_feat, out_feat).cuda()
-    moe_raw.weight.data = moe.weight.data.clone()
+    moe_raw = MOELayer_raw(num_expert, in_feat, out_feat, world_size).cuda()
+    if world_size == 1:
+        moe_raw.weight.data = moe.weight.data.clone()
+    else:
+        weight_array = [torch.empty_like(moe.weight.data).cpu()
+                for _ in range(world_size)]
+        torch.distributed.all_gather(weight_array, moe.weight.data.cpu())
+        moe_raw.weight.data = torch.cat(weight_array, dim=0).cuda()

     inp = torch.rand(batch_size, in_feat).cuda()
     gate = torch.randint(low=0,
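The weights are gathered via CPU copies (the process group in __main__ uses the MPI backend, which typically communicates CPU tensors), and the dim-0 concatenation reproduces the worker-major expert layout that MOELayer_raw now expects. A small sanity check one could add, assuming the group is initialized (not part of the commit):

    rank = torch.distributed.get_rank()
    # This rank's shard should occupy rows [rank * num_expert, (rank + 1) * num_expert).
    mine = moe_raw.weight.data[rank * num_expert:(rank + 1) * num_expert]
    assert torch.equal(mine.cpu(), moe.weight.data.cpu())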
@@ -97,11 +104,12 @@ def test():
     # gate = torch.Tensor([0, 1, 0, 1]).int().cuda()

     moe_out = test_module(moe, linear, inp.clone(), gate.clone())
+    print('hhh')
+    return
     raw_out = test_module(moe_raw, linear, inp.clone(), gate.clone())
-    names = ['Out', 'Moe wei', 'Linear wei', 'Linear bias']
+    if world_size == 1:
+        names = ['Out', 'Moe wei', 'Linear wei', 'Linear bias']
+    else:
+        names = ['Out']
     for name, mo, ro in zip(names, moe_out, raw_out):
         err = (mo - ro).abs().sum()
         print('{} abs err {}'.format(name, err))
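In the multi-GPU case only the forward output is compared (names = ['Out']) since the backward path is not exercised yet; note that the print('hhh') / return above short-circuits the comparison entirely in this commit. Once that debugging exit is removed, a tolerance-based check is usually more robust than a summed absolute error; a possible variant (not in the commit):

    for name, mo, ro in zip(names, moe_out, raw_out):
        # allclose scales the tolerance with the reference magnitude.
        ok = torch.allclose(mo, ro, rtol=1e-5, atol=1e-6)
        print('{} allclose {}, max abs err {}'.format(
            name, ok, (mo - ro).abs().max().item()))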
@@ -134,8 +142,6 @@ def test_dp():

 if __name__ == '__main__':
     torch.distributed.init_process_group(backend='mpi')
     world_size = torch.distributed.get_world_size()
-    if world_size == 1:
-        world_size = None
     test()
     # print('{} / {}'.format(torch.distributed.get_rank(), torch.distributed.get_world_size()))
     # perf()
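Since the process group uses backend='mpi', the test is presumably launched through an MPI launcher, for example `mpirun -np 2 python <test script>` (the script name is not shown in this diff), which requires a PyTorch build compiled with MPI support.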