"git@developer.sourcefind.cn:renzhc/diffusers_dcu.git" did not exist on "f20b83a04f0bb5fe6df443609e5e337a584d8f49"
Commit 610752d2 authored by Jiezhong Qiu

Merge branch 'master' of github.com:xptree/poly-xfmr

Conflicts:
	pytorch/cuda/moe.py
parents 5d076dcf cb6aadaa
@@ -43,7 +43,7 @@ class MOELayer(nn.Module):
     def reset_parameters(self):
         for i in range(self.num_expert):
-            linear = nn.Linear(in_features=self.in_feat, out_features=out_feat)
+            linear = nn.Linear(in_features=self.in_feat, out_features=self.out_feat)
             self.weight.data[i] = linear.weight.data
 
     def forward(self, inp, gate):
@@ -62,7 +62,7 @@ class MOELayer_raw(nn.Module):
     def reset_parameters(self):
         for i in range(self.num_expert):
-            linear = nn.Linear(in_features=self.in_feat, out_features=out_feat)
+            linear = nn.Linear(in_features=self.in_feat, out_features=self.out_feat)
             self.weight.data[i] = linear.weight.data
 
     def forward(self, inp, gate):
@@ -73,30 +73,35 @@ class MOELayer_raw(nn.Module):
             x[i] = self.weight[gate_long[i]] @ inp[i]
         return x
 
-batch_size = 4
-num_expert = 8
-in_feat = 2
-out_feat = 3
-moe = MOELayer(num_expert, in_feat, out_feat).cuda()
-moe_raw = MOELayer_raw(num_expert, in_feat, out_feat).cuda()
-moe_raw.weight.data = moe.weight.data.clone()
-
-inp = torch.rand(batch_size, in_feat).cuda()
-gate = torch.randint(low=0, high=num_expert, size=(batch_size, ), requires_grad=False).int().cuda()
-output = moe(inp, gate)
-output_raw = moe_raw(inp.clone(), gate.clone())
-#print(output)
-#print(output_raw)
-y = output.mean()
-y.backward()
-y_raw = output_raw.mean()
-y_raw.backward()
-print(moe.weight.grad)
-print(moe_raw.weight.grad)
+def test():
+    batch_size = 4
+    num_expert = 4
+    in_feat = 2
+    out_feat = 3
+    moe = MOELayer(num_expert, in_feat, out_feat).cuda()
+    moe_raw = MOELayer_raw(num_expert, in_feat, out_feat).cuda()
+    moe_raw.weight.data = moe.weight.data.clone()
+
+    inp = torch.rand(batch_size, in_feat).cuda()
+    gate = torch.randint(low=0, high=num_expert, size=(batch_size, ), requires_grad=False).int().cuda()
+    output = moe(inp, gate)
+    output_raw = moe_raw(inp.clone(), gate.clone())
+    print(output)
+    print(output_raw)
+    y = output.mean()
+    y.backward()
+    y_raw = output_raw.mean()
+    y_raw.backward()
+    print(moe.weight.grad)
+    print(moe_raw.weight.grad)
+
+if __name__ == '__main__':
+    test()
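In the MOELayer_raw reference path shown in the diff, each sample is multiplied by the weight matrix of the expert its gate index selects (x[i] = self.weight[gate_long[i]] @ inp[i]). For clarity, here is a minimal batched sketch of that same computation, assuming weight has shape (num_expert, out_feat, in_feat) as implied by copying linear.weight.data, and that gate holds one expert index per sample; the helper name moe_raw_forward is hypothetical, not part of the repository:

import torch

def moe_raw_forward(weight, inp, gate):
    # weight: (num_expert, out_feat, in_feat); inp: (batch_size, in_feat);
    # gate: (batch_size,) integer expert index per sample.
    w = weight[gate.long()]                              # gather one expert matrix per sample
    return torch.bmm(w, inp.unsqueeze(-1)).squeeze(-1)   # (batch_size, out_feat)

Gathering the per-sample expert matrices and using torch.bmm avoids the Python loop, but it materializes a (batch_size, out_feat, in_feat) tensor, so this is a readability aid rather than a performance claim.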
from moe import MOELayer
import torch
import time


def perf():
    batch_size = 128
    in_feat = 1024
    out_feat = 4096
    num_expert = 4

    inp = torch.rand(batch_size, in_feat).cuda()
    gate = torch.randint(low=0, high=num_expert, size=(batch_size, ), requires_grad=False).int().cuda()
    moe = MOELayer(num_expert, in_feat, out_feat).cuda()

    # Warm-up call before timing.
    o = moe(inp, gate)

    n_runs = 16
    tott = 0.
    for i in range(n_runs):
        gate = torch.randint(low=0, high=num_expert, size=(batch_size, ), requires_grad=False).int().cuda()
        ts = time.time()
        o = moe(inp, gate)
        te = time.time()
        tott += te - ts

    # Total work across all runs: 2 * in_feat * out_feat FLOP per sample.
    gflops = 2e-9 * n_runs * in_feat * out_feat * batch_size
    print('Mean time {:.3f} ms, {:.3f} GFLOPs'.format(tott * 1e3 / n_runs, gflops))


if __name__ == '__main__':
    perf()
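A caveat on the timing above: CUDA kernels are launched asynchronously, so wall-clock timestamps taken immediately before and after moe(inp, gate) can under-report the kernel time unless the device is synchronized. A minimal sketch of a synchronized variant, assuming the same MOELayer interface (the helper name timed_forward is hypothetical):

import time
import torch

def timed_forward(moe, inp, gate, n_runs=16):
    torch.cuda.synchronize()            # make sure pending GPU work is done before starting the clock
    ts = time.time()
    for _ in range(n_runs):
        o = moe(inp, gate)
    torch.cuda.synchronize()            # wait for all queued kernels to finish
    return (time.time() - ts) / n_runs  # mean seconds per forward call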