"git@developer.sourcefind.cn:renzhc/diffusers_dcu.git" did not exist on "f20b83a04f0bb5fe6df443609e5e337a584d8f49"
Commit 610752d2 authored by Jiezhong Qiu

Merge branch 'master' of github.com:xptree/poly-xfmr

Conflicts:
	pytorch/cuda/moe.py
parents 5d076dcf cb6aadaa
@@ -43,7 +43,7 @@ class MOELayer(nn.Module):
     def reset_parameters(self):
         for i in range(self.num_expert):
-            linear = nn.Linear(in_features=self.in_feat, out_features=out_feat)
+            linear = nn.Linear(in_features=self.in_feat, out_features=self.out_feat)
             self.weight.data[i] = linear.weight.data
 
     def forward(self, inp, gate):
@@ -62,7 +62,7 @@ class MOELayer_raw(nn.Module):
     def reset_parameters(self):
         for i in range(self.num_expert):
-            linear = nn.Linear(in_features=self.in_feat, out_features=out_feat)
+            linear = nn.Linear(in_features=self.in_feat, out_features=self.out_feat)
             self.weight.data[i] = linear.weight.data
 
     def forward(self, inp, gate):
@@ -73,30 +73,35 @@ class MOELayer_raw(nn.Module):
             x[i] = self.weight[gate_long[i]] @ inp[i]
         return x
 
-batch_size = 4
-num_expert = 8
-in_feat = 2
-out_feat = 3
-moe = MOELayer(num_expert, in_feat, out_feat).cuda()
-moe_raw = MOELayer_raw(num_expert, in_feat, out_feat).cuda()
-moe_raw.weight.data = moe.weight.data.clone()
-
-inp = torch.rand(batch_size, in_feat).cuda()
-gate = torch.randint(low=0, high=num_expert, size=(batch_size, ), requires_grad=False).int().cuda()
-output = moe(inp, gate)
-output_raw = moe_raw(inp.clone(), gate.clone())
-#print(output)
-#print(output_raw)
-y = output.mean()
-y.backward()
-y_raw = output_raw.mean()
-y_raw.backward()
-print(moe.weight.grad)
-print(moe_raw.weight.grad)
+def test():
+    batch_size = 4
+    num_expert = 4
+    in_feat = 2
+    out_feat = 3
+    moe = MOELayer(num_expert, in_feat, out_feat).cuda()
+    moe_raw = MOELayer_raw(num_expert, in_feat, out_feat).cuda()
+    moe_raw.weight.data = moe.weight.data.clone()
+
+    inp = torch.rand(batch_size, in_feat).cuda()
+    gate = torch.randint(low=0, high=num_expert, size=(batch_size, ), requires_grad=False).int().cuda()
+    output = moe(inp, gate)
+    output_raw = moe_raw(inp.clone(), gate.clone())
+    print(output)
+    print(output_raw)
+    y = output.mean()
+    y.backward()
+    y_raw = output_raw.mean()
+    y_raw.backward()
+    print(moe.weight.grad)
+    print(moe_raw.weight.grad)
+
+if __name__ == '__main__':
+    test()
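In the MOELayer_raw reference path shown in the diff, each sample is multiplied by the weight matrix of the expert its gate index selects (x[i] = self.weight[gate_long[i]] @ inp[i]). For clarity, here is a minimal batched sketch of that same computation, assuming weight has shape (num_expert, out_feat, in_feat) as implied by copying linear.weight.data, and that gate holds one expert index per sample; the helper name moe_raw_forward is hypothetical, not part of the repository:

import torch

def moe_raw_forward(weight, inp, gate):
    # weight: (num_expert, out_feat, in_feat); inp: (batch_size, in_feat);
    # gate: (batch_size,) integer expert index per sample.
    w = weight[gate.long()]                              # gather one expert matrix per sample
    return torch.bmm(w, inp.unsqueeze(-1)).squeeze(-1)   # (batch_size, out_feat)

Gathering the per-sample expert matrices and using torch.bmm avoids the Python loop, but it materializes a (batch_size, out_feat, in_feat) tensor, so this is a readability aid rather than a performance claim.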
from moe import MOELayer
import torch
import time


def perf():
    batch_size = 128
    in_feat = 1024
    out_feat = 4096
    num_expert = 4

    inp = torch.rand(batch_size, in_feat).cuda()
    gate = torch.randint(low=0, high=num_expert, size=(batch_size, ), requires_grad=False).int().cuda()
    moe = MOELayer(num_expert, in_feat, out_feat).cuda()

    # Warm-up call before timing.
    o = moe(inp, gate)

    n_runs = 16
    tott = 0.
    for i in range(n_runs):
        gate = torch.randint(low=0, high=num_expert, size=(batch_size, ), requires_grad=False).int().cuda()
        ts = time.time()
        o = moe(inp, gate)
        te = time.time()
        tott += te - ts

    # Total work across all runs: 2 * in_feat * out_feat FLOP per sample.
    gflops = 2e-9 * n_runs * in_feat * out_feat * batch_size
    print('Mean time {:.3f} ms, {:.3f} GFLOPs'.format(tott * 1e3 / n_runs, gflops))


if __name__ == '__main__':
    perf()
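A caveat on the timing above: CUDA kernels are launched asynchronously, so wall-clock timestamps taken immediately before and after moe(inp, gate) can under-report the kernel time unless the device is synchronized. A minimal sketch of a synchronized variant, assuming the same MOELayer interface (the helper name timed_forward is hypothetical):

import time
import torch

def timed_forward(moe, inp, gate, n_runs=16):
    torch.cuda.synchronize()            # make sure pending GPU work is done before starting the clock
    ts = time.time()
    for _ in range(n_runs):
        o = moe(inp, gate)
    torch.cuda.synchronize()            # wait for all queued kernels to finish
    return (time.time() - ts) / n_runs  # mean seconds per forward call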