Unverified commit baf2b118, authored by Rick Ho, committed by GitHub

Merge pull request #2 from test

Add test for FMoE layer
parents c5556037 184b0404
fmoe/layers.py:

@@ -115,7 +115,7 @@ class FMoE(nn.Module):
         if expert_fn is None:
             assert expert is not None, 'Either expert or expert_fn should be set'
             self.experts = [expert(d_model) for _ in range(num_expert)]
-            def expert_fn(self, inp, fwd_expert_count):
+            def expert_fn(inp, fwd_expert_count):
                 outputs = []
                 base_idx = 0
                 for i in range(self.num_expert):
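The one-line change above removes the stray `self` parameter from the nested `expert_fn`. Because the function is defined inside `__init__`, it already captures `self` as a closure, and it is invoked as `expert_fn(inp, fwd_expert_count)`; with the old signature the first positional argument would have landed in the `self` slot. A minimal sketch of the pattern (hypothetical `TinyMoE`, not the library code):

import torch
from torch import nn

class TinyMoE(nn.Module):
    def __init__(self, expert, num_expert, d_model):
        super().__init__()
        self.num_expert = num_expert
        self.experts = nn.ModuleList([expert(d_model) for _ in range(num_expert)])

        def expert_fn(inp, fwd_expert_count):
            # `self` comes from the enclosing scope, not from a parameter;
            # the old signature would have shifted `inp` into the `self` slot.
            outputs = []
            base_idx = 0
            for i in range(self.num_expert):
                n = int(fwd_expert_count[i])
                outputs.append(self.experts[i](inp[base_idx:base_idx + n]))
                base_idx += n
            return torch.cat(outputs, dim=0)

        self.expert_fn = expert_fn

# usage: first 3 rows go to expert 0, the remaining 2 to expert 1
moe = TinyMoE(lambda d: nn.Linear(d, d), num_expert=2, d_model=8)
out = moe.expert_fn(torch.rand(5, 8), [3, 2])
print(out.shape)  # torch.Size([5, 8])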
moe.py (the reference-implementation helpers imported by the tests), previous version:

import math
from torch import nn
import torch
from fmoe.layers import FMoELinear, _fmoe_full_forward


class FMoE(nn.Module):
    def __init__(self, num_expert=32, in_feat=1024, out_feat=1024,
                 world_size=1):
        super(FMoE, self).__init__()
        self.num_expert = num_expert
        self.in_feat = in_feat
        self.out_feat = out_feat
        self.world_size = world_size
        self.linear = FMoELinear(num_expert, in_feat, out_feat)
        self.weight = self.linear.weight
        self.reset_parameters()

    def reset_parameters(self):
        self.linear.reset_parameters()

    def forward(self, inp, gate):
        return _fmoe_full_forward(inp, gate, [self.linear], None,
                                  self.num_expert, self.world_size)


class BruteForceMoE(nn.Module):
    def __init__(self, num_expert=32, in_feat=1024, out_feat=1024,
                 world_size=0):
        super(BruteForceMoE, self).__init__()
        self.num_expert = num_expert
        self.in_feat = in_feat
        self.out_feat = out_feat
        self.weight = nn.Parameter(
            torch.Tensor(num_expert * world_size, out_feat, in_feat))
        self.reset_parameters()

    def reset_parameters(self):
        for i in range(self.num_expert):
            linear = nn.Linear(in_features=self.in_feat,
                               out_features=self.out_feat)
            self.weight.data[i] = linear.weight.data

    def forward(self, inp, gate):
        gate_long = gate.long()
        batch_size = inp.size(0)
        x = inp.new_zeros((batch_size, self.out_feat))
        for i in range(batch_size):
            x[i] = inp[i] @ self.weight[gate_long[i]].t()
        return x
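The rewrite drops the FMoELinear-based wrapper and replaces the single-weight brute-force module with two references that mirror the new FMoE interface: BruteForceMoELinear replays the two-layer expert MLP (htoh4/h4toh weights) token by token, BruteForceMoE dispatches each row to a full expert module, and NaiveExpert/LinearExpert are the two expert flavors the tests instantiate. Both forward passes now take (inp, gate_idx, gate_score) and combine the top-k expert outputs with the gate scores.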
New version:

import math
from torch import nn
import torch
import torch.nn.functional as F


class BruteForceMoELinear(nn.Module):
    def __init__(
        self,
        activation,
        num_expert=32,
        d_model=1024,
        d_hidden=2048,
        world_size=1,
        top_k=2,
    ):
        super(BruteForceMoELinear, self).__init__()
        self.num_expert = num_expert
        self.d_model = d_model
        self.activation = activation
        self.weight_htoh4 = nn.Parameter(
            torch.Tensor(num_expert * world_size, d_hidden, d_model)
        )
        self.weight_h4toh = nn.Parameter(
            torch.Tensor(num_expert * world_size, d_model, d_hidden)
        )
        self.top_k = top_k

    def forward(self, inp, gate_idx, gate_score):
        gate_long = gate_idx.long()
        batch_size = inp.size(0)
        x = inp.new_zeros((batch_size, self.d_model))
        for i in range(batch_size):
            t = inp[i] @ self.weight_htoh4[gate_long[i]].t()
            t = self.activation(t)
            x[i] = t @ self.weight_h4toh[gate_long[i]].t()
        x = torch.bmm(gate_score, x.view(-1, self.top_k, self.d_model)).reshape(
            -1, self.d_model
        )
        return x


class BruteForceMoE(nn.Module):
    def __init__(self, expert, num_expert=32, d_model=1024, world_size=1, top_k=2):
        super(BruteForceMoE, self).__init__()
        self.num_expert = num_expert
        self.d_model = d_model
        self.top_k = top_k
        self.experts = [expert(d_model) for _ in range(num_expert * world_size)]

    def forward(self, inp, gate_idx, gate_score):
        gate_long = gate_idx.long()
        batch_size = inp.size(0)
        x = inp.new_zeros((batch_size, self.d_model))
        for i in range(batch_size):
            x[i] = self.experts[gate_long[i]](inp[i])
        x = torch.bmm(gate_score, x.view(-1, self.top_k, self.d_model)).reshape(
            -1, self.d_model
        )
        return x


class NaiveExpert(nn.Module):
    def __init__(self, d_model):
        super(NaiveExpert, self).__init__()
        self.linear = nn.Linear(d_model, d_model).cuda()

    def forward(self, x):
        return self.linear(x)


class LinearExpert(nn.Module):
    def __init__(self, d_model):
        super(LinearExpert, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(d_model, d_model * 2),
            nn.ReLU(),
            nn.Linear(d_model * 2, d_model),
        ).cuda()

    def forward(self, x):
        return self.model(x)
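To see how the final torch.bmm in both forwards mixes the top-k expert outputs, here is a standalone shape sketch with made-up sizes, not part of the commit; it assumes gate_score arrives as (batch, 1, top_k), which is the shape that bmm requires:

import torch

batch, top_k, d_model = 4, 2, 16
expert_out = torch.rand(batch * top_k, d_model)  # one row per (token, chosen expert)
gate_score = torch.rand(batch, 1, top_k)         # per-token mixing weights

# (batch, 1, top_k) @ (batch, top_k, d_model) -> (batch, 1, d_model)
mixed = torch.bmm(gate_score, expert_out.view(-1, top_k, d_model))
print(mixed.reshape(-1, d_model).shape)  # torch.Size([4, 16])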
The numerical test script is rewritten as well. Previous version:

from moe import FMoE as MOELayer
from moe import BruteForceMoE as MOELayer_raw
import torch
from torch import nn
import sys
import os

rank = None
world_size = None


def test_moe():
    def test_module(moe, linear, inp, gate):
        linear.zero_grad()
        moe.zero_grad()
        x = linear(inp)
        output = moe(x, gate)
        y = output.mean()
        y.backward()
        return output, moe.weight.grad, linear.weight.grad, linear.bias.grad

    torch.manual_seed(42 + rank)
    torch.cuda.manual_seed(42 + rank)
    batch_size = 4
    num_expert = 2
    in_feat = 6
    out_feat = 7
    linear = nn.Linear(in_feat, in_feat).cuda()
    moe = MOELayer(num_expert, in_feat, out_feat, world_size).cuda()
    moe_raw = MOELayer_raw(num_expert, in_feat, out_feat, world_size).cuda()

    if world_size == 1:
        moe_raw.weight.data = moe.weight.data.clone()
    else:
        weight_array = [torch.empty_like(moe.weight.data)
                        for _ in range(world_size)]
        torch.distributed.all_gather(weight_array, moe.weight.data)
        moe_raw.weight.data = torch.cat(weight_array, dim=0)

    inp = torch.rand(batch_size, in_feat).cuda()
    gate = torch.randint(low=0,
                         high=num_expert * world_size,
                         size=(batch_size,),
                         requires_grad=False).int().cuda()
    moe_out = test_module(moe, linear, inp.clone(), gate.clone())
    raw_out = test_module(moe_raw, linear, inp.clone(), gate.clone())

    names = ['Out', 'Moe wei', 'Linear wei', 'Linear bias']
    if world_size > 1:
        ou, wg, lwg, lbg = raw_out
        torch.distributed.all_reduce(wg)
        wg = wg[rank * num_expert:(rank + 1) * num_expert]
        raw_out = ou, wg, lwg, lbg
    for name, mo, ro in zip(names, moe_out, raw_out):
        err = (mo - ro).abs().sum()
        print('Rank {} {} abs err {}'.format(rank, name, err))
        if err > 1e-3:
            sys.stderr.write('=========== moe out ==============\n')
            sys.stderr.write('{}\n'.format(mo))
            sys.stderr.write('=========== raw out ==============\n')
            sys.stderr.write('{}\n'.format(ro))
    return


if __name__ == '__main__':
    os.environ['RANK'] = os.environ.get('OMPI_COMM_WORLD_RANK', '0')
    os.environ['WORLD_SIZE'] = os.environ.get('OMPI_COMM_WORLD_SIZE', '1')
    if int(os.environ['WORLD_SIZE']) > 1:
        torch.distributed.init_process_group(backend='nccl')
        rank = torch.distributed.get_rank()
        world_size = torch.distributed.get_world_size()
    else:
        rank = 0
        world_size = 1
    test_moe()
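Both new tests share `_perform_forward`, which asks the layer's own gate for routing (`gate_idx, gate_score = moe.gate(inp)`) and feeds the brute-force reference a top_k-times repeated input, so repeated row i*top_k+j is token i sent to its j-th chosen expert. NaiveGate's internals are not part of this diff; a toy gate with the output contract the harness relies on might look like this (hypothetical ToyGate, shapes inferred from the torch.bmm in moe.py):

import torch
from torch import nn

class ToyGate(nn.Module):
    def __init__(self, d_model, num_expert, top_k=2):
        super().__init__()
        self.top_k = top_k
        self.score = nn.Linear(d_model, num_expert)

    def forward(self, inp):
        logits = self.score(inp)                         # (batch, num_expert)
        val, idx = torch.topk(logits, k=self.top_k, dim=-1)
        score = torch.softmax(val, dim=-1).unsqueeze(1)  # (batch, 1, top_k)
        # idx flattened so entry i*top_k+j routes repeated row i*top_k+j
        return idx.reshape(-1), score

gate = ToyGate(d_model=16, num_expert=4)
idx, score = gate(torch.rand(4, 16))
print(idx.shape, score.shape)  # torch.Size([8]) torch.Size([4, 1, 2])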
New version:

import json
import os
import sys
from typing import List, Callable, Dict, Type, Union

import pytest
import torch
import torch.nn as nn

from fmoe.gates import NaiveGate
from fmoe.layers import FMoE
from fmoe.transformer import _Expert
from moe import BruteForceMoELinear, BruteForceMoE, NaiveExpert, LinearExpert

rank = 0
world_size = 1


def _perform_forward(moe: nn.Module, moe_raw: nn.Module, batch_size, d_model, top_k):
    moe.zero_grad()
    moe_raw.zero_grad()
    inp = torch.rand(batch_size, d_model).cuda()
    gate_idx, gate_score = moe.gate(inp)
    inp_repeated = inp.repeat_interleave(repeats=top_k, dim=0)
    moe_out = moe(inp).mean()
    raw_out = moe_raw(inp_repeated, gate_idx, gate_score).mean()
    moe_out.backward()
    raw_out.backward()
    return moe_out, raw_out


def _assert_numercial(names, moe_out_list, raw_out_list):
    for name, mo, ro in zip(names, moe_out_list, raw_out_list):
        err = (mo - ro).abs().sum()
        print("Rank {} {} abs err {}".format(rank, name, err))
        if err > 1e-3:
            sys.stderr.write("=========== moe out ==============\n")
            sys.stderr.write("{}\n".format(mo))
            sys.stderr.write("=========== raw out ==============\n")
            sys.stderr.write("{}\n".format(ro))
            assert False


@pytest.mark.parametrize("num_expert", [4, 8])
@pytest.mark.parametrize("top_k", [2])
@pytest.mark.parametrize("batch_size", [4])
@pytest.mark.parametrize("d_model", [16])
@pytest.mark.parametrize("d_hidden", [32])
def test_fmoe_linear(
    num_expert,
    top_k,
    batch_size,
    d_model,
    d_hidden,
    activation=torch.nn.functional.gelu,
):
    torch.manual_seed(42 + rank)
    torch.cuda.manual_seed(42 + rank)

    experts = _Expert(num_expert, d_model, d_hidden, activation).cuda()

    def expert_fn(inp, gate):
        return experts(inp, gate)

    moe = FMoE(
        num_expert=num_expert,
        d_model=d_model,
        gate=NaiveGate,
        world_size=world_size,
        mp_group=None,
        expert_fn=expert_fn,
        top_k=top_k,
    ).cuda()
    moe_raw = BruteForceMoELinear(
        activation=activation,
        num_expert=num_expert,
        d_model=d_model,
        d_hidden=d_hidden,
        world_size=world_size,
    ).cuda()

    if world_size == 1:
        moe_raw.weight_htoh4.data = experts.htoh4.weight.data.clone()
        moe_raw.weight_h4toh.data = experts.h4toh.weight.data.clone()
    else:
        weight_htoh4_array = [
            torch.empty_like(experts.htoh4.weight.data) for _ in range(world_size)
        ]
        torch.distributed.all_gather(weight_htoh4_array, experts.htoh4.weight.data)
        moe_raw.weight_htoh4.data = torch.cat(weight_htoh4_array, dim=0)

        weight_h4toh_array = [
            torch.empty_like(experts.h4toh.weight.data) for _ in range(world_size)
        ]
        torch.distributed.all_gather(weight_h4toh_array, experts.h4toh.weight.data)
        moe_raw.weight_h4toh.data = torch.cat(weight_h4toh_array, dim=0)

    moe_out, raw_out = _perform_forward(moe, moe_raw, batch_size, d_model, top_k)

    moe_out_list = moe_out, experts.htoh4.weight.grad, experts.h4toh.weight.grad
    raw_out_list = raw_out, moe_raw.weight_htoh4.grad, moe_raw.weight_h4toh.grad

    if world_size > 1:
        _, htoh4_grad, h4toh_grad = raw_out_list
        torch.distributed.all_reduce(htoh4_grad)
        torch.distributed.all_reduce(h4toh_grad)
        htoh4_grad = htoh4_grad[rank * num_expert : (rank + 1) * num_expert]
        h4toh_grad = h4toh_grad[rank * num_expert : (rank + 1) * num_expert]
        raw_out_list = _, htoh4_grad, h4toh_grad

    names = ["output", "htoh4 weight grad", "h4toh weight grad"]
    _assert_numercial(names, moe_out_list, raw_out_list)


@pytest.mark.parametrize("batch_size", [4])
@pytest.mark.parametrize("num_expert", [4, 8])
@pytest.mark.parametrize("d_model", [16])
@pytest.mark.parametrize("top_k", [2])
@pytest.mark.parametrize("expert", ["NaiveExpert", "LinearExpert"])
def test_fmoe(
    batch_size, num_expert, d_model, top_k, expert: Union[Type[nn.Module], str]
):
    torch.manual_seed(42 + rank)
    torch.cuda.manual_seed(42 + rank)
    if isinstance(expert, str):
        expert = globals()[expert]
    moe = FMoE(
        num_expert=num_expert,
        d_model=d_model,
        gate=NaiveGate,
        world_size=world_size,
        mp_group=None,
        expert=expert,
        top_k=top_k,
    ).cuda()
    moe_raw = BruteForceMoE(
        expert=expert, num_expert=num_expert, d_model=d_model, world_size=world_size,
    ).cuda()
    if world_size == 1:
        for expert_moe, expert_raw in zip(moe.experts, moe_raw.experts):
            for para_moe, para_raw in zip(
                expert_moe.parameters(), expert_raw.parameters()
            ):
                para_raw.data = para_moe.data.clone()
    else:
        assert len(moe.experts) >= 1
        for idx, para in enumerate(moe.experts[0].parameters()):
            para_tensor = torch.cat(
                [list(expert.parameters())[idx].unsqueeze(0) for expert in moe.experts]
            )
            para_array = [torch.empty_like(para_tensor) for _ in range(world_size)]
            torch.distributed.all_gather(para_array, para_tensor)
            para_tensor_gathered = torch.cat(para_array, dim=0)
            assert para_tensor_gathered.shape[0] == len(moe_raw.experts)
            for expertID in range(para_tensor_gathered.shape[0]):
                list(moe_raw.experts[expertID].parameters())[
                    idx
                ].data = para_tensor_gathered[expertID]

    moe_out, raw_out = _perform_forward(moe, moe_raw, batch_size, d_model, top_k)

    def get_experts_grad(experts: List[nn.Module]):
        return torch.stack(
            [
                torch.stack(
                    [
                        p.grad.sum() if p.grad is not None else torch.zeros(1).cuda()
                        for p in item.parameters()
                    ]
                ).sum()
                for item in experts
            ]
        )

    moe_grad, raw_grad = (
        get_experts_grad(moe.experts),
        get_experts_grad(moe_raw.experts),
    )
    if world_size > 1:
        torch.distributed.all_reduce(raw_grad)
        raw_grad = raw_grad[rank * num_expert : (rank + 1) * num_expert]

    moe_out_list = [moe_out, moe_grad]
    raw_out_list = [raw_out, raw_grad]
    names = ["forward", "backward"]
    _assert_numercial(names, moe_out_list, raw_out_list)


def _run_distributed(func: Callable, args: Dict):
    import subprocess
    import os

    ps, n = [], 2
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "36666"
    os.environ["OMPI_COMM_WORLD_SIZE"] = str(n)
    for i in range(n):
        os.environ["OMPI_COMM_WORLD_RANK"] = str(i)
        os.environ["CUDA_VISIBLE_DEVICES"] = str(i)
        p = subprocess.Popen(
            [sys.executable, __file__, func.__name__, json.dumps(args)],
            stdout=subprocess.PIPE,
        )
        ps.append(p)
    for p in ps:
        p.wait()
        retc = p.poll()
        assert retc == 0


@pytest.mark.parametrize("num_expert", [4, 8])
@pytest.mark.parametrize("top_k", [2])
@pytest.mark.parametrize("batch_size", [4])
@pytest.mark.parametrize("d_model", [16])
@pytest.mark.parametrize("d_hidden", [32])
def test_fmoe_linear_distributed(
    num_expert, top_k, batch_size, d_model, d_hidden,
):
    _run_distributed(
        test_fmoe_linear,
        {
            "num_expert": num_expert,
            "top_k": top_k,
            "batch_size": batch_size,
            "d_model": d_model,
            "d_hidden": d_hidden,
        },
    )


@pytest.mark.parametrize("num_expert", [4, 8])
@pytest.mark.parametrize("top_k", [2])
@pytest.mark.parametrize("batch_size", [4])
@pytest.mark.parametrize("d_model", [16])
@pytest.mark.parametrize("expert", ["NaiveExpert", "LinearExpert"])
def test_fmoe_distributed(
    num_expert, top_k, batch_size, d_model, expert,
):
    _run_distributed(
        test_fmoe,
        {
            "num_expert": num_expert,
            "top_k": top_k,
            "batch_size": batch_size,
            "d_model": d_model,
            "expert": expert,
        },
    )


if __name__ == "__main__":
    os.environ["RANK"] = os.environ.get("OMPI_COMM_WORLD_RANK", "0")
    os.environ["WORLD_SIZE"] = os.environ.get("OMPI_COMM_WORLD_SIZE", "1")
    if int(os.environ["WORLD_SIZE"]) > 1:
        torch.distributed.init_process_group(backend="nccl")
        rank = torch.distributed.get_rank()
        world_size = torch.distributed.get_world_size()
    if len(sys.argv) >= 3:
        locals()[sys.argv[1]](**json.loads(sys.argv[2]))
    else:
        test_fmoe_linear(batch_size=4, num_expert=4, d_model=8, top_k=2, d_hidden=16)
        test_fmoe(batch_size=4, num_expert=4, d_model=8, top_k=2, expert=NaiveExpert)
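Running the file under pytest exercises the single-process parametrizations. The `*_distributed` variants re-launch the script through `_run_distributed`, which spawns one `python <file> <test_name> '<json args>'` child per rank (two here), sets `OMPI_COMM_WORLD_RANK`/`OMPI_COMM_WORLD_SIZE`, and pins one GPU per child via `CUDA_VISIBLE_DEVICES`; the `__main__` block in each child then joins the NCCL process group and dispatches to the named test with the JSON-decoded arguments, and the parent asserts a zero exit code from every child.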