"src/vscode:/vscode.git/clone" did not exist on "24c7d578baf6a8b79890101dd280278fff031d12"
Commit 34477955 authored by Sengxian's avatar Sengxian
Browse files

Add DataParallel test for FMoE

parent 40841453
"""DataParallel smoke test for the FMoE layer.

Reconstructed from a garbled diff paste: this is the post-commit version of
the file (the pre-commit ``test_dp``/``MOELayer`` variant was removed by the
same commit).
"""
import os

import pytest
import torch

from fmoe.gates import NaiveGate
from fmoe.layers import FMoE
from fmoe.transformer import _Expert

# Number of GPUs DataParallel replicates over; override via the N_GPUS env var.
n_devices = int(os.environ.get("N_GPUS", "2"))


@pytest.mark.parametrize("num_expert", [4, 8])
@pytest.mark.parametrize("top_k", [2, 3])
@pytest.mark.parametrize("batch_size", [4])
@pytest.mark.parametrize("d_model", [16])
@pytest.mark.parametrize("d_hidden", [32])
def test_fmoe_dp(
    num_expert,
    top_k,
    batch_size,
    d_model,
    d_hidden,
    activation=torch.nn.functional.gelu,
):
    """Run an FMoE layer wrapped in ``torch.nn.DataParallel``.

    The test passes if a few forward passes complete without raising;
    output values are not checked. Requires at least ``n_devices`` CUDA
    devices to be visible.
    """
    # Fixed seeds so expert initialization and inputs are reproducible.
    torch.manual_seed(42)
    torch.cuda.manual_seed(42)

    experts = _Expert(num_expert, d_model, d_hidden, activation).cuda()

    def expert_fn(inp, gate):
        # Forward the gathered tokens through the shared expert module.
        return experts(inp, gate)

    moe = FMoE(
        num_expert=num_expert,
        d_model=d_model,
        gate=NaiveGate,
        world_size=1,
        mp_group=None,
        expert_fn=expert_fn,
        top_k=top_k,
    ).cuda()
    moe_dp = torch.nn.DataParallel(moe, device_ids=list(range(n_devices)))

    # A handful of forward passes to exercise the replicate/scatter/gather path.
    for _ in range(5):
        output = moe_dp(torch.rand(batch_size, d_model).cuda())
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment