Commit d155925d authored by Jiezhong Qiu

Merge remote-tracking branch 'origin/master' into checkpoint

Conflicts:
	fmoe/transformer.py
Parents: f49a9ec4, 98b4b0be
---
name: Bug report
about: Create a report to help us improve
title: ''
labels: ''
assignees: ''
---

**Describe the bug**
A clear and concise description of what the bug is.

**To Reproduce**
Steps to reproduce the behavior:
1. Compile with "..."
2. Run "..." with "..." processes on "..." nodes

**Expected behavior**
A clear and concise description of what you expected to happen.

**Logs**
If applicable, add logs to help explain your problem.

**Platform**
- Device: [e.g. NVIDIA V100]
- OS: [e.g. Debian 10.2 buster]
- CUDA version: [e.g. 11.1]
- NCCL version: [e.g. 2.7.8-1]

**Additional context**
Add any other context about the problem here.

---
name: Feature request
about: Suggest an idea for this project
title: ''
labels: ''
assignees: ''
---

**Is your feature request related to a problem? Please describe.**
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]

**Describe the solution you'd like**
A clear and concise description of what you want to happen.

**Describe alternatives you've considered**
A clear and concise description of any alternative solutions or features you've considered.

**Additional context**
Add any other context or screenshots about the feature request here.
@@ -10,3 +10,4 @@ a.out
 build
 *swp
 logs
+dist
@@ -117,15 +117,29 @@ public:
     ncclComm_t getcomm(at::Device dev) {
         auto key = std::to_string(dev.index());
 #ifdef ENABLE_NCCL_P2P_SUPPORT
-        auto v = getNCCLComm(key, {dev}, c10d::OpType::ALLTOALL);
+        ncclUniqueId ncclID;
+        int rank = getRank();
+        if (rank == 0) {
+            ncclGetUniqueId(&ncclID);
+        }
+        broadcastUniqueNCCLID(&ncclID,
+                c10d::OpType::SEND,
+                "fastmoe_nccl_comm",
+                rank);
+        ncclComm_t comm;
+        ncclCommInitRank(&comm, getSize(), ncclID, rank);
+        return comm;
 #else
         auto v = getNCCLComm(key, {dev});
-#endif
         if (v.size() == 0) {
             std::cerr << "PyTorch has nothing\n";
             return 0;
         }
+        int count;
+        ncclCommCount(v[0]->getNcclComm(), &count);
+        std::cerr << "PyTorch has " << v.size() << " comms, comm 0 size " << count << "\n";
         return v[0]->getNcclComm();
+#endif
     }
 };
...
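For context on the P2P branch above: instead of borrowing one of PyTorch's existing NCCL communicators, it bootstraps a dedicated one. Rank 0 generates a `ncclUniqueId`, `broadcastUniqueNCCLID` ships it to every rank, and all ranks then enter `ncclCommInitRank` together. Below is a minimal sketch of the same rendezvous pattern at the Python level, using `torch.distributed` purely for illustration; `make_comm_id` and `init_with_id` are hypothetical stand-ins for the C++ calls.

```python
import torch.distributed as dist

def bootstrap_comm(make_comm_id, init_with_id):
    """Sketch of the ID-broadcast rendezvous used above.

    make_comm_id: rank-0-only factory for an opaque communicator ID
                  (plays the role of ncclGetUniqueId).
    init_with_id: collective join (plays the role of ncclCommInitRank).
    """
    rank, world_size = dist.get_rank(), dist.get_world_size()
    # Only rank 0 creates the ID; the one-element list is the broadcast buffer.
    holder = [make_comm_id() if rank == 0 else None]
    # Deliver the ID to every rank over an already-initialized process group.
    dist.broadcast_object_list(holder, src=0)
    # All ranks join the new communicator collectively.
    return init_with_id(holder[0], world_size, rank)
```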
+## v0.1.2
+### Compilation
+- Remove dependency on the CUDA examples repository.
+### Distributed
+- Fix a bug related to PyTorch v1.8.0. FastMoE can now operate on multiple GPUs
+  on multiple nodes with PyTorch v1.8.0.
+### Misc
+- Fix tons of typos.
+- Format the code.
 ## v0.1.1
 ### Distributed
...
@@ -47,7 +47,8 @@ class DistributedGroupedDataParallel(nn.Module):
         else:
             self.comms["world"] = world_group
-        def allreduce_params(no_scale=False, reduce_after=False, fp32_allreduce=False):
+        def allreduce_params(no_scale=False,
+                reduce_after=False, fp32_allreduce=False):
             groups = dict()
             for p in self.module.parameters():
                 if not p.requires_grad or p.grad is None:
...
@@ -40,7 +40,8 @@ def moe_prepare_forward(gate, num_expert, world_size, comm=None):
         )
     else:
         global_expert_count = local_expert_count
-    fwd_expert_count = global_expert_count.view(world_size, num_expert).sum(dim=0)
+    fwd_expert_count = global_expert_count.view(world_size,
+            num_expert).sum(dim=0)
     fwd_batch_size = int(fwd_expert_count.sum().item())
     return (
         pos,
...
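To make the reflowed line above concrete: `global_expert_count` holds one token count per (worker, expert) pair, and `view(world_size, num_expert).sum(dim=0)` collapses the worker axis so each local expert learns its total incoming batch. A tiny worked example with invented counts:

```python
import torch

world_size, num_expert = 2, 2
# Tokens each remote worker sends to each local expert (flattened, illustrative).
global_expert_count = torch.tensor([3, 1, 2, 4])

fwd_expert_count = global_expert_count.view(world_size, num_expert).sum(dim=0)
print(fwd_expert_count)                     # tensor([5, 5]): tokens per local expert
print(int(fwd_expert_count.sum().item()))   # 10 == fwd_batch_size
```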
@@ -23,7 +23,8 @@ class ZeroGate(nn.Module):
         idx = torch.zeros(
             inp.shape[0] * self.top_k, dtype=torch.int64, device=inp.device
         )
-        score = torch.ones(inp.shape[0] * self.top_k, device=inp.device) / self.top_k
+        score = torch.ones(inp.shape[0] * self.top_k,
+                device=inp.device) / self.top_k
         return idx, score.reshape(-1, 1, self.top_k)
...
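For orientation, the hunk above shows that `ZeroGate` routes every token to expert 0 with a uniform weight of `1 / top_k`. A quick shape check, sketched with `top_k = 2` and a batch of 3 tokens:

```python
import torch

top_k, batch = 2, 3
idx = torch.zeros(batch * top_k, dtype=torch.int64)   # every token -> expert 0
score = torch.ones(batch * top_k) / top_k             # uniform 1/top_k weights
print(idx.shape)                                      # torch.Size([6])
print(score.reshape(-1, 1, top_k).shape)              # torch.Size([3, 1, 2])
```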
@@ -114,7 +114,8 @@ def _fmoe_general_global_forward(inp, gate, expert_fn, num_expert, world_size):
         fwd_batch_size,
     ) = moe_prepare_forward(gate, num_expert, world_size)
     x = MOEScatter.apply(
-        inp, pos, local_expert_count, global_expert_count, fwd_batch_size, world_size
+        inp, pos,
+        local_expert_count, global_expert_count, fwd_batch_size, world_size
     )
     x = expert_fn(x, fwd_expert_count)
     x = MOEGather.apply(
@@ -165,7 +166,8 @@ class FMoE(nn.Module):
         self.top_k = top_k
         self.gate = gate(d_model, num_expert, world_size, top_k)
         if expert is not None:
-            self.experts = nn.ModuleList([expert(d_model) for _ in range(num_expert)])
+            self.experts = nn.ModuleList([expert(d_model)
+                                          for _ in range(num_expert)])
             self.experts_fused = False
         else:
             self.experts_fused = True
...
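The second hunk shows `FMoE`'s two expert paths: given an `expert` constructor it builds one module per local expert via `nn.ModuleList`, otherwise it falls back to the fused path. A minimal sketch of supplying a custom expert; the keyword names and import below are inferred from this diff, not checked against the full `FMoE` signature:

```python
import torch.nn as nn
from fmoe import FMoE  # assumed export, matching the imports seen elsewhere in this diff

class TinyExpert(nn.Module):
    """Toy expert: FMoE calls expert(d_model) once per local expert."""
    def __init__(self, d_model):
        super().__init__()
        self.fc = nn.Linear(d_model, d_model)

    def forward(self, inp, fwd_expert_count=None):
        return self.fc(inp)

# Builds nn.ModuleList([TinyExpert(16) for _ in range(4)]) internally,
# as in the hunk above, and sets experts_fused = False.
moe = FMoE(num_expert=4, d_model=16, expert=TinyExpert)
```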
@@ -45,7 +45,7 @@ def _megatron_init_method(self, rng, sigma):
     device = self.weight.device
     dtype = self.weight.dtype
     weight = rng.normal(loc=0.0, scale=sigma, size=tuple(self.weight.size()))
-    self.weight.data = torch.tensor(weight, dtype=dtype, device=device)
+    self.weight.data = torch.from_numpy(weight).to(dtype=dtype, device=device)
     if self.bias is not None:
         # Always initialize bias to zero.
@@ -64,13 +64,13 @@ def _random_init_weight(self, rng):
     device = self.weight.device
     dtype = self.weight.dtype
     weight = rng.uniform(-bound, bound, size=tuple(self.weight.size()))
-    self.weight.data = torch.tensor(weight, dtype=dtype, device=device)
+    self.weight.data = torch.from_numpy(weight).to(dtype=dtype, device=device)
     if self.bias is not None:
         fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.weight[0])
         bound = 1 / math.sqrt(fan_in)
         bias = rng.uniform(-bound, bound, size=tuple(self.bias.size()))
-        self.bias.data = torch.tensor(bias, dtype=dtype, device=device)
+        self.bias.data = torch.from_numpy(bias).to(dtype=dtype, device=device)
 class MegatronMLP(FMoETransformerMLP):
@@ -81,7 +81,8 @@ class MegatronMLP(FMoETransformerMLP):
     def __init__(self, args, group):
         assert (
-            args.seq_length * args.micro_batch_size % args.tensor_model_parallel_size
+            args.seq_length * args.micro_batch_size
+            % args.tensor_model_parallel_size
             == 0
         ), "Batch size x sequence length should be multiple of mp size"
         if not args.distributed_experts:
...
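The repeated `torch.tensor(...)` to `torch.from_numpy(...).to(...)` substitution above trades an always-copying constructor for a zero-copy wrap of the NumPy buffer followed by one explicit dtype/device conversion. A quick equivalence sketch, with shape and scale invented for the example:

```python
import numpy as np
import torch

rng = np.random.default_rng(0)
weight = rng.normal(loc=0.0, scale=0.02, size=(4, 4))  # float64 ndarray

# Old style: torch.tensor always copies the NumPy data.
w_old = torch.tensor(weight, dtype=torch.float32)

# New style: from_numpy shares the buffer; .to() then converts dtype
# (and would move to a device) in one explicit step.
w_new = torch.from_numpy(weight).to(dtype=torch.float32)

assert torch.equal(w_old, w_new)  # identical values either way
```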
@@ -15,10 +15,8 @@ class _Expert(nn.Module):
     def __init__(self, num_expert, d_model, d_hidden, activation, rank=0):
         super().__init__()
-        self.htoh4 = FMoELinear(num_expert, d_model, d_hidden,
-                bias=True, rank=rank)
-        self.h4toh = FMoELinear(num_expert, d_hidden, d_model,
-                bias=True, rank=rank)
+        self.htoh4 = FMoELinear(num_expert, d_model, d_hidden, bias=True, rank=rank)
+        self.h4toh = FMoELinear(num_expert, d_hidden, d_model, bias=True, rank=rank)
         self.activation = activation
     def forward(self, inp, fwd_expert_count):
...
@@ -14,7 +14,7 @@ if os.environ.get('USE_NCCL', '0') == '1':
 if __name__ == '__main__':
     setuptools.setup(
         name='fastmoe',
-        version='0.1.1',
+        version='0.1.2',
         description='An efficient Mixture-of-Experts system for PyTorch',
         author='Jiaao He, Jiezhong Qiu and Aohan Zeng',
         author_email='hja20@mails.tsinghua.edu.cn',
...
+import torch
+import torch.nn as nn
 from fmoe import FMoETransformerMLP
 from fmoe.gates import NaiveGate
 from moe import BruteForceMoELinear
-import torch
-import torch.nn as nn
 import time
 import sys
 import os
...
@@ -25,7 +25,10 @@ PYTHON_VERSION=$($PYTHON_EXEC --version)
 PYTHON_REVISION=${PYTHON_VERSION:7:3}
 SCRIPT_PATH=$(dirname $(dirname $(realpath $0)))
-source ~/scripts/torch.env
 export PYTHONPATH=$SCRIPT_PATH:$SCRIPT_PATH/build/lib.linux-x86_64-$PYTHON_REVISION:$PYTHONPATH
+export LD_LIBRARY_PATH=/home/laekov/.local/lib/python$PYTHON_REVISION/site-packages/torch/lib:$LD_LIBRARY_PATH
-exec $PYTHON_EXEC $@ 2>logs/$RANK.log
+core0=$(expr $OMPI_COMM_WORLD_LOCAL_RANK \* 4)
+cores=$core0-$(expr $core0 + 3)
+exec numactl -C $cores $PYTHON_EXEC $@ 2>logs/$RANK.log
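Reading the launcher change: each local rank is pinned to a four-core window (rank 0 on cores 0-3, rank 1 on cores 4-7, and so on) via `numactl -C`, which assumes an OpenMPI-style launch where `OMPI_COMM_WORLD_LOCAL_RANK` is set.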