Commit d155925d authored by Jiezhong Qiu

Merge remote-tracking branch 'origin/master' into checkpoint

Conflicts:
	fmoe/transformer.py
parents f49a9ec4 98b4b0be
---
name: Bug report
about: Create a report to help us improve
title: ''
labels: ''
assignees: ''
---
**Describe the bug**
A clear and concise description of what the bug is.
**To Reproduce**
Steps to reproduce the behavior:
1. Compile with "..."
2. Run "..." with "..." processes on "..." nodes
**Expected behavior**
A clear and concise description of what you expected to happen.
**Logs**
If applicable, add logs to help explain your problem.
**Platform**
- Device: [e.g. NVIDIA V100]
- OS: [e.g. Debian 10.2 buster]
- CUDA version: [e.g. 11.1]
- NCCL version: [e.g. 2.7.8-1]
**Additional context**
Add any other context about the problem here.
---
name: Feature request
about: Suggest an idea for this project
title: ''
labels: ''
assignees: ''
---
**Is your feature request related to a problem? Please describe.**
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
**Describe the solution you'd like**
A clear and concise description of what you want to happen.
**Describe alternatives you've considered**
A clear and concise description of any alternative solutions or features you've considered.
**Additional context**
Add any other context or screenshots about the feature request here.
@@ -10,3 +10,4 @@ a.out
build
*swp
logs
dist
@@ -117,15 +117,29 @@ public:
    ncclComm_t getcomm(at::Device dev) {
        auto key = std::to_string(dev.index());
#ifdef ENABLE_NCCL_P2P_SUPPORT
        auto v = getNCCLComm(key, {dev}, c10d::OpType::ALLTOALL);
        ncclUniqueId ncclID;
        int rank = getRank();
        if (rank == 0) {
            ncclGetUniqueId(&ncclID);
        }
        broadcastUniqueNCCLID(&ncclID,
                c10d::OpType::SEND,
                "fastmoe_nccl_comm",
                rank);
        ncclComm_t comm;
        ncclCommInitRank(&comm, getSize(), ncclID, rank);
        return comm;
#else
        auto v = getNCCLComm(key, {dev});
#endif
        if (v.size() == 0) {
            std::cerr << "PyTorch has nothing\n";
            return 0;
        }
        int count;
        ncclCommCount(v[0]->getNcclComm(), &count);
        std::cerr << "PyTorch has " << v.size() << " comms, comm 0 size " << count << "\n";
        return v[0]->getNcclComm();
#endif
    }
};
......
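For orientation, the `ENABLE_NCCL_P2P_SUPPORT` branch above no longer appears to borrow a cached communicator from PyTorch: rank 0 generates an `ncclUniqueId`, the ID is broadcast through the process group's store via `broadcastUniqueNCCLID`, and every rank then joins the same communicator with `ncclCommInitRank`. This presumes the user has already created a NCCL process group on the Python side. A minimal sketch of that setup, assuming a standard `torch.distributed` launcher that sets `RANK`, `WORLD_SIZE`, `MASTER_ADDR`/`MASTER_PORT` and `LOCAL_RANK` (nothing in this sketch is part of the commit itself):

```python
import os
import torch
import torch.distributed as dist

def init_distributed():
    # One GPU per process; LOCAL_RANK comes from the launcher (assumption).
    local_rank = int(os.environ.get("LOCAL_RANK", 0))
    torch.cuda.set_device(local_rank)
    # FastMoE piggybacks on the NCCL backend for its all-to-all exchanges.
    dist.init_process_group(backend="nccl")
    return dist.get_rank(), dist.get_world_size()

if __name__ == "__main__":
    rank, world_size = init_distributed()
    print(f"rank {rank}/{world_size} ready on cuda:{torch.cuda.current_device()}")
```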
## v0.1.2
### Compilation
- Remove dependency on the CUDA examples repository.
### Distributed
- Fix a bug related to PyTorch v1.8.0. FastMoE can now run on multiple GPUs
across multiple nodes with PyTorch v1.8.0.
### Misc
- Fix tons of typos.
- Format the code.
## v0.1.1
### Distributed
......
@@ -47,7 +47,8 @@ class DistributedGroupedDataParallel(nn.Module):
        else:
            self.comms["world"] = world_group
        def allreduce_params(no_scale=False, reduce_after=False, fp32_allreduce=False):
        def allreduce_params(no_scale=False,
                reduce_after=False, fp32_allreduce=False):
            groups = dict()
            for p in self.module.parameters():
                if not p.requires_grad or p.grad is None:
......
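`allreduce_params` is the Megatron-style hook that synchronizes gradients across the configured communication groups after `backward()`. A hedged usage sketch follows; the import path matches the repository layout, but `build_fmoe_model` is a placeholder, and the assumption that the closure is bound to the wrapper as `model.allreduce_params` is mine rather than something this hunk shows:

```python
import torch
from fmoe.distributed import DistributedGroupedDataParallel

model = DistributedGroupedDataParallel(build_fmoe_model().cuda())  # placeholder model
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

def train_step(batch):
    loss = model(batch).mean()
    optimizer.zero_grad()
    loss.backward()
    # Assumed entry point: all-reduce gradients over the comm groups set up in __init__.
    model.allreduce_params()
    optimizer.step()
```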
@@ -40,7 +40,8 @@ def moe_prepare_forward(gate, num_expert, world_size, comm=None):
        )
    else:
        global_expert_count = local_expert_count
    fwd_expert_count = global_expert_count.view(world_size, num_expert).sum(dim=0)
    fwd_expert_count = global_expert_count.view(world_size,
            num_expert).sum(dim=0)
    fwd_batch_size = int(fwd_expert_count.sum().item())
    return (
        pos,
......
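The `view(world_size, num_expert).sum(dim=0)` line turns the flat per-(source rank, local expert) counts into the number of tokens each local expert will actually receive. A toy check with made-up numbers, `world_size = 2` and `num_expert = 3`:

```python
import torch

world_size, num_expert = 2, 3
# global_expert_count[r * num_expert + e]: tokens that rank r sends to local expert e.
global_expert_count = torch.tensor([4, 0, 1,   # from rank 0
                                    2, 5, 3])  # from rank 1
fwd_expert_count = global_expert_count.view(world_size, num_expert).sum(dim=0)
print(fwd_expert_count)                     # tensor([6, 5, 4])
print(int(fwd_expert_count.sum().item()))   # fwd_batch_size == 15
```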
@@ -23,7 +23,8 @@ class ZeroGate(nn.Module):
        idx = torch.zeros(
            inp.shape[0] * self.top_k, dtype=torch.int64, device=inp.device
        )
        score = torch.ones(inp.shape[0] * self.top_k, device=inp.device) / self.top_k
        score = torch.ones(inp.shape[0] * self.top_k,
                device=inp.device) / self.top_k
        return idx, score.reshape(-1, 1, self.top_k)
......
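The rewrapped `ZeroGate` lines keep the same behavior: every token is routed to expert 0 and all routing weights are uniform. With a hypothetical batch of 4 tokens and `top_k = 2`, the shapes come out as follows:

```python
import torch

batch, top_k = 4, 2
inp = torch.randn(batch, 16)  # 16 is an arbitrary model width for illustration
idx = torch.zeros(inp.shape[0] * top_k, dtype=torch.int64, device=inp.device)
score = torch.ones(inp.shape[0] * top_k, device=inp.device) / top_k
score = score.reshape(-1, 1, top_k)
print(idx.shape)    # torch.Size([8])       -> every slot selects expert 0
print(score.shape)  # torch.Size([4, 1, 2]) -> each token weights both copies by 0.5
```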
@@ -114,7 +114,8 @@ def _fmoe_general_global_forward(inp, gate, expert_fn, num_expert, world_size):
        fwd_batch_size,
    ) = moe_prepare_forward(gate, num_expert, world_size)
    x = MOEScatter.apply(
        inp, pos, local_expert_count, global_expert_count, fwd_batch_size, world_size
        inp, pos,
        local_expert_count, global_expert_count, fwd_batch_size, world_size
    )
    x = expert_fn(x, fwd_expert_count)
    x = MOEGather.apply(
@@ -165,7 +166,8 @@ class FMoE(nn.Module):
        self.top_k = top_k
        self.gate = gate(d_model, num_expert, world_size, top_k)
        if expert is not None:
            self.experts = nn.ModuleList([expert(d_model) for _ in range(num_expert)])
            self.experts = nn.ModuleList([expert(d_model)
                for _ in range(num_expert)])
            self.experts_fused = False
        else:
            self.experts_fused = True
......
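In the `FMoE` constructor above, `expert` is a callable that receives only `d_model`, and one instance is built per local expert. The packaged `FMoETransformerMLP` (imported by the test file later in this diff) wraps the same machinery. A hedged construction sketch: the keyword names `num_expert`, `d_model`, `d_hidden`, `world_size` and `top_k` follow the identifiers visible in this diff, while the defaults and the forward/output shape are assumptions not verified against this exact revision:

```python
import torch
from fmoe import FMoETransformerMLP

# Single worker with 4 local experts; world_size > 1 would shard experts across ranks.
moe_ffn = FMoETransformerMLP(
    num_expert=4,
    d_model=512,
    d_hidden=2048,
    world_size=1,
    top_k=2,
).cuda()

x = torch.randn(8, 512, device="cuda")  # 8 tokens of width d_model
y = moe_ffn(x)
print(y.shape)  # expected: torch.Size([8, 512])
```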
@@ -45,7 +45,7 @@ def _megatron_init_method(self, rng, sigma):
    device = self.weight.device
    dtype = self.weight.dtype
    weight = rng.normal(loc=0.0, scale=sigma, size=tuple(self.weight.size()))
    self.weight.data = torch.tensor(weight, dtype=dtype, device=device)
    self.weight.data = torch.from_numpy(weight).to(dtype=dtype, device=device)
    if self.bias is not None:
        # Always initialize bias to zero.
@@ -64,13 +64,13 @@ def _random_init_weight(self, rng):
    device = self.weight.device
    dtype = self.weight.dtype
    weight = rng.uniform(-bound, bound, size=tuple(self.weight.size()))
    self.weight.data = torch.tensor(weight, dtype=dtype, device=device)
    self.weight.data = torch.from_numpy(weight).to(dtype=dtype, device=device)
    if self.bias is not None:
        fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.weight[0])
        bound = 1 / math.sqrt(fan_in)
        bias = rng.uniform(-bound, bound, size=tuple(self.bias.size()))
        self.bias.data = torch.tensor(bias, dtype=dtype, device=device)
        self.bias.data = torch.from_numpy(bias).to(dtype=dtype, device=device)
class MegatronMLP(FMoETransformerMLP):
@@ -81,7 +81,8 @@ class MegatronMLP(FMoETransformerMLP):
    def __init__(self, args, group):
        assert (
            args.seq_length * args.micro_batch_size % args.tensor_model_parallel_size
            args.seq_length * args.micro_batch_size
            % args.tensor_model_parallel_size
            == 0
        ), "Batch size x sequence length should be multiple of mp size"
        if not args.distributed_experts:
......
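The rewrapped assertion in `MegatronMLP.__init__` only changes line breaks; the check itself requires that the tokens in one micro-batch divide evenly across the tensor-model-parallel group, since each MP rank handles an equal slice of them. A quick check with hypothetical Megatron settings:

```python
# Hypothetical arguments, not taken from this commit.
seq_length = 1024
micro_batch_size = 4
tensor_model_parallel_size = 8

tokens = seq_length * micro_batch_size           # 4096 tokens per micro-batch
assert tokens % tensor_model_parallel_size == 0  # 4096 % 8 == 0, so the assert passes
print(tokens // tensor_model_parallel_size)      # 512 tokens per tensor-parallel rank
```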
@@ -15,10 +15,8 @@ class _Expert(nn.Module):
    def __init__(self, num_expert, d_model, d_hidden, activation, rank=0):
        super().__init__()
        self.htoh4 = FMoELinear(num_expert, d_model, d_hidden,
                bias=True, rank=rank)
        self.h4toh = FMoELinear(num_expert, d_hidden, d_model,
                bias=True, rank=rank)
        self.htoh4 = FMoELinear(num_expert, d_model, d_hidden, bias=True, rank=rank)
        self.h4toh = FMoELinear(num_expert, d_hidden, d_model, bias=True, rank=rank)
        self.activation = activation
    def forward(self, inp, fwd_expert_count):
......
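Whichever formatting survives the merge, `_Expert` stays a two-layer feed-forward block: `htoh4` expands from `d_model` to `d_hidden` and `h4toh` projects back, with `activation` in between, batched over `num_expert` weight slices by `FMoELinear`. A single-expert stand-in in plain PyTorch, for intuition only (the dimensions and the GELU choice are illustrative, not taken from this commit):

```python
import torch
import torch.nn as nn

class SingleExpertFFN(nn.Module):
    """Plain-PyTorch stand-in for one expert of the fused _Expert module."""

    def __init__(self, d_model=512, d_hidden=2048, activation=nn.GELU()):
        super().__init__()
        self.htoh4 = nn.Linear(d_model, d_hidden, bias=True)
        self.h4toh = nn.Linear(d_hidden, d_model, bias=True)
        self.activation = activation

    def forward(self, x):
        return self.h4toh(self.activation(self.htoh4(x)))

x = torch.randn(8, 512)
print(SingleExpertFFN()(x).shape)  # torch.Size([8, 512])
```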
@@ -14,7 +14,7 @@ if os.environ.get('USE_NCCL', '0') == '1':
if __name__ == '__main__':
    setuptools.setup(
        name='fastmoe',
        version='0.1.1',
        version='0.1.2',
        description='An efficient Mixture-of-Experts system for PyTorch',
        author='Jiaao He, Jiezhong Qiu and Aohan Zeng',
        author_email='hja20@mails.tsinghua.edu.cn',
......
import torch
import torch.nn as nn
from fmoe import FMoETransformerMLP
from fmoe.gates import NaiveGate
from moe import BruteForceMoELinear
import torch
import torch.nn as nn
import time
import sys
import os
......
@@ -25,7 +25,10 @@ PYTHON_VERSION=$($PYTHON_EXEC --version)
PYTHON_REVISION=${PYTHON_VERSION:7:3}
SCRIPT_PATH=$(dirname $(dirname $(realpath $0)))
source ~/scripts/torch.env
export PYTHONPATH=$SCRIPT_PATH:$SCRIPT_PATH/build/lib.linux-x86_64-$PYTHON_REVISION:$PYTHONPATH
export LD_LIBRARY_PATH=/home/laekov/.local/lib/python$PYTHON_REVISION/site-packages/torch/lib:$LD_LIBRARY_PATH
exec $PYTHON_EXEC $@ 2>logs/$RANK.log
core0=$(expr $OMPI_COMM_WORLD_LOCAL_RANK \* 4)
cores=$core0-$(expr $core0 + 3)
exec numactl -C $cores $PYTHON_EXEC $@ 2>logs/$RANK.log