Commit 6b8d2f2e authored by Rick Ho

fmoefy

parent 4b650671
@@ -24,6 +24,21 @@ using Fast MoE for training.
 
 ## Usage
 
+### FMoEfy a transformer model
+
+Transformer is currently the most popular model to be extended by MoE. Using
+Fast MoE, a transformer-based model can be extended with MoE by a one-key plugin,
+as shown below.
+
+Assume that there is a PyTorch model `model` with MLP layers located at
+`model.language_model.transformer.layers[<idx>].mlp`. Use the following two
+lines to easily scale up the MLP layers to multiple experts.
+
+```python
+from fmoe.megatron import fmoefy
+model = fmoefy(model, num_experts=<number of experts per worker>)
+```
 
 ### Using Fast MoE as a PyTorch module
 Examples can be seen in [examples](examples/). The easiest way is to replace the
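The subsection above describes using Fast MoE as a plain PyTorch module. Below is a minimal sketch of single-worker usage, based only on the constructor and forward signatures visible in the `FMoETransformerMLP` hunks later in this commit; the `fmoe.layers` import path is inferred from the `from .layers import FMoETransformerMLP` line, and the model dimensions, tensor shapes, and single-tensor call signature are assumptions.

```python
# Hedged sketch (not from the repo's examples): single-process use of
# FMoETransformerMLP. Argument names follow the constructor shown in the
# hunks below; d_model/d_hidden values and tensor shapes are assumed.
import torch
from fmoe.layers import FMoETransformerMLP  # import path inferred, not confirmed

moe_mlp = FMoETransformerMLP(
    4,                      # number of experts on this (single) worker
    d_model=1024,
    d_hidden=4096,
    world_size=1,           # no distributed experts
    model_parallel_size=1,  # no model parallelism
    model_parallel_rank=0,
    mp_group=None,
    top_k=2,
)

x = torch.randn(8, 512, 1024)   # (batch, sequence, d_model), shapes assumed
output, bias = moe_mlp(x)       # return signature taken from the layers hunk below
```

With `model_parallel_size=1`, the all-gather branch shown later in this commit is never exercised, so no process group needs to be passed.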
@@ -38,28 +53,4 @@ NCCL backend is required to be built with PyTorch. Use environment variable
 `USE_NCCL=1` when running `setup.py` to enable distributing experts across workers. Note
 that the arguments of the MoE layers should then be excluded from the data
 parallel parameter synchronization list.
-
-## Feature Roadmap
-
-### Better All-to-all communication efficiency and computation performance
-The dispatching process from the source worker to the expert is time-consuming and
-topology-aware, as it is an all-to-all communication. Overlapping or other
-communication reduction technologies can be applied to reduce the overhead of
-this step. However, this demands considerable research and coding effort.
-
-### Dynamic expert distribution load balancing
-Load imbalance is observed, as there is no loss term for load balancing: some
-experts are called significantly more frequently than others. Therefore, a
-dynamic scheduler that duplicates or recycles some experts on some workers may
-be effective.
-
-### Model-parallel the experts
-To enable larger expert sizes.
-
-### Use ZeRO optimizer to reduce memory consumption
-
-### Integrate top-k gate into local scatter-gather
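On the note above about excluding MoE-layer parameters from the data parallel parameter synchronization list: this diff does not show the repository's own mechanism, so the following is only a hedged sketch of one generic way to do it with plain `torch.distributed`, assuming the NCCL process group is already initialized; the helper name and the expert-parameter name filter are hypothetical. The build step itself would be along the lines of `USE_NCCL=1 python setup.py install`, per the sentence above.

```python
# Hedged sketch (not Fast MoE's own mechanism): average gradients across the
# data-parallel group manually, skipping parameters that belong to MoE experts,
# instead of letting DistributedDataParallel synchronize everything.
import torch.distributed as dist

def allreduce_non_expert_grads(model, is_expert_param=lambda name: ".mlp.experts" in name):
    # `is_expert_param` is a hypothetical name filter; adapt it to how expert
    # parameters are actually named in your model.
    world_size = dist.get_world_size()
    for name, param in model.named_parameters():
        if param.grad is None or is_expert_param(name):
            continue  # expert weights differ across workers by design; do not average them
        dist.all_reduce(param.grad.data, op=dist.ReduceOp.SUM)
        param.grad.data /= world_size
```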
@@ -71,7 +71,7 @@ class FMoETransformerMLP(nn.Module):
         world_size=1,
         model_parallel_size=1,
         model_parallel_rank=1,
-        group=None,
+        mp_group=None,
         activation=torch.nn.functional.gelu,
         top_k=2,
         pre_lnorm=False,
@@ -83,7 +83,7 @@ class FMoETransformerMLP(nn.Module):
         self.world_size = world_size
         self.model_parallel_size = model_parallel_size
         self.model_parallel_rank = model_parallel_rank
-        self.group = group
+        self.mp_group = mp_group
         self.activation = activation
         self.pre_lnorm = pre_lnorm
         self.top_k = top_k
@@ -140,7 +140,7 @@ class FMoETransformerMLP(nn.Module):
         world_size = self.model_parallel_size
         tensor_list = [torch.empty_like(output) for _ in range(world_size)]
-        torch.distributed.all_gather(tensor_list, output, group=self.group)
+        torch.distributed.all_gather(tensor_list, output, group=self.mp_group)
         output = torch.cat(tensor_list, dim=1)
         return output.reshape(original_shape), self.bias
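To make the `group` → `mp_group` rename above concrete: the argument is expected to be a `torch.distributed` process group spanning the model-parallel ranks. A hedged, standalone sketch of the same all-gather pattern follows; the function name and the use of `dist.get_world_size` are illustrative, not taken from the repo.

```python
# Illustrative sketch of the gather performed in FMoETransformerMLP when model
# parallelism is enabled: each model-parallel rank holds a slice of the output,
# and all_gather + cat along dim=1 reassembles the full tensor.
import torch
import torch.distributed as dist

def gather_model_parallel_output(output: torch.Tensor, mp_group) -> torch.Tensor:
    world_size = dist.get_world_size(group=mp_group)
    tensor_list = [torch.empty_like(output) for _ in range(world_size)]
    dist.all_gather(tensor_list, output, group=mp_group)
    return torch.cat(tensor_list, dim=1)
```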
@@ -4,15 +4,30 @@ from .layers import FMoETransformerMLP
 def create_moe_mlp(args, model_parallel_rank, group):
     assert (
         args.seq_length * args.batch_size % args.model_parallel_size == 0
-    ), "Num experts should be multiple of mp size"
+    ), "Batch size x sequence length should be multiple of mp size"
-    num_experts = args.num_experts // args.model_parallel_size
     fmoe = FMoETransformerMLP(
-        num_experts,
+        args.num_experts,
         d_model=args.hidden_size,
         d_hidden=args.hidden_size * 4,
         world_size=args.world_size,
         model_parallel_size=args.model_parallel_size,
         model_parallel_rank=model_parallel_rank,
-        group=group,
+        mp_group=group,
     )
     return fmoe
+
+
+def fmoefy(model, num_experts=None):
+    from megatron import get_args
+    from megatron import mpu
+    args = get_args()
+    if num_experts is not None:
+        args.num_experts = num_experts
+    assert (
+        'num_experts' in args
+    ), 'num_experts should be specified in arguments or fmoefy function'
+    for l in model.language_model.transformer.layers:
+        l.mlp = create_moe_mlp(args,
+                               mpu.get_model_parallel_rank(),
+                               mpu.get_model_parallel_group())
+    return model
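The `fmoefy` function above depends on `megatron.get_args()` and `megatron.mpu`, so it only applies inside a Megatron-LM training run. For a transformer that is not a Megatron model, here is a hedged sketch of the same replacement pattern on a single worker; the function name, `d_model` argument, and layer attribute layout are hypothetical.

```python
# Hedged sketch, not part of the repo: applying fmoefy's replacement pattern to
# a generic transformer whose layers expose an `.mlp` attribute, without the
# Megatron-specific get_args()/mpu machinery.
from fmoe.layers import FMoETransformerMLP  # import path inferred, not confirmed

def fmoefy_layers(layers, num_experts, d_model):
    """Replace each layer's MLP with a single-worker MoE MLP."""
    for layer in layers:
        layer.mlp = FMoETransformerMLP(
            num_experts,
            d_model=d_model,
            d_hidden=4 * d_model,
            world_size=1,
            model_parallel_size=1,
            model_parallel_rank=0,
            mp_group=None,
        )
    return layers
```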