Unverified Commit 3dcc9eff authored by msbaines, committed by GitHub

[docs] add MOE to docs (#693)

parent 2c663f5a
@@ -7,6 +7,7 @@ API Reference
optim/adascale
optim/oss
optim/grad_scaler
nn/moe
nn/pipe
nn/sharded_ddp
nn/fsdp
......
Mixture Of Experts
==================
.. autoclass:: fairscale.nn.MOELayer
@@ -26,6 +26,9 @@ Components
* `Fully Sharded Data Parallel FSDP <../../en/latest/api/nn/fsdp.html>`_
* `FSDP Tips <../../en/latest/api/nn/fsdp_tips.html>`_
* Mixture-of-Experts:
* `MOE <../../en/latest/api/nn/moe.html>`_
* Optimization at scale:
* `AdaScale SGD <../../en/latest/api/optim/adascale.html>`_
......
@@ -43,13 +43,12 @@ class MOELayer(Base):
output = moe(input)
l_aux = moe.l_aux
.. Gshard_: https://arxiv.org/pdf/2006.16668.pdf
.. _Gshard: https://arxiv.org/pdf/2006.16668.pdf
Args:
gate (torch.nn.Module):
gate network
expert (torch.nn.Module):
expert network
gate: gate network
expert: expert network
group: group to use for all-to-all communication
"""
def __init__(self, gate: Module, experts: Union[Module, ModuleList], group: Optional[Any] = None) -> None:
......
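For readers skimming this diff, here is a minimal, hypothetical usage sketch of the layer whose docstring is being edited above. It is not a verified recipe from the repo: the `Top2Gate` import path, the 3-d input layout, and the convention that the gate's expert count equals `world_size * num_local_experts` are assumptions drawn from the class docstring and the GShard paper it links; the `nn.Linear` experts are illustrative stand-ins.

```python
# Hypothetical sketch of fairscale.nn.MOELayer usage (assumptions noted inline).
# MOELayer shards its experts across a process group and exchanges tokens with
# all-to-all, so torch.distributed must already be initialized (e.g. one
# process per GPU launched via torchrun) before constructing the layer.
import torch
import torch.distributed as dist
import torch.nn as nn

from fairscale.nn import MOELayer, Top2Gate  # Top2Gate import path assumed

model_dim = 512
num_local_experts = 4
world_size = dist.get_world_size()

# Gate routes each token to its top-2 experts; the expert count passed here is
# assumed to be the global count (local experts times world size).
gate = Top2Gate(model_dim, world_size * num_local_experts)

# The experts living on this rank; any nn.Module works, Linear is a stand-in.
experts = nn.ModuleList(
    [nn.Linear(model_dim, model_dim) for _ in range(num_local_experts)]
)

moe = MOELayer(gate, experts)  # group=None -> default process group

x = torch.randn(8, 16, model_dim)  # assumed (sequence, tokens, model) layout
output = moe(x)                    # tokens dispatched to experts and combined
l_aux = moe.l_aux                  # gate's auxiliary load-balancing loss
```

The `l_aux` attribute read after the forward pass is the gate's load-balancing term from the GShard paper; it is intended to be added to the task loss during training so that tokens spread evenly across experts.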