add fastmoe project

0f091a1d · Sugon_ldc · 0f091a1d · 0f091a1d · 0f091a1d · 0f091a1d
Commit 0f091a1d authored May 17, 2023 by Sugon_ldc
20 changed files
--- a/LICENSE
+++ b/LICENSE
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright 2021, Jiaao He
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
--- a/README.md
+++ b/README.md
+# FastMoE
+一个易于使用和高效的系统，支持PyTorch的混合专家(MoE)模型。
+## 安装
+方式1：
+在[这里](http://10.0.35.93:8000/customized/fastmoe/22.10/fastmoe-0.3.0%2Bdtk22.10-cp37-cp37m-linux_x86_64.whl)下载whl包，通过pip3 install安装(链接中的whl包为dtk22.10版本)
+方式2：
+```
+python3 setup.py build
+python3 setup.py install 
+```
+## 测试
+所有测试文件在test文件夹中
+```
+pip3 install pytest
+```
+```
+pytest -q test_dp.py
+pytest -q test_ddp.py
+pytest -q test_local_exchange.py
+pytest -q test_numerical.py
+pytest -q test_swipe.py
+pytest -q test_zero.py
+pytest -q test_gates.py
+```
+测试过程中不出现failed，全部pass即为通过
+tips:Fastmoe系统应用于大规模分布式场景，因此在进行测试的时候，应使用不少于8张计算加速卡，否则会出现skip的问题。
\ No newline at end of file
--- a/README_origin.md
+++ b/README_origin.md
+<img height='60px' src='doc/logo/rect.png'/>
+[Release note](doc/release-note.md)
+| [中文文档](doc/readme-cn.md)
+| [Slack workspace](https://join.slack.com/t/fastmoe/shared_invite/zt-mz0ai6ol-ggov75D62YsgHfzShw8KYw)
+## Introduction
+An easy-to-use and efficient system to support the Mixture of Experts (MoE) 
+model for PyTorch. 
+## Installation
+### Prerequisites
+PyTorch with CUDA is required. The repository is currently tested with PyTorch
+v1.8.0 and CUDA 10, with designed compatibility to older versions.
+If the distributed expert feature is enabled, NCCL with P2P communication
+support, typically versions `>=2.7.5`, is needed. 
+### Installing
+FastMoE contains a set of customized PyTorch operators, including both C and
+Python components. Use `python setup.py install` to easily install and enjoy
+using FastMoE for training.
+The distributed expert feature is disabled by default. If you want to enable
+it, pass environment variable `USE_NCCL=1` to the setup script.
+Note that an extra NCCL developer package is needed, which has to be consistent
+with your PyTorch's NCCL version, which can be inspected by running
+`torch.cuda.nccl.version()`. The 
+[official PyTorch docker image](https://hub.docker.com/r/pytorch/pytorch) is
+recommended, as the environment is well-setup there. Otherwise, you can access
+the [download link of all NCCL
+versions](https://developer.nvidia.com/nccl/nccl-legacy-downloads) to download
+the NCCL package that is suitable for you.
+## Usage 
+### FMoEfy a Transformer model
+Transformer is currently one of the most popular models to be extended by MoE. Using
+FastMoE, a Transformer-based model can be extended as MoE by a one-key plugin,
+shown as follows.
+For example, when using [Megatron-LM](https://github.com/nvidia/megatron-lm),
+using the following lines can help you easily scale up the MLP layers to
+multiple experts.
+```python
+model = ...
+from fmoe.megatron import fmoefy
+model = fmoefy(model, num_experts=<number of experts per worker>)
+train(model, ...)
+```
+A detailed tutorial to _moefy_ Megatron-LM can be found
+[here](examples/megatron).
+### Using FastMoE as a PyTorch module
+An example MoE transformer model can be seen in the
+[Transformer-XL](examples/transformer-xl) example. The easiest way is to replace
+the MLP layer by the `FMoE` layers.
+### Using FastMoE in Parallel
+FastMoE supports both data parallel and model parallel. 
+#### Data Parallel
+In FastMoE's data parallel mode, both the gate and the experts are replicated on each worker. 
+The following figure shows the forward pass of a 3-expert MoE with 2-way data parallel.
+<p align="center">
+<img src="doc/fastmoe_data_parallel.png" width="600">
+</p>
+For data parallel, no extra coding is needed. FastMoE works seamlessly with PyTorch's `DataParallel` or `DistributedDataParallel`.
+The only drawback of data parallel is that the number of experts is constrained by each worker's memory.
+#### Model Parallel
+In FastMoE's model parallel mode, the gate network is still replicated on each worker but
+experts are placed separately across workers.
+Thus, by introducing additional communication cost, FastMoE enjoys a large expert pool whose size is proportional to the number of workers.
+The following figure shows the forward pass of a 6-expert MoE with 2-way model parallel. Note that experts 1-3 are located in worker 1 while experts 4-6 are located in worker 2.
+<p align="center">
+<img src="doc/fastmoe_model_parallel.png" width="600">
+</p>
+FastMoE's model parallel requires sophisticated parallel strategies that neither PyTorch nor
+Megatron-LM provides. The `fmoe.DistributedGroupedDataParallel` module is
+introduced to replace PyTorch's DDP module.
+## Citation
+```
+@article{he2021fastmoe,
+      title={FastMoE: A Fast Mixture-of-Expert Training System}, 
+      author={Jiaao He and Jiezhong Qiu and Aohan Zeng and Zhilin Yang and Jidong Zhai and Jie Tang},
+      journal={arXiv preprint arXiv:2103.13262},
+      year={2021}
+}
+```
+## Troubleshooting / Discussion
+If you have any problem using FastMoE, or you are interested in getting involved in developing FastMoE, feel free to join [our Slack channel](https://join.slack.com/t/fastmoe/shared_invite/zt-mz0ai6ol-ggov75D62YsgHfzShw8KYw).
--- a/cuda/.gitignore
+++ b/cuda/.gitignore
+*.swp
+build
--- a/cuda/balancing.cu
+++ b/cuda/balancing.cu
+#include <cstdio>
+#include "balancing.cuh"
+#include "global_exchange.h"
+#include <torch/extension.h>
+/* 
+ * note that due to limit of cuda atomic operator, capacity should be int32
+ */
+// Host entry point for the capacity-limiting kernel.
+//
+// expert_count is read as 64-bit integers and capacity as 32-bit integers;
+// both must be contiguous CUDA tensors (enforced by CHECK_INPUT). The kernel
+// behind fmoe_cuda_limit_by_capacity_impl atomically subtracts from
+// `capacity`, so that tensor is modified in place. The accepted counts are
+// returned in a freshly allocated tensor shaped like `expert_count`.
+torch::Tensor _limit_by_capacity(
+        torch::Tensor expert_count, torch::Tensor capacity,
+        long n_expert, long n_worker) {
+    CHECK_INPUT(expert_count);
+    CHECK_INPUT(capacity);
+
+    torch::Tensor accepted = torch::empty_like(expert_count);
+    auto streams = getCudaStreamManager(expert_count.device().index());
+
+    const long* requested_ptr = expert_count.data_ptr<long>();
+    int* capacity_ptr = capacity.data_ptr<int>();
+    long* accepted_ptr = accepted.data_ptr<long>();
+
+    fmoe_cuda_limit_by_capacity_impl(
+            requested_ptr, capacity_ptr, accepted_ptr,
+            n_expert, n_worker, streams);
+    return accepted;
+}
+// Host entry point for the gate-pruning kernel.
+//
+// gate_idx is read as 64-bit integers and expert_count as 32-bit integers.
+// Both are validated with CHECK_INPUT (CUDA + contiguous), matching
+// _limit_by_capacity: the kernel walks the raw data pointers linearly, so a
+// strided or host tensor would otherwise be silently misread. The kernel
+// atomically decrements `expert_count`, so that tensor is modified in place.
+// Returns a new tensor with the same shape/dtype/device as `gate_idx`.
+torch::Tensor _prune_gate_by_capacity(
+        torch::Tensor gate_idx, torch::Tensor expert_count,
+        long n_expert, long n_worker) {
+    CHECK_INPUT(gate_idx);
+    CHECK_INPUT(expert_count);
+    auto smgr = getCudaStreamManager(expert_count.device().index());
+    auto batch_size = gate_idx.numel();
+    auto opt = torch::TensorOptions()
+        .dtype(gate_idx.dtype())
+        .device(gate_idx.device());
+    auto new_gate_idx = torch::empty(gate_idx.sizes(), opt);
+    fmoe_cuda_prune_gate_by_capacity_impl(
+            gate_idx.data_ptr<long>(),
+            new_gate_idx.data_ptr<long>(),
+            expert_count.data_ptr<int>(),
+            batch_size, n_expert, n_worker, smgr);
+    return new_gate_idx;
+}
+// Allocate device storage for `sz` elements of T and return the raw device
+// pointer. The caller owns the memory and must release it with cudaFree.
+// Raises (via TORCH_CHECK) if the allocation fails instead of handing back
+// an indeterminate pointer.
+template<class T>
+T* _cudamalloc(size_t sz) {
+    T* dptr = nullptr;
+    cudaError_t err = cudaMalloc(&dptr, sz * sizeof(T));
+    TORCH_CHECK(err == cudaSuccess,
+            "cudaMalloc of ", sz * sizeof(T), " bytes failed: ",
+            cudaGetErrorString(err));
+    return dptr;
+}
+// Blocking copy of `sz` elements of T from host memory `hptr` into the
+// existing device buffer `dptr`. Returns `dptr` for call chaining.
+// Raises (via TORCH_CHECK) if the copy reports an error.
+template<class T>
+T* _h2d(const T* hptr, T* dptr, size_t sz) {
+    cudaError_t err = cudaMemcpy(dptr, hptr, sz * sizeof(T),
+            cudaMemcpyHostToDevice);
+    TORCH_CHECK(err == cudaSuccess,
+            "host-to-device cudaMemcpy failed: ", cudaGetErrorString(err));
+    return dptr;
+}
+// Convenience overload: allocates a new device buffer of `sz` elements,
+// fills it from host memory `hptr`, and returns the device pointer.
+// The caller is responsible for freeing the returned buffer.
+template<class T>
+T* _h2d(T* hptr, size_t sz) {
+    T* fresh = _cudamalloc<T>(sz);
+    _h2d(hptr, fresh, sz);
+    return fresh;
+}
+// Blocking copy of `sz` elements of T from device memory `dptr` into the
+// existing host buffer `hptr`. Returns `hptr` for call chaining.
+// Raises (via TORCH_CHECK) if the copy reports an error.
+template<class T>
+T* _d2h(const T* dptr, T* hptr, size_t sz) {
+    cudaError_t err = cudaMemcpy(hptr, dptr, sz * sizeof(T),
+            cudaMemcpyDeviceToHost);
+    TORCH_CHECK(err == cudaSuccess,
+            "device-to-host cudaMemcpy failed: ", cudaGetErrorString(err));
+    return hptr;
+}
+// Convenience overload: allocates a host array of `sz` elements with new[],
+// fills it from device memory `dptr`, and returns it.
+// The caller must release the result with delete[].
+template<class T>
+T* _d2h(const T* dptr, size_t sz) {
+    T* host_copy = new T[sz];
+    _d2h(dptr, host_copy, sz);
+    return host_copy;
+}
+#ifdef FMOE_USE_NCCL
+#include <nccl.h>
+/*
+ * One round of the SWIPE balance strategy.
+ *
+ * gate_idx  : 64-bit gate indices; element i / n_expert is used as the target
+ *             worker. It is copied to the host and rewritten there.
+ * capacity  : scalar tensor holding this worker's remaining capacity.
+ * n_expert  : divisor mapping a gate index to a worker index.
+ * n_worker  : number of workers taking part in the exchange.
+ * bias      : expert offset used when a sample is re-assigned
+ *             (new index = worker * n_expert + bias).
+ *
+ * Returns {gate indices on the CPU, updated clone of capacity}.
+ *
+ * Samples that cannot be placed anywhere because the spare capacity of all
+ * workers is exhausted are left marked as -1 rather than indexing past the
+ * end of the per-worker arrays.
+ */
+std::vector<torch::Tensor> _swipe_once(
+        torch::Tensor gate_idx, torch::Tensor capacity,
+        long n_expert, long n_worker, long bias) {
+    auto device_idx = gate_idx.device().index();
+    auto smgr = getCudaStreamManager(device_idx);
+    int rank;
+    NCCL_SAFE_CALL(ncclCommUserRank(smgr->ncclcomm, &rank));
+    cudaSetDevice(device_idx);
+
+    auto capacity_new = capacity.clone();
+    long cap = capacity_new.item<long>();
+
+    long batch_size = gate_idx.size(0);
+    auto gate_idx_cpu = gate_idx.cpu();
+    long* gidx = gate_idx_cpu.data_ptr<long>();
+
+    /* Local count and exchange */
+    std::vector<long> lec(n_worker, 0);
+    for (long i = 0; i < batch_size; ++i) {
+        ++lec[gidx[i] / n_expert];
+    }
+    long* d_lec = _h2d(lec.data(), n_worker);
+    long* d_gec = _cudamalloc<long>(n_worker);
+    fmoe_cuda_expert_exchange_impl(d_lec, d_gec, 1, n_worker, smgr);
+    std::vector<long> gec(n_worker, 0);
+    _d2h(d_gec, gec.data(), n_worker);
+
+    /* Limit number of incoming samples */
+    std::vector<long> drop_count(n_worker, 0);
+    for (long i = 0; i < n_worker; ++i) {
+        if (cap >= gec[i]) {
+            cap -= gec[i];
+        } else {
+            drop_count[i] = gec[i] - cap;
+            gec[i] = cap;
+            cap = 0;
+        }
+    }
+
+    /* Send limit information back */
+    _h2d(gec.data(), d_gec, n_worker);
+    fmoe_cuda_expert_exchange_impl(d_gec, d_lec, 1, n_worker, smgr);
+    _d2h(d_lec, lec.data(), n_worker);
+
+    /* Sum the per-source drop counts over all workers */
+    long* d_dropcount = _h2d(drop_count.data(), n_worker);
+    NCCL_SAFE_CALL(ncclAllReduce(d_dropcount, d_dropcount, n_worker,
+            ncclInt64, ncclSum, smgr->ncclcomm, smgr->stream()));
+    _d2h(d_dropcount, drop_count.data(), n_worker);
+
+    /* Gather every worker's remaining capacity (in-place all-gather) */
+    long* d_gcap = _cudamalloc<long>(n_worker);
+    _h2d(&cap, d_gcap + rank, 1);
+    NCCL_SAFE_CALL(ncclAllGather(d_gcap + rank, d_gcap, 1, ncclInt64,
+            smgr->ncclcomm, smgr->stream()));
+    std::vector<long> gcap(n_worker, 0);
+    _d2h(d_gcap, gcap.data(), n_worker);
+
+    /* Book-keeping for moving `count` samples from worker src to worker dst */
+    auto update_counters = [&](long src, long dst, long count) {
+        if (src == rank) {
+            lec[dst] += count;
+        }
+        if (dst == rank) {
+            gec[src] += count;
+            cap -= count;
+        }
+    };
+
+    /* Re-assign and update counters */
+    for (long i = 0, j = 0; i < n_worker; ++i) {
+        /* j < n_worker: stop once every worker's spare capacity is used up */
+        while (drop_count[i] > 0 && j < n_worker) {
+            if (drop_count[i] > gcap[j]) {
+                drop_count[i] -= gcap[j];
+                update_counters(i, j, gcap[j]);
+                ++j;
+            } else {
+                gcap[j] -= drop_count[i];
+                update_counters(i, j, drop_count[i]);
+                break;
+            }
+        }
+    }
+
+    /* Mark samples exceeding the accepted count of their target worker */
+    for (long i = 0; i < batch_size; ++i) {
+        auto widx = gidx[i] / n_expert;
+        if (lec[widx] > 0) {
+            --lec[widx];
+        } else {
+            gidx[i] = -1;
+        }
+    }
+
+    /* Hand the marked samples to workers that still have slots left */
+    for (long i = 0, k = 0; i < batch_size; ++i) {
+        if (gidx[i] != -1) {
+            continue;
+        }
+        while (k < n_worker && lec[k] == 0) {
+            ++k;
+        }
+        if (k >= n_worker) {
+            /* nothing left to hand out; remaining samples stay at -1 */
+            break;
+        }
+        --lec[k];
+        gidx[i] = k * n_expert + bias;
+    }
+
+    /* fill_ works for host and device tensors alike */
+    capacity_new.fill_(static_cast<int64_t>(cap));
+
+    cudaFree(d_dropcount);
+    cudaFree(d_lec);
+    cudaFree(d_gec);
+    cudaFree(d_gcap);
+    return {gate_idx_cpu, capacity_new};
+}
+#endif
--- a/cuda/balancing.cuh
+++ b/cuda/balancing.cuh
+#include "stream_manager.h"
+#include "utils/fmoe_utils.h"
+#include <cuda.h>
+// Launch layout: blockIdx.y selects the expert, the x dimension enumerates
+// workers. Each thread reads the count requested at
+// ec[worker * n_expert + expert], atomically subtracts it from cap[expert],
+// and stores how much of the request fits into eca at the same offset:
+// the whole request, the capacity that was left, or 0.
+__global__
+void limit_by_capacity_kernel(const long* ec, int* cap, long* eca,
+        const long n_expert, const long n_worker) {
+    const int expert = blockIdx.y;
+    const int worker = blockIdx.x * blockDim.x + threadIdx.x;
+    if (!(worker < n_worker)) {
+        return;
+    }
+    const long slot = worker * n_expert + expert;
+    const int wanted = ec[slot];
+    // atomicSub yields the value stored before the subtraction
+    const int before = atomicSub(cap + expert, wanted);
+    int granted = 0;
+    if (before >= wanted) {
+        granted = wanted;
+    } else if (before >= 0) {
+        granted = before;
+    }
+    eca[slot] = granted;
+}
+// Launches limit_by_capacity_kernel on stream 0 of `smgr` with 1024 threads
+// per block: enough x-blocks to cover n_worker, one y-block per expert.
+// Calls smgr->sync(1) after the launch.
+void fmoe_cuda_limit_by_capacity_impl(const long* ec, int* cap,
+        long* eca, const long n_expert, const long n_worker,
+        CudaStreamManager* smgr) {
+    const dim3 threads(1024);
+    const dim3 blocks(CEIL(n_worker, 1024), n_expert);
+    limit_by_capacity_kernel<<<blocks, threads, 0, smgr->stream(0)>>>(
+            ec, cap, eca, n_expert, n_worker);
+    smgr->sync(1);
+}
+// One thread per gate entry. Each thread atomically takes one unit from
+// ec[gate_idx[i]]; if nothing was left before the decrement the entry is
+// written as -1, otherwise it is copied through unchanged.
+//
+// Negative gate indices (such as the -1 this kernel itself emits) are copied
+// through without touching `ec`, because using them as an offset would be an
+// out-of-bounds atomic. The thread index is computed in 64 bits so it cannot
+// wrap for large batch_size.
+__global__
+void prune_gate_by_capacity_kernel(const long* gate_idx, long* new_gate_idx,
+        int* ec,
+        const long batch_size, const long n_expert, const long n_worker) {
+    const long i = static_cast<long>(blockIdx.x) * blockDim.x + threadIdx.x;
+    if (i >= batch_size) {
+        return;
+    }
+    const long target = gate_idx[i];
+    if (target < 0) {
+        new_gate_idx[i] = target;
+        return;
+    }
+    // atomicSub yields the value stored before the subtraction
+    const int orig_cap = atomicSub(ec + target, 1);
+    if (orig_cap <= 0) {
+        new_gate_idx[i] = -1;
+    } else {
+        new_gate_idx[i] = target;
+    }
+}
+// Launches prune_gate_by_capacity_kernel on stream 0 of `smgr` using
+// 1024-thread blocks, with enough blocks to cover batch_size entries.
+// Calls smgr->sync(1) after the launch.
+void fmoe_cuda_prune_gate_by_capacity_impl(long* gate_idx, long* new_gate_idx,
+        int* ec,
+        const long batch_size, const long n_expert, const long n_worker,
+        CudaStreamManager* smgr) {
+    const dim3 threads(1024);
+    const dim3 blocks(CEIL(batch_size, 1024));
+    prune_gate_by_capacity_kernel<<<blocks, threads, 0, smgr->stream(0)>>>(
+            gate_idx, new_gate_idx, ec, batch_size, n_expert, n_worker);
+    smgr->sync(1);
+}
--- a/cuda/fmoe_cuda.cpp
+++ b/cuda/fmoe_cuda.cpp
+#include <iostream>
+#include <vector>
+#include <torch/extension.h>
+// global_exchange
+#ifdef FMOE_USE_NCCL
+#include <c10d/ProcessGroupNCCL.hpp>
+// Forward declarations of the NCCL-backed exchange routines that are bound
+// to Python in the module definition of this file.
+
+// Exchanges per-expert counts between workers.
+torch::Tensor _expert_exchange(
+        torch::Tensor local_expert_count, long n_expert, long n_workers);
+
+// Scatters input rows to the workers according to the expert counts.
+torch::Tensor _global_scatter(
+        torch::Tensor input_buf,
+        torch::Tensor local_expert_count, torch::Tensor global_expert_count,
+        long batch_size, long n_workers);
+
+// Gathers output rows back from the workers according to the expert counts.
+torch::Tensor _global_gather(
+        torch::Tensor output_buf,
+        torch::Tensor local_expert_count, torch::Tensor global_expert_count,
+        long batch_size, long n_workers);
+
+void _ensure_nccl(c10d::ProcessGroupNCCL& p, torch::Tensor t);
+#endif  // FMOE_USE_NCCL
+// local_exchange
+// Forward declarations of routines bound to Python below.
+void _assign_pos(
+        torch::Tensor cum_count,
+        torch::Tensor gate,
+        torch::Tensor pos);
+void _expert_count(
+        torch::Tensor gate_idx,
+        torch::Tensor expert_count);
+// parallel_linear
+torch::Tensor _linear_forward(
+        torch::Tensor input_buf,
+        torch::Tensor expert_count,
+        torch::Tensor weight,
+        at::optional<torch::Tensor> bias
+        );
+std::vector<torch::Tensor> _linear_backward(
+        torch::Tensor grad_output_buf,
+        torch::Tensor input_buf,
+        torch::Tensor expert_count,
+        torch::Tensor weight,
+        at::optional<torch::Tensor> bias
+        );
+// balancing
+// (last parameter is the worker count, matching the definition in
+// balancing.cu; it was previously misnamed `n_experts` here)
+torch::Tensor _limit_by_capacity(
+        torch::Tensor expert_count, torch::Tensor capacity,
+        long n_expert, long n_worker);
+torch::Tensor _prune_gate_by_capacity(
+        torch::Tensor gate_idx, torch::Tensor expert_count,
+        long n_expert, long n_worker);
+std::vector<torch::Tensor> _swipe_once(
+        torch::Tensor gate_idx, torch::Tensor capacity_tensor,
+        long n_expert, long n_worker, long bias);
+// Python bindings for the extension module.
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+    // Bindings that are always compiled in.
+    m.def("expert_count", &_expert_count,
+            "FastMoE count gate indices (CUDA)");
+    m.def("assign_pos", &_assign_pos,
+            "FastMoE assign pos by gate (CUDA)");
+    m.def("linear_forward", &_linear_forward,
+            "FastMoE forward (CUDA)");
+    m.def("linear_backward", &_linear_backward,
+            "FastMoE backward (CUDA)");
+    m.def("limit_by_capacity", &_limit_by_capacity,
+            "FastMoE limit experts by capacity(CUDA)");
+    m.def("prune_gate_by_capacity", &_prune_gate_by_capacity,
+            "FastMoE prune gate by capacity(CUDA)");
+#ifdef FMOE_USE_NCCL
+    // Bindings that exist only when the extension is built with NCCL.
+    m.def("expert_exchange", &_expert_exchange,
+            "FastMoE expert exchange (CUDA)");
+    m.def("global_scatter", &_global_scatter,
+            "FastMoE global scatter (CUDA)");
+    m.def("global_gather", &_global_gather,
+            "FastMoE global gather (CUDA)");
+    m.def("ensure_nccl", &_ensure_nccl,
+            "FastMoE ensure torch nccl comm");
+    m.def("swipe_once", &_swipe_once,
+            "SWIPE balance strategy(CUDA)");
+#endif
+}
--- a/cuda/fused_exchange.cu
+++ b/cuda/fused_exchange.cu
+#include "moe_cuda_kernel.h"
+#include <cstdio>
+#include <iostream>
+#include <vector>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <cublas_v2.h>
+#include <c10/cuda/CUDAGuard.h>
+#include "cuda_stream_manager.h"
+#include "cublas_wrapper.h"
+#ifdef FMOE_USE_NCCL
+#include <nccl.h>
+// Fused exchange + expert GEMM + exchange-back, one expert at a time.
+//
+// For each local expert i:
+//   1. send this worker's rows destined for expert i on every worker j and
+//      receive the rows other workers routed to local expert i
+//      (grouped NCCL send/recv on stream i),
+//   2. run one GEMM over all received rows with the weight slice
+//      weight + i * in_feat * out_feat,
+//   3. send the results back and receive this worker's own results.
+//
+// local_expert_count / global_expert_count are dereferenced on the host and
+// are indexed as [worker * num_expert + expert]. Row offsets are kept in
+// 64-bit integers and the offset table is an std::vector, so it is released
+// on every exit path and is never written when num_expert * world_size is 0.
+template<typename scalar_t>
+void moe_cuda_global_fused_forward_impl(
+        const scalar_t* input_buf,
+        const scalar_t* weight,
+        scalar_t* global_input_buf,
+        scalar_t* global_output_buf,
+        scalar_t* output_buf,
+        const long* local_expert_count, 
+        const long* global_expert_count, 
+        long in_feat, long out_feat, 
+        long num_expert, long world_size,
+        CudaStreamManager* smgr) {
+    long ptr = 0;       // first row of the current expert's GEMM input/output
+    long send_ptr = 0;  // next row of global_output_buf to send back
+    long recv_ptr = 0;  // next free row of global_input_buf
+
+    // Exclusive prefix sum of local_expert_count: starting row of each
+    // (worker, expert) segment inside input_buf / output_buf.
+    const long n_slots = num_expert * world_size;
+    std::vector<long> expert_ptr(n_slots > 0 ? n_slots : 0, 0);
+    for (long i = 1; i < n_slots; ++i) {
+        expert_ptr[i] = expert_ptr[i - 1] + local_expert_count[i - 1];
+    }
+
+    scalar_t alpha = 1, beta = 0; 
+
+    for (int i = 0; i < num_expert; ++i) {
+        int expert_count = 0;
+        NCCL_SAFE_CALL(ncclGroupStart());
+        for (int j = 0; j < world_size; ++j) {
+            int idx = i + j * num_expert;
+            if (local_expert_count[idx]) {
+                // payload is sent as raw bytes, hence ncclChar + sizeof
+                NCCL_SAFE_CALL(ncclSend(
+                        input_buf + expert_ptr[idx] * in_feat, 
+                        local_expert_count[idx] * in_feat * sizeof(scalar_t),
+                        ncclChar, 
+                        j,
+                        smgr->ncclcomm,
+                        smgr->stream(i)));
+            }
+            if (global_expert_count[idx]) {
+                NCCL_SAFE_CALL(ncclRecv(
+                        global_input_buf + recv_ptr * in_feat,
+                        global_expert_count[idx] * in_feat * sizeof(scalar_t),
+                        ncclChar,
+                        j,
+                        smgr->ncclcomm,
+                        smgr->stream(i)));
+                recv_ptr += global_expert_count[idx];
+                expert_count += global_expert_count[idx];
+            }
+        }
+        NCCL_SAFE_CALL(ncclGroupEnd());
+
+        checkCudaErrors(cublasXgemm(
+                smgr->handle(i),
+                CUBLAS_OP_T,
+                CUBLAS_OP_N,
+                out_feat, expert_count, in_feat,
+                &alpha,
+                weight + i * in_feat * out_feat, in_feat,
+                global_input_buf + ptr * in_feat, in_feat,
+                &beta,
+                global_output_buf + out_feat * ptr, out_feat
+                ));
+        ptr += expert_count;
+
+        NCCL_SAFE_CALL(ncclGroupStart());
+        for (int j = 0; j < world_size; ++j) {
+            int idx = i + j * num_expert;
+            if (global_expert_count[idx]) {
+                NCCL_SAFE_CALL(ncclSend(
+                        global_output_buf + send_ptr * out_feat,
+                        global_expert_count[idx] * out_feat * sizeof(scalar_t),
+                        ncclChar,
+                        j,
+                        smgr->ncclcomm,
+                        smgr->stream(i)));
+                send_ptr += global_expert_count[idx];
+            }
+            if (local_expert_count[idx]) {
+                NCCL_SAFE_CALL(ncclRecv(
+                        output_buf + expert_ptr[idx] * out_feat, 
+                        local_expert_count[idx] * out_feat * sizeof(scalar_t),
+                        ncclChar, 
+                        j,
+                        smgr->ncclcomm,
+                        smgr->stream(i)));
+            }
+        }
+        NCCL_SAFE_CALL(ncclGroupEnd());
+    }
+    smgr->sync(num_expert);
+}
+// Tensor-level wrapper around moe_cuda_global_fused_forward_impl.
+//
+// Feature sizes are taken from `weight` (dim 1 = out_feat, dim 2 = in_feat)
+// and the number of local experts from local_expert_count.size(0) / n_workers.
+// Allocates the exchange buffers with input_buf.new_empty and dispatches on
+// input_buf's floating-point type (including half).
+// Returns {output_buf, global_input_buf}.
+std::vector<torch::Tensor> moe_cuda_global_fused_forward(
+        torch::Tensor input_buf,
+        torch::Tensor weight,
+        torch::Tensor local_expert_count,
+        torch::Tensor global_expert_count,
+        long global_batch_size, long local_batch_size, long n_workers) {
+    const auto num_expert = local_expert_count.size(0) / n_workers;
+    const auto out_feat = weight.size(1);
+    const auto in_feat = weight.size(2);
+
+    auto smgr = getCudaStreamManager(input_buf.device().index());
+
+    // rows received from other workers, and the GEMM results for them
+    auto received_rows = input_buf.new_empty({global_batch_size, in_feat});
+    auto computed_rows = input_buf.new_empty({global_batch_size, out_feat});
+    // results for this worker's own rows
+    auto local_result = input_buf.new_empty({local_batch_size, out_feat});
+
+    AT_DISPATCH_FLOATING_TYPES_AND_HALF(input_buf.scalar_type(), 
+            "moe_cuda_global_fused_forward", ([&] {
+        moe_cuda_global_fused_forward_impl(
+            input_buf.data_ptr<scalar_t>(),
+            weight.data_ptr<scalar_t>(),
+            received_rows.data_ptr<scalar_t>(),
+            computed_rows.data_ptr<scalar_t>(),
+            local_result.data_ptr<scalar_t>(),
+            local_expert_count.data_ptr<long>(),
+            global_expert_count.data_ptr<long>(),
+            in_feat, out_feat, num_expert, n_workers,
+            smgr);
+    }));
+
+    return {local_result, received_rows};
+}
+#endif
--- a/cuda/global_exchange.cpp
+++ b/cuda/global_exchange.cpp
+#include "global_exchange.h"
+#include "utils/fmoe_utils.h"
+#include <torch/extension.h>
+#ifdef FMOE_USE_NCCL
+#include <nccl.h>
+/**
+ * Exchanges per-expert counts with every rank of the NCCL communicator.
+ *
+ * For each rank r in [0, world_size) this sends the n_expert values stored at
+ * local_expert_count + n_expert * r to r, and receives n_expert values from r
+ * into global_expert_count + n_expert * r. All transfers are issued inside one
+ * NCCL group on stream 0 of `smgr`; smgr->sync(1) is called before returning.
+ */
+void fmoe_cuda_expert_exchange_impl(
+        const long* local_expert_count,
+        long* global_expert_count,
+        int n_expert, int world_size,
+        CudaStreamManager* smgr) {
+    NCCL_SAFE_CALL(ncclGroupStart());
+    for (int peer = 0; peer < world_size; ++peer) {
+        // Slice of each buffer that belongs to this peer.
+        const long* outgoing = local_expert_count + n_expert * peer;
+        long* incoming = global_expert_count + n_expert * peer;
+        NCCL_SAFE_CALL(ncclSend(outgoing, n_expert, ncclInt64, peer,
+                smgr->ncclcomm, smgr->stream(0)));
+        NCCL_SAFE_CALL(ncclRecv(incoming, n_expert, ncclInt64, peer,
+                smgr->ncclcomm, smgr->stream(0)));
+    }
+    NCCL_SAFE_CALL(ncclGroupEnd());
+    smgr->sync(1);
+}
+/**
+ * Tensor wrapper around fmoe_cuda_expert_exchange_impl.
+ *
+ * Allocates a tensor shaped like `local_expert_count`, fills it through the
+ * exchange on the stream manager of that tensor's device, and returns it.
+ * Both tensors are accessed as `long`.
+ */
+torch::Tensor _expert_exchange(
+        torch::Tensor local_expert_count,
+        long n_expert, long n_workers) {
+    auto received_count = torch::empty_like(local_expert_count);
+    auto stream_mgr = getCudaStreamManager(
+            local_expert_count.device().index());
+    const long* send_counts = local_expert_count.data_ptr<long>();
+    long* recv_counts = received_count.data_ptr<long>();
+    fmoe_cuda_expert_exchange_impl(
+            send_counts, recv_counts, n_expert, n_workers, stream_mgr);
+    return received_count;
+}
+/**
+ * Tensor wrapper around fmoe_cuda_global_scatter_impl.
+ *
+ * Validates `input_buf` with CHECK_INPUT, allocates a (batch_size, in_feat)
+ * buffer with the dtype/device of `input_buf`, and lets the implementation
+ * fill it. in_feat is input_buf.size(1); the expert count per worker is
+ * local_expert_count.size(0) / n_workers. Returns the filled buffer.
+ */
+torch::Tensor _global_scatter(
+        torch::Tensor input_buf,
+        torch::Tensor local_expert_count,
+        torch::Tensor global_expert_count,
+        long batch_size, long n_workers) {
+    CHECK_INPUT(input_buf);
+    const auto experts_per_worker = local_expert_count.size(0) / n_workers;
+    const auto feat_dim = input_buf.size(1);
+    auto received = input_buf.new_empty({batch_size, feat_dim});
+    auto stream_mgr = getCudaStreamManager(input_buf.device().index());
+    AT_DISPATCH_FLOATING_TYPES_AND_HALF(input_buf.scalar_type(),
+            "fmoe_cuda_global_scatter", ([&] {
+        fmoe_cuda_global_scatter_impl<scalar_t>(
+            input_buf.data_ptr<scalar_t>(),
+            local_expert_count.data_ptr<long>(),
+            global_expert_count.data_ptr<long>(),
+            received.data_ptr<scalar_t>(),
+            feat_dim, experts_per_worker, n_workers,
+            stream_mgr);
+    }));
+    return received;
+}
+/**
+ * Tensor wrapper around fmoe_cuda_global_gather_impl.
+ *
+ * Validates `output_buf` with CHECK_INPUT, allocates a (batch_size, out_feat)
+ * buffer with the dtype/device of `output_buf`, and lets the implementation
+ * fill it. out_feat is output_buf.size(1); the expert count per worker is
+ * local_expert_count.size(0) / n_workers. Returns the filled buffer.
+ */
+torch::Tensor _global_gather(
+        torch::Tensor output_buf,
+        torch::Tensor local_expert_count,
+        torch::Tensor global_expert_count,
+        long batch_size, long n_workers) {
+    CHECK_INPUT(output_buf);
+    const auto experts_per_worker = local_expert_count.size(0) / n_workers;
+    const auto feat_dim = output_buf.size(1);
+    auto gathered = output_buf.new_empty({batch_size, feat_dim});
+    auto stream_mgr = getCudaStreamManager(output_buf.device().index());
+    AT_DISPATCH_FLOATING_TYPES_AND_HALF(output_buf.scalar_type(),
+            "fmoe_cuda_global_gather", ([&] {
+        fmoe_cuda_global_gather_impl<scalar_t>(
+            output_buf.data_ptr<scalar_t>(),
+            local_expert_count.data_ptr<long>(),
+            global_expert_count.data_ptr<long>(),
+            gathered.data_ptr<scalar_t>(),
+            feat_dim, experts_per_worker, n_workers,
+            stream_mgr);
+    }));
+    return gathered;
+}
+#include <c10d/ProcessGroupNCCL.hpp>
+/**
+ * Subclass used only to reach ProcessGroupNCCL's broadcastUniqueNCCLID so
+ * that a separate NCCL communicator can be created for the same set of ranks.
+ * _ensure_nccl casts an existing ProcessGroupNCCL to this type, so the class
+ * must not add data members or virtual functions.
+ */
+class HackNCCLGroup: public c10d::ProcessGroupNCCL {
+public:
+    /**
+     * Creates and returns a new communicator spanning getSize() ranks.
+     * Rank 0 generates the unique id, which is then distributed with
+     * broadcastUniqueNCCLID before ncclCommInitRank is called.
+     * `dev` is not read by this function.
+     */
+    ncclComm_t getcomm(at::Device dev) {
+        ncclUniqueId ncclID;
+        int rank = getRank();
+        if (rank == 0) {
+            // A failure here used to go unnoticed and broadcast a garbage id.
+            NCCL_SAFE_CALL(ncclGetUniqueId(&ncclID));
+        }
+        // The broadcast helper takes extra arguments from torch 1.8 onwards.
+#if defined(TORCH_VERSION_MAJOR) && (TORCH_VERSION_MAJOR > 1 || \
+        (TORCH_VERSION_MAJOR == 1 && TORCH_VERSION_MINOR >= 8))
+        broadcastUniqueNCCLID(&ncclID,
+                c10d::OpType::SEND,
+                "fastmoe_nccl_comm",
+                rank);
+#else
+        broadcastUniqueNCCLID(&ncclID);
+#endif
+        ncclComm_t comm;
+        NCCL_SAFE_CALL(ncclCommInitRank(&comm, getSize(), ncclID, rank));
+        return comm;
+    }
+};
+/**
+ * Makes sure the stream manager of `t`'s device owns an NCCL communicator.
+ *
+ * Does nothing when the manager is already flagged as initialised. Otherwise
+ * a communicator is created through HackNCCLGroup::getcomm on `p`; the flag
+ * is set on success and a message is written to std::cerr on failure.
+ */
+void _ensure_nccl(c10d::ProcessGroupNCCL& p, torch::Tensor t) {
+    auto mgr = getCudaStreamManager(t.device().index());
+    if (!mgr->ncclgood) {
+        // Reinterpret the group to gain access to its protected broadcast.
+        HackNCCLGroup* hack = reinterpret_cast<HackNCCLGroup*>(&p);
+        mgr->ncclcomm = hack->getcomm(t.device());
+        if (mgr->ncclcomm == 0) {
+            std::cerr << "Nccl initialization failed\n";
+        } else {
+            mgr->ncclgood = 1;
+        }
+    }
+}
+#endif  // FMOE_USE_NCCL
--- a/cuda/global_exchange.h
+++ b/cuda/global_exchange.h
+#include "stream_manager.h"
+#ifdef FMOE_USE_NCCL
+// Exchanges n_expert `long` counts with each of world_size ranks over NCCL:
+// slice r of local_expert_count is sent to rank r and slice r of
+// global_expert_count is received from rank r. Defined in global_exchange.cpp.
+void fmoe_cuda_expert_exchange_impl(
+        const long* local_expert_count,
+        long* global_expert_count,
+        int n_expert, int world_size,
+        CudaStreamManager* smgr);
+/**
+ * Scatters locally held rows to the ranks that own the target experts.
+ *
+ * Counts are indexed as [rank * n_expert + expert] and are dereferenced on
+ * the host in this function. For every expert i and rank j:
+ *   - local_expert_count[idx] rows are sent to j, read from local_input_buf
+ *     at the exclusive prefix sum of local_expert_count (in rows);
+ *   - global_expert_count[idx] rows are received from j and appended to
+ *     input_buf in (expert, rank) order.
+ * Rows are in_feat elements of scalar_t, transferred as raw bytes. One NCCL
+ * group is issued per expert on stream 0, followed by smgr->sync(1).
+ */
+template<typename scalar_t>
+void fmoe_cuda_global_scatter_impl(
+    const scalar_t* local_input_buf,
+    const long* local_expert_count,
+    const long* global_expert_count,
+    scalar_t* input_buf,
+    size_t in_feat, size_t n_expert, size_t world_size,
+    CudaStreamManager* smgr) {
+    // assert world_size > 1
+    const size_t n_slots = n_expert * world_size;
+    // Row offsets are kept in `long`, matching the count type, so that large
+    // batches do not overflow a 32-bit accumulator.
+    long recv_ptr = 0;
+    /* TODO: may save for backward */
+    long* expert_ptr = new long[n_slots];
+    // Exclusive prefix sum; writes nothing when n_slots == 0.
+    long running = 0;
+    for (size_t k = 0; k < n_slots; ++k) {
+        expert_ptr[k] = running;
+        running += local_expert_count[k];
+    }
+    for (size_t i = 0; i < n_expert; ++i) {
+        NCCL_SAFE_CALL(ncclGroupStart());
+        for (size_t j = 0; j < world_size; ++j) {
+            const size_t idx = i + j * n_expert;
+            if (local_expert_count[idx]) {
+                NCCL_SAFE_CALL(ncclSend(
+                        local_input_buf + expert_ptr[idx] * in_feat,
+                        local_expert_count[idx] * in_feat * sizeof(scalar_t),
+                        ncclChar,
+                        j,
+                        smgr->ncclcomm,
+                        smgr->stream(0)));
+            }
+            if (global_expert_count[idx]) {
+                NCCL_SAFE_CALL(ncclRecv(
+                        input_buf + recv_ptr * in_feat,
+                        global_expert_count[idx] * in_feat * sizeof(scalar_t),
+                        ncclChar,
+                        j,
+                        smgr->ncclcomm,
+                        smgr->stream(0)));
+                recv_ptr += global_expert_count[idx];
+            }
+        }
+        NCCL_SAFE_CALL(ncclGroupEnd());
+    }
+    delete [] expert_ptr;
+    smgr->sync(1);
+}
+/**
+ * Gathers expert outputs back to the ranks that originally held the rows;
+ * the inverse data movement of fmoe_cuda_global_scatter_impl.
+ *
+ * Counts are indexed as [rank * n_expert + expert] and are dereferenced on
+ * the host in this function. For every expert i and rank j:
+ *   - global_expert_count[idx] rows are sent to j, read sequentially from
+ *     output_buf in (expert, rank) order;
+ *   - local_expert_count[idx] rows are received from j into local_output_buf
+ *     at the exclusive prefix sum of local_expert_count (in rows).
+ * Rows are out_feat elements of scalar_t, transferred as raw bytes. One NCCL
+ * group is issued per expert on stream 0, followed by smgr->sync(1).
+ */
+template<typename scalar_t>
+void fmoe_cuda_global_gather_impl(
+    const scalar_t* output_buf,
+    const long* local_expert_count,
+    const long* global_expert_count,
+    scalar_t* local_output_buf,
+    size_t out_feat, size_t n_expert, size_t world_size,
+    CudaStreamManager* smgr) {
+    const size_t n_slots = n_expert * world_size;
+    long send_ptr = 0;
+    /* TODO: may save for backward */
+    long *expert_ptr = new long[n_slots];
+    // Exclusive prefix sum; writes nothing when n_slots == 0.
+    long running = 0;
+    for (size_t k = 0; k < n_slots; ++k) {
+        expert_ptr[k] = running;
+        running += local_expert_count[k];
+    }
+    for (size_t i = 0; i < n_expert; ++i) {
+        NCCL_SAFE_CALL(ncclGroupStart());
+        for (size_t j = 0; j < world_size; ++j) {
+            // size_t index: avoids narrowing the product to int.
+            const size_t idx = i + j * n_expert;
+            if (global_expert_count[idx]) {
+                NCCL_SAFE_CALL(ncclSend(
+                        output_buf + send_ptr * out_feat,
+                        global_expert_count[idx] * out_feat * sizeof(scalar_t),
+                        ncclChar,
+                        j,
+                        smgr->ncclcomm,
+                        smgr->stream(0)));
+                send_ptr += global_expert_count[idx];
+            }
+            if (local_expert_count[idx]) {
+                NCCL_SAFE_CALL(ncclRecv(
+                        local_output_buf + expert_ptr[idx] * out_feat,
+                        local_expert_count[idx] * out_feat * sizeof(scalar_t),
+                        ncclChar,
+                        j,
+                        smgr->ncclcomm,
+                        smgr->stream(0)));
+            }
+        }
+        NCCL_SAFE_CALL(ncclGroupEnd());
+    }
+    delete [] expert_ptr;
+    smgr->sync(1);
+}
+#endif  // FMOE_USE_NCCL
--- a/cuda/local_exchange.cu
+++ b/cuda/local_exchange.cu
+#include "local_exchange.cuh"
+#include "utils/fmoe_utils.h"
+#include <torch/extension.h>
+/**
+ * Tensor wrapper around fmoe_cuda_assign_pos_impl.
+ *
+ * `cum_count` is accessed as int, `gate` and `pos` as long. The batch size is
+ * the first dimension of `gate`; topk is its second dimension when `gate` is
+ * two-dimensional and 1 otherwise.
+ */
+void _assign_pos(
+    torch::Tensor cum_count,
+    torch::Tensor gate,
+    torch::Tensor pos) {
+    auto stream_mgr = getCudaStreamManager(cum_count.device().index());
+    auto dims = gate.sizes();
+    const size_t batch_size = dims[0];
+    const size_t topk = (dims.size() == 2) ? dims[1] : 1;
+    fmoe_cuda_assign_pos_impl(
+            cum_count.data_ptr<int>(),
+            gate.data_ptr<long>(),
+            pos.data_ptr<long>(),
+            batch_size, topk, stream_mgr);
+}
+/**
+ * Tensor wrapper around fmoe_cuda_expert_count_impl.
+ *
+ * `gate_idx` is accessed as long and `expert_count` as int; the element
+ * counts of the two tensors are passed as batch size and number of experts.
+ */
+void _expert_count(
+        torch::Tensor gate_idx,
+        torch::Tensor expert_count) {
+    auto stream_mgr = getCudaStreamManager(gate_idx.device().index());
+    const auto n_gate = gate_idx.numel();
+    const auto n_bins = expert_count.numel();
+    const long* gate_ptr = gate_idx.data_ptr<long>();
+    int* count_ptr = expert_count.data_ptr<int>();
+    fmoe_cuda_expert_count_impl(
+            gate_ptr, count_ptr, n_gate, n_bins, stream_mgr);
+}
--- a/cuda/local_exchange.cuh
+++ b/cuda/local_exchange.cuh
+#include "stream_manager.h"
+#include "utils/helper_cuda.h"
+#include "utils/fmoe_utils.h"
+// One thread per gate entry (1-D launch). An entry whose expert index is
+// greater than -1 atomically decrements that expert's counter in `cum_count`
+// and stores its own flat index at pos[previous counter value - 1].
+// `topk` is accepted but not read by the kernel.
+__global__
+void assign_pos_kernel(int* cum_count, const long* gate, long* pos,
+        size_t numel, size_t topk) {
+    const size_t entry = threadIdx.x + blockIdx.x * blockDim.x;
+    if (entry >= numel) {
+        return;
+    }
+    const long expert = gate[entry];
+    if (expert <= -1) {
+        // Entry is not routed to any expert.
+        return;
+    }
+    const int slot = atomicSub(cum_count + expert, 1);
+    pos[slot - 1] = (long)entry;
+}
+// Launches assign_pos_kernel over batch_size * topk gate entries on stream 0
+// of `smgr`, using 256 threads per block, then waits for that stream.
+// Nothing is launched for an empty input: a zero-block grid is an invalid
+// launch configuration and would leave a pending CUDA error behind.
+void fmoe_cuda_assign_pos_impl(
+        int* cum_count, const long* gate, long* pos,
+        const size_t batch_size, const size_t topk,
+        CudaStreamManager* smgr) {
+    size_t numel = batch_size * topk;
+    if (numel > 0) {
+        assign_pos_kernel
+            <<<CEIL(numel, 256), 256, 0, smgr->stream(0)>>>
+            (cum_count, gate, pos, numel, topk);
+        // Surface launch-configuration errors at the call site.
+        checkCudaErrors(cudaGetLastError());
+    }
+    smgr->sync(1);
+}
+// Number of consecutive experts handled by one thread block; also the size of
+// the per-thread counter array in expert_count_kernel.
+#define PERTHREAD_EXPERTS 256
+// Width used for the shuffle reduction below.
+#ifdef FMOE_USE_HIP
+#define WARP_SIZE 64
+#else
+#define WARP_SIZE 32
+#endif
+// Counts how many entries of gate_idx refer to each expert and adds the
+// counts to expert_count.
+//
+// Block b covers experts [b * PERTHREAD_EXPERTS,
+// min((b + 1) * PERTHREAD_EXPERTS, n_expert)). Every thread walks gate_idx
+// with a stride of blockDim.x and tallies matching entries in a private
+// array; the tallies are then summed with shuffles and the thread whose
+// index is a multiple of WARP_SIZE adds the sum to expert_count atomically.
+//
+// The kernel only adds to expert_count, so the buffer is presumably zeroed by
+// the caller beforehand (verify against callers).
+// NOTE(review): the shuffle uses a full mask, which looks like it assumes
+// blockDim.x is a multiple of WARP_SIZE (the launcher uses 256) - confirm.
+__global__
+void expert_count_kernel(const long* gate_idx, int* expert_count,
+        const size_t batch_size, const size_t n_expert) {
+    int res_tmp[PERTHREAD_EXPERTS] = {0};
+    long expert_min = blockIdx.x * PERTHREAD_EXPERTS;
+    long expert_max = expert_min + PERTHREAD_EXPERTS;
+    if (expert_max > n_expert) {
+        expert_max = n_expert;
+    }
+    // NOTE(review): `i` is an int compared against a size_t batch size.
+    for (int i = threadIdx.x; i < batch_size; i += blockDim.x) {
+        long idx = gate_idx[i];
+        // -1 marks an entry without an expert.
+        if (idx == -1) {
+            continue;
+        }
+        // Experts outside this block's range are counted by another block.
+        if (idx < expert_min || idx >= expert_max) {
+            continue;
+        }
+        res_tmp[idx - expert_min] += 1;
+    }
+    for (int i = expert_min; i < expert_max; ++i) {
+        int x = res_tmp[i - expert_min];
+        // Shuffle-down with doubling offsets: after the loop the first lane
+        // of each warp holds the sum over the whole warp.
+#pragma unroll
+        for (int j = 1; j < WARP_SIZE; j <<= 1) {
+#ifdef FMOE_USE_HIP
+            x = x + __shfl_down(x, j);
+#else
+            x = x + __shfl_down_sync(-1u, x, j);
+#endif
+        }
+        if (threadIdx.x % WARP_SIZE == 0) {
+            atomicAdd(expert_count + i, x);
+        }
+    }
+}
+// Launches expert_count_kernel on stream 0 of `smgr` with one block of 256
+// threads per PERTHREAD_EXPERTS experts, then waits for that stream.
+// Nothing is launched when n_expert is 0: a zero-block grid is an invalid
+// launch configuration and would leave a pending CUDA error behind.
+void fmoe_cuda_expert_count_impl(
+        const long* gate_idx, int* expert_count,
+        const size_t batch_size, const size_t n_expert,
+        CudaStreamManager* smgr) {
+    if (n_expert > 0) {
+        expert_count_kernel
+            <<<CEIL(n_expert, PERTHREAD_EXPERTS), 256, 0, smgr->stream(0)>>>
+            (gate_idx, expert_count, batch_size, n_expert);
+        // Surface launch-configuration errors at the call site.
+        checkCudaErrors(cudaGetLastError());
+    }
+    smgr->sync(1);
+}
--- a/cuda/parallel_linear.cu
+++ b/cuda/parallel_linear.cu
+#include "parallel_linear.cuh"
+#include "utils/fmoe_utils.h"
+#include <torch/extension.h>
+/**
+ * Tensor wrapper around fmoe_cuda_linear_forward_impl.
+ *
+ * `weight` is read as (num_expert, out_feat, in_feat). When a bias is given,
+ * the output starts as the bias rows repeated according to `expert_count`
+ * and the implementation is told a bias is present; otherwise an empty
+ * (batch_size, out_feat) tensor with the device and dtype of `input_buf` is
+ * allocated. `expert_count` is accessed as long. Returns the output tensor.
+ */
+torch::Tensor _linear_forward(
+        torch::Tensor input_buf,
+        torch::Tensor expert_count,
+        torch::Tensor weight,
+        at::optional<torch::Tensor> bias
+        ) {
+    auto stream_mgr = getCudaStreamManager(input_buf.device().index());
+    const auto n_rows = input_buf.size(0);
+    const auto n_experts = weight.size(0);
+    const auto d_out = weight.size(1);
+    const auto d_in = weight.size(2);
+#ifdef MOE_DEBUG
+    printf("[forward] expert=%ld, in_feat (d_model)=%ld, out_feat (d_ffn)=%ld\n",
+            n_experts, d_in, d_out);
+#endif
+    const bool with_bias = bias.has_value();
+    torch::Tensor result;
+    if (!with_bias) {
+        auto opts = torch::TensorOptions()
+            .device(input_buf.device())
+            .dtype(input_buf.dtype());
+        result = torch::empty({n_rows, d_out}, opts);
+    } else {
+        // Seed the output with each expert's bias row, repeated per token.
+        result = bias.value().repeat_interleave(
+                expert_count.to(bias.value().device()), 0);
+    }
+    AT_DISPATCH_FLOATING_TYPES_AND_HALF(input_buf.scalar_type(), "moe_forward_cuda",
+            ([&] {
+        fmoe_cuda_linear_forward_impl<scalar_t>(
+            input_buf.data_ptr<scalar_t>(),
+            weight.data_ptr<scalar_t>(),
+            expert_count.data_ptr<long>(),
+            result.data_ptr<scalar_t>(),
+            with_bias,
+            d_in,
+            d_out,
+            n_experts,
+            stream_mgr);
+    }));
+    return result;
+}
+/**
+ * Tensor wrapper around fmoe_cuda_linear_backward_impl.
+ *
+ * `weight` is read as (num_expert, out_feat, in_feat) and the batch size is
+ * input_buf.size(0). Allocates the three gradient tensors with the
+ * dtype/device of `grad_output_buf`, dispatches on the scalar type of
+ * `input_buf`, and returns {grad_input, grad_weight, grad_bias}.
+ * `expert_count` is accessed as long.
+ */
+std::vector<torch::Tensor> _linear_backward(
+    torch::Tensor grad_output_buf,
+    torch::Tensor input_buf,
+    torch::Tensor expert_count,
+    torch::Tensor weight,
+    at::optional<torch::Tensor> bias
+) {
+    auto stream_mgr = getCudaStreamManager(input_buf.device().index());
+    const auto n_rows = input_buf.size(0);
+    const auto n_experts = weight.size(0);
+    const auto d_out = weight.size(1);
+    const auto d_in = weight.size(2);
+#ifdef MOE_DEBUG
+    printf("[backward] b=%ld, expert=%ld, in_feat (d_model)=%ld, "
+            "out_feat (d_ffn)=%ld\n",
+            n_rows, n_experts, d_in, d_out);
+#endif
+    auto d_input = grad_output_buf.new_empty({n_rows, d_in});
+    auto d_weight = grad_output_buf.new_empty({n_experts, d_out, d_in});
+    auto d_bias = grad_output_buf.new_empty({n_experts, d_out});
+    const bool with_bias = bias.has_value();
+    AT_DISPATCH_FLOATING_TYPES_AND_HALF(input_buf.scalar_type(), "moe_cuda_backward", ([&] {
+        fmoe_cuda_linear_backward_impl<scalar_t>(
+            grad_output_buf.data_ptr<scalar_t>(),
+            input_buf.data_ptr<scalar_t>(),
+            weight.data_ptr<scalar_t>(),
+            expert_count.data_ptr<long>(),
+            d_input.data_ptr<scalar_t>(),
+            d_weight.data_ptr<scalar_t>(),
+            d_bias.data_ptr<scalar_t>(),
+            with_bias,
+            n_rows,
+            d_in,
+            d_out,
+            n_experts,
+            stream_mgr);
+    }));
+    return {d_input, d_weight, d_bias};
+}
--- a/cuda/parallel_linear.cuh
+++ b/cuda/parallel_linear.cuh
+#include "stream_manager.h"
+#include "utils/cublas_wrapper.h"
+/*
+    Column-wise sum of a row-major m x n matrix:
+        result[c] = sum over r of matrix[r * n + c].
+
+    Launch layout, as used by fmoe_cuda_linear_backward_impl: blocks of
+    32 x 32 threads, one block per group of 32 columns (blockIdx.x), and
+    1024 * sizeof(scalar_t) bytes of dynamic shared memory. The shared-memory
+    reduction below is hard-wired to that block shape.
+*/
+template <typename scalar_t>
+__global__ 
+void column_reduce(const scalar_t * matrix, scalar_t * result, 
+    int m /* lines */, int n /* columns*/) {
+    // https://stackoverflow.com/questions/27570552/templated-cuda-kernel-with-dynamic-shared-memory
+    extern __shared__ unsigned char my_smem[];
+    scalar_t *sdata = reinterpret_cast<scalar_t *>(my_smem);
+    // normal tid
+    int tid = threadIdx.x + threadIdx.y * blockDim.x;
+    // transposed tid for shared memory
+    int new_tid = threadIdx.y + threadIdx.x * blockDim.y;
+    // true x value in the matrix
+    int real_x = threadIdx.x + blockDim.x * blockIdx.x;
+    int i = real_x + n * threadIdx.y;
+    const int it = n*blockDim.y;
+    int offset = it;
+    // Partial sums are accumulated in float regardless of scalar_t.
+    float accumulator = 0;
+    if (threadIdx.y < m && real_x < n) {
+        // Each thread sums rows threadIdx.y, threadIdx.y + blockDim.y, ...
+        // of its column.
+        accumulator = matrix[i];
+        while (i + offset < n*m) {
+            accumulator += matrix[i + offset];
+            offset += it;
+        }
+    }
+    // Store transposed: the partial sums of one column become contiguous,
+    // occupying one 32-wide segment of sdata.
+    sdata[new_tid] = accumulator;
+    __syncthreads();
+    // Tree reduction inside every 32-wide segment. Only the lower `t` slots
+    // of a segment accumulate in a given step, so no slot is read by one
+    // thread while another thread writes it.
+    for (int t = 16; t > 0; t >>= 1) {
+        if ((tid & 31) < t)
+            sdata[tid] += sdata[tid + t];
+        __syncthreads();
+    }
+    // The first slot of a column's segment now holds the column total.
+    if (threadIdx.y == 0 && real_x < n) 
+        result[real_x] = sdata[new_tid];
+}
+template <typename scalar_t>
+void fmoe_cuda_linear_forward_impl(
+        const scalar_t* input_buf,
+        const scalar_t* weight,
+        const long* expert_count,
+        scalar_t* output_buf,
+        const bool has_bias,
+        const size_t in_feat,
+        const size_t out_feat,
+        const size_t num_expert,
+        CudaStreamManager* smgr) {
+    scalar_t alpha = 1, beta = has_bias ? 1 : 0; 
+    for (int i = 0, ptr = 0; i < num_expert; ++i) {
+        if (expert_count[i] == 0) {
+            continue;
+        }
+        // Use T(B) x T(A) = T(C) to produce row-major C
+        checkCudaErrors(cublasXgemm(
+                smgr->handle(i),
+                CUBLAS_OP_T,
+                CUBLAS_OP_N,
+                out_feat, expert_count[i], in_feat,
+                &alpha,
+                weight + i * in_feat * out_feat, in_feat,
+                input_buf + ptr * in_feat, in_feat,
+                &beta,
+                output_buf + out_feat * ptr, out_feat
+                ));
+        ptr += expert_count[i];
+    }
+    smgr->sync(num_expert);
+}
+template <typename scalar_t>
+void fmoe_cuda_linear_backward_impl(
+        const scalar_t* grad_output_buf,
+        const scalar_t* input_buf,
+        const scalar_t* weight,
+        const long* expert_count,
+        scalar_t* grad_input_buf,
+        scalar_t* grad_weight,
+        scalar_t* grad_bias,
+        const bool has_bias,
+        const size_t batch_size,
+        const size_t in_feat,
+        const size_t out_feat,
+        const size_t num_expert,
+        CudaStreamManager* smgr) {
+    scalar_t alpha = 1, beta = 0;
+    // bias
+    dim3 block_threads(32, 32);
+    dim3 grid_threads(out_feat / 32 + (out_feat % 32 ? 1 : 0), 1);
+    for (int i = 0, ptr = 0; i < num_expert; ++i) {
+        if (expert_count[i] == 0) {
+            cudaMemset(grad_weight + i * in_feat * out_feat, 0, 
+                    sizeof(scalar_t) * in_feat * out_feat);
+            cudaMemset(grad_bias + i * out_feat, 0, sizeof(scalar_t) * out_feat);
+            continue;
+        }
+        // Use T(B) x T(A) = T(C) to produce row-major C
+        // Backward input: g_i = w @ g_o
+        checkCudaErrors(cublasXgemm(
+                smgr->handle(i),
+                CUBLAS_OP_N,
+                CUBLAS_OP_N,
+                in_feat, expert_count[i], out_feat,
+                &alpha,
+                weight + i * in_feat * out_feat, in_feat,
+                grad_output_buf + ptr * out_feat, out_feat,
+                &beta,
+                grad_input_buf + in_feat * ptr, in_feat
+                ));
+        // Backward weight: g_w = i @ g_o
+        checkCudaErrors(cublasXgemm(
+                smgr->handle(i),
+                CUBLAS_OP_N,
+                CUBLAS_OP_T,
+                in_feat, out_feat, expert_count[i],
+                &alpha,
+                input_buf + in_feat * ptr, in_feat,
+                grad_output_buf + ptr * out_feat, out_feat,
+                &beta,
+                grad_weight + i * in_feat * out_feat, in_feat
+                ));
+        if (has_bias) {
+            column_reduce
+            <<<grid_threads, block_threads, sizeof(scalar_t)*1024, smgr->stream(i)>>>
+            (
+                grad_output_buf + ptr * out_feat,
+                grad_bias + i * out_feat,
+                expert_count[i],
+                out_feat
+            );
+        }
+        ptr += expert_count[i];
+    }
+    smgr->sync(num_expert);
+}
--- a/cuda/stream_manager.cpp
+++ b/cuda/stream_manager.cpp
+#include <unordered_map>
+#include <mutex>
+#include <cassert>
+#include <thread>
+#include <iostream>
+#include "stream_manager.h"
+#define SMGR_N_STREAMS 16
+// Returns the stream for logical index `idx`; indices wrap around the
+// SMGR_N_STREAMS streams owned by the manager.
+cudaStream_t CudaStreamManager::stream(size_t idx) {
+    const size_t slot = idx % SMGR_N_STREAMS;
+    return streams[slot];
+}
+// Returns the cuBLAS handle for logical index `idx`; indices wrap around the
+// SMGR_N_STREAMS handles owned by the manager.
+cublasHandle_t CudaStreamManager::handle(size_t idx) {
+    const size_t slot = idx % SMGR_N_STREAMS;
+    return handles[slot];
+}
+// Blocks until the first min(idx, SMGR_N_STREAMS) streams have finished all
+// queued work. Asynchronous failures are first reported by the synchronising
+// call, so its result is checked rather than discarded.
+void CudaStreamManager::sync(int idx) {
+    for (int i = 0; i < idx && i < SMGR_N_STREAMS; ++i) {
+        checkCudaErrors(cudaStreamSynchronize(streams[i]));
+    }
+}
+// Selects `device` and creates SMGR_N_STREAMS streams, each paired with a
+// cuBLAS handle bound to it. With NCCL enabled, the communicator is marked
+// as not yet initialised.
+void CudaStreamManager::setup(const int device) {
+#ifdef FMOE_USE_NCCL
+    this->ncclgood = 0;
+#endif
+    this->device = device;
+    checkCudaErrors(cudaSetDevice(device));
+    streams = new cudaStream_t[SMGR_N_STREAMS];
+    handles = new cublasHandle_t[SMGR_N_STREAMS];
+    for (size_t i = 0; i < SMGR_N_STREAMS; ++i) {
+        checkCudaErrors(cudaStreamCreate(streams + i));
+        checkCudaErrors(cublasCreate(handles + i));
+        // An unbound handle would silently run on the default stream, so a
+        // failed binding is treated as an error like the calls above.
+        checkCudaErrors(cublasSetStream(handles[i], streams[i]));
+    }
+}
+// Destroys every stream/handle pair created by setup() and releases the two
+// arrays that held them.
+void CudaStreamManager::destroy() {
+    size_t slot = 0;
+    while (slot < SMGR_N_STREAMS) {
+        checkCudaErrors(cudaStreamDestroy(streams[slot]));
+        checkCudaErrors(cublasDestroy(handles[slot]));
+        ++slot;
+    }
+    delete[] streams;
+    delete[] handles;
+}
+std::unordered_map<int, CudaStreamManager*> smgrs;
+std::mutex smgr_mtx;
+// Returns the stream manager for `device`, creating it on first use.
+// Managers are cached in `smgrs` and never freed by this function.
+//
+// The whole lookup runs under `smgr_mtx`: std::unordered_map must not be
+// searched while another thread may be inserting into it. The guard also
+// releases the mutex if constructing the manager throws.
+CudaStreamManager* getCudaStreamManager(const int device) {
+    std::lock_guard<std::mutex> guard(smgr_mtx);
+    auto it = smgrs.find(device);
+    if (it != smgrs.end()) {
+        return it->second;
+    }
+    auto smgr = new CudaStreamManager(device);
+    smgrs.insert(std::pair<int, CudaStreamManager*>(device, smgr));
+    return smgr;
+}
--- a/cuda/stream_manager.h
+++ b/cuda/stream_manager.h
+#ifndef CUDA_STREAM_MANAGER_H
+#define CUDA_STREAM_MANAGER_H
+#include "utils/helper_cuda.h"
+#ifdef FMOE_USE_NCCL
+#include <nccl.h>
+// Evaluates an NCCL call once; if the result is not ncclSuccess, prints the
+// file, line and numeric result to stderr and terminates with exit(-1).
+// Expands to a braced block rather than a single statement.
+#define NCCL_SAFE_CALL(__fn__) { \
+    auto __res__ = __fn__; \
+    if (__res__ != ncclSuccess) { \
+        fprintf(stderr, "NCCL Error at %s:%d value %d\n", __FILE__, __LINE__, __res__); \
+        exit(-1); \
+    } \
+}
+#endif
+// Owns a pool of CUDA streams and matching cuBLAS handles for one device,
+// plus (with NCCL enabled) the communicator used by the exchange routines.
+// Instances are obtained through getCudaStreamManager().
+class CudaStreamManager {
+public:
+    // Device ordinal passed to the constructor.
+    int device;
+    // Arrays allocated by setup() and released by destroy().
+    cublasHandle_t* handles;
+    cudaStream_t* streams;
+#ifdef FMOE_USE_NCCL
+    // Non-zero once `ncclcomm` holds a usable communicator; reset by setup().
+    char ncclgood;
+    ncclComm_t ncclcomm;
+#endif
+public:
+    // Creates the streams and handles on `device_` immediately.
+    CudaStreamManager(int device_): device(device_) {
+        this->setup(device);
+    }
+    // Selects the device and creates the stream/handle pool.
+    void setup(int);
+    // Synchronises the first `n` streams of the pool (none by default).
+    void sync(int=0);
+    // Destroys the stream/handle pool.
+    void destroy();
+    // Stream / handle for a logical index (index 0 by default).
+    cudaStream_t stream(size_t=0);
+    cublasHandle_t handle(size_t=0);
+    ~CudaStreamManager() {
+        this->destroy();
+    }
+}; 
+CudaStreamManager* getCudaStreamManager(const int device);
+#endif  // CUDA_STREAM_MANAGER 
--- a/cuda/tests/.gitignore
+++ b/cuda/tests/.gitignore
+test_*
--- a/cuda/tests/Makefile
+++ b/cuda/tests/Makefile
+# Build every standalone test binary.
+default : test_prune_gate test_limit test_assign test_counting
+# test_<name> is built from <name>.cu together with the stream manager
+# implementation and linked against cuBLAS.
+test_% : %.cu
+	nvcc $< ../stream_manager.cpp -lcublas -o $@
--- a/cuda/tests/assign.cu
+++ b/cuda/tests/assign.cu
+#include "../local_exchange.cuh"
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <cuda.h>
+#include <cuda_runtime.h>
+// Manual test driver for fmoe_cuda_assign_pos_impl.
+// Usage: test_assign <n_worker> <n_expert> <batch_size> <topk>
+// Generates random gate indices (about one in ten is -1), builds the
+// inclusive prefix sum of the per-expert counts, runs the kernel and prints
+// the gate indices and the resulting positions.
+int main(int argc, char* args[]) {
+    if (argc < 5) {
+        fprintf(stderr,
+                "Usage: %s <n_worker> <n_expert> <batch_size> <topk>\n",
+                args[0]);
+        return 1;
+    }
+    int n_worker = atoi(args[1]);
+    int n_expert = atoi(args[2]);
+    int batch_size = atoi(args[3]);
+    int topk = atoi(args[4]);
+    if (n_worker <= 0 || n_expert <= 0 || batch_size <= 0 || topk <= 0) {
+        fprintf(stderr, "All arguments must be positive integers\n");
+        return 1;
+    }
+    int tot_expert = n_worker * n_expert;
+    int n_gate = batch_size * topk;
+    long* gate_idx = new long[n_gate];
+    int* lec = new int[tot_expert];
+    memset(lec, 0, sizeof(int) * tot_expert);
+    for (int i = 0; i < n_gate; ++i) {
+        if (rand() % 10) {
+            gate_idx[i] = rand() % tot_expert;
+            ++lec[gate_idx[i]];
+        } else {
+            gate_idx[i] = -1;
+        }
+    }
+    // Inclusive prefix sum: lec[e] is one past the last slot of expert e.
+    for (int i = 1; i < tot_expert; ++i) {
+        lec[i] += lec[i - 1];
+    }
+    puts("gate idx");
+    for (int i = 0; i < n_gate; ++i) {
+        printf("%ld ", gate_idx[i]);
+    }
+    putchar(10);
+    int nlec = lec[tot_expert - 1];
+    int* g_lec;
+    checkCudaErrors(cudaMalloc(&g_lec, sizeof(int) * tot_expert));
+    checkCudaErrors(cudaMemcpy(g_lec, lec, sizeof(int) * tot_expert,
+            cudaMemcpyHostToDevice));
+    long* g_gate_idx;
+    checkCudaErrors(cudaMalloc(&g_gate_idx, sizeof(long) * n_gate));
+    checkCudaErrors(cudaMemcpy(g_gate_idx, gate_idx, sizeof(long) * n_gate,
+            cudaMemcpyHostToDevice));
+    long* g_pos;
+    checkCudaErrors(cudaMalloc(&g_pos, sizeof(long) * nlec));
+    auto smgr = getCudaStreamManager(0);
+    // The launcher multiplies batch_size by topk itself; passing the product
+    // here would make the kernel read past the end of g_gate_idx.
+    fmoe_cuda_assign_pos_impl(g_lec, g_gate_idx, g_pos, batch_size,
+            topk, smgr);
+    long* pos = new long[nlec];
+    checkCudaErrors(cudaMemcpy(pos, g_pos, sizeof(long) * nlec,
+            cudaMemcpyDeviceToHost));
+    puts("pos");
+    for (int i = 0; i < nlec; ++i) {
+        printf("%ld ", pos[i]);
+    }
+    putchar(10);
+    checkCudaErrors(cudaFree(g_pos));
+    checkCudaErrors(cudaFree(g_gate_idx));
+    checkCudaErrors(cudaFree(g_lec));
+    delete [] pos;
+    delete [] lec;
+    delete [] gate_idx;
+    return 0;
+}
--- a/cuda/tests/counting.cu
+++ b/cuda/tests/counting.cu
+#include "../local_exchange.cuh"
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <cuda.h>
+#include <cuda_runtime.h>
+// Manual test driver for fmoe_cuda_expert_count_impl.
+// Usage: test_counting <batch_size> <n_expert>
+// Generates random gate indices in [-1, n_expert), prints the reference
+// counts computed on the host, then the counts produced by the kernel.
+int main(int argc, char* args[]) {
+    if (argc < 3) {
+        fprintf(stderr, "Usage: %s <batch_size> <n_expert>\n", args[0]);
+        return 1;
+    }
+    int batch_size = atoi(args[1]);
+    int n_expert = atoi(args[2]);
+    if (batch_size <= 0 || n_expert <= 0) {
+        fprintf(stderr, "All arguments must be positive integers\n");
+        return 1;
+    }
+    long* gate_idx = new long[batch_size];
+    int* ref_lec = new int[n_expert];
+    memset(ref_lec, 0, sizeof(int) * n_expert);
+    for (int i = 0; i < batch_size; ++i) {
+        // -1 marks an entry that is not assigned to any expert.
+        gate_idx[i] = rand() % (n_expert + 1) - 1;
+        if (gate_idx[i] != -1) {
+            ref_lec[gate_idx[i]] += 1;
+        }
+    }
+    puts("ref lec");
+    for (int i = 0; i < n_expert; ++i) {
+        printf("%d ", ref_lec[i]);
+    }
+    putchar(10);
+    int* g_lec;
+    checkCudaErrors(cudaMalloc(&g_lec, sizeof(int) * n_expert));
+    // The kernel only adds to the counters, so they start at zero.
+    checkCudaErrors(cudaMemset(g_lec, 0, sizeof(int) * n_expert));
+    long* g_gate_idx;
+    checkCudaErrors(cudaMalloc(&g_gate_idx, sizeof(long) * batch_size));
+    checkCudaErrors(cudaMemcpy(g_gate_idx, gate_idx,
+            sizeof(long) * batch_size, cudaMemcpyHostToDevice));
+    auto smgr = getCudaStreamManager(0);
+    fmoe_cuda_expert_count_impl(g_gate_idx, g_lec, batch_size, n_expert, smgr);
+    int* lec = new int[n_expert];
+    checkCudaErrors(cudaMemcpy(lec, g_lec, sizeof(int) * n_expert,
+            cudaMemcpyDeviceToHost));
+    puts("lec");
+    for (int i = 0; i < n_expert; ++i) {
+        printf("%d ", lec[i]);
+    }
+    putchar(10);
+    checkCudaErrors(cudaFree(g_gate_idx));
+    checkCudaErrors(cudaFree(g_lec));
+    delete [] lec;
+    delete [] ref_lec;
+    delete [] gate_idx;
+    return 0;
+}