Commit 254cdbbc authored by Rick Ho

update release note

parent 3397bc19
@@ -93,18 +93,18 @@ inline cublasStatus_t cublasXgemm(cublasHandle_t handle,
                                  const c10::Half *beta,
                                  c10::Half *C, int ldc) {
#ifdef FMOE_USE_HIP
    return rocblas_hgemm(handle, transa, transb, m, n, k,
            (const rocblas_half*)alpha,
            (const rocblas_half*)A, lda,
            (const rocblas_half*)B, ldb,
            (const rocblas_half*)beta,
            (rocblas_half*)C, ldc);
#else
    return cublasHgemm(handle, transa, transb, m, n, k,
            (const __half*)alpha,
            (const __half*)A, lda,
            (const __half*)B, ldb,
            (const __half*)beta,
            (__half*)C, ldc);
#endif
}
...
@@ -54,28 +54,28 @@ static const char *_cudaGetErrorEnum(CUresult error) {
#ifdef FMOE_USE_HIP
static const char *_cudaGetErrorEnum(cublasStatus_t error) {
    switch (error) {
        case rocblas_status_success:
            return "rocblas_status_success";
        case rocblas_status_invalid_handle:
            return "rocblas_status_invalid_handle";
        case rocblas_status_not_implemented:
            return "rocblas_status_not_implemented";
        case rocblas_status_invalid_pointer:
            return "rocblas_status_invalid_pointer";
        case rocblas_status_invalid_size:
            return "rocblas_status_invalid_size";
        case rocblas_status_memory_error:
            return "rocblas_status_memory_error";
        case rocblas_status_internal_error:
            return "rocblas_status_internal_error";
        case rocblas_status_perf_degraded:
            return "rocblas_status_perf_degraded";
@@ -84,13 +84,13 @@ static const char *_cudaGetErrorEnum(cublasStatus_t error) {
        case rocblas_status_size_increased:
            return "rocblas_status_size_increased";
        case rocblas_status_size_unchanged:
            return "rocblas_status_size_unchanged";
        case rocblas_status_invalid_value:
            return "rocblas_status_invalid_value";
        case rocblas_status_continue:
            return "rocblas_status_continue";
    }
@@ -627,3 +627,4 @@ void check(T result, char const *const func, const char *const file,
#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
#endif  // HELPER_CUDA_H
## v0.3.0
### FMoE core
* The former `mp_group` is renamed to `slice_group`, indicating that all workers in the group receive the same input batch and each processes a slice of the input (see the sketch at the end of this section). `mp_group` will be deprecated in our next release.
* ROCm is now supported.
* `FMoELinear` is moved to a stand-alone file.
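A minimal sketch of the rename, assuming the v0.3.0 `FMoE` constructor accepts `slice_group` where `mp_group` used to be passed; the group ranks and model sizes are illustrative:

```python
# Illustrative sketch: passing the renamed `slice_group` argument.
# Ranks and sizes are made up; `mp_group` still works for now but
# will be deprecated in the next release.
import torch.distributed as dist
from fmoe import FMoE

# all four workers in this group see the same input batch and each
# processes one slice of it
slice_group = dist.new_group(ranks=[0, 1, 2, 3])

moe = FMoE(
    num_expert=4,                      # experts hosted per worker
    d_model=1024,                      # token hidden dimension
    world_size=dist.get_world_size(),  # total number of workers
    slice_group=slice_group,           # formerly `mp_group`
)
```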
### Grouped data parallel
* Support arbitrary group names via their relative tag names.
### Load balancing
* A brand-new balancing strategy, SWIPE, contributed by the authors of a (currently unpublished) paper.
* A property `has_loss` is added to each gate to indicate whether its balance loss should be collected (see the sketch after this list).
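A minimal sketch of how a gate might expose the new property, assuming `has_loss` is read by the framework to decide whether a balance loss should be collected; the gate class and its toy loss are illustrative, not one of FastMoE's shipped gates:

```python
import torch
import torch.nn as nn

class ToyBalancedGate(nn.Module):
    # the new property: tells the framework this gate produces
    # a balance loss that should be collected
    has_loss = True

    def __init__(self, d_model, tot_expert, top_k=2):
        super().__init__()
        self.proj = nn.Linear(d_model, tot_expert)
        self.top_k = top_k
        self.loss = None

    def forward(self, inp):
        score = self.proj(inp)
        val, idx = torch.topk(score, k=self.top_k, dim=-1)
        # toy balance loss: penalize uneven mean routing probability
        self.loss = score.softmax(dim=-1).mean(dim=0).var()
        return idx, val.softmax(dim=-1)
```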
### Megatron-LM support
* Experts are partitioned by tensor model parallelism in `mp_group`, instead of expert parallelism.
* Support arbitrary customized gates in `MegatronMLP` (see the sketch below).
* Move the patches to a stand-alone file.
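A minimal sketch of plugging a customized gate through the adapter, based on the `patch_model_provider(model_provider, gate=None)` signature visible in the hunk later in this diff; the module path and the gate subclass are assumptions:

```python
from fmoe.gates import NaiveGate
from fmoe.megatron import patch_model_provider  # module path assumed

class MyGate(NaiveGate):
    """A customized gate; any gate class can be passed through."""

def model_provider():
    ...  # build the Megatron-LM model as usual

# wrap the provider so the built model is fmoefied with MyGate
fmoefied_provider = patch_model_provider(model_provider, gate=MyGate)
```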
### Tests
* Move util functions into `test_ddp.py`.
## v0.2.1
### Load balancing
* Fix gradient for balance loss.
### Misc
* Typos.
* Update benchmark interface.
@@ -12,7 +39,7 @@
* Enable `USE_NCCL` by default.
* Compatibility for PyTorch `<1.8.0` and `>=1.8.0`.
### Megatron adaption
* Patch for numerical correctness of gradient clipping.
* Support for pipeline parallelism.
...
@@ -72,7 +72,7 @@ class FMoE(nn.Module):
    group hold the same copy of input feature, and requires the same copy of
    the output. For each worker, FMoE only computes the output of a certain
    slice of the input batch, and will all-gather the outputs after
    computation.
    * `top_k` stands for the number of experts each token is going to.
    * `gate` is a gate class which can be found in `fmoe.gates`.
    * `expert` can be specified as a module class; it is used to generate
...
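A minimal usage sketch matching the `FMoE` docstring above; the expert's forward signature (in particular the count argument) is an assumption and may differ across versions:

```python
import torch
import torch.nn as nn
from fmoe import FMoE
from fmoe.gates import NaiveGate

class MyExpert(nn.Module):
    """`expert` is passed as a class; FMoE instantiates it."""
    def __init__(self, d_model):
        super().__init__()
        self.fc1 = nn.Linear(d_model, 4 * d_model)
        self.fc2 = nn.Linear(4 * d_model, d_model)

    def forward(self, x, fwd_expert_count=None):  # count arg assumed
        return self.fc2(torch.relu(self.fc1(x)))

moe = FMoE(num_expert=8, d_model=512, top_k=2,
           gate=NaiveGate, expert=MyExpert)
out = moe(torch.randn(16, 512))  # each token is routed to 2 experts
```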
@@ -59,9 +59,9 @@ def patch_model_provider(model_provider, gate=None):
    def fmoefied_model_provider():
        from .layers import fmoefy
        args = get_args()
        hhs = args.hidden_size * 4
        assert hhs % args.top_k == 0
        hhs = hhs // args.top_k
        assert hhs % args.tensor_model_parallel_size == 0
        hhs = hhs // args.tensor_model_parallel_size
        return fmoefy(
...
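The division in `fmoefied_model_provider` above derives each expert's per-partition hidden size: the 4x FFN hidden size is split first across the top-k experts, then across tensor-model-parallel ranks. A worked example with illustrative Megatron arguments:

```python
# illustrative values for the computation above
hidden_size = 1024
top_k = 2
tensor_model_parallel_size = 4

hhs = hidden_size * 4                   # 4096: FFN hidden size
assert hhs % top_k == 0
hhs //= top_k                           # 2048: split across top-k experts
assert hhs % tensor_model_parallel_size == 0
hhs //= tensor_model_parallel_size      # 512: split across TP ranks
print(hhs)                              # 512
```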