Commit 254cdbbc authored by Rick Ho

update release note

parent 3397bc19
@@ -93,18 +93,18 @@ inline cublasStatus_t cublasXgemm(cublasHandle_t handle,
                                  const c10::Half *beta,
                                  c10::Half *C, int ldc) {
#ifdef FMOE_USE_HIP
    return rocblas_hgemm(handle, transa, transb, m, n, k,
            (const rocblas_half*)alpha,
            (const rocblas_half*)A, lda,
            (const rocblas_half*)B, ldb,
            (const rocblas_half*)beta,
            (rocblas_half*)C, ldc);
#else
    return cublasHgemm(handle, transa, transb, m, n, k,
            (const __half*)alpha,
            (const __half*)A, lda,
            (const __half*)B, ldb,
            (const __half*)beta,
            (__half*)C, ldc);
#endif
}
...
@@ -54,28 +54,28 @@ static const char *_cudaGetErrorEnum(CUresult error) {
#ifdef FMOE_USE_HIP
static const char *_cudaGetErrorEnum(cublasStatus_t error) {
    switch (error) {
        case rocblas_status_success:
            return "rocblas_status_success";
        case rocblas_status_invalid_handle:
            return "rocblas_status_invalid_handle";
        case rocblas_status_not_implemented:
            return "rocblas_status_not_implemented";
        case rocblas_status_invalid_pointer:
            return "rocblas_status_invalid_pointer";
        case rocblas_status_invalid_size:
            return "rocblas_status_invalid_size";
        case rocblas_status_memory_error:
            return "rocblas_status_memory_error";
        case rocblas_status_internal_error:
            return "rocblas_status_internal_error";
        case rocblas_status_perf_degraded:
            return "rocblas_status_perf_degraded";
@@ -84,13 +84,13 @@ static const char *_cudaGetErrorEnum(cublasStatus_t error) {
        case rocblas_status_size_increased:
            return "rocblas_status_size_increased";
        case rocblas_status_size_unchanged:
            return "rocblas_status_size_unchanged";
        case rocblas_status_invalid_value:
            return "rocblas_status_invalid_value";
        case rocblas_status_continue:
            return "rocblas_status_continue";
    }
@@ -627,3 +627,4 @@ void check(T result, char const *const func, const char *const file,
#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
#endif  // HELPER_CUDA_H
## v0.3.0
### FMoE core
* The former `mp_group` is renamed to `slice_group`, indicating that all workers in the group receive the same input batch and each processes a slice of the input (see the sketch at the end of this section). `mp_group` will be deprecated in our next release.
* ROCm is now supported.
* `FMoELinear` is moved to a stand-alone file.
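A minimal sketch of the rename, assuming the v0.3.0 `FMoE` constructor accepts `slice_group` where `mp_group` used to be passed; the group ranks and model sizes are illustrative:

```python
# Illustrative sketch: passing the renamed `slice_group` argument.
# Ranks and sizes are made up; `mp_group` still works for now but
# will be deprecated in the next release.
import torch.distributed as dist
from fmoe import FMoE

# all four workers in this group see the same input batch and each
# processes one slice of it
slice_group = dist.new_group(ranks=[0, 1, 2, 3])

moe = FMoE(
    num_expert=4,                      # experts hosted per worker
    d_model=1024,                      # token hidden dimension
    world_size=dist.get_world_size(),  # total number of workers
    slice_group=slice_group,           # formerly `mp_group`
)
```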
### Grouped data parallel
* Support arbitrary group names via their relative tag names.
### Load balancing
* A brand-new balancing strategy, SWIPE, contributed by the authors of a (currently unpublished) paper.
* A property `has_loss` is added to each gate to indicate whether its balance loss should be collected (see the sketch after this list).
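A minimal sketch of how a gate might expose the new property, assuming `has_loss` is read by the framework to decide whether a balance loss should be collected; the gate class and its toy loss are illustrative, not one of FastMoE's shipped gates:

```python
import torch
import torch.nn as nn

class ToyBalancedGate(nn.Module):
    # the new property: tells the framework this gate produces
    # a balance loss that should be collected
    has_loss = True

    def __init__(self, d_model, tot_expert, top_k=2):
        super().__init__()
        self.proj = nn.Linear(d_model, tot_expert)
        self.top_k = top_k
        self.loss = None

    def forward(self, inp):
        score = self.proj(inp)
        val, idx = torch.topk(score, k=self.top_k, dim=-1)
        # toy balance loss: penalize uneven mean routing probability
        self.loss = score.softmax(dim=-1).mean(dim=0).var()
        return idx, val.softmax(dim=-1)
```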
### Megatron-LM support
* Experts are partitioned by tensor model parallelism in `mp_group`, instead of expert parallelism.
* Support arbitrary customized gates in `MegatronMLP` (see the sketch below).
* Move the patches to a stand-alone file.
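A minimal sketch of plugging a customized gate through the adapter, based on the `patch_model_provider(model_provider, gate=None)` signature visible in the hunk later in this diff; the module path and the gate subclass are assumptions:

```python
from fmoe.gates import NaiveGate
from fmoe.megatron import patch_model_provider  # module path assumed

class MyGate(NaiveGate):
    """A customized gate; any gate class can be passed through."""

def model_provider():
    ...  # build the Megatron-LM model as usual

# wrap the provider so the built model is fmoefied with MyGate
fmoefied_provider = patch_model_provider(model_provider, gate=MyGate)
```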
### Tests
* Move util functions into `test_ddp.py`.
## v0.2.1
### Load balancing
* Fix gradient for balance loss.
### Misc
* Typos.
* Update benchmark interface.
@@ -12,7 +39,7 @@
* Enable `USE_NCCL` by default.
* Compatibility for PyTorch `<1.8.0` and `>=1.8.0`.
### Megatron adaption
* Patch for numerical correctness of gradient clipping.
* Support for pipeline parallelism.
...
@@ -72,7 +72,7 @@ class FMoE(nn.Module):
    group hold the same copy of input feature, and requires the same copy of
    the output. For each worker, FMoE only computes the output of a certain
    slice of the input batch, and will all-gather the outputs after
    computation.
    * `top_k` stands for the number of experts each token is going to.
    * `gate` is a gate class which can be found in `fmoe.gates`.
    * `expert` can be specified as a module class; it is used to generate
...
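A minimal usage sketch matching the `FMoE` docstring above; the expert's forward signature (in particular the count argument) is an assumption and may differ across versions:

```python
import torch
import torch.nn as nn
from fmoe import FMoE
from fmoe.gates import NaiveGate

class MyExpert(nn.Module):
    """`expert` is passed as a class; FMoE instantiates it."""
    def __init__(self, d_model):
        super().__init__()
        self.fc1 = nn.Linear(d_model, 4 * d_model)
        self.fc2 = nn.Linear(4 * d_model, d_model)

    def forward(self, x, fwd_expert_count=None):  # count arg assumed
        return self.fc2(torch.relu(self.fc1(x)))

moe = FMoE(num_expert=8, d_model=512, top_k=2,
           gate=NaiveGate, expert=MyExpert)
out = moe(torch.randn(16, 512))  # each token is routed to 2 experts
```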
@@ -59,9 +59,9 @@ def patch_model_provider(model_provider, gate=None):
    def fmoefied_model_provider():
        from .layers import fmoefy
        args = get_args()
        hhs = args.hidden_size * 4
        assert hhs % args.top_k == 0
        hhs = hhs // args.top_k
        assert hhs % args.tensor_model_parallel_size == 0
        hhs = hhs // args.tensor_model_parallel_size
        return fmoefy(
...
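The division in `fmoefied_model_provider` above derives each expert's per-partition hidden size: the 4x FFN hidden size is split first across the top-k experts, then across tensor-model-parallel ranks. A worked example with illustrative Megatron arguments:

```python
# illustrative values for the computation above
hidden_size = 1024
top_k = 2
tensor_model_parallel_size = 4

hhs = hidden_size * 4                   # 4096: FFN hidden size
assert hhs % top_k == 0
hhs //= top_k                           # 2048: split across top-k experts
assert hhs % tensor_model_parallel_size == 0
hhs //= tensor_model_parallel_size      # 512: split across TP ranks
print(hhs)                              # 512
```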