"examples/vscode:/vscode.git/clone" did not exist on "4eb9ad0d1c7cc86a4c533b0da261e9bf57128166"
Unverified Commit acf8bec8 authored by Rick Ho's avatar Rick Ho Committed by GitHub
Browse files

Merge pull request #86 from laekov/v0.3.0-rc

v0.3.0 Release
parents 3397bc19 a461be6c
......@@ -93,18 +93,18 @@ inline cublasStatus_t cublasXgemm(cublasHandle_t handle,
const c10::Half *beta,
c10::Half *C, int ldc) {
#ifdef FMOE_USE_HIP
return rocblas_hgemm(handle, transa, transb, m, n, k,
(const rocblas_half*)alpha,
(const rocblas_half*)A, lda,
(const rocblas_half*)B, ldb,
(const rocblas_half*)beta,
return rocblas_hgemm(handle, transa, transb, m, n, k,
(const rocblas_half*)alpha,
(const rocblas_half*)A, lda,
(const rocblas_half*)B, ldb,
(const rocblas_half*)beta,
(rocblas_half*)C, ldc);
#else
return cublasHgemm(handle, transa, transb, m, n, k,
(const __half*)alpha,
(const __half*)A, lda,
(const __half*)B, ldb,
(const __half*)beta,
return cublasHgemm(handle, transa, transb, m, n, k,
(const __half*)alpha,
(const __half*)A, lda,
(const __half*)B, ldb,
(const __half*)beta,
(__half*)C, ldc);
#endif
}
......
......@@ -54,28 +54,28 @@ static const char *_cudaGetErrorEnum(CUresult error) {
#ifdef FMOE_USE_HIP
static const char *_cudaGetErrorEnum(cublasStatus_t error) {
switch (error) {
case rocblas_status_success:
return "rocblas_status_success";
case rocblas_status_invalid_handle:
return "rocblas_status_invalid_handle";
case rocblas_status_not_implemented:
return "rocblas_status_not_implemented";
case rocblas_status_invalid_pointer:
return "rocblas_status_invalid_pointer:";
case rocblas_status_invalid_size:
return "rocblas_status_invalid_size";
case rocblas_status_memory_error:
return "rocblas_status_memory_error";
case rocblas_status_internal_error:
return "rocblas_status_internal_error";
case rocblas_status_perf_degraded:
return "rocblas_status_perf_degraded";
......@@ -84,13 +84,13 @@ static const char *_cudaGetErrorEnum(cublasStatus_t error) {
case rocblas_status_size_increased:
return "rocblas_status_size_increased";
case rocblas_status_size_unchanged:
return "rocblas_status_size_unchanged";
case rocblas_status_invalid_value:
return "rocblas_status_invalid_value";
case rocblas_status_continue:
return "rocblas_status_continue";
}
......@@ -627,3 +627,4 @@ void check(T result, char const *const func, const char *const file,
#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
#endif // HELPER_CUDA_H
## v0.3.0
### FMoE core
* Previous `mp_group` is renamed to `slice_group`, indicating that all workers in the group receive the same input batch, and process a slice of the input. `mp_group` will be deprecated in our next release.
* ROCm supported.
* `FMoELinear` is moved to a stand-alone file.
### Grouped data parallel
* Support any group name by their relative tag name.
### Load balancing
* A brand new balancing strategy - SWIPE. Contributed by authors of a (currently unpublished) paper.
* A property `has_loss` is added to each gate, in order to identify whether balance loss should be collected.
### Megatron-LM support
* Experts are partitioned by tensor model parallelism in `mp_group`, instead of expert parallelism.
* Support arbitrary customized gate in `MegatronMLP`.
* Move the patches to a stand-alone file.
### Tests
* Move util functions into `test_ddp.py`.
## v0.2.1
### Load balancing
* Fix gradient for balance loss.
## Misc
### Misc
* Typos.
* Update benchmark interface.
......@@ -12,7 +39,7 @@
* Enable `USE_NCCL` by default.
* Compatibility for PyTorch `<1.8.0` and `>=1.8.0`.
## Megatron adaption
### Megatron adaptation
* Patch for numerical correctness of gradient clipping.
* Support to pipeline parallelism.
......
......@@ -72,7 +72,7 @@ class FMoE(nn.Module):
group hold the same copy of input feature, and requires the same copy of
the output. For each worker, FMoE only computes the output of a certain
slice of the input batch, and will all-gather the outputs after
computation.
computation.
* `top_k` stands for the number of experts each token is going to.
* `gate` is a gate class which can be found in `fmoe.gates`.
* `expert` can be specified as a module class, it is used to generate
......
......@@ -59,9 +59,9 @@ def patch_model_provider(model_provider, gate=None):
def fmoefied_model_provider():
from .layers import fmoefy
args = get_args()
hhs = args.hidden_size * 4
hhs = args.hidden_size * 4
assert hhs % args.top_k == 0
hhs = hhs // args.top_k
hhs = hhs // args.top_k
assert hhs % args.tensor_model_parallel_size == 0
hhs = hhs // args.tensor_model_parallel_size
return fmoefy(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment