"git@developer.sourcefind.cn:OpenDAS/TransformerEngine.git" did not exist on "10cceae9cc7b0bc453565a76f87602b1c824ea19"
Unverified Commit 8c0a0c93 authored by vasunvidia, committed by GitHub
Browse files

DGRAD_RS UB overlap Bug fixes (#1004)



* DGRAD_RS UB overlap Bug fixes
Signed-off-by: Vasudevan Rengasamy <vrengasamy@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci



---------
Signed-off-by: Vasudevan Rengasamy <vrengasamy@nvidia.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>
parent e39674b9
@@ -1890,11 +1890,18 @@ template void reducescatter2_userbuff_strided_atomic_fp8<__nv_fp8_e4m3>(
     void *output, float *scale, const int handler, const int offset, const int rowelements,
     const int colelements, const int strideelements_out, const int strideelements_in,
     const int numchunks, void *counters, communicator *comm, cudaStream_t stream);
template void reducescatter2_userbuff_strided_atomic_fp8<__nv_fp8_e5m2>(
void *output, float *scale, const int handler, const int offset, const int rowelements,
const int colelements, const int strideelements_out, const int strideelements_in,
const int numchunks, void *counters, communicator *comm, cudaStream_t stream);
 template void reducescatter2_userbuff_strided_multiatomic_fp8<__nv_fp8_e4m3>(
     void *output, float *scale, const int handler, const int offset, const int rowelements,
     const int colelements, const int strideelements_out, const int strideelements_in,
     const int numchunks, void *counters, communicator *comm, cudaStream_t stream);
template void reducescatter2_userbuff_strided_multiatomic_fp8<__nv_fp8_e5m2>(
void *output, float *scale, const int handler, const int offset, const int rowelements,
const int colelements, const int strideelements_out, const int strideelements_in,
const int numchunks, void *counters, communicator *comm, cudaStream_t stream);
 __global__ void kuserbuffers_pullsend(int myrank, int peer, int *send_id, int *flagptr) {
   atomicAdd_system(flagptr, 1);
......
@@ -233,7 +233,9 @@ def initialize_ub(
                 wgrad_name = name.replace("dgrad", "wgrad")
                 assert wgrad_name not in ub_cfgs
                 layers_reduce_scatter_overlap.remove(wgrad_name)
layers_all_gather_overlap.remove(name)
                 layers_reduce_scatter_overlap.append(name)
methods["pipeline"].append(name)
     for name in methods["ring_exchange"] + methods["pipeline"] + methods["bulk"]:
         if ub_cfgs is not None and name in ub_cfgs:
......
@@ -184,7 +184,7 @@ class _LayerNormLinear(torch.autograd.Function):
                         fp8_dtype_forward,
                         out=ln_out_fp8,
                     )
-                    ln_out = ln_out_fp8
+                    ln_out = torch.empty_like(ln_out_fp8)
                 else:
                     ln_out_total = tex.cast_to_fp8(
                         ln_out_total,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment