Unverified commit 5221832e, authored by Ying Xiong, committed by GitHub
Browse files

Fix wrong idx bug in invertible LayerNormBackward1 (#692)



* fix wrong idx bug in invertible LayerNormBackward1

This index bug caused a wrong scale (gamma/beta) gradient.

* fix unexpected deletion

* fix idx for LayerNormBackward1_fused_add

* move pos definition in LayerNormBackward1 kernels

* fix format error
Co-authored-by: Reza Yazdani <reyazda@microsoft.com>
parent 852c524a
@@ -624,9 +624,8 @@ __global__ void LayerNormBackward1(const T* __restrict__ out_grad,
     int offset = threadIdx.y * width + idx;
     int y_stride = width * TILE_DIM;
-    int pos = blockIdx.x * TILE_DIM + threadIdx.y;
-    float betta_reg = (invertible ? (float)betta[pos] : 0.0f);
-    float gamma_reg = (float)gamma[pos];
+    float betta_reg = (invertible ? (float)betta[idx] : 0.0f);
+    float gamma_reg = (float)gamma[idx];
     // Loop across matrix height
     float betta_tmp = 0;
@@ -660,6 +659,7 @@ __global__ void LayerNormBackward1(const T* __restrict__ out_grad,
     }
     if (threadIdx.x == 0) {
+        int pos = blockIdx.x * TILE_DIM + threadIdx.y;
         betta_grad[pos] = s1;
         gamma_grad[pos] = s2;
     }
@@ -1368,9 +1368,8 @@ __global__ void LayerNormBackward1_fused_add(const T* __restrict__ out_grad1,
     int offset = threadIdx.y * width + idx;
     int y_stride = width * TILE_DIM;
-    int pos = blockIdx.x * TILE_DIM + threadIdx.y;
-    float betta_reg = (invertible ? (float)betta[pos] : 0.0f);
-    float gamma_reg = (float)gamma[pos];
+    float betta_reg = (invertible ? (float)betta[idx] : 0.0f);
+    float gamma_reg = (float)gamma[idx];
     // Loop across matrix height
     float betta_tmp = 0;
@@ -1404,6 +1403,7 @@ __global__ void LayerNormBackward1_fused_add(const T* __restrict__ out_grad1,
     }
     if (threadIdx.x == 0) {
+        int pos = blockIdx.x * TILE_DIM + threadIdx.y;
         betta_grad[pos] = s1;
         gamma_grad[pos] = s2;
     }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment