Get CUDA forward working correctly.

6a77cb45 · Daniel Povey · 74897fd5 · 6a77cb45 · 6a77cb45 · 6a77cb45
Commit 6a77cb45 authored Jul 15, 2021 by Daniel Povey
3 changed files
--- a/torch_learned_nonlin/learned_nonlin_cpu.cpp
+++ b/torch_learned_nonlin/learned_nonlin_cpu.cpp
@@ -54,7 +54,8 @@ torch::Tensor learned_nonlin_cpu(torch::Tensor input,

        for (int b = 0; b < B; b++) {
          for (int c = 0; c < C; c++) {
-            scalar_t inv_scale = exp(-params_a[c][0]);
+            scalar_t scale = exp(params_a[c][0]),
+                inv_scale = 1.0 / scale;
            for (int t = 0; t < T; t++) {
              // `x` is the scaled input x plus an offset so that -K maps to 0.
              //  Note: the discontinuities in our function are at -(K-1) ... +(K+1),
@@ -68,7 +69,7 @@ torch::Tensor learned_nonlin_cpu(torch::Tensor input,
              // C++ rounds toward zero.
              int n = (int) x_trunc;
              // OK, at this point, 0 <= min < 2*K.
-              scalar_t y = (x - n) * params_a[c][n + 1] + y_vals_a[c][n];
+              scalar_t y = (x - n) * scale * params_a[c][n + 1] + y_vals_a[c][n];
              /* printf("x = %f, y = %f, n = %d; y = (%f - %d) * %f+ %f\n", x, y, n,
                   x, n, params_a[c][n + 1], y_vals_a[c][n - 1]); */
              output_a[b][c][t] = y;
@@ -139,8 +140,10 @@ std::vector<torch::Tensor> learned_nonlin_backward_cpu(torch::Tensor input,

        for (int b = 0; b < B; b++) {
          for (int c = 0; c < C; c++) {
-            scalar_t inv_scale = exp(-params_a[c][0]),
-                inv_scale_grad = 0.0;
+            scalar_t scale = exp(params_a[c][0]),
+                inv_scale = 1.0 / scale,
+                inv_scale_grad = 0.0,
+                scale_grad = 0.0;
            for (int t = 0; t < T; t++) {
              // `x` is the scaled input x plus an offset so that -K maps to 0.
              //  Note: the discontinuities in our function are at -(K-1) ... +(K+1),
@@ -157,16 +160,22 @@ std::vector<torch::Tensor> learned_nonlin_backward_cpu(torch::Tensor input,
              int n = (int) x_trunc;
              // OK, at this point, 0 <= n < 2*K.
              // backprop for:
-              // scalar_t y = (x - (scalar_t)n) * params_a[c][n + 1] + y_vals_a[c][n];
-              scalar_t x_grad = y_grad * params_a[c][n + 1];
-              params_grad_a[c][n + 1] += y_grad * (x - (scalar_t)n);
+              // scalar_t x_residual_scaled = (x - (scalar_t)n) * scale
+              // scalar_t y = x_residual_scaled * params_a[c][n + 1] + y_vals_a[c][n];
+              scalar_t x_residual_scaled = (x - n) * scale,
+                  x_residual_scaled_grad = y_grad * params_a[c][n + 1],
+                  x_grad = x_residual_scaled_grad * scale;
+              scale_grad += x_residual_scaled_grad * (x - (scalar_t)n);
+              params_grad_a[c][n + 1] += y_grad * x_residual_scaled;
              y_vals_grad_a[c][n] += y_grad;
              // backprop for:  x = input * inv_scale + K,
              inv_scale_grad += x_grad * input;
              input_grad_a[b][c][t] = x_grad * inv_scale;
            }
-            // Do the backprop for: inv_scale = exp(-params_a[c][0])
-            params_grad_a[c][0] -= inv_scale * inv_scale_grad;
+            // Do the backprop for:
+            //    scale = exp(params_a[c][0]);
+            //    inv_scale = exp(-params_a[c][0]);
+            params_grad_a[c][0] += (scale * scale_grad - inv_scale * inv_scale_grad);
          }
        }
        // Now do the backprop for the loop above where we set y_vals_a.

--- a/torch_learned_nonlin/learned_nonlin_cuda_kernel.cu
+++ b/torch_learned_nonlin/learned_nonlin_cuda_kernel.cu
@@ -108,54 +108,53 @@ void learned_nonlin_kernel(
                                              // spaces between here and
                                              // `params_buf` for storing scale
                                              // and inv_scale and l == params[c][0].
-      *params_buf = (scalar_t*) y_vals + 3 + N;  // [N].  Caution: contains params[c][1] through params[c][N].
+      *params_buf = (scalar_t*) y_vals + 3 + N;  // [N].  Contains params[c][1] * scale through params[c][N] * scale,
                                                 //  params_buf[-1] contains params[c][0] == log of scale;
                                                 // params_buf[-2] and params_buf[-3] contain scale and inv_scale.
  // Load parameters
  if (threadIdx.x <= N)
    params_buf[threadIdx.x - 1] = params[c][threadIdx.x];
-
  __syncthreads();
-  // The easiest way to understand this code is to compare it with the CPU code
-  // in learned_nonlin_cpu.cpp.
-  // TODO: replace this with easier-to-understand code.
-  if ((((int)threadIdx.x & ~(int)64)) == 0) {
-    // threadIdx.x == 0 or 64 (we choose 64 because it's >= the max known warp
-    // size).  These are in separate warps so we can allow them to do separate
-    // jobs.  This code takes linear time in K which is not at all ideal and
-    // could be improved if K is largish, but it shouldn't dominate the total
-    // time taken if we are processing a lot of data; and anyway, we doubt that
-    // K will be need to be more than 4 or 8 or so, so the potential savings are
-    // quite small.
+
+  if (threadIdx.x == 0) {
    scalar_t scale = exp(params_buf[-1]),
        inv_scale = 1.0 / scale;
-    params_buf[-2] = scale;  // both threads write these but it's OK, it's the
-                             // same value.
+    params_buf[-2] = scale;
    params_buf[-3] = inv_scale;
-    int sign,
-        Koffset;  // Koffset == K for threads handling sum_positive and K - 1
-                  // for threads handling sum_negative, see
-                  // learned_nonlin_cpu.cpp for reference code.  This would be K
-                  // + 1 and K respectively, except our params_buf has its index
-                  // shifted by one versus params.
-    if (threadIdx.x == 0) {  // sum_positive
-      sign = 1;
-      Koffset = K;
-    } else {  // threadIdx.x == 64.  sum_negative.
-      scale *= -1;  // this is a local variable..
-      sign = -1;
-      Koffset = K - 1;
+  }
+  __syncthreads();
+
+  if (threadIdx.x < N) {
+    scalar_t scale = params_buf[-2];
+    params_buf[threadIdx.x] = params_buf[threadIdx.x] * scale;
+  }
+
+  __syncthreads();
+
+  // The easiest way to understand this code is to compare it with the CPU code
+  // in learned_nonlin_cpu.cpp.
+
+  if (threadIdx.x == 0) {
+    scalar_t sum_positive = 0.0;
+    for (int i = 0; i < K; i++) {
+      y_vals[K + i] = sum_positive;
+      // versus the CPU code, the params_buf is indexed off by 1; and it already
+      // contains the factor "scale".
+      sum_positive += params_buf[K + i];
    }
-    scalar_t sum = 0.0;
+
+  } else if (threadIdx.x == 64) {
+    scalar_t sum_negative = 0.0;
    for (int i = 0; i < K; i++) {
-      int isign = i * sign;
-      y_vals[K + isign] = sum * scale;
-      sum += params_buf[Koffset + isign];
+      y_vals[K - i] = sum_negative;
+      // versus the CPU code, the params_buf is indexed off by 1; and it already
+      // contains the factor "scale".
+      sum_negative -= params_buf[K - 1 - i];
    }
-    if (threadIdx.x != 0)  // sum_negative
-      y_vals[0] = sum * scale;
+    y_vals[0] = sum_negative;
  }
  __syncthreads();
+
  scalar_t inv_scale = params_buf[-3];

  int T_inc = THREADS_PER_BLOCK / images_per_thread_block,
@@ -176,7 +175,8 @@ void learned_nonlin_kernel(
      else if (x_trunc >= N) x_trunc = N - 1;
      // C++ rounds toward zero.
      int n = (int) x_trunc;
-      // OK, at this point, 0 <= min < N.
+      // OK, at this point, 0 <= min < N.  Versus the CPU code, we removed the
+      // factor of 'scale' because params_buf already has that factor.
      output[b][c][t] = (x - n) * params_buf[n] + y_vals[n];
    }
  }

--- a/torch_learned_nonlin/learned_nonlin_test.py
+++ b/torch_learned_nonlin/learned_nonlin_test.py
@@ -76,7 +76,7 @@ def test_learned_nonlin_deriv():
            y2 = learned_nonlin(x + delta_x, params, dim = 1)
            observed_change = (y_deriv * (y2 - y)).sum()
            print(f"for input: pred_change = {pred_change}, observed_change={observed_change}")
-            if not torch.allclose(pred_change, observed_change, rtol=1.0e-02, atol=1.0e-05):
+            if not torch.allclose(pred_change, observed_change, rtol=2.0e-02, atol=1.0e-05):
                print(f"For changed input, output differs too much: params={params}, input={x}, mod_input={x+delta_x}, y={y}, y2={y2}, diff={y2-y}")
                assert 0