Fixed compile errors in the ChannelsLast C++ code; unfortunately, this commit also includes format-on-save changes

3dd35b45 · Max Rietmann · e1338191 · 3dd35b45
Commit 3dd35b45 authored Jul 03, 2025 by Max Rietmann
Hide whitespace changes
Inline Side-by-side

Showing with 265 additions and 273 deletions

torch_harmonics/csrc/attention/attention_bwd_cuda.cu torch_harmonics/csrc/attention/attention_bwd_cuda.cu +265 -273

No files found.
--- a/torch_harmonics/csrc/attention/attention_bwd_cuda.cu
+++ b/torch_harmonics/csrc/attention/attention_bwd_cuda.cu
@@ -2,7 +2,7 @@
 //
 // SPDX-FileCopyrightText: Copyright (c) 2025 The torch-harmonics Authors. All rights reserved.
 // SPDX-License-Identifier: BSD-3-Clause
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are met:
 //
@@ -51,310 +51,302 @@
 #define THREADS (64)
 #endif
 #ifndef DIV_UP
-#define DIV_UP(a,b) (((a)+((b)-1))/(b))
+#define DIV_UP(a, b) (((a) + ((b) - 1)) / (b))
 #endif
 #ifndef CHECK_CUDA
-#define CHECK_CUDA(call) {                                            \
+#define CHECK_CUDA(call)                                                                                                 \
-    cudaError_t err = call;                                           \
+    {                                                                                                                    \
-    if( cudaSuccess != err) {                                         \
+        cudaError_t err = call;                                                                                          \
-      fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\\n",  \
+        if (cudaSuccess != err) {                                                                                        \
-              __FILE__, __LINE__, cudaGetErrorString( err) );         \
+            fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
-      exit(EXIT_FAILURE);                                             \
+            exit(EXIT_FAILURE);                                                                                          \
-    }}
+        }                                                                                                                \
+    }
 #endif
 #include <iostream>
 #include <chrono>
 #include <string>
-class ScopeTimer {
+class ScopeTimer
-public:
+{
-  explicit ScopeTimer(const std::string& label = "")
+  public:
-    : label_(label), start_(std::chrono::high_resolution_clock::now()) {}
+    explicit ScopeTimer(const std::string &label = "") :
+        label_(label), start_(std::chrono::high_resolution_clock::now())
+    {
+    }
-  ~ScopeTimer() {
+    ~ScopeTimer()
-    auto end = std::chrono::high_resolution_clock::now();
+    {
-    auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(end - start_);
+        auto end = std::chrono::high_resolution_clock::now();
-    std::cout << label_ << "Elapsed time: " << elapsed.count() << " ms" << std::endl;
+        auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(end - start_);
-  }
+        std::cout << label_ << "Elapsed time: " << elapsed.count() << " ms" << std::endl;
+    }
-private:
+  private:
-  std::string label_;
+    std::string label_;
-  std::chrono::high_resolution_clock::time_point start_;
+    std::chrono::high_resolution_clock::time_point start_;
 };
-static __device__ float __warp_sum(float val) {
+static __device__ float __warp_sum(float val)
+{
 #pragma unroll
-  for(int i = WARP_SIZE/2; i; i /= 2) {
+    for (int i = WARP_SIZE / 2; i; i /= 2) { val += __shfl_xor_sync(FULL_MASK, val, i); }
-    val += __shfl_xor_sync(FULL_MASK, val, i);
+    return val;
-  }
-  return val;
 }
 // easier to understand version of manual shfl_xor_sync, performance appears similar
-static __device__ float __warp_sum_cub(float val) {
+static __device__ float __warp_sum_cub(float val)
-  // use cub to reduce within a warp
+{
-  __shared__ typename cub::WarpReduce<float>::TempStorage temp_storage;
+    // use cub to reduce within a warp
+    __shared__ typename cub::WarpReduce<float>::TempStorage temp_storage;
-  // 1. Compute sum (initially only in lane 0)
-  float sum = cub::WarpReduce<float>(temp_storage).Sum(val);
+    // 1. Compute sum (initially only in lane 0)
-  // 2. Broadcast sum to all threads
+    float sum = cub::WarpReduce<float>(temp_storage).Sum(val);
-  sum = __shfl_sync(0xFFFFFFFF, sum, 0);
+    // 2. Broadcast sum to all threads
-  return sum;
+    sum = __shfl_sync(0xFFFFFFFF, sum, 0);
+    return sum;
 }
 // This kernel computes the backward pass for the S2 attention mechanism, using
 // shared memory as a cache and one warp per output point, warp-parallel over
 // channels, which should be layed out in the fastest dimension for coalesced
 // memory access.
-template<int BDIM_X>
+template <int BDIM_X>
-__global__
+__global__ __launch_bounds__(BDIM_X) void s2_attention_bwd_dkvq_kernel(
-__launch_bounds__(BDIM_X)
+    int num_channels, int nlon_in, int nlat_out, int nlon_out,
-  void s2_attention_bwd_dkvq_kernel(
+    const torch::PackedTensorAccessor32<float, 4, torch::RestrictPtrTraits> kx,
-                                        int num_channels,
+    const torch::PackedTensorAccessor32<float, 4, torch::RestrictPtrTraits> vx,
-                                        int nlon_in,
+    const torch::PackedTensorAccessor32<float, 4, torch::RestrictPtrTraits> qy,
-                                        int nlat_out,
+    const torch::PackedTensorAccessor32<float, 4, torch::RestrictPtrTraits> dy,
-                                        int nlon_out,
+    torch::PackedTensorAccessor32<float, 4, torch::RestrictPtrTraits> dydk,
-                                        const torch::PackedTensorAccessor32<float, 4, torch::RestrictPtrTraits> kx,
+    torch::PackedTensorAccessor32<float, 4, torch::RestrictPtrTraits> dydv,
-                                        const torch::PackedTensorAccessor32<float, 4, torch::RestrictPtrTraits> vx,
+    torch::PackedTensorAccessor32<float, 4, torch::RestrictPtrTraits> dydq,
-                                        const torch::PackedTensorAccessor32<float, 4, torch::RestrictPtrTraits> qy,
+    const torch::PackedTensorAccessor64<int64_t, 1, torch::RestrictPtrTraits> psi_col_idx,
-                                        const torch::PackedTensorAccessor32<float, 4, torch::RestrictPtrTraits> dy,
+    const torch::PackedTensorAccessor64<int64_t, 1, torch::RestrictPtrTraits> psi_row_offset,
-                                        torch::PackedTensorAccessor32<float, 4, torch::RestrictPtrTraits> dydk,
+    const torch::PackedTensorAccessor32<float, 1, torch::RestrictPtrTraits> quad_weights)
-                                        torch::PackedTensorAccessor32<float, 4, torch::RestrictPtrTraits> dydv,
+{
-                                        torch::PackedTensorAccessor32<float, 4, torch::RestrictPtrTraits> dydq,
-                                        const torch::PackedTensorAccessor64<int64_t, 1, torch::RestrictPtrTraits> psi_col_idx,
+    extern __shared__ float sh[];
-                                        const torch::PackedTensorAccessor64<int64_t, 1, torch::RestrictPtrTraits> psi_row_offset,
+    float *sh_alpha_k = sh + threadIdx.y * num_channels * 5;
-                                        const torch::PackedTensorAccessor32<float, 1, torch::RestrictPtrTraits> quad_weights) {
+    float *sh_alpha_vw = sh_alpha_k + num_channels;
+    float *sh_alpha_kvw = sh_alpha_vw + num_channels;
-  extern __shared__ float sh[];
+    float *sh_dy = sh_alpha_kvw + num_channels;
-  float* sh_alpha_k = sh + threadIdx.y * num_channels * 5;
+    float *sh_qy = sh_dy + num_channels;
-  float* sh_alpha_vw = sh_alpha_k + num_channels;
+    // (optionally, could use more shared memory for other intermediates)
-  float* sh_alpha_kvw = sh_alpha_vw + num_channels;
-  float *sh_dy = sh_alpha_kvw + num_channels;
+    const uint64_t batchId = blockIdx.y;
-  float* sh_qy = sh_dy + num_channels;
+    const uint64_t wid = uint64_t(blockIdx.x) * blockDim.y + threadIdx.y;
-  // (optionally, could use more shared memory for other intermediates)
+    if (wid >= uint64_t(nlat_out) * nlon_in) return;
+    const int tidx = threadIdx.x;
-  const uint64_t batchId = blockIdx.y;
+    const int ho = wid / nlon_out;
-  const uint64_t wid = uint64_t(blockIdx.x) * blockDim.y + threadIdx.y;
+    const int wo = wid - (ho * nlon_out);
-  if (wid >= uint64_t(nlat_out) * nlon_in) return;
-  const int tidx = threadIdx.x;
+    // Zero shared memory
-  const int ho = wid / nlon_out;
-  const int wo = wid - (ho * nlon_out);
-  // Zero shared memory
-  for (int chan = tidx; chan < num_channels; chan += WARP_SIZE) {
-    sh_alpha_k[chan] = 0.0f;
-    sh_alpha_vw[chan] = 0.0f;
-    sh_alpha_kvw[chan] = 0.0f;
-    sh_dy[chan] = dy[batchId][chan][ho][wo];
-    sh_qy[chan] = qy[batchId][chan][ho][wo];
-  }
-  float alpha_sum = 0.0f;
-  float qdotk_max = -FLT_MAX;
-  float integral = 0.0f;
-  __syncthreads();
-  const int64_t rbeg = psi_row_offset[ho];
-  const int64_t rend = psi_row_offset[ho+1];
-  const int rlen = rend - rbeg;
-  // First pass: find qdotk_max
-  for (int off = 0; off < rlen; off++) {
-    const int64_t col = psi_col_idx[rbeg + off];
-    const int hi = col / nlon_in;
-    const int wi = col - (hi * nlon_in);
-    const int wip = (wi + wo) - ((wi + wo) / nlon_in) * nlon_in;
-    float qdotk = 0.0f;
    for (int chan = tidx; chan < num_channels; chan += WARP_SIZE) {
-      qdotk += sh_qy[chan] * kx[batchId][chan][hi][wip];
+        sh_alpha_k[chan] = 0.0f;
+        sh_alpha_vw[chan] = 0.0f;
+        sh_alpha_kvw[chan] = 0.0f;
+        sh_dy[chan] = dy[batchId][chan][ho][wo];
+        sh_qy[chan] = qy[batchId][chan][ho][wo];
    }
-    qdotk = __warp_sum_cub(qdotk);
+    float alpha_sum = 0.0f;
-    qdotk_max = max(qdotk_max, qdotk);
+    float qdotk_max = -FLT_MAX;
-  }
+    float integral = 0.0f;
+    __syncthreads();
-  // Second pass: accumulate alpha_sum, integral, and shared stats
-  for (int off = 0; off < rlen; off++) {
+    const int64_t rbeg = psi_row_offset[ho];
-    const int64_t col = psi_col_idx[rbeg + off];
+    const int64_t rend = psi_row_offset[ho + 1];
-    const int hi = col / nlon_in;
+    const int rlen = rend - rbeg;
-    const int wi = col - (hi * nlon_in);
-    const int wip = (wi + wo) - ((wi + wo) / nlon_in) * nlon_in;
+    // First pass: find qdotk_max
-    float qdotk = 0.0f, gdotv = 0.0f;
+    for (int off = 0; off < rlen; off++) {
-    for (int chan = tidx; chan < num_channels; chan += WARP_SIZE) {
+        const int64_t col = psi_col_idx[rbeg + off];
-      qdotk += sh_qy[chan] * kx[batchId][chan][hi][wip];
+        const int hi = col / nlon_in;
-      gdotv += sh_dy[chan] * vx[batchId][chan][hi][wip];
+        const int wi = col - (hi * nlon_in);
+        const int wip = (wi + wo) - ((wi + wo) / nlon_in) * nlon_in;
+        float qdotk = 0.0f;
+        for (int chan = tidx; chan < num_channels; chan += WARP_SIZE) {
+            qdotk += sh_qy[chan] * kx[batchId][chan][hi][wip];
+        }
+        qdotk = __warp_sum_cub(qdotk);
+        qdotk_max = max(qdotk_max, qdotk);
    }
-    qdotk = __warp_sum_cub(qdotk);
-    gdotv = __warp_sum_cub(gdotv);
+    // Second pass: accumulate alpha_sum, integral, and shared stats
-    float alpha_inz = expf(qdotk - qdotk_max) * quad_weights[hi];
+    for (int off = 0; off < rlen; off++) {
-    alpha_sum += alpha_inz;
+        const int64_t col = psi_col_idx[rbeg + off];
-    integral += alpha_inz * gdotv;
+        const int hi = col / nlon_in;
-    for (int chan = tidx; chan < num_channels; chan += WARP_SIZE) {
+        const int wi = col - (hi * nlon_in);
-      float kxval = kx[batchId][chan][hi][wip];
+        const int wip = (wi + wo) - ((wi + wo) / nlon_in) * nlon_in;
-      sh_alpha_k[chan] += alpha_inz * kxval;
+        float qdotk = 0.0f, gdotv = 0.0f;
-      sh_alpha_vw[chan] += alpha_inz * gdotv;
+        for (int chan = tidx; chan < num_channels; chan += WARP_SIZE) {
-      sh_alpha_kvw[chan] += alpha_inz * kxval * gdotv;
+            qdotk += sh_qy[chan] * kx[batchId][chan][hi][wip];
+            gdotv += sh_dy[chan] * vx[batchId][chan][hi][wip];
+        }
+        qdotk = __warp_sum_cub(qdotk);
+        gdotv = __warp_sum_cub(gdotv);
+        float alpha_inz = expf(qdotk - qdotk_max) * quad_weights[hi];
+        alpha_sum += alpha_inz;
+        integral += alpha_inz * gdotv;
+        for (int chan = tidx; chan < num_channels; chan += WARP_SIZE) {
+            float kxval = kx[batchId][chan][hi][wip];
+            sh_alpha_k[chan] += alpha_inz * kxval;
+            sh_alpha_vw[chan] += alpha_inz * gdotv;
+            sh_alpha_kvw[chan] += alpha_inz * kxval * gdotv;
+        }
    }
-  }
+    integral /= alpha_sum;
-  integral /= alpha_sum;
+    // Write dydq
-  // Write dydq
-  for (int chan = tidx; chan < num_channels; chan += WARP_SIZE) {
-    dydq[batchId][chan][ho][wo] = (sh_alpha_kvw[chan] * alpha_sum - sh_alpha_vw[chan] * sh_alpha_k[chan]) / (alpha_sum * alpha_sum);
-  }
-  // Third pass: accumulate gradients for k and v
-  for (int off = 0; off < rlen; off++) {
-    const int64_t col = psi_col_idx[rbeg + off];
-    const int hi = col / nlon_in;
-    const int wi = col - (hi * nlon_in);
-    const int wip = (wi + wo) - ((wi + wo) / nlon_in) * nlon_in;
-    float qdotk = 0.0f, gdotv = 0.0f;
    for (int chan = tidx; chan < num_channels; chan += WARP_SIZE) {
-      qdotk += qy[batchId][chan][ho][wo] * kx[batchId][chan][hi][wip];
+        dydq[batchId][chan][ho][wo]
-      gdotv += sh_dy[chan] * vx[batchId][chan][hi][wip];
+            = (sh_alpha_kvw[chan] * alpha_sum - sh_alpha_vw[chan] * sh_alpha_k[chan]) / (alpha_sum * alpha_sum);
    }
-    qdotk = __warp_sum_cub(qdotk);
-    gdotv = __warp_sum_cub(gdotv);
+    // Third pass: accumulate gradients for k and v
-    float alpha_inz = expf(qdotk - qdotk_max) * quad_weights[hi];
+    for (int off = 0; off < rlen; off++) {
-    for (int chan = tidx; chan < num_channels; chan += WARP_SIZE) {
+        const int64_t col = psi_col_idx[rbeg + off];
-      float qyval = qy[batchId][chan][ho][wo];
+        const int hi = col / nlon_in;
-      float dyval = sh_dy[chan];
+        const int wi = col - (hi * nlon_in);
-      atomicAdd(&dydk[batchId][chan][hi][wip], qyval * (alpha_inz / alpha_sum) * (gdotv - integral));
+        const int wip = (wi + wo) - ((wi + wo) / nlon_in) * nlon_in;
-      atomicAdd(&dydv[batchId][chan][hi][wip], (alpha_inz / alpha_sum) * dyval);
+        float qdotk = 0.0f, gdotv = 0.0f;
+        for (int chan = tidx; chan < num_channels; chan += WARP_SIZE) {
+            qdotk += qy[batchId][chan][ho][wo] * kx[batchId][chan][hi][wip];
+            gdotv += sh_dy[chan] * vx[batchId][chan][hi][wip];
+        }
+        qdotk = __warp_sum_cub(qdotk);
+        gdotv = __warp_sum_cub(gdotv);
+        float alpha_inz = expf(qdotk - qdotk_max) * quad_weights[hi];
+        for (int chan = tidx; chan < num_channels; chan += WARP_SIZE) {
+            float qyval = qy[batchId][chan][ho][wo];
+            float dyval = sh_dy[chan];
+            atomicAdd(&dydk[batchId][chan][hi][wip], qyval * (alpha_inz / alpha_sum) * (gdotv - integral));
+            atomicAdd(&dydv[batchId][chan][hi][wip], (alpha_inz / alpha_sum) * dyval);
+        }
    }
-  }
 }
+std::tuple<at::Tensor, at::Tensor, at::Tensor> s2_attention_bwd_dkvq_cuda(at::Tensor kx, at::Tensor vx, at::Tensor qy,
+                                                                          at::Tensor dy, at::Tensor quad_weights,
+                                                                          at::Tensor psi_col_idx, at::Tensor psi_row_off,
-std::tuple<at::Tensor,at::Tensor,at::Tensor> s2_attention_bwd_dkvq_cuda(at::Tensor kx, at::Tensor vx,
+                                                                          int nlon_in, int nlat_out, int nlon_out)
-                                                                        at::Tensor qy,
+{
-                                                                        at::Tensor dy,
-                                                                        at::Tensor quad_weights,
+    CHECK_CUDA_TENSOR(kx);
-                                                                        at::Tensor psi_col_idx,
+    CHECK_CUDA_TENSOR(vx);
-                                                                        at::Tensor psi_row_off,
+    CHECK_CUDA_TENSOR(qy);
-                                                                        int nlon_in, int nlat_out, int nlon_out) {
+    CHECK_CUDA_TENSOR(quad_weights);
+    CHECK_CUDA_TENSOR(psi_col_idx);
-  CHECK_CUDA_TENSOR(kx);
+    CHECK_CUDA_TENSOR(psi_row_off);
-  CHECK_CUDA_TENSOR(vx);
+    CHECK_CUDA_TENSOR(dy);
-  CHECK_CUDA_TENSOR(qy);
-  CHECK_CUDA_TENSOR(quad_weights);
+    auto stream = at::cuda::getCurrentCUDAStream().stream();
-  CHECK_CUDA_TENSOR(psi_col_idx);
-  CHECK_CUDA_TENSOR(psi_row_off);
+    // Transpose to [batch, ho, wo, channel]
-  CHECK_CUDA_TENSOR(dy);
+    nvtxRangePush("s2_attention_bwd_dkvq_kernel_mbT permute inputs");
+    // auto* permute_timer = new ScopeTimer("permute inputs");
-  auto stream = at::cuda::getCurrentCUDAStream().stream();
+    // extract dtype
-  // Transpose to [batch, ho, wo, channel]
+    auto kx_type = kx.dtype();
-  nvtxRangePush("s2_attention_bwd_dkvq_kernel_mbT permute inputs");
+    auto vx_type = vx.dtype();
-  // auto* permute_timer = new ScopeTimer("permute inputs");
+    auto qy_type = qy.dtype();
+    auto dy_type = dy.dtype();
-  // extract dtype
-  auto kx_type = kx.dtype();
+    // exract memory format
-  auto vx_type = vx.dtype();
+    auto kx_is_channels_last = kx.is_contiguous(at::MemoryFormat::ChannelsLast);
-  auto qy_type = qy.dtype();
+    auto vx_is_channels_last = vx.is_contiguous(at::MemoryFormat::ChannelsLast);
-  auto dy_type = dy.dtype();
+    auto qy_is_channels_last = qy.is_contiguous(at::MemoryFormat::ChannelsLast);
+    auto dy_is_channels_last = dy.is_contiguous(at::MemoryFormat::ChannelsLast);
-  // exract memory format
-  auto kx_is_channels_last = kx.is_contiguous(at::MemoryFormat::Channels_last);
+    // convert to channels-last
-  auto vx_is_channels_last = vx.is_contiguous(at::MemoryFormat::Channels_last);
+    auto kxP = kx.to(torch::kFloat32).to(at::MemoryFormat::ChannelsLast);
-  auto qy_is_channels_last = qy.is_contiguous(at::MemoryFormat::Channels_last);
+    auto vxP = vx.to(torch::kFloat32).to(at::MemoryFormat::ChannelsLast);
-  auto dy_is_channels_last = dy.is_contiguous(at::MemoryFormat::Channels_last);
+    auto qyP = qy.to(torch::kFloat32).to(at::MemoryFormat::ChannelsLast);
+    auto dyP = dy.to(torch::kFloat32).to(at::MemoryFormat::ChannelsLast);
-  // convert to channels-last
-  auto kxP = kx.to(torch::kFloat32, at::MemoryFormat::ChannelsLast);
+    // cudaDeviceSynchronize();
-  auto vxP = vx.to(torch::kFloat32, at::MemoryFormat::ChannelsLast);
+    // delete permute_timer;
-  auto qyP = qy.to(torch::kFloat32, at::MemoryFormat::ChannelsLast);
+    nvtxRangePop();
-  auto dyP = dy.to(torch::kFloat32, at::MemoryFormat::ChannelsLast);
+    nvtxRangePush("s2_attention_bwd_dkvq_kernel_mbT output allocation & zero");
-  // cudaDeviceSynchronize();
+    auto dydk = torch::zeros_like(qyP);
-  // delete permute_timer;
+    auto dydv = torch::zeros_like(qyP);
-  nvtxRangePop();
+    auto dydq = torch::zeros_like(qyP);
+    // print strdie of dydkP, dydvP, dydqP
-  nvtxRangePush("s2_attention_bwd_dkvq_kernel_mbT output allocation & zero");
+    nvtxRangePop();
-  auto dydk = torch::zeros_like(qyP);
-  auto dydv = torch::zeros_like(qyP);
+    size_t uo_num_channels = kx.size(1);
-  auto dydq = torch::zeros_like(qyP);
+    const int batch_size = kx.size(0);
-  // print strdie of dydkP, dydvP, dydqP
-  nvtxRangePop();
+    dim3 block(WARP_SIZE, THREADS / WARP_SIZE);
+    dim3 grid(DIV_UP(nlat_out * nlon_out, block.y), batch_size);
-  size_t uo_num_channels = kx.size(1);
+    size_t shared_size = sizeof(float) * uo_num_channels * 5 * block.y; // 4 arrays per warp
-  const int batch_size = kx.size(0);
+    cudaEvent_t start, stop;
-  dim3 block(WARP_SIZE, THREADS/WARP_SIZE);
+    float milliseconds = 0;
-  dim3 grid(DIV_UP(nlat_out*nlon_out, block.y), batch_size);
+    CHECK_CUDA(cudaEventCreate(&start));
-  size_t shared_size = sizeof(float) * uo_num_channels * 5 * block.y; // 4 arrays per warp
+    CHECK_CUDA(cudaEventCreate(&stop));
+    CHECK_CUDA(cudaEventRecord(start, stream));
-  cudaEvent_t start, stop;
-  float milliseconds = 0;
+    s2_attention_bwd_dkvq_kernel<THREADS><<<grid, block, shared_size, stream>>>(
-  CHECK_CUDA(cudaEventCreate(&start));
+        uo_num_channels, nlon_in, nlat_out, nlon_out, kxP.packed_accessor32<float, 4, torch::RestrictPtrTraits>(),
-  CHECK_CUDA(cudaEventCreate(&stop));
+        vxP.packed_accessor32<float, 4, torch::RestrictPtrTraits>(),
-  CHECK_CUDA(cudaEventRecord(start, stream));
+        qyP.packed_accessor32<float, 4, torch::RestrictPtrTraits>(),
+        dyP.packed_accessor32<float, 4, torch::RestrictPtrTraits>(),
-  s2_attention_bwd_dkvq_kernel<THREADS><<<
+        dydk.packed_accessor32<float, 4, torch::RestrictPtrTraits>(),
-    grid, block, shared_size, stream>>>(
+        dydv.packed_accessor32<float, 4, torch::RestrictPtrTraits>(),
-                                        uo_num_channels, nlon_in, nlat_out, nlon_out,
+        dydq.packed_accessor32<float, 4, torch::RestrictPtrTraits>(),
-                                        kxP.packed_accessor32<float, 4, torch::RestrictPtrTraits>(),
+        psi_col_idx.packed_accessor64<int64_t, 1, torch::RestrictPtrTraits>(),
-                                        vxP.packed_accessor32<float, 4, torch::RestrictPtrTraits>(),
+        psi_row_off.packed_accessor64<int64_t, 1, torch::RestrictPtrTraits>(),
-                                        qyP.packed_accessor32<float, 4, torch::RestrictPtrTraits>(),
+        quad_weights.packed_accessor32<float, 1, torch::RestrictPtrTraits>());
-                                        dyP.packed_accessor32<float, 4, torch::RestrictPtrTraits>(),
-                                        dydk.packed_accessor32<float, 4, torch::RestrictPtrTraits>(),
+    CHECK_CUDA(cudaEventRecord(stop, stream));
-                                        dydv.packed_accessor32<float, 4, torch::RestrictPtrTraits>(),
+    CHECK_CUDA(cudaEventSynchronize(stop));
-                                        dydq.packed_accessor32<float, 4, torch::RestrictPtrTraits>(),
+    CHECK_CUDA(cudaEventElapsedTime(&milliseconds, start, stop));
-                                        psi_col_idx.packed_accessor64<int64_t, 1, torch::RestrictPtrTraits>(),
-                                        psi_row_off.packed_accessor64<int64_t, 1, torch::RestrictPtrTraits>(),
+    // [1, 256, 1, (721, 1440), (721, 1440), "equiangular", "equiangular", 1e-5, 1e-5],
-                                        quad_weights.packed_accessor32<float, 1, torch::RestrictPtrTraits>());
+    // s2_attention_bwd_kernel_mbT execution time: 63.280128 ms
+    // printf("s2_attention_bwd_kernel_mbT execution time: %f ms\n", milliseconds);
-  CHECK_CUDA(cudaEventRecord(stop, stream));
+    CHECK_CUDA(cudaEventDestroy(start));
-  CHECK_CUDA(cudaEventSynchronize(stop));
+    CHECK_CUDA(cudaEventDestroy(stop));
-  CHECK_CUDA(cudaEventElapsedTime(&milliseconds, start, stop));
+    C10_CUDA_KERNEL_LAUNCH_CHECK();
-  // [1, 256, 1, (721, 1440), (721, 1440), "equiangular", "equiangular", 1e-5, 1e-5],
-  // s2_attention_bwd_kernel_mbT execution time: 63.280128 ms
+    // Permute outputs back to memory layout given by input. if input had channels
-  // printf("s2_attention_bwd_kernel_mbT execution time: %f ms\n", milliseconds);
+    // first, leave it in that layout, otherwise permute layout back to [batch,
-  CHECK_CUDA(cudaEventDestroy(start));
+    // channel, ho, wo]
-  CHECK_CUDA(cudaEventDestroy(stop));
+    // convert back to original dtype
-  C10_CUDA_KERNEL_LAUNCH_CHECK();
-  // Permute outputs back to memory layout given by input. if input had channels
-  // first, leave it in that layout, otherwise permute layout back to [batch,
-  // channel, ho, wo]
-  // convert back to original dtype
-  dydk = dydk.to(kx_type);
-  dydv = dydv.to(vx_type);
-  dydq = dydq.to(qy_type);
-  // permute back to original layout
-  if(!kx_is_channels_last){
-    dydk = dydk.to(kx_type, at::MemoryFormat::Contiguous);
-  } else {
    dydk = dydk.to(kx_type);
-  }
-  if(!vx_is_channels_last){
-    dydv = dydv.to(vx_type, at::MemoryFormat::Contiguous);
-  } else {
    dydv = dydv.to(vx_type);
-  }
+    dydq = dydq.to(qy_type);
-  if(!qy_is_channels_last) {
-    dydq = dydq.to(qy_type, at::MemoryFormat::Contiguous);
-  } else {
-    dydq = dydq.to(qy_type)
-  }
-  // printf("dydk strides:  [");
-  // for(auto& stride : dydk.strides()) {
-  //   printf("%ld,", stride);
-  // }
-  // printf("]\n");
-  // cudaDeviceSynchronize();
-  // delete permute_output_timer;
-  // nvtxRangePop();
-  return std::make_tuple(dydk, dydv, dydq);
-}
+    // permute back to original layout
+    if (!kx_is_channels_last) {
+        dydk = dydk.to(kx_type).to(at::MemoryFormat::Contiguous);
+    } else {
+        dydk = dydk.to(kx_type);
+    }
+    if (!vx_is_channels_last) {
+        dydv = dydv.to(vx_type).to(at::MemoryFormat::Contiguous);
+    } else {
+        dydv = dydv.to(vx_type);
+    }
+    if (!qy_is_channels_last) {
+        dydq = dydq.to(qy_type).to(at::MemoryFormat::Contiguous);
+    } else {
+        dydq = dydq.to(qy_type);
+    }
+    // printf("dydk strides:  [");
+    // for(auto& stride : dydk.strides()) {
+    //   printf("%ld,", stride);
+    // }
+    // printf("]\n");
+    // cudaDeviceSynchronize();
+    // delete permute_output_timer;
+    // nvtxRangePop();
+    return std::make_tuple(dydk, dydv, dydq);
+}