add hcu support code

97f6e1c3 · zhanggzh · 1661daf1 · 97f6e1c3 · 97f6e1c3 · 97f6e1c3
Commit 97f6e1c3 authored Apr 25, 2025 by zhanggzh
3 changed files
--- a/src/libtorchaudio/cuctc/src/bitonic_topk/bitonic_sort.cuh
+++ b/src/libtorchaudio/cuctc/src/bitonic_topk/bitonic_sort.cuh
@@ -16,9 +16,10 @@ constexpr inline __host__ __device__ bool isPo2(IntType num) {
 }
 inline __device__ int laneId() {
-  int id;
+  //int id;
-  asm("mov.s32 %0, %%laneid;" : "=r"(id));
+  //asm("mov.s32 %0, %%laneid;" : "=r"(id));
-  return id;
+  //return id;
+  return __lane_id();
 }
 /**
 * @brief Shuffle the data inside a warp

--- a/src/libtorchaudio/cuctc/src/bitonic_topk/pow2_utils.cuh
+++ b/src/libtorchaudio/cuctc/src/bitonic_topk/pow2_utils.cuh
@@ -12,7 +12,7 @@ namespace cu_ctc {
 * @tparam IntType data type (checked only for integers)
 */
 template <typename IntType>
-constexpr __device__ IntType log2(IntType num, IntType ret = IntType(0)) {
+constexpr __host__ __device__ IntType log2(IntType num, IntType ret = IntType(0)) {
  return num <= IntType(1) ? ret : log2(num >> IntType(1), ++ret);
 }

--- a/src/libtorchaudio/cuctc/src/bitonic_topk/warpsort_topk.cuh
+++ b/src/libtorchaudio/cuctc/src/bitonic_topk/warpsort_topk.cuh
@@ -313,7 +313,7 @@ class warp_sort_filtered : public warp_sort<Capacity, Ascending, T, IdxT> {
  __device__ __forceinline__ void merge_buf_() {
    topk::bitonic<kMaxBufLen>(!Ascending, kWarpWidth).sort(val_buf_, idx_buf_);
-    this->merge_in<kMaxBufLen>(val_buf_, idx_buf_);
+    this->template merge_in<kMaxBufLen>(val_buf_, idx_buf_);
    buf_len_ = 0;
    set_k_th_(); // contains warp sync
 #pragma unroll
@@ -421,7 +421,7 @@ constexpr inline __host__ __device__ IntType ceildiv(IntType a, IntType b) {
  return (a + b - 1) / b;
 }
 template <typename IntType>
-constexpr inline __device__ IntType roundUp256(IntType num) {
+constexpr inline __host__ __device__ IntType roundUp256(IntType num) {
  // return (num + 255) / 256 * 256;
  constexpr int MASK = 255;
  return (num + MASK) & (~MASK);