"git@developer.sourcefind.cn:tsoc/hg-misc-tools.git" did not exist on "8b3fbed1960af5a6a25d003eb6c9ea5484d12533"
Unverified commit 52f0dcf0, authored by thatPepe, committed by GitHub

Merge pull request #1019 from InfiniTensor/issue/1008

Issue/1008
parents d0f405ce 68026bd1
@@ -39,6 +39,9 @@ def run_tests(args):
        "topkrouter.py",
        "topksoftmax.py",
        "zeros.py",
+       # "paged_attention.py",
+       # "paged_caching.py",
+       # "paged_attention_prefill.py"
    ]:
        result = subprocess.run(
            f"python {test} {args} --debug", text=True, encoding="utf-8", shell=True
...
@@ -255,6 +255,8 @@ infiniStatus_t Descriptor::calculate(
        CALCULATE_LAYER_NORM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512)
    } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) {
        CALCULATE_LAYER_NORM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096)
+   } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_2048) {
+       CALCULATE_LAYER_NORM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_2048)
    } else {
        return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
    }
...
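The same CUDA_BLOCK_SIZE_2048 branch recurs in the logsoftmax, lp_norm, and softmax hunks below. For readers unfamiliar with the pattern: the block size is a compile-time template parameter, so each device limit needs its own kernel instantiation, selected at runtime from maxThreadsPerBlock. A minimal sketch of that pattern follows; reduceKernel and launchReduce are illustrative names, not code from this PR.

#include <cuda_runtime.h>

// Illustrative kernel: BLOCK_SIZE is a compile-time constant so shared
// memory can be sized statically and the reduction loop can unroll.
template <unsigned int BLOCK_SIZE>
__global__ void reduceKernel(const float *x, float *y, int n) {
    __shared__ float buf[BLOCK_SIZE];
    float acc = 0.f;
    for (int i = blockIdx.x * BLOCK_SIZE + threadIdx.x; i < n;
         i += gridDim.x * BLOCK_SIZE) {
        acc += x[i];
    }
    buf[threadIdx.x] = acc;
    __syncthreads();
    for (unsigned int s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
        if (threadIdx.x < s) buf[threadIdx.x] += buf[threadIdx.x + s];
        __syncthreads();
    }
    if (threadIdx.x == 0) y[blockIdx.x] = buf[0];
}

// Runtime dispatch mirroring the if/else chains in this diff: one branch
// per device limit; the 2048 branch covers Iluvatar-class parts.
cudaError_t launchReduce(const float *x, float *y, int n, cudaStream_t s) {
    int dev = 0, maxThreads = 0;
    cudaGetDevice(&dev);
    cudaDeviceGetAttribute(&maxThreads, cudaDevAttrMaxThreadsPerBlock, dev);
    if (maxThreads == 4096) {
        reduceKernel<4096><<<1, 4096, 0, s>>>(x, y, n);
    } else if (maxThreads == 2048) {
        reduceKernel<2048><<<1, 2048, 0, s>>>(x, y, n);
    } else if (maxThreads == 1024) {
        reduceKernel<1024><<<1, 1024, 0, s>>>(x, y, n);
    } else if (maxThreads == 512) {
        reduceKernel<512><<<1, 512, 0, s>>>(x, y, n);
    } else {
        return cudaErrorInvalidValue; // stand-in for DEVICE_ARCHITECTURE_NOT_SUPPORTED
    }
    return cudaGetLastError();
}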
@@ -174,6 +174,9 @@ infiniopDestroyLayerNormDescriptor(infiniopLayerNormDescriptor_t desc) {
#ifdef ENABLE_METAX_API
    DELETE(INFINI_DEVICE_METAX, metax);
#endif
+#ifdef ENABLE_ILUVATAR_API
+    DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
...
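DELETE(INFINI_DEVICE_ILUVATAR, nvidia) is the whole porting story for this operator: the second macro argument selects the backend namespace, so Iluvatar simply reuses the nvidia implementation. The macro body itself is not shown in this diff; a hypothetical expansion of such a dispatch macro, just to make the routing explicit, might look like:

// Hypothetical expansion pattern; the real DELETE macro in the repository
// may differ in details. The point is that the second argument is a
// namespace, so INFINI_DEVICE_ILUVATAR can route to the nvidia backend.
#define DELETE(CASE, NAMESPACE)                                                  \
    case CASE:                                                                   \
        delete reinterpret_cast<op::layer_norm::NAMESPACE::Descriptor *>(desc);  \
        return INFINI_STATUS_SUCCESS;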
@@ -117,6 +117,11 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
            y, x, _info.x_dtype, _info.y_dtype, _info.batch_size, _info.probs_size, _info.ndim, _info.seq_len,
            _info.y_stride_b, _info.y_stride_p, _info.x_stride_b, _info.x_stride_p,
            _info.y_stride_0, _info.y_stride_1, _info.x_stride_0, _info.x_stride_1, stream));
+   } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_2048) {
+       CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_2048>(
+           y, x, _info.x_dtype, _info.y_dtype, _info.batch_size, _info.probs_size, _info.ndim, _info.seq_len,
+           _info.y_stride_b, _info.y_stride_p, _info.x_stride_b, _info.x_stride_p,
+           _info.y_stride_0, _info.y_stride_1, _info.x_stride_0, _info.x_stride_1, stream));
    } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) {
        CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_4096>(
            y, x, _info.x_dtype, _info.y_dtype, _info.batch_size, _info.probs_size, _info.ndim, _info.seq_len,
...
@@ -40,7 +40,7 @@ __C infiniStatus_t infiniopCreateLogSoftmaxDescriptor(
    CREATE(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
-    // CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
+    CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
    CREATE(INFINI_DEVICE_QY, nvidia);
@@ -73,7 +73,7 @@ __C infiniStatus_t infiniopGetLogSoftmaxWorkspaceSize(infiniopLogSoftmaxDescript
    GET(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
-    // GET(INFINI_DEVICE_ILUVATAR, nvidia);
+    GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
    GET(INFINI_DEVICE_QY, nvidia);
@@ -111,7 +111,7 @@ __C infiniStatus_t infiniopLogSoftmax(
    CALCULATE(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
-    // CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
+    CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
    CALCULATE(INFINI_DEVICE_QY, nvidia);
@@ -144,7 +144,7 @@ __C infiniStatus_t infiniopDestroyLogSoftmaxDescriptor(infiniopLogSoftmaxDescrip
    DESTROY(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
-    // DESTROY(INFINI_DEVICE_ILUVATAR, nvidia);
+    DESTROY(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
    DESTROY(INFINI_DEVICE_QY, nvidia);
...
@@ -155,6 +155,8 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
        CALCULATE_LP_NORM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_1024)
    } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) {
        CALCULATE_LP_NORM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512)
+   } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_2048) {
+       CALCULATE_LP_NORM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_2048)
    } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) {
        CALCULATE_LP_NORM_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096)
    } else {
...
@@ -16,17 +16,66 @@ struct OnlineSoftmaxState {
    }
};

__device__ __forceinline__ float warpReduceSum(float x) {
+#if defined(ENABLE_ILUVATAR_API)
+    // Iluvatar may use warp size 64; __shfl_sync(0xffffffff) only covers 32 threads.
+    // Use shared-memory tree reduce for portability across warp sizes.
+    constexpr int kMaxWarps = 16;
+    __shared__ float _reduce_buf[kMaxWarps * 32];
+    const int lane = threadIdx.x & 31;
+    const int warp_id = threadIdx.x / 32;
+    _reduce_buf[threadIdx.x] = x;
+    __syncthreads();
+    for (int offset = 16; offset > 0; offset >>= 1) {
+        if (lane < offset) {
+            _reduce_buf[warp_id * 32 + lane] += _reduce_buf[warp_id * 32 + lane + offset];
+        }
+        __syncthreads();
+    }
+    return _reduce_buf[warp_id * 32];
+#else
    for (int offset = 16; offset > 0; offset >>= 1) {
        x += __shfl_down_sync(0xffffffff, x, offset);
    }
    return x;
+#endif
}

+__device__ __forceinline__ float warpBroadcast(float x, int src_lane) {
+#if defined(ENABLE_ILUVATAR_API)
+    __shared__ float _bcast_buf[16];
+    const int warp_id = threadIdx.x / 32;
+    if ((threadIdx.x & 31) == src_lane) {
+        _bcast_buf[warp_id] = x;
+    }
+    __syncthreads();
+    return _bcast_buf[warp_id];
+#else
+    return __shfl_sync(0xffffffff, x, src_lane);
+#endif
+}

__device__ __forceinline__ float warpReduceMax(float x) {
+#if defined(ENABLE_ILUVATAR_API)
+    __shared__ float _reduce_buf[16 * 32];
+    const int lane = threadIdx.x & 31;
+    const int warp_id = threadIdx.x / 32;
+    _reduce_buf[threadIdx.x] = x;
+    __syncthreads();
+    for (int offset = 16; offset > 0; offset >>= 1) {
+        if (lane < offset) {
+            float other = _reduce_buf[warp_id * 32 + lane + offset];
+            float cur = _reduce_buf[warp_id * 32 + lane];
+            _reduce_buf[warp_id * 32 + lane] = fmaxf(cur, other);
+        }
+        __syncthreads();
+    }
+    return _reduce_buf[warp_id * 32];
+#else
    for (int offset = 16; offset > 0; offset >>= 1) {
        x = fmaxf(x, __shfl_down_sync(0xffffffff, x, offset));
    }
    return x;
+#endif
}

__device__ __forceinline__ unsigned int cvtaToShared(const void *ptr) {
...
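Two properties of the fallback above are worth calling out: _reduce_buf is indexed by threadIdx.x, so blockDim.x must not exceed kMaxWarps * 32 = 512, and the __syncthreads() inside means every thread of the block must reach the call. A hypothetical self-check (not part of the PR, assuming the warpReduceSum above is in scope) that holds for both the shuffle and shared-memory paths, since lane 0 of each 32-lane group ends up holding the group's sum in either variant:

__global__ void checkWarpReduceSum(int *failures) {
    // Every thread contributes 1.0f, so each 32-lane group must sum to
    // exactly 32.0f (summing 32 ones is exact in fp32).
    float s = warpReduceSum(1.0f);
    if ((threadIdx.x & 31) == 0 && s != 32.0f) {
        atomicAdd(failures, 1);
    }
}
// Example launch (block size within the 512-thread limit of the fallback):
// checkWarpReduceSum<<<1, 256>>>(d_failures);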
#ifndef __PAGED_ATTENTION_PREFILL_KERNEL_V2_CUH__
#define __PAGED_ATTENTION_PREFILL_KERNEL_V2_CUH__

-#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ALI_API)
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ALI_API) || defined(ENABLE_ILUVATAR_API)

#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
@@ -194,8 +194,8 @@ __device__ void PagedAttentionPrefillWarpKernel(
        l = l * alpha + beta;
        m = m_new;
    }
-    alpha = __shfl_sync(0xffffffff, alpha, 0);
-    beta = __shfl_sync(0xffffffff, beta, 0);
+    alpha = op::paged_attention::cuda::warpBroadcast(alpha, 0);
+    beta = op::paged_attention::cuda::warpBroadcast(beta, 0);
#if defined(__CUDA_ARCH__)
    if constexpr (std::is_same_v<Tdata, half>) {
@@ -233,7 +233,7 @@ __device__ void PagedAttentionPrefillWarpKernel(
    if (lane == 0) {
        inv_l = 1.0f / (l + 1e-6f);
    }
-    inv_l = __shfl_sync(0xffffffff, inv_l, 0);
+    inv_l = op::paged_attention::cuda::warpBroadcast(inv_l, 0);
#pragma unroll
    for (int i = 0; i < DIMS_PER_THREAD; ++i) {
@@ -411,8 +411,8 @@ __global__ void PagedAttentionPrefillWarpGlobalKernel(
        l = l * alpha + beta;
        m = m_new;
    }
-    alpha = __shfl_sync(0xffffffff, alpha, 0);
-    beta = __shfl_sync(0xffffffff, beta, 0);
+    alpha = op::paged_attention::cuda::warpBroadcast(alpha, 0);
+    beta = op::paged_attention::cuda::warpBroadcast(beta, 0);
#if defined(__CUDA_ARCH__)
    if constexpr (std::is_same_v<Tdata, half>) {
@@ -450,7 +450,11 @@ __global__ void PagedAttentionPrefillWarpGlobalKernel(
    if (lane == 0) {
        inv_l = 1.0f / (l + 1e-6f);
    }
+#ifdef ENABLE_ILUVATAR_API
+    inv_l = op::paged_attention::cuda::warpBroadcast(inv_l, 0);
+#else
    inv_l = __shfl_sync(0xffffffff, inv_l, 0);
+#endif
#pragma unroll
    for (int i = 0; i < DIMS_PER_THREAD; ++i) {
@@ -785,8 +789,8 @@ __device__ void PagedAttentionPrefillWarpCtaKernel(
        l = l * alpha + beta;
        m = m_new;
    }
-    alpha = __shfl_sync(0xffffffff, alpha, 0);
-    beta = __shfl_sync(0xffffffff, beta, 0);
+    alpha = op::paged_attention::cuda::warpBroadcast(alpha, 0);
+    beta = op::paged_attention::cuda::warpBroadcast(beta, 0);
#if defined(__CUDA_ARCH__)
    if constexpr (std::is_same_v<Tdata, half>) {
@@ -826,7 +830,7 @@ __device__ void PagedAttentionPrefillWarpCtaKernel(
    if (lane == 0) {
        inv_l = 1.0f / (l + 1e-6f);
    }
-    inv_l = __shfl_sync(0xffffffff, inv_l, 0);
+    inv_l = op::paged_attention::cuda::warpBroadcast(inv_l, 0);
#pragma unroll
    for (int i = 0; i < DIMS_PER_THREAD; ++i) {
@@ -1270,7 +1274,7 @@ __device__ void PagedAttentionPrefillWarpCtaKernelPipelined(
    if (lane == 0) {
        inv_l = 1.0f / (l + 1e-6f);
    }
-    inv_l = __shfl_sync(0xffffffff, inv_l, 0);
+    inv_l = op::paged_attention::cuda::warpBroadcast(inv_l, 0);
#pragma unroll
    for (int i = 0; i < DIMS_PER_THREAD; ++i) {
@@ -1961,8 +1965,8 @@ __device__ void PagedAttentionPrefillWarpCtaKernelKOnly(
        l = l * alpha + beta;
        m = m_new;
    }
-    alpha = __shfl_sync(0xffffffff, alpha, 0);
-    beta = __shfl_sync(0xffffffff, beta, 0);
+    alpha = op::paged_attention::cuda::warpBroadcast(alpha, 0);
+    beta = op::paged_attention::cuda::warpBroadcast(beta, 0);
#if defined(__CUDA_ARCH__)
    if constexpr (std::is_same_v<Tdata, half>) {
@@ -2002,7 +2006,7 @@ __device__ void PagedAttentionPrefillWarpCtaKernelKOnly(
    if (lane == 0) {
        inv_l = 1.0f / (l + 1e-6f);
    }
-    inv_l = __shfl_sync(0xffffffff, inv_l, 0);
+    inv_l = op::paged_attention::cuda::warpBroadcast(inv_l, 0);
#pragma unroll
    for (int i = 0; i < DIMS_PER_THREAD; ++i) {
@@ -2131,7 +2135,7 @@ __device__ __forceinline__ void PagedAttentionPrefillMmaScoreWriteRow(
    if (lane == 0) {
        inv_l = 1.0f / (l + 1e-6f);
    }
-    inv_l = __shfl_sync(0xffffffff, inv_l, 0);
+    inv_l = op::paged_attention::cuda::warpBroadcast(inv_l, 0);
    const int64_t q_token = q_start + static_cast<int64_t>(q_token_local);
    half *out_ptr = out_ + q_token * o_stride + static_cast<int64_t>(head_idx) * o_head_stride;
...
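The alpha/beta/inv_l values that warpBroadcast now carries are the standard online-softmax rescaling: lane 0 maintains the running max m and normalizer l, and after each tile the broadcast lets all lanes rescale their partial outputs consistently. In the usual notation, which the code's variable names follow (epsilon = 1e-6 guards the final division):

$$
m^{(t)} = \max\!\big(m^{(t-1)},\, \tilde{m}_t\big), \qquad
\alpha = e^{\,m^{(t-1)} - m^{(t)}}, \qquad
\beta = \sum_j e^{\,s_{t,j} - m^{(t)}}
$$
$$
l^{(t)} = \alpha\, l^{(t-1)} + \beta, \qquad
o \leftarrow o \cdot \frac{1}{l^{(T)} + \varepsilon}, \quad \varepsilon = 10^{-6}
$$

The first recurrence is the literal `l = l * alpha + beta; m = m_new;` in the hunks above, and the final scaling is the `inv_l = 1.0f / (l + 1e-6f)` that lane 0 computes and broadcasts.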
@@ -21,6 +21,11 @@ constexpr size_t ceilDiv(size_t a, size_t b) {
}

inline const char *default_prefill_kernel(const PagedAttentionPrefillInfo &info) {
+    // Iluvatar: use warp (stable). Users can override via INFINIOP_FLASH_PREFILL_KERNEL.
+#ifdef ENABLE_ILUVATAR_API
+    (void)info;
+    return "warp";
+#endif
    // Heuristic auto-dispatch (v0.4):
    // - Prefer the pipelined + tile-wise softmax kernel on FA2-compatible block_size=256.
    // - Keep a conservative fallback for other shapes / older GPUs (cp.async is a no-op below SM80).
...
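The new early return pins Iluvatar to the "warp" kernel while leaving the documented escape hatch in place. Only the variable name INFINIOP_FLASH_PREFILL_KERNEL appears in this diff; a sketch of how such an override typically wraps the default, with illustrative code that is not the repository's actual lookup:

#include <cstdlib>

const char *selectPrefillKernel(const char *heuristic_default) {
    // An environment override wins; otherwise fall back to the heuristic
    // (which on Iluvatar is now hard-wired to "warp").
    if (const char *env = std::getenv("INFINIOP_FLASH_PREFILL_KERNEL")) {
        return env;
    }
    return heuristic_default;
}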
@@ -8,8 +8,8 @@
#define ARRAY_TYPE_SIZE size_t

// Coupled with DEFINE_KERNELS_BY_CONSTRAINT; the two must be modified together.
-#define MAX_BLOCK_ARRAY_SIZE 5
-#define MAX_GRID_ARRAY_SIZE 5
+#define MAX_BLOCK_ARRAY_SIZE 6
+#define MAX_GRID_ARRAY_SIZE 6

template <int ArrSize, typename ArrayType>
struct ArrayStruct {
@@ -185,32 +185,43 @@ struct Constraint {
    DEFINE_REARRANGE_KERNEL(double4, constraint_num, block_array_size, grid_array_size)

// Coupled with MAX_BLOCK_ARRAY_SIZE and MAX_GRID_ARRAY_SIZE; must be modified together.
-// Generate kernels for all combinations of 1-5 x 1-5
+// Generate kernels for all combinations of 1-6 x 1-6
DEFINE_KERNELS_BY_CONSTRAINT(1, 1)
DEFINE_KERNELS_BY_CONSTRAINT(1, 2)
DEFINE_KERNELS_BY_CONSTRAINT(1, 3)
DEFINE_KERNELS_BY_CONSTRAINT(1, 4)
DEFINE_KERNELS_BY_CONSTRAINT(1, 5)
+DEFINE_KERNELS_BY_CONSTRAINT(1, 6)
DEFINE_KERNELS_BY_CONSTRAINT(2, 1)
DEFINE_KERNELS_BY_CONSTRAINT(2, 2)
DEFINE_KERNELS_BY_CONSTRAINT(2, 3)
DEFINE_KERNELS_BY_CONSTRAINT(2, 4)
DEFINE_KERNELS_BY_CONSTRAINT(2, 5)
+DEFINE_KERNELS_BY_CONSTRAINT(2, 6)
DEFINE_KERNELS_BY_CONSTRAINT(3, 1)
DEFINE_KERNELS_BY_CONSTRAINT(3, 2)
DEFINE_KERNELS_BY_CONSTRAINT(3, 3)
DEFINE_KERNELS_BY_CONSTRAINT(3, 4)
DEFINE_KERNELS_BY_CONSTRAINT(3, 5)
+DEFINE_KERNELS_BY_CONSTRAINT(3, 6)
DEFINE_KERNELS_BY_CONSTRAINT(4, 1)
DEFINE_KERNELS_BY_CONSTRAINT(4, 2)
DEFINE_KERNELS_BY_CONSTRAINT(4, 3)
DEFINE_KERNELS_BY_CONSTRAINT(4, 4)
DEFINE_KERNELS_BY_CONSTRAINT(4, 5)
+DEFINE_KERNELS_BY_CONSTRAINT(4, 6)
DEFINE_KERNELS_BY_CONSTRAINT(5, 1)
DEFINE_KERNELS_BY_CONSTRAINT(5, 2)
DEFINE_KERNELS_BY_CONSTRAINT(5, 3)
DEFINE_KERNELS_BY_CONSTRAINT(5, 4)
DEFINE_KERNELS_BY_CONSTRAINT(5, 5)
+DEFINE_KERNELS_BY_CONSTRAINT(5, 6)
+DEFINE_KERNELS_BY_CONSTRAINT(6, 1)
+DEFINE_KERNELS_BY_CONSTRAINT(6, 2)
+DEFINE_KERNELS_BY_CONSTRAINT(6, 3)
+DEFINE_KERNELS_BY_CONSTRAINT(6, 4)
+DEFINE_KERNELS_BY_CONSTRAINT(6, 5)
+DEFINE_KERNELS_BY_CONSTRAINT(6, 6)

// Prepare the parameter struct
struct RearrangeParams {
@@ -294,6 +305,9 @@ utils::Result<void *> getRearrangeKernel(const RearrangeParams &params) {
    case 5:                                                          \
        GET_REARRANGE_KERNEL_BY_CONSTRAINT(block_array_size, 5);     \
        break;                                                       \
+   case 6:                                                          \
+       GET_REARRANGE_KERNEL_BY_CONSTRAINT(block_array_size, 6);     \
+       break;                                                       \
    }

#define GET_REARRANGE_KERNEL_BY_BLOCK_NUM \
@@ -313,6 +327,9 @@ utils::Result<void *> getRearrangeKernel(const RearrangeParams &params) {
    case 5:                                       \
        GET_REARRANGE_KERNEL_BY_GRID_NUM(5);      \
        break;                                    \
+   case 6:                                       \
+       GET_REARRANGE_KERNEL_BY_GRID_NUM(6);      \
+       break;                                    \
    }

GET_REARRANGE_KERNEL_BY_BLOCK_NUM
...
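The three edits in this file move in lockstep, exactly as the comments warn: the size caps, the per-combination instantiations, and the dispatch switches must all cover the same range, or a rank-6 shape silently falls through to the unsupported path. Reduced to one axis, the coupling looks like this (names illustrative, not the repository's):

constexpr int MAX_ARRAY_SIZE = 6; // was 5 before this PR

template <int N>
__global__ void rearrangeStub() {} // stands in for DEFINE_KERNELS_BY_CONSTRAINT

void *lookupKernel(int n) {
    switch (n) { // every case up to MAX_ARRAY_SIZE needs an instantiation
    case 1: return (void *)rearrangeStub<1>;
    case 2: return (void *)rearrangeStub<2>;
    case 3: return (void *)rearrangeStub<3>;
    case 4: return (void *)rearrangeStub<4>;
    case 5: return (void *)rearrangeStub<5>;
    case 6: return (void *)rearrangeStub<6>; // the case this PR adds
    default: return nullptr; // shape not supported
    }
}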
@@ -64,6 +64,7 @@ infiniStatus_t Descriptor::create(
    return INFINI_STATUS_SUCCESS;
}

+#ifdef ENABLE_QY_API
template <unsigned int BLOCK_SIZE, typename Tdata>
infiniStatus_t Descriptor::launchKernel(const I8GemmInfo &info, Tdata *y, const Tdata *bias, const int8_t *x_packed, const float *x_scale, const int8_t *w_packed, const float *w_scale, void *stream_, void *workspace) const {
    cudaStream_t stream = (cudaStream_t)stream_;
@@ -112,6 +113,7 @@ infiniStatus_t Descriptor::launchKernel(const I8GemmInfo &info, Tdata *y, const
    return INFINI_STATUS_SUCCESS;
}
+#endif

infiniStatus_t Descriptor::calculate(
    void *workspace,
...
@@ -5,7 +5,7 @@
#ifdef ENABLE_CPU_API
#include "cpu/sigmoid_cpu.h"
#endif
-#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) || defined(ENABLE_ALI_API)
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) || defined(ENABLE_ALI_API) || defined(ENABLE_ILUVATAR_API)
#include "nvidia/sigmoid_nvidia.cuh"
#endif
@@ -37,6 +37,9 @@ __C infiniStatus_t infiniopCreateSigmoidDescriptor(
#ifdef ENABLE_ALI_API
    CREATE(INFINI_DEVICE_ALI, nvidia);
#endif
+#ifdef ENABLE_ILUVATAR_API
+    CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -65,7 +68,9 @@ __C infiniStatus_t infiniopGetSigmoidWorkspaceSize(infiniopSigmoidDescriptor_t d
#ifdef ENABLE_ALI_API
    GET(INFINI_DEVICE_ALI, nvidia);
#endif
+#ifdef ENABLE_ILUVATAR_API
+    GET(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }
@@ -101,7 +106,9 @@ __C infiniStatus_t infiniopSigmoid(
#ifdef ENABLE_ALI_API
    CALCULATE(INFINI_DEVICE_ALI, nvidia);
#endif
+#ifdef ENABLE_ILUVATAR_API
+    CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }
@@ -131,7 +138,9 @@ infiniopDestroySigmoidDescriptor(infiniopSigmoidDescriptor_t desc) {
#ifdef ENABLE_ALI_API
    DELETE(INFINI_DEVICE_ALI, nvidia);
#endif
+#ifdef ENABLE_ILUVATAR_API
+    DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }
...
@@ -128,6 +128,9 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
    } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) {
        CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_4096>(
            y, x, _info.dtype, _info.othersize, _info.dimsize, _info.stride, stream));
+   } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_2048) {
+       CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_2048>(
+           y, x, _info.dtype, _info.othersize, _info.dimsize, _info.stride, stream));
    } else {
        return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
    }
...
@@ -5,7 +5,7 @@
#ifdef ENABLE_CPU_API
#include "cpu/topksoftmax_cpu.h"
#endif
-#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) || defined(ENABLE_ALI_API)
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) || defined(ENABLE_ALI_API) || defined(ENABLE_ILUVATAR_API)
#include "nvidia/topksoftmax_nvidia.cuh"
#endif
#ifdef ENABLE_METAX_API
@@ -36,6 +36,9 @@ __C infiniStatus_t infiniopCreateTopksoftmaxDescriptor(infiniopHandle_t handle,
#endif
#ifdef ENABLE_ALI_API
    CREATE(INFINI_DEVICE_ALI, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+    CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
}
@@ -66,6 +69,9 @@ __C infiniStatus_t infiniopGetTopksoftmaxWorkspaceSize(infiniopTopksoftmaxDescri
#endif
#ifdef ENABLE_ALI_API
    GET(INFINI_DEVICE_ALI, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+    GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
}
@@ -101,6 +107,9 @@ __C infiniStatus_t infiniopTopksoftmax(infiniopTopksoftmaxDescriptor_t desc, voi
#endif
#ifdef ENABLE_ALI_API
    CALCULATE(INFINI_DEVICE_ALI, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+    CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
}
@@ -131,6 +140,9 @@ __C infiniStatus_t infiniopDestroyTopksoftmaxDescriptor(infiniopTopksoftmaxDescr
#endif
#ifdef ENABLE_ALI_API
    DESTROY(INFINI_DEVICE_ALI, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+    DESTROY(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
}
...
@@ -3,8 +3,8 @@ import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

-import torch
import infinicore
+import torch

from framework import BaseOperatorTest, TensorSpec, TestCase, GenericTestRunner

# Test cases format: (in_shape, in_strides_or_None)
...
@@ -3,8 +3,8 @@ import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

-import torch
import infinicore
+import torch

from framework import (
    BaseOperatorTest,
    TensorSpec,
...
@@ -3,8 +3,8 @@ import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

-import torch
import infinicore
+import torch

from framework import (
    BaseOperatorTest,
    TensorSpec,
...
@@ -3,8 +3,8 @@ import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

-import torch
import infinicore
+import torch

from framework import (
    BaseOperatorTest,
    TensorSpec,
...
@@ -3,8 +3,8 @@ import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

-import torch
import infinicore
+import torch

from framework import (
    BaseOperatorTest,
    TensorSpec,
...
@@ -3,8 +3,8 @@ import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

-import torch
import infinicore
+import torch

from framework import (
    BaseOperatorTest,
    TensorSpec,
...