ISSUE/628 适配QY C610 GPU，增加编译选项，适配已有算子。添加bge类模型所需的算子， (#629)

* ISSUE/628 适配QY C610 GPU，增加编译选项，适配已有算子。添加bge类模型所需的算子，包括gelu,layer_norm，lp_norm(支持l1，l2 norm)，relu，softmax，tanh。 --------- Co-authored-by: xgqdut2016 <kenan_gewei@163.com> Co-authored-by: xgqdut2016 <140036308+xgqdut2016@users.noreply.github.com>

ISSUE/628 适配QY C610 GPU，增加编译选项，适配已有算子。添加bge类模型所需的算子， (#629)
* ISSUE/628 适配QY C610 GPU，增加编译选项，适配已有算子。添加bge类模型所需的算子，包括gelu,layer_norm，lp_norm(支持l1，l2 norm)，relu，softmax，tanh。 --------- Co-authored-by: xgqdut2016 <kenan_gewei@163.com> Co-authored-by: xgqdut2016 <140036308+xgqdut2016@users.noreply.github.com>
85bc98ac · qinyiqun · GitHub · 7c397dd2 · 85bc98ac · 85bc98ac
Unverified Commit 85bc98ac authored Nov 21, 2025 by qinyiqun Committed by GitHub Nov 21, 2025
20 changed files
--- a/src/infiniop/ops/tanh/cuda/kernel.cuh
+++ b/src/infiniop/ops/tanh/cuda/kernel.cuh
+#ifndef __TANH_CUDA_H__
+#define __TANH_CUDA_H__
+
+#include <cmath>
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+
+namespace op::tanh::cuda {
+// Element-wise hyperbolic tangent functor.
+//
+// The 16-bit types (half, half2, cuda_bfloat16, cuda_bfloat162) are widened to
+// float, evaluated through tanhf, and narrowed back with the `_rn` conversion
+// intrinsics. float calls tanhf directly; every other type, double included,
+// is passed to std::tanh.
+typedef struct TanhOp {
+    // Number of operands the functor takes (presumably read by the shared
+    // elementwise launcher -- confirm in the elementwise headers).
+    static constexpr size_t num_inputs = 1;
+
+    // Single-precision tanh used by the float and all reduced-precision paths.
+    __device__ __forceinline__ float tanh_f32_func(float x) const {
+        return tanhf(x);
+    }
+
+    // Applies tanh to one element (or one packed pair for the *2 types).
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &input) const {
+        if constexpr (std::is_same_v<T, float>) {
+            return tanh_f32_func(input);
+        } else if constexpr (std::is_same_v<T, half>) {
+            return __float2half_rn(tanh_f32_func(__half2float(input)));
+        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+            return __float2bfloat16_rn(tanh_f32_func(__bfloat162float(input)));
+        } else if constexpr (std::is_same_v<T, half2>) {
+            // Unpack both lanes, process each in float, then repack.
+            const float2 lanes = __half22float2(input);
+            return __float22half2_rn(make_float2(tanh_f32_func(lanes.x), tanh_f32_func(lanes.y)));
+        } else if constexpr (std::is_same_v<T, cuda_bfloat162>) {
+            const float lo = tanh_f32_func(__bfloat162float(__low2bfloat16(input)));
+            const float hi = tanh_f32_func(__bfloat162float(__high2bfloat16(input)));
+            return __floats2bfloat162_rn(lo, hi);
+        } else {
+            // double and any remaining type.
+            return std::tanh(input);
+        }
+    }
+} TanhOp;
+} // namespace op::tanh::cuda
+
+#endif // __TANH_CUDA_H__
--- a/src/infiniop/ops/tanh/metax/tanh_metax.h
+++ b/src/infiniop/ops/tanh/metax/tanh_metax.h
+#ifndef __TANH_METAX_API_H__
+#define __TANH_METAX_API_H__
+
+#include "../../../elementwise/metax/elementwise_metax_api.h"
+
+ELEMENTWISE_DESCRIPTOR(tanh, metax)
+
+#endif // __TANH_METAX_API_H__
--- a/src/infiniop/ops/tanh/metax/tanh_metax.maca
+++ b/src/infiniop/ops/tanh/metax/tanh_metax.maca
+#include "tanh_metax.h"
+
+#include "../../../elementwise/metax/elementwise_metax.h"
+
+#include "../cuda/kernel.cuh"
+
+namespace op::tanh::metax {
+
+Descriptor::~Descriptor() = default;
+
+// Builds a MetaX tanh descriptor for one input tensor and one output tensor.
+//
+// handle_        : opaque handle, reinterpreted here as device::metax::Handle.
+// desc_ptr       : out-parameter for the new descriptor (presumably written by
+//                  CREATE_ELEMENTWISE_METAX_DESCRIPTOR -- confirm in its definition).
+// out_desc       : output tensor descriptor; its dtype is validated and
+//                  forwarded to the descriptor macro.
+// input_desc_vec : input descriptors; only element 0 is inspected here.
+//
+// Returns INFINI_STATUS_SUCCESS when execution reaches the end. The CHECK_* and
+// CREATE_* macros are assumed to return an error status early on failure --
+// confirm in their definitions.
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    // at() throws std::out_of_range if no input descriptor was supplied.
+    const auto &input_desc = input_desc_vec.at(0);
+    const auto &output_shape = out_desc->shape();
+    const auto &input_shape = input_desc->shape();
+
+    // Only the output dtype is checked; supported: F16, F32, F64, BF16.
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
+
+    CHECK_SAME_SHAPE(output_shape, input_shape);
+
+    // create MetaX elementwise descriptor
+    CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+// Launches the tanh kernel for the dtype stored in this descriptor.
+//
+// Returns INFINI_STATUS_INSUFFICIENT_WORKSPACE when workspace_size is smaller
+// than the descriptor's _workspace_size, INFINI_STATUS_BAD_TENSOR_DTYPE for a
+// dtype other than F16/BF16/F32/F64, and otherwise whatever the elementwise
+// launcher returns.
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (_workspace_size > workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    // 256 is the launcher's first template argument (presumably the thread-block
+    // size -- confirm in the elementwise header).
+    if (_dtype == INFINI_DTYPE_F16) {
+        return _device_info->calculate<256, cuda::TanhOp, half>(_info, workspace, output, inputs, stream);
+    }
+    if (_dtype == INFINI_DTYPE_BF16) {
+        return _device_info->calculate<256, cuda::TanhOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
+    }
+    if (_dtype == INFINI_DTYPE_F32) {
+        return _device_info->calculate<256, cuda::TanhOp, float>(_info, workspace, output, inputs, stream);
+    }
+    if (_dtype == INFINI_DTYPE_F64) {
+        return _device_info->calculate<256, cuda::TanhOp, double>(_info, workspace, output, inputs, stream);
+    }
+
+    return INFINI_STATUS_BAD_TENSOR_DTYPE;
+}
+} // namespace op::tanh::metax
--- a/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cu
+++ b/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cu
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+
+#include "../cuda/kernel.cuh"
+#include "tanh_nvidia.cuh"
+
+namespace op::tanh::nvidia {
+
+Descriptor::~Descriptor() = default;
+
+// Builds a tanh descriptor for the CUDA-style backend in op::tanh::nvidia, for
+// one input tensor and one output tensor.
+//
+// handle_        : opaque handle, reinterpreted here as device::nvidia::Handle.
+// desc_ptr       : out-parameter for the new descriptor (presumably written by
+//                  CREATE_ELEMENTWISE_CUDA_DESCRIPTOR -- confirm in its definition).
+// out_desc       : output tensor descriptor; its dtype is validated and
+//                  forwarded to the descriptor macro.
+// input_desc_vec : input descriptors; only element 0 is inspected here.
+//
+// Returns INFINI_STATUS_SUCCESS when execution reaches the end. The CHECK_* and
+// CREATE_* macros are assumed to return an error status early on failure --
+// confirm in their definitions.
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    // at() throws std::out_of_range if no input descriptor was supplied.
+    const auto &input_desc = input_desc_vec.at(0);
+    const auto &output_shape = out_desc->shape();
+    const auto &input_shape = input_desc->shape();
+
+    // Only the output dtype is checked; supported: F16, F32, F64, BF16.
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
+
+    CHECK_SAME_SHAPE(output_shape, input_shape);
+
+    // create CUDA elementwise descriptor
+    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+// Launches the tanh kernel for the dtype stored in this descriptor.
+//
+// Returns INFINI_STATUS_INSUFFICIENT_WORKSPACE when workspace_size is smaller
+// than the descriptor's _workspace_size, INFINI_STATUS_BAD_TENSOR_DTYPE for a
+// dtype other than F16/BF16/F32/F64, and otherwise whatever the elementwise
+// launcher returns.
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (_workspace_size > workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    // 256 is the launcher's first template argument (presumably the thread-block
+    // size -- confirm in the elementwise header).
+    if (_dtype == INFINI_DTYPE_F16) {
+        return _device_info->calculate<256, cuda::TanhOp, half>(_info, workspace, output, inputs, stream);
+    }
+    if (_dtype == INFINI_DTYPE_BF16) {
+        return _device_info->calculate<256, cuda::TanhOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
+    }
+    if (_dtype == INFINI_DTYPE_F32) {
+        return _device_info->calculate<256, cuda::TanhOp, float>(_info, workspace, output, inputs, stream);
+    }
+    if (_dtype == INFINI_DTYPE_F64) {
+        return _device_info->calculate<256, cuda::TanhOp, double>(_info, workspace, output, inputs, stream);
+    }
+
+    return INFINI_STATUS_BAD_TENSOR_DTYPE;
+}
+} // namespace op::tanh::nvidia
--- a/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cuh
+++ b/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cuh
+#ifndef __TANH_CUDA_API_H__
+#define __TANH_CUDA_API_H__
+
+#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
+
+ELEMENTWISE_DESCRIPTOR(tanh, nvidia)
+
+#endif // __TANH_CUDA_API_H__
--- a/src/infiniop/ops/tanh/operator.cc
+++ b/src/infiniop/ops/tanh/operator.cc
+#include "../../operator.h"
+#include "../../handle.h"
+#include "infiniop/ops/tanh.h"
+
+#ifdef ENABLE_CPU_API
+#include "cpu/tanh_cpu.h"
+#endif
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
+#include "nvidia/tanh_nvidia.cuh"
+#endif
+#ifdef ENABLE_METAX_API
+#include "metax/tanh_metax.h"
+#endif
+
+// Creates a tanh descriptor on the backend selected by handle->device.
+//
+// ILUVATAR and QY devices are served by the op::tanh::nvidia implementation.
+// Returns INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED when the device has no case
+// compiled in; otherwise returns the backend's Descriptor::create result.
+__C infiniStatus_t infiniopCreateTanhDescriptor(
+    infiniopHandle_t handle,
+    infiniopTanhDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t output_desc,
+    infiniopTensorDescriptor_t input_desc) {
+
+// One `case` that forwards to op::tanh::<BACKEND>::Descriptor::create.
+#define CREATE(DEVICE, BACKEND)                                                                      \
+    case DEVICE: {                                                                                   \
+        auto **typed_desc = reinterpret_cast<op::tanh::BACKEND::Descriptor **>(desc_ptr);            \
+        return op::tanh::BACKEND::Descriptor::create(handle, typed_desc, output_desc, {input_desc}); \
+    }
+
+    switch (handle->device) {
+
+#ifdef ENABLE_CPU_API
+        CREATE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        CREATE(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        CREATE(INFINI_DEVICE_METAX, metax);
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef CREATE
+}
+
+// Writes the workspace size required by `desc` into `*size`.
+//
+// Dispatches on desc->device_type; ILUVATAR and QY devices use the
+// op::tanh::nvidia descriptor type. Returns INFINI_STATUS_SUCCESS after writing
+// `*size`, or INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED when the device has no
+// case compiled in. `size` is dereferenced without a null check.
+__C infiniStatus_t infiniopGetTanhWorkspaceSize(infiniopTanhDescriptor_t desc, size_t *size) {
+
+// The macro omits its final ';' so each use below is a single statement, as in
+// the other dispatch functions of this file.
+#define GET(CASE, NAMESPACE)                                                                \
+    case CASE:                                                                              \
+        *size = reinterpret_cast<op::tanh::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
+        return INFINI_STATUS_SUCCESS
+
+    switch (desc->device_type) {
+
+#ifdef ENABLE_CPU_API
+        GET(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        GET(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        GET(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        GET(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        GET(INFINI_DEVICE_METAX, metax);
+#endif
+
+    // Every path through the switch returns, so nothing follows it.
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef GET
+}
+
+// Runs tanh through the backend selected by desc->device_type.
+//
+// ILUVATAR and QY devices are served by the op::tanh::nvidia implementation.
+// Returns the backend's Descriptor::calculate result, or
+// INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED when the device has no case compiled in.
+__C infiniStatus_t infiniopTanh(
+    infiniopTanhDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    void *stream) {
+
+// One `case` that forwards to op::tanh::<BACKEND>::Descriptor::calculate with
+// the single input wrapped in a braced list.
+#define CALCULATE(DEVICE, BACKEND)                                                        \
+    case DEVICE: {                                                                        \
+        const auto *impl = reinterpret_cast<const op::tanh::BACKEND::Descriptor *>(desc); \
+        return impl->calculate(workspace, workspace_size, output, {input}, stream);       \
+    }
+
+    switch (desc->device_type) {
+
+#ifdef ENABLE_CPU_API
+        CALCULATE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        CALCULATE(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        CALCULATE(INFINI_DEVICE_METAX, metax);
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef CALCULATE
+}
+
+// Destroys a tanh descriptor with `delete`, using the backend type selected by
+// desc->device_type.
+//
+// ILUVATAR and QY devices use the op::tanh::nvidia descriptor type. Returns
+// INFINI_STATUS_SUCCESS after deleting, or
+// INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED when the device has no case compiled in.
+__C infiniStatus_t
+infiniopDestroyTanhDescriptor(infiniopTanhDescriptor_t desc) {
+
+// Named DESTROY rather than DELETE: winnt.h defines a DELETE macro, and
+// defining then #undef-ing that name here would clobber it on Windows builds.
+#define DESTROY(CASE, NAMESPACE)                                                \
+    case CASE:                                                                  \
+        delete reinterpret_cast<const op::tanh::NAMESPACE::Descriptor *>(desc); \
+        return INFINI_STATUS_SUCCESS
+
+    switch (desc->device_type) {
+
+#ifdef ENABLE_CPU_API
+        DESTROY(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        DESTROY(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        DESTROY(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        DESTROY(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        DESTROY(INFINI_DEVICE_METAX, metax);
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef DESTROY
+}
--- a/src/infiniop/ops/topkrouter/nvidia/topkrouter_nvidia.cu
+++ b/src/infiniop/ops/topkrouter/nvidia/topkrouter_nvidia.cu
-#ifdef ENABLE_NVIDIA_API
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API)

 #include "../../../devices/nvidia/nvidia_common.cuh"
 #include "../../../devices/nvidia/nvidia_kernel_common.cuh"

--- a/src/infiniop/ops/topkrouter/operator.cc
+++ b/src/infiniop/ops/topkrouter/operator.cc
@@ -5,7 +5,7 @@
 #ifdef ENABLE_CPU_API
 #include "cpu/topkrouter_cpu.h"
 #endif
-#if defined(ENABLE_NVIDIA_API)
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API)
 #include "nvidia/topkrouter_nvidia.cuh"
 #endif

@@ -23,6 +23,9 @@ __C infiniStatus_t infiniopCreateTopkrouterDescriptor(infiniopHandle_t handle, i
 #endif
 #ifdef ENABLE_NVIDIA_API
        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        CREATE(INFINI_DEVICE_QY, nvidia);
 #endif
    }

@@ -43,6 +46,9 @@ __C infiniStatus_t infiniopGetTopkrouterWorkspaceSize(infiniopTopkrouterDescript
 #endif
 #ifdef ENABLE_NVIDIA_API
        GET(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        GET(INFINI_DEVICE_QY, nvidia);
 #endif
    }

@@ -66,6 +72,9 @@ __C infiniStatus_t infiniopTopkrouter(infiniopTopkrouterDescriptor_t desc, void
 #endif
 #ifdef ENABLE_NVIDIA_API
        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        CALCULATE(INFINI_DEVICE_QY, nvidia);
 #endif
    }

@@ -86,6 +95,9 @@ __C infiniStatus_t infiniopDestroyTopkrouterDescriptor(infiniopTopkrouterDescrip
 #endif
 #ifdef ENABLE_NVIDIA_API
        DESTROY(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        DESTROY(INFINI_DEVICE_QY, nvidia);
 #endif
    }


--- a/src/infiniop/ops/topksoftmax/operator.cc
+++ b/src/infiniop/ops/topksoftmax/operator.cc
@@ -5,7 +5,7 @@
 #ifdef ENABLE_CPU_API
 #include "cpu/topksoftmax_cpu.h"
 #endif
-#if defined(ENABLE_NVIDIA_API)
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API)
 #include "nvidia/topksoftmax_nvidia.cuh"
 #endif
 #ifdef ENABLE_METAX_API
@@ -28,6 +28,9 @@ __C infiniStatus_t infiniopCreateTopksoftmaxDescriptor(infiniopHandle_t handle,
 #ifdef ENABLE_NVIDIA_API
        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
 #endif
+#ifdef ENABLE_QY_API
+        CREATE(INFINI_DEVICE_QY, nvidia);
+#endif
 #ifdef ENABLE_METAX_API
        CREATE(INFINI_DEVICE_METAX, metax);
 #endif
@@ -52,6 +55,9 @@ __C infiniStatus_t infiniopGetTopksoftmaxWorkspaceSize(infiniopTopksoftmaxDescri
 #ifdef ENABLE_NVIDIA_API
        GET(INFINI_DEVICE_NVIDIA, nvidia);
 #endif
+#ifdef ENABLE_QY_API
+        GET(INFINI_DEVICE_QY, nvidia);
+#endif
 #ifdef ENABLE_METAX_API
        GET(INFINI_DEVICE_METAX, metax);
 #endif
@@ -81,6 +87,9 @@ __C infiniStatus_t infiniopTopksoftmax(infiniopTopksoftmaxDescriptor_t desc, voi
 #ifdef ENABLE_NVIDIA_API
        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
 #endif
+#ifdef ENABLE_QY_API
+        CALCULATE(INFINI_DEVICE_QY, nvidia);
+#endif
 #ifdef ENABLE_METAX_API
        CALCULATE(INFINI_DEVICE_METAX, metax);
 #endif
@@ -105,6 +114,9 @@ __C infiniStatus_t infiniopDestroyTopksoftmaxDescriptor(infiniopTopksoftmaxDescr
 #ifdef ENABLE_NVIDIA_API
        DESTROY(INFINI_DEVICE_NVIDIA, nvidia);
 #endif
+#ifdef ENABLE_QY_API
+        DESTROY(INFINI_DEVICE_QY, nvidia);
+#endif
 #ifdef ENABLE_METAX_API
        DESTROY(INFINI_DEVICE_METAX, metax);
 #endif

--- a/src/infiniop/ops/zeros/operator.cc
+++ b/src/infiniop/ops/zeros/operator.cc
@@ -5,7 +5,7 @@
 #ifdef ENABLE_CPU_API
 #include "cpu/zeros_cpu.h"
 #endif
-#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
 #include "nvidia/zeros_nvidia.cuh"
 #endif
 #ifdef ENABLE_METAX_API
@@ -40,6 +40,9 @@ __C infiniStatus_t infiniopCreateZerosDescriptor(
 #ifdef ENABLE_ILUVATAR_API
        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
 #endif
+#ifdef ENABLE_QY_API
+        CREATE(INFINI_DEVICE_QY, nvidia);
+#endif
 #ifdef ENABLE_METAX_API
        CREATE(INFINI_DEVICE_METAX, metax);
 #endif
@@ -70,6 +73,9 @@ __C infiniStatus_t infiniopGetZerosWorkspaceSize(infiniopZerosDescriptor_t desc,
 #ifdef ENABLE_ILUVATAR_API
        GET(INFINI_DEVICE_ILUVATAR, nvidia);
 #endif
+#ifdef ENABLE_QY_API
+        GET(INFINI_DEVICE_QY, nvidia);
+#endif
 #ifdef ENABLE_METAX_API
        GET(INFINI_DEVICE_METAX, metax);
 #endif
@@ -108,6 +114,9 @@ __C infiniStatus_t infiniopZeros(
 #ifdef ENABLE_ILUVATAR_API
        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
 #endif
+#ifdef ENABLE_QY_API
+        CALCULATE(INFINI_DEVICE_QY, nvidia);
+#endif
 #ifdef ENABLE_METAX_API
        CALCULATE(INFINI_DEVICE_METAX, metax);
 #endif
@@ -140,6 +149,9 @@ infiniopDestroyZerosDescriptor(infiniopZerosDescriptor_t desc) {
 #ifdef ENABLE_ILUVATAR_API
        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
 #endif
+#ifdef ENABLE_QY_API
+        DELETE(INFINI_DEVICE_QY, nvidia);
+#endif
 #ifdef ENABLE_METAX_API
        DELETE(INFINI_DEVICE_METAX, metax);
 #endif

--- a/src/infiniop/reduce/cuda/reduce.cuh
+++ b/src/infiniop/reduce/cuda/reduce.cuh
 #ifndef __INFINIOP_REDUCE_CUDA_H__
 #define __INFINIOP_REDUCE_CUDA_H__
-
+#include <cub/block/block_reduce.cuh>
 /*
 * Device functions for reduction operations on CUDA.
 *

--- a/src/infinirt-test/main.cc
+++ b/src/infinirt-test/main.cc
@@ -20,6 +20,7 @@ void printUsage() {
              << "  metax" << std::endl
              << "  moore" << std::endl
              << "  iluvatar" << std::endl
+              << "  qy" << std::endl
              << "  kunlun" << std::endl
              << "  hygon" << std::endl
              << std::endl;
@@ -51,6 +52,7 @@ ParsedArgs parseArgs(int argc, char *argv[]) {
        else PARSE_DEVICE("--metax", INFINI_DEVICE_METAX)
        else PARSE_DEVICE("--moore", INFINI_DEVICE_MOORE)
        else PARSE_DEVICE("--iluvatar", INFINI_DEVICE_ILUVATAR)
+        else PARSE_DEVICE("--qy", INFINI_DEVICE_QY)
        else PARSE_DEVICE("--kunlun", INFINI_DEVICE_KUNLUN)
        else PARSE_DEVICE("--hygon", INFINI_DEVICE_HYGON)
        else {

--- a/src/infinirt/cuda/infinirt_cuda.cuh
+++ b/src/infinirt/cuda/infinirt_cuda.cuh
@@ -3,7 +3,7 @@
 #include "../infinirt_impl.h"

 namespace infinirt::cuda {
-#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_HYGON_API)
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API)
 INFINIRT_DEVICE_API_IMPL
 #else
 INFINIRT_DEVICE_API_NOOP

--- a/src/infinirt/infinirt.cc
+++ b/src/infinirt/infinirt.cc
@@ -23,7 +23,7 @@ __C infiniStatus_t infinirtGetAllDeviceCount(int *count_array) {
        return INFINI_STATUS_NULL_POINTER;
    }
    for (size_t i = 0; i < INFINI_DEVICE_TYPE_COUNT; i++) {
-        if (i == INFINI_DEVICE_ILUVATAR || i == INFINI_DEVICE_HYGON) {
+        if (i == INFINI_DEVICE_ILUVATAR || i == INFINI_DEVICE_QY || i == INFINI_DEVICE_KUNLUN || i == INFINI_DEVICE_HYGON) {
            count_array[i] = 0;
            continue;
        }
@@ -77,6 +77,9 @@ __C infiniStatus_t infinirtGetDevice(infiniDevice_t *device_ptr, int *device_id_
        case INFINI_DEVICE_ILUVATAR:                                   \
            _status = infinirt::cuda::API PARAMS;                      \
            break;                                                     \
+        case INFINI_DEVICE_QY:                                         \
+            _status = infinirt::cuda::API PARAMS;                      \
+            break;                                                     \
        case INFINI_DEVICE_HYGON:                                      \
            _status = infinirt::cuda::API PARAMS;                      \
            break;                                                     \

--- a/test/infinicore/debug.py
+++ b/test/infinicore/debug.py
@@ -33,14 +33,14 @@ from framework import (

 # Test cases - 定义不同的测试场景
 _TEST_CASES = [
-    TestCase("basic_print", (2, 3)),           # 基本打印
-    TestCase("binary_save", (3, 4)),           # 二进制保存
-    TestCase("multidimensional", (2, 2, 3)),   # 多维张量
+    TestCase("basic_print", (2, 3)),  # 基本打印
+    TestCase("binary_save", (3, 4)),  # 二进制保存
+    TestCase("multidimensional", (2, 2, 3)),  # 多维张量
 ]

 # 非连续内存布局测试用例 (is_contiguous=False)
 _NON_CONTIGUOUS_TEST_CASES = [
-    TestCase("non_contiguous", (3, 4)),        # 测试 transpose 等导致的非连续内存布局
+    TestCase("non_contiguous", (3, 4)),  # 测试 transpose 等导致的非连续内存布局
 ]

 # 大规模性能测试用例 - 一千万个数据
@@ -68,10 +68,11 @@ _TOLERANCE_MAP = {
 # Helper Functions
 # ==============================================================================

+
 def load_binary_with_torch(filename, dtype, shape):
    """使用 torch.frombuffer 读取二进制文件"""
    torch_dtype = to_torch_dtype(dtype)
-    with open(filename, 'rb') as f:
+    with open(filename, "rb") as f:
        data = f.read()
    return torch.frombuffer(data, dtype=torch_dtype).reshape(shape)

@@ -80,69 +81,80 @@ def load_binary_with_torch(filename, dtype, shape):
 # Test Methods
 # ==============================================================================

+
 def test_basic_print(device, test_case, dtype, config):
    """测试基本的 debug 打印功能"""
    test_name, shape = test_case.args
-    
-    print(f"Testing Basic Print on {InfiniDeviceNames[device]} with "
-          f"shape:{shape}, dtype:{dtype}")
-    
+
+    print(
+        f"Testing Basic Print on {InfiniDeviceNames[device]} with "
+        f"shape:{shape}, dtype:{dtype}"
+    )
+
    device_str = torch_device_map[device]
    torch_dtype = to_torch_dtype(dtype)
-    
+
    # 创建测试张量
-    torch_tensor = torch.arange(1, int(np.prod(shape)) + 1, 
-                                dtype=torch_dtype, device=device_str).reshape(shape)
-    
+    torch_tensor = torch.arange(
+        1, int(np.prod(shape)) + 1, dtype=torch_dtype, device=device_str
+    ).reshape(shape)
+
    infini_tensor = create_infinicore_tensor(torch_tensor, device_str)
-    
+
    # 测试 debug 打印（不保存文件）
    infini_tensor.debug()
-    
+
    print(f"✓ Basic print test passed")


 def test_binary_save(device, test_case, dtype, config):
    """测试二进制格式保存"""
    test_name, shape = test_case.args
-    
-    print(f"Testing Binary Save on {InfiniDeviceNames[device]} with "
-          f"shape:{shape}, dtype:{dtype}")
-    
+
+    print(
+        f"Testing Binary Save on {InfiniDeviceNames[device]} with "
+        f"shape:{shape}, dtype:{dtype}"
+    )
+
    device_str = torch_device_map[device]
    torch_dtype = to_torch_dtype(dtype)
-    
+
    # 创建测试张量
-    torch_tensor = torch.arange(1, int(np.prod(shape)) + 1, 
-                                dtype=torch_dtype, device=device_str).reshape(shape)
-    
+    torch_tensor = torch.arange(
+        1, int(np.prod(shape)) + 1, dtype=torch_dtype, device=device_str
+    ).reshape(shape)
+
    infini_tensor = create_infinicore_tensor(torch_tensor, device_str)
-    
+
    # 保存为二进制文件
    bin_file = f"/tmp/debug_test_{device}_{dtype}_binary.bin"
    infini_tensor.debug(bin_file)
-    
+
    # 验证文件存在
    assert os.path.exists(bin_file), f"Binary file not created: {bin_file}"
-    
+
    # 验证文件大小
    expected_size = int(np.prod(shape)) * torch_tensor.element_size()
    actual_size = os.path.getsize(bin_file)
-    assert actual_size == expected_size, \
-        f"Binary file size mismatch: {actual_size} vs {expected_size}"
-    
+    assert (
+        actual_size == expected_size
+    ), f"Binary file size mismatch: {actual_size} vs {expected_size}"
+
    # 使用 torch.frombuffer 读取并验证
    loaded_tensor = load_binary_with_torch(bin_file, dtype, shape)
-    
+
    # 将两个张量都移到 CPU 进行比较
    torch_tensor_cpu = torch_tensor.cpu()
    loaded_tensor_cpu = loaded_tensor.cpu()
-    
+
    tolerance = _TOLERANCE_MAP.get(dtype, {"atol": 0, "rtol": 1e-5})
-    assert torch.allclose(loaded_tensor_cpu, torch_tensor_cpu, 
-                         atol=tolerance["atol"], rtol=tolerance["rtol"]), \
-        f"Binary data mismatch"
-    
+    assert torch.allclose(
+        loaded_tensor_cpu,
+        torch_tensor_cpu,
+        atol=tolerance["atol"],
+        rtol=tolerance["rtol"],
+    ), f"Binary data mismatch"
+
    # 清理
    os.remove(bin_file)
    print(f"✓ Binary save test passed")
@@ -151,38 +163,44 @@ def test_binary_save(device, test_case, dtype, config):
 def test_multidimensional(device, test_case, dtype, config):
    """测试多维张量"""
    test_name, shape = test_case.args
-    
-    print(f"Testing Multidimensional on {InfiniDeviceNames[device]} with "
-          f"shape:{shape}, dtype:{dtype}")
-    
+
+    print(
+        f"Testing Multidimensional on {InfiniDeviceNames[device]} with "
+        f"shape:{shape}, dtype:{dtype}"
+    )
+
    device_str = torch_device_map[device]
    torch_dtype = to_torch_dtype(dtype)
-    
+
    # 创建多维张量
-    torch_tensor = torch.arange(1, int(np.prod(shape)) + 1, 
-                                dtype=torch_dtype, device=device_str).reshape(shape)
-    
+    torch_tensor = torch.arange(
+        1, int(np.prod(shape)) + 1, dtype=torch_dtype, device=device_str
+    ).reshape(shape)
+
    infini_tensor = create_infinicore_tensor(torch_tensor, device_str)
-    
+
    # 测试打印
    infini_tensor.debug()
-    
+
    # 测试保存和读取
    bin_file = f"/tmp/debug_test_multidim_{device}_{dtype}.bin"
    infini_tensor.debug(bin_file)
-    
+
    assert os.path.exists(bin_file), "Multidimensional binary file not created"
-    
+
    # 验证
    loaded_tensor = load_binary_with_torch(bin_file, dtype, shape)
    torch_tensor_cpu = torch_tensor.cpu()
    loaded_tensor_cpu = loaded_tensor.cpu()
-    
+
    tolerance = _TOLERANCE_MAP.get(dtype, {"atol": 0, "rtol": 1e-5})
-    assert torch.allclose(loaded_tensor_cpu, torch_tensor_cpu,
-                         atol=tolerance["atol"], rtol=tolerance["rtol"]), \
-        f"Multidimensional data mismatch"
-    
+    assert torch.allclose(
+        loaded_tensor_cpu,
+        torch_tensor_cpu,
+        atol=tolerance["atol"],
+        rtol=tolerance["rtol"],
+    ), f"Multidimensional data mismatch"
+
    # 清理
    os.remove(bin_file)
    print(f"✓ Multidimensional test passed")
@@ -191,24 +209,25 @@ def test_multidimensional(device, test_case, dtype, config):
 def test_non_contiguous_stride(device, test_case, dtype, config):
    """测试非连续内存布局的情况（is_contiguous=False，例如 transpose 后的张量）"""
    test_name, shape = test_case.args
-    
+
    print(f"\n{'='*70}")
    print(f"Testing Non-Contiguous Memory Layout on {InfiniDeviceNames[device]}")
    print(f"  Shape: {shape}, Dtype: {dtype}")
    print(f"{'='*70}")
-    
+
    device_str = torch_device_map[device]
    torch_dtype = to_torch_dtype(dtype)
-    
+
    # 创建连续张量
    print(f"\nStep 1: Creating contiguous tensor...")
-    torch_tensor_orig = torch.arange(1, int(np.prod(shape)) + 1, 
-                                     dtype=torch_dtype, device=device_str).reshape(shape)
+    torch_tensor_orig = torch.arange(
+        1, int(np.prod(shape)) + 1, dtype=torch_dtype, device=device_str
+    ).reshape(shape)
    print(f"  Original shape: {torch_tensor_orig.shape}")
    print(f"  Original stride: {torch_tensor_orig.stride()}")
    print(f"  Is contiguous: {torch_tensor_orig.is_contiguous()}")
    print(f"  Data:\n{torch_tensor_orig}")
-    
+
    # 进行 transpose 操作，创建非连续张量
    print(f"\nStep 2: Transposing to create non-contiguous tensor...")
    torch_tensor_t = torch_tensor_orig.t()  # transpose
@@ -216,78 +235,87 @@ def test_non_contiguous_stride(device, test_case, dtype, config):
    print(f"  Transposed stride: {torch_tensor_t.stride()}")
    print(f"  Is contiguous: {torch_tensor_t.is_contiguous()}")
    print(f"  Data:\n{torch_tensor_t}")
-    
+
    # 创建 InfiniCore 张量（非连续）
    # 注意：from_blob 不支持 strides，所以我们使用 permute 创建非连续张量
    # permute([1, 0]) 相当于 transpose，会创建非连续的内存布局
    infini_tensor_orig = create_infinicore_tensor(torch_tensor_orig, device_str)
    infini_tensor_t = infini_tensor_orig.as_strided(
-        list(torch_tensor_t.shape),
-        list(torch_tensor_t.stride())
+        list(torch_tensor_t.shape), list(torch_tensor_t.stride())
    )

    print(f"\nStep 3: InfiniCore tensor after permute:")
    print(f"  Shape: {infini_tensor_t.shape}")
    print(f"  Stride: {infini_tensor_t.stride()}")
    print(f"  Is contiguous: {infini_tensor_t.is_contiguous()}")
-    
+
    # ===== 测试二进制格式 =====
    print(f"\n{'='*70}")
    print(f"Testing Binary Format (.bin) with Non-Contiguous Memory Layout")
    print(f"{'='*70}")
    print(f"Note: Binary format now SUPPORTS non-contiguous memory layout!")
    print(f"      It automatically detects and handles stride correctly.")
-    
+
    bin_file = f"/tmp/debug_non_contiguous_{device}_{dtype}.bin"
    infini_tensor_t.debug(bin_file)
-    
+
    # 验证二进制文件
    assert os.path.exists(bin_file), f"Binary file not created: {bin_file}"
-    
+
    # 检查文件大小
    actual_size = os.path.getsize(bin_file)
    expected_size = int(np.prod(torch_tensor_t.shape)) * torch_tensor_t.element_size()
-    
+
    print(f"\nFile size check:")
-    print(f"  Expected: {expected_size} bytes ({int(np.prod(torch_tensor_t.shape))} elements)")
+    print(
+        f"  Expected: {expected_size} bytes ({int(np.prod(torch_tensor_t.shape))} elements)"
+    )
    print(f"  Actual: {actual_size} bytes")
-    
-    assert actual_size == expected_size, \
-        f"File size mismatch: {actual_size} vs {expected_size}"
+
+    assert (
+        actual_size == expected_size
+    ), f"File size mismatch: {actual_size} vs {expected_size}"
    print(f"  ✓ File size is correct")
-    
+
    # 读取并验证数据
    loaded_tensor = load_binary_with_torch(bin_file, dtype, torch_tensor_t.shape)
    torch_tensor_cpu = torch_tensor_t.cpu()
    loaded_tensor_cpu = loaded_tensor.cpu()
-    
+
    tolerance = _TOLERANCE_MAP.get(dtype, {"atol": 0, "rtol": 1e-5})
-    
+
    print(f"\nData verification:")
    print(f"  Expected (first 2 rows):\n{torch_tensor_cpu[:2]}")
    print(f"  Got (first 2 rows):\n{loaded_tensor_cpu[:2]}")
-    
-    assert torch.allclose(loaded_tensor_cpu, torch_tensor_cpu,
-                         atol=tolerance["atol"], rtol=tolerance["rtol"]), \
-        f"Data verification failed: loaded data doesn't match expected"
-    
+
+    assert torch.allclose(
+        loaded_tensor_cpu,
+        torch_tensor_cpu,
+        atol=tolerance["atol"],
+        rtol=tolerance["rtol"],
+    ), f"Data verification failed: loaded data doesn't match expected"
+
    print(f"\n✓ Binary format: Data matches perfectly!")
-    print(f"  Binary format correctly handles non-contiguous memory layout using stride")
-    
+    print(
+        f"  Binary format correctly handles non-contiguous memory layout using stride"
+    )
+
    # 清理
    os.remove(bin_file)
-    
+
    print(f"\n{'='*70}")
    print(f"Non-Contiguous Memory Layout Test Summary:")
    print(f"  ✅ Binary format (.bin): NOW supports non-contiguous memory!")
-    print(f"  Performance: Contiguous tensors use fast path, non-contiguous use stride-based writing")
+    print(
+        f"  Performance: Contiguous tensors use fast path, non-contiguous use stride-based writing"
+    )
    print(f"{'='*70}\n")


 def test_large_scale_binary_performance(device, test_case, dtype, config):
    """测试大规模数据二进制保存性能（一千万个数据）"""
    test_name, shape = test_case.args
-    
+
    num_elements = int(np.prod(shape))
    element_size_bytes = {
        infinicore.float32: 4,
@@ -296,9 +324,9 @@ def test_large_scale_binary_performance(device, test_case, dtype, config):
        infinicore.int32: 4,
        infinicore.int64: 8,
    }
-    
+
    total_size_mb = (num_elements * element_size_bytes.get(dtype, 4)) / (1024 * 1024)
-    
+
    print(f"\n{'='*70}")
    print(f"Performance Test: Large Scale Binary Save")
    print(f"  Device: {InfiniDeviceNames[device]}")
@@ -307,22 +335,22 @@ def test_large_scale_binary_performance(device, test_case, dtype, config):
    print(f"  Dtype: {dtype}")
    print(f"  Expected file size: {total_size_mb:.2f} MB")
    print(f"{'='*70}")
-    
+
    device_str = torch_device_map[device]
    torch_dtype = to_torch_dtype(dtype)
-    
+
    # 创建大规模张量
    print(f"Creating tensor with {num_elements:,} elements...")
    create_start = time.time()
    torch_tensor = torch.randn(shape, dtype=torch_dtype, device=device_str)
    create_time = time.time() - create_start
    print(f"  Tensor creation time: {create_time:.4f} seconds")
-    
+
    infini_tensor = create_infinicore_tensor(torch_tensor, device_str)
-    
+
    # 测试保存性能
    bin_file = f"/tmp/debug_large_scale_{device}_{dtype}.bin"
-    
+
    print(f"\n{'='*70}")
    print(f"[1/2] Writing Binary File")
    print(f"{'='*70}")
@@ -330,24 +358,24 @@ def test_large_scale_binary_performance(device, test_case, dtype, config):
    save_start = time.time()
    infini_tensor.debug(bin_file)
    save_time = time.time() - save_start
-    
+
    # 验证文件存在
    assert os.path.exists(bin_file), f"Binary file not created: {bin_file}"
-    
+
    # 获取实际文件大小
    actual_size = os.path.getsize(bin_file)
    actual_size_mb = actual_size / (1024 * 1024)
-    
+
    # 计算写入吞吐量
    write_throughput_mbps = actual_size_mb / save_time if save_time > 0 else 0
-    
+
    # 打印写入性能结果
    print(f"\n✓ Write Performance:")
    print(f"  File size: {actual_size_mb:.2f} MB ({actual_size:,} bytes)")
    print(f"  Write time: {save_time:.4f} seconds")
    print(f"  Write throughput: {write_throughput_mbps:.2f} MB/s")
    print(f"  Elements written/sec: {num_elements/save_time:,.0f}")
-    
+
    # 测试读取性能
    print(f"\n{'='*70}")
    print(f"[2/2] Reading Binary File (for verification)")
@@ -356,25 +384,27 @@ def test_large_scale_binary_performance(device, test_case, dtype, config):
    loaded_tensor = load_binary_with_torch(bin_file, dtype, shape)
    read_time = time.time() - read_start
    read_throughput_mbps = actual_size_mb / read_time if read_time > 0 else 0
-    
+
    print(f"\n✓ Read Performance:")
    print(f"  Read time: {read_time:.4f} seconds")
    print(f"  Read throughput: {read_throughput_mbps:.2f} MB/s")
    print(f"  Elements read/sec: {num_elements/read_time:,.0f}")
-    
+
    # 简单验证前几个元素（不做完整验证以节省时间）
    torch_tensor_cpu = torch_tensor.cpu()
    loaded_tensor_cpu = loaded_tensor.cpu()
-    
+
    sample_size = min(1000, num_elements)
    tolerance = _TOLERANCE_MAP.get(dtype, {"atol": 0, "rtol": 1e-5})
-    assert torch.allclose(loaded_tensor_cpu.flatten()[:sample_size], 
-                         torch_tensor_cpu.flatten()[:sample_size],
-                         atol=tolerance["atol"], rtol=tolerance["rtol"]), \
-        f"Data verification failed (sampled first {sample_size} elements)"
-    
+    assert torch.allclose(
+        loaded_tensor_cpu.flatten()[:sample_size],
+        torch_tensor_cpu.flatten()[:sample_size],
+        atol=tolerance["atol"],
+        rtol=tolerance["rtol"],
+    ), f"Data verification failed (sampled first {sample_size} elements)"
+
    print(f"  Data verification: ✓ (sampled first {sample_size} elements)")
-    
+
    # 打印性能总结
    print(f"\n{'='*70}")
    print(f"Performance Summary")
@@ -383,21 +413,24 @@ def test_large_scale_binary_performance(device, test_case, dtype, config):
    print(f"  File size: {actual_size_mb:.2f} MB")
    print(f"  Write time: {save_time:.4f} sec  →  {write_throughput_mbps:.2f} MB/s")
    print(f"  Read time:  {read_time:.4f} sec  →  {read_throughput_mbps:.2f} MB/s")
-    print(f"  Speed ratio (Read/Write): {read_throughput_mbps/write_throughput_mbps:.2f}x")
+    print(
+        f"  Speed ratio (Read/Write): {read_throughput_mbps/write_throughput_mbps:.2f}x"
+    )
    print(f"{'='*70}")
-    
+
    # 清理
    os.remove(bin_file)
    print(f"\n✓ Large scale performance test passed\n")
-    
+

 # ==============================================================================
 # Main Execution Function
 # ==============================================================================

+
 def main():
    args = get_args()
-    
+
    # 创建测试配置
    config = TestConfig(
        tensor_dtypes=_TENSOR_DTYPES,
@@ -405,58 +438,62 @@ def main():
        debug=args.debug,
        bench=False,  # debug 测试不需要性能测试
    )
-    
+
    # 获取测试设备
    devices = get_test_devices(args)
-    
+
    print("Starting debug tests...")
-    
+
    all_passed = True
-    
+
    # 为每种测试类型运行测试
    test_funcs = [
        ("Basic Print", test_basic_print, [_TEST_CASES[0]]),
        ("Binary Save", test_binary_save, [_TEST_CASES[1]]),
        ("Multidimensional", test_multidimensional, [_TEST_CASES[2]]),
    ]
-    
+
    for test_name, test_func, test_cases in test_funcs:
        print(f"\n{'='*60}")
        print(f"Testing {test_name}")
        print(f"{'='*60}")
-        
+
        runner = TestRunner(test_cases, config)
        passed = runner.run_tests(devices, test_func)
        all_passed = all_passed and passed
-    
+
    # 运行非连续内存布局测试
    print(f"\n{'='*60}")
    print(f"Testing Non-Contiguous Memory Layout (is_contiguous=False)")
    print(f"{'='*60}")
-    
+
    non_contiguous_runner = TestRunner(_NON_CONTIGUOUS_TEST_CASES, config)
-    non_contiguous_passed = non_contiguous_runner.run_tests(devices, test_non_contiguous_stride)
+    non_contiguous_passed = non_contiguous_runner.run_tests(
+        devices, test_non_contiguous_stride
+    )
    all_passed = all_passed and non_contiguous_passed
-    
+
    # 运行大规模性能测试
    print(f"\n{'='*60}")
    print(f"Testing Large Scale Performance (10M elements)")
    print(f"{'='*60}")
-    
+
    large_scale_runner = TestRunner(_LARGE_SCALE_TEST_CASES, config)
-    large_scale_passed = large_scale_runner.run_tests(devices, test_large_scale_binary_performance)
+    large_scale_passed = large_scale_runner.run_tests(
+        devices, test_large_scale_binary_performance
+    )
    all_passed = all_passed and large_scale_passed
-    
+
    # 打印总结
    print(f"\n{'='*60}")
    print("Test Summary")
    print(f"{'='*60}")
-    
+
    if all_passed:
        print("\033[92m✅ All debug tests passed!\033[0m")
    else:
        print("\033[91m❌ Some tests failed!\033[0m")
-    
+
    sys.exit(0 if all_passed else 1)



--- a/test/infinicore/framework/config.py
+++ b/test/infinicore/framework/config.py
@@ -23,6 +23,7 @@ def get_supported_hardware_platforms():
        ("--moore", "Moore Threads GPUs (requires torch_musa)"),
        ("--kunlun", "Kunlun XPUs (requires torch_xmlir)"),
        ("--hygon", "Hygon DCUs"),
+        ("--qy", "QY GPUs"),
    ]


@@ -194,6 +195,15 @@ def get_test_devices(args):
            devices_to_test.append(InfiniDeviceEnum.HYGON)
        except ImportError:
            print("Warning: Hygon DCU support not available")
+            
+    if args.qy:
+        try:
+            # QY GPU detection
+            import torch
+
+            devices_to_test.append(InfiniDeviceEnum.QY)
+        except ImportError:
+            print("Warning: QY GPU support not available")

    # Default to CPU if no devices specified
    if not devices_to_test:

--- a/test/infinicore/framework/devices.py
+++ b/test/infinicore/framework/devices.py
@@ -8,6 +8,7 @@ class InfiniDeviceEnum:
    ILUVATAR = 6
    KUNLUN = 7
    HYGON = 8
+    QY = 9


 InfiniDeviceNames = {
@@ -18,6 +19,7 @@ InfiniDeviceNames = {
    InfiniDeviceEnum.METAX: "Metax",
    InfiniDeviceEnum.MOORE: "Moore",
    InfiniDeviceEnum.ILUVATAR: "Iluvatar",
+    InfiniDeviceEnum.QY: "Qy",
    InfiniDeviceEnum.KUNLUN: "Kunlun",
    InfiniDeviceEnum.HYGON: "Hygon",
 }
@@ -32,4 +34,5 @@ torch_device_map = {
    InfiniDeviceEnum.ILUVATAR: "cuda",
    InfiniDeviceEnum.KUNLUN: "cuda",
    InfiniDeviceEnum.HYGON: "cuda",
+    InfiniDeviceEnum.QY: "cuda",
 }
--- a/test/infiniop/attention.py
+++ b/test/infiniop/attention.py
@@ -23,7 +23,6 @@ from libinfiniop import (
 )


-
 def causal_softmax(x):
    type = x.dtype
    mask = torch.tril(torch.ones_like(x), diagonal=-1).flip(dims=[-2, -1])

--- a/test/infiniop/conv.py
+++ b/test/infiniop/conv.py
@@ -96,27 +96,27 @@ NUM_ITERATIONS = 1000


 def conv(x, w, stride, padding, dilation, y_tensor, bias=None):
-    match len(x.shape) - 2:
-        case 1:
-            y_tensor.copy_(
-                F.conv1d(
-                    x, w, bias=bias, stride=stride, padding=padding, dilation=dilation
-                )
+    """Compute the PyTorch reference convolution and copy it into ``y_tensor``.
+
+    The number of spatial dimensions is ``len(x.shape) - 2``; 1, 2 and 3 are
+    dispatched to ``F.conv1d``, ``F.conv2d`` and ``F.conv3d`` respectively.
+    Any other value only prints an error message and leaves ``y_tensor``
+    untouched.
+    """
+    # Deliberately an if/elif chain rather than `match`: using match makes CI fail
+    # (presumably the CI interpreter predates Python 3.10 -- TODO confirm).
+    ndim = len(x.shape) - 2
+    if ndim == 1:
+        y_tensor.copy_(
+            F.conv1d(
+                x, w, bias=bias, stride=stride, padding=padding, dilation=dilation
            )
-        case 2:
-            y_tensor.copy_(
-                F.conv2d(
-                    x, w, bias=bias, stride=stride, padding=padding, dilation=dilation
-                )
+        )
+    elif ndim == 2:
+        y_tensor.copy_(
+            F.conv2d(
+                x, w, bias=bias, stride=stride, padding=padding, dilation=dilation
            )
-        case 3:
-            y_tensor.copy_(
-                F.conv3d(
-                    x, w, bias=bias, stride=stride, padding=padding, dilation=dilation
-                )
+        )
+    elif ndim == 3:
+        y_tensor.copy_(
+            F.conv3d(
+                x, w, bias=bias, stride=stride, padding=padding, dilation=dilation
            )
-        case _:
-            print("Error: Pytorch -> Unsupported tensor dimension")
+        )
+    else:
+        print("Error: Pytorch -> Unsupported tensor dimension")


 # infer the shape of the output given the inputs for a N-ary convolution

--- a/test/infiniop/gelu.py
+++ b/test/infiniop/gelu.py
+import torch
+import ctypes
+from ctypes import c_uint64
+from libinfiniop import (
+    LIBINFINIOP,
+    TestTensor,
+    get_test_devices,
+    check_error,
+    test_operator,
+    get_args,
+    debug,
+    get_tolerance,
+    profile_operation,
+    TestWorkspace,
+    InfiniDtype,
+    InfiniDtypeNames,
+    InfiniDeviceNames,
+    infiniopOperatorDescriptor_t,
+)
+from enum import Enum, auto
+
+# ==============================================================================
+#  Configuration (Internal Use Only)
+# ==============================================================================
+# These are not meant to be imported from other modules
+# Base GELU test cases, one tuple per case. A stride of None is forwarded
+# unchanged to TestTensor (presumably meaning the default contiguous layout --
+# TODO confirm against TestTensor).
+_TEST_CASES_ = [
+    # shape, input_stride, output_stride
+    ((13, 4), None, None),
+    ((13, 4), (10, 1), (10, 1)),
+    # Disabled case: input with a zero stride along the first axis.
+    #((13, 4), (0, 1), None),
+    ((13, 4, 4), None, None),
+    ((13, 4, 4), (20, 4, 1), (20, 4, 1)),
+    # Disabled case: input with a zero stride along the second axis.
+    #((13, 4, 4), (4, 0, 1), None),
+    ((16, 5632), None, None),
+    ((16, 5632), (13312, 1), (13312, 1)),
+    ((4, 4, 5632), None, None),
+    ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)),
+]
+
+
+class Inplace(Enum):
+    """Whether the operator writes to a separate output or over its input."""
+
+    OUT_OF_PLACE = 1
+    INPLACE = 2
+
+
+# In-place modes that every base case in _TEST_CASES_ is run with
+_INPLACE = [Inplace.OUT_OF_PLACE, Inplace.INPLACE]
+
+# Final case list: each base tuple extended with one in-place mode, in
+# base-case-major order (all modes of the first case, then the second, ...)
+_TEST_CASES = [
+    base_case + (mode,) for base_case in _TEST_CASES_ for mode in _INPLACE
+]
+
+# Data types used for testing; handed to test_operator by the __main__ block
+_TENSOR_DTYPES = [InfiniDtype.BF16, InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.F64]
+
+# Tolerance map for different data types: the atol/rtol pair that test() looks
+# up through get_tolerance() and passes to torch.allclose
+_TOLERANCE_MAP = {
+    InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2},
+    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
+    InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-5},
+    InfiniDtype.F64: {"atol": 1e-6, "rtol": 1e-6},
+}
+
+# Run-time switches read by test(); these are only defaults, the __main__ block
+# overwrites all four from the parsed command-line arguments.
+DEBUG = False  # when True, test() calls debug() on the two tensors before asserting
+PROFILE = False  # when True, test() times both the PyTorch and the library call
+NUM_PRERUN = 10  # forwarded to profile_operation
+NUM_ITERATIONS = 1000  # forwarded to profile_operation
+
+
+def test(
+    handle,
+    device,
+    shape,
+    input_stride=None,
+    output_stride=None,
+    inplace=Inplace.OUT_OF_PLACE,
+    dtype=InfiniDtype.F16,
+    sync=None,
+):
+    """Check ``infiniopGelu`` against ``torch.nn.functional.gelu`` for one case.
+
+    Args:
+        handle: Library handle passed to ``infiniopCreateGeluDescriptor``.
+        device: Device id; used to build the tensors and to index
+            ``InfiniDeviceNames``.
+        shape: Shape shared by the input and output tensors.
+        input_stride: Strides for the input tensor, or None.
+        output_stride: Strides for the output tensor, or None.
+        inplace: ``Inplace.INPLACE`` makes the output alias the input tensor;
+            the case is skipped when the two stride specs differ.
+        dtype: ``InfiniDtype`` member. It indexes ``InfiniDtypeNames`` and
+            ``_TOLERANCE_MAP``, so the default is an ``InfiniDtype`` as well.
+        sync: Optional zero-argument callable invoked after the reference
+            result is computed and before the descriptor is created.
+
+    Raises:
+        AssertionError: If the library output is not within tolerance of the
+            PyTorch reference.
+    """
+    input = TestTensor(shape, input_stride, dtype, device)
+    if inplace == Inplace.INPLACE:
+        if input_stride != output_stride:
+            # An aliased output cannot use a layout different from its input.
+            return
+        output = input
+    else:
+        output = TestTensor(shape, output_stride, dtype, device, mode="ones")
+
+    if output.is_broadcast():
+        return
+
+    # The trailing space keeps "output_stride:..." and "dtype:..." from being
+    # fused together when the two adjacent f-strings are concatenated.
+    print(
+        f"Testing Gelu on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} "
+        f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}"
+    )
+
+    # Reference result, stored as the expected value of the output tensor.
+    new_tensor = torch.nn.functional.gelu(input.torch_tensor())
+    output.update_torch_tensor(new_tensor)
+
+    if sync is not None:
+        sync()
+
+    descriptor = infiniopOperatorDescriptor_t()
+    check_error(
+        LIBINFINIOP.infiniopCreateGeluDescriptor(
+            handle,
+            ctypes.byref(descriptor),
+            output.descriptor,
+            input.descriptor,
+        )
+    )
+
+    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
+    for tensor in [input, output]:
+        tensor.destroy_desc()
+
+    workspace_size = c_uint64(0)
+    check_error(
+        LIBINFINIOP.infiniopGetGeluWorkspaceSize(
+            descriptor, ctypes.byref(workspace_size)
+        )
+    )
+    workspace = TestWorkspace(workspace_size.value, output.device)
+
+    def lib_gelu():
+        check_error(
+            LIBINFINIOP.infiniopGelu(
+                descriptor,
+                workspace.data(),
+                workspace.size(),
+                output.data(),
+                input.data(),
+                None,
+            )
+        )
+
+    lib_gelu()
+
+    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
+    if DEBUG:
+        debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol)
+    assert torch.allclose(
+        output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol
+    ), f"Gelu output mismatch for shape:{shape} dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}"
+
+    # Profiling workflow
+    if PROFILE:
+        # fmt: off
+        profile_operation("PyTorch", lambda: torch.nn.functional.gelu(input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
+        profile_operation("    lib", lib_gelu, device, NUM_PRERUN, NUM_ITERATIONS)
+        # fmt: on
+    check_error(LIBINFINIOP.infiniopDestroyGeluDescriptor(descriptor))
+
+
+if __name__ == "__main__":
+    # Command-line options replace the module-level defaults that test() reads.
+    args = get_args()
+    DEBUG, PROFILE = args.debug, args.profile
+    NUM_PRERUN, NUM_ITERATIONS = args.num_prerun, args.num_iterations
+
+    # Run every case/dtype combination on each selected device in turn.
+    for device in get_test_devices(args):
+        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
+
+    print("\033[92mTest passed!\033[0m")