Unverified commit ac4aae48, authored by Shijie, committed by GitHub
Browse files

Merge branch 'main' into dev_topkrouter

parents a15aa367 2f3f4076
#ifndef __TOPKROUTER_KUNLUN_H__
#define __TOPKROUTER_KUNLUN_H__
#include "../topkrouter.h"
// Declares op::topkrouter::kunlun::Descriptor via the shared DESCRIPTOR macro.
DESCRIPTOR(kunlun)
#endif // __TOPKROUTER_KUNLUN_H__
#include "../../../devices/kunlun/kunlun_common.h"
#include "../../../devices/kunlun/kunlun_handle.h"
#include "../../../devices/kunlun/kunlun_kernel_common.h"
#include "kernel.h"
#include "topkrouter_kunlun.h"
#include <memory>
#include <stdint.h>
namespace op::topkrouter::kunlun {
// Device-specific state owned by the descriptor: a shared reference to the
// Kunlun handle's internal runtime state.
struct Descriptor::Opaque {
    std::shared_ptr<device::kunlun::Handle::Internal> internal;
};
// Release the device-specific state allocated in create().
Descriptor::~Descriptor() {
    delete _opaque;
}
// Build a Kunlun topk-router descriptor from the input tensor descriptor.
// Rejects inputs whose innermost dimension is not contiguous (stride != 1).
// NOTE(review): `correction_bias_desc` is accepted but not validated here —
// confirm whether its shape/dtype should be checked against `x_desc`.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t x_desc,
    infiniopTensorDescriptor_t correction_bias_desc) {
    auto result = TopkrouterInfo::create(x_desc);
    CHECK_RESULT(result);
    auto info = result.take();
    if (info.x_strides[1] != 1) {
        return INFINI_STATUS_BAD_TENSOR_STRIDES;
    }
    *desc_ptr = new Descriptor(
        new Opaque{reinterpret_cast<device::kunlun::Handle *>(handle)->internal()},
        std::move(info),
        0, // workspace size: this implementation needs no extra workspace
        handle->device, handle->device_id);
    return INFINI_STATUS_SUCCESS;
}
// Dispatch the topk-router kernel on the input dtype (F32/F16/BF16).
// Writes the top-`topk` routing values to d_values_out and the matching
// expert indices to d_indices_out for each of the N rows of `width` logits.
// NOTE(review): the <<<N, BLOCK_SIZE, stream>>> triple follows the Kunlun XPU
// launch convention (cluster count, core count, stream), not CUDA's
// (grid, block, shared-mem) — confirm against the XTDK toolchain docs.
template <int BLOCK_SIZE = 64>
infiniStatus_t launch_topkrouter(float *d_values_out, int *d_indices_out, const void *d_input, const float *d_correction_bias,
                                 const float routed_scaling_factor, const size_t N, const size_t width, const size_t topk, infiniDtype_t xtype,
                                 kunlunStream_t stream) {
    if (xtype == INFINI_DTYPE_F32) {
        topkrouter_kernel<float, BLOCK_SIZE, 256, 8, 4, 2>
            <<<N, BLOCK_SIZE, stream>>>(
                d_values_out,
                d_indices_out,
                (float *)d_input,
                (const float *)d_correction_bias,
                routed_scaling_factor,
                N,
                width,
                topk);
    } else if (xtype == INFINI_DTYPE_F16) {
        topkrouter_kernel<half, BLOCK_SIZE, 256, 8, 4, 2>
            <<<N, BLOCK_SIZE, stream>>>(
                d_values_out,
                d_indices_out,
                (half *)d_input,
                (const float *)d_correction_bias,
                routed_scaling_factor,
                N,
                width,
                topk);
    } else if (xtype == INFINI_DTYPE_BF16) {
        topkrouter_kernel<bfloat16_t, BLOCK_SIZE, 256, 8, 4, 2>
            <<<N, BLOCK_SIZE, stream>>>(
                d_values_out,
                d_indices_out,
                (bfloat16_t *)d_input,
                (const float *)d_correction_bias,
                routed_scaling_factor,
                N,
                width,
                topk);
    } else {
        // Unsupported input dtype.
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    return INFINI_STATUS_SUCCESS;
}
// Execute the topk-router op on `x` (shape from _info: N rows of `width`
// logits), writing routing weights to `values` and expert indices to
// `indices` on the given Kunlun stream.
// Returns INSUFFICIENT_WORKSPACE when the caller's workspace is too small,
// otherwise propagates the dispatcher's status.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    float *values,
    int *indices,
    const void *x,
    const float *correction_bias,
    const float routed_scaling_factor,
    const size_t topk,
    void *stream) const {
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    size_t N = _info.N;
    size_t width = _info.width;
    auto kunlun_stream = reinterpret_cast<kunlunStream_t>(stream);
    // Fix: propagate the launch status. Previously the return value of
    // launch_topkrouter (e.g. INFINI_STATUS_BAD_TENSOR_DTYPE for an
    // unsupported dtype) was discarded and SUCCESS was always returned.
    return launch_topkrouter<64>(values, indices, x, correction_bias,
                                 routed_scaling_factor, N, width, topk,
                                 _info.xtype, kunlun_stream);
}
} // namespace op::topkrouter::kunlun
......@@ -11,6 +11,9 @@
#ifdef ENABLE_METAX_API
#include "metax/topkrouter_metax.h"
#endif
#ifdef ENABLE_KUNLUN_API
#include "kunlun/topkrouter_kunlun.h"
#endif
__C infiniStatus_t infiniopCreateTopkrouterDescriptor(infiniopHandle_t handle, infiniopTopkrouterDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t x_desc,
......@@ -32,6 +35,9 @@ __C infiniStatus_t infiniopCreateTopkrouterDescriptor(infiniopHandle_t handle, i
#endif
#ifdef ENABLE_METAX_API
CREATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
CREATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
}
......@@ -58,6 +64,9 @@ __C infiniStatus_t infiniopGetTopkrouterWorkspaceSize(infiniopTopkrouterDescript
#endif
#ifdef ENABLE_METAX_API
GET(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
GET(INFINI_DEVICE_KUNLUN, kunlun);
#endif
}
......@@ -87,6 +96,9 @@ __C infiniStatus_t infiniopTopkrouter(infiniopTopkrouterDescriptor_t desc, void
#endif
#ifdef ENABLE_METAX_API
CALCULATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
}
......@@ -113,6 +125,9 @@ __C infiniStatus_t infiniopDestroyTopkrouterDescriptor(infiniopTopkrouterDescrip
#endif
#ifdef ENABLE_METAX_API
DESTROY(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
DESTROY(INFINI_DEVICE_KUNLUN, kunlun);
#endif
}
......
#ifndef __INFINIOP_HEAP_KUNLUN_H__
#define __INFINIOP_HEAP_KUNLUN_H__
#include "xpu/kernel/xtdk_simd_xpu2.h"
// Swap a key/value pair between two slots of shared-memory (SM) arrays.
template <typename TK, typename TV>
static __device__ inline void sm_swap_kv(_shared_ptr_ TK *k0, _shared_ptr_ TV *v0,
                                         _shared_ptr_ TK *k1, _shared_ptr_ TV *v1) {
    TK key_backup = *k1;
    TV value_backup = *v1;
    *k1 = *k0;
    *v1 = *v0;
    *k0 = key_backup;
    *v0 = value_backup;
}
// Sift the element at `idx` down a min-heap stored in shared memory until the
// min-heap property (parent key <= child keys) holds below `idx`.
// `heap_key`/`heap_value` are parallel arrays of `heap_capacity` elements.
template <typename TK, typename TV>
static __device__ inline void update_sm_min_heap(_shared_ptr_ TK *heap_key,
                                                 _shared_ptr_ TV *heap_value, int idx, int heap_capacity) {
    while (idx < heap_capacity) {
        int child_l = idx * 2 + 1;
        int child_r = idx * 2 + 2;
        int child_min = child_l;
        if (child_r >= heap_capacity) {
            if (child_l >= heap_capacity) { // idx is leaf node, shift finished
                break;
            } else { // if child_r does not exist while child_l does, choose child_l
                child_min = child_l;
            }
        } else { // both child L & R exists
            // pick the smaller child (the bool comparison promotes to 0/1)
            child_min = child_l + (heap_key[child_l] > heap_key[child_r]);
        }
        if (heap_key[idx] <= heap_key[child_min]) {
            break; // heap property already satisfied here
        }
        sm_swap_kv(&heap_key[idx], &heap_value[idx], &heap_key[child_min], &heap_value[child_min]);
        idx = child_min;
    }
}
// Build a min-heap in place over the first `size` key/value pairs by sifting
// down every internal node, starting from the last parent.
template <typename TK, typename TV>
static __device__ inline void make_sm_min_heap(
    _shared_ptr_ TK *heap_key, _shared_ptr_ TV *heap_value, int size) {
    int last_parent = size / 2 - 1;
    for (int node = last_parent; node >= 0; node--) {
        update_sm_min_heap(heap_key, heap_value, node, size);
    }
}
// Heapsort extraction phase for an already-built min-heap: repeatedly move the
// current minimum (root) to the end, then restore the heap over the shrunken
// prefix. Leaves the arrays sorted in DESCENDING key order.
template <typename TK, typename TV>
static __device__ inline void sort_sm_min_heap(
    _shared_ptr_ TK *heap_key, _shared_ptr_ TV *heap_value, int heap_capacity) {
    for (int i = heap_capacity - 1; i > 0; i--) {
        sm_swap_kv(&heap_key[0], &heap_value[0], &heap_key[i], &heap_value[i]);
        update_sm_min_heap(heap_key, heap_value, 0, i);
    }
}
// Sift the element at `idx` down a max-heap stored in shared memory until the
// max-heap property (parent key >= child keys) holds below `idx`.
template <typename TK, typename TV>
static __device__ inline void update_sm_max_heap(_shared_ptr_ TK *heap_key,
                                                 _shared_ptr_ TV *heap_value, int idx, int heap_capacity) {
    while (idx < heap_capacity) {
        int child_l = idx * 2 + 1;
        int child_r = idx * 2 + 2;
        int child_max = child_l;
        if (child_r >= heap_capacity) {
            if (child_l >= heap_capacity) { // idx is leaf node, shift finished
                break;
            } else { // if child_r does not exist while child_l does, choose child_l
                child_max = child_l;
            }
        } else { // both child L & R exists
            // pick the larger child (the bool comparison promotes to 0/1)
            child_max = child_l + (heap_key[child_l] < heap_key[child_r]);
        }
        if (heap_key[idx] >= heap_key[child_max]) {
            break; // heap property already satisfied here
        }
        sm_swap_kv(&heap_key[idx], &heap_value[idx], &heap_key[child_max], &heap_value[child_max]);
        idx = child_max;
    }
}
// Build a max-heap in place over the first `size` key/value pairs by sifting
// down every internal node, starting from the last parent.
template <typename TK, typename TV>
static __device__ inline void make_sm_max_heap(
    _shared_ptr_ TK *heap_key, _shared_ptr_ TV *heap_value, int size) {
    int last_parent = size / 2 - 1;
    for (int node = last_parent; node >= 0; node--) {
        update_sm_max_heap(heap_key, heap_value, node, size);
    }
}
// Heapsort extraction phase for an already-built max-heap: repeatedly move the
// current maximum (root) to the end, then restore the heap over the shrunken
// prefix. Leaves the arrays sorted in ASCENDING key order.
template <typename TK, typename TV>
static __device__ inline void sort_sm_max_heap(_shared_ptr_ TK *heap_key,
                                               _shared_ptr_ TV *heap_value, int heap_capacity) {
    for (int i = heap_capacity - 1; i > 0; i--) {
        sm_swap_kv(&heap_key[0], &heap_value[0], &heap_key[i], &heap_value[i]);
        update_sm_max_heap(heap_key, heap_value, 0, i);
    }
}
// Swap a key/value pair between two slots of local-memory (LM) arrays.
template <typename TK, typename TV>
static __device__ inline void lm_swap_kv(TK *k0, TV *v0,
                                         TK *k1, TV *v1) {
    TK key_backup = *k1;
    TV value_backup = *v1;
    *k1 = *k0;
    *v1 = *v0;
    *k0 = key_backup;
    *v0 = value_backup;
}
// Sift the element at `idx` down a min-heap stored in local memory until the
// min-heap property (parent key <= child keys) holds below `idx`.
template <typename TK, typename TV>
static __device__ inline void update_lm_min_heap(TK *heap_key, TV *heap_value, int idx, int heap_capacity) {
    while (idx < heap_capacity) {
        int child_l = idx * 2 + 1;
        int child_r = idx * 2 + 2;
        int child_min = child_l;
        if (child_r >= heap_capacity) {
            if (child_l >= heap_capacity) { // idx is leaf node, shift finished
                break;
            } else { // if child_r does not exist while child_l does, choose child_l
                child_min = child_l;
            }
        } else { // both child L & R exists
            // pick the smaller child (the bool comparison promotes to 0/1)
            child_min = child_l + (heap_key[child_l] > heap_key[child_r]);
        }
        if (heap_key[idx] <= heap_key[child_min]) {
            break; // heap property already satisfied here
        }
        lm_swap_kv(&heap_key[idx], &heap_value[idx], &heap_key[child_min], &heap_value[child_min]);
        idx = child_min;
    }
}
// Build a min-heap in place over the first `size` key/value pairs held in
// local memory, sifting down every internal node from the last parent.
template <typename TK, typename TV>
static __device__ inline void make_lm_min_heap(
    TK *heap_key, TV *heap_value, int size) {
    int last_parent = size / 2 - 1;
    for (int node = last_parent; node >= 0; node--) {
        update_lm_min_heap(heap_key, heap_value, node, size);
    }
}
// Heapsort extraction phase for an already-built local-memory min-heap.
// Leaves the arrays sorted in DESCENDING key order.
template <typename TK, typename TV>
static __device__ inline void sort_lm_min_heap(TK *heap_key, TV *heap_value, int heap_capacity) {
    for (int i = heap_capacity - 1; i > 0; i--) {
        lm_swap_kv(&heap_key[0], &heap_value[0], &heap_key[i], &heap_value[i]);
        update_lm_min_heap(heap_key, heap_value, 0, i);
    }
}
// Sift the element at `idx` down a max-heap stored in local memory until the
// max-heap property (parent key >= child keys) holds below `idx`.
template <typename TK, typename TV>
static __device__ inline void update_lm_max_heap(TK *heap_key, TV *heap_value, int idx, int heap_capacity) {
    while (idx < heap_capacity) {
        int child_l = idx * 2 + 1;
        int child_r = idx * 2 + 2;
        int child_max = child_l;
        if (child_r >= heap_capacity) {
            if (child_l >= heap_capacity) { // idx is leaf node, shift finished
                break;
            } else { // if child_r does not exist while child_l does, choose child_l
                child_max = child_l;
            }
        } else { // both child L & R exists
            // pick the larger child (the bool comparison promotes to 0/1)
            child_max = child_l + (heap_key[child_l] < heap_key[child_r]);
        }
        if (heap_key[idx] >= heap_key[child_max]) {
            break; // heap property already satisfied here
        }
        lm_swap_kv(&heap_key[idx], &heap_value[idx], &heap_key[child_max], &heap_value[child_max]);
        idx = child_max;
    }
}
// Build a max-heap in place over the first `size` key/value pairs held in
// local memory, sifting down every internal node from the last parent.
template <typename TK, typename TV>
static __device__ inline void make_lm_max_heap(
    TK *heap_key, TV *heap_value, int size) {
    int last_parent = size / 2 - 1;
    for (int node = last_parent; node >= 0; node--) {
        update_lm_max_heap(heap_key, heap_value, node, size);
    }
}
// Heapsort extraction phase for an already-built local-memory max-heap.
// Leaves the arrays sorted in ASCENDING key order.
template <typename TK, typename TV>
static __device__ inline void sort_lm_max_heap(TK *heap_key, TV *heap_value, int heap_capacity) {
    for (int i = heap_capacity - 1; i > 0; i--) {
        lm_swap_kv(&heap_key[0], &heap_value[0], &heap_key[i], &heap_value[i]);
        update_lm_max_heap(heap_key, heap_value, 0, i);
    }
}
// Ceiling division: number of `b`-sized blocks needed to cover `a` elements.
// NOTE(review): assumes a >= 0, b > 0, and that a + b - 1 does not overflow TID.
template <typename TID>
__device__ TID roundup_div_p(TID a, TID b) {
    TID biased = a + b - 1;
    return biased / b;
}
// Return the smaller of two values (strict '<', so `a` wins ties).
template <typename T>
__device__ T min_p(T a, T b) {
    if (a < b) {
        return a;
    }
    return b;
}
// Split `len` elements into `nthreads` near-equal chunks of whole
// `align`-sized blocks, and return thread `tid`'s half-open range
// [*start, *end). The first `block_cnt % nthreads` threads receive one extra
// block each; both bounds are clamped to `len` so the last (partial) block is
// still covered exactly once.
template <typename TID>
static __device__ inline void partition(int tid, int nthreads, TID len, int align, TID *start, TID *end) {
    TID block_cnt = roundup_div_p<TID>(len, align);
    TID remain_block = block_cnt % nthreads;
    TID start_block = block_cnt / nthreads * static_cast<TID>(tid) + min_p<TID>(tid, remain_block);
    TID end_block = start_block + block_cnt / nthreads + (tid < remain_block);
    *start = min_p<TID>(start_block * align, len);
    *end = min_p<TID>(end_block * align, len);
}
// Generic fallback: no conversion is defined for this (TX, TY) pair, so this
// is an intentional no-op. Real conversions live in the specializations below.
template <typename TX, typename TY>
static __device__ void primitive_cast(const TX *x, TY *y, int len) {
    return;
}
// float -> int conversion using XPU SIMD: 16 lanes per iteration, rounding
// toward zero (vfloat2fix.rz), stored through a masked vector store.
// NOTE(review): iterates in steps of 16 — assumes len is a multiple of 16
// (or that the buffers are padded accordingly); TODO confirm at call sites.
template <>
__device__ void primitive_cast(const float *x, int *y, int len) {
    for (int i = 0; i < len; i += 16) {
        float32x16_t Y = vload_lm_float32x16(x);
        __asm__ __volatile__("vfloat2fix.rz vr0, %0\t\n"
                             "vstore_mask16.mz vr0{mr1}, 0(%1)" ::"v"(Y),
                             "r"(y)
                             : "vr0");
        x += 16;
        y += 16;
    }
    mfence_lm(); // make the vector stores visible before returning
}
// int -> float conversion using XPU SIMD: 16 lanes per iteration, rounding to
// nearest (vfix2float.rn), stored through a masked vector store.
// NOTE(review): iterates in steps of 16 — assumes len is a multiple of 16
// (or that the buffers are padded accordingly); TODO confirm at call sites.
template <>
__device__ void primitive_cast(const int *x, float *y, int len) {
    for (int i = 0; i < len; i += 16) {
        int32x16_t Y = vload_lm_int32x16(x);
        __asm__ __volatile__("vfix2float.rn vr0, %0\t\n"
                             "vstore_mask16.mz vr0{mr1}, 0(%1)" ::"v"(Y),
                             "r"(y)
                             : "vr0");
        x += 16;
        y += 16;
    }
    mfence_lm(); // make the vector stores visible before returning
}
// Load 32 consecutive floats from local memory into two 16-lane vectors.
static __device__ inline void vload2_lm(const float *ptr, float32x16_t &vl, float32x16_t &vh) {
    vl = __builtin_xpu2_vload_mask16_mr1(ptr, 0);
    vh = __builtin_xpu2_vload_mask16_mr1(ptr + 16, 0);
}
// Store two 16-lane vectors to 32 consecutive floats in local memory.
static __device__ inline void vstore2_lm(float *ptr, float32x16_t &vl, float32x16_t &vh) {
    vstore_lm_float32x16(ptr, vl);
    vstore_lm_float32x16(ptr + 16, vh);
}
// float -> float "cast": a plain vectorized copy, 32 elements per iteration.
// Skips the copy entirely when source and destination alias.
// NOTE(review): iterates in steps of 32 — assumes len is a multiple of 32
// (or padded buffers); TODO confirm at call sites.
template <>
__device__ void primitive_cast(const float *x, float *y, int len) {
    if (x == y) {
        return; // in-place: nothing to do
    } else { // just copy
        float32x16_t vec_x_0;
        float32x16_t vec_x_1;
        for (int i = 0; i < len; i += 32) {
            vload2_lm(x + i, vec_x_0, vec_x_1);
            vstore2_lm(y + i, vec_x_0, vec_x_1);
        }
        mfence_lm(); // make the vector stores visible before returning
    }
}
#endif
......@@ -23,7 +23,7 @@ __C infiniStatus_t infinirtGetAllDeviceCount(int *count_array) {
return INFINI_STATUS_NULL_POINTER;
}
for (size_t i = 0; i < INFINI_DEVICE_TYPE_COUNT; i++) {
if (i == INFINI_DEVICE_ILUVATAR || i == INFINI_DEVICE_QY || i == INFINI_DEVICE_KUNLUN || i == INFINI_DEVICE_HYGON) {
if (i == INFINI_DEVICE_ILUVATAR || i == INFINI_DEVICE_HYGON || i == INFINI_DEVICE_QY) {
count_array[i] = 0;
continue;
}
......
......@@ -13,6 +13,7 @@ from .datatypes import to_torch_dtype, to_infinicore_dtype
from .devices import InfiniDeviceNames, torch_device_map
from .tensor import TensorSpec, TensorInitializer
from .utils import (
clone_torch_tensor,
create_test_comparator,
infinicore_tensor_from_torch,
)
......@@ -321,7 +322,7 @@ class BaseOperatorTest(ABC):
for item in input_sequence:
if isinstance(item, torch.Tensor):
if clone:
cloned_item = item.clone().detach()
cloned_item = clone_torch_tensor(item)
infini_item = infinicore_tensor_from_torch(cloned_item)
cloned_tensors.append(cloned_item)
else:
......@@ -340,7 +341,7 @@ class BaseOperatorTest(ABC):
if isinstance(inp, torch.Tensor):
# Clone only if this input will be used for comparison
if comparison_target == i:
cloned_inp = inp.clone().detach()
cloned_inp = clone_torch_tensor(inp)
infini_tensor = infinicore_tensor_from_torch(cloned_inp)
cloned_tensors.append(cloned_inp)
else:
......@@ -362,7 +363,7 @@ class BaseOperatorTest(ABC):
if isinstance(value, torch.Tensor):
# Check if this tensor is used for output comparison
if key == "out" and comparison_target == "out":
cloned_value = value.clone().detach()
cloned_value = clone_torch_tensor(value)
infini_kwargs[key] = infinicore_tensor_from_torch(cloned_value)
cloned_tensors.append(cloned_value)
elif key == "out" and isinstance(comparison_target, int):
......@@ -566,12 +567,12 @@ class BaseOperatorTest(ABC):
elif comparison_target == "out":
# Compare output tensor from kwargs (explicit output)
torch_comparison = kwargs.get("out")
infini_comparison = infini_kwargs.get("out")
infini_comparison = cloned_tensors[0]
elif isinstance(comparison_target, int):
# Compare specific input tensor (in-place operation on input)
if 0 <= comparison_target < len(inputs):
torch_comparison = inputs[comparison_target]
infini_comparison = infini_inputs[comparison_target]
infini_comparison = cloned_tensors[0]
else:
raise ValueError(
f"Invalid comparison target index: {comparison_target}"
......
......@@ -118,6 +118,13 @@ def get_tolerance(tolerance_map, tensor_dtype, default_atol=0, default_rtol=1e-3
return tolerance["atol"], tolerance["rtol"]
def clone_torch_tensor(torch_tensor):
    """Return a detached deep copy of ``torch_tensor``.

    Non-contiguous inputs are passed through ``rearrange_tensor`` so the copy
    keeps the original's stride layout instead of being made contiguous.
    """
    copied = torch_tensor.detach().clone()
    if torch_tensor.is_contiguous():
        return copied
    return rearrange_tensor(copied, torch_tensor.stride())
def infinicore_tensor_from_torch(torch_tensor):
infini_device = infinicore.device(torch_tensor.device.type, 0)
if torch_tensor.is_contiguous():
......@@ -152,6 +159,10 @@ def convert_infinicore_to_torch(infini_result):
dtype=to_torch_dtype(infini_result.dtype),
device=infini_result.device.type,
)
if not infini_result.is_contiguous():
torch_result_from_infini = rearrange_tensor(
torch_result_from_infini, infini_result.stride()
)
temp_tensor = infinicore_tensor_from_torch(torch_result_from_infini)
temp_tensor.copy_(infini_result)
return torch_result_from_infini
......@@ -223,7 +234,10 @@ def compare_results(
return result_equal
# Convert infinicore result to PyTorch tensor for comparison
torch_result_from_infini = convert_infinicore_to_torch(infini_result)
if isinstance(infini_result, torch.Tensor):
torch_result_from_infini = infini_result
else:
torch_result_from_infini = convert_infinicore_to_torch(infini_result)
# Debug mode: detailed comparison
if debug_mode:
......
......@@ -49,8 +49,8 @@ _TEST_CASES_DATA = [
((13, 4), 0, False, None, (3,), (3,)),
((13, 4), 1, False, (20, 1), (10,), (10,)),
# 3D in-place cases
((4, 5, 6), 1, True, None, (4, 1, 6), (4, 1, 6)),
((4, 5, 6), -1, False, (30, 6, 1), (4, 5), (4, 5)),
((4, 5, 6), 1, True, None, (6, 6, 1), (6, 6, 1)),
((4, 5, 6), -1, False, (30, 6, 1), (5, 1), (5, 1)),
]
# Tolerance configuration
......
......@@ -28,7 +28,6 @@ _TEST_CASES_DATA = [
((4, 48, 6), None, None),
# Strided tensors
((1, 2048), (4096, 1), (4096, 1)),
((6, 2560), (2048, 1), (2560, 1)),
# Mixed cases
((8, 16, 32), None, None),
# Large tensors
......
......@@ -31,12 +31,12 @@ _TEST_CASES_DATA = [
((4, 5, 6), 1, False, None, None, None),
((4, 5, 6), -1, True, None, None, None),
# 3D in-place cases
((4, 5, 6), 1, False, None, (4, 1, 6), (4, 1, 6)),
((4, 5, 6), -1, False, (30, 6, 1), (64, 1, 5), (64, 1, 5)),
((4, 5, 6), 1, False, None, (30, 6, 1), (30, 6, 1)),
((4, 5, 6), -1, False, (30, 6, 1), (30, 6, 1), (30, 6, 1)),
# Strided inputs and outputs
((13, 4), None, False, (4, 1), (12, 1), (24, 1)),
((13, 4), 0, False, (1, 4), (64, 1), (1, 4)),
((13, 4), 1, False, (1, 4), (64, 1), (1, 4)),
((13, 4), None, False, (4, 1), (4, 1), (4, 1)),
((13, 4), 0, False, (13, 1), (13, 1), (13, 1)),
((13, 4), 1, False, (13, 1), (13, 1), (13, 1)),
]
# Tolerance configuration
......
......@@ -33,7 +33,8 @@ _TEST_CASES_ = [
# w (weight) types
# Note: 'None' means the same as input dtype
_X_DTYPES = [] # [InfiniDtype.F32, InfiniDtype.BF16, InfiniDtype.F16]
# _X_DTYPES = [InfiniDtype.F32, InfiniDtype.BF16, InfiniDtype.F16]
_X_DTYPES = [] # CPU CI
# x types used for testing
_VALUE_DTYPES = [InfiniDtype.F32]
......@@ -194,6 +195,7 @@ def test(
lib_topkrouter()
lable_values, lable_indices = torch_topkrouter(x.actual_tensor(), correction_bias.actual_tensor(), routed_scaling_factor, topk)
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
......
......@@ -326,6 +326,7 @@ target("infiniccl")
end
if has_config("qy-gpu") then
add_deps("infiniccl-qy")
add_files("build/.objs/infiniccl-qy/rules/qy.cuda/src/infiniccl/cuda/*.cu.o", {public = true})
end
if has_config("moore-gpu") then
......@@ -347,12 +348,45 @@ target("infiniccl")
set_installdir(os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini"))
target_end()
-- Phony aggregate target: building it builds the three core libraries.
-- Fix: the target name line was duplicated (merge artifact), opening the
-- same target twice; declare it exactly once.
target("infinicore_c_api")
    set_kind("phony")
    add_deps("infiniop", "infinirt", "infiniccl")
    after_build(function (target) print(YELLOW .. "[Congratulations!] Now you can install the libraries with \"xmake install\"" .. NC) end)
target_end()
-- Shared C++ API library: compiles the infinicore C++ sources on top of the
-- core libraries and installs the public headers under INFINI_ROOT.
target("infinicore_cpp_api")
    set_kind("shared")
    add_deps("infiniop", "infinirt", "infiniccl")
    set_languages("cxx17")
    -- INFINI_ROOT defaults to ~/.infini (HOMEPATH on Windows).
    local INFINI_ROOT = os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini")
    add_includedirs("include")
    add_includedirs(INFINI_ROOT.."/include", { public = true })
    add_linkdirs(INFINI_ROOT.."/lib")
    add_links("infiniop", "infinirt", "infiniccl")
    -- Add InfiniCore C++ source files (needed for RoPE and other nn modules)
    add_files("src/infinicore/*.cc")
    add_files("src/infinicore/context/*.cc")
    add_files("src/infinicore/context/*/*.cc")
    add_files("src/infinicore/tensor/*.cc")
    add_files("src/infinicore/nn/*.cc")
    add_files("src/infinicore/ops/*/*.cc")
    -- Install the library and every public header (flat and nested).
    set_installdir(INFINI_ROOT)
    add_installfiles("include/infinicore/(**.h)", {prefixdir = "include/infinicore"})
    add_installfiles("include/infinicore/(**.hpp)", {prefixdir = "include/infinicore"})
    add_installfiles("include/infinicore/(**/*.h)", {prefixdir = "include/infinicore"})
    add_installfiles("include/infinicore/(**/*.hpp)",{prefixdir = "include/infinicore"})
    add_installfiles("include/infinicore.h", {prefixdir = "include"})
    add_installfiles("include/infinicore.hpp", {prefixdir = "include"})
    after_build(function (target) print(YELLOW .. "[Congratulations!] Now you can install the libraries with \"xmake install\"" .. NC) end)
target_end()
target("_infinicore")
add_packages("boost")
if is_mode("debug") then
......@@ -378,6 +412,7 @@ target("_infinicore")
add_files("src/infinicore/context/*.cc")
add_files("src/infinicore/context/*/*.cc")
add_files("src/infinicore/tensor/*.cc")
add_files("src/infinicore/nn/*.cc")
add_files("src/infinicore/ops/*/*.cc")
add_files("src/infinicore/pybind11/**.cc")
......
......@@ -89,6 +89,7 @@ target("infinicore-test")
add_files(os.projectdir().."/src/infinicore/nn/*.cc")
add_files(os.projectdir().."/src/infinicore-test/*.cc")
add_files(os.projectdir().."/src/infinicore-test/*/*.cc")
set_installdir(INFINI_ROOT)
target_end()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment