Commit 23781e21 authored by pengcheng888's avatar pengcheng888
Browse files

issue/521 - support topksoftmax on metax GPU.

parent f5e6d729
...@@ -5,16 +5,13 @@ ...@@ -5,16 +5,13 @@
#include <cub/block/block_radix_sort.cuh> #include <cub/block/block_radix_sort.cuh>
#include <cub/block/block_reduce.cuh> #include <cub/block/block_reduce.cuh>
#include <cub/block/block_store.cuh> #include <cub/block/block_store.cuh>
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
template <typename T> template <typename T>
inline __device__ float exp_func(T x) { inline __device__ float exp_func(T x) {
float data; float data;
if constexpr (std::is_same_v<T, float>) { if constexpr (std::is_same_v<T, float>) {
data = x; data = x;
} else if constexpr (std::is_same_v<T, __nv_bfloat16>) { } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
data = __bfloat162float(x); data = __bfloat162float(x);
} else if constexpr (std::is_same_v<T, half>) { } else if constexpr (std::is_same_v<T, half>) {
data = __half2float(x); data = __half2float(x);
......
// Metax (MACA) backend declaration for the topksoftmax operator.
#ifndef __TOPKSOFTMAX_METAX_CUH__
#define __TOPKSOFTMAX_METAX_CUH__
#include "../topksoftmax.h"
// Presumably expands to the op::topksoftmax::metax::Descriptor class
// declaration (macro defined in ../topksoftmax.h) — the class is
// implemented in the accompanying .maca source. TODO confirm macro body.
DESCRIPTOR(metax)
#endif
#include "../../../devices/metax/metax_common.h"
#include "topksoftmax_metax.cuh"
#include "../../../devices/metax/metax_kernel_common.h"
#include <cub/block/block_reduce.cuh>
#include "../../../reduce/cuda/reduce.cuh"
#include "../cuda/kernel.cuh"
namespace op::topksoftmax::metax {

// Opaque state carried by the descriptor: the Metax device handle internals
// needed when launching kernels.
struct Descriptor::Opaque {
    std::shared_ptr<device::metax::Handle::Internal> internal;
};

Descriptor::~Descriptor() {
    delete _opaque;
}

// Validates the input tensor descriptor and constructs a Metax topksoftmax
// descriptor.
//
// Parameters:
//   handle   - InfiniOp handle; must wrap a Metax device handle.
//   desc_ptr - out: receives the newly allocated descriptor on success.
//   x_desc   - descriptor of the input logits tensor.
//
// Returns INFINI_STATUS_BAD_TENSOR_STRIDES when the innermost dimension is
// not contiguous (the kernel reads each row with unit stride), otherwise the
// status from TopksoftmaxInfo::create or INFINI_STATUS_SUCCESS.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t x_desc) {
    auto result = TopksoftmaxInfo::create(x_desc);
    CHECK_RESULT(result);
    auto info = result.take();
    if (info.x_strides[1] != 1) {
        return INFINI_STATUS_BAD_TENSOR_STRIDES;
    }

    *desc_ptr = new Descriptor(
        new Opaque{reinterpret_cast<device::metax::Handle *>(handle)->internal()},
        std::move(info),
        0, // workspace size: this backend needs no scratch memory
        handle->device, handle->device_id);
    return INFINI_STATUS_SUCCESS;
}

namespace {
// Dispatches softmax_topk_row_kernel on the input dtype.
// Launch layout: one block per row (grid = N), BLOCK_SIZE threads per block
// cooperating on a row of `width` elements, on the given hcStream_t.
// Returns INFINI_STATUS_BAD_TENSOR_DTYPE for unsupported dtypes.
template <int BLOCK_SIZE = 128>
infiniStatus_t launch_topksoftmax(float *d_values_out, int *d_indices_out, const void *d_input, const size_t N, const size_t width, const size_t topk, const bool norm, infiniDtype_t xtype, hcStream_t stream) {
    const dim3 blocks(static_cast<unsigned int>(N));
    const dim3 threads(BLOCK_SIZE);

    switch (xtype) {
    case INFINI_DTYPE_F32:
        softmax_topk_row_kernel<float, BLOCK_SIZE><<<blocks, threads, 0, stream>>>(d_values_out, d_indices_out, (float *)d_input, N, width, topk, norm);
        break;
    case INFINI_DTYPE_F16:
        softmax_topk_row_kernel<half, BLOCK_SIZE><<<blocks, threads, 0, stream>>>(d_values_out, d_indices_out, (half *)d_input, N, width, topk, norm);
        break;
    case INFINI_DTYPE_BF16:
        softmax_topk_row_kernel<__hpcc_bfloat16, BLOCK_SIZE><<<blocks, threads, 0, stream>>>(d_values_out, d_indices_out, (__hpcc_bfloat16 *)d_input, N, width, topk, norm);
        break;
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    return INFINI_STATUS_SUCCESS;
}
}; // namespace

// Runs softmax over each of the N rows and writes the top-k probabilities
// (`values`) and their column indices (`indices`). When `norm` is true the
// kernel presumably renormalizes the selected top-k values — confirm against
// the shared kernel in ../cuda/kernel.cuh.
//
// Returns INFINI_STATUS_INSUFFICIENT_WORKSPACE if the caller's workspace is
// smaller than requested, INFINI_STATUS_INTERNAL_ERROR for rows wider than
// 512 (no block size instantiated for them), or the launch status.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    float *values,
    int *indices,
    const void *x,
    const size_t topk,
    const bool norm,
    void *stream_) const {
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    const size_t N = _info.N;
    const size_t width = _info.width;
    auto stream = reinterpret_cast<hcStream_t>(stream_);

    // Pick the smallest instantiated block size covering the row width.
    // BUGFIX: propagate the dispatcher's status instead of discarding it —
    // previously a bad-dtype failure inside launch_topksoftmax was silently
    // reported as success.
    if (width <= 128) {
        return launch_topksoftmax<128>(values, indices, x, N, width, topk, norm, _info.xtype, stream);
    } else if (width <= 256) {
        return launch_topksoftmax<256>(values, indices, x, N, width, topk, norm, _info.xtype, stream);
    } else if (width <= 512) {
        return launch_topksoftmax<512>(values, indices, x, N, width, topk, norm, _info.xtype, stream);
    }
    return INFINI_STATUS_INTERNAL_ERROR;
}
} // namespace op::topksoftmax::metax
...@@ -8,6 +8,9 @@ ...@@ -8,6 +8,9 @@
#if defined(ENABLE_NVIDIA_API) #if defined(ENABLE_NVIDIA_API)
#include "nvidia/topksoftmax_nvidia.cuh" #include "nvidia/topksoftmax_nvidia.cuh"
#endif #endif
#ifdef ENABLE_METAX_API
#include "metax/topksoftmax_metax.cuh"
#endif
__C infiniStatus_t infiniopCreateTopksoftmaxDescriptor(infiniopHandle_t handle, __C infiniStatus_t infiniopCreateTopksoftmaxDescriptor(infiniopHandle_t handle,
infiniopTopksoftmaxDescriptor_t *desc_ptr, infiniopTopksoftmaxDescriptor_t *desc_ptr,
...@@ -24,6 +27,9 @@ __C infiniStatus_t infiniopCreateTopksoftmaxDescriptor(infiniopHandle_t handle, ...@@ -24,6 +27,9 @@ __C infiniStatus_t infiniopCreateTopksoftmaxDescriptor(infiniopHandle_t handle,
#endif #endif
#ifdef ENABLE_NVIDIA_API #ifdef ENABLE_NVIDIA_API
CREATE(INFINI_DEVICE_NVIDIA, nvidia); CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_METAX_API
CREATE(INFINI_DEVICE_METAX, metax);
#endif #endif
} }
...@@ -45,6 +51,9 @@ __C infiniStatus_t infiniopGetTopksoftmaxWorkspaceSize(infiniopTopksoftmaxDescri ...@@ -45,6 +51,9 @@ __C infiniStatus_t infiniopGetTopksoftmaxWorkspaceSize(infiniopTopksoftmaxDescri
#endif #endif
#ifdef ENABLE_NVIDIA_API #ifdef ENABLE_NVIDIA_API
GET(INFINI_DEVICE_NVIDIA, nvidia); GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_METAX_API
GET(INFINI_DEVICE_METAX, metax);
#endif #endif
} }
...@@ -71,6 +80,9 @@ __C infiniStatus_t infiniopTopksoftmax(infiniopTopksoftmaxDescriptor_t desc, voi ...@@ -71,6 +80,9 @@ __C infiniStatus_t infiniopTopksoftmax(infiniopTopksoftmaxDescriptor_t desc, voi
#endif #endif
#ifdef ENABLE_NVIDIA_API #ifdef ENABLE_NVIDIA_API
CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_METAX_API
CALCULATE(INFINI_DEVICE_METAX, metax);
#endif #endif
} }
...@@ -92,6 +104,9 @@ __C infiniStatus_t infiniopDestroyTopksoftmaxDescriptor(infiniopTopksoftmaxDescr ...@@ -92,6 +104,9 @@ __C infiniStatus_t infiniopDestroyTopksoftmaxDescriptor(infiniopTopksoftmaxDescr
#endif #endif
#ifdef ENABLE_NVIDIA_API #ifdef ENABLE_NVIDIA_API
DESTROY(INFINI_DEVICE_NVIDIA, nvidia); DESTROY(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_METAX_API
DESTROY(INFINI_DEVICE_METAX, metax);
#endif #endif
} }
......
...@@ -115,7 +115,7 @@ if __name__ == "__main__": ...@@ -115,7 +115,7 @@ if __name__ == "__main__":
# x_shape, x_strides, topk, norm # x_shape, x_strides, topk, norm
((1, 32), None, 4, True), ((1, 32), None, 4, True),
((8, 20), None, 8, False), ((8, 20), None, 8, False),
((2, 128), None, 10, True) ((2, 64), None, 6, True)
] ]
_TENSOR_DTYPES_ = [np.float32, np.float16] _TENSOR_DTYPES_ = [np.float32, np.float16]
for dtype in _TENSOR_DTYPES_: for dtype in _TENSOR_DTYPES_:
......
...@@ -28,8 +28,8 @@ from libinfiniop import ( ...@@ -28,8 +28,8 @@ from libinfiniop import (
_TEST_CASES_ = [ _TEST_CASES_ = [
# x_shape, x_stride, topk, norm # x_shape, x_stride, topk, norm
((1, 10), None, 7, True), ((1, 10), None, 7, True),
((2, 20), None, 4, True), ((8, 20), None, 4, True),
((1, 128), None, 10, True), ((2, 64), None, 6, True),
] ]
# w (weight) types # w (weight) types
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment