Unverified Commit 9b8de584 authored by pengcheng888's avatar pengcheng888 Committed by GitHub
Browse files

issue/473 - the ones and zeros operators


Co-authored-by: pengcheng888 <pengcheng@example.com>
parent f5e6d729
......@@ -20,6 +20,8 @@
#include "infiniop/ops/sub.h"
#include "infiniop/ops/swiglu.h"
#include "infiniop/ops/topkrouter.h"
#include "infiniop/ops/zeros.h"
#include "infiniop/ops/ones.h"
#include "infiniop/ops/topksoftmax.h"
#include "infiniop/ops/sigmoid.h"
#include "infiniop/tensor_descriptor.h"
......
#ifndef __INFINIOP_ONES_API_H__
#define __INFINIOP_ONES_API_H__
#include "../operator_descriptor.h"
// Opaque descriptor handle for the `ones` operator (the visible CPU/CUDA
// kernels fill each output element with the constant 1; x supplies shape/dtype).
typedef struct InfiniopDescriptor *infiniopOnesDescriptor_t;
// Creates a `ones` descriptor. y is the output tensor descriptor; x is the
// input tensor descriptor — implementations require x and y to share a shape.
__C __export infiniStatus_t infiniopCreateOnesDescriptor(infiniopHandle_t handle,
infiniopOnesDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t y,
infiniopTensorDescriptor_t x);
// Queries the scratch-space size (in bytes) required by infiniopOnes.
__C __export infiniStatus_t infiniopGetOnesWorkspaceSize(infiniopOnesDescriptor_t desc, size_t *size);
// Executes the operator: writes ones into y. workspace must be at least the
// size reported above; stream may be null for the default stream.
__C __export infiniStatus_t infiniopOnes(infiniopOnesDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *y,
const void *x,
void *stream);
// Releases the descriptor created by infiniopCreateOnesDescriptor.
__C __export infiniStatus_t infiniopDestroyOnesDescriptor(infiniopOnesDescriptor_t desc);
#endif
#ifndef __INFINIOP_ZEROS_API_H__
#define __INFINIOP_ZEROS_API_H__
#include "../operator_descriptor.h"
// Opaque descriptor handle for the `zeros` operator. Presumably fills y with
// zeros (mirrors the `ones` API; the zeros kernel itself is not shown here —
// TODO confirm against the kernel source).
typedef struct InfiniopDescriptor *infiniopZerosDescriptor_t;
// Creates a `zeros` descriptor. y is the output tensor descriptor; x is the
// input tensor descriptor used as the shape/dtype source.
__C __export infiniStatus_t infiniopCreateZerosDescriptor(infiniopHandle_t handle,
infiniopZerosDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t y,
infiniopTensorDescriptor_t x);
// Queries the scratch-space size (in bytes) required by infiniopZeros.
__C __export infiniStatus_t infiniopGetZerosWorkspaceSize(infiniopZerosDescriptor_t desc, size_t *size);
// Executes the operator; stream may be null for the default stream.
__C __export infiniStatus_t infiniopZeros(infiniopZerosDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *y,
const void *x,
void *stream);
// Releases the descriptor created by infiniopCreateZerosDescriptor.
__C __export infiniStatus_t infiniopDestroyZerosDescriptor(infiniopZerosDescriptor_t desc);
#endif
......@@ -26,6 +26,8 @@ def run_tests(args):
"sub.py",
"swiglu.py",
"softplus.py",
"ones.py",
"zeros.py",
"sigmoid.py",
"topkrouter.py",
"topksoftmax.py",
......
......@@ -16,6 +16,9 @@ DECLARE_INFINIOP_TEST(add)
DECLARE_INFINIOP_TEST(causal_softmax)
DECLARE_INFINIOP_TEST(rearrange)
DECLARE_INFINIOP_TEST(sub)
DECLARE_INFINIOP_TEST(zeros)
DECLARE_INFINIOP_TEST(ones)
DECLARE_INFINIOP_TEST(sigmoid)
DECLARE_INFINIOP_TEST(topkrouter)
DECLARE_INFINIOP_TEST(topksoftmax)
......@@ -45,6 +48,8 @@ DECLARE_INFINIOP_TEST(topksoftmax)
REGISTER_INFINIOP_TEST(causal_softmax) \
REGISTER_INFINIOP_TEST(rearrange) \
REGISTER_INFINIOP_TEST(sub) \
REGISTER_INFINIOP_TEST(zeros) \
REGISTER_INFINIOP_TEST(ones) \
REGISTER_INFINIOP_TEST(sigmoid) \
REGISTER_INFINIOP_TEST(topkrouter) \
REGISTER_INFINIOP_TEST(topksoftmax) \
......
#include "ops.hpp"
#include "utils.hpp"
#include <infinirt.h>
#include <iomanip>
#include <iostream>
namespace infiniop_test::ones {
// Tensors for the `ones` operator test:
//   x   - input tensor (shape/dtype source for the op)
//   y   - output tensor written by the op
//   ans - expected result compared via allClose
struct Test::Attributes {
    std::shared_ptr<Tensor> x;
    std::shared_ptr<Tensor> y;
    std::shared_ptr<Tensor> ans;
};
// Builds a test case from the deserialized tensor map; throws if any of the
// three mandatory tensors is missing.
std::shared_ptr<Test> Test::build(
    std::unordered_map<std::string, std::vector<uint8_t>> attributes,
    std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors,
    double rtol, double atol) {
    auto test = std::shared_ptr<Test>(new Test(rtol, atol));
    test->_attributes = new Attributes();
    if (tensors.find("x") == tensors.end()
        || tensors.find("y") == tensors.end()
        || tensors.find("ans") == tensors.end()) {
        throw std::runtime_error("Invalid Test");
    }
    test->_attributes->x = tensors["x"];
    test->_attributes->y = tensors["y"];
    test->_attributes->ans = tensors["ans"];
    return test;
}
// Runs the op once for correctness, then benchmarks it. Device resources
// (workspace, descriptor) are released on every exit path after creation —
// the original leaked both the workspace and the descriptor.
std::shared_ptr<infiniop_test::Result> Test::run(
    infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) {
    infiniopOnesDescriptor_t op_desc;
    auto x = _attributes->x->to(device, device_id);
    auto y = _attributes->y->to(device, device_id);
    CHECK_OR(infiniopCreateOnesDescriptor(handle, &op_desc,
                                          y->desc(),
                                          x->desc()),
             return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor."));
    size_t workspace_size;
    CHECK_OR(infiniopGetOnesWorkspaceSize(op_desc, &workspace_size),
             {
                 infiniopDestroyOnesDescriptor(op_desc);
                 return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.");
             });
    void *workspace;
    CHECK_OR(infinirtMalloc(&workspace, workspace_size),
             {
                 infiniopDestroyOnesDescriptor(op_desc);
                 return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.");
             });
    // Single cleanup routine for all paths past this point.
    auto release = [&]() {
        infinirtFree(workspace);
        infiniopDestroyOnesDescriptor(op_desc);
    };
    CHECK_OR(infiniopOnes(op_desc, workspace, workspace_size,
                          y->data(),
                          x->data(),
                          nullptr),
             {
                 release();
                 return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.");
             });
    try {
        allClose(y, _attributes->ans, _rtol, _atol);
    } catch (const std::exception &e) {
        release();
        return TEST_FAILED(RESULT_INCORRECT, e.what());
    }
    double elapsed_time = 0.;
    elapsed_time = benchmark(
        [=]() {
            infiniopOnes(
                op_desc, workspace, workspace_size,
                y->data(),
                x->data(),
                nullptr);
        },
        warm_ups, iterations);
    release();
    return TEST_PASSED(elapsed_time);
}
// This test has no scalar attributes.
std::vector<std::string> Test::attribute_names() {
    return {};
}
std::vector<std::string> Test::tensor_names() {
    return {"x", "y", "ans"};
}
std::vector<std::string> Test::output_names() {
    return {"y"};
}
// Human-readable summary used in test reports.
std::string Test::toString() const {
    std::ostringstream oss;
    oss << op_name() << std::endl;
    oss << "- x: " << _attributes->x->info() << std::endl;
    oss << "- y: " << _attributes->y->info() << std::endl;
    oss << std::scientific << std::setprecision(2);
    oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl;
    return oss.str();
}
Test::~Test() {
    delete _attributes;
}
} // namespace infiniop_test::ones
#include "ops.hpp"
#include "utils.hpp"
#include <infinirt.h>
#include <iomanip>
#include <iostream>
namespace infiniop_test::zeros {
// Tensors for the `zeros` operator test:
//   x   - input tensor (shape/dtype source for the op)
//   y   - output tensor written by the op
//   ans - expected result compared via allClose
struct Test::Attributes {
    std::shared_ptr<Tensor> x;
    std::shared_ptr<Tensor> y;
    std::shared_ptr<Tensor> ans;
};
// Builds a test case from the deserialized tensor map; throws if any of the
// three mandatory tensors is missing.
std::shared_ptr<Test> Test::build(
    std::unordered_map<std::string, std::vector<uint8_t>> attributes,
    std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors,
    double rtol, double atol) {
    auto test = std::shared_ptr<Test>(new Test(rtol, atol));
    test->_attributes = new Attributes();
    if (tensors.find("x") == tensors.end()
        || tensors.find("y") == tensors.end()
        || tensors.find("ans") == tensors.end()) {
        throw std::runtime_error("Invalid Test");
    }
    test->_attributes->x = tensors["x"];
    test->_attributes->y = tensors["y"];
    test->_attributes->ans = tensors["ans"];
    return test;
}
// Runs the op once for correctness, then benchmarks it. Device resources
// (workspace, descriptor) are released on every exit path after creation —
// the original leaked both the workspace and the descriptor.
std::shared_ptr<infiniop_test::Result> Test::run(
    infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) {
    infiniopZerosDescriptor_t op_desc;
    auto x = _attributes->x->to(device, device_id);
    auto y = _attributes->y->to(device, device_id);
    CHECK_OR(infiniopCreateZerosDescriptor(handle, &op_desc,
                                           y->desc(),
                                           x->desc()),
             return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor."));
    size_t workspace_size;
    CHECK_OR(infiniopGetZerosWorkspaceSize(op_desc, &workspace_size),
             {
                 infiniopDestroyZerosDescriptor(op_desc);
                 return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.");
             });
    void *workspace;
    CHECK_OR(infinirtMalloc(&workspace, workspace_size),
             {
                 infiniopDestroyZerosDescriptor(op_desc);
                 return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.");
             });
    // Single cleanup routine for all paths past this point.
    auto release = [&]() {
        infinirtFree(workspace);
        infiniopDestroyZerosDescriptor(op_desc);
    };
    CHECK_OR(infiniopZeros(op_desc, workspace, workspace_size,
                           y->data(),
                           x->data(),
                           nullptr),
             {
                 release();
                 return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.");
             });
    try {
        allClose(y, _attributes->ans, _rtol, _atol);
    } catch (const std::exception &e) {
        release();
        return TEST_FAILED(RESULT_INCORRECT, e.what());
    }
    double elapsed_time = 0.;
    elapsed_time = benchmark(
        [=]() {
            infiniopZeros(
                op_desc, workspace, workspace_size,
                y->data(),
                x->data(),
                nullptr);
        },
        warm_ups, iterations);
    release();
    return TEST_PASSED(elapsed_time);
}
// This test has no scalar attributes.
std::vector<std::string> Test::attribute_names() {
    return {};
}
std::vector<std::string> Test::tensor_names() {
    return {"x", "y", "ans"};
}
std::vector<std::string> Test::output_names() {
    return {"y"};
}
// Human-readable summary used in test reports.
std::string Test::toString() const {
    std::ostringstream oss;
    oss << op_name() << std::endl;
    oss << "- x: " << _attributes->x->info() << std::endl;
    oss << "- y: " << _attributes->y->info() << std::endl;
    oss << std::scientific << std::setprecision(2);
    oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl;
    return oss.str();
}
Test::~Test() {
    delete _attributes;
}
} // namespace infiniop_test::zeros
#define INFINIOP_METAX_KERNEL __global__ void
#include <hpcc_fp8.h>
// Possible maximum number of threads per block for METAX architectures
// Used for picking correct kernel launch configuration
#define METAX_BLOCK_SIZE_1024 1024
......@@ -9,6 +11,7 @@
using cuda_bfloat16 = hpcc_bfloat16;
using cuda_bfloat162 = hpcc_bfloat162;
using cuda_fp8_e4m3 = __hpcc_fp8_e4m3;
namespace device::metax {
......
......@@ -2,6 +2,7 @@
#include <musa_bf16.h>
#include <musa_fp16.h>
#include <musa_fp8.h>
// Possible maximum number of threads per block for MUSA architectures
// Used for picking correct kernel launch configuration
......@@ -13,6 +14,7 @@
using cuda_bfloat16 = mt_bfloat16;
using cuda_bfloat162 = mt_bfloat162;
using cuda_fp8_e4m3 = __mt_fp8_e4m3;
namespace device::moore {
......
......@@ -9,6 +9,7 @@
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include <cuda_fp8.h>
// Possible maximum number of threads per block for CUDA architectures
// Used for picking correct kernel launch configuration
......@@ -25,6 +26,7 @@ using cuda_bfloat162 = __nv_bfloat162;
#else
using cuda_bfloat16 = nv_bfloat16;
using cuda_bfloat162 = nv_bfloat162;
using cuda_fp8_e4m3 = __nv_fp8_e4m3;
#endif
namespace device::nvidia {
......
#include "ones_cpu.h"
namespace op::ones::cpu {
Descriptor::~Descriptor() = default;
// Creates the CPU descriptor for the `ones` elementwise operator.
// out_desc describes the output y; input_desc_vec[0] is the input x, whose
// shape must match y's.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
    auto dtype = out_desc->dtype();
    const auto &x_desc = input_desc_vec.at(0);
    const auto &y_shape = out_desc->shape();
    const auto &x_shape = x_desc->shape();
    // All supported (non-complex) dtypes. Trailing comma removed: it passed an
    // empty trailing macro argument to CHECK_DTYPE.
    // NOTE(review): F8 is accepted here but calculate() below returns
    // NOT_IMPLEMENTED for it — confirm intent.
    CHECK_DTYPE(dtype,
                INFINI_DTYPE_BYTE,
                INFINI_DTYPE_BOOL,
                INFINI_DTYPE_I8,
                INFINI_DTYPE_I16,
                INFINI_DTYPE_I32,
                INFINI_DTYPE_I64,
                INFINI_DTYPE_U8,
                INFINI_DTYPE_U16,
                INFINI_DTYPE_U32,
                INFINI_DTYPE_U64,
                INFINI_DTYPE_F8,
                INFINI_DTYPE_F16,
                INFINI_DTYPE_F32,
                INFINI_DTYPE_F64,
                INFINI_DTYPE_BF16);
    CHECK_SAME_SHAPE(y_shape, x_shape);
    // create CPU elementwise descriptor
    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
    return INFINI_STATUS_SUCCESS;
}
// Dispatches on the stored dtype and fills the output with ones via OnesOp.
// Complex dtypes and F8 are not implemented.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    switch (_dtype) {
    case INFINI_DTYPE_BYTE: // raw 8-bit storage, treated as uint8_t
        return _device_info->calculate<OnesOp, uint8_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_BOOL:
        return _device_info->calculate<OnesOp, bool>(_info, output, inputs, stream);
    case INFINI_DTYPE_I8:
        return _device_info->calculate<OnesOp, int8_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_I16:
        return _device_info->calculate<OnesOp, int16_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_I32:
        return _device_info->calculate<OnesOp, int32_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_I64:
        return _device_info->calculate<OnesOp, int64_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_U8:
        return _device_info->calculate<OnesOp, uint8_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_U16:
        return _device_info->calculate<OnesOp, uint16_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_U32:
        return _device_info->calculate<OnesOp, uint32_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_U64:
        return _device_info->calculate<OnesOp, uint64_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_F8: // accepted by create() but has no CPU kernel yet
        return INFINI_STATUS_NOT_IMPLEMENTED;
    case INFINI_DTYPE_F16:
        return _device_info->calculate<OnesOp, fp16_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_F32:
        return _device_info->calculate<OnesOp, float>(_info, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return _device_info->calculate<OnesOp, double>(_info, output, inputs, stream);
    case INFINI_DTYPE_C16:
    case INFINI_DTYPE_C32:
    case INFINI_DTYPE_C64:
    case INFINI_DTYPE_C128: // complex types are not supported
        return INFINI_STATUS_NOT_IMPLEMENTED;
    case INFINI_DTYPE_BF16:
        return _device_info->calculate<OnesOp, bf16_t>(_info, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    return INFINI_STATUS_SUCCESS;
}
} // namespace op::ones::cpu
#ifndef __ONES_CPU_H__
#define __ONES_CPU_H__
#include "../../../elementwise/cpu/elementwise_cpu.h"
ELEMENTWISE_DESCRIPTOR(ones, cpu)
namespace op::ones::cpu {
// Fill functor for the elementwise launcher: ignores its single input
// element and produces the constant one in the element type.
struct OnesOp {
    // The operator formally consumes one input tensor (shape/dtype source).
    static constexpr size_t num_inputs = 1;

    template <typename Elem>
    Elem operator()(const Elem & /*ignored*/) const {
        return static_cast<Elem>(1.0);
    }
};
} // namespace op::ones::cpu
#endif // __ONES_CPU_H__
#ifndef __ONES_CUDA_H__
#define __ONES_CUDA_H__

namespace op::ones::cuda {
// Device-side fill functor: ignores its input element and returns the
// constant one in the element type T. The original chain tested
// std::is_same_v<T, uint8_t> twice (branches //2 and //7), leaving the second
// branch dead; all integral cases are collapsed into one branch here.
typedef struct OnesOp {
public:
    // The operator formally consumes one input tensor (shape/dtype source).
    static constexpr size_t num_inputs = 1;
    template <typename T>
    __device__ __forceinline__ T operator()(const T &x) const {
        if constexpr (std::is_same_v<T, bool>) {
            return true;
        } else if constexpr (std::is_integral_v<T>) {
            // int8/16/32/64, uint8/16/32/64 (BYTE included)
            return T(1);
        } else if constexpr (std::is_same_v<T, cuda_fp8_e4m3>) {
            return cuda_fp8_e4m3(1.0f);
        } else if constexpr (std::is_same_v<T, half>) {
            return __float2half(1.0f);
        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
            return __float2bfloat16(1.0f);
        } else {
            // float, double, and any other arithmetic type
            return static_cast<T>(1.0);
        }
    }
} OnesOp;
} // namespace op::ones::cuda

#endif // __ONES_CUDA_H__
#ifndef __ONES_METAX_API_H__
#define __ONES_METAX_API_H__
#include "../../../elementwise/metax/elementwise_metax_api.h"
// Declares op::ones::metax::Descriptor via the shared elementwise template.
ELEMENTWISE_DESCRIPTOR(ones, metax)
#endif // __ONES_METAX_API_H__
#include "ones_metax.h"
#include "../../../elementwise/metax/elementwise_metax.h"
#include "../cuda/kernel.cuh"
namespace op::ones::metax {
Descriptor::~Descriptor() = default;
// Creates the Metax descriptor for the `ones` elementwise operator.
// out_desc describes the output y; input_desc_vec[0] is the input x, whose
// shape must match y's.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
    auto dtype = out_desc->dtype();
    const auto &x_desc = input_desc_vec.at(0);
    const auto &y_shape = out_desc->shape();
    const auto &x_shape = x_desc->shape();
    // All supported (non-complex) dtypes. Trailing comma removed: it passed an
    // empty trailing macro argument to CHECK_DTYPE.
    CHECK_DTYPE(dtype,
                INFINI_DTYPE_BYTE,
                INFINI_DTYPE_BOOL,
                INFINI_DTYPE_I8,
                INFINI_DTYPE_I16,
                INFINI_DTYPE_I32,
                INFINI_DTYPE_I64,
                INFINI_DTYPE_U8,
                INFINI_DTYPE_U16,
                INFINI_DTYPE_U32,
                INFINI_DTYPE_U64,
                INFINI_DTYPE_F8,
                INFINI_DTYPE_F16,
                INFINI_DTYPE_F32,
                INFINI_DTYPE_F64,
                INFINI_DTYPE_BF16);
    CHECK_SAME_SHAPE(y_shape, x_shape);
    // create Metax elementwise descriptor
    CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
    return INFINI_STATUS_SUCCESS;
}
// Dispatches on the stored dtype and launches the OnesOp kernel.
// Fix: the original mapped BYTE -> bool and BOOL -> uint8_t, the reverse of
// the nvidia implementation; BYTE now maps to uint8_t and BOOL to bool.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    switch (_dtype) {
    case INFINI_DTYPE_BYTE: // raw 8-bit storage, treated as uint8_t
        return _device_info->calculate<256, cuda::OnesOp, uint8_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_BOOL:
        return _device_info->calculate<256, cuda::OnesOp, bool>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_I8:
        return _device_info->calculate<256, cuda::OnesOp, int8_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_I16:
        return _device_info->calculate<256, cuda::OnesOp, int16_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_I32:
        return _device_info->calculate<256, cuda::OnesOp, int32_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_I64:
        return _device_info->calculate<256, cuda::OnesOp, int64_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_U8:
        return _device_info->calculate<256, cuda::OnesOp, uint8_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_U16:
        return _device_info->calculate<256, cuda::OnesOp, uint16_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_U32:
        return _device_info->calculate<256, cuda::OnesOp, uint32_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_U64:
        return _device_info->calculate<256, cuda::OnesOp, uint64_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F8:
        return _device_info->calculate<256, cuda::OnesOp, cuda_fp8_e4m3>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F16:
        return _device_info->calculate<256, cuda::OnesOp, half>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F32:
        return _device_info->calculate<256, cuda::OnesOp, float>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return _device_info->calculate<256, cuda::OnesOp, double>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_C16:
    case INFINI_DTYPE_C32:
    case INFINI_DTYPE_C64:
    case INFINI_DTYPE_C128: // complex types are not supported
        return INFINI_STATUS_NOT_IMPLEMENTED;
    case INFINI_DTYPE_BF16:
        return _device_info->calculate<256, cuda::OnesOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    return INFINI_STATUS_SUCCESS;
}
} // namespace op::ones::metax
#ifndef __ONES_MOORE_API_H__
#define __ONES_MOORE_API_H__
#include "../../../elementwise/moore/elementwise_moore_api.h"
// Declares op::ones::moore::Descriptor via the shared elementwise template.
ELEMENTWISE_DESCRIPTOR(ones, moore)
#endif // __ONES_MOORE_API_H__
#include "ones_moore.h"
#include "../../../elementwise/moore/elementwise_moore.h"
#include "../cuda/kernel.cuh"
namespace op::ones::moore {
Descriptor::~Descriptor() = default;
// Creates the Moore descriptor for the `ones` elementwise operator.
// out_desc describes the output y; input_desc_vec[0] is the input x, whose
// shape must match y's.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
    auto dtype = out_desc->dtype();
    const auto &x_desc = input_desc_vec.at(0);
    const auto &y_shape = out_desc->shape();
    const auto &x_shape = x_desc->shape();
    // All supported (non-complex) dtypes. Trailing comma removed: it passed an
    // empty trailing macro argument to CHECK_DTYPE.
    CHECK_DTYPE(dtype,
                INFINI_DTYPE_BYTE,
                INFINI_DTYPE_BOOL,
                INFINI_DTYPE_I8,
                INFINI_DTYPE_I16,
                INFINI_DTYPE_I32,
                INFINI_DTYPE_I64,
                INFINI_DTYPE_U8,
                INFINI_DTYPE_U16,
                INFINI_DTYPE_U32,
                INFINI_DTYPE_U64,
                INFINI_DTYPE_F8,
                INFINI_DTYPE_F16,
                INFINI_DTYPE_F32,
                INFINI_DTYPE_F64,
                INFINI_DTYPE_BF16);
    CHECK_SAME_SHAPE(y_shape, x_shape);
    // create MOORE elementwise descriptor
    CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
    return INFINI_STATUS_SUCCESS;
}
// Dispatches on the stored dtype and launches the OnesOp kernel.
// Fix: the original mapped BYTE -> bool and BOOL -> uint8_t, the reverse of
// the nvidia implementation; BYTE now maps to uint8_t and BOOL to bool.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    switch (_dtype) {
    case INFINI_DTYPE_BYTE: // raw 8-bit storage, treated as uint8_t
        return _device_info->calculate<256, cuda::OnesOp, uint8_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_BOOL:
        return _device_info->calculate<256, cuda::OnesOp, bool>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_I8:
        return _device_info->calculate<256, cuda::OnesOp, int8_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_I16:
        return _device_info->calculate<256, cuda::OnesOp, int16_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_I32:
        return _device_info->calculate<256, cuda::OnesOp, int32_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_I64:
        return _device_info->calculate<256, cuda::OnesOp, int64_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_U8:
        return _device_info->calculate<256, cuda::OnesOp, uint8_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_U16:
        return _device_info->calculate<256, cuda::OnesOp, uint16_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_U32:
        return _device_info->calculate<256, cuda::OnesOp, uint32_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_U64:
        return _device_info->calculate<256, cuda::OnesOp, uint64_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F8:
        return _device_info->calculate<256, cuda::OnesOp, cuda_fp8_e4m3>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F16:
        return _device_info->calculate<256, cuda::OnesOp, half>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F32:
        return _device_info->calculate<256, cuda::OnesOp, float>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return _device_info->calculate<256, cuda::OnesOp, double>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_C16:
    case INFINI_DTYPE_C32:
    case INFINI_DTYPE_C64:
    case INFINI_DTYPE_C128: // complex types are not supported
        return INFINI_STATUS_NOT_IMPLEMENTED;
    case INFINI_DTYPE_BF16:
        return _device_info->calculate<256, cuda::OnesOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    return INFINI_STATUS_SUCCESS;
}
} // namespace op::ones::moore
#ifndef __ONES_MOORE_KERNEL_H__
#define __ONES_MOORE_KERNEL_H__

// NOTE(review): this Moore kernel header includes <cuda_fp8.h>, while the
// Moore/MUSA common header in this change uses <musa_fp8.h> — confirm the
// intended header for MUSA builds.
#include <cuda_fp8.h>

namespace op::ones::cuda {
// Device-side fill functor: ignores its input element and returns the
// constant one in the element type T. The original chain tested
// std::is_same_v<T, uint8_t> twice (branches //2 and //7), leaving the second
// branch dead; all integral cases are collapsed into one branch here.
typedef struct OnesOp {
public:
    // The operator formally consumes one input tensor (shape/dtype source).
    static constexpr size_t num_inputs = 1;
    template <typename T>
    __device__ __forceinline__ T operator()(const T &x) const {
        if constexpr (std::is_same_v<T, bool>) {
            return true;
        } else if constexpr (std::is_integral_v<T>) {
            // int8/16/32/64, uint8/16/32/64 (BYTE included)
            return T(1);
        } else if constexpr (std::is_same_v<T, cuda_fp8_e4m3>) {
            return cuda_fp8_e4m3(1.0f);
        } else if constexpr (std::is_same_v<T, half>) {
            return __float2half(1.0f);
        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
            return __float2bfloat16(1.0f);
        } else {
            // float, double, and any other arithmetic type
            return static_cast<T>(1.0);
        }
    }
} OnesOp;
} // namespace op::ones::cuda

#endif // __ONES_MOORE_KERNEL_H__
#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
#include "../cuda/kernel.cuh"
#include "ones_nvidia.cuh"
namespace op::ones::nvidia {
Descriptor::~Descriptor() = default;
// Creates the CUDA descriptor for the `ones` elementwise operator.
// out_desc describes the output y; input_desc_vec[0] is the input x, whose
// shape must match y's.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
    auto dtype = out_desc->dtype();
    const auto &x_desc = input_desc_vec.at(0);
    const auto &y_shape = out_desc->shape();
    const auto &x_shape = x_desc->shape();
    // All supported (non-complex) dtypes. Trailing comma removed: it passed an
    // empty trailing macro argument to CHECK_DTYPE.
    CHECK_DTYPE(dtype,
                INFINI_DTYPE_BYTE,
                INFINI_DTYPE_BOOL,
                INFINI_DTYPE_I8,
                INFINI_DTYPE_I16,
                INFINI_DTYPE_I32,
                INFINI_DTYPE_I64,
                INFINI_DTYPE_U8,
                INFINI_DTYPE_U16,
                INFINI_DTYPE_U32,
                INFINI_DTYPE_U64,
                INFINI_DTYPE_F8,
                INFINI_DTYPE_F16,
                INFINI_DTYPE_F32,
                INFINI_DTYPE_F64,
                INFINI_DTYPE_BF16);
    CHECK_SAME_SHAPE(y_shape, x_shape);
    // create CUDA elementwise descriptor
    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
    return INFINI_STATUS_SUCCESS;
}
// Dispatches on the stored dtype and launches the OnesOp kernel with a
// 256-thread block configuration.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }
    switch (_dtype) {
    case INFINI_DTYPE_BYTE: // raw 8-bit storage, treated as uint8_t
        return _device_info->calculate<256, cuda::OnesOp, uint8_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_BOOL:
        return _device_info->calculate<256, cuda::OnesOp, bool>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_I8:
        return _device_info->calculate<256, cuda::OnesOp, int8_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_I16:
        return _device_info->calculate<256, cuda::OnesOp, int16_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_I32:
        return _device_info->calculate<256, cuda::OnesOp, int32_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_I64:
        return _device_info->calculate<256, cuda::OnesOp, int64_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_U8:
        return _device_info->calculate<256, cuda::OnesOp, uint8_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_U16:
        return _device_info->calculate<256, cuda::OnesOp, uint16_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_U32:
        return _device_info->calculate<256, cuda::OnesOp, uint32_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_U64:
        return _device_info->calculate<256, cuda::OnesOp, uint64_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F8:
        return _device_info->calculate<256, cuda::OnesOp, cuda_fp8_e4m3>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F16:
        return _device_info->calculate<256, cuda::OnesOp, half>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F32:
        return _device_info->calculate<256, cuda::OnesOp, float>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return _device_info->calculate<256, cuda::OnesOp, double>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_C16:
    case INFINI_DTYPE_C32:
    case INFINI_DTYPE_C64:
    case INFINI_DTYPE_C128: // complex types are not supported
        return INFINI_STATUS_NOT_IMPLEMENTED;
    case INFINI_DTYPE_BF16:
        return _device_info->calculate<256, cuda::OnesOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    return INFINI_STATUS_SUCCESS;
}
} // namespace op::ones::nvidia
#ifndef __ONES_CUDA_API_H__
#define __ONES_CUDA_API_H__
#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
// Declares op::ones::nvidia::Descriptor via the shared elementwise template.
ELEMENTWISE_DESCRIPTOR(ones, nvidia)
#endif // __ONES_CUDA_API_H__
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment