Unverified Commit 9b8de584 authored by pengcheng888's avatar pengcheng888 Committed by GitHub
Browse files

issue/473 - the ones and zeros operators


Co-authored-by: default avatarpengcheng888 <pengcheng@example.com>
parent f5e6d729
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/ones.h"
#ifdef ENABLE_CPU_API
#include "cpu/ones_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
#include "nvidia/ones_nvidia.cuh"
#endif
#ifdef ENABLE_METAX_API
#include "metax/ones_metax.h"
#endif
#ifdef ENABLE_MOORE_API
#include "moore/ones_moore.h"
#endif
// Creates a Ones descriptor for the device held by `handle`.
// The unary (y, x) signature is forwarded to the backend's generic
// elementwise create(handle, desc, out_desc, {input_descs}).
__C infiniStatus_t infiniopCreateOnesDescriptor(
    infiniopHandle_t handle,
    infiniopOnesDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc) {

#define CREATE(CASE, NAMESPACE)                                             \
    case CASE:                                                              \
        return op::ones::NAMESPACE::Descriptor::create(                     \
            handle,                                                         \
            reinterpret_cast<op::ones::NAMESPACE::Descriptor **>(desc_ptr), \
            y_desc,                                                         \
            {x_desc})

    // Dispatch on the runtime device type; only compiled-in backends have cases.
    switch (handle->device) {

#ifdef ENABLE_CPU_API
        CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        // Iluvatar reuses the nvidia backend implementation.
        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_METAX_API
        CREATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_MOORE_API
        CREATE(INFINI_DEVICE_MOORE, moore);
#endif

    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CREATE
}
// Reports the workspace size (in bytes) required by a Ones descriptor.
__C infiniStatus_t infiniopGetOnesWorkspaceSize(infiniopOnesDescriptor_t desc, size_t *size) {

#define GET(CASE, NAMESPACE)                                                             \
    case CASE:                                                                           \
        *size = reinterpret_cast<op::ones::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
        return INFINI_STATUS_SUCCESS

    switch (desc->device_type) {

#ifdef ENABLE_CPU_API
        GET(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        // Iluvatar reuses the nvidia backend implementation.
        GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_METAX_API
        GET(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_MOORE_API
        GET(INFINI_DEVICE_MOORE, moore);
#endif

    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef GET

    // Unreachable: every switch path returns above. Kept (apparently) to
    // satisfy compilers that warn about control reaching the end of the
    // function; the other dispatchers in this file omit it.
    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
// Executes the Ones operator: fills `y` with ones, using `x` only for
// shape/layout (the elementwise framework receives it as the single input).
__C infiniStatus_t infiniopOnes(
    infiniopOnesDescriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *y,
    const void *x,
    void *stream) {

#define CALCULATE(CASE, NAMESPACE)                                          \
    case CASE:                                                              \
        return reinterpret_cast<const op::ones::NAMESPACE::Descriptor *>(desc) \
            ->calculate(workspace, workspace_size, y, {x}, stream)

    switch (desc->device_type) {

#ifdef ENABLE_CPU_API
        CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        // Iluvatar reuses the nvidia backend implementation.
        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_METAX_API
        CALCULATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_MOORE_API
        CALCULATE(INFINI_DEVICE_MOORE, moore);
#endif

    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CALCULATE
}
// Destroys a Ones descriptor previously created by
// infiniopCreateOnesDescriptor, deleting the backend-specific object.
__C infiniStatus_t
infiniopDestroyOnesDescriptor(infiniopOnesDescriptor_t desc) {

#define DELETE(CASE, NAMESPACE)                                             \
    case CASE:                                                              \
        delete reinterpret_cast<const op::ones::NAMESPACE::Descriptor *>(desc); \
        return INFINI_STATUS_SUCCESS

    switch (desc->device_type) {

#ifdef ENABLE_CPU_API
        DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        // Iluvatar reuses the nvidia backend implementation.
        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_METAX_API
        DELETE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_MOORE_API
        DELETE(INFINI_DEVICE_MOORE, moore);
#endif

    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef DELETE
}
#include "zeros_cpu.h"
namespace op::zeros::cpu {

Descriptor::~Descriptor() = default;

// Validates descriptors and builds the CPU elementwise descriptor for Zeros.
// Exactly one input (x) is expected; its shape must match the output's, and
// its values are ignored at compute time (ZerosOp discards its argument).
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
    auto dtype = out_desc->dtype();

    const auto &x_desc = input_desc_vec.at(0);
    const auto &y_shape = out_desc->shape();
    const auto &x_shape = x_desc->shape();

    // NOTE(review): F8 passes this check but calculate() returns
    // NOT_IMPLEMENTED for it below — confirm whether F8 should be rejected here.
    CHECK_DTYPE(dtype,
                INFINI_DTYPE_BYTE, // 1
                INFINI_DTYPE_BOOL, // 2
                INFINI_DTYPE_I8,   // 3
                INFINI_DTYPE_I16,  // 4
                INFINI_DTYPE_I32,  // 5
                INFINI_DTYPE_I64,  // 6
                INFINI_DTYPE_U8,   // 7
                INFINI_DTYPE_U16,  // 8
                INFINI_DTYPE_U32,  // 9
                INFINI_DTYPE_U64,  // 10
                INFINI_DTYPE_F8,   // 11
                INFINI_DTYPE_F16,  // 12
                INFINI_DTYPE_F32,  // 13
                INFINI_DTYPE_F64,  // 14
                INFINI_DTYPE_BF16, // 19
    );

    CHECK_SAME_SHAPE(y_shape, x_shape);

    // create CPU elementwise descriptor
    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)

    return INFINI_STATUS_SUCCESS;
}

// Fills `output` with zeros of the descriptor's dtype by dispatching the
// templated elementwise kernel on _dtype. Complex and F8 types are accepted
// by the enum but not implemented.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    switch (_dtype) {
    case INFINI_DTYPE_BYTE: // 1
        return _device_info->calculate<ZerosOp, uint8_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_BOOL: // 2
        return _device_info->calculate<ZerosOp, bool>(_info, output, inputs, stream);
    case INFINI_DTYPE_I8: // 3
        return _device_info->calculate<ZerosOp, int8_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_I16: // 4
        return _device_info->calculate<ZerosOp, int16_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_I32: // 5
        return _device_info->calculate<ZerosOp, int32_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_I64: // 6
        return _device_info->calculate<ZerosOp, int64_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_U8: // 7
        return _device_info->calculate<ZerosOp, uint8_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_U16: // 8
        return _device_info->calculate<ZerosOp, uint16_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_U32: // 9
        return _device_info->calculate<ZerosOp, uint32_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_U64: // 10
        return _device_info->calculate<ZerosOp, uint64_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_F8: // 11
        return INFINI_STATUS_NOT_IMPLEMENTED;
    case INFINI_DTYPE_F16: // 12
        return _device_info->calculate<ZerosOp, fp16_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_F32: // 13
        return _device_info->calculate<ZerosOp, float>(_info, output, inputs, stream);
    case INFINI_DTYPE_F64: // 14
        return _device_info->calculate<ZerosOp, double>(_info, output, inputs, stream);
    case INFINI_DTYPE_C16: // 15
        return INFINI_STATUS_NOT_IMPLEMENTED;
    case INFINI_DTYPE_C32: // 16
        return INFINI_STATUS_NOT_IMPLEMENTED;
    case INFINI_DTYPE_C64: // 17
        return INFINI_STATUS_NOT_IMPLEMENTED;
    case INFINI_DTYPE_C128: // 18
        return INFINI_STATUS_NOT_IMPLEMENTED;
    case INFINI_DTYPE_BF16: // 19
        return _device_info->calculate<ZerosOp, bf16_t>(_info, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    // Unreachable: every case above returns.
    return INFINI_STATUS_SUCCESS;
}
} // namespace op::zeros::cpu
#ifndef __ZEROS_CPU_H__
#define __ZEROS_CPU_H__
#include "../../../elementwise/cpu/elementwise_cpu.h"
ELEMENTWISE_DESCRIPTOR(zeros, cpu)
namespace op::zeros::cpu {

// Elementwise functor for the Zeros operator: discards the input element and
// yields zero of the element type. num_inputs is the arity contract consumed
// by the generic elementwise framework.
struct ZerosOp {
public:
    static constexpr size_t num_inputs = 1;

    // Returns the zero value of T; the argument only fixes the type.
    template <typename T>
    T operator()(const T &input) const {
        return static_cast<T>(0.0);
    }
};

} // namespace op::zeros::cpu
#endif // __ZEROS_CPU_H__
#ifndef __ZEROS_CUDA_H__
#define __ZEROS_CUDA_H__
namespace op::zeros::cuda {

// Device-side elementwise functor for Zeros: ignores the input element and
// yields the zero of type T. num_inputs is the arity contract consumed by
// the generic elementwise framework.
//
// Fix: the original if-constexpr chain listed std::is_same_v<T, uint8_t>
// twice (branches "// 2" and "// 7"), making the second branch dead code.
// The chain is collapsed: only types needing explicit conversion
// intrinsics/constructors are special-cased; every other supported type
// (bool, all integer widths, float, double) is value-initialized, which is
// exactly the zero the per-type branches produced.
struct ZerosOp {
public:
    static constexpr size_t num_inputs = 1;

    template <typename T>
    __device__ __forceinline__ T operator()(const T & /*x*/) const {
        if constexpr (std::is_same_v<T, half>) { // F16
            return __float2half(0.0f);
        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) { // BF16
            return __float2bfloat16(0.0f);
        } else if constexpr (std::is_same_v<T, cuda_fp8_e4m3>) { // F8 E4M3
            return cuda_fp8_e4m3(0.0f);
        } else {
            return T{};
        }
    }
};

} // namespace op::zeros::cuda
#endif // __ZEROS_CUDA_H__
#ifndef __ZEROS_METAX_API_H__
#define __ZEROS_METAX_API_H__
#include "../../../elementwise/metax/elementwise_metax_api.h"
ELEMENTWISE_DESCRIPTOR(zeros, metax)
#endif // __ZEROS_METAX_API_H__
#include "zeros_metax.h"
#include "../../../elementwise/metax/elementwise_metax.h"
#include "../cuda/kernel.cuh"
namespace op::zeros::metax {

Descriptor::~Descriptor() = default;

// Validates descriptors and builds the METAX elementwise descriptor for
// Zeros. Exactly one input (x) is expected; its shape must match the
// output's, and its values are ignored at compute time.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
    auto dtype = out_desc->dtype();

    const auto &x_desc = input_desc_vec.at(0);
    const auto &y_shape = out_desc->shape();
    const auto &x_shape = x_desc->shape();

    CHECK_DTYPE(dtype,
                INFINI_DTYPE_BYTE,  // 1
                INFINI_DTYPE_BOOL,  // 2
                INFINI_DTYPE_I8,    // 3
                INFINI_DTYPE_I16,   // 4
                INFINI_DTYPE_I32,   // 5
                INFINI_DTYPE_I64,   // 6
                INFINI_DTYPE_U8,    // 7
                INFINI_DTYPE_U16,   // 8
                INFINI_DTYPE_U32,   // 9
                INFINI_DTYPE_U64,   // 10
                INFINI_DTYPE_F8,    // 11
                INFINI_DTYPE_F16,   // 12
                INFINI_DTYPE_F32,   // 13
                INFINI_DTYPE_F64,   // 14
                INFINI_DTYPE_BF16); // 19

    CHECK_SAME_SHAPE(y_shape, x_shape);

    // create METAX elementwise descriptor
    CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)

    return INFINI_STATUS_SUCCESS;
}

// Fills `output` with zeros of the descriptor's dtype by dispatching the
// templated elementwise kernel (block size 256) on _dtype.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }

    switch (_dtype) {
    case INFINI_DTYPE_BYTE: // 1
        // Fix: BYTE (raw 8-bit) now dispatches as uint8_t; it was swapped
        // with BOOL, diverging from the nvidia backend.
        return _device_info->calculate<256, cuda::ZerosOp, uint8_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_BOOL: // 2
        // Fix: BOOL now dispatches as bool (was uint8_t).
        return _device_info->calculate<256, cuda::ZerosOp, bool>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_I8: // 3
        return _device_info->calculate<256, cuda::ZerosOp, int8_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_I16: // 4
        return _device_info->calculate<256, cuda::ZerosOp, int16_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_I32: // 5
        return _device_info->calculate<256, cuda::ZerosOp, int32_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_I64: // 6
        return _device_info->calculate<256, cuda::ZerosOp, int64_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_U8: // 7
        return _device_info->calculate<256, cuda::ZerosOp, uint8_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_U16: // 8
        return _device_info->calculate<256, cuda::ZerosOp, uint16_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_U32: // 9
        return _device_info->calculate<256, cuda::ZerosOp, uint32_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_U64: // 10
        return _device_info->calculate<256, cuda::ZerosOp, uint64_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F8: // 11
        return _device_info->calculate<256, cuda::ZerosOp, cuda_fp8_e4m3>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F16: // 12
        return _device_info->calculate<256, cuda::ZerosOp, half>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F32: // 13
        return _device_info->calculate<256, cuda::ZerosOp, float>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F64: // 14
        return _device_info->calculate<256, cuda::ZerosOp, double>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_C16: // 15
        return INFINI_STATUS_NOT_IMPLEMENTED;
    case INFINI_DTYPE_C32: // 16
        return INFINI_STATUS_NOT_IMPLEMENTED;
    case INFINI_DTYPE_C64: // 17
        return INFINI_STATUS_NOT_IMPLEMENTED;
    case INFINI_DTYPE_C128: // 18
        return INFINI_STATUS_NOT_IMPLEMENTED;
    case INFINI_DTYPE_BF16: // 19
        return _device_info->calculate<256, cuda::ZerosOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    // Unreachable: every case above returns.
    return INFINI_STATUS_SUCCESS;
}
} // namespace op::zeros::metax
#ifndef __ZEROS_MOORE_API_H__
#define __ZEROS_MOORE_API_H__
#include "../../../elementwise/moore/elementwise_moore_api.h"
ELEMENTWISE_DESCRIPTOR(zeros, moore)
#endif // __ZEROS_MOORE_API_H__
#include "zeros_moore.h"
#include "../../../elementwise/moore/elementwise_moore.h"
#include "../cuda/kernel.cuh"
namespace op::zeros::moore {

Descriptor::~Descriptor() = default;

// Validates descriptors and builds the MOORE elementwise descriptor for
// Zeros. Exactly one input (x) is expected; its shape must match the
// output's, and its values are ignored at compute time.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
    auto dtype = out_desc->dtype();

    const auto &x_desc = input_desc_vec.at(0);
    const auto &y_shape = out_desc->shape();
    const auto &x_shape = x_desc->shape();

    CHECK_DTYPE(dtype,
                INFINI_DTYPE_BYTE,  // 1
                INFINI_DTYPE_BOOL,  // 2
                INFINI_DTYPE_I8,    // 3
                INFINI_DTYPE_I16,   // 4
                INFINI_DTYPE_I32,   // 5
                INFINI_DTYPE_I64,   // 6
                INFINI_DTYPE_U8,    // 7
                INFINI_DTYPE_U16,   // 8
                INFINI_DTYPE_U32,   // 9
                INFINI_DTYPE_U64,   // 10
                INFINI_DTYPE_F8,    // 11
                INFINI_DTYPE_F16,   // 12
                INFINI_DTYPE_F32,   // 13
                INFINI_DTYPE_F64,   // 14
                INFINI_DTYPE_BF16); // 19

    CHECK_SAME_SHAPE(y_shape, x_shape);

    // create MOORE elementwise descriptor
    CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)

    return INFINI_STATUS_SUCCESS;
}

// Fills `output` with zeros of the descriptor's dtype by dispatching the
// templated elementwise kernel (block size 256) on _dtype.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }

    switch (_dtype) {
    case INFINI_DTYPE_BYTE: // 1
        // Fix: BYTE (raw 8-bit) now dispatches as uint8_t; it was swapped
        // with BOOL, diverging from the nvidia backend.
        return _device_info->calculate<256, cuda::ZerosOp, uint8_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_BOOL: // 2
        // Fix: BOOL now dispatches as bool (was uint8_t).
        return _device_info->calculate<256, cuda::ZerosOp, bool>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_I8: // 3
        return _device_info->calculate<256, cuda::ZerosOp, int8_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_I16: // 4
        return _device_info->calculate<256, cuda::ZerosOp, int16_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_I32: // 5
        return _device_info->calculate<256, cuda::ZerosOp, int32_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_I64: // 6
        return _device_info->calculate<256, cuda::ZerosOp, int64_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_U8: // 7
        return _device_info->calculate<256, cuda::ZerosOp, uint8_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_U16: // 8
        return _device_info->calculate<256, cuda::ZerosOp, uint16_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_U32: // 9
        return _device_info->calculate<256, cuda::ZerosOp, uint32_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_U64: // 10
        return _device_info->calculate<256, cuda::ZerosOp, uint64_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F8: // 11
        return _device_info->calculate<256, cuda::ZerosOp, cuda_fp8_e4m3>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F16: // 12
        return _device_info->calculate<256, cuda::ZerosOp, half>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F32: // 13
        return _device_info->calculate<256, cuda::ZerosOp, float>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F64: // 14
        return _device_info->calculate<256, cuda::ZerosOp, double>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_C16: // 15
        return INFINI_STATUS_NOT_IMPLEMENTED;
    case INFINI_DTYPE_C32: // 16
        return INFINI_STATUS_NOT_IMPLEMENTED;
    case INFINI_DTYPE_C64: // 17
        return INFINI_STATUS_NOT_IMPLEMENTED;
    case INFINI_DTYPE_C128: // 18
        return INFINI_STATUS_NOT_IMPLEMENTED;
    case INFINI_DTYPE_BF16: // 19
        return _device_info->calculate<256, cuda::ZerosOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    // Unreachable: every case above returns.
    return INFINI_STATUS_SUCCESS;
}
} // namespace op::zeros::moore
#ifndef __ZEROS_MOORE_KERNEL_H__
#define __ZEROS_MOORE_KERNEL_H__
#include <cuda_fp8.h>
namespace op::zeros::cuda {

// Device-side elementwise functor for Zeros: ignores the input element and
// yields the zero of type T.
//
// Fix: the original if-constexpr chain listed std::is_same_v<T, uint8_t>
// twice (branches "// 2" and "// 7"), making the second branch dead code.
// The chain is collapsed: only types needing explicit conversion
// intrinsics/constructors are special-cased; every other supported type
// (bool, all integer widths, float, double) is value-initialized, which is
// exactly the zero the per-type branches produced.
//
// NOTE(review): this header defines the same op::zeros::cuda::ZerosOp as
// ../cuda/kernel.cuh (which zeros_moore.mu also includes) — including both
// in one translation unit would violate the ODR; confirm only one is used.
struct ZerosOp {
public:
    static constexpr size_t num_inputs = 1;

    template <typename T>
    __device__ __forceinline__ T operator()(const T & /*x*/) const {
        if constexpr (std::is_same_v<T, half>) { // F16
            return __float2half(0.0f);
        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) { // BF16
            return __float2bfloat16(0.0f);
        } else if constexpr (std::is_same_v<T, cuda_fp8_e4m3>) { // F8 E4M3
            return cuda_fp8_e4m3(0.0f);
        } else {
            return T{};
        }
    }
};

} // namespace op::zeros::cuda
#endif // __ZEROS_MOORE_KERNEL_H__
#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
#include "../cuda/kernel.cuh"
#include "zeros_nvidia.cuh"
namespace op::zeros::nvidia {

Descriptor::~Descriptor() = default;

// Validates descriptors and builds the CUDA elementwise descriptor for
// Zeros. Exactly one input (x) is expected; its shape must match the
// output's, and its values are ignored at compute time.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
    auto dtype = out_desc->dtype();

    const auto &x_desc = input_desc_vec.at(0);
    const auto &y_shape = out_desc->shape();
    const auto &x_shape = x_desc->shape();

    CHECK_DTYPE(dtype,
                INFINI_DTYPE_BYTE, // 1
                INFINI_DTYPE_BOOL, // 2
                INFINI_DTYPE_I8,   // 3
                INFINI_DTYPE_I16,  // 4
                INFINI_DTYPE_I32,  // 5
                INFINI_DTYPE_I64,  // 6
                INFINI_DTYPE_U8,   // 7
                INFINI_DTYPE_U16,  // 8
                INFINI_DTYPE_U32,  // 9
                INFINI_DTYPE_U64,  // 10
                INFINI_DTYPE_F8,   // 11
                INFINI_DTYPE_F16,  // 12
                INFINI_DTYPE_F32,  // 13
                INFINI_DTYPE_F64,  // 14
                INFINI_DTYPE_BF16, // 19
    );

    CHECK_SAME_SHAPE(y_shape, x_shape);

    // create CUDA elementwise descriptor
    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)

    return INFINI_STATUS_SUCCESS;
}

// Fills `output` with zeros of the descriptor's dtype by dispatching the
// templated elementwise kernel (block size 256) on _dtype.
// This dtype->C++-type mapping (BYTE->uint8_t, BOOL->bool) is the reference
// the other GPU backends should agree with.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }

    switch (_dtype) {
    case INFINI_DTYPE_BYTE: // 1
        return _device_info->calculate<256, cuda::ZerosOp, uint8_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_BOOL: // 2
        return _device_info->calculate<256, cuda::ZerosOp, bool>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_I8: // 3
        return _device_info->calculate<256, cuda::ZerosOp, int8_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_I16: // 4
        return _device_info->calculate<256, cuda::ZerosOp, int16_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_I32: // 5
        return _device_info->calculate<256, cuda::ZerosOp, int32_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_I64: // 6
        return _device_info->calculate<256, cuda::ZerosOp, int64_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_U8: // 7
        return _device_info->calculate<256, cuda::ZerosOp, uint8_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_U16: // 8
        return _device_info->calculate<256, cuda::ZerosOp, uint16_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_U32: // 9
        return _device_info->calculate<256, cuda::ZerosOp, uint32_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_U64: // 10
        return _device_info->calculate<256, cuda::ZerosOp, uint64_t>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F8: // 11
        return _device_info->calculate<256, cuda::ZerosOp, cuda_fp8_e4m3>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F16: // 12
        return _device_info->calculate<256, cuda::ZerosOp, half>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F32: // 13
        return _device_info->calculate<256, cuda::ZerosOp, float>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F64: // 14
        return _device_info->calculate<256, cuda::ZerosOp, double>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_C16: // 15
        return INFINI_STATUS_NOT_IMPLEMENTED;
    case INFINI_DTYPE_C32: // 16
        return INFINI_STATUS_NOT_IMPLEMENTED;
    case INFINI_DTYPE_C64: // 17
        return INFINI_STATUS_NOT_IMPLEMENTED;
    case INFINI_DTYPE_C128: // 18
        return INFINI_STATUS_NOT_IMPLEMENTED;
    case INFINI_DTYPE_BF16: // 19
        return _device_info->calculate<256, cuda::ZerosOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    // Unreachable: every case above returns.
    return INFINI_STATUS_SUCCESS;
}
} // namespace op::zeros::nvidia
#ifndef __ZEROS_CUDA_API_H__
#define __ZEROS_CUDA_API_H__
#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
ELEMENTWISE_DESCRIPTOR(zeros, nvidia)
#endif // __ZEROS_CUDA_API_H__
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/zeros.h"
#ifdef ENABLE_CPU_API
#include "cpu/zeros_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
#include "nvidia/zeros_nvidia.cuh"
#endif
#ifdef ENABLE_METAX_API
#include "metax/zeros_metax.h"
#endif
#ifdef ENABLE_MOORE_API
#include "moore/zeros_moore.h"
#endif
// Creates a Zeros descriptor for the device held by `handle`.
// The unary (y, x) signature is forwarded to the backend's generic
// elementwise create(handle, desc, out_desc, {input_descs}).
__C infiniStatus_t infiniopCreateZerosDescriptor(
    infiniopHandle_t handle,
    infiniopZerosDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc) {

#define CREATE(CASE, NAMESPACE)                                              \
    case CASE:                                                               \
        return op::zeros::NAMESPACE::Descriptor::create(                     \
            handle,                                                          \
            reinterpret_cast<op::zeros::NAMESPACE::Descriptor **>(desc_ptr), \
            y_desc,                                                          \
            {x_desc})

    // Dispatch on the runtime device type; only compiled-in backends have cases.
    switch (handle->device) {

#ifdef ENABLE_CPU_API
        CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        // Iluvatar reuses the nvidia backend implementation.
        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_METAX_API
        CREATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_MOORE_API
        CREATE(INFINI_DEVICE_MOORE, moore);
#endif

    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CREATE
}
// Reports the workspace size (in bytes) required by a Zeros descriptor.
__C infiniStatus_t infiniopGetZerosWorkspaceSize(infiniopZerosDescriptor_t desc, size_t *size) {

#define GET(CASE, NAMESPACE)                                                              \
    case CASE:                                                                            \
        *size = reinterpret_cast<op::zeros::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
        return INFINI_STATUS_SUCCESS

    switch (desc->device_type) {

#ifdef ENABLE_CPU_API
        GET(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        // Iluvatar reuses the nvidia backend implementation.
        GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_METAX_API
        GET(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_MOORE_API
        GET(INFINI_DEVICE_MOORE, moore);
#endif

    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef GET

    // Unreachable: every switch path returns above. Kept (apparently) to
    // satisfy compilers that warn about control reaching the end of the
    // function; the other dispatchers in this file omit it.
    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
// Executes the Zeros operator: fills `y` with zeros, using `x` only for
// shape/layout (the elementwise framework receives it as the single input).
__C infiniStatus_t infiniopZeros(
    infiniopZerosDescriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *y,
    const void *x,
    void *stream) {

#define CALCULATE(CASE, NAMESPACE)                                           \
    case CASE:                                                               \
        return reinterpret_cast<const op::zeros::NAMESPACE::Descriptor *>(desc) \
            ->calculate(workspace, workspace_size, y, {x}, stream)

    switch (desc->device_type) {

#ifdef ENABLE_CPU_API
        CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        // Iluvatar reuses the nvidia backend implementation.
        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_METAX_API
        CALCULATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_MOORE_API
        CALCULATE(INFINI_DEVICE_MOORE, moore);
#endif

    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CALCULATE
}
// Destroys a Zeros descriptor previously created by
// infiniopCreateZerosDescriptor, deleting the backend-specific object.
__C infiniStatus_t
infiniopDestroyZerosDescriptor(infiniopZerosDescriptor_t desc) {

#define DELETE(CASE, NAMESPACE)                                              \
    case CASE:                                                               \
        delete reinterpret_cast<const op::zeros::NAMESPACE::Descriptor *>(desc); \
        return INFINI_STATUS_SUCCESS

    switch (desc->device_type) {

#ifdef ENABLE_CPU_API
        DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        // Iluvatar reuses the nvidia backend implementation.
        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_METAX_API
        DELETE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_MOORE_API
        DELETE(INFINI_DEVICE_MOORE, moore);
#endif

    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef DELETE
}
from ast import List
import numpy as np
import gguf
from typing import List
from numpy.lib.stride_tricks import as_strided
from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides, process_zero_stride_tensor
def ones(x: np.ndarray):
    """Reference implementation: an all-ones array with x's shape and dtype."""
    return np.full_like(x, 1)
class OnesTestCase(InfiniopTestCase):
    """GGUF test-case writer for the `ones` operator.

    Serializes the input x, the (placeholder) output y, their shapes/strides,
    and a float64 reference answer under this op's GGUF key prefix.
    """

    def __init__(self,
                 x: np.ndarray,
                 shape_x: List[int] | None,
                 stride_x: List[int] | None,
                 y: np.ndarray,
                 shape_y: List[int] | None,
                 stride_y: List[int] | None
                 ):
        super().__init__("ones")
        self.x = x
        self.shape_x = shape_x
        self.stride_x = stride_x
        self.y = y
        self.shape_y = shape_y
        self.stride_y = stride_y

    def write_test(self, test_writer: "InfiniopTestWriter"):
        """Writes this case's tensors and metadata into the GGUF writer."""
        super().write_test(test_writer)
        # Shapes/strides are optional metadata; y's strides fall back to
        # contiguous strides derived from its shape.
        if self.shape_x is not None:
            test_writer.add_array(test_writer.gguf_key("x.shape"), self.shape_x)
        if self.shape_y is not None:
            test_writer.add_array(test_writer.gguf_key("y.shape"), self.shape_y)
        if self.stride_x is not None:
            test_writer.add_array(test_writer.gguf_key("x.strides"), gguf_strides(*self.stride_x))
        test_writer.add_array(
            test_writer.gguf_key("y.strides"),
            gguf_strides(*self.stride_y if self.stride_y is not None else contiguous_gguf_strides(self.shape_y))
        )
        test_writer.add_tensor(
            test_writer.gguf_key("x"), self.x, raw_dtype=np_dtype_to_ggml(self.x.dtype)
        )
        test_writer.add_tensor(
            test_writer.gguf_key("y"), self.y, raw_dtype=np_dtype_to_ggml(self.y.dtype)
        )
        # Reference answer is always computed and stored in float64,
        # regardless of the test dtype.
        ans = ones(
            self.x.astype(np.float64),
        )
        test_writer.add_tensor(
            test_writer.gguf_key("ans"), ans, raw_dtype=gguf.GGMLQuantizationType.F64
        )
if __name__ == "__main__":
    # Generates ones.gguf containing every (dtype x shape/stride) combination.
    test_writer = InfiniopTestWriter("ones.gguf")
    test_cases = []
    # ==============================================================================
    #  Configuration (Internal Use Only)
    # ==============================================================================
    # These are not meant to be imported from other modules
    _TEST_CASES_ = [
        # shape, x_stride, y_stride
        ((13, 4), None, None),
        ((13, 4), (10, 1), (10, 1)),
        ((13, 4, 4), None, None),
        ((13, 4, 4), (20, 4, 1), (20, 4, 1)),
        ((16, 5632), None, None),
        ((16, 5632), (13312, 1), (13312, 1)),
        ((4, 4, 5632), None, None),
        ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)),
    ]
    _TENSOR_DTYPES_ = [np.bool_,  # 2
                       np.int8,  # 3
                       np.int16,  # 4
                       np.int32,  # 5
                       np.int64,  # 6
                       # np.uint8,  # 7
                       # np.uint16,  # 8
                       # np.uint32,  # 9
                       # np.uint64,  # 10
                       # InfiniDtype.F8,  # 11
                       np.float16,  # 12
                       np.float32,  # 13
                       np.float64,  # 14
                       # InfiniDtype.BF16,  # 19
                       ]
    for dtype in _TENSOR_DTYPES_:
        for shape, stride_x, stride_y in _TEST_CASES_:
            x = np.random.rand(*shape).astype(dtype)
            # NOTE(review): y is serialized as a zero-sized placeholder (its
            # shape is all zeros) while the "y.shape" metadata records the real
            # shape — presumably the test runner allocates y itself; confirm
            # this matches the other operator test writers.
            y = np.empty(tuple(0 for _ in shape), dtype=dtype)
            x = process_zero_stride_tensor(x, stride_x)
            test_case = OnesTestCase(
                x=x,
                shape_x=shape,
                stride_x=stride_x,
                y=y,
                shape_y=shape,
                stride_y=stride_y,
            )
            test_cases.append(test_case)
    test_writer.add_tests(test_cases)
    test_writer.save()
from ast import List
import numpy as np
import gguf
from typing import List
from numpy.lib.stride_tricks import as_strided
from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides, process_zero_stride_tensor
def zeros(x: np.ndarray):
    """Reference implementation: an all-zeros array with x's shape and dtype."""
    return np.full_like(x, 0)
class ZerosTestCase(InfiniopTestCase):
    """GGUF test-case writer for the `zeros` operator.

    Serializes the input x, the (placeholder) output y, their shapes/strides,
    and a float64 reference answer under this op's GGUF key prefix.
    """

    def __init__(self,
                 x: np.ndarray,
                 shape_x: List[int] | None,
                 stride_x: List[int] | None,
                 y: np.ndarray,
                 shape_y: List[int] | None,
                 stride_y: List[int] | None
                 ):
        super().__init__("zeros")
        self.x = x
        self.shape_x = shape_x
        self.stride_x = stride_x
        self.y = y
        self.shape_y = shape_y
        self.stride_y = stride_y

    def write_test(self, test_writer: "InfiniopTestWriter"):
        """Writes this case's tensors and metadata into the GGUF writer."""
        super().write_test(test_writer)
        # Shapes/strides are optional metadata; y's strides fall back to
        # contiguous strides derived from its shape.
        if self.shape_x is not None:
            test_writer.add_array(test_writer.gguf_key("x.shape"), self.shape_x)
        if self.shape_y is not None:
            test_writer.add_array(test_writer.gguf_key("y.shape"), self.shape_y)
        if self.stride_x is not None:
            test_writer.add_array(test_writer.gguf_key("x.strides"), gguf_strides(*self.stride_x))
        test_writer.add_array(
            test_writer.gguf_key("y.strides"),
            gguf_strides(*self.stride_y if self.stride_y is not None else contiguous_gguf_strides(self.shape_y))
        )
        # print(test_writer)
        test_writer.add_tensor(
            test_writer.gguf_key("x"), self.x, raw_dtype=np_dtype_to_ggml(self.x.dtype)
        )
        test_writer.add_tensor(
            test_writer.gguf_key("y"), self.y, raw_dtype=np_dtype_to_ggml(self.y.dtype)
        )
        # Reference answer is always computed and stored in float64,
        # regardless of the test dtype.
        ans = zeros(
            self.x.astype(np.float64),
        )
        test_writer.add_tensor(
            test_writer.gguf_key("ans"), ans, raw_dtype=gguf.GGMLQuantizationType.F64
        )
if __name__ == "__main__":
    # Generates zeros.gguf containing every (dtype x shape/stride) combination.
    test_writer = InfiniopTestWriter("zeros.gguf")
    test_cases = []
    # ==============================================================================
    #  Configuration (Internal Use Only)
    # ==============================================================================
    # These are not meant to be imported from other modules
    _TEST_CASES_ = [
        # shape, x_stride, y_stride
        ((13, 4), None, None),
        ((13, 4), (10, 1), (10, 1)),
        ((13, 4, 4), None, None),
        ((13, 4, 4), (20, 4, 1), (20, 4, 1)),
        ((16, 5632), None, None),
        ((16, 5632), (13312, 1), (13312, 1)),
        ((4, 4, 5632), None, None),
        ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)),
    ]
    _TENSOR_DTYPES_ = [np.bool_,  # 2
                       np.int8,  # 3
                       np.int16,  # 4
                       np.int32,  # 5
                       np.int64,  # 6
                       # np.uint8,  # 7
                       # np.uint16,  # 8
                       # np.uint32,  # 9
                       # np.uint64,  # 10
                       # InfiniDtype.F8,  # 11
                       np.float16,  # 12
                       np.float32,  # 13
                       np.float64,  # 14
                       # InfiniDtype.BF16,  # 19
                       ]
    for dtype in _TENSOR_DTYPES_:
        for shape, stride_x, stride_y in _TEST_CASES_:
            x = np.random.rand(*shape).astype(dtype)
            # NOTE(review): y is serialized as a zero-sized placeholder (its
            # shape is all zeros) while the "y.shape" metadata records the real
            # shape — presumably the test runner allocates y itself; confirm
            # this matches the other operator test writers.
            y = np.empty(tuple(0 for _ in shape), dtype=dtype)
            x = process_zero_stride_tensor(x, stride_x)
            test_case = ZerosTestCase(
                x=x,
                shape_x=shape,
                stride_x=stride_x,
                y=y,
                shape_y=shape,
                stride_y=stride_y,
            )
            test_cases.append(test_case)
    test_writer.add_tests(test_cases)
    test_writer.save()
......@@ -673,3 +673,66 @@ def softplus_(lib):
]
lib.infiniopDestroySoftplusDescriptor.restype = c_int32
lib.infiniopDestroySoftplusDescriptor.argtypes = [infiniopOperatorDescriptor_t]
@OpRegister.operator
def zeros_(lib):
    # Binds the ctypes signatures for the Zeros C API so Python tests can
    # call into the shared library.
    lib.infiniopCreateZerosDescriptor.restype = c_int32
    lib.infiniopCreateZerosDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopOperatorDescriptor_t),
        infiniopTensorDescriptor_t,  # y (output)
        infiniopTensorDescriptor_t,  # x (input)
    ]
    lib.infiniopGetZerosWorkspaceSize.restype = c_int32
    lib.infiniopGetZerosWorkspaceSize.argtypes = [
        infiniopOperatorDescriptor_t,
        POINTER(c_size_t),
    ]
    lib.infiniopZeros.restype = c_int32
    lib.infiniopZeros.argtypes = [
        infiniopOperatorDescriptor_t,
        c_void_p,  # workspace
        c_size_t,  # workspace_size
        c_void_p,  # y
        c_void_p,  # x
        c_void_p,  # stream
    ]
    lib.infiniopDestroyZerosDescriptor.restype = c_int32
    lib.infiniopDestroyZerosDescriptor.argtypes = [
        infiniopOperatorDescriptor_t,
    ]
@OpRegister.operator
def ones_(lib):
    # Binds the ctypes signatures for the Ones C API so Python tests can
    # call into the shared library.
    lib.infiniopCreateOnesDescriptor.restype = c_int32
    lib.infiniopCreateOnesDescriptor.argtypes = [
        infiniopHandle_t,
        POINTER(infiniopOperatorDescriptor_t),
        infiniopTensorDescriptor_t,  # y (output)
        infiniopTensorDescriptor_t,  # x (input)
    ]
    lib.infiniopGetOnesWorkspaceSize.restype = c_int32
    lib.infiniopGetOnesWorkspaceSize.argtypes = [
        infiniopOperatorDescriptor_t,
        POINTER(c_size_t),
    ]
    lib.infiniopOnes.restype = c_int32
    lib.infiniopOnes.argtypes = [
        infiniopOperatorDescriptor_t,
        c_void_p,  # workspace
        c_size_t,  # workspace_size
        c_void_p,  # y
        c_void_p,  # x
        c_void_p,  # stream
    ]
    lib.infiniopDestroyOnesDescriptor.restype = c_int32
    lib.infiniopDestroyOnesDescriptor.argtypes = [
        infiniopOperatorDescriptor_t,
    ]
......@@ -51,6 +51,8 @@ class TestTensor(CTensor):
scale=None,
bias=None,
set_tensor=None,
randint_low=None,
randint_high=None,
):
self.dt = dt
self.device = device
......@@ -80,7 +82,11 @@ class TestTensor(CTensor):
torch_shape, dtype=to_torch_dtype(dt), device=torch_device_map[device]
)
elif mode == "randint":
self._torch_tensor = torch.randint(-2000000000,2000000000, torch_shape,dtype=to_torch_dtype(dt), device=torch_device_map[device])
randint_low = -2000000000 if randint_low is None else randint_low
randint_high = 2000000000 if randint_high is None else randint_high
self._torch_tensor = torch.randint(randint_low,randint_high, torch_shape,dtype=to_torch_dtype(dt), device=torch_device_map[device])
elif mode == "float8_e4m3fn":
self._torch_tensor = torch.rand(shape, dtype=torch.float32, device=torch_device_map[device]).to(dtype=torch.float8_e4m3fn)
elif mode == "manual":
assert set_tensor is not None
assert torch_shape == list(set_tensor.shape)
......@@ -139,7 +145,11 @@ class TestTensor(CTensor):
def to_torch_dtype(dt: InfiniDtype, compatability_mode=False):
if dt == InfiniDtype.I8:
if dt == InfiniDtype.BOOL:
return torch.bool
elif dt == InfiniDtype.BYTE:
return torch.uint8
elif dt == InfiniDtype.I8:
return torch.int8
elif dt == InfiniDtype.I16:
return torch.int16
......@@ -165,6 +175,8 @@ def to_torch_dtype(dt: InfiniDtype, compatability_mode=False):
return torch.int32 if compatability_mode else torch.uint32
elif dt == InfiniDtype.U64:
return torch.int64 if compatability_mode else torch.uint64
elif dt == InfiniDtype.F8:
return torch.float8_e4m3fn
else:
raise ValueError("Unsupported data type")
......@@ -269,7 +281,21 @@ def rearrange_tensor(tensor, new_strides):
new_positions += offset
# Copy the original data to the new tensor
new_tensor.view(-1).index_add_(0, new_positions, tensor.view(-1))
if tensor.dtype in [torch.bool, torch.uint8, torch.int8, torch.int16, torch.int32,torch.int64, torch.float16,torch.bfloat16,torch.float32,torch.float64]:
new_tensor.view(-1).index_add_(0, new_positions, tensor.view(-1))
elif tensor.dtype in [torch.uint16, torch.uint32, torch.uint64]:
new_tensor_int64 = new_tensor.to(dtype=torch.int64)
tensor_int64 = tensor.to(dtype=torch.int64)
new_tensor_int64.view(-1).index_add_(0, new_positions, tensor_int64.view(-1))
new_tensor = new_tensor_int64.to(dtype=tensor.dtype)
elif tensor.dtype in [torch.float8_e4m3fn]:
new_tensor_float64 = new_tensor.to(dtype=torch.float64)
tensor_float64 = tensor.to(dtype=torch.float64)
new_tensor_float64.view(-1).index_add_(0, new_positions, tensor_float64.view(-1))
new_tensor = new_tensor_float64.to(dtype=tensor.dtype)
else:
raise ValueError("Unsupported data type")
new_tensor.set_(new_tensor.untyped_storage(), offset, shape, tuple(new_strides))
return new_tensor
......@@ -484,11 +510,12 @@ def print_discrepancy(
nan_mismatch = (
actual_isnan ^ expected_isnan if equal_nan else actual_isnan | expected_isnan
)
diff_mask = nan_mismatch | (
torch.abs(actual - expected) > (atol + rtol * torch.abs(expected))
torch.abs(actual.to(dtype=torch.float64) - expected.to(dtype=torch.float64)) > (atol + rtol * torch.abs(expected.to(dtype=torch.float64)))
)
diff_indices = torch.nonzero(diff_mask, as_tuple=False)
delta = actual - expected
delta = actual.to(dtype=torch.float64) - expected.to(dtype=torch.float64)
# Display format: widths for columns
col_width = [18, 20, 20, 20]
......
import torch
import ctypes
from ctypes import c_uint64
from libinfiniop import (
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
from enum import Enum, auto
# ==============================================================================
# Configuration (Internal Use Only)
# ==============================================================================
# These are not meant to be imported from other modules

# Each entry: (shape, x_stride, y_stride); None means a contiguous layout.
_TEST_CASES_ = [
    ((13, 4), None, None),
    ((13, 4), (10, 1), (10, 1)),
    ((13, 4), (0, 1), (0, 1)),
    ((13, 4, 4), None, None),
    ((13, 4, 4), (20, 4, 1), (20, 4, 1)),
    ((16, 5632), None, None),
    ((16, 5632), (13312, 1), (13312, 1)),
    ((4, 4, 5632), None, None),
    ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)),
]


class Inplace(Enum):
    # Output written to a separate tensor.
    OUT_OF_PLACE = auto()
    # Input tensor reused as the output.
    INPLACE_X = auto()


# Every base case is exercised once per in-place option.
_INPLACE = [Inplace.OUT_OF_PLACE, Inplace.INPLACE_X]

# Cartesian product: append each in-place option to each base case tuple.
_TEST_CASES = [case + (option,) for case in _TEST_CASES_ for option in _INPLACE]

# Data types used for testing (commented entries are not enabled yet).
_TENSOR_DTYPES = [
    InfiniDtype.BYTE,  # 1
    InfiniDtype.BOOL,  # 2
    InfiniDtype.I8,  # 3
    InfiniDtype.I16,  # 4
    InfiniDtype.I32,  # 5
    InfiniDtype.I64,  # 6
    InfiniDtype.U8,  # 7
    # InfiniDtype.U16,  # 8
    # InfiniDtype.U32,  # 9
    # InfiniDtype.U64,  # 10
    # InfiniDtype.F8,  # 11
    InfiniDtype.F16,  # 12
    InfiniDtype.F32,  # 13
    InfiniDtype.F64,  # 14
    InfiniDtype.BF16,  # 19
]

# All dtypes share one tolerance for this operator (exact fill of ones).
_TOLERANCE_MAP = {
    dt: {"atol": 1e-3, "rtol": 1e-3}
    for dt in (
        InfiniDtype.BYTE,
        InfiniDtype.BOOL,
        InfiniDtype.I8,
        InfiniDtype.I16,
        InfiniDtype.I32,
        InfiniDtype.I64,
        InfiniDtype.U8,
        InfiniDtype.U16,
        InfiniDtype.U32,
        InfiniDtype.U64,
        InfiniDtype.F8,
        InfiniDtype.F16,
        InfiniDtype.F32,
        InfiniDtype.F64,
        InfiniDtype.BF16,
    )
}

# Runtime knobs; overwritten from CLI arguments in __main__.
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
def torch_ones(y, x):
    """Reference implementation: fill ``y`` with ones in place.

    ``x`` is accepted only to mirror the library operator's signature; it is
    unused by the reference computation.
    """
    # fill_ writes in place and avoids allocating the temporary tensor that
    # ones_like() + copy_() would create.
    y.fill_(1)
def _make_random_input(shape, stride, dtype, device):
    """Build a TestTensor whose initial contents are random and valid for ``dtype``."""
    if dtype in [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32, InfiniDtype.F64]:
        # Default TestTensor mode fills floating-point tensors with random values.
        return TestTensor(shape, stride, dtype, device)
    elif dtype in [InfiniDtype.BYTE, InfiniDtype.U8, InfiniDtype.U16, InfiniDtype.U32, InfiniDtype.U64,
                   InfiniDtype.I8, InfiniDtype.I16, InfiniDtype.I32, InfiniDtype.I64]:
        # Small non-negative integers are representable in every integer dtype here.
        return TestTensor(shape, stride, dtype, device, mode="randint", randint_low=0, randint_high=16)
    elif dtype in [InfiniDtype.F8]:
        return TestTensor(shape, stride, dtype, device, mode="float8_e4m3fn")
    elif dtype in [InfiniDtype.BOOL]:
        return TestTensor(shape, stride, dtype, device, mode="randint", randint_low=0, randint_high=2)
    else:
        raise ValueError("Unsupported dtype")


def test(
    handle,
    device,
    shape,
    x_stride=None,
    y_stride=None,
    inplace=Inplace.OUT_OF_PLACE,
    dtype=None,
    sync=None,
):
    """Validate infiniopOnes against the torch reference for one
    (shape, strides, dtype, inplace) combination."""
    x = _make_random_input(shape, x_stride, dtype, device)
    if inplace == Inplace.INPLACE_X:
        # In-place execution only makes sense when input/output layouts agree.
        if x_stride != y_stride:
            return
        y = x
    else:
        # BUGFIX: the output used to be created with mode="ones" -- i.e.
        # pre-filled with exactly the value this operator must produce, so a
        # no-op kernel would have passed the comparison below. Start from
        # random contents instead so the kernel's writes are actually checked.
        y = _make_random_input(shape, y_stride, dtype, device)
    if y.is_broadcast():
        return
    print(
        f"Testing Ones on {InfiniDeviceNames[device]} with shape:{shape} x_stride:{x_stride} y_stride:{y_stride} "
        f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}"
    )
    # Compute the reference result on the torch-side copy of y.
    torch_ones(y.torch_tensor(), x.torch_tensor())
    if sync is not None:
        sync()
    descriptor = infiniopOperatorDescriptor_t()
    check_error(
        LIBINFINIOP.infiniopCreateOnesDescriptor(
            handle,
            ctypes.byref(descriptor),
            y.descriptor,
            x.descriptor,
        )
    )
    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    for tensor in [y, x]:
        tensor.destroy_desc()
    workspace_size = c_uint64(0)
    check_error(
        LIBINFINIOP.infiniopGetOnesWorkspaceSize(
            descriptor, ctypes.byref(workspace_size)
        )
    )
    workspace = TestWorkspace(workspace_size.value, y.device)

    def lib_ones():
        # Invoke the library kernel; last argument is the (null) stream.
        check_error(
            LIBINFINIOP.infiniopOnes(
                descriptor,
                workspace.data(),
                workspace.size(),
                y.data(),
                x.data(),
                None,
            )
        )

    lib_ones()
    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
    if DEBUG:
        debug(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol)
    # Compare in float32 so integer and low-precision dtypes share one path.
    assert torch.allclose(y.actual_tensor().to(dtype=torch.float32), y.torch_tensor().to(dtype=torch.float32), atol=atol, rtol=rtol)
    # Profiling workflow
    if PROFILE:
        # fmt: off
        profile_operation("PyTorch", lambda: torch_ones(y.torch_tensor(), x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
        profile_operation("    lib", lambda: lib_ones(), device, NUM_PRERUN, NUM_ITERATIONS)
        # fmt: on
    check_error(LIBINFINIOP.infiniopDestroyOnesDescriptor(descriptor))
if __name__ == "__main__":
    args = get_args()
    # Propagate CLI flags into the module-level test configuration.
    DEBUG, PROFILE = args.debug, args.profile
    NUM_PRERUN, NUM_ITERATIONS = args.num_prerun, args.num_iterations
    # Run every configured case on every requested device.
    for dev in get_test_devices(args):
        test_operator(dev, test, _TEST_CASES, _TENSOR_DTYPES)
    print("\033[92mTest passed!\033[0m")
import torch
import ctypes
from ctypes import c_uint64
from libinfiniop import (
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
from enum import Enum, auto
# ==============================================================================
# Configuration (Internal Use Only)
# ==============================================================================
# These are not meant to be imported from other modules

# Each entry: (shape, x_stride, y_stride); None means a contiguous layout.
_TEST_CASES_ = [
    ((13, 4), None, None),
    ((13, 4), (10, 1), (10, 1)),
    ((13, 4), (0, 1), (0, 1)),
    ((13, 4, 4), None, None),
    ((13, 4, 4), (20, 4, 1), (20, 4, 1)),
    ((16, 5632), None, None),
    ((16, 5632), (13312, 1), (13312, 1)),
    ((4, 4, 5632), None, None),
    ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)),
]


class Inplace(Enum):
    # Output written to a separate tensor.
    OUT_OF_PLACE = auto()
    # Input tensor reused as the output.
    INPLACE_X = auto()


# Every base case is exercised once per in-place option.
_INPLACE = [Inplace.OUT_OF_PLACE, Inplace.INPLACE_X]

# Cartesian product: append each in-place option to each base case tuple.
_TEST_CASES = [case + (option,) for case in _TEST_CASES_ for option in _INPLACE]

# Data types used for testing (commented entries are not enabled yet).
_TENSOR_DTYPES = [
    InfiniDtype.BYTE,  # 1
    InfiniDtype.BOOL,  # 2
    InfiniDtype.I8,  # 3
    InfiniDtype.I16,  # 4
    InfiniDtype.I32,  # 5
    InfiniDtype.I64,  # 6
    InfiniDtype.U8,  # 7
    # InfiniDtype.U16,  # 8
    # InfiniDtype.U32,  # 9
    # InfiniDtype.U64,  # 10
    # InfiniDtype.F8,  # 11
    InfiniDtype.F16,  # 12
    InfiniDtype.F32,  # 13
    InfiniDtype.F64,  # 14
    InfiniDtype.BF16,  # 19
]

# All dtypes share one tolerance for this operator (exact fill of zeros).
_TOLERANCE_MAP = {
    dt: {"atol": 1e-3, "rtol": 1e-3}
    for dt in (
        InfiniDtype.BYTE,
        InfiniDtype.BOOL,
        InfiniDtype.I8,
        InfiniDtype.I16,
        InfiniDtype.I32,
        InfiniDtype.I64,
        InfiniDtype.U8,
        InfiniDtype.U16,
        InfiniDtype.U32,
        InfiniDtype.U64,
        InfiniDtype.F8,
        InfiniDtype.F16,
        InfiniDtype.F32,
        InfiniDtype.F64,
        InfiniDtype.BF16,
    )
}

# Runtime knobs; overwritten from CLI arguments in __main__.
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
def torch_zeros(y, x):
    """Reference implementation: fill ``y`` with zeros in place.

    ``x`` is accepted only to mirror the library operator's signature; it is
    unused by the reference computation.
    """
    # zero_ writes in place and avoids allocating the temporary tensor that
    # zeros_like() + copy_() would create.
    y.zero_()
def test(
    handle,
    device,
    shape,
    x_stride=None,
    y_stride=None,
    inplace=Inplace.OUT_OF_PLACE,
    dtype=None,
    sync=None,
):
    """Validate infiniopZeros against the torch reference for one
    (shape, strides, dtype, inplace) combination.

    The input tensor is filled with dtype-appropriate random data; the output
    starts as ones so the operator's zero-fill is observable. Returns early
    (skips the case) for incompatible in-place layouts or broadcast outputs.
    """
    # Pick an input-generation mode that is valid for the dtype under test.
    if dtype in [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32, InfiniDtype.F64]:
        x = TestTensor(shape, x_stride, dtype, device)
    elif dtype in [InfiniDtype.BYTE, InfiniDtype.U8, InfiniDtype.U16, InfiniDtype.U32, InfiniDtype.U64,
                   InfiniDtype.I8, InfiniDtype.I16, InfiniDtype.I32, InfiniDtype.I64]:
        # Small non-negative integers are representable in every integer dtype here.
        x = TestTensor(shape, x_stride, dtype, device, mode="randint", randint_low=0, randint_high=16)
    elif dtype in [InfiniDtype.F8]:
        x = TestTensor(shape, x_stride, dtype, device, mode="float8_e4m3fn")
    elif dtype in [InfiniDtype.BOOL]:
        # Booleans: randint over {0, 1}.
        x = TestTensor(shape, x_stride, dtype, device, mode="randint", randint_low=0, randint_high=2)
    else:
        raise ValueError("Unsupported dtype")
    if inplace == Inplace.INPLACE_X:
        # In-place execution only makes sense when input/output layouts agree.
        if x_stride != y_stride:
            return
        y = x
    else:
        # Output starts as ones, so a correct zero-fill visibly changes it.
        y = TestTensor(shape, y_stride, dtype, device, mode="ones")
    if y.is_broadcast():
        return
    print(
        f"Testing Zeros on {InfiniDeviceNames[device]} with shape:{shape} x_stride:{x_stride} y_stride:{y_stride} "
        f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}"
    )
    # Compute the reference result on the torch-side copy of y.
    torch_zeros(y.torch_tensor(), x.torch_tensor())
    if sync is not None:
        sync()
    descriptor = infiniopOperatorDescriptor_t()
    check_error(
        LIBINFINIOP.infiniopCreateZerosDescriptor(
            handle,
            ctypes.byref(descriptor),
            y.descriptor,
            x.descriptor,
        )
    )
    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
    for tensor in [y, x]:
        tensor.destroy_desc()
    workspace_size = c_uint64(0)
    check_error(
        LIBINFINIOP.infiniopGetZerosWorkspaceSize(
            descriptor, ctypes.byref(workspace_size)
        )
    )
    workspace = TestWorkspace(workspace_size.value, y.device)

    def lib_zeros():
        # Invoke the library kernel; last argument is the (null) stream.
        check_error(
            LIBINFINIOP.infiniopZeros(
                descriptor,
                workspace.data(),
                workspace.size(),
                y.data(),
                x.data(),
                None,
            )
        )

    lib_zeros()
    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
    if DEBUG:
        debug(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol)
    # Compare in float32 so integer and low-precision dtypes share one path.
    assert torch.allclose(y.actual_tensor().to(dtype=torch.float32), y.torch_tensor().to(dtype=torch.float32), atol=atol, rtol=rtol)
    # Profiling workflow
    if PROFILE:
        # fmt: off
        profile_operation("PyTorch", lambda: torch_zeros(y.torch_tensor(), x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
        profile_operation("    lib", lambda: lib_zeros(), device, NUM_PRERUN, NUM_ITERATIONS)
        # fmt: on
    check_error(LIBINFINIOP.infiniopDestroyZerosDescriptor(descriptor))
if __name__ == "__main__":
    args = get_args()
    # Propagate CLI flags into the module-level test configuration.
    DEBUG, PROFILE = args.debug, args.profile
    NUM_PRERUN, NUM_ITERATIONS = args.num_prerun, args.num_iterations
    # Run every configured case on every requested device.
    for dev in get_test_devices(args):
        test_operator(dev, test, _TEST_CASES, _TENSOR_DTYPES)
    print("\033[92mTest passed!\033[0m")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment