Commit 45a3794b authored by wooway777

issue/1031 T1-1-17

parent cb7f0b7d
@@ -17,12 +17,12 @@ def run_tests(args):
"causal_softmax.py",
"clip.py",
"conv.py",
#"dequantize_awq.py",
# "dequantize_awq.py",
"gelu.py",
"gemm.py",
#"layer_norm.py",
# "layer_norm.py",
"logsoftmax.py",
#"lp_norm.py",
# "lp_norm.py",
"mul.py",
"ones.py",
"random_sample.py",
@@ -31,7 +31,7 @@ def run_tests(args):
"rms_norm.py",
"rope.py",
"sigmoid.py",
#"softmax.py",
# "softmax.py",
"softplus.py",
"sub.py",
"swiglu.py",
@@ -42,6 +42,7 @@ def run_tests(args):
# "paged_attention.py",
# "paged_caching.py",
# "paged_attention_prefill.py"
"cross_entropy.py",
]:
result = subprocess.run(
f"python {test} {args} --debug", text=True, encoding="utf-8", shell=True
#include "infinicore/ops/avg_pool1d.hpp"
#include "../../utils.hpp"
#include <stdexcept>
namespace infinicore::op {
common::OpDispatcher<AvgPool1d::schema> &AvgPool1d::dispatcher() {
static common::OpDispatcher<AvgPool1d::schema> dispatcher_;
return dispatcher_;
}
void AvgPool1d::execute(
Tensor output,
Tensor input,
size_t kernel_size,
size_t stride,
size_t padding) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(output, input);
if (stride == 0) {
stride = kernel_size;
}
infinicore::context::setDevice(output->device());
auto device_type = output->device().getType();
auto func = dispatcher().lookup(device_type);
if (func == nullptr) {
throw std::runtime_error(
"No AvgPool1d implementation found for device type: " + std::to_string(static_cast<int>(device_type)));
}
func(output, input, kernel_size, stride, padding);
}
Tensor avg_pool1d(Tensor input, size_t kernel_size, size_t stride, size_t padding) {
if (stride == 0) {
stride = kernel_size;
}
const auto &shape = input->shape();
if (shape.size() != 3) {
throw std::runtime_error("AvgPool1d expects tensors with shape [N, C, L]");
}
const size_t n = shape[0];
const size_t c = shape[1];
const size_t l_in = shape[2];
if (l_in + 2 * padding < kernel_size) {
throw std::runtime_error("AvgPool1d kernel_size is larger than padded length");
}
const size_t out_width = (l_in + 2 * padding - kernel_size) / stride + 1;
Shape out_shape = {n, c, out_width};
auto output = Tensor::empty(out_shape, input->dtype(), input->device());
avg_pool1d_(output, input, kernel_size, stride, padding);
return output;
}
void avg_pool1d_(Tensor output, Tensor input, size_t kernel_size, size_t stride, size_t padding) {
AvgPool1d::execute(output, input, kernel_size, stride, padding);
}
} // namespace infinicore::op
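For reference, the out_width computed above is the standard pooling arithmetic, with stride == 0 falling back to kernel_size. A minimal Python sketch of the same shape calculation (helper name is hypothetical, not part of the commit):

def avg_pool1d_out_width(l_in: int, kernel_size: int, stride: int = 0, padding: int = 0) -> int:
    # stride == 0 defaults to kernel_size, mirroring the C++ frontend above
    if stride == 0:
        stride = kernel_size
    padded = l_in + 2 * padding
    if padded < kernel_size:
        raise ValueError("kernel_size is larger than padded length")
    return (padded - kernel_size) // stride + 1

# e.g. l_in=10, kernel_size=3, stride=2, padding=1 -> (12 - 3) // 2 + 1 = 5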
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/avg_pool1d.hpp"
#include "infinicore/ops/common/cache.hpp"
#include <infiniop.h>
namespace infinicore::op::avg_pool1d_impl::infiniop {
thread_local common::OpCache<size_t, infiniopAvgPool1dDescriptor_t> caches(
100,
[](infiniopAvgPool1dDescriptor_t &desc) {
if (desc != nullptr) {
INFINICORE_CHECK_ERROR(infiniopDestroyAvgPool1dDescriptor(desc));
desc = nullptr;
}
});
void calculate(
Tensor output,
Tensor input,
size_t kernel_size,
size_t stride,
size_t padding) {
if (stride == 0) {
stride = kernel_size;
}
size_t seed = hash_combine(output, input, kernel_size, stride, padding);
auto device = context::getDevice();
auto &cache = caches.getCache(device);
auto desc_opt = cache.get(seed);
infiniopAvgPool1dDescriptor_t desc = nullptr;
if (!desc_opt) {
INFINICORE_CHECK_ERROR(infiniopCreateAvgPool1dDescriptor(
context::getInfiniopHandle(device),
&desc,
output->desc(),
input->desc(),
kernel_size,
stride,
padding));
cache.put(seed, desc);
} else {
desc = *desc_opt;
}
size_t workspace_size = 0;
INFINICORE_CHECK_ERROR(infiniopGetAvgPool1dWorkspaceSize(desc, &workspace_size));
// Skip allocation when the backend needs no workspace, matching the other ops in this commit.
std::shared_ptr<Memory> workspace;
void *workspace_ptr = nullptr;
if (workspace_size != 0) {
workspace = context::allocateMemory(workspace_size);
workspace_ptr = workspace->data();
}
INFINICORE_CHECK_ERROR(infiniopAvgPool1d(
desc,
workspace_ptr,
workspace_size,
output->data(),
input->data(),
context::getStream()));
}
static bool registered = []() {
AvgPool1d::dispatcher().registerAll(&calculate, false);
return true;
}();
} // namespace infinicore::op::avg_pool1d_impl::infiniop
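The thread_local OpCache above memoizes descriptors per device, keyed by a hash of tensor metadata plus (kernel_size, stride, padding), so repeated launches with identical shapes skip descriptor creation. A rough Python analogue of the pattern (names hypothetical):

from functools import lru_cache

@lru_cache(maxsize=100)  # mirrors the 100-entry cache above
def make_descriptor(seed: int) -> str:
    # stand-in for infiniopCreateAvgPool1dDescriptor; the real code builds a
    # backend descriptor and the eviction callback destroys it
    return f"desc-{seed}"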
#include "infinicore/ops/cross_entropy.hpp"
#include "../../utils.hpp"
#include <stdexcept>
namespace infinicore::op {
common::OpDispatcher<CrossEntropy::schema> &CrossEntropy::dispatcher() {
static common::OpDispatcher<CrossEntropy::schema> dispatcher_;
return dispatcher_;
}
void CrossEntropy::execute(Tensor output, Tensor input, Tensor target) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(output, input);
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(input, target);
infinicore::context::setDevice(output->device());
auto device_type = output->device().getType();
auto func = dispatcher().lookup(device_type);
if (func == nullptr) {
throw std::runtime_error("No CrossEntropy implementation found for device type: " + std::to_string(static_cast<int>(device_type)));
}
func(output, input, target);
}
Tensor cross_entropy(Tensor input, Tensor target) {
Shape shape = target->shape();
auto output = Tensor::empty(shape, input->dtype(), input->device());
cross_entropy_(output, input, target);
return output;
}
void cross_entropy_(Tensor output, Tensor input, Tensor target) {
CrossEntropy::execute(output, input, target);
}
} // namespace infinicore::op
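Note that the loss tensor is allocated with the target's shape, i.e. one loss value per token with no reduction. Assuming the op mirrors PyTorch's unreduced cross entropy, the reference behaviour would be (a sketch, not part of the commit):

import torch
import torch.nn.functional as F

logits = torch.randn(4, 32000)          # [num_tokens, vocab_size]
target = torch.randint(0, 32000, (4,))  # [num_tokens]
loss = F.cross_entropy(logits, target, reduction="none")  # shape [4], same as target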
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/cross_entropy.hpp"
#include <infiniop.h>
namespace infinicore::op::cross_entropy_impl::infiniop {
thread_local common::OpCache<size_t, infiniopCrossEntropyDescriptor_t> caches(
100,
[](infiniopCrossEntropyDescriptor_t &desc) {
if (desc != nullptr) {
INFINICORE_CHECK_ERROR(infiniopDestroyCrossEntropyDescriptor(desc));
desc = nullptr;
}
});
void calculate(Tensor output, Tensor input, Tensor target) {
size_t seed = hash_combine(output, input, target);
auto device = context::getDevice();
auto &cache = caches.getCache(device);
auto desc_opt = cache.get(seed);
infiniopCrossEntropyDescriptor_t desc = nullptr;
if (!desc_opt) {
INFINICORE_CHECK_ERROR(infiniopCreateCrossEntropyDescriptor(
context::getInfiniopHandle(device),
&desc,
output->desc(),
input->desc(),
target->desc()));
cache.put(seed, desc);
} else {
desc = *desc_opt;
}
size_t workspace_size = 0;
INFINICORE_CHECK_ERROR(infiniopGetCrossEntropyWorkspaceSize(desc, &workspace_size));
// Skip allocation when the backend needs no workspace, matching the other ops in this commit.
std::shared_ptr<Memory> workspace;
void *workspace_ptr = nullptr;
if (workspace_size != 0) {
workspace = context::allocateMemory(workspace_size);
workspace_ptr = workspace->data();
}
INFINICORE_CHECK_ERROR(infiniopCrossEntropy(
desc,
workspace_ptr,
workspace_size,
output->data(),
input->data(),
target->data(),
context::getStream()));
}
static bool registered = []() {
CrossEntropy::dispatcher().registerAll(&calculate, false);
return true;
}();
} // namespace infinicore::op::cross_entropy_impl::infiniop
#include "infinicore/ops/equal.hpp"
#include "../../utils.hpp"
namespace infinicore::op {
common::OpDispatcher<Equal::schema> &Equal::dispatcher() {
static common::OpDispatcher<Equal::schema> dispatcher_;
return dispatcher_;
}
void Equal::execute(Tensor out, Tensor a, Tensor b) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, a, b);
infinicore::context::setDevice(out->device());
auto device_type = out->device().getType();
auto func = dispatcher().lookup(device_type);
if (func == nullptr) {
throw std::runtime_error("No Equal implementation found for device type: " + std::to_string(static_cast<int>(device_type)));
}
func(out, a, b);
}
Tensor equal(Tensor a, Tensor b) {
auto out = Tensor::empty(a->shape(), DataType::BOOL, a->device());
equal_(out, a, b);
return out;
}
void equal_(Tensor out, Tensor a, Tensor b) {
if (out->dtype() != DataType::BOOL) {
throw std::runtime_error("Equal expects bool output tensor.");
}
Equal::execute(out, a, b);
}
} // namespace infinicore::op
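The out-of-place equal always allocates a BOOL output, and the in-place variant rejects any other dtype; the elementwise semantics match NumPy's np.equal:

import numpy as np

a = np.array([1.0, 2.0, 3.0])
b = np.array([1.0, 0.0, 3.0])
out = np.equal(a, b)  # dtype=bool, same shape as the inputs -> [True, False, True]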
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/equal.hpp"
#include <infiniop.h>
namespace infinicore::op::equal_impl::infiniop {
thread_local common::OpCache<size_t, infiniopEqualDescriptor_t> caches(
100,
[](infiniopEqualDescriptor_t &desc) {
if (desc != nullptr) {
INFINICORE_CHECK_ERROR(infiniopDestroyEqualDescriptor(desc));
desc = nullptr;
}
});
void calculate(Tensor out, Tensor a, Tensor b) {
size_t seed = hash_combine(out, a, b);
auto device = context::getDevice();
auto &cache = caches.getCache(device);
infiniopEqualDescriptor_t desc = nullptr;
if (auto cached = cache.get(seed)) {
desc = *cached;
} else {
INFINICORE_CHECK_ERROR(infiniopCreateEqualDescriptor(
context::getInfiniopHandle(device), &desc,
out->desc(), a->desc(), b->desc()));
cache.put(seed, desc);
}
size_t workspace_size = 0;
INFINICORE_CHECK_ERROR(infiniopGetEqualWorkspaceSize(desc, &workspace_size));
std::shared_ptr<Memory> workspace;
void *workspace_ptr = nullptr;
if (workspace_size != 0) {
workspace = context::allocateMemory(workspace_size);
workspace_ptr = workspace->data();
}
INFINICORE_CHECK_ERROR(infiniopEqual(
desc,
workspace_ptr,
workspace_size,
out->data(),
a->data(),
b->data(),
context::getStream()));
}
static bool registered = []() {
Equal::dispatcher().registerAll(&calculate, false);
return true;
}();
} // namespace infinicore::op::equal_impl::infiniop
#include "infinicore/ops/hardswish.hpp"
#include "../../utils.hpp"
#include <stdexcept>
namespace infinicore::op {
common::OpDispatcher<Hardswish::schema> &Hardswish::dispatcher() {
static common::OpDispatcher<Hardswish::schema> dispatcher_;
return dispatcher_;
}
void Hardswish::execute(Tensor output, Tensor input) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(output, input);
infinicore::context::setDevice(output->device());
auto device_type = output->device().getType();
auto func = dispatcher().lookup(device_type);
if (func == nullptr) {
throw std::runtime_error(
"No Hardswish implementation found for device type: " + std::to_string(static_cast<int>(device_type)));
}
func(output, input);
}
Tensor hardswish(Tensor input) {
auto output = Tensor::empty(input->shape(), input->dtype(), input->device());
hardswish_(output, input);
return output;
}
void hardswish_(Tensor output, Tensor input) {
Hardswish::execute(output, input);
}
} // namespace infinicore::op
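Hardswish is the usual x * relu6(x + 3) / 6; a NumPy reference for sanity-checking the kernel (sketch only):

import numpy as np

def hardswish_ref(x: np.ndarray) -> np.ndarray:
    # hardswish(x) = x * clip(x + 3, 0, 6) / 6
    return x * np.clip(x + 3.0, 0.0, 6.0) / 6.0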
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/hardswish.hpp"
#include <infiniop.h>
namespace infinicore::op::hardswish_impl::infiniop {
thread_local common::OpCache<size_t, infiniopHardSwishDescriptor_t> caches(
100,
[](infiniopHardSwishDescriptor_t &desc) {
if (desc != nullptr) {
INFINICORE_CHECK_ERROR(infiniopDestroyHardSwishDescriptor(desc));
desc = nullptr;
}
});
void calculate(Tensor output, Tensor input) {
size_t seed = hash_combine(output, input);
auto device = context::getDevice();
auto &cache = caches.getCache(device);
auto desc_opt = cache.get(seed);
infiniopHardSwishDescriptor_t desc = nullptr;
if (!desc_opt) {
INFINICORE_CHECK_ERROR(infiniopCreateHardSwishDescriptor(
context::getInfiniopHandle(device),
&desc,
output->desc(),
input->desc()));
cache.put(seed, desc);
} else {
desc = *desc_opt;
}
size_t workspace_size = 0;
INFINICORE_CHECK_ERROR(infiniopGetHardSwishWorkspaceSize(desc, &workspace_size));
std::shared_ptr<Memory> workspace;
void *workspace_ptr = nullptr;
if (workspace_size != 0) {
workspace = context::allocateMemory(workspace_size);
workspace_ptr = workspace->data();
}
INFINICORE_CHECK_ERROR(infiniopHardSwish(
desc,
workspace_ptr,
workspace_size,
output->data(),
input->data(),
context::getStream()));
}
static bool registered = []() {
Hardswish::dispatcher().registerAll(&calculate, false);
return true;
}();
} // namespace infinicore::op::hardswish_impl::infiniop
#include "infinicore/ops/hardtanh.hpp"
#include "../../utils.hpp"
#include <stdexcept>
namespace infinicore::op {
common::OpDispatcher<HardTanh::schema> &HardTanh::dispatcher() {
static common::OpDispatcher<HardTanh::schema> dispatcher_;
return dispatcher_;
}
void HardTanh::execute(Tensor output, Tensor input, float min_val, float max_val) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(output, input);
infinicore::context::setDevice(output->device());
auto device_type = output->device().getType();
auto func = dispatcher().lookup(device_type);
if (func == nullptr) {
throw std::runtime_error(
"No HardTanh implementation found for device type: " + std::to_string(static_cast<int>(device_type)));
}
func(output, input, min_val, max_val);
}
Tensor hardtanh(Tensor input, float min_val, float max_val) {
auto output = Tensor::empty(input->shape(), input->dtype(), input->device());
hardtanh_(output, input, min_val, max_val);
return output;
}
void hardtanh_(Tensor output, Tensor input, float min_val, float max_val) {
HardTanh::execute(output, input, min_val, max_val);
}
} // namespace infinicore::op
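HardTanh clamps to [min_val, max_val] (the bindings below default to -1.0 and 1.0); the NumPy equivalent is a one-liner:

import numpy as np

def hardtanh_ref(x: np.ndarray, min_val: float = -1.0, max_val: float = 1.0) -> np.ndarray:
    # hardtanh(x) = clamp(x, min_val, max_val)
    return np.clip(x, min_val, max_val)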
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/hardtanh.hpp"
#include <infiniop.h>
namespace infinicore::op::hardtanh_impl::infiniop {
thread_local common::OpCache<size_t, infiniopHardTanhDescriptor_t> caches(
100,
[](infiniopHardTanhDescriptor_t &desc) {
if (desc != nullptr) {
INFINICORE_CHECK_ERROR(infiniopDestroyHardTanhDescriptor(desc));
desc = nullptr;
}
});
void calculate(Tensor output, Tensor input, float min_val, float max_val) {
size_t seed = hash_combine(output, input, min_val, max_val);
auto device = context::getDevice();
auto &cache = caches.getCache(device);
auto desc_opt = cache.get(seed);
infiniopHardTanhDescriptor_t desc = nullptr;
if (!desc_opt) {
INFINICORE_CHECK_ERROR(infiniopCreateHardTanhDescriptor(
context::getInfiniopHandle(device),
&desc,
output->desc(),
input->desc(),
min_val,
max_val));
cache.put(seed, desc);
} else {
desc = *desc_opt;
}
size_t workspace_size = 0;
INFINICORE_CHECK_ERROR(infiniopGetHardTanhWorkspaceSize(desc, &workspace_size));
std::shared_ptr<Memory> workspace;
void *workspace_ptr = nullptr;
if (workspace_size != 0) {
workspace = context::allocateMemory(workspace_size);
workspace_ptr = workspace->data();
}
INFINICORE_CHECK_ERROR(infiniopHardTanh(
desc,
workspace_ptr,
workspace_size,
output->data(),
input->data(),
context::getStream()));
}
static bool registered = []() {
HardTanh::dispatcher().registerAll(&calculate, false);
return true;
}();
} // namespace infinicore::op::hardtanh_impl::infiniop
@@ -6,9 +6,14 @@
#include "ops/add_rms_norm.hpp"
#include "ops/all.hpp"
#include "ops/attention.hpp"
#include "ops/avg_pool1d.hpp"
#include "ops/causal_softmax.hpp"
#include "ops/cross_entropy.hpp"
#include "ops/embedding.hpp"
#include "ops/equal.hpp"
#include "ops/flash_attention.hpp"
#include "ops/hardswish.hpp"
#include "ops/hardtanh.hpp"
#include "ops/kv_caching.hpp"
#include "ops/linear.hpp"
#include "ops/linear_w8a8i8.hpp"
@@ -45,12 +50,16 @@ inline void bind(py::module &m) {
bind_matmul(m);
bind_mul(m);
bind_mha_varlen(m);
bind_hardswish(m);
bind_hardtanh(m);
bind_paged_attention(m);
bind_paged_attention_prefill(m);
bind_paged_caching(m);
bind_random_sample(m);
bind_cross_entropy(m);
bind_rearrange(m);
bind_rms_norm(m);
bind_avg_pool1d(m);
bind_silu(m);
bind_swiglu(m);
bind_rope(m);
@@ -62,6 +71,7 @@ inline void bind(py::module &m) {
bind_var(m);
bind_topk(m);
bind_all(m);
bind_equal(m);
}
} // namespace infinicore::ops
#pragma once
#include <optional>
#include <pybind11/pybind11.h>
#include "infinicore/ops/avg_pool1d.hpp"
namespace py = pybind11;
namespace infinicore::ops {
inline void bind_avg_pool1d(py::module &m) {
m.def(
"avg_pool1d",
[](::infinicore::Tensor input, size_t kernel_size, std::optional<size_t> stride, size_t padding) {
return op::avg_pool1d(input, kernel_size, stride.value_or(0), padding);
},
py::arg("input"),
py::arg("kernel_size"),
py::arg("stride") = py::none(),
py::arg("padding") = 0,
R"doc(AvgPool1d out-of-place.)doc");
m.def(
"avg_pool1d_",
[](::infinicore::Tensor output, ::infinicore::Tensor input, size_t kernel_size, std::optional<size_t> stride, size_t padding) {
op::avg_pool1d_(output, input, kernel_size, stride.value_or(0), padding);
},
py::arg("output"),
py::arg("input"),
py::arg("kernel_size"),
py::arg("stride") = py::none(),
py::arg("padding") = 0,
R"doc(AvgPool1d in-place variant writing to provided output tensor.)doc");
}
} // namespace infinicore::ops
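From Python the binding would be called as below; the keyword signature follows the py::arg declarations above, while the module name is an assumption:

import infinicore  # module name is an assumption

# x: an existing [N, C, L] tensor
y = infinicore.avg_pool1d(x, kernel_size=3)                       # stride=None -> stride = kernel_size
y = infinicore.avg_pool1d(x, kernel_size=3, stride=2, padding=1)
infinicore.avg_pool1d_(y, x, kernel_size=3, stride=2, padding=1)  # writes into a preallocated output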
#pragma once
#include <pybind11/pybind11.h>
#include "infinicore/ops/cross_entropy.hpp"
namespace py = pybind11;
namespace infinicore::ops {
inline void bind_cross_entropy(py::module &m) {
m.def("cross_entropy",
&op::cross_entropy,
py::arg("logits"),
py::arg("target"),
R"doc(Token-wise cross entropy loss without reduction.)doc");
m.def("cross_entropy_",
&op::cross_entropy_,
py::arg("loss"),
py::arg("logits"),
py::arg("target"),
R"doc(Write cross entropy loss into a provided tensor.)doc");
}
} // namespace infinicore::ops
#pragma once
#include <pybind11/pybind11.h>
#include "infinicore/ops/equal.hpp"
namespace py = pybind11;
namespace infinicore::ops {
inline void bind_equal(py::module &m) {
m.def("equal",
&op::equal,
py::arg("a"),
py::arg("b"),
R"doc(Elementwise equality returning a bool tensor.)doc");
m.def("equal_",
&op::equal_,
py::arg("out"),
py::arg("a"),
py::arg("b"),
R"doc(In-place elementwise equality writing into `out`.)doc");
}
} // namespace infinicore::ops
#pragma once
#include <pybind11/pybind11.h>
#include "infinicore/ops/hardswish.hpp"
namespace py = pybind11;
namespace infinicore::ops {
inline void bind_hardswish(py::module &m) {
m.def("hardswish",
&op::hardswish,
py::arg("input"),
R"doc(Out-of-place Hardswish activation.)doc");
m.def("hardswish_",
&op::hardswish_,
py::arg("output"),
py::arg("input"),
R"doc(In-place Hardswish activation.)doc");
}
} // namespace infinicore::ops
#pragma once
#include <pybind11/pybind11.h>
#include "infinicore/ops/hardtanh.hpp"
namespace py = pybind11;
namespace infinicore::ops {
inline void bind_hardtanh(py::module &m) {
m.def("hardtanh",
&op::hardtanh,
py::arg("input"),
py::arg("min_val") = -1.0f,
py::arg("max_val") = 1.0f,
R"doc(Apply the HardTanh activation.)doc");
m.def("hardtanh_",
&op::hardtanh_,
py::arg("output"),
py::arg("input"),
py::arg("min_val") = -1.0f,
py::arg("max_val") = 1.0f,
R"doc(In-place HardTanh activation.)doc");
}
} // namespace infinicore::ops
#ifndef __AVG_POOL1D_H__
#define __AVG_POOL1D_H__
#include "../../../utils.h"
#include "../../operator.h"
#include "../../tensor.h"
#include "infiniop/ops/avg_pool1d.h"
#define DESCRIPTOR(NAMESPACE) \
namespace op::avg_pool1d::NAMESPACE { \
class Descriptor final : public InfiniopDescriptor { \
struct Opaque; \
Opaque *_opaque; \
AvgPool1dInfo _info; \
size_t _workspace_size; \
\
Descriptor( \
AvgPool1dInfo info, \
size_t workspace_size_, \
Opaque *opaque, \
infiniDevice_t device_type, \
int device_id) \
: InfiniopDescriptor{device_type, device_id}, \
_opaque(opaque), \
_info(info), \
_workspace_size(workspace_size_) {} \
\
public: \
~Descriptor(); \
\
size_t workspaceSize() const { return _workspace_size; } \
\
static infiniStatus_t create( \
infiniopHandle_t handle, \
Descriptor **desc_ptr, \
infiniopTensorDescriptor_t y_desc, \
infiniopTensorDescriptor_t x_desc, \
size_t kernel_size, \
size_t stride, \
size_t padding); \
\
infiniStatus_t calculate( \
void *workspace, \
size_t workspace_size, \
void *y, \
const void *x, \
void *stream) const; \
}; \
}
class AvgPool1dInfo {
private:
AvgPool1dInfo() = default;
public:
infiniDtype_t dtype;
size_t batch, channels, in_width, out_width;
size_t kernel_size, stride, padding;
ptrdiff_t y_stride_batch, y_stride_channel, y_stride_width;
ptrdiff_t x_stride_batch, x_stride_channel, x_stride_width;
static utils::Result<AvgPool1dInfo> createAvgPool1dInfo(
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
size_t kernel_size,
size_t stride,
size_t padding) {
CHECK_OR_RETURN(y_desc != nullptr && x_desc != nullptr, INFINI_STATUS_NULL_POINTER);
const infiniDtype_t dtype = y_desc->dtype();
CHECK_OR_RETURN(dtype == x_desc->dtype(), INFINI_STATUS_BAD_TENSOR_DTYPE);
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
CHECK_OR_RETURN(y_desc->ndim() == 3 && x_desc->ndim() == 3, INFINI_STATUS_BAD_TENSOR_SHAPE);
size_t batch = x_desc->dim(0);
size_t channels = x_desc->dim(1);
size_t in_width = x_desc->dim(2);
CHECK_OR_RETURN(y_desc->dim(0) == batch, INFINI_STATUS_BAD_TENSOR_SHAPE);
CHECK_OR_RETURN(y_desc->dim(1) == channels, INFINI_STATUS_BAD_TENSOR_SHAPE);
size_t padded_len = in_width + 2 * padding;
CHECK_OR_RETURN(padded_len >= kernel_size, INFINI_STATUS_BAD_TENSOR_SHAPE);
size_t out_width = (padded_len - kernel_size) / stride + 1;
CHECK_OR_RETURN(y_desc->dim(2) == out_width, INFINI_STATUS_BAD_TENSOR_SHAPE);
return utils::Result<AvgPool1dInfo>(AvgPool1dInfo{
dtype,
batch, channels, in_width, out_width,
kernel_size, stride, padding,
y_desc->stride(0), y_desc->stride(1), y_desc->stride(2),
x_desc->stride(0), x_desc->stride(1), x_desc->stride(2)});
}
};
#endif
#include "avg_pool1d_cpu.h"
#include "../../../devices/cpu/common_cpu.h"
#include <algorithm>
namespace op::avg_pool1d::cpu {
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t x_desc,
size_t kernel_size,
size_t stride,
size_t padding) {
auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
auto info = AvgPool1dInfo::createAvgPool1dInfo(y_desc, x_desc, kernel_size, stride, padding);
CHECK_RESULT(info);
*desc_ptr = new Descriptor(
info.take(),
0,
nullptr,
handle->device,
handle->device_id);
return INFINI_STATUS_SUCCESS;
}
template <typename T>
infiniStatus_t calculateAvgPool1d(const AvgPool1dInfo &info,
T *y,
const T *x) {
const float inv_kernel = 1.0f / static_cast<float>(info.kernel_size);
#pragma omp parallel for
for (ptrdiff_t bc = 0; bc < ptrdiff_t(info.batch * info.channels); ++bc) {
const ptrdiff_t b = bc / ptrdiff_t(info.channels);
const ptrdiff_t c = bc % ptrdiff_t(info.channels);
// Offsets stay signed so negative strides cannot wrap through size_t.
const ptrdiff_t y_base = b * info.y_stride_batch + c * info.y_stride_channel;
const ptrdiff_t x_base = b * info.x_stride_batch + c * info.x_stride_channel;
for (size_t ow = 0; ow < info.out_width; ++ow) {
const ptrdiff_t y_offset = y_base + ptrdiff_t(ow) * info.y_stride_width;
// Cast padding/kernel_size explicitly so the window bounds stay signed.
const long long start_w = static_cast<long long>(ow * info.stride) - static_cast<long long>(info.padding);
const long long end_w = start_w + static_cast<long long>(info.kernel_size);
const long long valid_start = std::max(0LL, start_w);
const long long valid_end = std::min(static_cast<long long>(info.in_width), end_w);
float sum = 0.0f;
for (long long iw = valid_start; iw < valid_end; ++iw) {
const ptrdiff_t x_offset = x_base + ptrdiff_t(iw) * info.x_stride_width;
sum += utils::cast<float>(x[x_offset]);
}
const float avg = sum * inv_kernel;
y[y_offset] = utils::cast<T>(avg);
}
}
return INFINI_STATUS_SUCCESS;
}
#define CALCULATE(TDATA) calculateAvgPool1d(_info, (TDATA *)y, (const TDATA *)x)
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *y,
const void *x,
void *stream) const {
switch (_info.dtype) {
case INFINI_DTYPE_F16:
return CALCULATE(fp16_t);
case INFINI_DTYPE_BF16:
return CALCULATE(bf16_t);
case INFINI_DTYPE_F32:
return CALCULATE(float);
case INFINI_DTYPE_F64:
return CALCULATE(double);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
}
#undef CALCULATE
} // namespace op::avg_pool1d::cpu
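One semantic worth noting: the kernel sums only in-bounds elements (zero padding contributes nothing) but always divides by kernel_size, i.e. PyTorch's count_include_pad=True behaviour. A NumPy reference to test against, under that assumption:

import numpy as np

def avg_pool1d_ref(x: np.ndarray, kernel_size: int, stride: int, padding: int) -> np.ndarray:
    # x: [N, C, L]; zero-pad, then average with a fixed divisor of kernel_size,
    # matching the CPU kernel above
    xp = np.pad(x, ((0, 0), (0, 0), (padding, padding)))
    out_w = (xp.shape[-1] - kernel_size) // stride + 1
    out = np.empty(x.shape[:2] + (out_w,), dtype=x.dtype)
    for ow in range(out_w):
        s = ow * stride
        out[..., ow] = xp[..., s:s + kernel_size].sum(axis=-1) / kernel_size
    return out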
#ifndef __INFINIOP_AVG_POOL1D_CPU_H__
#define __INFINIOP_AVG_POOL1D_CPU_H__
#include "../avg_pool1d.h"
DESCRIPTOR(cpu)
#endif