Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
0fa8805e
"superbench/benchmarks/micro_benchmarks/gpu_stream.py" did not exist on "682b2c120dd3ebfcac3be72f9f9225c53abe5bbc"
Commit
0fa8805e
authored
Jan 09, 2026
by
PanZezhong
Browse files
issue/810 add common graph op macros
parent
006d530c
Changes
12
Hide whitespace changes
Inline
Side-by-side
Showing
12 changed files
with
151 additions
and
106 deletions
+151
-106
include/infinicore/graph/graph.hpp
include/infinicore/graph/graph.hpp
+47
-0
include/infinicore/ops/gemm.hpp
include/infinicore/ops/gemm.hpp
+1
-14
src/infinicore/context/allocators/device_pinned_allocator.cc
src/infinicore/context/allocators/device_pinned_allocator.cc
+6
-0
src/infinicore/context/allocators/host_allocator.cc
src/infinicore/context/allocators/host_allocator.cc
+6
-0
src/infinicore/context/allocators/pinnable_block_allocator.cc
...infinicore/context/allocators/pinnable_block_allocator.cc
+4
-1
src/infinicore/context/allocators/stream_ordered_allocator.cc
...infinicore/context/allocators/stream_ordered_allocator.cc
+6
-0
src/infinicore/ops/gemm/gemm.cc
src/infinicore/ops/gemm/gemm.cc
+3
-28
src/infinicore/ops/gemm/gemm_infiniop.cc
src/infinicore/ops/gemm/gemm_infiniop.cc
+8
-53
src/infinicore/ops/infiniop_impl.hpp
src/infinicore/ops/infiniop_impl.hpp
+50
-0
src/infinicore/ops/linear/linear.cc
src/infinicore/ops/linear/linear.cc
+11
-9
test/infinicore/framework/tensor.py
test/infinicore/framework/tensor.py
+6
-1
xmake.lua
xmake.lua
+3
-0
No files found.
include/infinicore/graph/graph.hpp
View file @
0fa8805e
...
...
@@ -43,3 +43,50 @@ protected:
friend
class
GraphManager
;
};
}
// namespace infinicore::graph
// Declares a graph-operator class named __OP_NAME__ whose constructor,
// execute(), and plan dispatcher all share the argument list given in
// __VA_ARGS__. run_schema / cleanup_schema (and run(), planned_meta_,
// runner_, deleter_) are inherited from graph::GraphOperator.
#define INFINICORE_GRAPH_OP_CLASS(__OP_NAME__, ...) \
    class __OP_NAME__ : public graph::GraphOperator { \
    public: \
        using schema = void (*)(__VA_ARGS__); \
        using plan_schema = void *(*)(__VA_ARGS__); \
        static common::OpDispatcher<plan_schema> &plan_dispatcher(); \
        static common::OpDispatcher<run_schema> &run_dispatcher(); \
        static common::OpDispatcher<cleanup_schema> &cleanup_dispatcher(); \
        __OP_NAME__(__VA_ARGS__); \
        static void execute(__VA_ARGS__); \
    };

// Defines the three per-op dispatcher singletons declared by
// INFINICORE_GRAPH_OP_CLASS (Meyers-singleton pattern: thread-safe
// initialization, one instance per process).
#define INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(__OP_NAME__) \
    common::OpDispatcher<__OP_NAME__::plan_schema> &__OP_NAME__::plan_dispatcher() { \
        static common::OpDispatcher<__OP_NAME__::plan_schema> dispatcher_; \
        return dispatcher_; \
    } \
    common::OpDispatcher<__OP_NAME__::run_schema> &__OP_NAME__::run_dispatcher() { \
        static common::OpDispatcher<__OP_NAME__::run_schema> dispatcher_; \
        return dispatcher_; \
    } \
    common::OpDispatcher<__OP_NAME__::cleanup_schema> &__OP_NAME__::cleanup_dispatcher() { \
        static common::OpDispatcher<__OP_NAME__::cleanup_schema> dispatcher_; \
        return dispatcher_; \
    }

// Resolves the plan/run/cleanup entry points for __DEVICE_TYPE__ and
// runs planning immediately; intended for use inside the op constructor.
// Wrapped in do/while(0) so it behaves as a single statement, and the
// device-type expression is evaluated exactly once.
#define INFINICORE_GRAPH_OP_DISPATCH(__DEVICE_TYPE__, ...) \
    do { \
        const auto device_type__ = (__DEVICE_TYPE__); \
        planned_meta_ = plan_dispatcher().lookup(device_type__)(__VA_ARGS__); \
        runner_ = run_dispatcher().lookup(device_type__); \
        deleter_ = cleanup_dispatcher().lookup(device_type__); \
    } while (0)

// Constructs the op and either records it into the current graph (when
// graph recording is active) or executes it eagerly. do/while(0) keeps
// this a single statement and keeps the temporary shared_ptr out of the
// caller's scope.
#define INFINICORE_GRAPH_OP_RECORD_OR_RUN(__OP_NAME__, ...) \
    do { \
        auto op__ = std::make_shared<__OP_NAME__>(__VA_ARGS__); \
        if (context::isGraphRecording()) { \
            context::addGraphOperator(op__); \
        } else { \
            op__->run(); \
        } \
    } while (0)

// Registers plan/run/cleanup implementations for every device type at
// static-initialization time. The flag name is pasted with the op name
// so several ops can be registered in the same translation unit without
// a symbol collision (a fixed name such as `registered` would clash).
#define INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(__OP_NAME__, __PLAN_F__, __RUN_F__, __CLEANUP_F__) \
    [[maybe_unused]] static const bool registered_##__OP_NAME__ = []() { \
        __OP_NAME__::plan_dispatcher().registerAll(__PLAN_F__, false); \
        __OP_NAME__::run_dispatcher().registerAll(__RUN_F__, false); \
        __OP_NAME__::cleanup_dispatcher().registerAll(__CLEANUP_F__, false); \
        return true; \
    }();
include/infinicore/ops/gemm.hpp
View file @
0fa8805e
...
...
@@ -6,20 +6,7 @@
namespace
infinicore
::
op
{
class
Gemm
:
public
graph
::
GraphOperator
{
public:
using
schema
=
void
(
*
)(
Tensor
,
Tensor
,
Tensor
,
float
,
float
);
using
plan_schema
=
void
*
(
*
)(
Tensor
,
Tensor
,
Tensor
,
float
,
float
);
Gemm
(
Tensor
c
,
Tensor
a
,
Tensor
b
,
float
alpha
,
float
beta
);
static
void
execute
(
Tensor
c
,
Tensor
a
,
Tensor
b
,
float
alpha
,
float
beta
);
static
common
::
OpDispatcher
<
schema
>
&
dispatcher
();
static
common
::
OpDispatcher
<
plan_schema
>
&
plan_dispatcher
();
static
common
::
OpDispatcher
<
run_schema
>
&
run_dispatcher
();
static
common
::
OpDispatcher
<
cleanup_schema
>
&
cleanup_dispatcher
();
};
INFINICORE_GRAPH_OP_CLASS
(
Gemm
,
Tensor
,
Tensor
,
Tensor
,
float
,
float
);
Tensor
gemm
(
Tensor
a
,
Tensor
b
,
float
alpha
=
1.0
f
,
float
beta
=
0.0
f
);
void
gemm_
(
Tensor
c
,
Tensor
a
,
Tensor
b
,
float
alpha
,
float
beta
);
...
...
src/infinicore/context/allocators/device_pinned_allocator.cc
View file @
0fa8805e
...
...
@@ -12,12 +12,18 @@ DevicePinnedHostAllocator::~DevicePinnedHostAllocator() {
}
std
::
byte
*
DevicePinnedHostAllocator
::
allocate
(
size_t
size
)
{
if
(
size
==
0
)
{
return
nullptr
;
}
void
*
ptr
;
INFINICORE_CHECK_ERROR
(
infinirtMallocHost
(
&
ptr
,
size
));
return
(
std
::
byte
*
)
ptr
;
}
void
DevicePinnedHostAllocator
::
deallocate
(
std
::
byte
*
ptr
)
{
if
(
ptr
==
nullptr
)
{
return
;
}
if
(
owner_
==
context
::
getDevice
())
{
INFINICORE_CHECK_ERROR
(
infinirtFreeHost
(
ptr
));
gc
();
...
...
src/infinicore/context/allocators/host_allocator.cc
View file @
0fa8805e
...
...
@@ -4,10 +4,16 @@
namespace
infinicore
{
std
::
byte
*
HostAllocator
::
allocate
(
size_t
size
)
{
if
(
size
==
0
)
{
return
nullptr
;
}
return
(
std
::
byte
*
)
std
::
malloc
(
size
);
}
void
HostAllocator
::
deallocate
(
std
::
byte
*
ptr
)
{
if
(
ptr
==
nullptr
)
{
return
;
}
std
::
free
(
ptr
);
}
...
...
src/infinicore/context/allocators/pinnable_block_allocator.cc
View file @
0fa8805e
...
...
@@ -37,6 +37,9 @@ PinnableBlockAllocator::PinnableBlockAllocator(Device device)
// ------------------- allocate -------------------
std
::
byte
*
PinnableBlockAllocator
::
allocate
(
size_t
size
)
{
if
(
size
==
0
)
{
return
nullptr
;
}
std
::
lock_guard
<
std
::
mutex
>
lock
(
mutex_
);
// Align size to 256 bytes for GPU
...
...
@@ -94,7 +97,7 @@ std::byte *PinnableBlockAllocator::allocate(size_t size) {
// ------------------- deallocate -------------------
void
PinnableBlockAllocator
::
deallocate
(
std
::
byte
*
ptr
)
{
if
(
!
ptr
)
{
if
(
ptr
==
null
ptr
)
{
return
;
}
...
...
src/infinicore/context/allocators/stream_ordered_allocator.cc
View file @
0fa8805e
...
...
@@ -8,12 +8,18 @@ namespace infinicore {
StreamOrderedAllocator
::
StreamOrderedAllocator
(
Device
device
)
:
MemoryAllocator
(),
device_
(
device
)
{}
std
::
byte
*
StreamOrderedAllocator
::
allocate
(
size_t
size
)
{
if
(
size
==
0
)
{
return
nullptr
;
}
void
*
ptr
=
nullptr
;
INFINICORE_CHECK_ERROR
(
infinirtMallocAsync
(
&
ptr
,
size
,
context
::
getStream
()));
return
(
std
::
byte
*
)
ptr
;
}
void
StreamOrderedAllocator
::
deallocate
(
std
::
byte
*
ptr
)
{
if
(
ptr
==
nullptr
)
{
return
;
}
INFINICORE_CHECK_ERROR
(
infinirtFreeAsync
(
ptr
,
context
::
getStream
()));
}
}
// namespace infinicore
src/infinicore/ops/gemm/gemm.cc
View file @
0fa8805e
...
...
@@ -3,40 +3,15 @@
#include "../../utils.hpp"
namespace
infinicore
::
op
{
common
::
OpDispatcher
<
Gemm
::
schema
>
&
Gemm
::
dispatcher
()
{
static
common
::
OpDispatcher
<
Gemm
::
schema
>
dispatcher_
;
return
dispatcher_
;
};
common
::
OpDispatcher
<
Gemm
::
plan_schema
>
&
Gemm
::
plan_dispatcher
()
{
static
common
::
OpDispatcher
<
Gemm
::
plan_schema
>
dispatcher_
;
return
dispatcher_
;
}
common
::
OpDispatcher
<
Gemm
::
run_schema
>
&
Gemm
::
run_dispatcher
()
{
static
common
::
OpDispatcher
<
Gemm
::
run_schema
>
dispatcher_
;
return
dispatcher_
;
}
common
::
OpDispatcher
<
Gemm
::
cleanup_schema
>
&
Gemm
::
cleanup_dispatcher
()
{
static
common
::
OpDispatcher
<
Gemm
::
cleanup_schema
>
dispatcher_
;
return
dispatcher_
;
}
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL
(
Gemm
);
Gemm
::
Gemm
(
Tensor
c
,
Tensor
a
,
Tensor
b
,
float
alpha
,
float
beta
)
{
INFINICORE_ASSERT_TENSORS_SAME_DEVICE
(
c
,
a
,
b
);
planned_meta_
=
plan_dispatcher
().
lookup
(
c
->
device
().
getType
())(
c
,
a
,
b
,
alpha
,
beta
);
runner_
=
run_dispatcher
().
lookup
(
c
->
device
().
getType
());
deleter_
=
cleanup_dispatcher
().
lookup
(
c
->
device
().
getType
());
INFINICORE_GRAPH_OP_DISPATCH
(
c
->
device
().
getType
(),
c
,
a
,
b
,
alpha
,
beta
);
}
void
Gemm
::
execute
(
Tensor
c
,
Tensor
a
,
Tensor
b
,
float
alpha
,
float
beta
)
{
auto
op
=
std
::
make_shared
<
Gemm
>
(
c
,
a
,
b
,
alpha
,
beta
);
if
(
context
::
isGraphRecording
())
{
context
::
addGraphOperator
(
op
);
}
else
{
op
->
run
();
}
INFINICORE_GRAPH_OP_RECORD_OR_RUN
(
Gemm
,
c
,
a
,
b
,
alpha
,
beta
);
}
Tensor
gemm
(
Tensor
a
,
Tensor
b
,
float
alpha
,
float
beta
)
{
...
...
src/infinicore/ops/gemm/gemm_infiniop.cc
View file @
0fa8805e
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "../infiniop_impl.hpp"
#include "infinicore/ops/gemm.hpp"
#include <infiniop.h>
namespace
infinicore
::
op
::
gemm_impl
::
infiniop
{
// A desc holder to make it a shared pointer that can auto clean-up
struct
Descriptor
{
infiniopGemmDescriptor_t
desc
;
Descriptor
(
infiniopGemmDescriptor_t
desc
)
:
desc
(
desc
)
{}
~
Descriptor
()
{
if
(
desc
!=
nullptr
)
{
infiniopDestroyGemmDescriptor
(
desc
);
desc
=
nullptr
;
}
}
};
thread_local
common
::
OpCache
<
size_t
,
std
::
shared_ptr
<
Descriptor
>>
caches
(
// capacity
100
,
// on evict
[](
std
::
shared_ptr
<
Descriptor
>
&
desc
)
{
desc
=
nullptr
;
});
INFINIOP_CACHABLE_DESCRIPTOR
(
Descriptor
,
Gemm
,
100
);
struct
PlannedMeta
{
std
::
shared_ptr
<
Descriptor
>
descriptor
;
...
...
@@ -33,25 +12,13 @@ struct PlannedMeta {
};
void
*
plan
(
Tensor
c
,
Tensor
a
,
Tensor
b
,
float
alpha
,
float
beta
)
{
size_t
seed
=
hash_combine
(
c
,
b
,
a
,
alpha
,
beta
);
auto
device
=
context
::
getDevice
();
auto
&
cache
=
caches
.
getCache
(
device
);
auto
descriptor
=
cache
.
get
(
seed
).
value_or
(
nullptr
);
size_t
seed
=
hash_combine
(
c
,
a
,
b
);
if
(
!
descriptor
)
{
descriptor
=
std
::
make_shared
<
Descriptor
>
(
nullptr
);
INFINICORE_CHECK_ERROR
(
infiniopCreateGemmDescriptor
(
context
::
getInfiniopHandle
(
device
),
&
descriptor
->
desc
,
c
->
desc
(),
a
->
desc
(),
b
->
desc
()));
cache
.
put
(
seed
,
descriptor
);
}
INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE
(
Descriptor
,
descriptor
,
Gemm
,
seed
,
c
->
desc
(),
a
->
desc
(),
b
->
desc
());
size_t
workspace_size
=
0
;
INFINICORE_CHECK_ERROR
(
infiniopGetGemmWorkspaceSize
(
descriptor
->
desc
,
&
workspace_size
));
Tensor
workspace
=
Tensor
::
empty
({
workspace_size
},
DataType
::
U8
,
device
);
INFINIOP_WORKSPACE_TENSOR
(
workspace
,
Gemm
,
descriptor
);
auto
planned
=
new
PlannedMeta
{
descriptor
,
...
...
@@ -77,18 +44,6 @@ void cleanup(void **planned_meta_ptr) {
*
planned_meta_ptr
=
nullptr
;
}
void
calculate
(
Tensor
c
,
Tensor
a
,
Tensor
b
,
float
alpha
,
float
beta
)
{
auto
planned
=
plan
(
c
,
a
,
b
,
alpha
,
beta
);
run
(
planned
);
cleanup
(
&
planned
);
}
static
bool
registered
=
[]()
{
Gemm
::
dispatcher
().
registerAll
(
&
calculate
,
false
);
Gemm
::
plan_dispatcher
().
registerAll
(
&
plan
,
false
);
Gemm
::
run_dispatcher
().
registerAll
(
&
run
,
false
);
Gemm
::
cleanup_dispatcher
().
registerAll
(
&
cleanup
,
false
);
return
true
;
}();
INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE
(
Gemm
,
&
plan
,
&
run
,
&
cleanup
);
}
// namespace infinicore::op::gemm_impl::infiniop
src/infinicore/ops/infiniop_impl.hpp
0 → 100644
View file @
0fa8805e
#pragma once
#include "../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include <infiniop.h>
// Defines a RAII holder named __DESC_TYPE__ for an infiniop descriptor of
// op __OP_NAME__, plus a thread_local per-device cache of capacity
// __SIZE__ holding shared_ptr<__DESC_TYPE__>.
//
// BUGFIX: the constructor/destructor previously hardcoded the name
// `Descriptor`, so the macro only compiled when __DESC_TYPE__ was
// literally `Descriptor`; they now paste __DESC_TYPE__. Copying is
// deleted because a copy would double-destroy the raw handle.
//
// NOTE: the cache variable is intentionally named `caches` —
// INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE refers to it by that name.
// Use this macro at most once per namespace.
#define INFINIOP_CACHABLE_DESCRIPTOR(__DESC_TYPE__, __OP_NAME__, __SIZE__) \
    struct __DESC_TYPE__ { \
        infiniop##__OP_NAME__##Descriptor_t desc; \
        explicit __DESC_TYPE__(infiniop##__OP_NAME__##Descriptor_t desc) : desc(desc) {} \
        __DESC_TYPE__(const __DESC_TYPE__ &) = delete; \
        __DESC_TYPE__ &operator=(const __DESC_TYPE__ &) = delete; \
        ~__DESC_TYPE__() { \
            if (desc != nullptr) { \
                infiniopDestroy##__OP_NAME__##Descriptor(desc); \
                desc = nullptr; \
            } \
        } \
    }; \
    \
    thread_local common::OpCache<size_t, std::shared_ptr<__DESC_TYPE__>> \
        caches( \
            __SIZE__, \
            /* on evict: drop the shared_ptr; the holder's dtor frees the handle */ \
            [](std::shared_ptr<__DESC_TYPE__> &desc) { \
                desc = nullptr; \
            });

// Declares `std::shared_ptr<__DESC_TYPE__> __DESC_NAME__` and fills it
// from the per-device `caches` entry for __HASH_KEY__, creating (and
// caching) a fresh descriptor via
// infiniopCreate<__INFINIOP_NAME__>Descriptor(handle, &desc, __VA_ARGS__)
// on a miss. Requires a prior INFINIOP_CACHABLE_DESCRIPTOR in scope.
#define INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(__DESC_TYPE__, __DESC_NAME__, __INFINIOP_NAME__, __HASH_KEY__, ...) \
    std::shared_ptr<__DESC_TYPE__> __DESC_NAME__; \
    { \
        auto device__ = context::getDevice(); \
        auto &cache__ = caches.getCache(device__); \
        __DESC_NAME__ = cache__.get(__HASH_KEY__).value_or(nullptr); \
        if (!__DESC_NAME__) { \
            __DESC_NAME__ = std::make_shared<__DESC_TYPE__>(nullptr); \
            INFINICORE_CHECK_ERROR(infiniopCreate##__INFINIOP_NAME__##Descriptor( \
                context::getInfiniopHandle(device__), \
                &__DESC_NAME__->desc, \
                __VA_ARGS__)); \
            cache__.put(__HASH_KEY__, __DESC_NAME__); \
        } \
    }

// Declares `Tensor __TENSOR_NAME__` sized to the workspace reported by
// infiniopGet<__INFINIOP_NAME__>WorkspaceSize for __DESC_NAME__, as a U8
// buffer on the current device (possibly zero-length).
#define INFINIOP_WORKSPACE_TENSOR(__TENSOR_NAME__, __INFINIOP_NAME__, __DESC_NAME__) \
    Tensor __TENSOR_NAME__; \
    { \
        auto device__ = context::getDevice(); \
        size_t workspace_size = 0; \
        INFINICORE_CHECK_ERROR(infiniopGet##__INFINIOP_NAME__##WorkspaceSize(__DESC_NAME__->desc, &workspace_size)); \
        __TENSOR_NAME__ = Tensor::empty({workspace_size}, DataType::U8, device__); \
    }
src/infinicore/ops/linear/linear.cc
View file @
0fa8805e
#include "infinicore/ops/linear.hpp"
#include "infinicore/ops/
add
.hpp"
#include "infinicore/ops/
matmul
.hpp"
#include "infinicore/ops/
gemm
.hpp"
#include "infinicore/ops/
rearrange
.hpp"
namespace
infinicore
::
op
{
...
...
@@ -42,16 +42,18 @@ void linear_(Tensor out,
// linear transformation
Tensor
out_view
=
out
->
view
({
N
,
out_features
});
matmul_
(
out_view
,
input
->
view
({
N
,
in_features
}),
weight
->
permute
({
1
,
0
}));
// Add bias
float
alpha
=
1.0
f
;
float
beta
=
0.0
f
;
if
(
bias
.
has_value
())
{
add
_
(
out_view
,
out_view
,
bias
.
value
()
->
as_strided
({
N
,
out_features
},
{
0
,
1
}))
;
rearrange
_
(
out_view
,
bias
.
value
()
->
as_strided
({
N
,
out_features
},
{
0
,
1
}));
beta
=
1.0
f
;
}
gemm_
(
out_view
,
input
->
view
({
N
,
in_features
}),
weight
->
permute
({
1
,
0
}),
alpha
,
beta
);
}
}
// namespace infinicore::op
test/infinicore/framework/tensor.py
View file @
0fa8805e
...
...
@@ -60,7 +60,12 @@ class TensorInitializer:
# Handle real floating-point types
if
mode
==
TensorInitializer
.
RANDOM
:
return
torch
.
rand
(
shape
,
dtype
=
torch_dtype
,
device
=
torch_device_str
)
scale
=
kwargs
.
get
(
"scale"
,
1.0
)
bias
=
kwargs
.
get
(
"bias"
,
0.0
)
return
(
torch
.
rand
(
shape
,
dtype
=
torch_dtype
,
device
=
torch_device_str
)
*
scale
+
bias
)
elif
mode
==
TensorInitializer
.
ZEROS
:
return
torch
.
zeros
(
shape
,
dtype
=
torch_dtype
,
device
=
torch_device_str
)
elif
mode
==
TensorInitializer
.
ONES
:
...
...
xmake.lua
View file @
0fa8805e
...
...
@@ -268,6 +268,9 @@ target("infinirt")
add_deps
(
"infinirt-hygon"
)
end
set_languages
(
"cxx17"
)
if
not
is_plat
(
"windows"
)
then
add_cxflags
(
"-fPIC"
)
end
set_installdir
(
os.getenv
(
"INFINI_ROOT"
)
or
(
os.getenv
(
is_host
(
"windows"
)
and
"HOMEPATH"
or
"HOME"
)
..
"/.infini"
))
add_files
(
"src/infinirt/*.cc"
)
add_installfiles
(
"include/infinirt.h"
,
{
prefixdir
=
"include"
})
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment