Commit 0fa8805e authored by PanZezhong

issue/810 add common graph op macros

parent 006d530c
@@ -43,3 +43,50 @@ protected:
friend class GraphManager;
};
} // namespace infinicore::graph
#define INFINICORE_GRAPH_OP_CLASS(__OP_NAME__, ...) \
class __OP_NAME__ : public graph::GraphOperator { \
public: \
using schema = void (*)(__VA_ARGS__); \
using plan_schema = void *(*)(__VA_ARGS__); \
static common::OpDispatcher<plan_schema> &plan_dispatcher(); \
static common::OpDispatcher<run_schema> &run_dispatcher(); \
static common::OpDispatcher<cleanup_schema> &cleanup_dispatcher(); \
__OP_NAME__(__VA_ARGS__); \
static void execute(__VA_ARGS__); \
};
#define INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(__OP_NAME__) \
common::OpDispatcher<__OP_NAME__::plan_schema> &__OP_NAME__::plan_dispatcher() { \
static common::OpDispatcher<__OP_NAME__::plan_schema> dispatcher_; \
return dispatcher_; \
} \
common::OpDispatcher<__OP_NAME__::run_schema> &__OP_NAME__::run_dispatcher() { \
static common::OpDispatcher<__OP_NAME__::run_schema> dispatcher_; \
return dispatcher_; \
} \
common::OpDispatcher<__OP_NAME__::cleanup_schema> &__OP_NAME__::cleanup_dispatcher() { \
static common::OpDispatcher<__OP_NAME__::cleanup_schema> dispatcher_; \
return dispatcher_; \
}
#define INFINICORE_GRAPH_OP_DISPATCH(__DEVICE_TYPE__, ...) \
planned_meta_ = plan_dispatcher().lookup(__DEVICE_TYPE__)(__VA_ARGS__); \
runner_ = run_dispatcher().lookup(__DEVICE_TYPE__); \
deleter_ = cleanup_dispatcher().lookup(__DEVICE_TYPE__);
#define INFINICORE_GRAPH_OP_RECORD_OR_RUN(__OP_NAME__, ...) \
auto op = std::make_shared<__OP_NAME__>(__VA_ARGS__); \
if (context::isGraphRecording()) { \
context::addGraphOperator(op); \
} else { \
op->run(); \
}
#define INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(__OP_NAME__, __PLAN_F__, __RUN_F__, __CLEANUP_F__) \
static bool registered = []() { \
__OP_NAME__::plan_dispatcher().registerAll(__PLAN_F__, false); \
__OP_NAME__::run_dispatcher().registerAll(__RUN_F__, false); \
__OP_NAME__::cleanup_dispatcher().registerAll(__CLEANUP_F__, false); \
return true; \
}();
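Taken together, these macros replace the per-operator boilerplate for declaring the op class, defining its dispatchers, planning on construction, recording-or-running on execute, and registering a backend. A condensed sketch of how the pieces fit together, restating the Gemm changes from this commit with comments on what each macro provides (the backend's plan/run/cleanup functions are elided; run_schema and cleanup_schema are expected to come from the GraphOperator base):

// Op header: declares class Gemm with a constructor, execute(), and the plan/run/cleanup dispatchers.
INFINICORE_GRAPH_OP_CLASS(Gemm, Tensor, Tensor, Tensor, float, float);

// Op source: defines the three dispatcher singletons, then uses them in the ctor and execute().
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(Gemm);
Gemm::Gemm(Tensor c, Tensor a, Tensor b, float alpha, float beta) {
    // Looks up plan/run/cleanup for the device and fills planned_meta_, runner_, deleter_.
    INFINICORE_GRAPH_OP_DISPATCH(c->device().getType(), c, a, b, alpha, beta);
}
void Gemm::execute(Tensor c, Tensor a, Tensor b, float alpha, float beta) {
    // Adds the op to the graph while recording, otherwise runs it immediately.
    INFINICORE_GRAPH_OP_RECORD_OR_RUN(Gemm, c, a, b, alpha, beta);
}

// Backend source: registers one plan/run/cleanup triple for every device type.
INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(Gemm, &plan, &run, &cleanup);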
@@ -6,20 +6,7 @@
namespace infinicore::op {
-class Gemm : public graph::GraphOperator {
-public:
-using schema = void (*)(Tensor, Tensor, Tensor, float, float);
-using plan_schema = void *(*)(Tensor, Tensor, Tensor, float, float);
-Gemm(Tensor c, Tensor a, Tensor b, float alpha, float beta);
-static void execute(Tensor c, Tensor a, Tensor b, float alpha, float beta);
-static common::OpDispatcher<schema> &dispatcher();
-static common::OpDispatcher<plan_schema> &plan_dispatcher();
-static common::OpDispatcher<run_schema> &run_dispatcher();
-static common::OpDispatcher<cleanup_schema> &cleanup_dispatcher();
-};
+INFINICORE_GRAPH_OP_CLASS(Gemm, Tensor, Tensor, Tensor, float, float);
Tensor gemm(Tensor a, Tensor b, float alpha = 1.0f, float beta = 0.0f);
void gemm_(Tensor c, Tensor a, Tensor b, float alpha, float beta);
...
@@ -12,12 +12,18 @@ DevicePinnedHostAllocator::~DevicePinnedHostAllocator() {
}
std::byte *DevicePinnedHostAllocator::allocate(size_t size) {
if (size == 0) {
return nullptr;
}
void *ptr;
INFINICORE_CHECK_ERROR(infinirtMallocHost(&ptr, size));
return (std::byte *)ptr;
}
void DevicePinnedHostAllocator::deallocate(std::byte *ptr) {
if (ptr == nullptr) {
return;
}
if (owner_ == context::getDevice()) {
INFINICORE_CHECK_ERROR(infinirtFreeHost(ptr));
gc();
...
@@ -4,10 +4,16 @@
namespace infinicore {
std::byte *HostAllocator::allocate(size_t size) {
if (size == 0) {
return nullptr;
}
return (std::byte *)std::malloc(size);
}
void HostAllocator::deallocate(std::byte *ptr) {
if (ptr == nullptr) {
return;
}
std::free(ptr);
}
...
@@ -37,6 +37,9 @@ PinnableBlockAllocator::PinnableBlockAllocator(Device device)
// ------------------- allocate -------------------
std::byte *PinnableBlockAllocator::allocate(size_t size) {
if (size == 0) {
return nullptr;
}
std::lock_guard<std::mutex> lock(mutex_);
// Align size to 256 bytes for GPU
@@ -94,7 +97,7 @@ std::byte *PinnableBlockAllocator::allocate(size_t size) {
// ------------------- deallocate -------------------
void PinnableBlockAllocator::deallocate(std::byte *ptr) {
-if (!ptr) {
+if (ptr == nullptr) {
return;
}
...
@@ -8,12 +8,18 @@ namespace infinicore {
StreamOrderedAllocator::StreamOrderedAllocator(Device device) : MemoryAllocator(), device_(device) {}
std::byte *StreamOrderedAllocator::allocate(size_t size) {
if (size == 0) {
return nullptr;
}
void *ptr = nullptr;
INFINICORE_CHECK_ERROR(infinirtMallocAsync(&ptr, size, context::getStream()));
return (std::byte *)ptr;
}
void StreamOrderedAllocator::deallocate(std::byte *ptr) {
if (ptr == nullptr) {
return;
}
INFINICORE_CHECK_ERROR(infinirtFreeAsync(ptr, context::getStream()));
}
} // namespace infinicore
@@ -3,40 +3,15 @@
#include "../../utils.hpp"
namespace infinicore::op {
-common::OpDispatcher<Gemm::schema> &Gemm::dispatcher() {
-static common::OpDispatcher<Gemm::schema> dispatcher_;
-return dispatcher_;
-};
-common::OpDispatcher<Gemm::plan_schema> &Gemm::plan_dispatcher() {
-static common::OpDispatcher<Gemm::plan_schema> dispatcher_;
-return dispatcher_;
-}
-common::OpDispatcher<Gemm::run_schema> &Gemm::run_dispatcher() {
-static common::OpDispatcher<Gemm::run_schema> dispatcher_;
-return dispatcher_;
-}
-common::OpDispatcher<Gemm::cleanup_schema> &Gemm::cleanup_dispatcher() {
-static common::OpDispatcher<Gemm::cleanup_schema> dispatcher_;
-return dispatcher_;
-}
+INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(Gemm);
Gemm::Gemm(Tensor c, Tensor a, Tensor b, float alpha, float beta) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(c, a, b);
-planned_meta_ = plan_dispatcher().lookup(c->device().getType())(c, a, b, alpha, beta);
-runner_ = run_dispatcher().lookup(c->device().getType());
-deleter_ = cleanup_dispatcher().lookup(c->device().getType());
+INFINICORE_GRAPH_OP_DISPATCH(c->device().getType(), c, a, b, alpha, beta);
}
void Gemm::execute(Tensor c, Tensor a, Tensor b, float alpha, float beta) {
-auto op = std::make_shared<Gemm>(c, a, b, alpha, beta);
-if (context::isGraphRecording()) {
-context::addGraphOperator(op);
-} else {
-op->run();
-}
+INFINICORE_GRAPH_OP_RECORD_OR_RUN(Gemm, c, a, b, alpha, beta);
}
Tensor gemm(Tensor a, Tensor b, float alpha, float beta) {
...
#include "../../utils.hpp" #include "../infiniop_impl.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/gemm.hpp" #include "infinicore/ops/gemm.hpp"
#include <infiniop.h>
namespace infinicore::op::gemm_impl::infiniop { namespace infinicore::op::gemm_impl::infiniop {
// A desc holder to make it a shared pointer that can auto clean-up
struct Descriptor {
infiniopGemmDescriptor_t desc;
Descriptor(infiniopGemmDescriptor_t desc) : desc(desc) {}
~Descriptor() {
if (desc != nullptr) {
infiniopDestroyGemmDescriptor(desc);
desc = nullptr;
}
}
};
thread_local common::OpCache<size_t, std::shared_ptr<Descriptor>> INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, Gemm, 100);
caches(
// capacity
100,
// on evict
[](std::shared_ptr<Descriptor> &desc) {
desc = nullptr;
});
struct PlannedMeta { struct PlannedMeta {
std::shared_ptr<Descriptor> descriptor; std::shared_ptr<Descriptor> descriptor;
...@@ -33,25 +12,13 @@ struct PlannedMeta { ...@@ -33,25 +12,13 @@ struct PlannedMeta {
}; };
void *plan(Tensor c, Tensor a, Tensor b, float alpha, float beta) { void *plan(Tensor c, Tensor a, Tensor b, float alpha, float beta) {
size_t seed = hash_combine(c, b, a, alpha, beta); size_t seed = hash_combine(c, a, b);
auto device = context::getDevice();
auto &cache = caches.getCache(device);
auto descriptor = cache.get(seed).value_or(nullptr);
if (!descriptor) { INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
descriptor = std::make_shared<Descriptor>(nullptr); Descriptor, descriptor, Gemm,
INFINICORE_CHECK_ERROR(infiniopCreateGemmDescriptor( seed, c->desc(), a->desc(), b->desc());
context::getInfiniopHandle(device),
&descriptor->desc,
c->desc(), a->desc(), b->desc()));
cache.put(seed, descriptor);
}
size_t workspace_size = 0; INFINIOP_WORKSPACE_TENSOR(workspace, Gemm, descriptor);
INFINICORE_CHECK_ERROR(infiniopGetGemmWorkspaceSize(descriptor->desc, &workspace_size));
Tensor workspace = Tensor::empty({workspace_size}, DataType::U8, device);
auto planned = new PlannedMeta{ auto planned = new PlannedMeta{
descriptor, descriptor,
...@@ -77,18 +44,6 @@ void cleanup(void **planned_meta_ptr) { ...@@ -77,18 +44,6 @@ void cleanup(void **planned_meta_ptr) {
*planned_meta_ptr = nullptr; *planned_meta_ptr = nullptr;
} }
void calculate(Tensor c, Tensor a, Tensor b, float alpha, float beta) { INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(Gemm, &plan, &run, &cleanup);
auto planned = plan(c, a, b, alpha, beta);
run(planned);
cleanup(&planned);
}
static bool registered = []() {
Gemm::dispatcher().registerAll(&calculate, false);
Gemm::plan_dispatcher().registerAll(&plan, false);
Gemm::run_dispatcher().registerAll(&run, false);
Gemm::cleanup_dispatcher().registerAll(&cleanup, false);
return true;
}();
} // namespace infinicore::op::gemm_impl::infiniop } // namespace infinicore::op::gemm_impl::infiniop
#pragma once
#include "../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include <infiniop.h>
#define INFINIOP_CACHABLE_DESCRIPTOR(__DESC_TYPE__, __OP_NAME__, __SIZE__) \
struct __DESC_TYPE__ { \
infiniop##__OP_NAME__##Descriptor_t desc; \
__DESC_TYPE__(infiniop##__OP_NAME__##Descriptor_t desc) : desc(desc) {} \
~__DESC_TYPE__() { \
if (desc != nullptr) { \
infiniopDestroy##__OP_NAME__##Descriptor(desc); \
desc = nullptr; \
} \
} \
}; \
\
thread_local common::OpCache<size_t, std::shared_ptr<__DESC_TYPE__>> \
caches( \
__SIZE__, \
[](std::shared_ptr<__DESC_TYPE__> &desc) { \
desc = nullptr; \
});
#define INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(__DESC_TYPE__, __DESC_NAME__, __INFINIOP_NAME__, __HASH_KEY__, ...) \
std::shared_ptr<__DESC_TYPE__> __DESC_NAME__; \
{ \
auto device__ = context::getDevice(); \
auto &cache__ = caches.getCache(device__); \
__DESC_NAME__ = cache__.get(__HASH_KEY__).value_or(nullptr); \
if (!__DESC_NAME__) { \
__DESC_NAME__ = std::make_shared<__DESC_TYPE__>(nullptr); \
INFINICORE_CHECK_ERROR(infiniopCreate##__INFINIOP_NAME__##Descriptor( \
context::getInfiniopHandle(device__), \
&__DESC_NAME__->desc, \
__VA_ARGS__)); \
cache__.put(__HASH_KEY__, __DESC_NAME__); \
} \
}
#define INFINIOP_WORKSPACE_TENSOR(__TENSOR_NAME__, __INFINIOP_NAME__, __DESC_NAME__) \
Tensor __TENSOR_NAME__; \
{ \
auto device__ = context::getDevice(); \
size_t workspace_size = 0; \
INFINICORE_CHECK_ERROR(infiniopGet##__INFINIOP_NAME__##WorkspaceSize(__DESC_NAME__->desc, &workspace_size)); \
__TENSOR_NAME__ = Tensor::empty({workspace_size}, DataType::U8, device__); \
}
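These helpers capture the descriptor-cache pattern the Gemm backend above now relies on. A condensed sketch of the intended plan() shape, restating the Gemm usage from this commit with comments on what each macro expands to (the PlannedMeta packaging at the end is abbreviated, as it is in the diff):

// Defines the RAII `Descriptor` wrapper around infiniopGemmDescriptor_t plus a
// thread_local per-device OpCache named `caches` holding up to 100 descriptors.
INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, Gemm, 100);

void *plan(Tensor c, Tensor a, Tensor b, float alpha, float beta) {
    size_t seed = hash_combine(c, a, b);
    // Looks `seed` up in the per-device cache; on a miss it calls
    // infiniopCreateGemmDescriptor(handle, &desc, c->desc(), a->desc(), b->desc())
    // and caches the result, leaving a shared_ptr<Descriptor> named `descriptor`.
    INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
        Descriptor, descriptor, Gemm,
        seed, c->desc(), a->desc(), b->desc());
    // Queries infiniopGetGemmWorkspaceSize and allocates a DataType::U8 tensor named `workspace`.
    INFINIOP_WORKSPACE_TENSOR(workspace, Gemm, descriptor);
    // ... bundle descriptor, workspace, and the call arguments into a new PlannedMeta and return it ...
}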
#include "infinicore/ops/linear.hpp" #include "infinicore/ops/linear.hpp"
#include "infinicore/ops/add.hpp" #include "infinicore/ops/gemm.hpp"
#include "infinicore/ops/matmul.hpp" #include "infinicore/ops/rearrange.hpp"
namespace infinicore::op { namespace infinicore::op {
...@@ -42,16 +42,18 @@ void linear_(Tensor out, ...@@ -42,16 +42,18 @@ void linear_(Tensor out,
// linear transformation // linear transformation
Tensor out_view = out->view({N, out_features}); Tensor out_view = out->view({N, out_features});
matmul_(out_view,
input->view({N, in_features}),
weight->permute({1, 0}));
// Add bias // Add bias
float alpha = 1.0f;
float beta = 0.0f;
if (bias.has_value()) { if (bias.has_value()) {
add_(out_view, rearrange_(out_view,
out_view,
bias.value()->as_strided({N, out_features}, {0, 1})); bias.value()->as_strided({N, out_features}, {0, 1}));
beta = 1.0f;
} }
gemm_(out_view,
input->view({N, in_features}),
weight->permute({1, 0}), alpha, beta);
} }
} // namespace infinicore::op } // namespace infinicore::op
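The rewritten linear_ folds the bias addition into the GEMM: when a bias is present it is first broadcast-copied into the output view with rearrange_, and gemm_ then computes out = alpha * (input * weight^T) + beta * out with beta = 1.0f, so the previous separate matmul_ followed by add_ is no longer needed.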
@@ -60,7 +60,12 @@ class TensorInitializer:
# Handle real floating-point types
if mode == TensorInitializer.RANDOM:
-    return torch.rand(shape, dtype=torch_dtype, device=torch_device_str)
+    scale = kwargs.get("scale", 1.0)
+    bias = kwargs.get("bias", 0.0)
+    return (
+        torch.rand(shape, dtype=torch_dtype, device=torch_device_str) * scale
+        + bias
+    )
elif mode == TensorInitializer.ZEROS:
    return torch.zeros(shape, dtype=torch_dtype, device=torch_device_str)
elif mode == TensorInitializer.ONES:
...
@@ -268,6 +268,9 @@ target("infinirt")
add_deps("infinirt-hygon")
end
set_languages("cxx17")
if not is_plat("windows") then
add_cxflags("-fPIC")
end
set_installdir(os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini"))
add_files("src/infinirt/*.cc")
add_installfiles("include/infinirt.h", {prefixdir = "include"})
...