Unverified Commit 8d09630a authored by gongchensu, committed by GitHub

Merge branch 'demo131' into Issue/862

parents ab52dead 012df56c
#include "infinicore/ops/paged_caching.hpp"
#include "../../utils.hpp"
namespace infinicore::op {
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(PagedCaching);
PagedCaching::PagedCaching(Tensor k_cache, Tensor v_cache, const Tensor &k, const Tensor &v, const Tensor &slot_mapping) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(k_cache, v_cache, k, v, slot_mapping);
INFINICORE_GRAPH_OP_DISPATCH(k->device().getType(), k_cache, v_cache, k, v, slot_mapping);
}
void PagedCaching::execute(Tensor k_cache, Tensor v_cache, const Tensor &k, const Tensor &v, const Tensor &slot_mapping) {
INFINICORE_GRAPH_OP_RECORD_OR_RUN(PagedCaching, k_cache, v_cache, k, v, slot_mapping);
}
void paged_caching_(Tensor k_cache, Tensor v_cache, const Tensor &k, const Tensor &v, const Tensor &slot_mapping) {
PagedCaching::execute(k_cache, v_cache, k, v, slot_mapping);
}
} // namespace infinicore::op
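// Minimal usage sketch (not part of this commit): the free function either runs the
// kernel immediately or, when a graph is being recorded, appends the op to the current
// graph. The calling context and tensor contents are assumptions for illustration only.
#include "infinicore/ops/paged_caching.hpp"
void cache_new_kv(infinicore::Tensor k_cache, infinicore::Tensor v_cache,
                  const infinicore::Tensor &k, const infinicore::Tensor &v,
                  const infinicore::Tensor &slot_mapping) {
    // k/v hold freshly computed keys/values; slot_mapping tells the kernel which
    // physical cache slot each token should be written to.
    infinicore::op::paged_caching_(k_cache, v_cache, k, v, slot_mapping);
}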
#include "infinicore/ops/paged_caching.hpp"
#include "../infiniop_impl.hpp"
namespace infinicore::op::paged_caching_impl::infiniop {
INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, PagedCaching, 100);
struct PlannedMeta {
std::shared_ptr<Descriptor> descriptor;
graph::GraphTensor workspace, k_cache, v_cache, k, v, slot_mapping;
};
void *plan(Tensor k_cache, Tensor v_cache, const Tensor &k, const Tensor &v, const Tensor &slot_mapping) {
size_t key = hash_combine(k_cache, v_cache, k, v, slot_mapping);
INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
Descriptor, descriptor, PagedCaching,
key, k_cache->desc(), v_cache->desc(), k->desc(), v->desc(), slot_mapping->desc());
INFINIOP_WORKSPACE_TENSOR(workspace, PagedCaching, descriptor);
return new PlannedMeta{
descriptor,
graph::GraphTensor(workspace),
graph::GraphTensor(k_cache),
graph::GraphTensor(v_cache),
graph::GraphTensor(k),
graph::GraphTensor(v),
graph::GraphTensor(slot_mapping)};
}
void run(void *planned_meta) {
auto *p = reinterpret_cast<PlannedMeta *>(planned_meta);
INFINICORE_CHECK_ERROR(
infiniopPagedCaching(
p->descriptor->desc,
p->workspace->data(),
p->workspace->numel(),
p->k_cache->data(),
p->v_cache->data(),
p->k->data(),
p->v->data(),
p->slot_mapping->data(),
context::getStream()));
}
void cleanup(void **planned_meta_ptr) {
delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
*planned_meta_ptr = nullptr;
}
INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(PagedCaching, &plan, &run, &cleanup);
} // namespace infinicore::op::paged_caching_impl::infiniop
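// Sketch of how a recorded graph node is assumed to drive the registered
// plan/run/cleanup callbacks; GraphNode, replay and destroy are illustrative names,
// not part of the InfiniCore API.
struct GraphNode {
    void *planned_meta = nullptr;      // produced once by plan(...) at record time
    void (*run_fn)(void *) = nullptr;
    void (*cleanup_fn)(void **) = nullptr;
};
inline void replay(GraphNode &node) {
    node.run_fn(node.planned_meta);    // re-launch with the cached descriptor/workspace
}
inline void destroy(GraphNode &node) {
    node.cleanup_fn(&node.planned_meta); // frees PlannedMeta and nulls the pointer
}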
#include "infinicore/ops/per_channel_quant_i8.hpp"
#include "../../utils.hpp"
namespace infinicore::op {
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(PerChannelQuantI8);
PerChannelQuantI8::PerChannelQuantI8(const Tensor &x, Tensor x_packed, Tensor x_scale) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(x, x_packed, x_scale);
INFINICORE_GRAPH_OP_DISPATCH(x->device().getType(), x, x_packed, x_scale);
}
void PerChannelQuantI8::execute(const Tensor &x, Tensor x_packed, Tensor x_scale) {
INFINICORE_GRAPH_OP_RECORD_OR_RUN(PerChannelQuantI8, x, x_packed, x_scale);
}
void per_channel_quant_i8_(const Tensor &x, Tensor x_packed, Tensor x_scale) {
PerChannelQuantI8::execute(x, x_packed, x_scale);
}
} // namespace infinicore::op
#include "../../utils.hpp"
#include "../infiniop_impl.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/per_channel_quant_i8.hpp"
#include <infiniop.h>
namespace infinicore::op::per_channel_quant_i8_impl::infiniop {
INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, PerChannelQuantI8, 100);
struct PlannedMeta {
std::shared_ptr<Descriptor> descriptor;
graph::GraphTensor workspace, x, x_packed, x_scale;
};
void *plan(const Tensor &x, Tensor x_packed, Tensor x_scale) {
size_t seed = hash_combine(x, x_packed, x_scale);
INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
Descriptor, descriptor, PerChannelQuantI8,
seed,
x_packed->desc(), x_scale->desc(), nullptr, x->desc());
INFINIOP_WORKSPACE_TENSOR(workspace, PerChannelQuantI8, descriptor);
return new PlannedMeta{
descriptor,
graph::GraphTensor(workspace),
graph::GraphTensor(x),
graph::GraphTensor(x_packed),
graph::GraphTensor(x_scale)};
}
void run(void *planned_meta) {
auto planned = reinterpret_cast<PlannedMeta *>(planned_meta);
INFINICORE_CHECK_ERROR(infiniopPerChannelQuantI8(
planned->descriptor->desc,
planned->workspace->data(),
planned->workspace->numel(),
planned->x_packed->data(),
planned->x_scale->data(),
nullptr,
planned->x->data(),
context::getStream()));
}
void cleanup(void **planned_meta_ptr) {
delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
*planned_meta_ptr = nullptr;
}
INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(PerChannelQuantI8, &plan, &run, &cleanup);
} // namespace infinicore::op::per_channel_quant_i8_impl::infiniop
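// Reference-semantics sketch for per-channel int8 quantization, assuming per-row
// symmetric scales (scale = max(|row|)/127, x_packed = round(x/scale)). The real
// kernel's layout, rounding mode, and scale placement may differ; illustrative only.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>
void per_channel_quant_i8_ref(const std::vector<float> &x, size_t rows, size_t cols,
                              std::vector<int8_t> &x_packed, std::vector<float> &x_scale) {
    x_packed.resize(rows * cols);
    x_scale.resize(rows);
    for (size_t r = 0; r < rows; ++r) {
        float amax = 0.f;
        for (size_t c = 0; c < cols; ++c) {
            amax = std::max(amax, std::fabs(x[r * cols + c]));
        }
        float scale = amax > 0.f ? amax / 127.f : 1.f;
        x_scale[r] = scale;
        for (size_t c = 0; c < cols; ++c) {
            x_packed[r * cols + c] = static_cast<int8_t>(std::lround(x[r * cols + c] / scale));
        }
    }
}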
@@ -3,24 +3,30 @@
namespace infinicore::op {
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(Rearrange);
Rearrange::Rearrange(Tensor y, const Tensor &x) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(y, x);
INFINICORE_GRAPH_OP_DISPATCH(y->device().getType(), y, x);
}
void Rearrange::execute(Tensor y, const Tensor &x) {
auto op = std::make_shared<Rearrange>(y, x);
if (context::isGraphRecording()) {
context::addGraphOperator(op);
} else {
op->run();
}
}
Tensor rearrange(const Tensor &x) {
auto y = Tensor::empty(x->shape(), x->dtype(), x->device());
rearrange_(y, x);
return y;
}
void rearrange_(Tensor y, const Tensor &x) {
Rearrange::execute(y, x);
}
} // namespace infinicore::op
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/rearrange.hpp"
#include <infiniop.h>
#include "../infiniop_impl.hpp"
namespace infinicore::op::rearrange_impl::infiniop {
thread_local common::OpCache<size_t, infiniopRearrangeDescriptor_t> caches(
100, // capacity
[](infiniopRearrangeDescriptor_t &desc) {
if (desc != nullptr) {
INFINICORE_CHECK_ERROR(infiniopDestroyRearrangeDescriptor(desc));
desc = nullptr;
}
});
INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, Rearrange, 100);
struct PlannedMeta {
std::shared_ptr<Descriptor> descriptor;
graph::GraphTensor y, x;
};
void calculate(Tensor y, Tensor x) {
void *plan(Tensor y, const Tensor &x) {
size_t seed = hash_combine(y, x);
auto device = context::getDevice();
auto &cache = caches.getCache(device);
INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
Descriptor, descriptor, Rearrange,
seed, y->desc(),
x->desc());
auto desc_opt = cache.get(seed);
infiniopRearrangeDescriptor_t desc = nullptr;
return new PlannedMeta{
descriptor,
graph::GraphTensor(y),
graph::GraphTensor(x)};
}
if (!desc_opt) {
INFINICORE_CHECK_ERROR(infiniopCreateRearrangeDescriptor(context::getInfiniopHandle(device), &desc, y->desc(), x->desc()));
cache.put(seed, desc);
} else {
desc = *desc_opt;
}
void run(void *planned_meta) {
auto planned = reinterpret_cast<PlannedMeta *>(planned_meta);
INFINICORE_CHECK_ERROR(
infiniopRearrange(
desc,
y->data(),
x->data(),
planned->descriptor->desc,
planned->y->data(),
planned->x->data(),
context::getStream()));
}
static bool registered = []() {
Rearrange::dispatcher().registerAll(&calculate, false);
return true;
}();
void cleanup(void **planned_meta_ptr) {
delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
*planned_meta_ptr = nullptr;
}
INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(Rearrange, &plan, &run, &cleanup);
} // namespace infinicore::op::rearrange_impl::infiniop
#include "infinicore/ops/rms_norm.hpp"
#include "../../utils.hpp"
namespace infinicore::op {
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(RMSNorm);
common::OpDispatcher<RMSNorm::schema> &RMSNorm::dispatcher() {
static common::OpDispatcher<RMSNorm::schema> dispatcher_;
return dispatcher_;
};
void RMSNorm::execute(Tensor y, Tensor x, Tensor weight, float epsilon) {
RMSNorm::RMSNorm(Tensor y, const Tensor &x, const Tensor &weight, float epsilon) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(y, x, weight);
infinicore::context::setDevice(y->device());
dispatcher().lookup(y->device().getType())(y, x, weight, epsilon);
INFINICORE_GRAPH_OP_DISPATCH(y->device().getType(), y, x, weight, epsilon);
}
void RMSNorm::execute(Tensor y, const Tensor &x, const Tensor &weight, float epsilon) {
INFINICORE_GRAPH_OP_RECORD_OR_RUN(RMSNorm, y, x, weight, epsilon);
}
Tensor rms_norm(Tensor x, Tensor weight, float epsilon) {
Tensor rms_norm(const Tensor &x, const Tensor &weight, float epsilon) {
auto y = Tensor::empty(x->shape(), x->dtype(), x->device());
rms_norm_(y, x, weight, epsilon);
return y;
}
void rms_norm_(Tensor y, Tensor x, Tensor weight, float epsilon) {
void rms_norm_(Tensor y, const Tensor &x, const Tensor &weight, float epsilon) {
RMSNorm::execute(y, x, weight, epsilon);
}
} // namespace infinicore::op
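// Reference-semantics sketch of what rms_norm is assumed to compute, using the
// standard definition y = x / sqrt(mean(x^2) + epsilon) * weight over the last
// dimension; the actual kernel may differ in dtype handling.
#include <cmath>
#include <cstddef>
void rms_norm_ref(float *y, const float *x, const float *weight,
                  size_t rows, size_t dim, float epsilon) {
    for (size_t r = 0; r < rows; ++r) {
        float sum_sq = 0.f;
        for (size_t i = 0; i < dim; ++i) {
            sum_sq += x[r * dim + i] * x[r * dim + i];
        }
        float inv_rms = 1.f / std::sqrt(sum_sq / static_cast<float>(dim) + epsilon);
        for (size_t i = 0; i < dim; ++i) {
            y[r * dim + i] = x[r * dim + i] * inv_rms * weight[i];
        }
    }
}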
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/rms_norm.hpp"
#include <infiniop.h>
namespace infinicore::op::rms_norm_impl::infiniop {
#include "../infiniop_impl.hpp"
thread_local common::OpCache<size_t, infiniopRMSNormDescriptor_t> caches(
100, // capacity
[](infiniopRMSNormDescriptor_t &desc) {
if (desc != nullptr) {
INFINICORE_CHECK_ERROR(infiniopDestroyRMSNormDescriptor(desc));
desc = nullptr;
}
});
namespace infinicore::op::rms_norm_impl::infiniop {
void calculate(Tensor y, Tensor x, Tensor weight, float epsilon) {
size_t seed = hash_combine(y, x, weight, epsilon);
INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, RMSNorm, 100);
auto device = context::getDevice();
auto &cache = caches.getCache(device);
struct PlannedMeta {
std::shared_ptr<Descriptor> descriptor;
graph::GraphTensor workspace, y, x, weight;
};
auto desc_opt = cache.get(seed);
infiniopRMSNormDescriptor_t desc = nullptr;
void *plan(Tensor y, const Tensor &x, const Tensor &weight, float epsilon) {
size_t seed = hash_combine(y, x, weight, epsilon);
if (!desc_opt) {
INFINICORE_CHECK_ERROR(infiniopCreateRMSNormDescriptor(
context::getInfiniopHandle(device), &desc,
y->desc(), x->desc(), weight->desc(), epsilon));
cache.put(seed, desc);
} else {
desc = *desc_opt;
}
INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
Descriptor, descriptor, RMSNorm,
seed, y->desc(),
x->desc(),
weight->desc(),
epsilon);
INFINIOP_WORKSPACE_TENSOR(workspace, RMSNorm, descriptor);
return new PlannedMeta{
descriptor,
graph::GraphTensor(workspace),
graph::GraphTensor(y),
graph::GraphTensor(x),
graph::GraphTensor(weight)};
}
size_t workspace_size = 0;
INFINICORE_CHECK_ERROR(infiniopGetRMSNormWorkspaceSize(desc, &workspace_size));
std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
void run(void *planned_meta) {
auto planned = reinterpret_cast<PlannedMeta *>(planned_meta);
INFINICORE_CHECK_ERROR(
infiniopRMSNorm(
planned->descriptor->desc,
planned->workspace->data(),
planned->workspace->numel(),
planned->y->data(),
planned->x->data(),
planned->weight->data(),
context::getStream()));
}
INFINICORE_CHECK_ERROR(infiniopRMSNorm(
desc, workspace->data(), workspace_size,
y->data(), x->data(), weight->data(), context::getStream()));
void cleanup(void **planned_meta_ptr) {
delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
*planned_meta_ptr = nullptr;
}
static bool registered = []() {
RMSNorm::dispatcher().registerAll(&calculate, false);
return true;
}();
INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(RMSNorm, &plan, &run, &cleanup);
} // namespace infinicore::op::rms_norm_impl::infiniop
#include "infinicore/ops/rope.hpp"
#include "../../utils.hpp"
#include "infinicore/context/context.hpp"
#include <stdexcept>
namespace infinicore::op {
common::OpDispatcher<RoPE::schema> &RoPE::dispatcher() {
static common::OpDispatcher<RoPE::schema> dispatcher_;
return dispatcher_;
};
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(RoPE);
void RoPE::execute(Tensor x_out, const Tensor &x, const Tensor &pos, const Tensor &sin_table, const Tensor &cos_table, infinicore::nn::RoPE::Algo algo) {
RoPE::RoPE(Tensor x_out,
const Tensor &x,
const Tensor &pos,
const Tensor &sin_table,
const Tensor &cos_table,
infinicore::nn::RoPE::Algo algo) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(x_out, x, pos, sin_table, cos_table);
infinicore::context::setDevice(x_out->device());
auto device_type = x_out->device().getType();
auto func = dispatcher().lookup(device_type);
if (func == nullptr) {
throw std::runtime_error("No RoPE implementation found for device type: " + std::to_string(static_cast<int>(device_type)));
}
INFINICORE_GRAPH_OP_DISPATCH(x_out->device().getType(), x_out, x, pos, sin_table, cos_table, algo);
}
func(x_out, x, pos, sin_table, cos_table, algo);
void RoPE::execute(Tensor x_out,
const Tensor &x,
const Tensor &pos,
const Tensor &sin_table,
const Tensor &cos_table,
infinicore::nn::RoPE::Algo algo) {
INFINICORE_GRAPH_OP_RECORD_OR_RUN(RoPE, x_out, x, pos, sin_table, cos_table, algo);
}
void rope_(Tensor x_out, const Tensor &x, const Tensor &pos, const Tensor &sin_table, const Tensor &cos_table, infinicore::nn::RoPE::Algo algo) {
void rope_(Tensor x_out,
const Tensor &x,
const Tensor &pos,
const Tensor &sin_table,
const Tensor &cos_table,
infinicore::nn::RoPE::Algo algo) {
RoPE::execute(x_out, x, pos, sin_table, cos_table, algo);
}
Tensor rope(const Tensor &x, const Tensor &pos, const Tensor &sin_table, const Tensor &cos_table, infinicore::nn::RoPE::Algo algo) {
Shape shape = x->shape();
auto x_out = Tensor::empty(shape, x->dtype(), x->device());
Tensor rope(const Tensor &x,
const Tensor &pos,
const Tensor &sin_table,
const Tensor &cos_table,
infinicore::nn::RoPE::Algo algo) {
auto x_out = Tensor::empty(x->shape(), x->dtype(), x->device());
rope_(x_out, x, pos, sin_table, cos_table, algo);
return x_out;
}
} // namespace infinicore::op
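// Sketch of the two pairing conventions behind RoPE::Algo, assuming the usual
// definitions: GPT_J rotates adjacent element pairs (2i, 2i+1) while GPT_NEOX
// rotates pairs split across halves (i, i + d/2). Table layout and indexing
// here are illustrative, not the library's actual memory layout.
#include <cstddef>
void rope_gptj_ref(float *x, size_t d, const float *sin_t, const float *cos_t) {
    for (size_t i = 0; i < d / 2; ++i) {
        float a = x[2 * i], b = x[2 * i + 1];
        x[2 * i]     = a * cos_t[i] - b * sin_t[i];
        x[2 * i + 1] = a * sin_t[i] + b * cos_t[i];
    }
}
void rope_gptneox_ref(float *x, size_t d, const float *sin_t, const float *cos_t) {
    for (size_t i = 0; i < d / 2; ++i) {
        float a = x[i], b = x[i + d / 2];
        x[i]         = a * cos_t[i] - b * sin_t[i];
        x[i + d / 2] = a * sin_t[i] + b * cos_t[i];
    }
}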
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/rope.hpp"
#include <infiniop.h>
#include "../infiniop_impl.hpp"
namespace infinicore::op::rope_impl::infiniop {
thread_local common::OpCache<size_t, infiniopRoPEDescriptor_t> caches(
100, // capacity
[](infiniopRoPEDescriptor_t &desc) {
if (desc != nullptr) {
INFINICORE_CHECK_ERROR(infiniopDestroyRoPEDescriptor(desc));
desc = nullptr;
}
});
INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, RoPE, 100);
struct PlannedMeta {
std::shared_ptr<Descriptor> descriptor;
graph::GraphTensor workspace;
graph::GraphTensor x_out;
graph::GraphTensor x;
graph::GraphTensor pos;
graph::GraphTensor sin;
graph::GraphTensor cos;
};
void calculate(Tensor x_out, const Tensor &x, const Tensor &pos, const Tensor &sin_cache, const Tensor &cos_cache, infinicore::nn::RoPE::Algo algo) {
// Convert infinicore::nn::RoPE::Algo to infiniopRoPEAlgo_t
infiniopRoPEAlgo_t infiniop_algo;
static infiniopRoPEAlgo_t to_infiniop_algo(infinicore::nn::RoPE::Algo algo) {
switch (algo) {
case infinicore::nn::RoPE::Algo::GPT_J:
infiniop_algo = INFINIOP_ROPE_ALGO_GPT_J;
break;
return INFINIOP_ROPE_ALGO_GPT_J;
case infinicore::nn::RoPE::Algo::GPT_NEOX:
infiniop_algo = INFINIOP_ROPE_ALGO_GPT_NEOX;
break;
return INFINIOP_ROPE_ALGO_GPT_NEOX;
default:
throw std::runtime_error("Unsupported RoPE algorithm: " + std::to_string(static_cast<int>(algo)));
throw std::runtime_error("Unsupported RoPE algorithm");
}
}
// Create hash key for descriptor caching
size_t key = hash_combine(x_out, x, pos, sin_cache, cos_cache);
hash_combine(key, std::hash<int>()(static_cast<int>(infiniop_algo)));
void *plan(Tensor x_out,
const Tensor &x,
const Tensor &pos,
const Tensor &sin,
const Tensor &cos,
infinicore::nn::RoPE::Algo algo) {
auto infiniop_algo = to_infiniop_algo(algo);
size_t key = hash_combine(x_out, x, pos, sin, cos, static_cast<int>(infiniop_algo));
auto device = context::getDevice();
auto &cache = caches.getCache(device);
INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
Descriptor, descriptor, RoPE, key, x_out->desc(),
x->desc(),
pos->desc(),
sin->desc(),
cos->desc(),
infiniop_algo);
auto desc_opt = cache.get(key);
infiniopRoPEDescriptor_t desc = nullptr;
INFINIOP_WORKSPACE_TENSOR(workspace, RoPE, descriptor);
return new PlannedMeta{
descriptor,
graph::GraphTensor(workspace),
graph::GraphTensor(x_out),
graph::GraphTensor(x),
graph::GraphTensor(pos),
graph::GraphTensor(sin),
graph::GraphTensor(cos)};
}
if (!desc_opt) {
INFINICORE_CHECK_ERROR(infiniopCreateRoPEDescriptor(
context::getInfiniopHandle(device), &desc,
x_out->desc(), x->desc(),
pos->desc(), sin_cache->desc(), cos_cache->desc(),
infiniop_algo));
cache.put(key, desc);
} else {
desc = *desc_opt;
}
void run(void *planned_meta) {
auto *p = reinterpret_cast<PlannedMeta *>(planned_meta);
size_t workspace_size = 0;
INFINICORE_CHECK_ERROR(infiniopGetRoPEWorkspaceSize(desc, &workspace_size));
std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
INFINICORE_CHECK_ERROR(
infiniopRoPE(
p->descriptor->desc,
p->workspace->data(),
p->workspace->numel(),
p->x_out->data(),
p->x->data(),
p->pos->data(),
p->sin->data(),
p->cos->data(),
context::getStream()));
}
// InfiniOP reads from x and writes to x_out (handles copying internally)
INFINICORE_CHECK_ERROR(infiniopRoPE(
desc, workspace->data(), workspace_size,
x_out->data(), x->data(), pos->data(),
sin_cache->data(), cos_cache->data(), context::getStream()));
void cleanup(void **planned_meta_ptr) {
delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
*planned_meta_ptr = nullptr;
}
static bool registered = []() {
RoPE::dispatcher().registerAll(&calculate, false);
return true;
}();
INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(RoPE, &plan, &run, &cleanup);
} // namespace infinicore::op::rope_impl::infiniop
#include "infinicore/ops/scaled_mm_i8.hpp"
#include "../../utils.hpp"
namespace infinicore::op {
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(I8Gemm);
I8Gemm::I8Gemm(Tensor c, const Tensor &a_p, const Tensor &a_s, const Tensor &b_p, const Tensor &b_s, std::optional<Tensor> bias) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(c, a_p, a_s, b_p, b_s);
INFINICORE_GRAPH_OP_DISPATCH(c->device().getType(), c, a_p, a_s, b_p, b_s, bias);
}
void I8Gemm::execute(Tensor c, const Tensor &a_p, const Tensor &a_s, const Tensor &b_p, const Tensor &b_s, std::optional<Tensor> bias) {
INFINICORE_GRAPH_OP_RECORD_OR_RUN(I8Gemm, c, a_p, a_s, b_p, b_s, bias);
}
void scaled_mm_i8_(Tensor c, const Tensor &a_p, const Tensor &a_s, const Tensor &b_p, const Tensor &b_s, std::optional<Tensor> bias) {
I8Gemm::execute(c, a_p, a_s, b_p, b_s, bias);
}
} // namespace infinicore::op
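// Reference-semantics sketch for scaled_mm_i8_, assuming symmetric per-channel
// scales so that c = (a_p * a_s) @ (b_p * b_s)^T + bias: accumulate in int32 and
// dequantize with the two scale vectors. Operand layouts here are assumptions.
#include <cstddef>
#include <cstdint>
void scaled_mm_i8_ref(float *c, const int8_t *a_p, const float *a_s,
                      const int8_t *b_p, const float *b_s, const float *bias,
                      size_t m, size_t n, size_t k) {
    for (size_t i = 0; i < m; ++i) {
        for (size_t j = 0; j < n; ++j) {
            int32_t acc = 0;
            for (size_t kk = 0; kk < k; ++kk) {
                acc += static_cast<int32_t>(a_p[i * k + kk]) * static_cast<int32_t>(b_p[j * k + kk]);
            }
            float val = static_cast<float>(acc) * a_s[i] * b_s[j];
            c[i * n + j] = bias ? val + bias[j] : val;
        }
    }
}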
#include "../../utils.hpp"
#include "../infiniop_impl.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/scaled_mm_i8.hpp"
#include <infiniop.h>
namespace infinicore::op::scaled_mm_i8_impl::infiniop {
INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, I8Gemm, 100);
struct PlannedMeta {
std::shared_ptr<Descriptor> descriptor;
graph::GraphTensor workspace, c, a_p, a_s, b_p, b_s;
std::optional<graph::GraphTensor> bias;
};
void *plan(Tensor c, const Tensor &a_p, const Tensor &a_s, const Tensor &b_p, const Tensor &b_s, std::optional<Tensor> bias) {
size_t seed = hash_combine(c, a_p, a_s, b_p, b_s);
INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
Descriptor, descriptor, I8Gemm,
seed,
c->desc(), bias.has_value() ? bias.value()->desc() : nullptr,
a_p->desc(), a_s->desc(), b_p->desc(), b_s->desc());
INFINIOP_WORKSPACE_TENSOR(workspace, I8Gemm, descriptor);
return new PlannedMeta{
descriptor,
graph::GraphTensor(workspace),
graph::GraphTensor(c),
graph::GraphTensor(a_p),
graph::GraphTensor(a_s),
graph::GraphTensor(b_p),
graph::GraphTensor(b_s),
bias ? std::optional<graph::GraphTensor>(graph::GraphTensor(*bias)) : std::nullopt};
}
void run(void *planned_meta) {
auto planned = reinterpret_cast<PlannedMeta *>(planned_meta);
INFINICORE_CHECK_ERROR(infiniopI8Gemm(
planned->descriptor->desc,
planned->workspace->data(),
planned->workspace->numel(),
planned->c->data(),
planned->bias.has_value() ? planned->bias.value()->data() : nullptr,
planned->a_p->data(),
planned->a_s->data(),
planned->b_p->data(),
planned->b_s->data(),
context::getStream()));
}
void cleanup(void **planned_meta_ptr) {
delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
*planned_meta_ptr = nullptr;
}
INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(I8Gemm, &plan, &run, &cleanup);
} // namespace infinicore::op::scaled_mm_i8_impl::infiniop
#include "infinicore/ops/silu_and_mul.hpp"
#include "../../utils.hpp"
namespace infinicore::op {
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(SiluAndMul);
SiluAndMul::SiluAndMul(Tensor out, const Tensor &x) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, x);
INFINICORE_GRAPH_OP_DISPATCH(out->device().getType(), out, x);
}
void SiluAndMul::execute(Tensor out, const Tensor &x) {
INFINICORE_GRAPH_OP_RECORD_OR_RUN(SiluAndMul, out, x);
}
Tensor silu_and_mul(const Tensor &x) {
Shape shape = x->shape();
size_t ndim = x->ndim();
if (shape[ndim - 1] % 2 != 0) {
throw std::runtime_error("SiluAndMul input last dim must be even.");
}
shape[ndim - 1] /= 2;
auto out = Tensor::empty(shape, x->dtype(), x->device());
silu_and_mul_(out, x);
return out;
}
void silu_and_mul_(Tensor out, const Tensor &x) {
SiluAndMul::execute(out, x);
}
} // namespace infinicore::op
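// Reference-semantics sketch for silu_and_mul, consistent with the shape logic
// above (output last dim = input last dim / 2) and assuming the common convention
// out = silu(x[..., :d]) * x[..., d:]; the ordering of the two halves is an
// assumption about the kernel.
#include <cmath>
#include <cstddef>
void silu_and_mul_ref(float *out, const float *x, size_t rows, size_t d) {
    for (size_t r = 0; r < rows; ++r) {
        for (size_t i = 0; i < d; ++i) {
            float gate = x[r * 2 * d + i];
            float up   = x[r * 2 * d + d + i];
            out[r * d + i] = gate / (1.f + std::exp(-gate)) * up; // silu(gate) * up
        }
    }
}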
#include "../infiniop_impl.hpp"
#include "infinicore/ops/silu_and_mul.hpp"
namespace infinicore::op::silu_and_mul_impl::infiniop {
INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, SiluAndMul, 100);
struct PlannedMeta {
std::shared_ptr<Descriptor> descriptor;
graph::GraphTensor workspace, output, input;
};
void *plan(Tensor output, const Tensor &input) {
size_t seed = hash_combine(output, input);
INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
Descriptor, descriptor, SiluAndMul,
seed, output->desc(), input->desc());
INFINIOP_WORKSPACE_TENSOR(workspace, SiluAndMul, descriptor);
auto planned = new PlannedMeta{
descriptor,
graph::GraphTensor(workspace),
graph::GraphTensor(output),
graph::GraphTensor(input)};
return planned;
}
void run(void *planned_meta) {
auto planned = reinterpret_cast<PlannedMeta *>(planned_meta);
INFINICORE_CHECK_ERROR(infiniopSiluAndMul(
planned->descriptor->desc,
planned->workspace->data(),
planned->workspace->numel(),
planned->output->data(),
planned->input->data(),
context::getStream()));
}
void cleanup(void **planned_meta_ptr) {
delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
*planned_meta_ptr = nullptr;
}
INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(SiluAndMul, &plan, &run, &cleanup);
} // namespace infinicore::op::silu_and_mul_impl::infiniop
#include "infinicore/ops/swiglu.hpp"
#include "../../utils.hpp"
#include <stdexcept>
namespace infinicore::op {
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(SwiGLU);
common::OpDispatcher<SwiGLU::schema> &SwiGLU::dispatcher() {
static common::OpDispatcher<SwiGLU::schema> dispatcher_;
return dispatcher_;
};
void SwiGLU::execute(Tensor c, Tensor a, Tensor b) {
SwiGLU::SwiGLU(Tensor c, const Tensor &a, const Tensor &b) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(c, a, b);
infinicore::context::setDevice(c->device());
auto device_type = c->device().getType();
auto func = dispatcher().lookup(device_type);
if (func == nullptr) {
throw std::runtime_error("No SwiGLU implementation found for device type: " + std::to_string(static_cast<int>(device_type)));
}
INFINICORE_GRAPH_OP_DISPATCH(c->device().getType(), c, a, b);
}
func(c, a, b);
void SwiGLU::execute(Tensor c, const Tensor &a, const Tensor &b) {
INFINICORE_GRAPH_OP_RECORD_OR_RUN(SwiGLU, c, a, b);
}
Tensor swiglu(Tensor a, Tensor b) {
Shape shape = a->shape();
auto c = Tensor::empty(shape, a->dtype(), a->device());
Tensor swiglu(const Tensor &a, const Tensor &b) {
auto c = Tensor::empty(a->shape(), a->dtype(), a->device());
swiglu_(c, a, b);
return c;
}
void swiglu_(Tensor c, Tensor a, Tensor b) {
void swiglu_(Tensor c, const Tensor &a, const Tensor &b) {
SwiGLU::execute(c, a, b);
}
} // namespace infinicore::op
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/swiglu.hpp"
#include <infiniop.h>
#include "../infiniop_impl.hpp"
namespace infinicore::op::swiglu_impl::infiniop {
thread_local common::OpCache<size_t, infiniopSwiGLUDescriptor_t> caches(
100, // capacity
[](infiniopSwiGLUDescriptor_t &desc) {
if (desc != nullptr) {
INFINICORE_CHECK_ERROR(infiniopDestroySwiGLUDescriptor(desc));
desc = nullptr;
}
});
void calculate(Tensor c, Tensor a, Tensor b) {
size_t seed = hash_combine(c, b, a);
auto device = context::getDevice();
auto &cache = caches.getCache(device);
auto desc_opt = cache.get(seed);
infiniopSwiGLUDescriptor_t desc = nullptr;
if (!desc_opt) {
INFINICORE_CHECK_ERROR(infiniopCreateSwiGLUDescriptor(
context::getInfiniopHandle(device), &desc,
c->desc(), a->desc(), b->desc()));
cache.put(seed, desc);
} else {
desc = *desc_opt;
}
size_t workspace_size = 0;
INFINICORE_CHECK_ERROR(infiniopGetSwiGLUWorkspaceSize(desc, &workspace_size));
std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
INFINICORE_CHECK_ERROR(infiniopSwiGLU(
desc, workspace->data(), workspace_size,
c->data(), a->data(), b->data(), context::getStream()));
INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, SwiGLU, 100);
struct PlannedMeta {
std::shared_ptr<Descriptor> descriptor;
graph::GraphTensor workspace;
graph::GraphTensor c;
graph::GraphTensor a;
graph::GraphTensor b;
};
void *plan(Tensor c, const Tensor &a, const Tensor &b) {
size_t key = hash_combine(c, a, b);
INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
Descriptor, descriptor, SwiGLU,
key, c->desc(), a->desc(), b->desc());
INFINIOP_WORKSPACE_TENSOR(workspace, SwiGLU, descriptor);
return new PlannedMeta{
descriptor,
graph::GraphTensor(workspace),
graph::GraphTensor(c),
graph::GraphTensor(a),
graph::GraphTensor(b)};
}
void run(void *planned_meta) {
auto *p = reinterpret_cast<PlannedMeta *>(planned_meta);
INFINICORE_CHECK_ERROR(
infiniopSwiGLU(
p->descriptor->desc,
p->workspace->data(),
p->workspace->numel(),
p->c->data(),
p->a->data(),
p->b->data(),
context::getStream()));
}
void cleanup(void **planned_meta_ptr) {
delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
*planned_meta_ptr = nullptr;
}
static bool registered = []() {
SwiGLU::dispatcher().registerAll(&calculate, false);
return true;
}();
INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(SwiGLU, &plan, &run, &cleanup);
} // namespace infinicore::op::swiglu_impl::infiniop
@@ -24,6 +24,11 @@ inline void bind(py::module &m) {
// Synchronization
m.def("sync_stream", &syncStream, "Synchronize the current stream");
m.def("sync_device", &syncDevice, "Synchronize the current device");
// Graph
m.def("is_graph_recording", &isGraphRecording, "Check if graph recording is turned on");
m.def("start_graph_recording", &startGraphRecording, "Start graph recording");
m.def("stop_graph_recording", &stopGraphRecording, "Stop graph recording and return the graph");
}
} // namespace infinicore::context
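// Usage sketch for the graph API exposed above, assuming the C++ functions mirror
// these bindings and that stopGraphRecording() returns a runnable Graph handle
// (as the binding text and Graph::run() below suggest).
#include "infinicore/context/context.hpp"
#include "infinicore/ops/rms_norm.hpp"
void record_and_replay(infinicore::Tensor y, const infinicore::Tensor &x,
                       const infinicore::Tensor &w, float eps) {
    namespace ctx = infinicore::context;
    ctx::startGraphRecording();
    infinicore::op::rms_norm_(y, x, w, eps); // recorded instead of executed immediately
    auto graph = ctx::stopGraphRecording();  // assumed to return the captured graph
    graph->run();                            // replay the planned operators
}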
@@ -22,6 +22,7 @@ inline void bind(py::module &m) {
.value("QY", Device::Type::QY)
.value("KUNLUN", Device::Type::KUNLUN)
.value("HYGON", Device::Type::HYGON)
.value("ALI", Device::Type::ALI)
.value("COUNT", Device::Type::COUNT);
device
......
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include "infinicore.hpp"
namespace py = pybind11;
namespace infinicore::graph {
inline void bind(py::module_ &m) {
py::class_<infinicore::graph::Graph,
std::shared_ptr<infinicore::graph::Graph>>(m, "Graph")
.def(py::init<>()) // allow construction
.def("run", &infinicore::graph::Graph::run);
}
} // namespace infinicore::graph
@@ -6,6 +6,7 @@
#include "device.hpp"
#include "device_event.hpp"
#include "dtype.hpp"
#include "graph.hpp"
#include "ops.hpp"
#include "tensor.hpp"
@@ -18,6 +19,7 @@ PYBIND11_MODULE(_infinicore, m) {
dtype::bind(m);
ops::bind(m);
tensor::bind(m);
graph::bind(m);
}
} // namespace infinicore