Unverified commit 8d09630a, authored by gongchensu and committed by GitHub

Merge branch 'demo131' into Issue/862

parents ab52dead 012df56c
#include "../../utils.hpp"
#include "../infiniop_impl.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/dequantize_awq.hpp"
#include <infiniop.h>
namespace infinicore::op::dequantize_awq_impl::infiniop {
INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, DequantizeAWQ, 100);
struct PlannedMeta {
std::shared_ptr<Descriptor> descriptor;
graph::GraphTensor workspace, x, x_packed, x_scale, x_zeros;
};
void *plan(Tensor x, const Tensor &x_packed, const Tensor &x_scale, const Tensor &x_zeros) {
size_t seed = hash_combine(x, x_packed, x_scale, x_zeros);
INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
Descriptor, descriptor, DequantizeAWQ,
seed,
x->desc(), x_packed->desc(), x_scale->desc(), x_zeros->desc());
INFINIOP_WORKSPACE_TENSOR(workspace, DequantizeAWQ, descriptor);
return new PlannedMeta{
descriptor,
graph::GraphTensor(workspace),
graph::GraphTensor(x),
graph::GraphTensor(x_packed),
graph::GraphTensor(x_scale),
graph::GraphTensor(x_zeros)};
}
void run(void *planned_meta) {
auto planned = reinterpret_cast<PlannedMeta *>(planned_meta);
INFINICORE_CHECK_ERROR(infiniopDequantizeAWQ(
planned->descriptor->desc,
planned->workspace->data(),
planned->workspace->numel(),
planned->x->data(),
planned->x_packed->data(),
planned->x_scale->data(),
planned->x_zeros->data(),
context::getStream()));
}
void cleanup(void **planned_meta_ptr) {
delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
*planned_meta_ptr = nullptr;
}
INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(DequantizeAWQ, &plan, &run, &cleanup);
} // namespace infinicore::op::dequantize_awq_impl::infiniop
#include "infinicore/ops/distributed/allreduce.hpp"
#include "../../utils.hpp"
namespace infinicore::op::distributed {
struct PlannedMeta {
graph::GraphTensor output, input;
infinicclReduceOp_t op;
infinicclComm_t communicator;
};
AllReduce::AllReduce(Tensor output, const Tensor &input, infinicclReduceOp_t op, infinicclComm_t communicator) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(output, input);
INFINICORE_ASSERT(output->is_contiguous() && input->is_contiguous());
INFINICORE_ASSERT(output->numel() == input->numel());
planned_meta_ = new PlannedMeta{graph::GraphTensor(output), graph::GraphTensor(input), op, communicator};
}
AllReduce::~AllReduce() {
if (planned_meta_) {
PlannedMeta *meta = reinterpret_cast<PlannedMeta *>(planned_meta_);
delete meta;
}
}
void AllReduce::run() const {
PlannedMeta *meta = reinterpret_cast<PlannedMeta *>(planned_meta_);
INFINICORE_CHECK_ERROR(infinicclAllReduce(meta->input->data(),
meta->output->data(),
meta->input->numel(),
static_cast<infiniDtype_t>(static_cast<int>(meta->input->dtype())),
meta->op,
meta->communicator,
infinicore::context::getStream()));
}
void AllReduce::execute(Tensor output, const Tensor &input, infinicclReduceOp_t op, infinicclComm_t communicator) {
INFINICORE_GRAPH_OP_RECORD_OR_RUN(AllReduce, output, input, op, communicator);
}
Tensor allreduce(const Tensor &input, infinicclReduceOp_t op, infinicclComm_t communicator) {
auto output = Tensor::empty(input->shape(), input->dtype(), input->device());
allreduce_(output, input, op, communicator);
return output;
}
void allreduce_(Tensor output, const Tensor &input, infinicclReduceOp_t op, infinicclComm_t communicator) {
AllReduce::execute(output, input, op, communicator);
}
} // namespace infinicore::op::distributed
#include "infinicore/ops/embedding.hpp"
#include "infinicore/context/context.hpp"
#include <cstring>
#include "../../utils.hpp"
namespace infinicore::op {
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(Embedding);
Embedding::Embedding(Tensor out, const Tensor &input, const Tensor &weight) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, input, weight);
INFINICORE_GRAPH_OP_DISPATCH(out->device().getType(), out, input, weight);
}
void Embedding::execute(Tensor out, const Tensor &input, const Tensor &weight) {
INFINICORE_GRAPH_OP_RECORD_OR_RUN(Embedding, out, input, weight);
}
Tensor embedding(const Tensor &input, // LongTensor of arbitrary shape containing the indices to extract
const Tensor &weight // Embedding matrix of floating point type with shape (V, embedding_dim), where V = maximum index + 1
) {
auto input_shape = input->shape();
auto weight_shape = weight->shape();
// auto vocab_size = weight_shape[0];
auto embedding_dim = weight_shape[1];
// Assign memory to out variables
@@ -21,69 +30,8 @@
return inputs_embeds;
}
void embedding_(Tensor out, const Tensor &input, const Tensor &weight) {
Embedding::execute(out, input, weight);
}
} // namespace infinicore::op
#include "../infiniop_impl.hpp"
#include "infinicore/ops/embedding.hpp"
namespace infinicore::op::embedding_impl::infiniop {
INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, Embedding, 100);
struct PlannedMeta {
std::shared_ptr<Descriptor> descriptor;
graph::GraphTensor out, input, weight;
};
void *plan(Tensor out, const Tensor &input, const Tensor &weight) {
size_t seed = hash_combine(out, input, weight);
INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
Descriptor, descriptor, Embedding,
seed, out->desc(), input->desc(), weight->desc());
auto planned = new PlannedMeta{
descriptor,
graph::GraphTensor(out),
graph::GraphTensor(input),
graph::GraphTensor(weight)};
return planned;
}
void run(void *planned_meta) {
auto planned = reinterpret_cast<PlannedMeta *>(planned_meta);
INFINICORE_CHECK_ERROR(infiniopEmbedding(
planned->descriptor->desc,
planned->out->data(), planned->input->data(), planned->weight->data(), context::getStream()));
}
void cleanup(void **planned_meta_ptr) {
delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
*planned_meta_ptr = nullptr;
}
INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(Embedding, &plan, &run, &cleanup);
} // namespace infinicore::op::embedding_impl::infiniop
#include "infinicore/ops/flash_attention.hpp"
#include "../../utils.hpp"
namespace infinicore::op {
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(FlashAttention);
FlashAttention::FlashAttention(Tensor out, const Tensor &q, const Tensor &k, const Tensor &v, const Tensor &total_kv_len, float scale, bool is_causal) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, q, k, v);
INFINICORE_GRAPH_OP_DISPATCH(out->device().getType(),
out, q, k, v, total_kv_len, scale, is_causal);
}
void FlashAttention::execute(Tensor out, const Tensor &q, const Tensor &k, const Tensor &v, const Tensor &total_kv_len, float scale, bool is_causal) {
INFINICORE_GRAPH_OP_RECORD_OR_RUN(FlashAttention, out, q, k, v, total_kv_len, scale, is_causal);
}
Tensor flash_attention(const Tensor &q, const Tensor &k, const Tensor &v, const Tensor &total_kv_len, float scale, bool is_causal) {
Shape shape = q->shape();
int idx = shape.size() - 1;
shape[idx] = v->shape()[idx];
auto out = Tensor::empty(shape, q->dtype(), q->device());
flash_attention_(out, q, k, v, total_kv_len, scale, is_causal);
return out;
}
void flash_attention_(Tensor out, const Tensor &q, const Tensor &k, const Tensor &v, const Tensor &total_kv_len, float scale, bool is_causal) {
FlashAttention::execute(out, q, k, v, total_kv_len, scale, is_causal);
}
} // namespace infinicore::op
#include "../../utils.hpp"
#include "../infiniop_impl.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/flash_attention.hpp"
#include <infiniop.h>
namespace infinicore::op::flash_attention_impl::infiniop {
INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, FlashAttention, 100);
struct PlannedMeta {
std::shared_ptr<Descriptor> descriptor;
graph::GraphTensor workspace, out, q, k, v, total_kv_len;
float scale;
bool is_causal;
};
void *plan(Tensor out, const Tensor &q, const Tensor &k, const Tensor &v, const Tensor &total_kv_len, float scale, bool is_causal) {
size_t seed = hash_combine(out, q, k, v, total_kv_len, scale, is_causal);
INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
Descriptor, descriptor, FlashAttention,
seed, out->desc(), q->desc(), k->desc(), v->desc(), total_kv_len->desc(), scale, is_causal);
INFINIOP_WORKSPACE_TENSOR(workspace, FlashAttention, descriptor);
auto planned = new PlannedMeta{
descriptor,
graph::GraphTensor(workspace),
graph::GraphTensor(out),
graph::GraphTensor(q),
graph::GraphTensor(k),
graph::GraphTensor(v),
graph::GraphTensor(total_kv_len), scale, is_causal};
return planned;
}
void run(void *planned_meta) {
auto planned = reinterpret_cast<PlannedMeta *>(planned_meta);
INFINICORE_CHECK_ERROR(infiniopFlashAttention(
planned->descriptor->desc, planned->workspace->data(), planned->workspace->numel(),
planned->out->data(), planned->q->data(), planned->k->data(), planned->v->data(), planned->total_kv_len->data(), context::getStream()));
}
void cleanup(void **planned_meta_ptr) {
delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
*planned_meta_ptr = nullptr;
}
INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(FlashAttention, &plan, &run, &cleanup);
} // namespace infinicore::op::flash_attention_impl::infiniop
@@ -3,19 +3,18 @@
#include "../../utils.hpp"
namespace infinicore::op {
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(Gemm);
Gemm::Gemm(Tensor c, const Tensor &a, const Tensor &b, float alpha, float beta) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(c, a, b);
INFINICORE_GRAPH_OP_DISPATCH(c->device().getType(), c, a, b, alpha, beta);
}
void Gemm::execute(Tensor c, const Tensor &a, const Tensor &b, float alpha, float beta) {
INFINICORE_GRAPH_OP_RECORD_OR_RUN(Gemm, c, a, b, alpha, beta);
}
Tensor gemm(const Tensor &a, const Tensor &b, float alpha, float beta) {
Shape shape = a->shape();
Size size = a->ndim();
shape[size - 1] = b->size(size - 1);
@@ -24,7 +23,7 @@
return c;
}
void gemm_(Tensor c, const Tensor &a, const Tensor &b, float alpha, float beta) {
Gemm::execute(c, a, b, alpha, beta);
}
} // namespace infinicore::op
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "../infiniop_impl.hpp"
#include "infinicore/ops/gemm.hpp"
#include <infiniop.h>
namespace infinicore::op::gemm_impl::infiniop {
INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, Gemm, 100);
struct PlannedMeta {
std::shared_ptr<Descriptor> descriptor;
graph::GraphTensor workspace, c, a, b;
float alpha, beta;
};
void *plan(Tensor c, const Tensor &a, const Tensor &b, float alpha, float beta) {
size_t seed = hash_combine(c, a, b);
INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
Descriptor, descriptor, Gemm,
seed, c->desc(), a->desc(), b->desc());
INFINIOP_WORKSPACE_TENSOR(workspace, Gemm, descriptor);
auto planned = new PlannedMeta{
descriptor,
graph::GraphTensor(workspace),
graph::GraphTensor(c),
graph::GraphTensor(a),
graph::GraphTensor(b),
alpha, beta};
return planned;
}
void run(void *planned_meta) {
auto planned = reinterpret_cast<PlannedMeta *>(planned_meta);
INFINICORE_CHECK_ERROR(infiniopGemm(
planned->descriptor->desc, planned->workspace->data(), planned->workspace->numel(),
planned->c->data(), planned->a->data(), planned->b->data(), planned->alpha, planned->beta, context::getStream()));
}
void cleanup(void **planned_meta_ptr) {
delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
*planned_meta_ptr = nullptr;
}
INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(Gemm, &plan, &run, &cleanup);
} // namespace infinicore::op::gemm_impl::infiniop
#pragma once
#include "../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include <infiniop.h>
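// INFINIOP_CACHABLE_DESCRIPTOR(__DESC_TYPE__, __OP_NAME__, __SIZE__) defines a move-only
// RAII wrapper around infiniop##__OP_NAME__##Descriptor_t (the descriptor is destroyed when
// the wrapper is) and a thread_local descriptor cache `caches` of capacity __SIZE__ holding
// shared descriptors, looked up per device via getCache().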
#define INFINIOP_CACHABLE_DESCRIPTOR(__DESC_TYPE__, __OP_NAME__, __SIZE__) \
struct __DESC_TYPE__ { \
infiniop##__OP_NAME__##Descriptor_t desc = nullptr; \
\
explicit __DESC_TYPE__(infiniop##__OP_NAME__##Descriptor_t d) \
: desc(d) {} \
\
/* non-copyable */ \
__DESC_TYPE__(const __DESC_TYPE__ &) = delete; \
__DESC_TYPE__ &operator=(const __DESC_TYPE__ &) = delete; \
\
/* movable */ \
__DESC_TYPE__(__DESC_TYPE__ &&other) noexcept \
: desc(other.desc) { \
other.desc = nullptr; \
} \
\
__DESC_TYPE__ &operator=(__DESC_TYPE__ &&other) noexcept { \
if (this != &other) { \
if (desc != nullptr) { \
infiniopDestroy##__OP_NAME__##Descriptor(desc); \
} \
desc = other.desc; \
other.desc = nullptr; \
} \
return *this; \
} \
\
~__DESC_TYPE__() { \
if (desc != nullptr) { \
infiniopDestroy##__OP_NAME__##Descriptor(desc); \
} \
} \
}; \
\
thread_local common::OpCache<size_t, std::shared_ptr<__DESC_TYPE__>> \
caches( \
__SIZE__, \
[](std::shared_ptr<__DESC_TYPE__> &desc) { \
desc = nullptr; \
});
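// INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE looks up the shared descriptor keyed by
// __HASH_KEY__ in the current device's cache; on a miss it creates one with
// infiniopCreate##__INFINIOP_NAME__##Descriptor(handle, &desc, __VA_ARGS__) and caches it.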
#define INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(__DESC_TYPE__, __DESC_NAME__, __INFINIOP_NAME__, __HASH_KEY__, ...) \
std::shared_ptr<__DESC_TYPE__> __DESC_NAME__; \
{ \
auto device__ = context::getDevice(); \
auto &cache__ = caches.getCache(device__); \
__DESC_NAME__ = cache__.get(__HASH_KEY__).value_or(nullptr); \
if (!__DESC_NAME__) { \
__DESC_NAME__ = std::make_shared<__DESC_TYPE__>(nullptr); \
INFINICORE_CHECK_ERROR(infiniopCreate##__INFINIOP_NAME__##Descriptor( \
context::getInfiniopHandle(device__), \
&__DESC_NAME__->desc, \
__VA_ARGS__)); \
cache__.put(__HASH_KEY__, __DESC_NAME__); \
} \
}
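// INFINIOP_WORKSPACE_TENSOR queries the operator's workspace size for __DESC_NAME__ and
// allocates a U8 tensor __TENSOR_NAME__ of that many bytes on the current device, so its
// numel() equals the workspace size in bytes.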
#define INFINIOP_WORKSPACE_TENSOR(__TENSOR_NAME__, __INFINIOP_NAME__, __DESC_NAME__) \
Tensor __TENSOR_NAME__; \
{ \
auto device__ = context::getDevice(); \
size_t workspace_size = 0; \
INFINICORE_CHECK_ERROR(infiniopGet##__INFINIOP_NAME__##WorkspaceSize(__DESC_NAME__->desc, &workspace_size)); \
__TENSOR_NAME__ = Tensor::empty({workspace_size}, DataType::U8, device__); \
}
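// A minimal sketch of how the three macros above compose inside a backend, using a
// hypothetical "Foo" operator (the real Gemm/Mul/Embedding backends in this change follow
// the same shape):
//
//   namespace infinicore::op::foo_impl::infiniop {
//   INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, Foo, 100);
//   void *plan(Tensor out, const Tensor &in) {
//       size_t seed = hash_combine(out, in);
//       INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
//           Descriptor, descriptor, Foo,
//           seed, out->desc(), in->desc());
//       INFINIOP_WORKSPACE_TENSOR(workspace, Foo, descriptor);
//       // wrap descriptor + graph::GraphTensor handles in a PlannedMeta and return it
//       return /* new PlannedMeta{...} */ nullptr;
//   }
//   } // namespace infinicore::op::foo_impl::infiniop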
#include "infinicore/ops/kv_caching.hpp"
#include "../../utils.hpp"
namespace infinicore::op {
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(KVCaching);
KVCaching::KVCaching(Tensor k_cache,
Tensor v_cache,
const Tensor &k,
const Tensor &v,
const Tensor &past_kv_lengths) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(k_cache, v_cache, k, v, past_kv_lengths);
INFINICORE_GRAPH_OP_DISPATCH(k_cache->device().getType(),
k_cache,
v_cache,
k,
v,
past_kv_lengths);
}
void KVCaching::execute(Tensor k_cache,
Tensor v_cache,
const Tensor &k,
const Tensor &v,
const Tensor &past_kv_lengths) {
INFINICORE_GRAPH_OP_RECORD_OR_RUN(KVCaching,
k_cache,
v_cache,
k,
v,
past_kv_lengths);
}
void kv_caching_(Tensor k_cache,
Tensor v_cache,
const Tensor &k,
const Tensor &v,
const Tensor &past_kv_lengths) {
KVCaching::execute(k_cache, v_cache, k, v, past_kv_lengths);
}
} // namespace infinicore::op
#include "../infiniop_impl.hpp"
#include "infinicore/ops/kv_caching.hpp"
namespace infinicore::op::kv_caching_impl::infiniop {
INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, KVCaching, 100);
struct PlannedMeta {
std::shared_ptr<Descriptor> descriptor;
graph::GraphTensor workspace, k_cache, v_cache, k, v, past_kv_lengths;
};
void *plan(Tensor k_cache,
Tensor v_cache,
const Tensor &k,
const Tensor &v,
const Tensor &past_kv_lengths) {
size_t seed = hash_combine(k_cache, v_cache, k, v, past_kv_lengths);
INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
Descriptor, descriptor, KVCaching,
seed, k_cache->desc(), v_cache->desc(),
k->desc(), v->desc(), past_kv_lengths->desc());
INFINIOP_WORKSPACE_TENSOR(workspace, KVCaching, descriptor);
auto planned = new PlannedMeta{
descriptor,
graph::GraphTensor(workspace),
graph::GraphTensor(k_cache),
graph::GraphTensor(v_cache),
graph::GraphTensor(k),
graph::GraphTensor(v),
graph::GraphTensor(past_kv_lengths)};
return planned;
}
void run(void *planned_meta) {
auto planned = reinterpret_cast<PlannedMeta *>(planned_meta);
INFINICORE_CHECK_ERROR(infiniopKVCaching(
planned->descriptor->desc,
planned->workspace->data(),
planned->workspace->numel(),
planned->k_cache->data(),
planned->v_cache->data(),
planned->k->data(),
planned->v->data(),
planned->past_kv_lengths->data(),
context::getStream()));
}
void cleanup(void **planned_meta_ptr) {
delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
*planned_meta_ptr = nullptr;
}
INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(KVCaching, &plan, &run, &cleanup);
} // namespace infinicore::op::kv_caching_impl::infiniop
#include "infinicore/ops/linear.hpp"
#include "infinicore/ops/add.hpp"
#include "infinicore/ops/matmul.hpp"
#include "infinicore/ops/gemm.hpp"
#include "infinicore/ops/rearrange.hpp"
namespace infinicore::op {
@@ -42,16 +42,18 @@ void linear_(Tensor out,
// linear transformation
Tensor out_view = out->view({N, out_features});
float alpha = 1.0f;
float beta = 0.0f;
// Add bias by copying it into the output first, then letting the GEMM accumulate onto it (beta = 1)
if (bias.has_value()) {
rearrange_(out_view,
bias.value()->as_strided({N, out_features}, {0, 1}));
beta = 1.0f;
}
gemm_(out_view,
input->view({N, in_features}),
weight->permute({1, 0}), alpha, beta);
}
} // namespace infinicore::op
#include "infinicore/ops/linear_w4a16_awq.hpp"
#include "infinicore/ops/dequantize_awq.hpp"
#include "infinicore/ops/gemm.hpp"
namespace infinicore::op {
Tensor linear_w4a16_awq(Tensor input,
Tensor weight_packed,
Tensor weight_scale,
Tensor weight_zeros,
std::optional<Tensor> bias) {
// Input is of shape [M, K]; weight_packed is of shape [N, K] with strides [N, 1]
Size ndim = input->ndim();
Size out_features = weight_packed->shape()[0];
// Assign memory to out variables
auto output_shape = input->shape();
output_shape[ndim - 1] = out_features;
auto out = Tensor::empty(output_shape, input->dtype(), input->device());
// Compute into the preallocated output tensor
linear_w4a16_awq_(out, input, weight_packed, weight_scale, weight_zeros, bias);
return out;
}
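// W4A16 AWQ linear: dequantize the 4-bit packed weight into a temporary
// [out_features, in_features] matrix in the output dtype, then apply a regular GEMM
// (with the optional bias folded in the same way as linear_()).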
void linear_w4a16_awq_(Tensor out,
Tensor input,
Tensor weight_packed,
Tensor weight_scale,
Tensor weight_zeros,
std::optional<Tensor> bias) {
auto weight_packed_shape = weight_packed->shape();
Size out_features = weight_packed_shape[0];
Size in_features = weight_packed_shape[1];
Size ndim = input->ndim();
assert(out->ndim() == ndim);
Size N = 1;
auto input_shape = input->shape();
for (size_t i = 0; i < ndim - 1; ++i) {
N *= input_shape[i];
}
auto weight = Tensor::empty(
{out_features, in_features},
out->dtype(),
weight_packed->device());
Tensor out_view = out->view({N, out_features});
float alpha = 1.0f;
float beta = 0.0f;
op::dequantize_awq_(weight, weight_packed, weight_scale, weight_zeros);
// Add bias by copying it into the output first, then letting the GEMM accumulate onto it (beta = 1)
if (bias.has_value()) {
rearrange_(out_view,
bias.value()->as_strided({N, out_features}, {0, 1}));
beta = 1.0f;
}
gemm_(out_view,
input->view({N, in_features}),
weight->permute({1, 0}), alpha, beta);
}
} // namespace infinicore::op
#include "infinicore/ops/linear_w8a8i8.hpp"
#include "infinicore/ops/per_channel_quant_i8.hpp"
#include "infinicore/ops/scaled_mm_i8.hpp"
namespace infinicore::op {
Tensor linear_w8a8i8(Tensor input,
Tensor weight_packed,
Tensor weight_scale,
std::optional<Tensor> bias) {
// Input is of shape [M, K]; weight_packed is of shape [N, K] with strides [N, 1]
Size ndim = input->ndim();
Size out_features = weight_packed->shape()[0];
// Assign memory to out variables
auto output_shape = input->shape();
output_shape[ndim - 1] = out_features;
auto out = Tensor::empty(output_shape, input->dtype(), input->device());
// Compute into the preallocated output tensor
linear_w8a8i8_(out, input, weight_packed, weight_scale, bias);
return out;
}
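// W8A8 int8 linear: the activations are quantized row-wise to int8 with an F32 per-row scale
// (per_channel_quant_i8_), then scaled_mm_i8_ multiplies them against the pre-quantized int8
// weight, taking both scale tensors and the optional bias.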
void linear_w8a8i8_(Tensor out,
Tensor input,
Tensor weight_packed,
Tensor weight_scale,
std::optional<Tensor> bias) {
auto weight_packed_shape = weight_packed->shape();
Size out_features = weight_packed_shape[0];
Size in_features = weight_packed_shape[1];
Size ndim = input->ndim();
assert(out->ndim() == ndim);
Size N = 1;
auto input_shape = input->shape();
for (size_t i = 0; i < ndim - 1; ++i) {
N *= input_shape[i];
}
auto input_packed = Tensor::empty(
{N, input_shape[ndim - 1]},
DataType::I8,
input->device());
auto input_scale = Tensor::empty(
{N, 1},
DataType::F32,
input->device());
op::per_channel_quant_i8_(input->view({N, in_features}), input_packed, input_scale);
if (bias.has_value()) {
bias = std::make_optional(bias.value()->as_strided({N, out_features}, {0, 1}));
}
op::scaled_mm_i8_(
out->view({N, out_features}),
input_packed,
input_scale,
weight_packed->permute({1, 0}),
weight_scale,
bias);
}
} // namespace infinicore::op
#include "infinicore/ops/mul.hpp"
#include "../../utils.hpp"
namespace infinicore::op {
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(Mul);
Mul::Mul(Tensor c, const Tensor &a, const Tensor &b) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(c, a, b);
INFINICORE_GRAPH_OP_DISPATCH(c->device().getType(), c, a, b);
}
void Mul::execute(Tensor c, const Tensor &a, const Tensor &b) {
INFINICORE_GRAPH_OP_RECORD_OR_RUN(Mul, c, a, b);
}
Tensor mul(const Tensor &a, const Tensor &b) {
auto c = Tensor::empty(a->shape(), a->dtype(), a->device());
mul_(c, a, b);
return c;
}
void mul_(Tensor c, const Tensor &a, const Tensor &b) {
Mul::execute(c, a, b);
}
} // namespace infinicore::op
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/mul.hpp"
#include <infiniop.h>
#include "../infiniop_impl.hpp"
namespace infinicore::op::mul_impl::infiniop {
INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, Mul, 100);
struct PlannedMeta {
std::shared_ptr<Descriptor> descriptor;
graph::GraphTensor workspace, c, a, b;
};
void *plan(Tensor c, const Tensor &a, const Tensor &b) {
size_t seed = hash_combine(c, b, a);
INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
Descriptor, descriptor, Mul,
seed, c->desc(), a->desc(), b->desc());
INFINIOP_WORKSPACE_TENSOR(workspace, Mul, descriptor);
return new PlannedMeta{
descriptor,
graph::GraphTensor(workspace),
graph::GraphTensor(c),
graph::GraphTensor(a),
graph::GraphTensor(b)};
}
void run(void *planned_meta) {
auto planned = reinterpret_cast<PlannedMeta *>(planned_meta);
INFINICORE_CHECK_ERROR(infiniopMul(
planned->descriptor->desc,
planned->workspace->data(),
planned->workspace->numel(),
planned->c->data(),
planned->a->data(),
planned->b->data(),
context::getStream()));
}
void cleanup(void **planned_meta_ptr) {
delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
*planned_meta_ptr = nullptr;
}
INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(Mul, &plan, &run, &cleanup);
} // namespace infinicore::op::mul_impl::infiniop
#include "infinicore/ops/paged_attention.hpp"
#include "../../utils.hpp"
namespace infinicore::op {
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(PagedAttention);
PagedAttention::PagedAttention(Tensor out, const Tensor &q, const Tensor &k_cache, const Tensor &v_cache,
const Tensor &block_tables, const Tensor &kv_lens,
std::optional<Tensor> alibi_slopes, float scale) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, q, k_cache, v_cache, block_tables, kv_lens);
INFINICORE_GRAPH_OP_DISPATCH(out->device().getType(),
out, q, k_cache, v_cache, block_tables, kv_lens, alibi_slopes, scale);
}
void PagedAttention::execute(Tensor out, const Tensor &q, const Tensor &k_cache, const Tensor &v_cache,
const Tensor &block_tables, const Tensor &kv_lens,
std::optional<Tensor> alibi_slopes, float scale) {
INFINICORE_GRAPH_OP_RECORD_OR_RUN(
PagedAttention,
out, q, k_cache, v_cache, block_tables, kv_lens, alibi_slopes, scale);
}
Tensor paged_attention(const Tensor &q, const Tensor &k_cache, const Tensor &v_cache,
const Tensor &block_tables, const Tensor &kv_lens,
std::optional<Tensor> alibi_slopes, float scale) {
auto out = Tensor::empty(q->shape(), q->dtype(), q->device());
paged_attention_(out, q, k_cache, v_cache, block_tables, kv_lens, alibi_slopes, scale);
return out;
}
void paged_attention_(Tensor out, const Tensor &q, const Tensor &k_cache, const Tensor &v_cache,
const Tensor &block_tables, const Tensor &kv_lens,
std::optional<Tensor> alibi_slopes, float scale) {
PagedAttention::execute(out, q, k_cache, v_cache, block_tables, kv_lens, alibi_slopes, scale);
}
} // namespace infinicore::op
#include "infinicore/ops/paged_attention.hpp"
#include "../infiniop_impl.hpp"
namespace infinicore::op::paged_attention_impl::infiniop {
INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, PagedAttention, 100);
struct PlannedMeta {
std::shared_ptr<Descriptor> descriptor;
graph::GraphTensor workspace, out, q, k_cache, v_cache, block_tables, cache_lens;
std::optional<graph::GraphTensor> alibi_slopes;
float scale;
};
void *plan(Tensor out, const Tensor &q, const Tensor &k_cache, const Tensor &v_cache,
const Tensor &block_tables, const Tensor &cache_lens,
std::optional<Tensor> alibi_slopes, float scale) {
size_t seed = hash_combine(out, q, k_cache, v_cache, block_tables, cache_lens, alibi_slopes);
INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
Descriptor, descriptor, PagedAttention,
seed,
out->desc(), q->desc(), k_cache->desc(), v_cache->desc(),
block_tables->desc(), cache_lens->desc(),
alibi_slopes ? alibi_slopes.value()->desc() : nullptr,
scale);
INFINIOP_WORKSPACE_TENSOR(workspace, PagedAttention, descriptor);
return new PlannedMeta{
descriptor,
graph::GraphTensor(workspace),
graph::GraphTensor(out),
graph::GraphTensor(q),
graph::GraphTensor(k_cache),
graph::GraphTensor(v_cache),
graph::GraphTensor(block_tables),
graph::GraphTensor(cache_lens),
alibi_slopes ? std::optional<graph::GraphTensor>(graph::GraphTensor(*alibi_slopes)) : std::nullopt,
scale};
}
void run(void *planned_meta) {
auto *p = reinterpret_cast<PlannedMeta *>(planned_meta);
INFINICORE_CHECK_ERROR(
infiniopPagedAttention(
p->descriptor->desc,
p->workspace->data(),
p->workspace->numel(),
p->out->data(),
p->q->data(),
p->k_cache->data(),
p->v_cache->data(),
p->block_tables->data(),
p->cache_lens->data(),
p->alibi_slopes.has_value() ? p->alibi_slopes.value()->data() : nullptr,
context::getStream()));
}
void cleanup(void **planned_meta_ptr) {
delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
*planned_meta_ptr = nullptr;
}
INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(PagedAttention, &plan, &run, &cleanup);
} // namespace infinicore::op::paged_attention_impl::infiniop
#include "infinicore/ops/paged_attention_prefill.hpp"
#include "../../utils.hpp"
namespace infinicore::op {
common::OpDispatcher<PagedAttentionPrefill::schema> &PagedAttentionPrefill::dispatcher() {
static common::OpDispatcher<PagedAttentionPrefill::schema> dispatcher_;
return dispatcher_;
};
void PagedAttentionPrefill::execute(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache,
Tensor block_tables, Tensor kv_lens, Tensor cum_seqlens_q,
std::optional<Tensor> alibi_slopes, float scale) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, q, k_cache, v_cache, block_tables, kv_lens, cum_seqlens_q);
infinicore::context::setDevice(out->device());
dispatcher().lookup(out->device().getType())(out, q, k_cache, v_cache, block_tables,
kv_lens, cum_seqlens_q, alibi_slopes, scale);
}
Tensor paged_attention_prefill(Tensor q, Tensor k_cache, Tensor v_cache,
Tensor block_tables, Tensor kv_lens, Tensor cum_seqlens_q,
std::optional<Tensor> alibi_slopes, float scale) {
auto out = Tensor::empty(q->shape(), q->dtype(), q->device());
paged_attention_prefill_(out, q, k_cache, v_cache, block_tables, kv_lens, cum_seqlens_q, alibi_slopes, scale);
return out;
}
void paged_attention_prefill_(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache,
Tensor block_tables, Tensor kv_lens, Tensor cum_seqlens_q,
std::optional<Tensor> alibi_slopes, float scale) {
PagedAttentionPrefill::execute(out, q, k_cache, v_cache, block_tables, kv_lens, cum_seqlens_q, alibi_slopes, scale);
}
} // namespace infinicore::op
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/paged_attention_prefill.hpp"
#include <infiniop.h>
namespace infinicore::op::paged_attention_prefill_impl::infiniop {
thread_local common::OpCache<size_t, infiniopPagedAttentionPrefillDescriptor_t> caches(
100, // capacity
[](infiniopPagedAttentionPrefillDescriptor_t &desc) {
if (desc != nullptr) {
INFINICORE_CHECK_ERROR(infiniopDestroyPagedAttentionPrefillDescriptor(desc));
desc = nullptr;
}
});
void calculate(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache,
Tensor block_tables, Tensor kv_lens, Tensor cum_seqlens_q,
std::optional<Tensor> alibi_slopes, float scale) {
size_t seed = hash_combine(out, q, k_cache, v_cache, block_tables, kv_lens, cum_seqlens_q, alibi_slopes, scale);
auto device = context::getDevice();
auto &cache = caches.getCache(device);
auto desc_opt = cache.get(seed);
infiniopPagedAttentionPrefillDescriptor_t desc = nullptr;
if (!desc_opt) {
INFINICORE_CHECK_ERROR(infiniopCreatePagedAttentionPrefillDescriptor(
context::getInfiniopHandle(device), &desc,
out->desc(),
q->desc(),
k_cache->desc(),
v_cache->desc(),
block_tables->desc(),
kv_lens->desc(),
cum_seqlens_q->desc(),
alibi_slopes.has_value() ? alibi_slopes.value()->desc() : nullptr,
scale));
cache.put(seed, desc);
} else {
desc = *desc_opt;
}
size_t workspace_size = 0;
INFINICORE_CHECK_ERROR(infiniopGetPagedAttentionPrefillWorkspaceSize(desc, &workspace_size));
std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
INFINICORE_CHECK_ERROR(infiniopPagedAttentionPrefill(
desc,
workspace->data(),
workspace_size,
out->data(),
q->data(),
k_cache->data(),
v_cache->data(),
block_tables->data(),
kv_lens->data(),
cum_seqlens_q->data(),
alibi_slopes.has_value() ? alibi_slopes.value()->data() : nullptr,
context::getStream()));
}
static bool registered = []() {
PagedAttentionPrefill::dispatcher().registerAll(&calculate, false);
return true;
}();
} // namespace infinicore::op::paged_attention_prefill_impl::infiniop