Commit d7965f91 authored by wooway777

issue/21 - Initial Modularization

parent f59c7bf5
// cache_manager.hpp
#ifndef CACHE_MANAGER_HPP
#define CACHE_MANAGER_HPP

#include <cstdint>
#include <cstring>
#include <functional>
#include <memory>
#include <stdexcept>
#include <type_traits>
#include <unordered_map>
#include <vector>

#include "../tensor.hpp"
#include "../utils.hpp"

#include "infinicore_infer.h"
// Hash-combine utility (same mixing constant as boost::hash_combine)
inline void hash_combine(size_t &seed, size_t value) {
    seed ^= value + 0x9e3779b9 + (seed << 6) + (seed >> 2);
}

// Overload for enum types (selected via SFINAE), hashed through their integral value
template <typename T>
inline void hash_combine(size_t &seed, T value, typename std::enable_if<std::is_enum<T>::value>::type * = 0) {
    hash_combine(seed, static_cast<size_t>(value));
}

// Overload for float: hash the raw bit pattern so equal floats always hash equally
inline void hash_combine(size_t &seed, float value) {
    uint32_t int_value;
    static_assert(sizeof(value) == sizeof(int_value), "Size mismatch");
    std::memcpy(&int_value, &value, sizeof(value));
    hash_combine(seed, static_cast<size_t>(int_value));
}
// Compute a hash for a tensor descriptor from its dtype, shape, and strides
inline size_t computeTensorDescHash(const std::shared_ptr<TensorDesc> &desc) {
    size_t seed = 0;
    hash_combine(seed, desc->dtype());
    for (auto dim : desc->shape()) {
        hash_combine(seed, dim);
    }
    for (auto stride : desc->strides()) {
        hash_combine(seed, static_cast<size_t>(stride));
    }
    return seed;
}
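// Illustrative note (not part of the original commit): hash_combine is
// order-sensitive, so descriptors that differ only in stride layout hash to
// different keys. Sketch, assuming a hypothetical TensorDesc factory:
//
//   auto row_major = TensorDesc::create(INFINI_DTYPE_F16, {2, 3}, {3, 1});
//   auto col_major = TensorDesc::create(INFINI_DTYPE_F16, {2, 3}, {1, 2});
//   // The two keys differ (barring an unlikely hash collision):
//   assert(computeTensorDescHash(row_major) != computeTensorDescHash(col_major));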
enum class OperatorType {
    RMS_NORM,
    GEMM,
    ROPE,
    REARRANGE,
    CAUSAL_SOFTMAX,
    SWIGLU,
    RANDOM_SAMPLE
};
// Fixed-capacity LRU cache mapping layout hash keys to infiniop operator
// descriptors. A doubly linked list tracks recency (most recent at the head)
// and an unordered_map gives O(1) lookup. Evicted or replaced descriptors are
// destroyed through the infiniop call matching opType.
template <typename DescriptorType>
class LRUDescriptorCache {
private:
    struct CacheNode {
        size_t key;
        DescriptorType desc;
        CacheNode *prev;
        CacheNode *next;
        CacheNode() : key(0), desc(), prev(nullptr), next(nullptr) {}
        CacheNode(size_t k, const DescriptorType &d) : key(k), desc(d), prev(nullptr), next(nullptr) {}
    };

    std::unordered_map<size_t, CacheNode *> cache;
    CacheNode *head; // sentinel: head->next is the most recently used node
    CacheNode *tail; // sentinel: tail->prev is the least recently used node
    const size_t capacity;
    size_t size;
    const OperatorType opType;

    void destroyDescriptor(DescriptorType &desc) {
        switch (opType) {
        case OperatorType::RMS_NORM:
            infiniopDestroyRMSNormDescriptor(desc);
            break;
        case OperatorType::GEMM:
            infiniopDestroyGemmDescriptor(desc);
            break;
        case OperatorType::ROPE:
            infiniopDestroyRoPEDescriptor(desc);
            break;
        case OperatorType::REARRANGE:
            infiniopDestroyRearrangeDescriptor(desc);
            break;
        case OperatorType::CAUSAL_SOFTMAX:
            infiniopDestroyCausalSoftmaxDescriptor(desc);
            break;
        case OperatorType::SWIGLU:
            infiniopDestroySwiGLUDescriptor(desc);
            break;
        case OperatorType::RANDOM_SAMPLE:
            infiniopDestroyRandomSampleDescriptor(desc);
            break;
        default:
            throw std::runtime_error("Unknown descriptor type");
        }
    }

    // Unlink a node, destroy its descriptor, and remove it from the map
    void removeNode(CacheNode *node) {
        node->prev->next = node->next;
        node->next->prev = node->prev;
        destroyDescriptor(node->desc);
        cache.erase(node->key);
        delete node;
        --size;
    }

    // Link a new node right after head; evict the least recently used node
    // if this pushes the cache past capacity
    void addToTop(CacheNode *node) {
        node->next = head->next;
        node->next->prev = node;
        node->prev = head;
        head->next = node;
        cache[node->key] = node;
        if (++size > capacity) {
            removeNode(tail->prev);
        }
    }

    // Unlink a node and relink it right after head (mark most recently used)
    void moveToTop(CacheNode *node) {
        node->prev->next = node->next;
        node->next->prev = node->prev;
        node->next = head->next;
        node->next->prev = node;
        node->prev = head;
        head->next = node;
    }

public:
    LRUDescriptorCache(size_t c, OperatorType t) : capacity(c), size(0), opType(t) {
        head = new CacheNode();
        tail = new CacheNode();
        head->next = tail;
        tail->prev = head;
    }

    ~LRUDescriptorCache() {
        while (head->next != tail) {
            removeNode(head->next);
        }
        delete head;
        delete tail;
    }

    // On a hit, copy the descriptor to out_desc and mark it most recently used
    bool get(size_t key, DescriptorType &out_desc) {
        auto it = cache.find(key);
        if (it == cache.end()) {
            return false;
        }
        CacheNode *node = it->second;
        moveToTop(node);
        out_desc = node->desc;
        return true;
    }

    void put(size_t key, const DescriptorType &descriptor) {
        auto it = cache.find(key);
        if (it != cache.end()) {
            // Key already exists: destroy the old descriptor and store the new one
            CacheNode *node = it->second;
            destroyDescriptor(node->desc);
            node->desc = descriptor;
            moveToTop(node);
            return;
        }
        // Cache is full: evict the least recently used entry before inserting
        if (size >= capacity) {
            removeNode(tail->prev);
        }
        CacheNode *node = new CacheNode(key, descriptor);
        addToTop(node);
    }

    LRUDescriptorCache(const LRUDescriptorCache &) = delete;
    LRUDescriptorCache &operator=(const LRUDescriptorCache &) = delete;
};
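// Usage sketch (illustrative, not from this commit): a standalone GEMM cache.
// `handle`, `key`, and the three tensor-descriptor arguments stand in for the
// real values that CacheManager/InferenceContext supply below.
//
//   LRUDescriptorCache<infiniopGemmDescriptor_t> gemm_cache(100, OperatorType::GEMM);
//   infiniopGemmDescriptor_t desc;
//   if (!gemm_cache.get(key, desc)) {
//       // miss: build the descriptor once, then hand ownership to the cache
//       RUN_INFINI(infiniopCreateGemmDescriptor(handle, &desc, c_desc, a_desc, b_desc));
//       gemm_cache.put(key, desc);
//   }
//   // desc stays valid until its entry is evicted or overwritten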
// Owns one LRU descriptor cache per operator type and exposes typed get/put
// helpers plus the shared key-derivation function
class CacheManager {
private:
    static constexpr size_t DEFAULT_CACHE_CAPACITY = 100;

    LRUDescriptorCache<infiniopRMSNormDescriptor_t> rms_norm_cache;
    LRUDescriptorCache<infiniopGemmDescriptor_t> gemm_cache;
    LRUDescriptorCache<infiniopRoPEDescriptor_t> rope_cache;
    LRUDescriptorCache<infiniopRearrangeDescriptor_t> rearrange_cache;
    LRUDescriptorCache<infiniopCausalSoftmaxDescriptor_t> causal_softmax_cache;
    LRUDescriptorCache<infiniopSwiGLUDescriptor_t> swiglu_cache;
    LRUDescriptorCache<infiniopRandomSampleDescriptor_t> random_sample_cache;

public:
    CacheManager(size_t capacity = DEFAULT_CACHE_CAPACITY)
        : rms_norm_cache(capacity, OperatorType::RMS_NORM),
          gemm_cache(capacity, OperatorType::GEMM),
          rope_cache(capacity, OperatorType::ROPE),
          rearrange_cache(capacity, OperatorType::REARRANGE),
          causal_softmax_cache(capacity, OperatorType::CAUSAL_SOFTMAX),
          swiglu_cache(capacity, OperatorType::SWIGLU),
          random_sample_cache(capacity, OperatorType::RANDOM_SAMPLE) {}

    // RMSNorm operations
    bool getRMSNormDescriptor(size_t key, infiniopRMSNormDescriptor_t &desc) {
        return rms_norm_cache.get(key, desc);
    }
    void putRMSNormDescriptor(size_t key, const infiniopRMSNormDescriptor_t &desc) {
        rms_norm_cache.put(key, desc);
    }

    // GEMM operations
    bool getGemmDescriptor(size_t key, infiniopGemmDescriptor_t &desc) {
        return gemm_cache.get(key, desc);
    }
    void putGemmDescriptor(size_t key, const infiniopGemmDescriptor_t &desc) {
        gemm_cache.put(key, desc);
    }

    // RoPE operations
    bool getRoPEDescriptor(size_t key, infiniopRoPEDescriptor_t &desc) {
        return rope_cache.get(key, desc);
    }
    void putRoPEDescriptor(size_t key, const infiniopRoPEDescriptor_t &desc) {
        rope_cache.put(key, desc);
    }

    // Rearrange operations
    bool getRearrangeDescriptor(size_t key, infiniopRearrangeDescriptor_t &desc) {
        return rearrange_cache.get(key, desc);
    }
    void putRearrangeDescriptor(size_t key, const infiniopRearrangeDescriptor_t &desc) {
        rearrange_cache.put(key, desc);
    }

    // Causal softmax operations
    bool getCausalSoftmaxDescriptor(size_t key, infiniopCausalSoftmaxDescriptor_t &desc) {
        return causal_softmax_cache.get(key, desc);
    }
    void putCausalSoftmaxDescriptor(size_t key, const infiniopCausalSoftmaxDescriptor_t &desc) {
        causal_softmax_cache.put(key, desc);
    }

    // SwiGLU operations
    bool getSwiGLUDescriptor(size_t key, infiniopSwiGLUDescriptor_t &desc) {
        return swiglu_cache.get(key, desc);
    }
    void putSwiGLUDescriptor(size_t key, const infiniopSwiGLUDescriptor_t &desc) {
        swiglu_cache.put(key, desc);
    }

    // Random sample operations
    bool getRandomSampleDescriptor(size_t key, infiniopRandomSampleDescriptor_t &desc) {
        return random_sample_cache.get(key, desc);
    }
    void putRandomSampleDescriptor(size_t key, const infiniopRandomSampleDescriptor_t &desc) {
        random_sample_cache.put(key, desc);
    }

    // Combine the hashes of up to five tensor descriptors into a single key;
    // null descriptors are skipped
    static size_t createDescriptorKey(std::shared_ptr<TensorDesc> desc0,
                                      std::shared_ptr<TensorDesc> desc1 = nullptr,
                                      std::shared_ptr<TensorDesc> desc2 = nullptr,
                                      std::shared_ptr<TensorDesc> desc3 = nullptr,
                                      std::shared_ptr<TensorDesc> desc4 = nullptr) {
        size_t seed = 0;
        if (desc0) {
            hash_combine(seed, computeTensorDescHash(desc0));
        }
        if (desc1) {
            hash_combine(seed, computeTensorDescHash(desc1));
        }
        if (desc2) {
            hash_combine(seed, computeTensorDescHash(desc2));
        }
        if (desc3) {
            hash_combine(seed, computeTensorDescHash(desc3));
        }
        if (desc4) {
            hash_combine(seed, computeTensorDescHash(desc4));
        }
        return seed;
    }
};
#endif // CACHE_MANAGER_HPP
#include "inference_context.hpp"
#include "../tensor.hpp"
#include "../utils.hpp"
InferenceContext::InferenceContext(DeviceResource *rsrc, CacheManager *cache_manager, infinirtStream_t stream)
: rsrc(rsrc), cache_manager(cache_manager), stream(stream) {}
void InferenceContext::ensure_workspace(size_t required_size) {
if (required_size > current_workspace_size) {
workspace_storage = Storage::createFromPool(required_size, rsrc->memory_pool);
current_workspace_size = required_size;
}
}
void InferenceContext::rmsnorm(std::shared_ptr<Tensor> y,
                               std::shared_ptr<Tensor> x,
                               std::shared_ptr<Tensor> w,
                               float epsilon) {
    // Look up a cached descriptor keyed on the tensor layouts, creating and
    // caching one on a miss. Epsilon is baked into the descriptor at creation,
    // so it must be part of the key as well.
    size_t key = CacheManager::createDescriptorKey(y->tdesc(), x->tdesc(), w->tdesc(), nullptr, nullptr);
    hash_combine(key, epsilon);
    infiniopRMSNormDescriptor_t desc;
    if (!cache_manager->getRMSNormDescriptor(key, desc)) {
        RUN_INFINI(infiniopCreateRMSNormDescriptor(
            rsrc->handle, &desc, y->desc(), x->desc(), w->desc(), epsilon));
        cache_manager->putRMSNormDescriptor(key, desc);
    }

    size_t workspace_size = 0;
    RUN_INFINI(infiniopGetRMSNormWorkspaceSize(desc, &workspace_size));
    ensure_workspace(workspace_size);
    void *workspace = workspace_storage->memory();

    RUN_INFINI(infiniopRMSNorm(
        desc, workspace, workspace_size,
        y->data(), x->data(), w->data(), stream));
}
void InferenceContext::gemm(std::shared_ptr<Tensor> c, std::shared_ptr<TensorDesc> c_desc_overwrite,
                            std::shared_ptr<Tensor> a, std::shared_ptr<TensorDesc> a_desc_overwrite,
                            std::shared_ptr<Tensor> b, std::shared_ptr<TensorDesc> b_desc_overwrite,
                            float alpha, float beta) {
    // The *_desc_overwrite parameters let callers reinterpret a tensor's
    // layout without materializing a new tensor
    size_t key = CacheManager::createDescriptorKey(
        c_desc_overwrite ? c_desc_overwrite : c->tdesc(),
        a_desc_overwrite ? a_desc_overwrite : a->tdesc(),
        b_desc_overwrite ? b_desc_overwrite : b->tdesc(),
        nullptr, nullptr);
    infiniopGemmDescriptor_t desc;
    if (!cache_manager->getGemmDescriptor(key, desc)) {
        RUN_INFINI(infiniopCreateGemmDescriptor(
            rsrc->handle, &desc,
            c_desc_overwrite ? c_desc_overwrite->desc() : c->desc(),
            a_desc_overwrite ? a_desc_overwrite->desc() : a->desc(),
            b_desc_overwrite ? b_desc_overwrite->desc() : b->desc()));
        cache_manager->putGemmDescriptor(key, desc);
    }

    size_t workspace_size = 0;
    RUN_INFINI(infiniopGetGemmWorkspaceSize(desc, &workspace_size));
    ensure_workspace(workspace_size);
    void *workspace = workspace_storage->memory();

    RUN_INFINI(infiniopGemm(
        desc, workspace, workspace_size,
        c->data(), a->data(), b->data(), alpha, beta, stream));
}
void InferenceContext::rearrange(std::shared_ptr<Tensor> dst, std::shared_ptr<TensorDesc> dst_desc_overwrite,
                                 std::shared_ptr<Tensor> src, std::shared_ptr<TensorDesc> src_desc_overwrite) {
    size_t key = CacheManager::createDescriptorKey(
        dst_desc_overwrite ? dst_desc_overwrite : dst->tdesc(),
        src_desc_overwrite ? src_desc_overwrite : src->tdesc(),
        nullptr, nullptr, nullptr);
    infiniopRearrangeDescriptor_t desc;
    if (!cache_manager->getRearrangeDescriptor(key, desc)) {
        RUN_INFINI(infiniopCreateRearrangeDescriptor(
            rsrc->handle, &desc,
            dst_desc_overwrite ? dst_desc_overwrite->desc() : dst->desc(),
            src_desc_overwrite ? src_desc_overwrite->desc() : src->desc()));
        cache_manager->putRearrangeDescriptor(key, desc);
    }

    RUN_INFINI(infiniopRearrange(
        desc,
        dst->data(),
        src->data(),
        stream));
}
void InferenceContext::rope(std::shared_ptr<Tensor> q,
                            std::shared_ptr<Tensor> k,
                            std::shared_ptr<Tensor> pos,
                            std::shared_ptr<Tensor> sin,
                            std::shared_ptr<Tensor> cos) {
    size_t key = CacheManager::createDescriptorKey(q->tdesc(), k->tdesc(), pos->tdesc(), sin->tdesc(), cos->tdesc());
    infiniopRoPEDescriptor_t desc;
    if (!cache_manager->getRoPEDescriptor(key, desc)) {
        RUN_INFINI(infiniopCreateRoPEDescriptor(
            rsrc->handle, &desc, q->desc(), k->desc(),
            pos->desc(), sin->desc(), cos->desc()));
        cache_manager->putRoPEDescriptor(key, desc);
    }

    size_t workspace_size = 0;
    RUN_INFINI(infiniopGetRoPEWorkspaceSize(desc, &workspace_size));
    ensure_workspace(workspace_size);
    void *workspace = workspace_storage->memory();

    RUN_INFINI(infiniopRoPE(
        desc, workspace, workspace_size,
        q->data(), k->data(), pos->data(),
        sin->data(), cos->data(), stream));
}
void InferenceContext::causalSoftmax(std::shared_ptr<Tensor> y, std::shared_ptr<TensorDesc> y_desc_overwrite,
                                     std::shared_ptr<Tensor> x, std::shared_ptr<TensorDesc> x_desc_overwrite) {
    size_t key = CacheManager::createDescriptorKey(
        y_desc_overwrite ? y_desc_overwrite : y->tdesc(),
        x_desc_overwrite ? x_desc_overwrite : x->tdesc(),
        nullptr, nullptr, nullptr);
    infiniopCausalSoftmaxDescriptor_t desc;
    if (!cache_manager->getCausalSoftmaxDescriptor(key, desc)) {
        RUN_INFINI(infiniopCreateCausalSoftmaxDescriptor(
            rsrc->handle, &desc,
            y_desc_overwrite ? y_desc_overwrite->desc() : y->desc(),
            x_desc_overwrite ? x_desc_overwrite->desc() : x->desc()));
        cache_manager->putCausalSoftmaxDescriptor(key, desc);
    }

    size_t workspace_size = 0;
    RUN_INFINI(infiniopGetCausalSoftmaxWorkspaceSize(desc, &workspace_size));
    ensure_workspace(workspace_size);
    void *workspace = workspace_storage->memory();

    RUN_INFINI(infiniopCausalSoftmax(desc, workspace, workspace_size,
                                     y->data(), x->data(), stream));
}
void InferenceContext::swiglu(std::shared_ptr<Tensor> out, std::shared_ptr<Tensor> up, std::shared_ptr<Tensor> gate) {
    size_t key = CacheManager::createDescriptorKey(out->tdesc(), up->tdesc(), gate->tdesc(), nullptr, nullptr);
    infiniopSwiGLUDescriptor_t desc;
    if (!cache_manager->getSwiGLUDescriptor(key, desc)) {
        RUN_INFINI(infiniopCreateSwiGLUDescriptor(
            rsrc->handle, &desc, out->desc(), up->desc(), gate->desc()));
        cache_manager->putSwiGLUDescriptor(key, desc);
    }

    size_t workspace_size = 0;
    RUN_INFINI(infiniopGetSwiGLUWorkspaceSize(desc, &workspace_size));
    ensure_workspace(workspace_size);
    void *workspace = workspace_storage->memory();

    RUN_INFINI(infiniopSwiGLU(desc, workspace, workspace_size,
                              out->data(), up->data(), gate->data(), stream));
}
void InferenceContext::randomSample(std::shared_ptr<Tensor> out, std::shared_ptr<TensorDesc> out_desc_overwrite,
                                    std::shared_ptr<Tensor> prob, std::shared_ptr<TensorDesc> prob_desc_overwrite,
                                    float random_val, float top_p, uint32_t top_k, float temperature) {
    // random_val, top_p, top_k, and temperature are execution-time arguments
    // to infiniopRandomSample, so they are not part of the descriptor key
    size_t key = CacheManager::createDescriptorKey(
        out_desc_overwrite ? out_desc_overwrite : out->tdesc(),
        prob_desc_overwrite ? prob_desc_overwrite : prob->tdesc(),
        nullptr, nullptr, nullptr);
    infiniopRandomSampleDescriptor_t desc;
    if (!cache_manager->getRandomSampleDescriptor(key, desc)) {
        RUN_INFINI(infiniopCreateRandomSampleDescriptor(
            rsrc->handle, &desc,
            out_desc_overwrite ? out_desc_overwrite->desc() : out->desc(),
            prob_desc_overwrite ? prob_desc_overwrite->desc() : prob->desc()));
        cache_manager->putRandomSampleDescriptor(key, desc);
    }

    size_t workspace_size = 0;
    RUN_INFINI(infiniopGetRandomSampleWorkspaceSize(desc, &workspace_size));
    ensure_workspace(workspace_size);
    void *workspace = workspace_storage->memory();

    RUN_INFINI(infiniopRandomSample(
        desc, workspace, workspace_size,
        out->data(), prob->data(),
        random_val, top_p, top_k, temperature,
        stream));
}
// inference_context.hpp
#pragma once

#include "cache_manager.hpp"
#include "jiuge/jiuge_impl.hpp"
#include "jiuge/jiuge_weight.hpp"

// Bundles the per-device resources, descriptor caches, compute stream, and a
// grow-only scratch workspace used when launching infiniop operators. The
// rsrc and cache_manager pointers are non-owning.
struct InferenceContext {
    DeviceResource *rsrc;
    CacheManager *cache_manager;
    infinirtStream_t stream;
    std::shared_ptr<Storage> workspace_storage;
    size_t current_workspace_size = 0;

    InferenceContext(DeviceResource *rsrc, CacheManager *cache_manager, infinirtStream_t stream);

    void ensure_workspace(size_t required_size);

    void rmsnorm(std::shared_ptr<Tensor> y,
                 std::shared_ptr<Tensor> x,
                 std::shared_ptr<Tensor> w,
                 float epsilon);
    void gemm(std::shared_ptr<Tensor> c, std::shared_ptr<TensorDesc> c_desc_overwrite,
              std::shared_ptr<Tensor> a, std::shared_ptr<TensorDesc> a_desc_overwrite,
              std::shared_ptr<Tensor> b, std::shared_ptr<TensorDesc> b_desc_overwrite,
              float alpha, float beta);
    void rearrange(std::shared_ptr<Tensor> dst, std::shared_ptr<TensorDesc> dst_desc_overwrite,
                   std::shared_ptr<Tensor> src, std::shared_ptr<TensorDesc> src_desc_overwrite);
    void rope(std::shared_ptr<Tensor> q,
              std::shared_ptr<Tensor> k,
              std::shared_ptr<Tensor> pos,
              std::shared_ptr<Tensor> sin,
              std::shared_ptr<Tensor> cos);
    void causalSoftmax(std::shared_ptr<Tensor> y, std::shared_ptr<TensorDesc> y_desc_overwrite,
                       std::shared_ptr<Tensor> x, std::shared_ptr<TensorDesc> x_desc_overwrite);
    void swiglu(std::shared_ptr<Tensor> out, std::shared_ptr<Tensor> up, std::shared_ptr<Tensor> gate);
    void randomSample(std::shared_ptr<Tensor> out, std::shared_ptr<TensorDesc> out_desc_overwrite,
                      std::shared_ptr<Tensor> prob, std::shared_ptr<TensorDesc> prob_desc_overwrite,
                      float random_val, float top_p, uint32_t top_k, float temperature);
};
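// Usage sketch (illustrative, not part of this commit): wiring the pieces
// together. `rsrc`, `stream`, and the tensors y, x, w are assumed to already
// exist; 256 is an arbitrary per-operator cache capacity.
//
//   CacheManager cache_manager(256);
//   InferenceContext ctx(&rsrc, &cache_manager, stream); // non-owning pointers
//
//   // The first call builds and caches the RMSNorm descriptor; subsequent
//   // calls with identically laid-out tensors reuse it and skip recreation.
//   ctx.rmsnorm(y, x, w, 1e-5f);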
@@ -120,6 +120,7 @@ public:
     infiniDtype_t dtype() const;
     bool isContigous() const;
     infiniopTensorDescriptor_t desc() const;
+    std::shared_ptr<TensorDesc> tdesc() const;
     ptrdiff_t dataOffset() const;
     infiniDevice_t deviceType() const;
     int deviceId() const;
...
@@ -108,6 +108,7 @@ ptrdiff_t Tensor::dataOffset() const {
 }
 infiniopTensorDescriptor_t Tensor::desc() const { return _desc->desc(); }
+std::shared_ptr<TensorDesc> Tensor::tdesc() const { return _desc; }
 std::shared_ptr<Tensor> Tensor::buffer(infiniDtype_t dtype,
                                        const std::vector<size_t> &shape,
...
@@ -12,6 +12,7 @@ target("infinicore_infer")
     set_languages("cxx17")
     set_warnings("all", "error")
     add_files("src/models/*.cpp")
+    add_files("src/models/*/*.cpp")
     add_files("src/tensor/*.cpp")
     add_files("src/allocator/*.cpp")
...