Unverified commit 22804eaa authored by blkmjsian, committed by GitHub

[T2-3-1] blkmjsian

- DeepSeek
- Jiuge 4B AWQ
parent 5c6000ec
@@ -2,12 +2,12 @@
#include "../tensor.hpp"
#include "../utils.hpp"
-InferenceContext::InferenceContext(DeviceResource *rsrc, CacheManager *cache_manager, infinirtStream_t stream)
-: rsrc(rsrc), cache_manager(cache_manager), stream(stream) {}
+InferenceContext::InferenceContext(infiniopHandle_t op_handle_, std::shared_ptr<MemoryPool> memory_pool_, CacheManager *cache_manager, infinirtStream_t stream)
+: op_handle(op_handle_), memory_pool(memory_pool_), cache_manager(cache_manager), stream(stream) {}
void InferenceContext::ensure_workspace(size_t required_size) {
if (required_size > current_workspace_size || !workspace_storage) {
-workspace_storage = Storage::createFromPool(required_size, rsrc->memory_pool);
+workspace_storage = Storage::createFromPool(required_size, memory_pool);
current_workspace_size = required_size;
}
}
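Every operator wrapper in this file follows the same pattern: hash the tensor metadata into a key, reuse a cached infiniop descriptor on a hit, query the op's workspace requirement, and grow the shared workspace only when it is too small (it never shrinks, so steady-state inference stops allocating). A self-contained model of that pattern, assuming `CacheManager::createDescriptorKey` hashes shapes/strides/dtypes (the diff does not show its definition):

```cpp
#include <cstddef>
#include <unordered_map>
#include <vector>

// "Desc" stands in for an infiniop descriptor: an expensive-to-build plan
// for one combination of tensor shapes/strides/dtypes.
struct Desc {};

struct MiniContext {
    std::unordered_map<size_t, Desc> cache; // key -> reusable plan
    std::vector<char> workspace;            // grow-only scratch buffer

    Desc &get_or_create(size_t key) {
        auto it = cache.find(key);
        if (it == cache.end()) {
            it = cache.emplace(key, Desc{}).first; // build once per shape combo
        }
        return it->second;
    }

    void ensure_workspace(size_t required) {
        if (required > workspace.size()) { // mirrors ensure_workspace above
            workspace.resize(required);    // never shrinks
        }
    }
};
```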
@@ -19,7 +19,7 @@ void InferenceContext::add(std::shared_ptr<Tensor> c,
infiniopAddDescriptor_t desc;
if (!cache_manager->getAddDescriptor(key, desc)) {
-RUN_INFINI(infiniopCreateAddDescriptor(rsrc->handle, &desc, c->desc(), a->desc(), b->desc()));
+RUN_INFINI(infiniopCreateAddDescriptor(op_handle, &desc, c->desc(), a->desc(), b->desc()));
cache_manager->putAddDescriptor(key, desc);
}
@@ -42,7 +42,7 @@ void InferenceContext::rmsnorm(std::shared_ptr<Tensor> y,
infiniopRMSNormDescriptor_t desc;
if (!cache_manager->getRMSNormDescriptor(key, desc)) {
RUN_INFINI(infiniopCreateRMSNormDescriptor(
-rsrc->handle, &desc, y->desc(), x->desc(), w->desc(), epsilon));
+op_handle, &desc, y->desc(), x->desc(), w->desc(), epsilon));
cache_manager->putRMSNormDescriptor(key, desc);
}
@@ -64,7 +64,7 @@ void InferenceContext::gemm(std::shared_ptr<Tensor> c,
infiniopGemmDescriptor_t desc;
if (!cache_manager->getGemmDescriptor(key, desc)) {
-RUN_INFINI(infiniopCreateGemmDescriptor(rsrc->handle, &desc, c->desc(), a->desc(), b->desc()));
+RUN_INFINI(infiniopCreateGemmDescriptor(op_handle, &desc, c->desc(), a->desc(), b->desc()));
cache_manager->putGemmDescriptor(key, desc);
}
@@ -84,7 +84,7 @@ void InferenceContext::rearrange(std::shared_ptr<Tensor> dst,
infiniopRearrangeDescriptor_t desc;
if (!cache_manager->getRearrangeDescriptor(key, desc)) {
-RUN_INFINI(infiniopCreateRearrangeDescriptor(rsrc->handle, &desc, dst->desc(), src->desc()));
+RUN_INFINI(infiniopCreateRearrangeDescriptor(op_handle, &desc, dst->desc(), src->desc()));
cache_manager->putRearrangeDescriptor(key, desc);
}
@@ -105,7 +105,7 @@ void InferenceContext::rope(std::shared_ptr<Tensor> q,
infiniopRoPEDescriptor_t desc;
if (!cache_manager->getRoPEDescriptor(key, desc)) {
RUN_INFINI(infiniopCreateRoPEDescriptor(
-rsrc->handle, &desc, q->desc(), k->desc(),
+op_handle, &desc, q->desc(), k->desc(),
pos->desc(), sin->desc(), cos->desc()));
cache_manager->putRoPEDescriptor(key, desc);
}
@@ -121,6 +121,32 @@ void InferenceContext::rope(std::shared_ptr<Tensor> q,
sin->data(), cos->data(), stream));
}
void InferenceContext::rope_v2(std::shared_ptr<Tensor> q,
std::shared_ptr<Tensor> k,
std::shared_ptr<Tensor> pos,
std::shared_ptr<Tensor> sin,
std::shared_ptr<Tensor> cos) {
size_t key = CacheManager::createDescriptorKey(q, k, pos, sin, cos);
infiniopRoPEv2Descriptor_t desc;
if (!cache_manager->getRoPEv2Descriptor(key, desc)) {
RUN_INFINI(infiniopCreateRoPEv2Descriptor(
op_handle, &desc, q->desc(), k->desc(),
pos->desc(), sin->desc(), cos->desc()));
cache_manager->putRoPEv2Descriptor(key, desc);
}
size_t workspace_size = 0;
RUN_INFINI(infiniopGetRoPEv2WorkspaceSize(desc, &workspace_size));
ensure_workspace(workspace_size);
void *workspace = workspace_storage->memory();
RUN_INFINI(infiniopRoPEv2(
desc, workspace, workspace_size,
q->data(), k->data(), pos->data(),
sin->data(), cos->data(), stream));
}
void InferenceContext::causalSoftmax(std::shared_ptr<Tensor> y,
std::shared_ptr<Tensor> x) {
size_t key = CacheManager::createDescriptorKey(y, x);
@@ -128,7 +154,7 @@ void InferenceContext::causalSoftmax(std::shared_ptr<Tensor> y,
infiniopCausalSoftmaxDescriptor_t desc;
if (!cache_manager->getCausalSoftmaxDescriptor(key, desc)) {
RUN_INFINI(infiniopCreateCausalSoftmaxDescriptor(
-rsrc->handle, &desc, y->desc(), x->desc()));
+op_handle, &desc, y->desc(), x->desc()));
cache_manager->putCausalSoftmaxDescriptor(key, desc);
}
@@ -141,6 +167,31 @@ void InferenceContext::causalSoftmax(std::shared_ptr<Tensor> y,
y->data(), x->data(), stream));
}
void InferenceContext::topkrouter(std::shared_ptr<Tensor> values, // F32
std::shared_ptr<Tensor> indices, // I32
std::shared_ptr<Tensor> x,
std::shared_ptr<Tensor> correction_bias, // F32
float routed_scaling_factor,
size_t topk) {
size_t key = CacheManager::createDescriptorKey(values, indices, x, correction_bias);
infiniopTopkrouterDescriptor_t desc;
if (!cache_manager->getTopkrouterDescriptor(key, desc)) {
RUN_INFINI(infiniopCreateTopkrouterDescriptor(
op_handle, &desc, x->desc(), correction_bias->desc()));
cache_manager->putTopkrouterDescriptor(key, desc);
}
size_t workspace_size = 0;
RUN_INFINI(infiniopGetTopkrouterWorkspaceSize(desc, &workspace_size));
ensure_workspace(workspace_size);
void *workspace = workspace_storage->memory();
RUN_INFINI(infiniopTopkrouter(desc, workspace, workspace_size,
values->data(), indices->data(), x->data(), correction_bias->data(),
routed_scaling_factor, topk, stream));
}
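`topkrouter` is the MoE gate for the DeepSeek path of this commit: for each token it selects `topk` experts and writes their routing weights (`values`, F32) and expert ids (`indices`, I32). The device kernel's exact scoring is not shown in this diff; the sketch below is a CPU reference under two assumptions borrowed from DeepSeek-style routers: `correction_bias` influences the ranking only, and the selected scores are normalized and then scaled by `routed_scaling_factor`.

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <numeric>
#include <vector>

// CPU sketch of one token's top-k routing (assumptions noted above).
void topk_route_token(const std::vector<float> &scores, // one row of x after activation
                      const std::vector<float> &bias,   // correction_bias
                      float routed_scaling_factor, size_t topk,
                      std::vector<float> &values, std::vector<int32_t> &indices) {
    std::vector<int32_t> order(scores.size());
    std::iota(order.begin(), order.end(), 0);
    // rank experts by biased score, but keep the unbiased score as the weight
    std::partial_sort(order.begin(), order.begin() + topk, order.end(),
                      [&](int32_t a, int32_t b) {
                          return scores[a] + bias[a] > scores[b] + bias[b];
                      });
    float sum = 0.f;
    for (size_t i = 0; i < topk; ++i) sum += scores[order[i]];
    for (size_t i = 0; i < topk; ++i) {
        indices.push_back(order[i]);
        values.push_back(scores[order[i]] / sum * routed_scaling_factor);
    }
}
```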
void InferenceContext::swiglu(std::shared_ptr<Tensor> out,
std::shared_ptr<Tensor> up,
std::shared_ptr<Tensor> gate) {
@@ -149,7 +200,7 @@ void InferenceContext::swiglu(std::shared_ptr<Tensor> out,
infiniopSwiGLUDescriptor_t desc;
if (!cache_manager->getSwiGLUDescriptor(key, desc)) {
RUN_INFINI(infiniopCreateSwiGLUDescriptor(
-rsrc->handle, &desc, out->desc(), up->desc(), gate->desc()));
+op_handle, &desc, out->desc(), up->desc(), gate->desc()));
cache_manager->putSwiGLUDescriptor(key, desc);
}
@@ -170,7 +221,7 @@ void InferenceContext::randomSample(std::shared_ptr<Tensor> out,
infiniopRandomSampleDescriptor_t desc;
if (!cache_manager->getRandomSampleDescriptor(key, desc)) {
RUN_INFINI(infiniopCreateRandomSampleDescriptor(
-rsrc->handle, &desc, out->desc(), prob->desc()));
+op_handle, &desc, out->desc(), prob->desc()));
cache_manager->putRandomSampleDescriptor(key, desc);
}
@@ -209,8 +260,8 @@ void InferenceContext::linear(std::shared_ptr<Tensor> c,
if (beta == 0.0) {
gemm(c, a, b, alpha, 1.0);
} else {
-auto c_copy = Tensor::buffer(c->dtype(), c->shape(), rsrc->memory_pool);
-c_copy->copyFrom(c, rsrc->handle, stream);
+auto c_copy = Tensor::buffer(c->dtype(), c->shape(), memory_pool);
+c_copy->copyFrom(c, op_handle, stream);
gemm(c, a, b, alpha, beta);
add(c, c, c_copy);
}
@@ -231,3 +282,26 @@ void InferenceContext::linear(std::shared_ptr<Tensor> c,
add(c, c, bias->view_as(c->shape(), strides));
}
}
void InferenceContext::dequant(std::shared_ptr<Tensor> weight,
std::shared_ptr<Tensor> in_w,
std::shared_ptr<Tensor> in_s,
std::shared_ptr<Tensor> in_z) {
size_t key = CacheManager::createDescriptorKey(weight, in_w, in_s, in_z);
infiniopDequantizeDescriptor_t desc;
if (!cache_manager->getDequantizeDescriptor(key, desc)) {
RUN_INFINI(infiniopCreateDequantizeDescriptor(op_handle, &desc, weight->desc(), in_w->desc(), in_s->desc(), in_z->desc()));
cache_manager->putDequantizeDescriptor(key, desc);
}
size_t workspace_size = 0;
RUN_INFINI(infiniopGetDequantizeWorkspaceSize(desc, &workspace_size));
ensure_workspace(workspace_size);
void *workspace = workspace_storage->memory();
RUN_INFINI(infiniopDequantize(
desc, workspace, workspace_size,
weight->data(), in_w->data(), in_s->data(), in_z->data(), 0, 0, 0, stream));
}
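`dequant` expands an AWQ int4 weight back to the activation dtype before a plain GEMM. Given the shapes registered in the AWQ weight loader below (`qweight` {in, out*nbit/32} as I32, `scales` {in/quant_group_size, out} as F16, `qzeros` packed like `qweight`), the per-element math is group-wise affine dequantization, w = (q - z) * s. A scalar reference, assuming nbit == 4 and a plain sequential nibble order (real AWQ tensors use an interleaved nibble order, which the kernel must undo):

```cpp
#include <cstddef>
#include <cstdint>

// Reference: dequantize element (row, col) of a logical [in_dim, out_dim] weight.
float dequant_one(const int32_t *qweight, const float *scales, const int32_t *qzeros,
                  size_t row, size_t col, size_t group_size, size_t out_dim) {
    size_t packed = out_dim / 8; // 8 four-bit values per int32
    uint32_t wq = (uint32_t)qweight[row * packed + col / 8];
    uint32_t zq = (uint32_t)qzeros[(row / group_size) * packed + col / 8];
    int q = (wq >> (4 * (col % 8))) & 0xF; // 4-bit quantized weight
    int z = (zq >> (4 * (col % 8))) & 0xF; // 4-bit zero point for this group
    float s = scales[(row / group_size) * out_dim + col];
    return float(q - z) * s;
}
```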
#pragma once
#include "cache_manager.hpp"
#include "jiuge/jiuge_impl.hpp"
#include "jiuge/jiuge_weight.hpp"
#include "../cache_manager/opcache_manager.hpp"
#include <cassert>
struct InferenceContext {
DeviceResource *rsrc;
infiniopHandle_t op_handle;
std::shared_ptr<MemoryPool> memory_pool;
CacheManager *cache_manager;
infinirtStream_t stream;
std::shared_ptr<Storage> workspace_storage;
size_t current_workspace_size = 0;
InferenceContext(DeviceResource *rsrc, CacheManager *cache_manager, infinirtStream_t stream);
InferenceContext(infiniopHandle_t op_handle, std::shared_ptr<MemoryPool> memory_pool, CacheManager *cache_manager, infinirtStream_t stream);
void ensure_workspace(size_t required_size);
@@ -34,8 +34,21 @@ struct InferenceContext {
std::shared_ptr<Tensor> pos,
std::shared_ptr<Tensor> sin,
std::shared_ptr<Tensor> cos);
void rope_v2(std::shared_ptr<Tensor> q,
std::shared_ptr<Tensor> k,
std::shared_ptr<Tensor> pos,
std::shared_ptr<Tensor> sin,
std::shared_ptr<Tensor> cos);
void causalSoftmax(std::shared_ptr<Tensor> y,
std::shared_ptr<Tensor> x);
void topkrouter(std::shared_ptr<Tensor> values, // F32
std::shared_ptr<Tensor> indices, // I32
std::shared_ptr<Tensor> x,
std::shared_ptr<Tensor> correction_bias, // F32
float routed_scaling_factor,
size_t topk);
void swiglu(std::shared_ptr<Tensor> out,
std::shared_ptr<Tensor> up,
std::shared_ptr<Tensor> gate);
@@ -49,6 +62,10 @@ struct InferenceContext {
float alpha, float beta,
std::shared_ptr<Tensor> residual,
std::shared_ptr<Tensor> bias);
void dequant(std::shared_ptr<Tensor> weight,
std::shared_ptr<Tensor> in_w,
std::shared_ptr<Tensor> in_s,
std::shared_ptr<Tensor> in_z);
};
namespace {
@@ -88,10 +105,31 @@ inline void rope(std::shared_ptr<Tensor> q, std::shared_ptr<Tensor> k,
getInferenceContext().rope(q, k, pos, sin, cos);
}
inline void rope_v2(std::shared_ptr<Tensor> q, std::shared_ptr<Tensor> k,
std::shared_ptr<Tensor> pos, std::shared_ptr<Tensor> sin,
std::shared_ptr<Tensor> cos) {
getInferenceContext().rope_v2(q, k, pos, sin, cos);
}
inline void causalSoftmax(std::shared_ptr<Tensor> y, std::shared_ptr<Tensor> x) {
getInferenceContext().causalSoftmax(y, x);
}
inline void topkrouter(std::shared_ptr<Tensor> values, // F32
std::shared_ptr<Tensor> indices, // I32
std::shared_ptr<Tensor> x,
std::shared_ptr<Tensor> correction_bias, // F32
float routed_scaling_factor,
size_t topk) {
getInferenceContext().topkrouter(values, indices, x, correction_bias,
routed_scaling_factor, topk);
inline void swiglu(std::shared_ptr<Tensor> out, std::shared_ptr<Tensor> up,
std::shared_ptr<Tensor> gate) {
getInferenceContext().swiglu(out, up, gate);
@@ -107,3 +145,11 @@ inline void linear(std::shared_ptr<Tensor> c, std::shared_ptr<Tensor> a,
std::shared_ptr<Tensor> residual, std::shared_ptr<Tensor> bias) {
getInferenceContext().linear(c, a, b, alpha, beta, residual, bias);
}
inline void dequant_linear(std::shared_ptr<Tensor> out, std::shared_ptr<Tensor> x,
std::shared_ptr<Tensor> w_w, std::shared_ptr<Tensor> w_s, std::shared_ptr<Tensor> w_z,
float alpha, float beta, std::shared_ptr<Tensor> residual, std::shared_ptr<Tensor> bias) {
auto w = Tensor::buffer(x->dtype(), {x->shape()[1], out->shape()[1]}, getInferenceContext().memory_pool);
getInferenceContext().dequant(w, w_w, w_s, w_z);
getInferenceContext().linear(out, x, w, alpha, beta, residual, bias);
}
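Note the trade-off in `dequant_linear`: the int4 weight is materialized as a full-precision {in, out} buffer from the pool on every call, which reuses the existing floating-point GEMM path at the cost of transient memory and dequantization bandwidth. A fused weight-int4 GEMM kernel would avoid the temporary entirely; here the buffer is returned to the pool when the call ends, so the peak overhead stays bounded by the largest single weight.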
@@ -10,7 +10,7 @@
#include <thread>
#include <vector>
-void createDeviceResource(DeviceResource *rsrc, const JiugeMeta *meta,
+void createDeviceResource(JiugeDeviceResource *rsrc, const JiugeMeta *meta,
const JiugeWeights *weights,
infiniDevice_t device, int idev,
int ndev, int dev_id,
@@ -44,7 +44,7 @@ void createDeviceResource(DeviceResource *rsrc, const JiugeMeta *meta,
auto memory_pool = std::make_shared<MemoryPool>(128 * 1024 * 1024);
-*rsrc = DeviceResource{
+*rsrc = JiugeDeviceResource{
device,
dev_id,
handle,
@@ -67,7 +67,7 @@ void createDeviceResource(DeviceResource *rsrc, const JiugeMeta *meta,
RUN_INFINI(infinirtDeviceSynchronize());
}
-void releaseDeviceResource(DeviceResource &res) {
+void releaseDeviceResource(JiugeDeviceResource &res) {
infinirtDeviceSynchronize();
// Release individual Tensors
res.w_in_embd.reset();
@@ -111,7 +111,7 @@ void releaseDeviceResource(DeviceResource &res) {
res.comm = nullptr;
}
-void inferDeviceBatch(const JiugeMeta &meta, DeviceResource &rsrc,
+void inferDeviceBatch(const JiugeMeta &meta, JiugeDeviceResource &rsrc,
uint32_t idev, uint32_t ndev,
const uint32_t *tokens, uint32_t ntok,
const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos,
@@ -298,7 +298,7 @@ void inferDeviceBatch(const JiugeMeta &meta, DeviceResource &rsrc,
}
__C void
-inferBatch(struct JiugeModel *model,
+inferBatchJiuge(struct JiugeModel *model,
const uint32_t *tokens, uint32_t ntok,
const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos,
struct KVCache **kv_caches,
@@ -331,7 +331,7 @@ inferBatch(struct JiugeModel *model,
}
__C void
-forwardBatch(struct JiugeModel *model,
+forwardBatchJiuge(struct JiugeModel *model,
const uint32_t *tokens, uint32_t ntok,
const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos,
struct KVCache **kv_caches,
@@ -362,16 +362,17 @@ forwardBatch(struct JiugeModel *model,
}
}
-void launchDevice(const JiugeMeta &meta, const JiugeWeights *weights, DeviceResource *rsrc, InferState &state, InferRequest &req,
+void launchDevice(const JiugeMeta &meta, const JiugeWeights *weights, JiugeDeviceResource *rsrc, InferState &state, InferRequest &req,
infiniDevice_t device, int idev, int ndev, int dev_id, infinicclComm_t comm) {
+// Create Device Resource
+createDeviceResource(rsrc, &meta, weights, device, idev, ndev, dev_id, comm);
CacheManager cache_manager(100);
-InferenceContext ctx(rsrc, &cache_manager, rsrc->stream);
+InferenceContext ctx(rsrc->handle, rsrc->memory_pool, &cache_manager, rsrc->stream);
// Set the inference context for this thread
setInferenceContext(&ctx);
-// Create Device Resource
-createDeviceResource(rsrc, &meta, weights, device, idev, ndev, dev_id, comm);
{
std::unique_lock<std::mutex> lock(state.mtx);
state.loaded = true;
@@ -406,7 +407,7 @@ JiugeModel::JiugeModel(const JiugeMeta *_meta, const JiugeWeights *weights, infi
int ndev = int(device_ids.size());
device = device_;
dev_ids = device_ids;
-dev_resources = std::vector<DeviceResource>(ndev);
+dev_resources = std::vector<JiugeDeviceResource>(ndev);
states = std::vector<InferState>(ndev);
threads.resize(ndev);
RUN_INFINI(infinirtInit());
@@ -12,7 +12,7 @@
#include <thread>
#include <vector>
-struct DeviceResource {
+struct JiugeDeviceResource {
// Device
infiniDevice_t device;
int device_id;
@@ -56,7 +56,7 @@ struct JiugeModel {
JiugeMeta meta;
infiniDevice_t device;
std::vector<int> dev_ids;
-std::vector<DeviceResource> dev_resources;
+std::vector<JiugeDeviceResource> dev_resources;
std::vector<InferState> states;
std::vector<std::thread> threads;
InferRequest req;
@@ -64,8 +64,6 @@ struct JiugeModel {
JiugeModel(const JiugeMeta *, const JiugeWeights *, infiniDevice_t device, std::vector<int> device_ids);
};
-struct KVCache {
-std::vector<std::vector<std::shared_ptr<Tensor>>> k, v;
-};
+#include "../../cache.hpp"
#endif
#include "jiuge_awq.hpp"
#include "../../tensor.hpp"
#include "../../utils.hpp"
#include "../inference_context.hpp"
#include <random>
#include <thread>
#include <vector>
void createDeviceResource(DeviceResource *rsrc, const JiugeAWQMeta *meta,
std::shared_ptr<JiugeAWQDeviceWeight> weights,
infiniDevice_t device, int idev,
int ndev, int dev_id,
infinicclComm_t comm) {
RUN_INFINI(infinirtSetDevice(device, dev_id));
infiniopHandle_t handle;
infiniopCreateHandle(&handle);
infinirtStream_t stream;
infinirtStreamCreate(&stream);
auto memory_pool = std::make_shared<MemoryPool>(128 * 1024 * 1024);
*rsrc = DeviceResource{
device,
dev_id,
handle,
weights,
stream,
comm,
memory_pool,
};
RUN_INFINI(infinirtDeviceSynchronize());
}
void releaseDeviceResource(DeviceResource &res) {
infinirtDeviceSynchronize();
// Release individual Tensors
infiniopDestroyHandle(res.handle);
res.handle = nullptr;
infinirtStreamDestroy(res.stream);
res.stream = nullptr;
infinicclCommDestroy(res.comm);
res.comm = nullptr;
}
void inferDeviceBatch(const JiugeAWQMeta *meta, DeviceResource &rsrc,
uint32_t idev, uint32_t ndev,
const uint32_t *tokens, uint32_t ntok,
const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos,
struct KVCache **kv_caches,
const float *temperature, const uint32_t *topk, const float *topp,
uint32_t *output, void *last_logits) {
auto nlayer = meta->nlayer;
auto nkvh = meta->nkvh / ndev;
auto nh = meta->nh / ndev;
auto ngroup = nh / nkvh;
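// Grouped-query attention: on this device every KV head serves ngroup query
// heads (nh and nkvh have already been divided by ndev above).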
// auto dctx = meta->dctx;
auto dh = meta->dh;
auto d = meta->d;
auto dt_logits = meta->dt_logits;
auto di = meta->di / ndev;
auto dvoc = meta->dvoc;
auto stream = rsrc.stream;
auto weight = rsrc.weights;
bool has_qkv_bias = meta->has_qkv_bias;
// Allocate buffers
auto logits_in = Tensor::buffer(dt_logits, {ntok, d}, rsrc.memory_pool);
auto logits_out = Tensor::buffer(dt_logits, {ntok, d}, rsrc.memory_pool);
auto q_buf = Tensor::buffer(dt_logits, {ntok, nh * dh}, rsrc.memory_pool);
auto k_buf = Tensor::buffer(dt_logits, {ntok, nkvh * dh}, rsrc.memory_pool);
auto v_buf = Tensor::buffer(dt_logits, {ntok, nkvh * dh}, rsrc.memory_pool);
auto gate_buf = Tensor::buffer(dt_logits, {ntok, di}, rsrc.memory_pool);
auto up_buf = Tensor::buffer(dt_logits, {ntok, di}, rsrc.memory_pool);
auto o_buf = Tensor::buffer(dt_logits, {ntok, nh * dh}, rsrc.memory_pool);
auto prob_buf = Tensor::buffer(dt_logits, {nreq, dvoc}, rsrc.memory_pool);
auto result_buf = Tensor::buffer(INFINI_DTYPE_I64, {nreq}, rsrc.memory_pool);
auto result_cpu = std::vector<int64_t>(nreq);
// Prepare inputs
auto batch_pos_ids = std::vector<uint32_t>(ntok);
size_t req_start = 0;
for (uint32_t req = 0; req < nreq; req++) {
for (uint32_t i = 0; i < req_lens[req]; i++) {
batch_pos_ids[req_start + i] = req_pos[req] + i;
}
req_start += req_lens[req];
}
std::shared_ptr<Tensor> pos_ids_buf;
if (rsrc.device == INFINI_DEVICE_CPU) {
pos_ids_buf = Tensor::weight(batch_pos_ids.data(), INFINI_DTYPE_U32, {ntok});
} else {
pos_ids_buf = Tensor::buffer(INFINI_DTYPE_U32, {ntok}, rsrc.memory_pool);
RUN_INFINI(infinirtMemcpyAsync(pos_ids_buf->data(), batch_pos_ids.data(), sizeof(uint32_t) * ntok,
INFINIRT_MEMCPY_H2D, stream));
}
for (uint32_t i = 0; i < ntok; i++) {
RUN_INFINI(infinirtMemcpyAsync(logits_in->data(i * d),
weight->w_in_embd->data(tokens[i] * d),
dsize(dt_logits) * d, INFINIRT_MEMCPY_D2D, stream));
}
// Attention
// attention inner
size_t max_qk_size = 0;
size_t max_seq_len = 0;
for (uint32_t req = 0; req < nreq; req++) {
auto past_len = req_pos[req];
auto seq_len = req_lens[req];
auto total_len = past_len + seq_len;
max_qk_size = std::max(max_qk_size, size_t(seq_len * total_len));
max_seq_len = std::max(max_seq_len, size_t(seq_len));
}
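// The attention scratch buffers below are sized once for the worst-case request
// in the batch (largest seq_len * total_len score matrix, largest seq_len), then
// re-sliced per request inside the layer loop instead of being reallocated.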
auto qk_buf = Tensor::buffer(dt_logits, {nh, max_qk_size}, rsrc.memory_pool);
auto rearrange_q_buf = Tensor::buffer(dt_logits, {nkvh, ngroup * max_seq_len, dh}, rsrc.memory_pool);
auto q_rearrange = rearrange_q_buf->view({nkvh, ngroup, max_seq_len, dh});
auto attn_val_buf = Tensor::buffer(dt_logits, {nkvh, ngroup * max_seq_len, dh}, rsrc.memory_pool);
auto attn_val_gemm = attn_val_buf->view({nkvh, ngroup, max_seq_len, dh});
// Compute
for (uint32_t layer = 0; layer < nlayer; layer++) {
// 1. Attention
// rms norm
rmsnorm(logits_out, logits_in, weight->w_attn_norm[layer], meta->epsilon);
// qkv_proj
dequant_linear(q_buf, logits_out,
weight->w_attn_q[layer]->w, weight->w_attn_q[layer]->s, weight->w_attn_q[layer]->z,
1.0, 0.0, nullptr, has_qkv_bias ? weight->b_attn_q[layer] : nullptr);
dequant_linear(k_buf, logits_out,
weight->w_attn_k[layer]->w, weight->w_attn_k[layer]->s, weight->w_attn_k[layer]->z,
1.0, 0.0, nullptr, has_qkv_bias ? weight->b_attn_k[layer] : nullptr);
dequant_linear(v_buf, logits_out,
weight->w_attn_v[layer]->w, weight->w_attn_v[layer]->s, weight->w_attn_v[layer]->z,
1.0, 0.0, nullptr, has_qkv_bias ? weight->b_attn_v[layer] : nullptr);
// rope
rope_v2(q_buf->view({ntok, nh, dh}), q_buf->view({ntok, nh, dh}), pos_ids_buf, weight->sin_table, weight->cos_table);
rope_v2(k_buf->view({ntok, nkvh, dh}), k_buf->view({ntok, nkvh, dh}), pos_ids_buf, weight->sin_table, weight->cos_table);
size_t token_offset = 0;
for (uint32_t req = 0; req < nreq; req++) {
auto past_len = req_pos[req];
auto seq_len = req_lens[req];
auto total_len = past_len + seq_len;
auto o = o_buf->slice({{0, token_offset, seq_len}})->view({seq_len, nkvh, ngroup, dh})->permute({1, 2, 0, 3});
auto q = q_buf->slice({{0, token_offset, seq_len}})->view({seq_len, nkvh, ngroup, dh})->permute({1, 2, 0, 3});
auto k = k_buf->slice({{0, token_offset, seq_len}})->view({seq_len, nkvh, dh});
auto v = v_buf->slice({{0, token_offset, seq_len}})->view({seq_len, nkvh, dh});
// self attention
// concat
rearrange(kv_caches[req]->k[idev][layer]->slice(0, past_len, seq_len), k);
rearrange(kv_caches[req]->v[idev][layer]->slice(0, past_len, seq_len), v);
// qk
rearrange(q_rearrange->slice(2, 0, seq_len), q);
auto qk_gemm = qk_buf->slice(1, 0, seq_len * total_len)->view({nkvh, ngroup * seq_len, total_len});
auto k_gemm = kv_caches[req]->k[idev][layer]->slice(0, 0, total_len)->permute({1, 2, 0});
linear(qk_gemm, rearrange_q_buf->slice(1, 0, ngroup * seq_len), k_gemm, 1.f / float(sqrt(dh)), 0.f, nullptr, nullptr);
// softmax
auto qk_softmax = qk_buf->slice(1, 0, seq_len * total_len)->view({nh, seq_len, total_len});
causalSoftmax(qk_softmax, qk_softmax);
auto v_gemm = kv_caches[req]->v[idev][layer]->slice(0, 0, total_len)->permute({1, 0, 2});
linear(attn_val_buf->slice(1, 0, ngroup * seq_len), qk_gemm, v_gemm, 1.f, 0.f, nullptr, nullptr);
// rearrange attn val
rearrange(o, attn_val_gemm->slice(2, 0, seq_len));
token_offset += seq_len;
}
// o_proj
dequant_linear(logits_in, o_buf, weight->w_attn_out[layer]->w, weight->w_attn_out[layer]->s, weight->w_attn_out[layer]->z,
1.0, 0.0, idev == 0 ? logits_in : nullptr, nullptr); // only rank 0 adds residual
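// Tensor parallelism: each rank computes a partial o_proj over its shard of
// the heads and the all-reduce below sums the partials; adding the residual
// on rank 0 only keeps it from being summed ndev times by the reduction.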
// All_reduce if distributed
if (rsrc.comm != nullptr) {
RUN_INFINI(infinicclAllReduce(
logits_in->data(), logits_in->data(), ntok * d, dt_logits,
INFINICCL_SUM, rsrc.comm, stream));
RUN_INFINI(infinirtStreamSynchronize(stream));
}
// 2. FFN
rmsnorm(logits_out, logits_in, weight->w_ffn_norm[layer], meta->epsilon);
dequant_linear(gate_buf, logits_out,
weight->w_ffn_gate[layer]->w, weight->w_ffn_gate[layer]->s, weight->w_ffn_gate[layer]->z,
1.0, 0.0, nullptr, nullptr);
dequant_linear(up_buf, logits_out,
weight->w_ffn_up[layer]->w, weight->w_ffn_up[layer]->s, weight->w_ffn_up[layer]->z,
1.0, 0.0, nullptr, nullptr);
swiglu(gate_buf, up_buf, gate_buf);
dequant_linear(logits_in, gate_buf,
weight->w_ffn_down[layer]->w, weight->w_ffn_down[layer]->s, weight->w_ffn_down[layer]->z,
1.0, 0.0, idev == 0 ? logits_in : nullptr, nullptr); // only rank 0 adds residual
// All_reduce if distributed
if (rsrc.comm != nullptr) {
RUN_INFINI(infinicclAllReduce(
logits_in->data(), logits_in->data(), ntok * d, dt_logits,
INFINICCL_SUM, rsrc.comm, stream));
RUN_INFINI(infinirtStreamSynchronize(stream));
}
}
// Sample and Output
if (idev == 0) {
if (last_logits != nullptr) {
rmsnorm(logits_out, logits_in, weight->w_out_norm, meta->epsilon);
auto last_logits_buf = Tensor::buffer(dt_logits, {ntok, dvoc}, rsrc.memory_pool);
linear(last_logits_buf, logits_out, weight->w_out_embd, 1.0, 0.0, nullptr, nullptr);
RUN_INFINI(infinirtStreamSynchronize(stream));
RUN_INFINI(infinirtMemcpy(last_logits, last_logits_buf->data(), dsize(dt_logits) * ntok * dvoc, INFINIRT_MEMCPY_D2H));
}
if (output != nullptr) {
size_t token_offset = 0;
for (uint32_t req = 0; req < nreq; req++) {
auto seq_len = req_lens[req];
token_offset += seq_len;
rmsnorm(logits_out->slice(0, req, 1),
logits_in->slice(0, token_offset - 1, 1),
weight->w_out_norm,
meta->epsilon);
}
linear(prob_buf, logits_out->slice(0, 0, nreq), weight->w_out_embd, 1.0, 0.0, nullptr, nullptr);
std::random_device _rd;
std::mt19937 gen(_rd());
token_offset = 0;
for (uint32_t req = 0; req < nreq; req++) {
auto seq_len = req_lens[req];
float random_val = std::uniform_real_distribution<float>(0, 1)(gen);
randomSample(result_buf->slice(0, req, 1)->view_as({}, {}),
prob_buf->slice(0, req, 1)->view_as({dvoc}, {1}),
random_val, topp[req], topk[req], temperature[req]);
token_offset += seq_len;
}
RUN_INFINI(infinirtStreamSynchronize(stream));
RUN_INFINI(infinirtMemcpy(result_cpu.data(), result_buf->data(),
sizeof(int64_t) * nreq, INFINIRT_MEMCPY_D2H));
for (uint32_t req = 0; req < nreq; req++) {
output[req] = uint32_t(result_cpu[req]);
}
}
}
}
__C void
inferBatchJiugeAWQ(struct JiugeAWQModel *model,
const uint32_t *tokens, uint32_t ntok,
const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos,
struct KVCache **kv_caches,
const float *temperature, const uint32_t *topk, const float *topp,
uint32_t *output) {
model->req.tokens = tokens;
model->req.ntok = ntok;
model->req.req_lens = req_lens;
model->req.nreq = nreq;
model->req.req_pos = req_pos;
model->req.kv_caches = kv_caches;
model->req.output = output;
model->req.logits = nullptr;
model->req.temperature = temperature;
model->req.topk = topk;
model->req.topp = topp;
for (size_t idev = 0; idev < model->dev_ids.size(); idev++) {
std::unique_lock<std::mutex> lock(model->states[idev].mtx);
model->states[idev].proceed = true;
lock.unlock();
model->states[idev].cv_start.notify_one();
}
for (size_t i = model->dev_ids.size(); i > 0; i--) {
auto idev = i - 1;
std::unique_lock<std::mutex> lock(model->states[idev].mtx);
model->states[idev].cv_done.wait(lock, [&] { return !(model->states[idev].proceed); });
lock.unlock();
}
}
__C void
forwardBatchJiugeAWQ(struct JiugeAWQModel *model,
const uint32_t *tokens, uint32_t ntok,
const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos,
struct KVCache **kv_caches,
void *logits) {
model->req.tokens = tokens;
model->req.ntok = ntok;
model->req.req_lens = req_lens;
model->req.nreq = nreq;
model->req.req_pos = req_pos;
model->req.kv_caches = kv_caches;
model->req.output = nullptr;
model->req.logits = logits;
model->req.temperature = nullptr;
model->req.topk = nullptr;
model->req.topp = nullptr;
for (size_t idev = 0; idev < model->dev_ids.size(); idev++) {
std::unique_lock<std::mutex> lock(model->states[idev].mtx);
model->states[idev].proceed = true;
lock.unlock();
model->states[idev].cv_start.notify_one();
}
for (size_t i = model->dev_ids.size(); i > 0; i--) {
auto idev = i - 1;
std::unique_lock<std::mutex> lock(model->states[idev].mtx);
model->states[idev].cv_done.wait(lock, [&] { return !(model->states[idev].proceed); });
lock.unlock();
}
}
void launchDevice(const JiugeAWQMeta *meta, std::shared_ptr<JiugeAWQDeviceWeight> weights, DeviceResource *rsrc, InferState &state, InferRequest &req,
infiniDevice_t device, int idev, int ndev, int dev_id, infinicclComm_t comm) {
// Create Device Resource
createDeviceResource(rsrc, meta, weights, device, idev, ndev, dev_id, comm);
CacheManager cache_manager(100);
InferenceContext ctx(rsrc->handle, rsrc->memory_pool, &cache_manager, rsrc->stream);
// Set the inference context for this thread
setInferenceContext(&ctx);
{
std::unique_lock<std::mutex> lock(state.mtx);
state.loaded = true;
lock.unlock();
state.cv_load.notify_one();
}
// Infer Loop
while (true) {
std::unique_lock<std::mutex> lock(state.mtx);
state.cv_start.wait(lock, [&] { return state.proceed || state.exit_flag; });
// quit if exit_flag is set
if (state.exit_flag) {
break;
}
inferDeviceBatch(meta, *rsrc, idev, ndev, req.tokens, req.ntok,
req.req_lens, req.nreq, req.req_pos, req.kv_caches,
req.temperature, req.topk, req.topp, req.output, req.logits);
state.proceed = false;
lock.unlock();
state.cv_done.notify_one();
}
// Clean-Up
releaseDeviceResource(*rsrc);
setInferenceContext(nullptr); // Clear the context when done
}
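`inferBatchJiugeAWQ`/`forwardBatchJiugeAWQ` and the per-device worker threads coordinate through a proceed/done handshake on `InferState`. A condensed view of the protocol (names as in the structs above; this is an illustration of the control flow, not new code):

```cpp
// Caller, per device:                  Worker loop in launchDevice:
//   lock(mtx); proceed = true;           cv_start.wait(lock, [&]{ return proceed || exit_flag; });
//   unlock; cv_start.notify_one();       if (exit_flag) break;   // shutdown path
//                                        inferDeviceBatch(...);  // run one batch
//   lock(mtx);                           proceed = false; unlock;
//   cv_done.wait(lock,                   cv_done.notify_one();
//       [&]{ return !proceed; });        // loop back to cv_start.wait
```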
JiugeAWQModel::JiugeAWQModel(const JiugeAWQMeta *meta, const ModelWeights *weights_) {
auto weights = (JiugeAWQWeights *)(weights_);
device = weights->device();
dev_ids = weights->dev_ids();
int ndev = int(dev_ids.size());
dev_resources = std::vector<DeviceResource>(ndev);
states = std::vector<InferState>(ndev);
threads.resize(ndev);
auto comms = std::vector<infinicclComm_t>(ndev, nullptr);
if (ndev > 1) {
RUN_INFINI(infinicclCommInitAll(device, comms.data(), ndev, dev_ids.data()));
}
for (int i = 0; i < ndev; i++) {
threads[i] = std::thread(launchDevice, meta, weights->device_weights()[i], &dev_resources[i], std::ref(states[i]), std::ref(req), device, i, ndev, dev_ids[i], comms[i]);
}
for (int i = 0; i < ndev; i++) {
std::unique_lock<std::mutex> lock(states[i].mtx);
states[i].cv_load.wait(lock, [&] { return states[i].loaded; });
lock.unlock();
}
}
__C struct JiugeAWQModel *
createJiugeAWQModel(const JiugeAWQMeta *meta,
const ModelWeights *weights) {
JiugeAWQModel *model = new JiugeAWQModel(meta, weights);
return model;
}
__C void destroyJiugeAWQModel(struct JiugeAWQModel *model) {
auto ndev = model->dev_resources.size();
for (size_t idev = 0; idev < ndev; idev++) {
std::unique_lock<std::mutex> lock(model->states[idev].mtx);
model->states[idev].exit_flag = true;
lock.unlock();
model->states[idev].cv_start.notify_one();
}
for (size_t idev = 0; idev < ndev; idev++) {
model->threads[idev].join();
}
delete model;
}
#pragma once
#include "infinicore_infer/models/jiuge_awq.h"
#include "../../cache.hpp"
#include "../../dataloader/weights_loader.hpp"
#include <condition_variable>
#include <mutex>
#include <thread>
struct QuantInt4Weight {
std::shared_ptr<Tensor> w, s, z;
};
struct JiugeAWQDeviceWeight {
std::shared_ptr<Tensor> w_in_embd, w_out_norm, w_out_embd, sin_table,
cos_table;
std::vector<std::shared_ptr<Tensor>> w_attn_norm, b_attn_q, b_attn_k, b_attn_v, w_ffn_norm;
std::vector<std::shared_ptr<QuantInt4Weight>> w_attn_q, w_attn_k, w_attn_v, w_attn_out, w_ffn_gate, w_ffn_up, w_ffn_down;
};
class JiugeAWQWeights : public infinicore::WeightsLoader {
private:
std::vector<std::shared_ptr<JiugeAWQDeviceWeight>> _device_weights;
public:
JiugeAWQWeights(const JiugeAWQMeta *meta,
infiniDevice_t device,
const std::vector<int> &dev_ids);
std::vector<std::shared_ptr<JiugeAWQDeviceWeight>> &device_weights() {
return _device_weights;
}
};
struct DeviceResource {
// Device
infiniDevice_t device;
int device_id;
infiniopHandle_t handle;
// Weights
std::shared_ptr<JiugeAWQDeviceWeight> weights;
// Streams
infinirtStream_t stream;
// Communicator
infinicclComm_t comm;
std::shared_ptr<MemoryPool> memory_pool;
};
struct InferRequest {
const uint32_t *tokens;
uint32_t ntok;
const uint32_t *req_lens;
uint32_t nreq;
const uint32_t *req_pos;
struct KVCache **kv_caches;
const float *temperature;
const uint32_t *topk;
const float *topp;
uint32_t *output;
void *logits;
};
struct InferState {
std::mutex mtx;
std::condition_variable cv_load, cv_start, cv_done;
bool loaded = false;
bool proceed = false;
bool exit_flag = false;
};
struct JiugeAWQModel {
JiugeAWQMeta meta;
infiniDevice_t device;
std::vector<int> dev_ids;
std::vector<DeviceResource> dev_resources;
std::vector<InferState> states;
std::vector<std::thread> threads;
InferRequest req;
JiugeAWQModel(const JiugeAWQMeta *, const ModelWeights *);
};
\ No newline at end of file
#include "jiuge_awq.hpp"
#include <cmath>
inline std::shared_ptr<Tensor> getSinTable(size_t dctx, size_t dh, float theta) {
auto half_dh = dh / 2;
auto unit = dsize(INFINI_DTYPE_F16);
void *table = std::malloc(dctx * half_dh * unit);
for (size_t i = 0; i < dctx; i++) {
for (size_t j = 0; j < half_dh; j++) {
float _sin = std::sin(
static_cast<float>(i) / std::pow(theta, static_cast<float>(j) / half_dh));
((uint16_t *)table)[i * half_dh + j] = f32_to_f16(_sin);
}
}
auto shape = std::vector<size_t>({dctx, half_dh});
auto tensor = Tensor::weight(table, INFINI_DTYPE_F16, shape);
std::free(table);
return tensor;
}
inline std::shared_ptr<Tensor> getCosTable(size_t dctx, size_t dh, float theta) {
auto half_dh = dh / 2;
auto unit = dsize(INFINI_DTYPE_F16);
void *table = std::malloc(dctx * half_dh * unit);
for (size_t i = 0; i < dctx; i++) {
for (size_t j = 0; j < half_dh; j++) {
float _cos = std::cos(
static_cast<float>(i) / std::pow(theta, static_cast<float>(j) / half_dh));
((uint16_t *)table)[i * half_dh + j] = f32_to_f16(_cos);
}
}
auto shape = std::vector<size_t>({dctx, half_dh});
auto tensor = Tensor::weight(table, INFINI_DTYPE_F16, shape);
std::free(table);
return tensor;
}
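Both tables store half a head dimension per position: entry (i, j) holds the sine or cosine of theta(i, j) = i / theta^(j / (dh/2)), in F16. A sketch of how a RoPE kernel would consume one row of these tables for a single head vector; the pairing of dimensions (split-half here) is an assumption, since the infiniop kernel may pair adjacent elements instead:

```cpp
#include <cstddef>

// Rotate one head vector q of size dh using precomputed rows of the tables.
// Assumes element j is paired with element j + dh/2 (split-half convention).
void apply_rope(float *q, const float *sin_row, const float *cos_row, size_t dh) {
    size_t half = dh / 2;
    for (size_t j = 0; j < half; ++j) {
        float x0 = q[j], x1 = q[j + half];
        q[j]        = x0 * cos_row[j] - x1 * sin_row[j];
        q[j + half] = x0 * sin_row[j] + x1 * cos_row[j];
    }
}
```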
JiugeAWQWeights::JiugeAWQWeights(
const JiugeAWQMeta *meta,
infiniDevice_t device,
const std::vector<int> &dev_ids) : infinicore::WeightsLoader(device, dev_ids) {
auto ndev = dev_ids.size();
_device_weights.resize(ndev);
infiniDtype_t dt_logits = meta->dt_logits;
infiniDtype_t dt_norm_w = meta->dt_norm_w;
size_t nlayer = meta->nlayer;
size_t d = meta->d;
size_t nh = meta->nh / ndev;
size_t nkvh = meta->nkvh / ndev;
size_t dh = meta->dh;
size_t di = meta->di / ndev;
size_t dctx = meta->dctx;
size_t dvoc = meta->dvoc;
size_t nbit = meta->nbit;
size_t quant_group_size = meta->quant_group_size;
for (size_t i = 0; i < ndev; i++) {
RUN_INFINI(infinirtSetDevice(device, dev_ids[i]));
auto weight = std::make_shared<JiugeAWQDeviceWeight>();
_device_weights[i] = weight;
auto w_in_embd = Tensor::weight(nullptr, dt_logits, {dvoc, d});
this->resigter("model.embed_tokens.weight", w_in_embd, i);
weight->w_in_embd = w_in_embd;
auto w_out_norm = Tensor::weight(nullptr, dt_norm_w, {d});
this->resigter("model.norm.weight", w_out_norm, i);
weight->w_out_norm = w_out_norm;
auto w_out_embd = Tensor::weight(nullptr, dt_logits, {dvoc, d})->permute({1, 0});
this->resigter("lm_head.weight", w_out_embd, i);
weight->w_out_embd = w_out_embd;
weight->sin_table = getSinTable(dctx, dh, meta->theta);
weight->cos_table = getCosTable(dctx, dh, meta->theta);
for (size_t layer = 0; layer < nlayer; layer++) {
#define REGISTER_LAYER_WEIGHT(W_NAME, W_VAR, W_SHAPE, W_DTYPE) \
auto W_VAR = Tensor::weight(nullptr, W_DTYPE, W_SHAPE); \
this->resigter(W_NAME, W_VAR, i); \
weight->W_VAR.push_back(W_VAR);
RIGISTER_LAYER_WEIGHT("model.layers." + std::to_string(layer) + ".input_layernorm.weight", w_attn_norm, {d}, dt_norm_w);
#define REGISTER_LAYER_QUANT_WEIGHT(W_NAME, W_VAR, W_IN, W_OUT) \
auto W_VAR = std::make_shared<QuantInt4Weight>(); \
W_VAR->w = Tensor::weight(nullptr, INFINI_DTYPE_I32, {W_IN, (W_OUT)*nbit / 32}); \
this->resigter(W_NAME + ".qweight", W_VAR->w, i); \
W_VAR->s = Tensor::weight(nullptr, INFINI_DTYPE_F16, {(W_IN) / quant_group_size, (W_OUT)}); \
this->resigter(W_NAME + ".scales", W_VAR->s, i); \
W_VAR->z = Tensor::weight(nullptr, INFINI_DTYPE_I32, {(W_IN) / quant_group_size, (W_OUT)*nbit / 32}); \
this->resigter(W_NAME + ".qzeros", W_VAR->z, i); \
weight->W_VAR.push_back(W_VAR);
REGISTER_LAYER_QUANT_WEIGHT("model.layers." + std::to_string(layer) + ".self_attn.q_proj", w_attn_q, d, nh * dh);
REGISTER_LAYER_QUANT_WEIGHT("model.layers." + std::to_string(layer) + ".self_attn.k_proj", w_attn_k, d, nkvh * dh);
REGISTER_LAYER_QUANT_WEIGHT("model.layers." + std::to_string(layer) + ".self_attn.v_proj", w_attn_v, d, nkvh * dh);
RIGISTER_LAYER_WEIGHT("model.layers." + std::to_string(layer) + ".self_attn.q_proj.bias", b_attn_q, {nh * dh}, INFINI_DTYPE_F16);
RIGISTER_LAYER_WEIGHT("model.layers." + std::to_string(layer) + ".self_attn.k_proj.bias", b_attn_k, {nkvh * dh}, INFINI_DTYPE_F16);
RIGISTER_LAYER_WEIGHT("model.layers." + std::to_string(layer) + ".self_attn.v_proj.bias", b_attn_v, {nkvh * dh}, INFINI_DTYPE_F16);
REGISTER_LAYER_QUANT_WEIGHT("model.layers." + std::to_string(layer) + ".self_attn.o_proj", w_attn_out, nh * dh, d);
RIGISTER_LAYER_WEIGHT("model.layers." + std::to_string(layer) + ".post_attention_layernorm.weight", w_ffn_norm, {d}, dt_norm_w);
REGISTER_LAYER_QUANT_WEIGHT("model.layers." + std::to_string(layer) + ".mlp.gate_proj", w_ffn_gate, d, di);
REGISTER_LAYER_QUANT_WEIGHT("model.layers." + std::to_string(layer) + ".mlp.up_proj", w_ffn_up, d, di);
REGISTER_LAYER_QUANT_WEIGHT("model.layers." + std::to_string(layer) + ".mlp.down_proj", w_ffn_down, di, d);
}
}
#undef REGISTER_LAYER_WEIGHT
#undef REGISTER_LAYER_QUANT_WEIGHT
}
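For orientation, the arithmetic behind REGISTER_LAYER_QUANT_WEIGHT: with nbit == 4, one I32 carries 32 / nbit == 8 quantized values, so a logical {in, out} projection is stored as qweight {in, out/8} I32, scales {in/quant_group_size, out} F16 (one scale per quantization group per output column), and qzeros {in/quant_group_size, out/8} I32 (zero points packed like the weights).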
__C struct ModelWeights *
createJiugeAWQWeights(const JiugeAWQMeta *meta,
infiniDevice_t device,
int ndev,
const int *dev_ids) {
JiugeAWQWeights *weights = new JiugeAWQWeights(meta, device, std::vector<int>(dev_ids, dev_ids + ndev));
return (struct ModelWeights *)weights;
}
@@ -2,7 +2,6 @@
#define INFER_TENSOR_H
#include "allocator.hpp"
#include "infinicore_infer.h"
#include "utils.hpp"
#include <memory>
#include <string>
@@ -101,6 +100,7 @@ public:
static std::shared_ptr<Tensor> weight(void *host_data,
infiniDtype_t dtype,
const std::vector<size_t> &shape);
void load(const void *host_data, infinirtStream_t stream = nullptr);
std::shared_ptr<Tensor> memShare(const std::vector<size_t> &shape,
infiniDtype_t dtype = INFINI_DTYPE_INVALID) const;
std::shared_ptr<Tensor> slice(size_t dim, size_t start, size_t len);
@@ -126,6 +126,7 @@ public:
ptrdiff_t dataOffset() const;
infiniDevice_t deviceType() const;
int deviceId() const;
size_t numel() const;
void debug(const std::string &filename) const;
void debug() const;
@@ -113,6 +113,10 @@ infiniDevice_t Tensor::deviceType() const { return this->_storage->deviceType();
int Tensor::deviceId() const { return this->_storage->deviceId(); }
Tensor::~Tensor() {}
size_t Tensor::numel() const {
return std::accumulate(this->shape().begin(), this->shape().end(), size_t(1), std::multiplies<size_t>());
}
ptrdiff_t Tensor::dataOffset() const {
return _offset;
}
@@ -154,16 +158,26 @@ std::shared_ptr<Tensor> Tensor::weight(void *data, infiniDtype_t dtype,
tensor->_storage = Storage::create(size);
tensor->_desc = TensorDesc::create(dtype, shape, strides);
if (data != nullptr) {
tensor->load(data);
}
tensor->_offset = 0;
return tensor;
}
void Tensor::load(const void *data, infinirtStream_t stream) {
if (stream) {
RUN_INFINI(infinirtMemcpyAsync(this->_storage->memory(), data, this->_storage->size(), INFINIRT_MEMCPY_H2D, stream));
return;
}
// NOTE: workaround for some platforms (MetaX) where concurrent memcpy of the same host data from multiple threads can hang
static std::mutex mutex;
{
std::lock_guard<std::mutex> lock(mutex);
-RUN_INFINI(infinirtMemcpy(tensor->_storage->memory(),
-data, size, INFINIRT_MEMCPY_H2D));
+RUN_INFINI(infinirtMemcpy(this->_storage->memory(),
+data, this->_storage->size(), INFINIRT_MEMCPY_H2D));
}
-tensor->_offset = 0;
-return tensor;
}
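A minimal usage sketch of the new `load` path: `Tensor::weight(nullptr, ...)` now only allocates, and host data can be uploaded later, synchronously or on a stream (`host_ptr` and the sizes are placeholders; in practice only one of the two calls would be used):

```cpp
// Illustrative only: allocate first, upload later.
void upload_weight(size_t rows, size_t cols, const void *host_ptr, infinirtStream_t stream) {
    auto t = Tensor::weight(nullptr, INFINI_DTYPE_F16, {rows, cols}); // no copy yet
    t->load(host_ptr);          // synchronous H2D copy, serialized by the mutex
    t->load(host_ptr, stream);  // asynchronous H2D copy on 'stream'; no lock taken
}
```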
std::shared_ptr<Tensor> Tensor::memShare(const std::vector<size_t> &shape, infiniDtype_t dtype_) const {
@@ -16,6 +16,8 @@ target("infinicore_infer")
add_files("src/models/*/*.cpp")
add_files("src/tensor/*.cpp")
add_files("src/allocator/*.cpp")
add_files("src/dataloader/*.cpp")
add_files("src/cache_manager/*.cpp")
add_includedirs("include")
set_installdir(INFINI_ROOT)