Commit f3d842a0 authored by chenht2022

support AMX

parent b90362b5
---
BasedOnStyle: LLVM
ColumnLimit: 120 # set the maximum line width to 120
IndentWidth: 2
---
...@@ -293,9 +293,10 @@ aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/cpu_backend SOURCE_DIR2)
aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/operators/llamafile SOURCE_DIR3)
aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/../../third_party/llamafile SOURCE_DIR4)
aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/operators/kvcache SOURCE_DIR5)
+aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/operators/amx SOURCE_DIR6)
-set(ALL_SOURCES ${SOURCE_DIR1} ${SOURCE_DIR2} ${SOURCE_DIR3} ${SOURCE_DIR4} ${SOURCE_DIR5})
+set(ALL_SOURCES ${SOURCE_DIR1} ${SOURCE_DIR2} ${SOURCE_DIR3} ${SOURCE_DIR4} ${SOURCE_DIR5} ${SOURCE_DIR6})
file(GLOB_RECURSE FMT_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/*.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/*.hpp" "${CMAKE_CURRENT_SOURCE_DIR}/*.h")
......
#!/usr/bin/env python
# coding=utf-8
'''
Description :
Author : chenht2022
Date : 2025-04-25 18:28:12
Version : 1.0.0
LastEditors : chenht2022
LastEditTime : 2025-04-25 18:28:12
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
import os, sys
import time
sys.path.append(os.path.dirname(__file__) + '/../build')
import cpuinfer_ext
import torch
expert_num = 8
hidden_size = 7168
intermediate_size = 2048
max_len = 25600
n_routed_experts = 8
layer_num = 10
qlen = 1024
CPUInfer = cpuinfer_ext.CPUInfer(65)
warm_up_iter = 100
test_iter = 100
def bench_moe(quant_mode: str):
    with torch.inference_mode(mode=True):
        if quant_mode == "bf16":
            bytes_per_elem = 2.000000
        elif quant_mode == "int8":
            bytes_per_elem = 1.000000
        else:
            assert(False)
        moes = []
        gate_projs = []
        up_projs = []
        down_projs = []
        for _ in range(layer_num):
            gate_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
            up_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
            down_proj = torch.randn((expert_num, hidden_size, intermediate_size), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
            config = cpuinfer_ext.moe.AMX_MOEConfig(expert_num, n_routed_experts, hidden_size, intermediate_size, max_len, gate_proj.data_ptr(), up_proj.data_ptr(), down_proj.data_ptr())
            if quant_mode == "bf16":
                moe = cpuinfer_ext.moe.AMXBF16_MOE(config)
                CPUInfer.submit(moe.load_weights())
                CPUInfer.sync()
            elif quant_mode == "int8":
                moe = cpuinfer_ext.moe.AMXInt8_MOE(config)
                CPUInfer.submit(moe.load_weights())
                CPUInfer.sync()
            gate_projs.append(gate_proj)
            up_projs.append(up_proj)
            down_projs.append(down_proj)
            moes.append(moe)
        expert_ids = torch.stack([torch.stack([torch.randperm(expert_num, dtype=torch.int64, device = "cuda")[:n_routed_experts] for _ in range(qlen)]) for _ in range(layer_num)]).to("cpu").contiguous()
        weights = torch.rand((layer_num, qlen, n_routed_experts), dtype=torch.float32, device = "cuda").to("cpu").contiguous()
        input = torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device = "cuda").to("cpu").contiguous()
        output = torch.empty((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device = "cuda").to("cpu").contiguous()
        qlen_tensor = torch.tensor([qlen], dtype=torch.int32)
        # warm up
        for i in range(warm_up_iter):
            CPUInfer.submit(
                moes[i % layer_num].forward(
                    qlen,
                    n_routed_experts,
                    expert_ids[i % layer_num].data_ptr(),
                    weights[i % layer_num].data_ptr(),
                    input[i % layer_num].data_ptr(),
                    output[i % layer_num].data_ptr(),
                    qlen_tensor.data_ptr()
                )
            )
            CPUInfer.sync()
        # test
        start = time.perf_counter()
        for i in range(test_iter):
            CPUInfer.submit(
                moes[i % layer_num].forward(
                    qlen,
                    n_routed_experts,
                    expert_ids[i % layer_num].data_ptr(),
                    weights[i % layer_num].data_ptr(),
                    input[i % layer_num].data_ptr(),
                    output[i % layer_num].data_ptr(),
                    qlen_tensor.data_ptr()
                )
            )
            CPUInfer.sync()
        end = time.perf_counter()
        total_time = end - start
        print('Quant mode: ', quant_mode)
        print('Time(s): ', total_time)
        print('Iteration: ', test_iter)
        print('Time(us) per iteration: ', total_time / test_iter * 1000000)
        print('Bandwidth: ', hidden_size * intermediate_size * 3 * n_routed_experts * bytes_per_elem * test_iter / total_time / 1000 / 1000 / 1000, 'GB/s')
        print('Flops: ', hidden_size * intermediate_size * qlen * 3 * n_routed_experts * 2 * test_iter / total_time / 1000 / 1000 / 1000, 'GFLOPS')
        print('')

bench_moe("bf16")
bench_moe("int8")
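For reference, the Bandwidth and Flops figures printed above count the expert weights streamed and the multiply-adds executed per iteration. The helper below is a small illustrative sketch of the same arithmetic (a hypothetical function, not part of the commit):

# Illustrative sketch of the metric formulas used by the prints above (hypothetical helper).
def moe_metrics(total_time, test_iter, qlen, hidden_size, intermediate_size,
                n_routed_experts, bytes_per_elem):
    # Each iteration streams the gate, up and down projections of the routed experts once:
    # three matrices of hidden_size x intermediate_size elements per expert.
    weight_bytes = 3 * hidden_size * intermediate_size * n_routed_experts * bytes_per_elem
    # Each weight element contributes one multiply-add (2 FLOPs) per token in the batch.
    flops = 2 * 3 * hidden_size * intermediate_size * n_routed_experts * qlen
    bandwidth_gb_s = weight_bytes * test_iter / total_time / 1e9
    gflops = flops * test_iter / total_time / 1e9
    return bandwidth_gb_s, gflops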
...@@ -30,7 +30,8 @@ void SharedMemBuffer::alloc(void* object, std::vector<std::pair<void**, uint64_t
  if (buffer_) {
    free(buffer_);
  }
-  buffer_ = malloc(size);
+  buffer_ = std::aligned_alloc(64, size);
  size_ = size;
  for (auto& obj_requests : hist_requests_) {
    for (auto& requests : obj_requests.second) {
...@@ -52,4 +53,4 @@ void SharedMemBuffer::arrange(std::vector<std::pair<void**, uint64_t>> requests)
    *(request.first) = (uint8_t*)buffer_ + offset;
    offset += request.second;
  }
}
\ No newline at end of file
/**
* @Description :
* @Author : chenht2022
* @Date : 2024-08-05 04:49:08
* @Version : 1.0.0
* @LastEditors : chenht2022
* @LastEditTime : 2024-08-05 06:36:41
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#ifndef CPUINFER_SHAREDMEMBUFFER_H
#define CPUINFER_SHAREDMEMBUFFER_H
#include <cstdint>
#include <cstdlib>
#include <map>
#include <vector>
class SharedMemBuffer {
public:
SharedMemBuffer();
~SharedMemBuffer();
void alloc(void* object, std::vector<std::pair<void**, uint64_t>> requests);
void dealloc(void* object);
private:
void* buffer_;
uint64_t size_;
std::map<void*, std::vector<std::vector<std::pair<void**, uint64_t>>>> hist_requests_;
void arrange(std::vector<std::pair<void**, uint64_t>> requests);
};
static SharedMemBuffer shared_mem_buffer;
#endif
\ No newline at end of file
...@@ -17,6 +17,7 @@
#include "operators/llamafile/linear.h"
#include "operators/llamafile/mlp.h"
#include "operators/llamafile/moe.h"
+#include "operators/amx/moe.hpp"
#include "pybind11/functional.h"
#include "pybind11/operators.h"
#include "pybind11/pybind11.h"
...@@ -563,6 +564,75 @@ class MOEBindings {
  };
};
template<class T>
class AMX_MOEBindings {
public:
class WarmUpBindings {
public:
struct Args {
CPUInfer *cpuinfer;
AMX_MOE<T> *moe;
};
static void inner(void *args) {
Args *args_ = (Args *)args;
args_->cpuinfer->enqueue(&AMX_MOE<T>::warm_up, args_->moe);
}
static std::pair<intptr_t, intptr_t> cpuinfer_interface(AMX_MOE<T> &moe) {
Args *args = new Args{nullptr, &moe};
return std::make_pair((intptr_t)&inner, (intptr_t)args);
}
};
class LoadWeightsBindings {
public:
struct Args {
CPUInfer *cpuinfer;
AMX_MOE<T> *moe;
};
static void inner(void *args) {
Args *args_ = (Args *)args;
args_->cpuinfer->enqueue(&AMX_MOE<T>::load_weights, args_->moe);
}
static std::pair<intptr_t, intptr_t> cpuinfer_interface(AMX_MOE<T> &moe) {
Args *args = new Args{nullptr, &moe};
return std::make_pair((intptr_t)&inner, (intptr_t)args);
}
};
class ForwardBindings {
public:
struct Args {
CPUInfer *cpuinfer;
AMX_MOE<T> *moe;
int qlen;
int k;
const uint64_t *expert_ids;
const float *weights;
const void *input;
void *output;
int *batch_size_tensor;
};
static void inner(void *args) {
Args *args_ = (Args *)args;
args_->cpuinfer->enqueue(
&AMX_MOE<T>::forward, args_->moe, args_->qlen, args_->k,
args_->expert_ids, args_->weights, args_->input, args_->output, args_->batch_size_tensor);
}
static std::pair<intptr_t, intptr_t>
cpuinfer_interface(AMX_MOE<T> &moe, int qlen, int k, intptr_t expert_ids,
intptr_t weights, intptr_t input, intptr_t output, intptr_t batch_size_tensor) {
Args *args = new Args{nullptr,
&moe,
qlen,
k,
(const uint64_t *)expert_ids,
(const float *)weights,
(const void *)input,
(void *)output,
(int *)batch_size_tensor};
return std::make_pair((intptr_t)&inner, (intptr_t)args);
}
};
};
PYBIND11_MODULE(cpuinfer_ext, m) {
  py::class_<CPUInfer>(m, "CPUInfer")
      .def(py::init<int>())
...@@ -621,6 +691,27 @@ PYBIND11_MODULE(cpuinfer_ext, m) {
      .def("warm_up", &MOEBindings::WarmUpBindinds::cpuinfer_interface)
      .def("forward", &MOEBindings::ForwardBindings::cpuinfer_interface);
py::class_<AMX_MOEConfig>(moe_module, "AMX_MOEConfig")
.def(py::init([](int expert_num, int routed_expert_num, int hidden_size,
int intermediate_size,
int max_len, intptr_t gate_proj,
intptr_t up_proj, intptr_t down_proj) {
return AMX_MOEConfig(expert_num, routed_expert_num, hidden_size,
intermediate_size,
max_len, (void *)gate_proj,
(void *)up_proj, (void *)down_proj);
}));
py::class_<AMX_MOE<amx::GemmKernel224BF>>(moe_module, "AMXBF16_MOE")
.def(py::init<AMX_MOEConfig>())
.def("warm_up", &AMX_MOEBindings<amx::GemmKernel224BF>::WarmUpBindings::cpuinfer_interface)
.def("load_weights", &AMX_MOEBindings<amx::GemmKernel224BF>::LoadWeightsBindings::cpuinfer_interface)
.def("forward", &AMX_MOEBindings<amx::GemmKernel224BF>::ForwardBindings::cpuinfer_interface);
py::class_<AMX_MOE<amx::GemmKernel224Int8>>(moe_module, "AMXInt8_MOE")
.def(py::init<AMX_MOEConfig>())
.def("warm_up", &AMX_MOEBindings<amx::GemmKernel224Int8>::WarmUpBindings::cpuinfer_interface)
.def("load_weights", &AMX_MOEBindings<amx::GemmKernel224Int8>::LoadWeightsBindings::cpuinfer_interface)
.def("forward", &AMX_MOEBindings<amx::GemmKernel224Int8>::ForwardBindings::cpuinfer_interface);
  auto kvcache_module = m.def_submodule("kvcache");
  py::enum_<AnchorType>(kvcache_module, "AnchorType")
......
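The new AMX classes follow the same submit/sync pattern as the existing llamafile MOE bindings. The sketch below mirrors the benchmark script earlier in this commit (shapes and thread count are taken from that script; the random weights are placeholders, not additional committed code):

import cpuinfer_ext
import torch

cpu_infer = cpuinfer_ext.CPUInfer(65)  # worker thread count as in the benchmark script
expert_num, n_routed_experts, hidden_size, intermediate_size, max_len, qlen = 8, 8, 7168, 2048, 25600, 1024

# load_weights reads these buffers as bf16, one [intermediate_size, hidden_size] matrix per expert.
gate = torch.randn(expert_num, intermediate_size, hidden_size, dtype=torch.bfloat16).contiguous()
up = torch.randn(expert_num, intermediate_size, hidden_size, dtype=torch.bfloat16).contiguous()
down = torch.randn(expert_num, hidden_size, intermediate_size, dtype=torch.bfloat16).contiguous()

config = cpuinfer_ext.moe.AMX_MOEConfig(expert_num, n_routed_experts, hidden_size, intermediate_size,
                                        max_len, gate.data_ptr(), up.data_ptr(), down.data_ptr())
moe = cpuinfer_ext.moe.AMXInt8_MOE(config)  # or AMXBF16_MOE for the bf16 kernel
cpu_infer.submit(moe.load_weights())        # packs the bf16 weights into the kernel's internal layout
cpu_infer.sync()

# One forward step: k distinct expert ids and router weights per token, CPU-contiguous tensors;
# the int32 batch-size tensor lets the caller shrink qlen at run time.
expert_ids = torch.stack([torch.randperm(expert_num, dtype=torch.int64)[:n_routed_experts] for _ in range(qlen)]).contiguous()
weights = torch.rand(qlen, n_routed_experts, dtype=torch.float32).contiguous()
x = torch.randn(qlen, hidden_size, dtype=torch.bfloat16).contiguous()
y = torch.empty(qlen, hidden_size, dtype=torch.bfloat16).contiguous()
qlen_tensor = torch.tensor([qlen], dtype=torch.int32)
cpu_infer.submit(moe.forward(qlen, n_routed_experts, expert_ids.data_ptr(), weights.data_ptr(),
                             x.data_ptr(), y.data_ptr(), qlen_tensor.data_ptr()))
cpu_infer.sync()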
/**
* @Description :
* @Author : chenht2022
* @Date : 2025-04-25 18:28:12
* @Version : 1.0.0
* @LastEditors : chenht2022
* @LastEditTime : 2025-04-25 18:28:12
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#pragma once
#include <cstdint>
template <typename T>
T* offset_pointer(T* ptr, std::size_t byte_offset) {
return reinterpret_cast<T*>(reinterpret_cast<char*>(ptr) + byte_offset);
}
template <typename T>
const T* offset_pointer(const T* ptr, std::size_t byte_offset) {
return reinterpret_cast<const T*>(reinterpret_cast<const char*>(ptr) + byte_offset);
}
template <typename T>
T* offset_pointer_row_major(T* t, int row, int col, std::size_t ld) {
return offset_pointer(t, row * ld) + col;
}
template <typename T>
T* offset_pointer_col_major(T* t, int row, int col, std::size_t ld) {
return offset_pointer(t, col * ld) + row;
}
static inline void avx512_copy_32xbf16(__m512i* src, __m512i* dst) {
_mm512_storeu_si512(dst, _mm512_loadu_si512(src));
}
static inline void avx512_32xfp32_to_32xbf16(__m512* src0, __m512* src1, __m512i* dst) {
_mm512_storeu_si512(dst, __m512i(_mm512_cvtne2ps_pbh(*src1, *src0)));
}
static inline void avx512_32xbf16_to_32xfp32(__m512i* src, __m512* dst0, __m512* dst1) {
_mm512_storeu_ps(dst0, _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i *)(src))), 16)));
_mm512_storeu_ps(dst1, _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i *)(src) + 1)), 16)));
}
\ No newline at end of file
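These helpers rely on bf16 being the upper 16 bits of an IEEE-754 fp32 value: widening is a 16-bit left shift into an fp32 lane, and narrowing (_mm512_cvtne2ps_pbh) rounds to nearest-even before dropping the low 16 bits. A small NumPy sketch of the same bit manipulation, for illustration only:

import numpy as np

def bf16_bits_to_fp32(bits_u16: np.ndarray) -> np.ndarray:
    # Same operation as avx512_32xbf16_to_32xfp32: move the bf16 bits into the upper
    # half of a 32-bit word and reinterpret the result as float32 (exact, no rounding).
    return (bits_u16.astype(np.uint32) << 16).view(np.float32)

def fp32_to_bf16_bits(x: np.ndarray) -> np.ndarray:
    # Truncating counterpart of avx512_32xfp32_to_32xbf16; the AVX-512 intrinsic
    # additionally rounds to nearest-even before discarding the low 16 bits.
    return (np.asarray(x, dtype=np.float32).view(np.uint32) >> 16).astype(np.uint16)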
/**
* @Description :
* @Author : chenht2022
* @Date : 2025-04-25 18:28:12
* @Version : 1.0.0
* @LastEditors : chenht2022
* @LastEditTime : 2025-04-25 18:28:12
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#ifndef CPUINFER_OPERATOR_AMX_MOE_H
#define CPUINFER_OPERATOR_AMX_MOE_H
#include <cmath>
#include <cstdio>
#include <functional>
#include <mutex>
#include <vector>
#include "../../cpu_backend/backend.h"
#include "../../cpu_backend/shared_mem_buffer.h"
#include "llama.cpp/ggml-impl.h"
#include "llama.cpp/ggml-quants.h"
#include "llama.cpp/ggml.h"
#include "llamafile/sgemm.h"
#include "la/amx.hpp"
#ifdef USE_NUMA
#include <numa.h>
#include <numaif.h>
void *numa_alloc_aligned(size_t size, int node, size_t alignment) {
void *ptr = numa_alloc_onnode(size, node);
assert(reinterpret_cast<intptr_t>(ptr) % 64 == 0);
return ptr;
}
#endif
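// exp_avx512: per-lane exp(x) computed as 2^i * 2^f, where y = x * log2(e), i = round(y)
// and f = y - i; 2^f = e^(f*ln2) is approximated by a degree-5 polynomial (the
// coefficients below are the Taylor terms of e^(f*ln2)), and the 2^i factor is applied
// with _mm512_scalef_ps.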
static inline __m512 exp_avx512(__m512 x) {
const __m512 log2e = _mm512_set1_ps(1.44269504089f);
const __m512 c1 = _mm512_set1_ps(0.69314718056f);
__m512 y = _mm512_mul_ps(x, log2e);
__m512i int_part = _mm512_cvtps_epi32(y);
__m512 frac_part = _mm512_sub_ps(y, _mm512_cvtepi32_ps(int_part));
const __m512 poly_1 = _mm512_set1_ps(0.9999999995f);
const __m512 poly_2 = _mm512_set1_ps(0.6931471805f);
const __m512 poly_3 = _mm512_set1_ps(0.2402265069f);
const __m512 poly_4 = _mm512_set1_ps(0.0555041087f);
const __m512 poly_5 = _mm512_set1_ps(0.0096181291f);
const __m512 poly_6 = _mm512_set1_ps(0.0013333558f);
__m512 frac_exp = _mm512_fmadd_ps(
frac_part, poly_6,
_mm512_fmadd_ps(frac_part, poly_5,
_mm512_fmadd_ps(frac_part, poly_4,
_mm512_fmadd_ps(frac_part, poly_3, _mm512_fmadd_ps(frac_part, poly_2, poly_1)))));
__m512 two_pow_i = _mm512_scalef_ps(_mm512_set1_ps(1.0f), _mm512_cvtepi32_ps(int_part));
return _mm512_mul_ps(two_pow_i, frac_exp);
}
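// act_fn: SiLU(gate) * up, i.e. gate / (1 + exp(-gate)) * up, evaluated with the exp
// approximation above; applied in forward() between the gate/up and down matmuls.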
static inline __m512 act_fn(__m512 gate_val, __m512 up_val) {
__m512 neg_gate_val = _mm512_sub_ps(_mm512_setzero_ps(), gate_val);
__m512 exp_neg_gate = exp_avx512(neg_gate_val);
__m512 denom = _mm512_add_ps(_mm512_set1_ps(1.0f), exp_neg_gate);
__m512 act_val = _mm512_div_ps(gate_val, denom);
return _mm512_mul_ps(act_val, up_val);
}
struct AMX_MOEConfig {
int expert_num;
int routed_expert_num;
int hidden_size;
int intermediate_size;
int max_len;
void *gate_proj;
void *up_proj;
void *down_proj;
AMX_MOEConfig() {}
AMX_MOEConfig(int expert_num, int routed_expert_num, int hidden_size, int intermediate_size, int max_len,
void *gate_proj, void *up_proj, void *down_proj)
: expert_num(expert_num), routed_expert_num(routed_expert_num), hidden_size(hidden_size),
intermediate_size(intermediate_size), max_len(max_len), gate_proj(gate_proj), up_proj(up_proj),
down_proj(down_proj) {}
};
template <class T> class AMX_MOE {
private:
AMX_MOEConfig config_;
void *gate_proj_; // [expert_num * intermediate_size * hidden_size ( /32 if quantized)]
void *up_proj_; // [expert_num * intermediate_size * hidden_size ( /32 if quantized)]
void *down_proj_; // [expert_num * hidden_size * intermediate_size ( /32 if quantized)]
ggml_bf16_t *m_local_input_; // [routed_expert_num * max_len * hidden_size]
ggml_bf16_t *m_local_gate_output_; // [routed_expert_num * max_len * intermediate_size]
ggml_bf16_t *m_local_up_output_; // [routed_expert_num * max_len * intermediate_size]
ggml_bf16_t *m_local_down_output_; // [routed_expert_num * max_len * hidden_size]
std::vector<std::vector<int>> m_local_pos_; // [max_len, routed_expert_num]
std::vector<int> m_local_num_; // [expert_num]
std::vector<int> m_expert_id_map_; // [expert_num]
std::vector<ggml_bf16_t *> m_local_input_ptr_; // [expert_num]
std::vector<ggml_bf16_t *> m_local_gate_output_ptr_; // [expert_num]
std::vector<ggml_bf16_t *> m_local_up_output_ptr_; // [expert_num]
std::vector<ggml_bf16_t *> m_local_down_output_ptr_; // [expert_num]
std::vector<std::shared_ptr<typename T::BufferA>> gate_up_ba_;
std::vector<std::shared_ptr<typename T::BufferC>> gate_bc_;
std::vector<std::shared_ptr<typename T::BufferC>> up_bc_;
std::vector<std::shared_ptr<typename T::BufferA>> down_ba_;
std::vector<std::shared_ptr<typename T::BufferC>> down_bc_;
#ifdef USE_NUMA
std::vector<std::vector<std::shared_ptr<typename T::BufferB>>> gate_bb_numa_;
std::vector<std::vector<std::shared_ptr<typename T::BufferB>>> up_bb_numa_;
std::vector<std::vector<std::shared_ptr<typename T::BufferB>>> down_bb_numa_;
#else
std::vector<std::shared_ptr<typename T::BufferB>> gate_bb_;
std::vector<std::shared_ptr<typename T::BufferB>> up_bb_;
std::vector<std::shared_ptr<typename T::BufferB>> down_bb_;
#endif
public:
AMX_MOE(AMX_MOEConfig config) {
config_ = config;
gate_proj_ = config_.gate_proj;
up_proj_ = config_.up_proj;
down_proj_ = config_.down_proj;
std::vector<std::pair<void **, uint64_t>> m_mem_requests;
m_mem_requests.push_back({(void **)&m_local_input_,
sizeof(ggml_bf16_t) * config_.routed_expert_num * config_.max_len * config_.hidden_size});
m_mem_requests.push_back({(void **)&m_local_gate_output_, sizeof(ggml_bf16_t) * config_.routed_expert_num *
config_.max_len * config_.intermediate_size});
m_mem_requests.push_back({(void **)&m_local_up_output_, sizeof(ggml_bf16_t) * config_.routed_expert_num *
config_.max_len * config_.intermediate_size});
m_mem_requests.push_back({(void **)&m_local_down_output_,
sizeof(ggml_bf16_t) * config_.routed_expert_num * config_.max_len * config_.hidden_size});
std::vector<void *> gate_up_ba_ptr(config_.expert_num);
std::vector<void *> gate_bc_ptr(config_.expert_num);
std::vector<void *> up_bc_ptr(config_.expert_num);
std::vector<void *> down_ba_ptr(config_.expert_num);
std::vector<void *> down_bc_ptr(config_.expert_num);
for (int i = 0; i < config_.expert_num; i++) {
m_mem_requests.push_back(
{(void **)&gate_up_ba_ptr[i], T::BufferA::required_size(config_.max_len, config_.hidden_size)});
m_mem_requests.push_back(
{(void **)&gate_bc_ptr[i], T::BufferC::required_size(config_.max_len, config_.intermediate_size)});
m_mem_requests.push_back(
{(void **)&up_bc_ptr[i], T::BufferC::required_size(config_.max_len, config_.intermediate_size)});
m_mem_requests.push_back(
{(void **)&down_ba_ptr[i], T::BufferA::required_size(config_.max_len, config_.intermediate_size)});
m_mem_requests.push_back(
{(void **)&down_bc_ptr[i], T::BufferC::required_size(config_.max_len, config_.hidden_size)});
}
shared_mem_buffer.alloc(this, m_mem_requests);
m_local_pos_.resize(config_.max_len);
for (int i = 0; i < config_.max_len; i++) {
m_local_pos_[i].resize(config_.routed_expert_num);
}
m_expert_id_map_.resize(config_.expert_num);
m_local_num_.resize(config_.expert_num);
m_local_input_ptr_.resize(config_.expert_num);
m_local_gate_output_ptr_.resize(config_.expert_num);
m_local_up_output_ptr_.resize(config_.expert_num);
m_local_down_output_ptr_.resize(config_.expert_num);
for (uint64_t i = 0; i < config_.expert_num; i++) {
gate_up_ba_.push_back(
std::make_shared<typename T::BufferA>(config_.max_len, config_.hidden_size, gate_up_ba_ptr[i]));
gate_bc_.push_back(
std::make_shared<typename T::BufferC>(config_.max_len, config_.intermediate_size, gate_bc_ptr[i]));
up_bc_.push_back(std::make_shared<typename T::BufferC>(config_.max_len, config_.intermediate_size, up_bc_ptr[i]));
down_ba_.push_back(
std::make_shared<typename T::BufferA>(config_.max_len, config_.intermediate_size, down_ba_ptr[i]));
down_bc_.push_back(std::make_shared<typename T::BufferC>(config_.max_len, config_.hidden_size, down_bc_ptr[i]));
#ifdef USE_NUMA
int numa_nodes = numa_num_configured_nodes();
gate_bb_numa_.resize(numa_nodes);
up_bb_numa_.resize(numa_nodes);
down_bb_numa_.resize(numa_nodes);
for (int j = 0; j < numa_nodes; j++) {
void *gate_bb_ptr =
numa_alloc_aligned(T::BufferB::required_size(config_.intermediate_size, config_.hidden_size), j, 64);
gate_bb_numa_[j].push_back(
std::make_shared<typename T::BufferB>(config_.intermediate_size, config_.hidden_size, gate_bb_ptr));
void *up_bb_ptr =
numa_alloc_aligned(T::BufferB::required_size(config_.intermediate_size, config_.hidden_size), j, 64);
up_bb_numa_[j].push_back(
std::make_shared<typename T::BufferB>(config_.intermediate_size, config_.hidden_size, up_bb_ptr));
void *down_bb_ptr =
numa_alloc_aligned(T::BufferB::required_size(config_.hidden_size, config_.intermediate_size), j, 64);
down_bb_numa_[j].push_back(
std::make_shared<typename T::BufferB>(config_.hidden_size, config_.intermediate_size, down_bb_ptr));
}
#else
void *gate_bb_ptr =
std::aligned_alloc(64, T::BufferB::required_size(config_.intermediate_size, config_.hidden_size));
gate_bb_.push_back(
std::make_shared<typename T::BufferB>(config_.intermediate_size, config_.hidden_size, gate_bb_ptr));
void *up_bb_ptr =
std::aligned_alloc(64, T::BufferB::required_size(config_.intermediate_size, config_.hidden_size));
up_bb_.push_back(
std::make_shared<typename T::BufferB>(config_.intermediate_size, config_.hidden_size, up_bb_ptr));
void *down_bb_ptr =
std::aligned_alloc(64, T::BufferB::required_size(config_.hidden_size, config_.intermediate_size));
down_bb_.push_back(
std::make_shared<typename T::BufferB>(config_.hidden_size, config_.intermediate_size, down_bb_ptr));
#endif
}
}
~AMX_MOE() { shared_mem_buffer.dealloc(this); }
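// load_weights: packs the bf16 gate/up/down projection matrices into the kernel's
// BufferB layout, one expert slice per work-stealing task; with USE_NUMA the packed
// weights are replicated on every NUMA node.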
void load_weights(Backend *backend) {
int nth = T::recommended_nth(config_.intermediate_size);
backend->do_work_stealing_job(
nth * config_.expert_num, nullptr,
[&](int task_id) {
uint64_t expert_idx = task_id / nth;
int ith = task_id % nth;
#ifdef USE_NUMA
int numa_nodes = numa_num_configured_nodes();
for (int j = 0; j < numa_nodes; j++) {
gate_bb_numa_[j][expert_idx]->from_mat((ggml_bf16_t *)config_.gate_proj +
expert_idx * config_.intermediate_size * config_.hidden_size,
ith, nth);
up_bb_numa_[j][expert_idx]->from_mat((ggml_bf16_t *)config_.up_proj +
expert_idx * config_.intermediate_size * config_.hidden_size,
ith, nth);
}
#else
gate_bb_[expert_idx]->from_mat((ggml_bf16_t *)config_.gate_proj +
expert_idx * config_.intermediate_size * config_.hidden_size,
ith, nth);
up_bb_[expert_idx]->from_mat(
(ggml_bf16_t *)config_.up_proj + expert_idx * config_.intermediate_size * config_.hidden_size, ith, nth);
#endif
},
nullptr);
nth = T::recommended_nth(config_.hidden_size);
backend->do_work_stealing_job(
nth * config_.expert_num, nullptr,
[&](int task_id) {
uint64_t expert_idx = task_id / nth;
int ith = task_id % nth;
#ifdef USE_NUMA
int numa_nodes = numa_num_configured_nodes();
for (int j = 0; j < numa_nodes; j++) {
down_bb_numa_[j][expert_idx]->from_mat((ggml_bf16_t *)config_.down_proj +
expert_idx * config_.hidden_size * config_.intermediate_size,
ith, nth);
}
#else
down_bb_[expert_idx]->from_mat((ggml_bf16_t *)config_.down_proj +
expert_idx * config_.hidden_size * config_.intermediate_size,
ith, nth);
#endif
},
nullptr);
}
void warm_up(Backend *backend) {}
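// forward: (1) read the live batch size from batch_size_tensor and count how many rows
// each expert receives (m_local_num_ / m_local_pos_); (2) scatter the bf16 input rows
// into per-expert contiguous buffers; (3) for each activated expert, pack A, run the
// gate and up matmuls via amx::mat_mul (use_amx selects the AMX tile path when the
// batch is large enough) and apply act_fn in place; (4) run the down matmul; (5) gather
// per token, accumulating the k expert outputs weighted by the router weights, and
// store the result as bf16.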
void forward(int qlen, int k, const uint64_t *expert_ids, const float *weights, const void *input, void *output,
int *batch_size_tensor, Backend *backend) {
bool use_amx = (qlen > 4 * config_.expert_num / config_.routed_expert_num);
qlen = batch_size_tensor[0];
int activated_expert = 0;
for (int i = 0; i < config_.expert_num; i++) {
m_local_num_[i] = 0;
}
for (int i = 0; i < qlen; i++) {
for (int j = 0; j < k; j++) {
m_local_pos_[i][j] = m_local_num_[expert_ids[i * k + j]]++;
}
}
for (int i = 0; i < config_.expert_num; i++) {
if (m_local_num_[i] > 0) {
m_expert_id_map_[activated_expert] = i;
activated_expert++;
}
}
uint64_t offset = 0;
for (int i = 0; i < config_.expert_num; i++) {
m_local_input_ptr_[i] = m_local_input_ + offset * config_.hidden_size;
m_local_gate_output_ptr_[i] = m_local_gate_output_ + offset * config_.intermediate_size;
m_local_up_output_ptr_[i] = m_local_up_output_ + offset * config_.intermediate_size;
m_local_down_output_ptr_[i] = m_local_down_output_ + offset * config_.hidden_size;
offset += m_local_num_[i];
}
backend->do_work_stealing_job(
qlen, nullptr,
[&](int i) {
for (int j = 0; j < k; j++) {
memcpy(m_local_input_ptr_[expert_ids[i * k + j]] + m_local_pos_[i][j] * config_.hidden_size,
(ggml_bf16_t *)input + i * config_.hidden_size, sizeof(ggml_bf16_t) * config_.hidden_size);
}
},
nullptr);
backend->do_work_stealing_job(
activated_expert, nullptr,
[&](int task_id) {
int expert_idx = m_expert_id_map_[task_id];
gate_up_ba_[expert_idx]->from_mat(m_local_num_[expert_idx], m_local_input_ptr_[expert_idx], 0, 1);
},
nullptr);
int nth = T::recommended_nth(config_.intermediate_size);
backend->do_work_stealing_job(
nth * activated_expert, [&](int _) { T::config(); },
[&](int task_id) {
int expert_idx = m_expert_id_map_[task_id / nth];
int ith = task_id % nth;
#ifdef USE_NUMA
amx::mat_mul(m_local_num_[expert_idx], config_.intermediate_size, config_.hidden_size,
gate_up_ba_[expert_idx], gate_bb_numa_[Backend::numa_node][expert_idx], gate_bc_[expert_idx],
ith, nth, use_amx);
amx::mat_mul(m_local_num_[expert_idx], config_.intermediate_size, config_.hidden_size,
gate_up_ba_[expert_idx], up_bb_numa_[Backend::numa_node][expert_idx], up_bc_[expert_idx], ith,
nth, use_amx);
#else
amx::mat_mul(m_local_num_[expert_idx], config_.intermediate_size, config_.hidden_size,
gate_up_ba_[expert_idx], gate_bb_[expert_idx], gate_bc_[expert_idx], ith, nth, use_amx);
amx::mat_mul(m_local_num_[expert_idx], config_.intermediate_size, config_.hidden_size,
gate_up_ba_[expert_idx], up_bb_[expert_idx], up_bc_[expert_idx], ith, nth, use_amx);
#endif
gate_bc_[expert_idx]->to_mat(m_local_num_[expert_idx], m_local_gate_output_ptr_[expert_idx], ith, nth);
up_bc_[expert_idx]->to_mat(m_local_num_[expert_idx], m_local_up_output_ptr_[expert_idx], ith, nth);
auto [n_start, n_end] = T::split_range_n(config_.intermediate_size, ith, nth);
for (int i = 0; i < m_local_num_[expert_idx]; i++) {
ggml_bf16_t *gate_output_ptr = &m_local_gate_output_ptr_[expert_idx][i * config_.intermediate_size];
ggml_bf16_t *up_output_ptr = &m_local_up_output_ptr_[expert_idx][i * config_.intermediate_size];
for (int j = n_start; j < n_end; j += 32) {
__m512 gate_val0, gate_val1, up_val0, up_val1;
avx512_32xbf16_to_32xfp32((__m512i *)(gate_output_ptr + j), &gate_val0, &gate_val1);
avx512_32xbf16_to_32xfp32((__m512i *)(up_output_ptr + j), &up_val0, &up_val1);
__m512 result0 = act_fn(gate_val0, up_val0);
__m512 result1 = act_fn(gate_val1, up_val1);
avx512_32xfp32_to_32xbf16(&result0, &result1, (__m512i *)(gate_output_ptr + j));
}
}
},
nullptr);
backend->do_work_stealing_job(
activated_expert, nullptr,
[&](int task_id) {
int expert_idx = m_expert_id_map_[task_id];
down_ba_[expert_idx]->from_mat(m_local_num_[expert_idx], m_local_gate_output_ptr_[expert_idx], 0, 1);
},
nullptr);
nth = T::recommended_nth(config_.hidden_size);
backend->do_work_stealing_job(
nth * activated_expert, [&](int _) { T::config(); },
[&](int task_id) {
int expert_idx = m_expert_id_map_[task_id / nth];
int ith = task_id % nth;
#ifdef USE_NUMA
amx::mat_mul(m_local_num_[expert_idx], config_.hidden_size, config_.intermediate_size, down_ba_[expert_idx],
down_bb_numa_[Backend::numa_node][expert_idx], down_bc_[expert_idx], ith, nth, use_amx);
#else
amx::mat_mul(m_local_num_[expert_idx], config_.hidden_size, config_.intermediate_size, down_ba_[expert_idx],
down_bb_[expert_idx], down_bc_[expert_idx], ith, nth, use_amx);
#endif
down_bc_[expert_idx]->to_mat(m_local_num_[expert_idx], m_local_down_output_ptr_[expert_idx], ith, nth);
},
nullptr);
backend->do_work_stealing_job(
qlen, nullptr,
[&](int i) {
for (int e = 0; e < config_.hidden_size; e += 32) {
__m512 x0 = _mm512_setzero_ps();
__m512 x1 = _mm512_setzero_ps();
for (int j = 0; j < k; j++) {
__m512 weight = _mm512_set1_ps(weights[i * k + j]);
__m512 down_output0, down_output1;
avx512_32xbf16_to_32xfp32((__m512i *)(m_local_down_output_ptr_[expert_ids[i * k + j]] +
m_local_pos_[i][j] * config_.hidden_size + e),
&down_output0, &down_output1);
x0 = _mm512_fmadd_ps(down_output0, weight, x0);
x1 = _mm512_fmadd_ps(down_output1, weight, x1);
}
avx512_32xfp32_to_32xbf16(&x0, &x1, (__m512i *)((ggml_bf16_t *)output + i * config_.hidden_size + e));
}
},
nullptr);
}
};
#endif
\ No newline at end of file
...@@ -17,12 +17,12 @@
#include <vector>

#include "../../cpu_backend/backend.h"
+#include "../../cpu_backend/shared_mem_buffer.h"
#include "conversion.h"
#include "llama.cpp/ggml-impl.h"
#include "llama.cpp/ggml-quants.h"
#include "llama.cpp/ggml.h"
#include "llamafile/sgemm.h"
-#include "shared_mem_buffer.h"

struct LinearConfig {
  int input_size;
......
...@@ -17,12 +17,12 @@
#include <vector>

#include "../../cpu_backend/backend.h"
+#include "../../cpu_backend/shared_mem_buffer.h"
#include "conversion.h"
#include "llama.cpp/ggml-impl.h"
#include "llama.cpp/ggml-quants.h"
#include "llama.cpp/ggml.h"
#include "llamafile/sgemm.h"
-#include "shared_mem_buffer.h"

struct MLPConfig {
  int hidden_size;
......
...@@ -17,12 +17,12 @@
#include <vector>

#include "../../cpu_backend/backend.h"
+#include "../../cpu_backend/shared_mem_buffer.h"
#include "conversion.h"
#include "llama.cpp/ggml-impl.h"
#include "llama.cpp/ggml-quants.h"
#include "llama.cpp/ggml.h"
#include "llamafile/sgemm.h"
-#include "shared_mem_buffer.h"

struct MOEConfig {
  int expert_num;
......
/**
* @Description :
* @Author : chenht2022
* @Date : 2024-08-05 04:49:08
* @Version : 1.0.0
* @LastEditors : chenht2022
* @LastEditTime : 2024-08-05 06:36:41
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#ifndef CPUINFER_SHAREDMEMBUFFER_H
#define CPUINFER_SHAREDMEMBUFFER_H
#include <cstdint>
#include <cstdlib>
#include <map>
#include <vector>
class SharedMemBuffer {
public:
SharedMemBuffer();
~SharedMemBuffer();
void alloc(void* object, std::vector<std::pair<void**, uint64_t>> requests);
void dealloc(void* object);
private:
void* buffer_;
uint64_t size_;
std::map<void*, std::vector<std::vector<std::pair<void**, uint64_t>>>> hist_requests_;
void arrange(std::vector<std::pair<void**, uint64_t>> requests);
};
static SharedMemBuffer shared_mem_buffer;
#endif
\ No newline at end of file
...@@ -25,8 +25,9 @@ sys.path.append(os.path.join(os.path.dirname(__file__), "..", "ktransformers_ext
sys.path.append(os.path.join(os.path.dirname(__file__), "..", "ktransformers_ext", "build", "Debug"))
import cpuinfer_ext
from cpuinfer_ext.moe import MOEConfig, MOE
+from cpuinfer_ext.moe import AMX_MOEConfig, AMXBF16_MOE, AMXInt8_MOE
import ctypes
-from ktransformers.util.custom_gguf import GGUFLoader
+from ktransformers.util.custom_gguf import GGMLQuantizationType, GGUFLoader
from ktransformers.util.utils import InferenceState
from ktransformers.server.config.config import Config
from transformers.activations import ACT2FN
...@@ -141,6 +142,7 @@ class KExpertsCPU(KExpertsBase):
        assert device.lower() == "cpu", "KExpertsCPU can only be loaded on CPU"
        self.n_routed_experts = n_routed_experts
        self.out_device = out_device
+        self.backend = kwargs.get("backend", "llamafile")

    def load(self, w: dict | nn.Parameter | tuple | None = None, device:str|None = None, warmup:bool = False):
        if device:
...@@ -163,27 +165,62 @@
            )
            # print(self.gate_qtype, self.up_qtype, self.down_qtype)
            n_routed_experts = self.n_routed_experts
+            self.cpu_infer = KExpertsCPU.CPU_INFER
            # n_routed_experts = len(self.orig_module)
-            moe_config = MOEConfig(
-                n_routed_experts,
-                self.config.num_experts_per_tok,
-                self.config.hidden_size,
-                self.config.moe_intermediate_size,
-                64,
-                10,
-                1024,
-                gate_ptr,
-                up_ptr,
-                down_ptr,
-                self.gate_type,
-                self.up_type,
-                self.down_type,
-                30, # TODO: get from model.dtype
-            )
+            if self.backend == "llamafile":
+                moe_config = MOEConfig(
+                    n_routed_experts,
+                    self.config.num_experts_per_tok,
+                    self.config.hidden_size,
+                    self.config.moe_intermediate_size,
+                    64,
+                    10,
+                    1024,
+                    gate_ptr,
+                    up_ptr,
+                    down_ptr,
+                    self.gate_type,
+                    self.up_type,
+                    self.down_type,
+                    30, # TODO: get from model.dtype
+                )
+                self.moe = MOE(moe_config)
+            elif self.backend == "AMXBF16":
+                assert self.gate_type == GGMLQuantizationType.BF16
+                assert self.up_type == GGMLQuantizationType.BF16
+                assert self.down_type == GGMLQuantizationType.BF16
+                moe_config = AMX_MOEConfig(
+                    n_routed_experts,
+                    self.config.num_experts_per_tok,
+                    self.config.hidden_size,
+                    self.config.moe_intermediate_size,
+                    25600,
+                    gate_ptr,
+                    up_ptr,
+                    down_ptr,
+                )
+                self.moe = AMXBF16_MOE(moe_config)
+                self.cpu_infer.submit(self.moe.load_weights())
+                self.cpu_infer.sync()
+            elif self.backend == "AMXInt8":
+                assert self.gate_type == GGMLQuantizationType.BF16
+                assert self.up_type == GGMLQuantizationType.BF16
+                assert self.down_type == GGMLQuantizationType.BF16
+                moe_config = AMX_MOEConfig(
+                    n_routed_experts,
+                    self.config.num_experts_per_tok,
+                    self.config.hidden_size,
+                    self.config.moe_intermediate_size,
+                    25600,
+                    gate_ptr,
+                    up_ptr,
+                    down_ptr,
+                )
+                self.moe = AMXInt8_MOE(moe_config)
+                self.cpu_infer.submit(self.moe.load_weights())
+                self.cpu_infer.sync()
            # print(n_routed_experts, hidden_size, moe_intermediate_size)
            num_experts_per_tok = self.config.num_experts_per_tok
-            self.moe = MOE(moe_config)
-            self.cpu_infer = KExpertsCPU.CPU_INFER
            if warmup:
                self.cpu_infer.submit(self.moe.warm_up())
                self.cpu_infer.sync()
......
- match:
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^lm_head$"  # regular expression
    class: torch.nn.Linear  # only match modules that match both name and class
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression
    class: torch.nn.Linear  # only match modules that match both name and class
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE  # MLP module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts  # custom MoE kernel with expert parallelism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda"
      backend: "AMXInt8"  # or "AMXBF16" or "llamafile" (default)
  recursive: False  # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention  # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      absorb_for_prefill: False  # change to True to enable long-context prefill (prefill may be slower)
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0  # 0 disables layer-wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"
\ No newline at end of file