OpenDAS / ktransformers

Commit f3d842a0, authored Apr 25, 2025 by chenht2022
Parent: b90362b5

support AMX
Showing 15 changed files with 1799 additions and 62 deletions
.clang-format                                                     +5   -0
csrc/ktransformers_ext/CMakeLists.txt                             +2   -1
csrc/ktransformers_ext/bench/bench_moe_amx.py                     +107 -0
csrc/ktransformers_ext/cpu_backend/shared_mem_buffer.cpp          +3   -2
csrc/ktransformers_ext/cpu_backend/shared_mem_buffer.h            +37  -0
csrc/ktransformers_ext/ext_bindings.cpp                           +91  -0
csrc/ktransformers_ext/operators/amx/la/amx.hpp                   +974 -0
csrc/ktransformers_ext/operators/amx/la/utils.hpp                 +46  -0
csrc/ktransformers_ext/operators/amx/moe.hpp                      +398 -0
csrc/ktransformers_ext/operators/llamafile/linear.h               +1   -1
csrc/ktransformers_ext/operators/llamafile/mlp.h                  +1   -1
csrc/ktransformers_ext/operators/llamafile/moe.h                  +1   -1
csrc/ktransformers_ext/operators/llamafile/shared_mem_buffer.h    +0   -37
ktransformers/operators/experts.py                                +56  -19
ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-amx.yaml   +77  -0
.clang-format (new file, mode 100644)

---
BasedOnStyle: LLVM
ColumnLimit: 120  # set the maximum line width to 120
IndentWidth: 2
---

csrc/ktransformers_ext/CMakeLists.txt

@@ -293,9 +293,10 @@ aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/cpu_backend SOURCE_DIR2)
 aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/operators/llamafile SOURCE_DIR3)
 aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/../../third_party/llamafile SOURCE_DIR4)
 aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/operators/kvcache SOURCE_DIR5)
+aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/operators/amx SOURCE_DIR6)
-set(ALL_SOURCES ${SOURCE_DIR1} ${SOURCE_DIR2} ${SOURCE_DIR3} ${SOURCE_DIR4} ${SOURCE_DIR5})
+set(ALL_SOURCES ${SOURCE_DIR1} ${SOURCE_DIR2} ${SOURCE_DIR3} ${SOURCE_DIR4} ${SOURCE_DIR5} ${SOURCE_DIR6})
 file(GLOB_RECURSE FMT_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/*.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/*.hpp" "${CMAKE_CURRENT_SOURCE_DIR}/*.h")

csrc/ktransformers_ext/bench/bench_moe_amx.py (new file, mode 100644)

#!/usr/bin/env python
# coding=utf-8
'''
Description  :
Author       : chenht2022
Date         : 2025-04-25 18:28:12
Version      : 1.0.0
LastEditors  : chenht2022
LastEditTime : 2025-04-25 18:28:12
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
import os, sys
import time
sys.path.append(os.path.dirname(__file__) + '/../build')
import cpuinfer_ext
import torch

expert_num = 8
hidden_size = 7168
intermediate_size = 2048
max_len = 25600
n_routed_experts = 8
layer_num = 10
qlen = 1024
CPUInfer = cpuinfer_ext.CPUInfer(65)
warm_up_iter = 100
test_iter = 100

def bench_moe(quant_mode: str):
    with torch.inference_mode(mode=True):
        if quant_mode == "bf16":
            bytes_per_elem = 2.000000
        elif quant_mode == "int8":
            bytes_per_elem = 1.000000
        else:
            assert(False)

        moes = []
        gate_projs = []
        up_projs = []
        down_projs = []
        for _ in range(layer_num):
            gate_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device="cuda").to("cpu").contiguous()
            up_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device="cuda").to("cpu").contiguous()
            down_proj = torch.randn((expert_num, hidden_size, intermediate_size), dtype=torch.float32, device="cuda").to("cpu").contiguous()
            config = cpuinfer_ext.moe.AMX_MOEConfig(expert_num, n_routed_experts, hidden_size, intermediate_size, max_len,
                                                    gate_proj.data_ptr(), up_proj.data_ptr(), down_proj.data_ptr())
            if quant_mode == "bf16":
                moe = cpuinfer_ext.moe.AMXBF16_MOE(config)
                CPUInfer.submit(moe.load_weights())
                CPUInfer.sync()
            elif quant_mode == "int8":
                moe = cpuinfer_ext.moe.AMXInt8_MOE(config)
                CPUInfer.submit(moe.load_weights())
                CPUInfer.sync()
            gate_projs.append(gate_proj)
            up_projs.append(up_proj)
            down_projs.append(down_proj)
            moes.append(moe)
        expert_ids = torch.stack([torch.stack([torch.randperm(expert_num, dtype=torch.int64, device="cuda")[:n_routed_experts] for _ in range(qlen)]) for _ in range(layer_num)]).to("cpu").contiguous()
        weights = torch.rand((layer_num, qlen, n_routed_experts), dtype=torch.float32, device="cuda").to("cpu").contiguous()
        input = torch.randn((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous()
        output = torch.empty((layer_num, qlen, hidden_size), dtype=torch.bfloat16, device="cuda").to("cpu").contiguous()
        qlen_tensor = torch.tensor([qlen], dtype=torch.int32)

        # warm up
        for i in range(warm_up_iter):
            CPUInfer.submit(
                moes[i % layer_num].forward(
                    qlen,
                    n_routed_experts,
                    expert_ids[i % layer_num].data_ptr(),
                    weights[i % layer_num].data_ptr(),
                    input[i % layer_num].data_ptr(),
                    output[i % layer_num].data_ptr(),
                    qlen_tensor.data_ptr()
                )
            )
            CPUInfer.sync()

        # test
        start = time.perf_counter()
        for i in range(test_iter):
            CPUInfer.submit(
                moes[i % layer_num].forward(
                    qlen,
                    n_routed_experts,
                    expert_ids[i % layer_num].data_ptr(),
                    weights[i % layer_num].data_ptr(),
                    input[i % layer_num].data_ptr(),
                    output[i % layer_num].data_ptr(),
                    qlen_tensor.data_ptr()
                )
            )
            CPUInfer.sync()
        end = time.perf_counter()
        total_time = end - start
        print('Quant mode: ', quant_mode)
        print('Time(s): ', total_time)
        print('Iteration: ', test_iter)
        print('Time(us) per iteration: ', total_time / test_iter * 1000000)
        print('Bandwidth: ', hidden_size * intermediate_size * 3 * n_routed_experts * bytes_per_elem * test_iter / total_time / 1000 / 1000 / 1000, 'GB/s')
        print('Flops: ', hidden_size * intermediate_size * qlen * 3 * n_routed_experts * 2 * test_iter / total_time / 1000 / 1000 / 1000, 'GFLOPS')
        print('')

bench_moe("bf16")
bench_moe("int8")
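
Usage note: the script resolves cpuinfer_ext from csrc/ktransformers_ext/build (it appends ../build relative to its own location to sys.path), so assuming the extension has been built there it can be run directly as python csrc/ktransformers_ext/bench/bench_moe_amx.py. It allocates its random test tensors on a CUDA device before moving them to the CPU, then reports time per iteration, effective weight bandwidth in GB/s, and GFLOPS for the bf16 and then the int8 AMX MoE kernel.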

csrc/ktransformers_ext/operators/llamafile/shared_mem_buffer.cpp → csrc/ktransformers_ext/cpu_backend/shared_mem_buffer.cpp (renamed)

@@ -30,7 +30,8 @@ void SharedMemBuffer::alloc(void* object, std::vector<std::pair<void**, uint64_t
     if (buffer_) {
         free(buffer_);
     }
-    buffer_ = malloc(size);
+    buffer_ = std::aligned_alloc(64, size);
     size_ = size;
     for (auto& obj_requests : hist_requests_) {
         for (auto& requests : obj_requests.second) {

csrc/ktransformers_ext/cpu_backend/shared_mem_buffer.h (new file, mode 100644)

/**
 * @Description  :
 * @Author       : chenht2022
 * @Date         : 2024-08-05 04:49:08
 * @Version      : 1.0.0
 * @LastEditors  : chenht2022
 * @LastEditTime : 2024-08-05 06:36:41
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#ifndef CPUINFER_SHAREDMEMBUFFER_H
#define CPUINFER_SHAREDMEMBUFFER_H

#include <cstdint>
#include <cstdlib>
#include <map>
#include <vector>

class SharedMemBuffer {
 public:
  SharedMemBuffer();
  ~SharedMemBuffer();

  void alloc(void* object, std::vector<std::pair<void**, uint64_t>> requests);
  void dealloc(void* object);

 private:
  void* buffer_;
  uint64_t size_;
  std::map<void*, std::vector<std::vector<std::pair<void**, uint64_t>>>> hist_requests_;

  void arrange(std::vector<std::pair<void**, uint64_t>> requests);
};

static SharedMemBuffer shared_mem_buffer;

#endif
\ No newline at end of file

csrc/ktransformers_ext/ext_bindings.cpp

@@ -17,6 +17,7 @@
 #include "operators/llamafile/linear.h"
 #include "operators/llamafile/mlp.h"
 #include "operators/llamafile/moe.h"
+#include "operators/amx/moe.hpp"
 #include "pybind11/functional.h"
 #include "pybind11/operators.h"
 #include "pybind11/pybind11.h"

@@ -563,6 +564,75 @@ class MOEBindings {
   };
 };
+
+template <class T>
+class AMX_MOEBindings {
+ public:
+  class WarmUpBindings {
+   public:
+    struct Args {
+      CPUInfer* cpuinfer;
+      AMX_MOE<T>* moe;
+    };
+    static void inner(void* args) {
+      Args* args_ = (Args*)args;
+      args_->cpuinfer->enqueue(&AMX_MOE<T>::warm_up, args_->moe);
+    }
+    static std::pair<intptr_t, intptr_t> cpuinfer_interface(AMX_MOE<T>& moe) {
+      Args* args = new Args{nullptr, &moe};
+      return std::make_pair((intptr_t)&inner, (intptr_t)args);
+    }
+  };
+  class LoadWeightsBindings {
+   public:
+    struct Args {
+      CPUInfer* cpuinfer;
+      AMX_MOE<T>* moe;
+    };
+    static void inner(void* args) {
+      Args* args_ = (Args*)args;
+      args_->cpuinfer->enqueue(&AMX_MOE<T>::load_weights, args_->moe);
+    }
+    static std::pair<intptr_t, intptr_t> cpuinfer_interface(AMX_MOE<T>& moe) {
+      Args* args = new Args{nullptr, &moe};
+      return std::make_pair((intptr_t)&inner, (intptr_t)args);
+    }
+  };
+  class ForwardBindings {
+   public:
+    struct Args {
+      CPUInfer* cpuinfer;
+      AMX_MOE<T>* moe;
+      int qlen;
+      int k;
+      const uint64_t* expert_ids;
+      const float* weights;
+      const void* input;
+      void* output;
+      int* batch_size_tensor;
+    };
+    static void inner(void* args) {
+      Args* args_ = (Args*)args;
+      args_->cpuinfer->enqueue(&AMX_MOE<T>::forward, args_->moe, args_->qlen, args_->k, args_->expert_ids,
+                               args_->weights, args_->input, args_->output, args_->batch_size_tensor);
+    }
+    static std::pair<intptr_t, intptr_t> cpuinfer_interface(AMX_MOE<T>& moe, int qlen, int k, intptr_t expert_ids,
+                                                            intptr_t weights, intptr_t input, intptr_t output,
+                                                            intptr_t batch_size_tensor) {
+      Args* args = new Args{nullptr,
+                            &moe,
+                            qlen,
+                            k,
+                            (const uint64_t*)expert_ids,
+                            (const float*)weights,
+                            (const void*)input,
+                            (void*)output,
+                            (int*)batch_size_tensor};
+      return std::make_pair((intptr_t)&inner, (intptr_t)args);
+    }
+  };
+};

 PYBIND11_MODULE(cpuinfer_ext, m) {
   py::class_<CPUInfer>(m, "CPUInfer")
       .def(py::init<int>())

@@ -621,6 +691,27 @@ PYBIND11_MODULE(cpuinfer_ext, m) {
       .def("warm_up", &MOEBindings::WarmUpBindinds::cpuinfer_interface)
       .def("forward", &MOEBindings::ForwardBindings::cpuinfer_interface);
+  py::class_<AMX_MOEConfig>(moe_module, "AMX_MOEConfig")
+      .def(py::init([](int expert_num, int routed_expert_num, int hidden_size, int intermediate_size, int max_len,
+                       intptr_t gate_proj, intptr_t up_proj, intptr_t down_proj) {
+        return AMX_MOEConfig(expert_num, routed_expert_num, hidden_size, intermediate_size, max_len,
+                             (void*)gate_proj, (void*)up_proj, (void*)down_proj);
+      }));
+  py::class_<AMX_MOE<amx::GemmKernel224BF>>(moe_module, "AMXBF16_MOE")
+      .def(py::init<AMX_MOEConfig>())
+      .def("warm_up", &AMX_MOEBindings<amx::GemmKernel224BF>::WarmUpBindings::cpuinfer_interface)
+      .def("load_weights", &AMX_MOEBindings<amx::GemmKernel224BF>::LoadWeightsBindings::cpuinfer_interface)
+      .def("forward", &AMX_MOEBindings<amx::GemmKernel224BF>::ForwardBindings::cpuinfer_interface);
+  py::class_<AMX_MOE<amx::GemmKernel224Int8>>(moe_module, "AMXInt8_MOE")
+      .def(py::init<AMX_MOEConfig>())
+      .def("warm_up", &AMX_MOEBindings<amx::GemmKernel224Int8>::WarmUpBindings::cpuinfer_interface)
+      .def("load_weights", &AMX_MOEBindings<amx::GemmKernel224Int8>::LoadWeightsBindings::cpuinfer_interface)
+      .def("forward", &AMX_MOEBindings<amx::GemmKernel224Int8>::ForwardBindings::cpuinfer_interface);
   auto kvcache_module = m.def_submodule("kvcache");
   py::enum_<AnchorType>(kvcache_module, "AnchorType")
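
A note on the binding pattern above: the cpuinfer_interface methods do not run any work themselves. Each call packs its arguments into a heap-allocated Args struct and returns the pair ((intptr_t)&inner, (intptr_t)args); CPUInfer.submit later invokes inner, which enqueues the real AMX_MOE method on the worker pool (the cpuinfer field is left as nullptr here and is presumably filled in by CPUInfer when the task is submitted). From Python this is the same submit/sync protocol the benchmark above already uses. A minimal sketch of that protocol, assuming cpuinfer_ext has been built and reusing the benchmark's tensor shapes:

import cpuinfer_ext
import torch

# Dummy expert weights (bf16, CPU, contiguous) with the benchmark's shapes:
# gate/up: [expert_num, intermediate_size, hidden_size], down: [expert_num, hidden_size, intermediate_size]
expert_num, n_routed_experts, hidden_size, intermediate_size, max_len = 8, 8, 7168, 2048, 25600
gate = torch.randn(expert_num, intermediate_size, hidden_size, dtype=torch.bfloat16).contiguous()
up = torch.randn(expert_num, intermediate_size, hidden_size, dtype=torch.bfloat16).contiguous()
down = torch.randn(expert_num, hidden_size, intermediate_size, dtype=torch.bfloat16).contiguous()

cfg = cpuinfer_ext.moe.AMX_MOEConfig(expert_num, n_routed_experts, hidden_size, intermediate_size, max_len,
                                     gate.data_ptr(), up.data_ptr(), down.data_ptr())
cpu_infer = cpuinfer_ext.CPUInfer(65)           # CPU worker pool, thread count as in the benchmark
moe = cpuinfer_ext.moe.AMXBF16_MOE(cfg)

task = moe.load_weights()   # returns (function pointer, Args*) as two ints; nothing has run yet
cpu_infer.submit(task)      # hand the task to the CPU worker pool
cpu_infer.sync()            # block until the expert weights have been packed into BufferB form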

csrc/ktransformers_ext/operators/amx/la/amx.hpp (new file, mode 100644)

(Diff collapsed in the web view: 974 added lines, containing the amx:: GEMM kernels (GemmKernel224BF, GemmKernel224Int8), their BufferA/B/C packing types, and amx::mat_mul referenced by moe.hpp below; not reproduced here.)

csrc/ktransformers_ext/operators/amx/la/utils.hpp (new file, mode 100644)

/**
 * @Description  :
 * @Author       : chenht2022
 * @Date         : 2025-04-25 18:28:12
 * @Version      : 1.0.0
 * @LastEditors  : chenht2022
 * @LastEditTime : 2025-04-25 18:28:12
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#pragma once

#include <cstdint>

template <typename T>
T* offset_pointer(T* ptr, std::size_t byte_offset) {
  return reinterpret_cast<T*>(reinterpret_cast<char*>(ptr) + byte_offset);
}

template <typename T>
const T* offset_pointer(const T* ptr, std::size_t byte_offset) {
  return reinterpret_cast<const T*>(reinterpret_cast<const char*>(ptr) + byte_offset);
}

template <typename T>
T* offset_pointer_row_major(T* t, int row, int col, std::size_t ld) {
  return offset_pointer(t, row * ld) + col;
}

template <typename T>
T* offset_pointer_col_major(T* t, int row, int col, std::size_t ld) {
  return offset_pointer(t, col * ld) + row;
}

static inline void avx512_copy_32xbf16(__m512i* src, __m512i* dst) {
  _mm512_storeu_si512(dst, _mm512_loadu_si512(src));
}

static inline void avx512_32xfp32_to_32xbf16(__m512* src0, __m512* src1, __m512i* dst) {
  _mm512_storeu_si512(dst, __m512i(_mm512_cvtne2ps_pbh(*src1, *src0)));
}

static inline void avx512_32xbf16_to_32xfp32(__m512i* src, __m512* dst0, __m512* dst1) {
  _mm512_storeu_ps(dst0, _mm512_castsi512_ps(_mm512_slli_epi32(
                             _mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i*)(src))), 16)));
  _mm512_storeu_ps(dst1, _mm512_castsi512_ps(_mm512_slli_epi32(
                             _mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i*)(src) + 1)), 16)));
}
\ No newline at end of file
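
The bf16 helpers above rest on the fact that bfloat16 is the top 16 bits of an IEEE-754 float32: widening each bf16 lane to 32 bits and shifting left by 16 (the _mm512_slli_epi32(..., 16) in avx512_32xbf16_to_32xfp32) reproduces the float exactly, while _mm512_cvtne2ps_pbh rounds fp32 back down to bf16. A scalar NumPy illustration of the widening direction (illustration only, not part of the commit):

import numpy as np

def bf16_bits_to_fp32(bf16_bits: np.ndarray) -> np.ndarray:
    # Place the 16 bf16 bits into the top half of a 32-bit word, as the
    # _mm512_slli_epi32(..., 16) in avx512_32xbf16_to_32xfp32 does.
    return (bf16_bits.astype(np.uint32) << 16).view(np.float32)

x = np.float32(3.14159)
bf16 = np.uint16(np.frombuffer(x.tobytes(), dtype=np.uint32)[0] >> 16)  # truncate fp32 to its bf16 bits
print(bf16_bits_to_fp32(np.array([bf16])))  # ~3.140625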

csrc/ktransformers_ext/operators/amx/moe.hpp (new file, mode 100644)

/**
 * @Description  :
 * @Author       : chenht2022
 * @Date         : 2025-04-25 18:28:12
 * @Version      : 1.0.0
 * @LastEditors  : chenht2022
 * @LastEditTime : 2025-04-25 18:28:12
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#ifndef CPUINFER_OPERATOR_AMX_MOE_H
#define CPUINFER_OPERATOR_AMX_MOE_H

#include <cmath>
#include <cstdio>
#include <functional>
#include <mutex>
#include <vector>

#include "../../cpu_backend/backend.h"
#include "../../cpu_backend/shared_mem_buffer.h"
#include "llama.cpp/ggml-impl.h"
#include "llama.cpp/ggml-quants.h"
#include "llama.cpp/ggml.h"
#include "llamafile/sgemm.h"
#include "la/amx.hpp"

#ifdef USE_NUMA
#include <numa.h>
#include <numaif.h>

void* numa_alloc_aligned(size_t size, int node, size_t alignment) {
  void* ptr = numa_alloc_onnode(size, node);
  assert(reinterpret_cast<intptr_t>(ptr) % 64 == 0);
  return ptr;
}
#endif

static inline __m512 exp_avx512(__m512 x) {
  const __m512 log2e = _mm512_set1_ps(1.44269504089f);
  const __m512 c1 = _mm512_set1_ps(0.69314718056f);
  __m512 y = _mm512_mul_ps(x, log2e);
  __m512i int_part = _mm512_cvtps_epi32(y);
  __m512 frac_part = _mm512_sub_ps(y, _mm512_cvtepi32_ps(int_part));
  const __m512 poly_1 = _mm512_set1_ps(0.9999999995f);
  const __m512 poly_2 = _mm512_set1_ps(0.6931471805f);
  const __m512 poly_3 = _mm512_set1_ps(0.2402265069f);
  const __m512 poly_4 = _mm512_set1_ps(0.0555041087f);
  const __m512 poly_5 = _mm512_set1_ps(0.0096181291f);
  const __m512 poly_6 = _mm512_set1_ps(0.0013333558f);
  __m512 frac_exp = _mm512_fmadd_ps(
      frac_part, poly_6,
      _mm512_fmadd_ps(frac_part, poly_5,
                      _mm512_fmadd_ps(frac_part, poly_4,
                                      _mm512_fmadd_ps(frac_part, poly_3,
                                                      _mm512_fmadd_ps(frac_part, poly_2, poly_1)))));
  __m512 two_pow_i = _mm512_scalef_ps(_mm512_set1_ps(1.0f), _mm512_cvtepi32_ps(int_part));
  return _mm512_mul_ps(two_pow_i, frac_exp);
}

static inline __m512 act_fn(__m512 gate_val, __m512 up_val) {
  __m512 neg_gate_val = _mm512_sub_ps(_mm512_setzero_ps(), gate_val);
  __m512 exp_neg_gate = exp_avx512(neg_gate_val);
  __m512 denom = _mm512_add_ps(_mm512_set1_ps(1.0f), exp_neg_gate);
  __m512 act_val = _mm512_div_ps(gate_val, denom);
  return _mm512_mul_ps(act_val, up_val);
}

struct AMX_MOEConfig {
  int expert_num;
  int routed_expert_num;
  int hidden_size;
  int intermediate_size;
  int max_len;
  void* gate_proj;
  void* up_proj;
  void* down_proj;

  AMX_MOEConfig() {}

  AMX_MOEConfig(int expert_num, int routed_expert_num, int hidden_size, int intermediate_size, int max_len,
                void* gate_proj, void* up_proj, void* down_proj)
      : expert_num(expert_num),
        routed_expert_num(routed_expert_num),
        hidden_size(hidden_size),
        intermediate_size(intermediate_size),
        max_len(max_len),
        gate_proj(gate_proj),
        up_proj(up_proj),
        down_proj(down_proj) {}
};

template <class T>
class AMX_MOE {
 private:
  AMX_MOEConfig config_;
  void* gate_proj_;  // [expert_num * intermediate_size * hidden_size ( /32 if quantized)]
  void* up_proj_;    // [expert_num * intermediate_size * hidden_size ( /32 if quantized)]
  void* down_proj_;  // [expert_num * hidden_size * intermediate_size ( /32 if quantized)]

  ggml_bf16_t* m_local_input_;        // [routed_expert_num * max_len * hidden_size]
  ggml_bf16_t* m_local_gate_output_;  // [routed_expert_num * max_len * intermediate_size]
  ggml_bf16_t* m_local_up_output_;    // [routed_expert_num * max_len * intermediate_size]
  ggml_bf16_t* m_local_down_output_;  // [routed_expert_num * max_len * hidden_size]

  std::vector<std::vector<int>> m_local_pos_;          // [max_len, routed_expert_num]
  std::vector<int> m_local_num_;                       // [expert_num]
  std::vector<int> m_expert_id_map_;                   // [expert_num]
  std::vector<ggml_bf16_t*> m_local_input_ptr_;        // [expert_num]
  std::vector<ggml_bf16_t*> m_local_gate_output_ptr_;  // [expert_num]
  std::vector<ggml_bf16_t*> m_local_up_output_ptr_;    // [expert_num]
  std::vector<ggml_bf16_t*> m_local_down_output_ptr_;  // [expert_num]

  std::vector<std::shared_ptr<typename T::BufferA>> gate_up_ba_;
  std::vector<std::shared_ptr<typename T::BufferC>> gate_bc_;
  std::vector<std::shared_ptr<typename T::BufferC>> up_bc_;
  std::vector<std::shared_ptr<typename T::BufferA>> down_ba_;
  std::vector<std::shared_ptr<typename T::BufferC>> down_bc_;

#ifdef USE_NUMA
  std::vector<std::vector<std::shared_ptr<typename T::BufferB>>> gate_bb_numa_;
  std::vector<std::vector<std::shared_ptr<typename T::BufferB>>> up_bb_numa_;
  std::vector<std::vector<std::shared_ptr<typename T::BufferB>>> down_bb_numa_;
#else
  std::vector<std::shared_ptr<typename T::BufferB>> gate_bb_;
  std::vector<std::shared_ptr<typename T::BufferB>> up_bb_;
  std::vector<std::shared_ptr<typename T::BufferB>> down_bb_;
#endif

 public:
  AMX_MOE(AMX_MOEConfig config) {
    config_ = config;
    gate_proj_ = config_.gate_proj;
    up_proj_ = config_.up_proj;
    down_proj_ = config_.down_proj;

    std::vector<std::pair<void**, uint64_t>> m_mem_requests;
    m_mem_requests.push_back({(void**)&m_local_input_,
                              sizeof(ggml_bf16_t) * config_.routed_expert_num * config_.max_len * config_.hidden_size});
    m_mem_requests.push_back({(void**)&m_local_gate_output_,
                              sizeof(ggml_bf16_t) * config_.routed_expert_num * config_.max_len * config_.intermediate_size});
    m_mem_requests.push_back({(void**)&m_local_up_output_,
                              sizeof(ggml_bf16_t) * config_.routed_expert_num * config_.max_len * config_.intermediate_size});
    m_mem_requests.push_back({(void**)&m_local_down_output_,
                              sizeof(ggml_bf16_t) * config_.routed_expert_num * config_.max_len * config_.hidden_size});
    std::vector<void*> gate_up_ba_ptr(config_.expert_num);
    std::vector<void*> gate_bc_ptr(config_.expert_num);
    std::vector<void*> up_bc_ptr(config_.expert_num);
    std::vector<void*> down_ba_ptr(config_.expert_num);
    std::vector<void*> down_bc_ptr(config_.expert_num);
    for (int i = 0; i < config_.expert_num; i++) {
      m_mem_requests.push_back({(void**)&gate_up_ba_ptr[i], T::BufferA::required_size(config_.max_len, config_.hidden_size)});
      m_mem_requests.push_back({(void**)&gate_bc_ptr[i], T::BufferC::required_size(config_.max_len, config_.intermediate_size)});
      m_mem_requests.push_back({(void**)&up_bc_ptr[i], T::BufferC::required_size(config_.max_len, config_.intermediate_size)});
      m_mem_requests.push_back({(void**)&down_ba_ptr[i], T::BufferA::required_size(config_.max_len, config_.intermediate_size)});
      m_mem_requests.push_back({(void**)&down_bc_ptr[i], T::BufferC::required_size(config_.max_len, config_.hidden_size)});
    }
    shared_mem_buffer.alloc(this, m_mem_requests);

    m_local_pos_.resize(config_.max_len);
    for (int i = 0; i < config_.max_len; i++) {
      m_local_pos_[i].resize(config_.routed_expert_num);
    }
    m_expert_id_map_.resize(config_.expert_num);
    m_local_num_.resize(config_.expert_num);
    m_local_input_ptr_.resize(config_.expert_num);
    m_local_gate_output_ptr_.resize(config_.expert_num);
    m_local_up_output_ptr_.resize(config_.expert_num);
    m_local_down_output_ptr_.resize(config_.expert_num);

    for (uint64_t i = 0; i < config_.expert_num; i++) {
      gate_up_ba_.push_back(std::make_shared<typename T::BufferA>(config_.max_len, config_.hidden_size, gate_up_ba_ptr[i]));
      gate_bc_.push_back(std::make_shared<typename T::BufferC>(config_.max_len, config_.intermediate_size, gate_bc_ptr[i]));
      up_bc_.push_back(std::make_shared<typename T::BufferC>(config_.max_len, config_.intermediate_size, up_bc_ptr[i]));
      down_ba_.push_back(std::make_shared<typename T::BufferA>(config_.max_len, config_.intermediate_size, down_ba_ptr[i]));
      down_bc_.push_back(std::make_shared<typename T::BufferC>(config_.max_len, config_.hidden_size, down_bc_ptr[i]));
#ifdef USE_NUMA
      int numa_nodes = numa_num_configured_nodes();
      gate_bb_numa_.resize(numa_nodes);
      up_bb_numa_.resize(numa_nodes);
      down_bb_numa_.resize(numa_nodes);
      for (int j = 0; j < numa_nodes; j++) {
        void* gate_bb_ptr = numa_alloc_aligned(T::BufferB::required_size(config_.intermediate_size, config_.hidden_size), j, 64);
        gate_bb_numa_[j].push_back(std::make_shared<typename T::BufferB>(config_.intermediate_size, config_.hidden_size, gate_bb_ptr));
        void* up_bb_ptr = numa_alloc_aligned(T::BufferB::required_size(config_.intermediate_size, config_.hidden_size), j, 64);
        up_bb_numa_[j].push_back(std::make_shared<typename T::BufferB>(config_.intermediate_size, config_.hidden_size, up_bb_ptr));
        void* down_bb_ptr = numa_alloc_aligned(T::BufferB::required_size(config_.hidden_size, config_.intermediate_size), j, 64);
        down_bb_numa_[j].push_back(std::make_shared<typename T::BufferB>(config_.hidden_size, config_.intermediate_size, down_bb_ptr));
      }
#else
      void* gate_bb_ptr = std::aligned_alloc(64, T::BufferB::required_size(config_.intermediate_size, config_.hidden_size));
      gate_bb_.push_back(std::make_shared<typename T::BufferB>(config_.intermediate_size, config_.hidden_size, gate_bb_ptr));
      void* up_bb_ptr = std::aligned_alloc(64, T::BufferB::required_size(config_.intermediate_size, config_.hidden_size));
      up_bb_.push_back(std::make_shared<typename T::BufferB>(config_.intermediate_size, config_.hidden_size, up_bb_ptr));
      void* down_bb_ptr = std::aligned_alloc(64, T::BufferB::required_size(config_.hidden_size, config_.intermediate_size));
      down_bb_.push_back(std::make_shared<typename T::BufferB>(config_.hidden_size, config_.intermediate_size, down_bb_ptr));
#endif
    }
  }

  ~AMX_MOE() { shared_mem_buffer.dealloc(this); }

  void load_weights(Backend* backend) {
    int nth = T::recommended_nth(config_.intermediate_size);
    backend->do_work_stealing_job(
        nth * config_.expert_num, nullptr,
        [&](int task_id) {
          uint64_t expert_idx = task_id / nth;
          int ith = task_id % nth;
#ifdef USE_NUMA
          int numa_nodes = numa_num_configured_nodes();
          for (int j = 0; j < numa_nodes; j++) {
            gate_bb_numa_[j][expert_idx]->from_mat(
                (ggml_bf16_t*)config_.gate_proj + expert_idx * config_.intermediate_size * config_.hidden_size, ith, nth);
            up_bb_numa_[j][expert_idx]->from_mat(
                (ggml_bf16_t*)config_.up_proj + expert_idx * config_.intermediate_size * config_.hidden_size, ith, nth);
          }
#else
          gate_bb_[expert_idx]->from_mat(
              (ggml_bf16_t*)config_.gate_proj + expert_idx * config_.intermediate_size * config_.hidden_size, ith, nth);
          up_bb_[expert_idx]->from_mat(
              (ggml_bf16_t*)config_.up_proj + expert_idx * config_.intermediate_size * config_.hidden_size, ith, nth);
#endif
        },
        nullptr);
    nth = T::recommended_nth(config_.hidden_size);
    backend->do_work_stealing_job(
        nth * config_.expert_num, nullptr,
        [&](int task_id) {
          uint64_t expert_idx = task_id / nth;
          int ith = task_id % nth;
#ifdef USE_NUMA
          int numa_nodes = numa_num_configured_nodes();
          for (int j = 0; j < numa_nodes; j++) {
            down_bb_numa_[j][expert_idx]->from_mat(
                (ggml_bf16_t*)config_.down_proj + expert_idx * config_.hidden_size * config_.intermediate_size, ith, nth);
          }
#else
          down_bb_[expert_idx]->from_mat(
              (ggml_bf16_t*)config_.down_proj + expert_idx * config_.hidden_size * config_.intermediate_size, ith, nth);
#endif
        },
        nullptr);
  }

  void warm_up(Backend* backend) {}

  void forward(int qlen, int k, const uint64_t* expert_ids, const float* weights, const void* input, void* output,
               int* batch_size_tensor, Backend* backend) {
    bool use_amx = (qlen > 4 * config_.expert_num / config_.routed_expert_num);
    qlen = batch_size_tensor[0];
    int activated_expert = 0;
    for (int i = 0; i < config_.expert_num; i++) {
      m_local_num_[i] = 0;
    }
    for (int i = 0; i < qlen; i++) {
      for (int j = 0; j < k; j++) {
        m_local_pos_[i][j] = m_local_num_[expert_ids[i * k + j]]++;
      }
    }
    for (int i = 0; i < config_.expert_num; i++) {
      if (m_local_num_[i] > 0) {
        m_expert_id_map_[activated_expert] = i;
        activated_expert++;
      }
    }
    uint64_t offset = 0;
    for (int i = 0; i < config_.expert_num; i++) {
      m_local_input_ptr_[i] = m_local_input_ + offset * config_.hidden_size;
      m_local_gate_output_ptr_[i] = m_local_gate_output_ + offset * config_.intermediate_size;
      m_local_up_output_ptr_[i] = m_local_up_output_ + offset * config_.intermediate_size;
      m_local_down_output_ptr_[i] = m_local_down_output_ + offset * config_.hidden_size;
      offset += m_local_num_[i];
    }
    backend->do_work_stealing_job(
        qlen, nullptr,
        [&](int i) {
          for (int j = 0; j < k; j++) {
            memcpy(m_local_input_ptr_[expert_ids[i * k + j]] + m_local_pos_[i][j] * config_.hidden_size,
                   (ggml_bf16_t*)input + i * config_.hidden_size, sizeof(ggml_bf16_t) * config_.hidden_size);
          }
        },
        nullptr);
    backend->do_work_stealing_job(
        activated_expert, nullptr,
        [&](int task_id) {
          int expert_idx = m_expert_id_map_[task_id];
          gate_up_ba_[expert_idx]->from_mat(m_local_num_[expert_idx], m_local_input_ptr_[expert_idx], 0, 1);
        },
        nullptr);
    int nth = T::recommended_nth(config_.intermediate_size);
    backend->do_work_stealing_job(
        nth * activated_expert, [&](int _) { T::config(); },
        [&](int task_id) {
          int expert_idx = m_expert_id_map_[task_id / nth];
          int ith = task_id % nth;
#ifdef USE_NUMA
          amx::mat_mul(m_local_num_[expert_idx], config_.intermediate_size, config_.hidden_size,
                       gate_up_ba_[expert_idx], gate_bb_numa_[Backend::numa_node][expert_idx], gate_bc_[expert_idx],
                       ith, nth, use_amx);
          amx::mat_mul(m_local_num_[expert_idx], config_.intermediate_size, config_.hidden_size,
                       gate_up_ba_[expert_idx], up_bb_numa_[Backend::numa_node][expert_idx], up_bc_[expert_idx],
                       ith, nth, use_amx);
#else
          amx::mat_mul(m_local_num_[expert_idx], config_.intermediate_size, config_.hidden_size,
                       gate_up_ba_[expert_idx], gate_bb_[expert_idx], gate_bc_[expert_idx], ith, nth, use_amx);
          amx::mat_mul(m_local_num_[expert_idx], config_.intermediate_size, config_.hidden_size,
                       gate_up_ba_[expert_idx], up_bb_[expert_idx], up_bc_[expert_idx], ith, nth, use_amx);
#endif
          gate_bc_[expert_idx]->to_mat(m_local_num_[expert_idx], m_local_gate_output_ptr_[expert_idx], ith, nth);
          up_bc_[expert_idx]->to_mat(m_local_num_[expert_idx], m_local_up_output_ptr_[expert_idx], ith, nth);
          auto [n_start, n_end] = T::split_range_n(config_.intermediate_size, ith, nth);
          for (int i = 0; i < m_local_num_[expert_idx]; i++) {
            ggml_bf16_t* gate_output_ptr = &m_local_gate_output_ptr_[expert_idx][i * config_.intermediate_size];
            ggml_bf16_t* up_output_ptr = &m_local_up_output_ptr_[expert_idx][i * config_.intermediate_size];
            for (int j = n_start; j < n_end; j += 32) {
              __m512 gate_val0, gate_val1, up_val0, up_val1;
              avx512_32xbf16_to_32xfp32((__m512i*)(gate_output_ptr + j), &gate_val0, &gate_val1);
              avx512_32xbf16_to_32xfp32((__m512i*)(up_output_ptr + j), &up_val0, &up_val1);
              __m512 result0 = act_fn(gate_val0, up_val0);
              __m512 result1 = act_fn(gate_val1, up_val1);
              avx512_32xfp32_to_32xbf16(&result0, &result1, (__m512i*)(gate_output_ptr + j));
            }
          }
        },
        nullptr);
    backend->do_work_stealing_job(
        activated_expert, nullptr,
        [&](int task_id) {
          int expert_idx = m_expert_id_map_[task_id];
          down_ba_[expert_idx]->from_mat(m_local_num_[expert_idx], m_local_gate_output_ptr_[expert_idx], 0, 1);
        },
        nullptr);
    nth = T::recommended_nth(config_.hidden_size);
    backend->do_work_stealing_job(
        nth * activated_expert, [&](int _) { T::config(); },
        [&](int task_id) {
          int expert_idx = m_expert_id_map_[task_id / nth];
          int ith = task_id % nth;
#ifdef USE_NUMA
          amx::mat_mul(m_local_num_[expert_idx], config_.hidden_size, config_.intermediate_size, down_ba_[expert_idx],
                       down_bb_numa_[Backend::numa_node][expert_idx], down_bc_[expert_idx], ith, nth, use_amx);
#else
          amx::mat_mul(m_local_num_[expert_idx], config_.hidden_size, config_.intermediate_size, down_ba_[expert_idx],
                       down_bb_[expert_idx], down_bc_[expert_idx], ith, nth, use_amx);
#endif
          down_bc_[expert_idx]->to_mat(m_local_num_[expert_idx], m_local_down_output_ptr_[expert_idx], ith, nth);
        },
        nullptr);
    backend->do_work_stealing_job(
        qlen, nullptr,
        [&](int i) {
          for (int e = 0; e < config_.hidden_size; e += 32) {
            __m512 x0 = _mm512_setzero_ps();
            __m512 x1 = _mm512_setzero_ps();
            for (int j = 0; j < k; j++) {
              __m512 weight = _mm512_set1_ps(weights[i * k + j]);
              __m512 down_output0, down_output1;
              avx512_32xbf16_to_32xfp32(
                  (__m512i*)(m_local_down_output_ptr_[expert_ids[i * k + j]] + m_local_pos_[i][j] * config_.hidden_size + e),
                  &down_output0, &down_output1);
              x0 = _mm512_fmadd_ps(down_output0, weight, x0);
              x1 = _mm512_fmadd_ps(down_output1, weight, x1);
            }
            avx512_32xfp32_to_32xbf16(&x0, &x1, (__m512i*)((ggml_bf16_t*)output + i * config_.hidden_size + e));
          }
        },
        nullptr);
  }
};

#endif
\ No newline at end of file
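
For reference, act_fn above computes silu(gate) * up: the gate value divided by 1 + exp(-gate), then multiplied elementwise by the up projection, with exp_avx512 supplying a polynomial approximation of exp. A plain NumPy sketch of the same math on float32 arrays, without the bf16 packing (illustration only):

import numpy as np

def act_fn(gate: np.ndarray, up: np.ndarray) -> np.ndarray:
    # Matches the AVX-512 act_fn: gate / (1 + exp(-gate)) * up, i.e. SiLU(gate) * up.
    return gate / (1.0 + np.exp(-gate)) * up

gate = np.random.randn(32).astype(np.float32)
up = np.random.randn(32).astype(np.float32)
print(act_fn(gate, up))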

csrc/ktransformers_ext/operators/llamafile/linear.h

@@ -17,12 +17,12 @@
 #include <vector>

 #include "../../cpu_backend/backend.h"
+#include "../../cpu_backend/shared_mem_buffer.h"
 #include "conversion.h"
 #include "llama.cpp/ggml-impl.h"
 #include "llama.cpp/ggml-quants.h"
 #include "llama.cpp/ggml.h"
 #include "llamafile/sgemm.h"
-#include "shared_mem_buffer.h"

 struct LinearConfig {
   int input_size;

csrc/ktransformers_ext/operators/llamafile/mlp.h

@@ -17,12 +17,12 @@
 #include <vector>

 #include "../../cpu_backend/backend.h"
+#include "../../cpu_backend/shared_mem_buffer.h"
 #include "conversion.h"
 #include "llama.cpp/ggml-impl.h"
 #include "llama.cpp/ggml-quants.h"
 #include "llama.cpp/ggml.h"
 #include "llamafile/sgemm.h"
-#include "shared_mem_buffer.h"

 struct MLPConfig {
   int hidden_size;

csrc/ktransformers_ext/operators/llamafile/moe.h

@@ -17,12 +17,12 @@
 #include <vector>

 #include "../../cpu_backend/backend.h"
+#include "../../cpu_backend/shared_mem_buffer.h"
 #include "conversion.h"
 #include "llama.cpp/ggml-impl.h"
 #include "llama.cpp/ggml-quants.h"
 #include "llama.cpp/ggml.h"
 #include "llamafile/sgemm.h"
-#include "shared_mem_buffer.h"

 struct MOEConfig {
   int expert_num;

csrc/ktransformers_ext/operators/llamafile/shared_mem_buffer.h (deleted, mode 100644 → 0)

(All 37 lines removed. The deleted header is identical to the SharedMemBuffer declaration now living at csrc/ktransformers_ext/cpu_backend/shared_mem_buffer.h, reproduced above.)

ktransformers/operators/experts.py

@@ -25,8 +25,9 @@ sys.path.append(os.path.join(os.path.dirname(__file__), "..", "ktransformers_ext
 sys.path.append(os.path.join(os.path.dirname(__file__), "..", "ktransformers_ext", "build", "Debug"))
 import cpuinfer_ext
 from cpuinfer_ext.moe import MOEConfig, MOE
+from cpuinfer_ext.moe import AMX_MOEConfig, AMXBF16_MOE, AMXInt8_MOE
 import ctypes
-from ktransformers.util.custom_gguf import GGUFLoader
+from ktransformers.util.custom_gguf import GGMLQuantizationType, GGUFLoader
 from ktransformers.util.utils import InferenceState
 from ktransformers.server.config.config import Config
 from transformers.activations import ACT2FN

@@ -141,6 +142,7 @@ class KExpertsCPU(KExpertsBase):
         assert device.lower() == "cpu", "KExpertsCPU can only be loaded on CPU"
         self.n_routed_experts = n_routed_experts
         self.out_device = out_device
+        self.backend = kwargs.get("backend", "llamafile")

     def load(self, w: dict | nn.Parameter | tuple | None = None, device: str | None = None, warmup: bool = False):
         if device:

@@ -163,7 +165,9 @@ class KExpertsCPU(KExpertsBase):
         )
         # print(self.gate_qtype, self.up_qtype, self.down_qtype)
         n_routed_experts = self.n_routed_experts
+        self.cpu_infer = KExpertsCPU.CPU_INFER
         # n_routed_experts = len(self.orig_module)
+        if self.backend == "llamafile":
             moe_config = MOEConfig(
                 n_routed_experts,
                 self.config.num_experts_per_tok,

@@ -180,10 +184,43 @@ class KExpertsCPU(KExpertsBase):
                 self.down_type,
                 30, # TODO: get from model.dtype
             )
+            self.moe = MOE(moe_config)
+        elif self.backend == "AMXBF16":
+            assert self.gate_type == GGMLQuantizationType.BF16
+            assert self.up_type == GGMLQuantizationType.BF16
+            assert self.down_type == GGMLQuantizationType.BF16
+            moe_config = AMX_MOEConfig(
+                n_routed_experts,
+                self.config.num_experts_per_tok,
+                self.config.hidden_size,
+                self.config.moe_intermediate_size,
+                25600,
+                gate_ptr,
+                up_ptr,
+                down_ptr,
+            )
+            self.moe = AMXBF16_MOE(moe_config)
+            self.cpu_infer.submit(self.moe.load_weights())
+            self.cpu_infer.sync()
+        elif self.backend == "AMXInt8":
+            assert self.gate_type == GGMLQuantizationType.BF16
+            assert self.up_type == GGMLQuantizationType.BF16
+            assert self.down_type == GGMLQuantizationType.BF16
+            moe_config = AMX_MOEConfig(
+                n_routed_experts,
+                self.config.num_experts_per_tok,
+                self.config.hidden_size,
+                self.config.moe_intermediate_size,
+                25600,
+                gate_ptr,
+                up_ptr,
+                down_ptr,
+            )
+            self.moe = AMXInt8_MOE(moe_config)
+            self.cpu_infer.submit(self.moe.load_weights())
+            self.cpu_infer.sync()
         # print(n_routed_experts, hidden_size, moe_intermediate_size)
         num_experts_per_tok = self.config.num_experts_per_tok
-        self.moe = MOE(moe_config)
-        self.cpu_infer = KExpertsCPU.CPU_INFER
         if warmup:
             self.cpu_infer.submit(self.moe.warm_up())
             self.cpu_infer.sync()

ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-amx.yaml (new file, mode 100644)

- match:
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3RotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbeddingV3
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^lm_head$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_deepseek_v3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE  # MLP module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    class: ktransformers.models.modeling_deepseek_v3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts  # custom MoE kernel with expert parallelism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda"
      backend: "AMXInt8"  # or "AMXBF16" or "llamafile" (default)
  recursive: False  # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention  # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      absorb_for_prefill: False  # change this to True to enable long context (prefill may be slower)
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0  # 0 disables layer-wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"
\ No newline at end of file
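
To run DeepSeek-V3 with the AMX experts path, this rule file is selected in place of the stock DeepSeek-V3-Chat.yaml when the model is optimized. Functionally the experts rule differs only by the added backend field, which KExpertsCPU reads via kwargs.get("backend", "llamafile") and which accepts "AMXInt8", "AMXBF16", or the default "llamafile". Note that both AMX backends assert that the gate, up, and down expert tensors in the GGUF are stored as BF16 (see the experts.py hunk above).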