Commit 18c42e67 authored by chenxl

Initial commit
#!/usr/bin/env python
# coding=utf-8
'''
Description :
Author : chenht2022
Date : 2024-07-25 10:32:05
Version : 1.0.0
LastEditors : chenht2022
LastEditTime : 2024-07-25 10:32:57
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
import os, sys
import time
import torch
import torch.nn.quantized as nnq
def act_fn(x):
    # SiLU: x * sigmoid(x)
    return x / (1.0 + torch.exp(-x))

def bench_moe(quant_mode: str):
    with torch.inference_mode(mode=True):
        expert_num = 10
        hidden_size = 5120
        intermediate_size = 1536
        n_routed_experts = 6
        layer_num = 10
        warm_up_iter = 1000
        test_iter = 10000
        if quant_mode == "fp32":
            proj_type = torch.float32
            bytes_per_elem = 4.000000
        elif quant_mode == "fp16":
            proj_type = torch.float16
            bytes_per_elem = 2.000000
        elif quant_mode == "bf16":
            proj_type = torch.bfloat16
            bytes_per_elem = 2.000000
        elif quant_mode == "qint8":
            proj_type = torch.qint8
            bytes_per_elem = 1.000000
        else:
            assert False, f"unsupported quant_mode: {quant_mode}"
        gate_projs = []
        up_projs = []
        down_projs = []
        for _ in range(layer_num):
            gate_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device="cuda").to("cpu").contiguous()
            up_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device="cuda").to("cpu").contiguous()
            down_proj = torch.randn((expert_num, hidden_size, intermediate_size), dtype=torch.float32, device="cuda").to("cpu").contiguous()
            if quant_mode == "qint8":
                scale, zero_point = 0.1, 0  # Adjust scale and zero_point based on your dataset
                quantized_gate_proj = []
                quantized_up_proj = []
                quantized_down_proj = []
                for i in range(expert_num):
                    gate_proj_q = torch.quantize_per_tensor(gate_proj[i], scale, zero_point, torch.qint8)
                    quantized_gate = nnq.Linear(hidden_size, intermediate_size)
                    quantized_gate.set_weight_bias(gate_proj_q, None)
                    quantized_gate_proj.append(quantized_gate)
                    up_proj_q = torch.quantize_per_tensor(up_proj[i], scale, zero_point, torch.qint8)
                    quantized_up = nnq.Linear(hidden_size, intermediate_size)
                    quantized_up.set_weight_bias(up_proj_q, None)
                    quantized_up_proj.append(quantized_up)
                    down_proj_q = torch.quantize_per_tensor(down_proj[i], scale, zero_point, torch.qint8)
                    quantized_down = nnq.Linear(intermediate_size, hidden_size)
                    quantized_down.set_weight_bias(down_proj_q, None)
                    quantized_down_proj.append(quantized_down)
                gate_projs.append(quantized_gate_proj)
                up_projs.append(quantized_up_proj)
                down_projs.append(quantized_down_proj)
            else:
                gate_projs.append(gate_proj.to(proj_type))
                up_projs.append(up_proj.to(proj_type))
                down_projs.append(down_proj.to(proj_type))
        # warm up
        for i in range(warm_up_iter):
            expert_ids = torch.randint(0, expert_num, (n_routed_experts,), dtype=torch.int64).contiguous()
            weights = torch.rand((n_routed_experts,), dtype=torch.float32).contiguous()
            input = torch.randn((1, hidden_size), dtype=torch.float32).contiguous()
            if quant_mode == "qint8":
                input_q = torch.quantize_per_tensor(input, scale, zero_point, torch.quint8)
                t_output = torch.zeros((1, hidden_size), dtype=torch.float32).contiguous()
                gate_proj = gate_projs[i % layer_num]
                up_proj = up_projs[i % layer_num]
                down_proj = down_projs[i % layer_num]
                for i, expert_id in enumerate(expert_ids):
                    quantized_gate = gate_proj[expert_id]
                    gate_buf = quantized_gate(input_q)
                    quantized_up = up_proj[expert_id]
                    up_buf = quantized_up(input_q)
                    gate_buf = gate_buf.dequantize()
                    up_buf = up_buf.dequantize()
                    intermediate = act_fn(gate_buf) * up_buf
                    intermediate_q = torch.quantize_per_tensor(intermediate, scale, zero_point, torch.quint8)
                    quantized_down = down_proj[expert_id]
                    expert_output = quantized_down(intermediate_q)
                    expert_output = expert_output.dequantize()
                    t_output += weights[i] * expert_output
            else:
                t_output = torch.zeros((1, hidden_size), dtype=proj_type).contiguous()
                gate_proj = gate_projs[i % layer_num]
                up_proj = up_projs[i % layer_num]
                down_proj = down_projs[i % layer_num]
                for i, expert_id in enumerate(expert_ids):
                    gate_buf = torch.mm(input.to(proj_type), gate_proj[expert_id].t())
                    up_buf = torch.mm(input.to(proj_type), up_proj[expert_id].t())
                    intermediate = act_fn(gate_buf) * up_buf
                    expert_output = torch.mm(intermediate.to(proj_type), down_proj[expert_id].t())
                    t_output += weights[i] * expert_output
        # test
        total_time = 0
        for i in range(test_iter):
            expert_ids = torch.randint(0, expert_num, (n_routed_experts,), dtype=torch.int64).contiguous()
            weights = torch.rand((n_routed_experts,), dtype=torch.float32).contiguous()
            input = torch.randn((1, hidden_size), dtype=torch.float32).contiguous()
            start = time.perf_counter()
            if quant_mode == "qint8":
                input_q = torch.quantize_per_tensor(input, scale, zero_point, torch.quint8)
                t_output = torch.zeros((1, hidden_size), dtype=torch.float32).contiguous()
                gate_proj = gate_projs[i % layer_num]
                up_proj = up_projs[i % layer_num]
                down_proj = down_projs[i % layer_num]
                for i, expert_id in enumerate(expert_ids):
                    quantized_gate = gate_proj[expert_id]
                    gate_buf = quantized_gate(input_q)
                    quantized_up = up_proj[expert_id]
                    up_buf = quantized_up(input_q)
                    gate_buf = gate_buf.dequantize()
                    up_buf = up_buf.dequantize()
                    intermediate = act_fn(gate_buf) * up_buf
                    intermediate_q = torch.quantize_per_tensor(intermediate, scale, zero_point, torch.quint8)
                    quantized_down = down_proj[expert_id]
                    expert_output = quantized_down(intermediate_q)
                    expert_output = expert_output.dequantize()
                    t_output += weights[i] * expert_output
            else:
                t_output = torch.zeros((1, hidden_size), dtype=proj_type).contiguous()
                gate_proj = gate_projs[i % layer_num]
                up_proj = up_projs[i % layer_num]
                down_proj = down_projs[i % layer_num]
                for i, expert_id in enumerate(expert_ids):
                    gate_buf = torch.mm(input.to(proj_type), gate_proj[expert_id].t())
                    up_buf = torch.mm(input.to(proj_type), up_proj[expert_id].t())
                    intermediate = act_fn(gate_buf) * up_buf
                    expert_output = torch.mm(intermediate.to(proj_type), down_proj[expert_id].t())
                    t_output += weights[i] * expert_output
            end = time.perf_counter()
            total_time += end - start
        print('Quant mode: ', quant_mode)
        print('Time(s): ', total_time)
        print('Iteration: ', test_iter)
        print('Time(us) per iteration: ', total_time / test_iter * 1000000)
        print('Bandwidth: ', hidden_size * intermediate_size * 3 * n_routed_experts * bytes_per_elem * test_iter / total_time / 1000 / 1000 / 1000, 'GB/s')
        print('')

bench_moe("fp32")
bench_moe("fp16")
bench_moe("bf16")
bench_moe("qint8")
/**
* @Description :
* @Author : chenht2022
* @Date : 2024-07-22 02:03:05
* @Version : 1.0.0
* @LastEditors : chenht2022
* @LastEditTime : 2024-07-25 10:33:34
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include "backend.h"
Backend::Backend(int thread_num) {
    thread_num_ = thread_num;
    thread_state_.resize(thread_num);
    for (int i = 0; i < thread_num; i++) {
        thread_state_[i].curr = std::make_unique<std::atomic<int>>();
        thread_state_[i].status = std::make_unique<std::atomic<ThreadStatus>>(ThreadStatus::WAITING);
    }
    workers_.resize(thread_num);
    for (int i = 1; i < thread_num; i++) {
        workers_[i] = std::thread(&Backend::worker_thread, this, i);
    }
}

Backend::~Backend() {
    for (int i = 0; i < thread_num_; i++) {
        thread_state_[i].status->store(ThreadStatus::EXIT, std::memory_order_release);
    }
    for (int i = 1; i < thread_num_; i++) {
        if (workers_[i].joinable()) {
            workers_[i].join();
        }
    }
}

int Backend::get_thread_num() {
    return thread_num_;
}

void Backend::do_work_stealing_job(int task_num, std::function<void(int)> func) {
    func_ = func;
    // Split task_num into contiguous ranges, one per thread; thread 0 is the calling thread.
    int base = task_num / thread_num_;
    int remain = task_num % thread_num_;
    thread_state_[0].end = base + (0 < remain);
    for (int i = 1; i < thread_num_; i++) {
        thread_state_[i].curr->store(thread_state_[i - 1].end, std::memory_order_relaxed);
        thread_state_[i].end = thread_state_[i - 1].end + base + (i < remain);
        thread_state_[i].status->store(ThreadStatus::WORKING, std::memory_order_release);
    }
    thread_state_[0].curr->store(0, std::memory_order_relaxed);
    thread_state_[0].status->store(ThreadStatus::WORKING, std::memory_order_release);
    process_tasks(0);
    // Spin until every worker has drained its range (and any stolen work).
    for (int i = 1; i < thread_num_; i++) {
        while (thread_state_[i].status->load(std::memory_order_acquire) == ThreadStatus::WORKING) {
        }
    }
}

void Backend::process_tasks(int thread_id) {
    // Drain this thread's own range first.
    while (true) {
        int task_id = thread_state_[thread_id].curr->fetch_add(1, std::memory_order_acq_rel);
        if (task_id >= thread_state_[thread_id].end) {
            break;
        }
        func_(task_id);
    }
    // Then steal remaining tasks from the other threads' ranges.
    for (int t_offset = 1; t_offset < thread_num_; t_offset++) {
        int t_i = (thread_id + t_offset) % thread_num_;
        if (thread_state_[t_i].status->load(std::memory_order_acquire) != ThreadStatus::WORKING) {
            continue;
        }
        while (true) {
            int task_id = thread_state_[t_i].curr->fetch_add(1, std::memory_order_acq_rel);
            if (task_id >= thread_state_[t_i].end) {
                break;
            }
            func_(task_id);
        }
    }
    thread_state_[thread_id].status->store(ThreadStatus::WAITING, std::memory_order_release);
}

void Backend::worker_thread(int thread_id) {
    auto start = std::chrono::steady_clock::now();
    while (true) {
        ThreadStatus status = thread_state_[thread_id].status->load(std::memory_order_acquire);
        if (status == ThreadStatus::WORKING) {
            process_tasks(thread_id);
            start = std::chrono::steady_clock::now();
        } else if (status == ThreadStatus::WAITING) {
            // Busy-poll for ~50 ms after the last job, then back off with 1 ms sleeps.
            auto now = std::chrono::steady_clock::now();
            auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(now - start).count();
            if (duration > 50) {
                std::this_thread::sleep_for(std::chrono::milliseconds(1));
            }
        } else if (status == ThreadStatus::EXIT) {
            return;
        }
    }
}
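For context, here is a minimal usage sketch of the work-stealing pool above; it is my own illustration (the array names and sizes are not part of the commit). The caller participates as thread 0, and the lambda is invoked concurrently, so each task should only touch data owned by its task_id.

#include "backend.h"
#include <vector>

int main() {
    Backend backend(8);  // thread 0 is the calling thread, plus 7 worker threads
    std::vector<float> a(1024, 1.0f), b(1024, 2.0f), c(1024);
    // One task per element; tasks left over in other threads' ranges get stolen.
    backend.do_work_stealing_job(1024, [&](int task_id) {
        c[task_id] = a[task_id] + b[task_id];
    });
    return 0;
}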
/**
* @Description :
* @Author : chenht2022
* @Date : 2024-07-22 02:03:05
* @Version : 1.0.0
* @LastEditors : chenht2022
* @LastEditTime : 2024-07-25 10:33:38
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#ifndef CPUINFER_BACKEND_H
#define CPUINFER_BACKEND_H
#include <atomic>
#include <condition_variable>
#include <cstdio>
#include <functional>
#include <mutex>
#include <thread>
#include <vector>
enum ThreadStatus {
    WORKING,
    WAITING,
    EXIT,
};

struct ThreadState {
    std::unique_ptr<std::atomic<ThreadStatus>> status;
    std::unique_ptr<std::atomic<int>> curr;
    int end;
};

class Backend {
   public:
    Backend(int);
    ~Backend();
    int get_thread_num();
    void do_work_stealing_job(int, std::function<void(int)>);

   private:
    int thread_num_;
    std::vector<ThreadState> thread_state_;  // [thread_num]
    std::function<void(int)> func_;
    std::vector<std::thread> workers_;
    void process_tasks(int);
    void worker_thread(int);
};
#endif
/**
* @Description :
* @Author : chenht2022
* @Date : 2024-07-16 10:43:18
* @Version : 1.0.0
* @LastEditors : chenht2022
* @LastEditTime : 2024-07-25 10:33:42
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#ifndef CPUINFER_CPUINFER_H
#define CPUINFER_CPUINFER_H
#include <atomic>
#include <condition_variable>
#include <functional>
#include <mutex>
#include <queue>
#include <thread>
#include <vector>
#include "backend.h"
#include "task_queue.h"
#include "llama.cpp/ggml-impl.h"
class CPUInfer {
   public:
    CPUInfer(int thread_num) {
        backend_ = new Backend(thread_num - 1);
        task_queue_ = new TaskQueue();
        // Pre-compute ggml's host-side fp16 -> fp32 lookup table.
        for (int i = 0; i < (1 << 16); ++i) {
            ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(i);
        }
    }

    ~CPUInfer() {
        delete backend_;
        delete task_queue_;
    }

    template <typename Func, typename Obj, typename... Args>
    void submit(Func f, Obj* obj, Args... args) {
        task_queue_->enqueue([=]() {
            std::invoke(f, *obj, args..., backend_);
        });
    }

    void sync() {
        task_queue_->sync();
    }

   public:
    Backend* backend_;
    TaskQueue* task_queue_;
};
#endif
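To make the submit/sync contract concrete, a hedged sketch follows; the DummyOp class is hypothetical (the real operators live elsewhere in the repository) and the header name is assumed from its include guard. submit() appends the Backend* as the trailing argument, so the submitted member function must accept it last, and sync() blocks until the queued call has run.

#include "cpuinfer.h"

// Hypothetical operator class, for illustration only.
class DummyOp {
   public:
    void forward(const float* input, float* output, Backend* backend) {
        // Parallelize over elements using the work-stealing backend.
        backend->do_work_stealing_job(1024, [&](int i) { output[i] = input[i] * 2.0f; });
    }
};

void example() {
    CPUInfer cpu_infer(48);  // spawns the task-queue thread plus Backend(47)
    DummyOp op;
    static float in[1024], out[1024];
    cpu_infer.submit(&DummyOp::forward, &op, in, out);  // backend_ is appended automatically
    cpu_infer.sync();                                   // wait for the queued call to finish
}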
/**
* @Description :
* @Author : chenht2022
* @Date : 2024-07-17 12:25:51
* @Version : 1.0.0
* @LastEditors : chenht2022
* @LastEditTime : 2024-07-25 10:33:44
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include "task_queue.h"
TaskQueue::TaskQueue() {
    worker = std::thread(&TaskQueue::processTasks, this);
    sync_flag.store(true, std::memory_order_seq_cst);
    exit_flag.store(false, std::memory_order_seq_cst);
}

TaskQueue::~TaskQueue() {
    exit_flag.store(true, std::memory_order_seq_cst);
    if (worker.joinable()) {
        worker.join();
    }
}

void TaskQueue::enqueue(std::function<void()> task) {
    mutex.lock();
    tasks.push(task);
    sync_flag.store(false, std::memory_order_seq_cst);
    mutex.unlock();
}

void TaskQueue::sync() {
    // Spin until the worker reports that the queue has been drained.
    while (!sync_flag.load(std::memory_order_seq_cst))
        ;
}

void TaskQueue::processTasks() {
    while (true) {
        mutex.lock();
        if (tasks.empty()) {
            if (exit_flag.load(std::memory_order_seq_cst)) {
                mutex.unlock();  // release the lock before the worker exits
                return;
            }
            mutex.unlock();
            continue;
        }
        std::function<void()> task = tasks.front();
        mutex.unlock();
        task();
        mutex.lock();
        tasks.pop();
        if (tasks.empty()) {
            sync_flag.store(true, std::memory_order_seq_cst);
        }
        mutex.unlock();
    }
}
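A small standalone sketch of the queue's contract (illustrative, not from the commit): tasks run one at a time on the single worker thread in FIFO order, and sync() spin-waits until the worker has drained the queue.

#include "task_queue.h"
#include <cstdio>

int main() {
    TaskQueue queue;
    int counter = 0;
    // Tasks execute sequentially on the worker thread, so mutating a plain int
    // from consecutive tasks is safe here.
    for (int i = 0; i < 4; i++) {
        queue.enqueue([&counter, i]() { counter += i; });
    }
    queue.sync();  // busy-waits until sync_flag is set by the worker
    std::printf("counter = %d\n", counter);  // 0 + 1 + 2 + 3 = 6
    return 0;
}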
/**
* @Description :
* @Author : chenht2022
* @Date : 2024-07-16 10:43:18
* @Version : 1.0.0
* @LastEditors : chenht2022
* @LastEditTime : 2024-07-25 10:33:47
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#ifndef CPUINFER_TASKQUEUE_H
#define CPUINFER_TASKQUEUE_H
#include <atomic>
#include <condition_variable>
#include <functional>
#include <mutex>
#include <queue>
#include <thread>
#include <vector>
class TaskQueue {
   public:
    TaskQueue();
    ~TaskQueue();
    void enqueue(std::function<void()>);
    void sync();

   private:
    void processTasks();
    std::queue<std::function<void()>> tasks;
    std::thread worker;
    std::mutex mutex;
    std::atomic<bool> sync_flag;
    std::atomic<bool> exit_flag;
};
#endif
/**
* @Description :
* @Author : Azure-Tang
* @Date : 2024-07-25 13:38:30
* @Version : 1.0.0
* @LastEditors : Azure
* @LastEditTime : 2024-07-26 08:36:03
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include "custom_gguf/ops.h"
#include "gptq_marlin/ops.h"
// Python bindings
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <torch/library.h>
#include <torch/extension.h>
#include <torch/torch.h>
// namespace py = pybind11;
PYBIND11_MODULE(KTransformersOps, m) {
    m.def("dequantize_q8_0", &dequantize_q8_0, "Function to dequantize q8_0 data.",
          py::arg("data"), py::arg("blk_size"), py::arg("device"));
    m.def("dequantize_q6_k", &dequantize_q6_k, "Function to dequantize q6_k data.",
          py::arg("data"), py::arg("blk_size"), py::arg("device"));
    m.def("dequantize_q4_k", &dequantize_q4_k, "Function to dequantize q4_k data.",
          py::arg("data"), py::arg("blk_size"), py::arg("device"));
    m.def("gptq_marlin_gemm", &gptq_marlin_gemm, "Function to perform GEMM using Marlin quantization.",
          py::arg("a"), py::arg("b_q_weight"), py::arg("b_scales"), py::arg("g_idx"),
          py::arg("perm"), py::arg("workspace"), py::arg("num_bits"), py::arg("size_m"),
          py::arg("size_n"), py::arg("size_k"), py::arg("is_k_full"));
}
#include "ops.h"
// Python bindings
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <torch/library.h>
#include <torch/extension.h>
#include <torch/torch.h>
// namespace py = pybind11;
int test() {
    return 5;
}

torch::Tensor dequantize_q6_k(torch::Tensor data, int blk_size, torch::Device device);

PYBIND11_MODULE(cudaops, m) {
    m.def("dequantize_q8_0", &dequantize_q8_0, "Function to dequantize q8_0 data.",
          py::arg("data"), py::arg("blk_size"), py::arg("device"));
    m.def("dequantize_q6_k", &dequantize_q6_k, "Function to dequantize q6_k data.",
          py::arg("data"), py::arg("blk_size"), py::arg("device"));
    m.def("dequantize_q4_k", &dequantize_q4_k, "Function to dequantize q4_k data.",
          py::arg("data"), py::arg("blk_size"), py::arg("device"));
    m.def("test", &test, "Function to test.");
}
#include <cuda_fp16.h>
// Decode an IEEE-754 binary16 value stored as raw uint16_t bits into a float.
__host__ __device__ static inline float ggml_compute_fp16_to_fp32(uint16_t h) {
    __half tmp;
    memcpy(&tmp, &h, sizeof(tmp));
    return __half2float(tmp);
}
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)

// Global table for fp16 -> fp32 conversion, indexed by the 16 raw bits.
__device__ float ggml_table_f32_f16[1 << 16];
// Declaration for other translation units that reference the table from device code.
extern __device__ float ggml_table_f32_f16[1 << 16];

// CUDA kernel to init the table (grid-stride loop over all 65536 bit patterns).
__global__ void init_fp16_to_fp32_table() {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (auto blk_id = idx; blk_id < (1 << 16); blk_id += blockDim.x * gridDim.x) {
        ggml_table_f32_f16[blk_id] = GGML_COMPUTE_FP16_TO_FP32(blk_id);
    }
}

// Table-lookup variant, designed to be called from within a CUDA kernel.
#if !defined(GGML_FP16_TO_FP32)
__device__ float ggml_lookup_fp16_to_fp32(uint16_t f) {
    return ggml_table_f32_f16[f];
}
#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
#endif
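A short usage sketch of the lookup-table path (my own illustration; the grid/block sizes and the demo kernel are assumptions): the init kernel must be launched once before any kernel that expands fp16 via GGML_FP16_TO_FP32.

// Illustrative only: populate the device-side table once at startup, then any
// kernel in this translation unit can decode fp16 with a single table load.
__global__ void decode_demo(const uint16_t* in, float* out, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        out[i] = GGML_FP16_TO_FP32(in[i]);  // table lookup instead of bit math
    }
}

void init_tables_once() {
    init_fp16_to_fp32_table<<<256, 256>>>();
    cudaDeviceSynchronize();
}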
/*
* @Description :
* @Author : Azure-Tang, Boxin Zhang
* @Date : 2024-07-25 13:38:30
* @Version : 1.0.0
* @LastEditors : Azure
* @LastEditTime : 2024-07-26 11:58:50
* Adapted from https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c
* Copyright (c) 2023-2024 The ggml authors
* Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
*/
#include <cuda_runtime.h>
#include <torch/library.h>
#include <torch/extension.h>
#include <torch/torch.h>
#include <cstdint>
__global__ void dequantize_q8_0_kernel(float* output, const float* scales, const int8_t* qs, int num_blocks, int blk_size) {
    int global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (auto block_id = global_idx; block_id < num_blocks; block_id += blockDim.x * gridDim.x) {
        for (int i = 0; i < blk_size; i++) {
            float scale = scales[block_id];
            output[block_id * blk_size + i] = scale * qs[block_id * blk_size + i];
        }
    }
}

// __device__ void get_scale_min_k4(int j, const uint8_t * __restrict__ q, uint8_t * __restrict__ d, uint8_t * __restrict__ m) {
__device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t * __restrict__ d, uint8_t * __restrict__ m) {
    if (j < 4) {
        *d = q[j] & 63; *m = q[j + 4] & 63;
    } else {
        *d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4);
        *m = (q[j+4] >>  4) | ((q[j-0] >> 6) << 4);
    }
}

__global__ void dequantize_q4_k_kernel(int8_t* data, float* output, int blk_size, int num_blocks) {
    int global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (auto block_id = global_idx; block_id < num_blocks; block_id += blockDim.x * gridDim.x) {
        float* __restrict__ output_blk = (float*)(output + block_id * 256);
        // const uint8_t * q = data[i].qs;
        const uint8_t * q = (uint8_t*)(data + block_id * 144 + 16);
        const float d = __half2float(*(reinterpret_cast<half*>(data + block_id * 144 + 0)));
        const float min = __half2float(*(reinterpret_cast<half*>(data + block_id * 144 + 2)));
        int is = 0;
        uint8_t sc, m;
        for (int j = 0; j < blk_size; j += 64) {
            uint8_t* scales = (uint8_t*)(data + block_id * 144 + 4);
            get_scale_min_k4(is + 0, scales, &sc, &m);
            const float d1 = d * sc; const float m1 = min * m;
            get_scale_min_k4(is + 1, scales, &sc, &m);
            const float d2 = d * sc; const float m2 = min * m;
            for (int l = 0; l < 32; ++l) *output_blk++ = d1 * (q[l] & 0xF) - m1;
            for (int l = 0; l < 32; ++l) *output_blk++ = d2 * (q[l] >>  4) - m2;
            q += 32; is += 2;
        }
    }
}

__global__ void dequantize_q6_k_kernel(int8_t* data, float* output, int blk_size, int num_blocks) {
    int global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (auto block_id = global_idx; block_id < num_blocks; block_id += blockDim.x * gridDim.x) {
        float* __restrict__ output_blk = (float*)(output + block_id * 256);
        const float d = __half2float(*(reinterpret_cast<half*>(data + block_id * blk_size + 208)));
        const uint8_t * __restrict__ ql = (uint8_t*)(data + block_id * blk_size);
        const uint8_t * __restrict__ qh = (uint8_t*)(data + block_id * blk_size + 128);
        const int8_t  * __restrict__ sc = (int8_t*)(data + block_id * blk_size + 192);
        //if (blk_size == 256){
        for (int n = 0; n < blk_size; n += 128) {
            for (int l = 0; l < 32; ++l) {
                int is = l/16;
                const int8_t q1 = (int8_t)((ql[l +  0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
                const int8_t q2 = (int8_t)((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
                const int8_t q3 = (int8_t)((ql[l +  0] >>  4) | (((qh[l] >> 4) & 3) << 4)) - 32;
                const int8_t q4 = (int8_t)((ql[l + 32] >>  4) | (((qh[l] >> 6) & 3) << 4)) - 32;
                output_blk[l +  0] = d * sc[is + 0] * q1;
                output_blk[l + 32] = d * sc[is + 2] * q2;
                output_blk[l + 64] = d * sc[is + 4] * q3;
                output_blk[l + 96] = d * sc[is + 6] * q4;
            }
            output_blk += 128;
            ql += 64;
            qh += 32;
            sc += 8;
        }
    }
}
torch::Tensor dequantize_q8_0(torch::Tensor data, int blk_size, torch::Device device) {
    int num_blocks = data.numel() / blk_size;
    // create gpu buffers
    auto options_scales = torch::TensorOptions().dtype(torch::kFloat32).device(device).memory_format(torch::MemoryFormat::Contiguous);
    auto options_qs = torch::TensorOptions().dtype(torch::kInt8).device(device).memory_format(torch::MemoryFormat::Contiguous);
    auto scales_gpu = torch::empty({num_blocks, 1}, options_scales);
    auto qs_gpu = torch::empty({num_blocks, 32}, options_qs);
    // read on cpu
    options_scales = torch::TensorOptions().dtype(torch::kFloat16).device(torch::kCPU);
    options_qs = torch::TensorOptions().dtype(torch::kInt8).device(torch::kCPU);
    // reinterpret the raw bytes as per-block scale and quant views
    auto scales = torch::from_blob(data.data_ptr(), {num_blocks, 1 + 16}, options_scales).slice(1, 0, 1);
    auto qs = torch::from_blob(data.data_ptr(), {num_blocks, 2 + 32}, options_qs).slice(1, 2);
    auto scales_f32 = scales.to(torch::kFloat32);
    scales_gpu.copy_(scales_f32, false);
    qs_gpu.copy_(qs, false);
    // Create output tensor
    auto output = torch::zeros_like(qs, torch::dtype(torch::kFloat32).device(device));
    // Launch kernel
    dequantize_q8_0_kernel<<< 512, 256 >>>(
        output.data_ptr<float>(), scales_gpu.data_ptr<float>(), qs_gpu.data_ptr<int8_t>(), num_blocks, 32);
    cudaDeviceSynchronize();
    return output;
}

torch::Tensor dequantize_q6_k(torch::Tensor data, int blk_size, torch::Device device) {
    // data.numel() % blk_size should be 0, else raise an error
    int num_blocks = data.numel() / blk_size;
    auto options = torch::TensorOptions().dtype(torch::kInt8).device(device).memory_format(torch::MemoryFormat::Contiguous);
    auto data_gpu = torch::empty({data.numel()}, options);
    data_gpu.copy_(data, false);
    // Create output tensor
    auto output = torch::zeros({num_blocks, 256}, torch::dtype(torch::kFloat32).device(device));
    // Launch kernel
    dequantize_q6_k_kernel<<< 512, 256 >>>(data_gpu.data_ptr<int8_t>(), output.data_ptr<float>(), blk_size, num_blocks);
    // dequantize_q6_k_kernel<<< 512, 256 >>>(data_gpu.data_ptr<int8_t>(), output.data_ptr<float>(), 256, num_blocks);
    cudaDeviceSynchronize();
    return output;
}

torch::Tensor dequantize_q4_k(torch::Tensor data, int blk_size, torch::Device device) {
    // data.numel() % blk_size should be 0, else raise an error
    int num_blocks = data.numel() / blk_size;
    auto options = torch::TensorOptions().dtype(torch::kInt8).device(device).memory_format(torch::MemoryFormat::Contiguous);
    auto data_gpu = torch::empty({data.numel()}, options);
    data_gpu.copy_(data, false);
    // Create output tensor
    auto output = torch::zeros({num_blocks, 256}, torch::dtype(torch::kFloat32).device(device));
    // Launch kernel
    dequantize_q4_k_kernel<<< 512, 256 >>>(data_gpu.data_ptr<int8_t>(), output.data_ptr<float>(), 256, num_blocks);
    cudaDeviceSynchronize();
    return output;
}
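For orientation, a hedged C++ sketch of calling the q8_0 path; the 34-byte block layout (one fp16 scale plus 32 int8 quants) is implied by the from_blob views above, but treat the blk_size value and the buffer construction here as assumptions of this example, not a documented API.

// Illustrative only: feed dequantize_q8_0 a raw byte buffer laid out as
// consecutive block_q8_0 records (2-byte fp16 scale + 32 int8 quants = 34 bytes).
void dequant_example() {
    const int num_blocks = 1024;
    auto raw = torch::zeros({num_blocks * 34}, torch::dtype(torch::kInt8).device(torch::kCPU));
    torch::Tensor w = dequantize_q8_0(raw, /*blk_size=*/34, torch::Device(torch::kCUDA, 0));
    // w has shape [num_blocks, 32] and dtype float32 on the GPU.
}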
/**
* @Description :
* @Author : Azure-Tang
* @Date : 2024-07-22 09:27:55
* @Version : 1.0.0
* @LastEditors : Azure
* @LastEditTime : 2024-07-26 08:38:20
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#pragma once
#include <torch/library.h>
#include <torch/extension.h>
#include <torch/torch.h>
torch::Tensor dequantize_q8_0(torch::Tensor data, int blk_size, torch::Device device);
torch::Tensor dequantize_q6_k(torch::Tensor data, int blk_size, torch::Device device);
torch::Tensor dequantize_q4_k(torch::Tensor data, int blk_size, torch::Device device);
// Adapted from
// https://github.com/vllm-project/vllm/tree/main/csrc/quantization/gptq_marlin
// Copyright 2024 The vLLM team.
// Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
#pragma once
#include <torch/all.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <iostream>
namespace gptq_marlin {

// 8 warps are a good choice since every SM has 4 schedulers and having more
// than 1 warp per schedule allows some more latency hiding. At the same time,
// we want relatively few warps to have many registers per warp and small tiles.
static constexpr int default_threads = 256;

static constexpr int pipe_stages = 4;  // 4 pipeline stages fit into shared memory

static constexpr int min_thread_n = 64;
static constexpr int min_thread_k = 64;

static constexpr int tile_size = 16;
static constexpr int max_par = 16;

template <typename T, int n>
struct Vec {
  T elems[n];
  __device__ T& operator[](int i) { return elems[i]; }
};

using I4 = Vec<int, 4>;

constexpr int div_ceil(int a, int b) { return (a + b - 1) / b; }

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
// No support for async
#else

__device__ inline void cp_async4_pred(void* smem_ptr, const void* glob_ptr,
                                      bool pred = true) {
  const int BYTES = 16;
  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
  asm volatile(
      "{\n"
      "   .reg .pred p;\n"
      "   setp.ne.b32 p, %0, 0;\n"
      "   @p cp.async.cg.shared.global [%1], [%2], %3;\n"
      "}\n" ::"r"((int)pred),
      "r"(smem), "l"(glob_ptr), "n"(BYTES));
}

__device__ inline void cp_async4(void* smem_ptr, const void* glob_ptr) {
  const int BYTES = 16;
  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
  asm volatile(
      "{\n"
      "   cp.async.cg.shared.global [%0], [%1], %2;\n"
      "}\n" ::"r"(smem),
      "l"(glob_ptr), "n"(BYTES));
}

__device__ inline void cp_async_fence() {
  asm volatile("cp.async.commit_group;\n" ::);
}

template <int n>
__device__ inline void cp_async_wait() {
  asm volatile("cp.async.wait_group %0;\n" ::"n"(n));
}

#endif

}  // namespace gptq_marlin
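As a reading aid, here is a minimal sketch of how these cp.async helpers are typically combined (my own example, assuming an sm_80+ build and a block size of at most default_threads): issue 16-byte global-to-shared copies, commit them as a group, then wait before consuming the staged data.

#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
__global__ void async_stage_demo(const gptq_marlin::I4* __restrict__ src,
                                 gptq_marlin::I4* __restrict__ dst, int n) {
  __shared__ gptq_marlin::I4 stage[gptq_marlin::default_threads];
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    // One cp.async.cg moves 16 bytes (one I4) from global into shared memory.
    gptq_marlin::cp_async4(&stage[threadIdx.x], &src[i]);
  }
  gptq_marlin::cp_async_fence();    // commit the outstanding copies as one group
  gptq_marlin::cp_async_wait<0>();  // wait until no committed group is pending
  __syncthreads();
  if (i < n) {
    dst[i] = stage[threadIdx.x];
  }
}
#endif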
// Adapted from
// https://github.com/vllm-project/vllm/tree/main/csrc/quantization/gptq_marlin
// Copyright 2024 The vLLM team.
// Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
#ifndef _data_types_cuh
#define _data_types_cuh
#include "gptq_marlin.cuh"
#include <cuda_fp16.h>
#include <cuda_bf16.h>
namespace gptq_marlin {

template <typename scalar_t>
class ScalarType {};

template <>
class ScalarType<half> {
 public:
  using scalar_t = half;
  using scalar_t2 = half2;

  // Matrix fragments for tensor core instructions; their precise layout is
  // documented here:
  // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-floating-point-type
  using FragA = Vec<half2, 4>;
  using FragB = Vec<half2, 2>;
  using FragC = Vec<float, 4>;
  using FragS = Vec<half2, 1>;

  static __device__ float inline num2float(const half x) {
    return __half2float(x);
  }

  static __device__ half2 inline num2num2(const half x) {
    return __half2half2(x);
  }

  static __device__ half2 inline nums2num2(const half x1, const half x2) {
    return __halves2half2(x1, x2);
  }

  static __host__ __device__ half inline float2num(const float x) {
    return __float2half(x);
  }
};

template <>
class ScalarType<nv_bfloat16> {
 public:
  using scalar_t = nv_bfloat16;
  using scalar_t2 = nv_bfloat162;

  using FragA = Vec<nv_bfloat162, 4>;
  using FragB = Vec<nv_bfloat162, 2>;
  using FragC = Vec<float, 4>;
  using FragS = Vec<nv_bfloat162, 1>;

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
  static __device__ float inline num2float(const nv_bfloat16 x) {
    return __bfloat162float(x);
  }

  static __device__ nv_bfloat162 inline num2num2(const nv_bfloat16 x) {
    return __bfloat162bfloat162(x);
  }

  static __device__ nv_bfloat162 inline nums2num2(const nv_bfloat16 x1,
                                                  const nv_bfloat16 x2) {
    return __halves2bfloat162(x1, x2);
  }

  static __host__ __device__ nv_bfloat16 inline float2num(const float x) {
    return __float2bfloat16(x);
  }
#endif
};

}  // namespace gptq_marlin
#endif
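A hedged sketch of the intended dispatch pattern (my own example, not kernel code from this commit): templating a device function on scalar_t lets one body run on half and nv_bfloat16 through the ScalarType traits above.

template <typename scalar_t>
__global__ void axpy_packed(typename gptq_marlin::ScalarType<scalar_t>::scalar_t2* y,
                            const typename gptq_marlin::ScalarType<scalar_t>::scalar_t2* x,
                            float alpha, int n2) {
  using W = gptq_marlin::ScalarType<scalar_t>;
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n2) {
    // Broadcast the scalar into a packed pair, then use packed fma.
    typename W::scalar_t2 a2 = W::num2num2(W::float2num(alpha));
    y[i] = __hfma2(a2, x[i], y[i]);
  }
}
// Callers instantiate explicitly, e.g. axpy_packed<half>; the nv_bfloat16
// instantiation requires an sm_80+ build because of the #if guard above.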
/**
* @Description :
* @Author : Azure
* @Date : 2024-07-22 09:27:55
* @Version : 1.0.0
* @LastEditors : Azure
* @LastEditTime : 2024-07-26 08:35:00
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#pragma once
#include <torch/library.h>
#include <torch/extension.h>
#include <torch/torch.h>
torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
torch::Tensor& b_scales, torch::Tensor& g_idx,
torch::Tensor& perm, torch::Tensor& workspace,
int64_t num_bits, int64_t size_m, int64_t size_n,
int64_t size_k, bool is_k_full);
// torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
// int64_t size_k, int64_t size_n,
// int64_t num_bits);
from setuptools import setup, Extension
from torch.utils import cpp_extension
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
# setup marlin gemm
setup(
    name='KTransformersOps',
    ext_modules=[
        CUDAExtension('KTransformersOps', [
            'custom_gguf/dequant.cu',
            'binding.cpp',
            'gptq_marlin/gptq_marlin.cu',
            # 'gptq_marlin_repack.cu',
        ])
    ],
    cmdclass={'build_ext': BuildExtension}
)
#!/usr/bin/env python
# coding=utf-8
'''
Description :
Author : chenht2022
Date : 2024-07-25 10:32:05
Version : 1.0.0
LastEditors : chenht2022
LastEditTime : 2024-07-25 10:34:00
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
import os, sys
import time
sys.path.append(os.path.dirname(__file__) + '/../build')
import cpuinfer_ext
import torch
with torch.inference_mode(mode=True):
    input_size = 16384
    output_size = 5120
    stride = 32
    proj_type = 1  # ggml_type::GGML_TYPE_F16
    hidden_type = 1  # ggml_type::GGML_TYPE_F16
    layer_num = 10
    CPUInfer = cpuinfer_ext.CPUInfer(48)
    validation_iter = 100
    warm_up_iter = 1000
    test_iter = 10000

    linears = []
    projs = []
    for _ in range(layer_num):
        proj = torch.randn((output_size, input_size), dtype=torch.float16, device="cuda").to("cpu").contiguous()
        config = cpuinfer_ext.linear.LinearConfig(input_size, output_size, stride, proj.data_ptr(), proj_type, hidden_type)
        linear = cpuinfer_ext.linear.Linear(config)
        projs.append(proj)
        linears.append(linear)

    # validation
    for i in range(validation_iter):
        linear = linears[i % layer_num]
        input = torch.randn((1, input_size), dtype=torch.float16).contiguous()
        output = torch.empty((1, output_size), dtype=torch.float16).contiguous()
        input = input / 100
        CPUInfer.submit(linear.forward, input.data_ptr(), output.data_ptr())
        CPUInfer.sync()
        # print('cpuinfer output', output)
        proj = projs[i % layer_num]
        t_output = torch.mm(input, proj.t())
        # print('torch output', t_output)
        diff = torch.mean(torch.abs(output - t_output)) / torch.mean(torch.abs(t_output))
        print('diff = ', diff)
        assert(diff < 0.001)

    # warm up
    for i in range(warm_up_iter):
        linear = linears[i % layer_num]
        input = torch.randn((1, input_size), dtype=torch.float16).contiguous()
        output = torch.empty((1, output_size), dtype=torch.float16).contiguous()
        input = input / 100
        CPUInfer.submit(linear.forward, input.data_ptr(), output.data_ptr())
        CPUInfer.sync()

    # test
    total_time = 0
    for i in range(test_iter):
        linear = linears[i % layer_num]
        input = torch.randn((1, input_size), dtype=torch.float16).contiguous()
        output = torch.empty((1, output_size), dtype=torch.float16).contiguous()
        input = input / 100
        start = time.perf_counter()
        CPUInfer.submit(linear.forward, input.data_ptr(), output.data_ptr())
        CPUInfer.sync()
        end = time.perf_counter()
        total_time += end - start
    print('Time: ', total_time)
    print('Iteration: ', test_iter)
    print('Time per iteration: ', total_time / test_iter)
    print('Bandwidth: ', input_size * output_size * 2 * test_iter / total_time / 1000 / 1000 / 1000, 'GB/s')
    print("All tasks completed.")