Commit 18c42e67 authored by chenxl

Initial commit
#!/usr/bin/env python
# coding=utf-8
'''
Description :
Author : chenht2022
Date : 2024-07-25 10:32:05
Version : 1.0.0
LastEditors : chenht2022
LastEditTime : 2024-07-25 10:32:57
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
import os, sys
import time
import torch
import torch.nn.quantized as nnq
def act_fn(x):
    # SiLU: x * sigmoid(x)
    return x / (1.0 + torch.exp(-x))

def bench_moe(quant_mode: str):
    with torch.inference_mode(mode=True):
        expert_num = 10
        hidden_size = 5120
        intermediate_size = 1536
        n_routed_experts = 6
        layer_num = 10
        warm_up_iter = 1000
        test_iter = 10000
        if quant_mode == "fp32":
            proj_type = torch.float32
            bytes_per_elem = 4.000000
        elif quant_mode == "fp16":
            proj_type = torch.float16
            bytes_per_elem = 2.000000
        elif quant_mode == "bf16":
            proj_type = torch.bfloat16
            bytes_per_elem = 2.000000
        elif quant_mode == "qint8":
            proj_type = torch.qint8
            bytes_per_elem = 1.000000
        else:
            assert False, f"unsupported quant_mode: {quant_mode}"
        gate_projs = []
        up_projs = []
        down_projs = []
        for _ in range(layer_num):
            gate_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device="cuda").to("cpu").contiguous()
            up_proj = torch.randn((expert_num, intermediate_size, hidden_size), dtype=torch.float32, device="cuda").to("cpu").contiguous()
            down_proj = torch.randn((expert_num, hidden_size, intermediate_size), dtype=torch.float32, device="cuda").to("cpu").contiguous()
            if quant_mode == "qint8":
                scale, zero_point = 0.1, 0  # Adjust scale and zero_point based on your dataset
                quantized_gate_proj = []
                quantized_up_proj = []
                quantized_down_proj = []
                for i in range(expert_num):
                    gate_proj_q = torch.quantize_per_tensor(gate_proj[i], scale, zero_point, torch.qint8)
                    quantized_gate = nnq.Linear(hidden_size, intermediate_size)
                    quantized_gate.set_weight_bias(gate_proj_q, None)
                    quantized_gate_proj.append(quantized_gate)
                    up_proj_q = torch.quantize_per_tensor(up_proj[i], scale, zero_point, torch.qint8)
                    quantized_up = nnq.Linear(hidden_size, intermediate_size)
                    quantized_up.set_weight_bias(up_proj_q, None)
                    quantized_up_proj.append(quantized_up)
                    down_proj_q = torch.quantize_per_tensor(down_proj[i], scale, zero_point, torch.qint8)
                    quantized_down = nnq.Linear(intermediate_size, hidden_size)
                    quantized_down.set_weight_bias(down_proj_q, None)
                    quantized_down_proj.append(quantized_down)
                gate_projs.append(quantized_gate_proj)
                up_projs.append(quantized_up_proj)
                down_projs.append(quantized_down_proj)
            else:
                gate_projs.append(gate_proj.to(proj_type))
                up_projs.append(up_proj.to(proj_type))
                down_projs.append(down_proj.to(proj_type))
        # warm up
        for i in range(warm_up_iter):
            expert_ids = torch.randint(0, expert_num, (n_routed_experts,), dtype=torch.int64).contiguous()
            weights = torch.rand((n_routed_experts,), dtype=torch.float32).contiguous()
            input = torch.randn((1, hidden_size), dtype=torch.float32).contiguous()
            if quant_mode == "qint8":
                input_q = torch.quantize_per_tensor(input, scale, zero_point, torch.quint8)
                t_output = torch.zeros((1, hidden_size), dtype=torch.float32).contiguous()
                gate_proj = gate_projs[i % layer_num]
                up_proj = up_projs[i % layer_num]
                down_proj = down_projs[i % layer_num]
                for i, expert_id in enumerate(expert_ids):
                    quantized_gate = gate_proj[expert_id]
                    gate_buf = quantized_gate(input_q)
                    quantized_up = up_proj[expert_id]
                    up_buf = quantized_up(input_q)
                    gate_buf = gate_buf.dequantize()
                    up_buf = up_buf.dequantize()
                    intermediate = act_fn(gate_buf) * up_buf
                    intermediate_q = torch.quantize_per_tensor(intermediate, scale, zero_point, torch.quint8)
                    quantized_down = down_proj[expert_id]
                    expert_output = quantized_down(intermediate_q)
                    expert_output = expert_output.dequantize()
                    t_output += weights[i] * expert_output
            else:
                t_output = torch.zeros((1, hidden_size), dtype=proj_type).contiguous()
                gate_proj = gate_projs[i % layer_num]
                up_proj = up_projs[i % layer_num]
                down_proj = down_projs[i % layer_num]
                for i, expert_id in enumerate(expert_ids):
                    gate_buf = torch.mm(input.to(proj_type), gate_proj[expert_id].t())
                    up_buf = torch.mm(input.to(proj_type), up_proj[expert_id].t())
                    intermediate = act_fn(gate_buf) * up_buf
                    expert_output = torch.mm(intermediate.to(proj_type), down_proj[expert_id].t())
                    t_output += weights[i] * expert_output
        # test
        total_time = 0
        for i in range(test_iter):
            expert_ids = torch.randint(0, expert_num, (n_routed_experts,), dtype=torch.int64).contiguous()
            weights = torch.rand((n_routed_experts,), dtype=torch.float32).contiguous()
            input = torch.randn((1, hidden_size), dtype=torch.float32).contiguous()
            start = time.perf_counter()
            if quant_mode == "qint8":
                input_q = torch.quantize_per_tensor(input, scale, zero_point, torch.quint8)
                t_output = torch.zeros((1, hidden_size), dtype=torch.float32).contiguous()
                gate_proj = gate_projs[i % layer_num]
                up_proj = up_projs[i % layer_num]
                down_proj = down_projs[i % layer_num]
                for i, expert_id in enumerate(expert_ids):
                    quantized_gate = gate_proj[expert_id]
                    gate_buf = quantized_gate(input_q)
                    quantized_up = up_proj[expert_id]
                    up_buf = quantized_up(input_q)
                    gate_buf = gate_buf.dequantize()
                    up_buf = up_buf.dequantize()
                    intermediate = act_fn(gate_buf) * up_buf
                    intermediate_q = torch.quantize_per_tensor(intermediate, scale, zero_point, torch.quint8)
                    quantized_down = down_proj[expert_id]
                    expert_output = quantized_down(intermediate_q)
                    expert_output = expert_output.dequantize()
                    t_output += weights[i] * expert_output
            else:
                t_output = torch.zeros((1, hidden_size), dtype=proj_type).contiguous()
                gate_proj = gate_projs[i % layer_num]
                up_proj = up_projs[i % layer_num]
                down_proj = down_projs[i % layer_num]
                for i, expert_id in enumerate(expert_ids):
                    gate_buf = torch.mm(input.to(proj_type), gate_proj[expert_id].t())
                    up_buf = torch.mm(input.to(proj_type), up_proj[expert_id].t())
                    intermediate = act_fn(gate_buf) * up_buf
                    expert_output = torch.mm(intermediate.to(proj_type), down_proj[expert_id].t())
                    t_output += weights[i] * expert_output
            end = time.perf_counter()
            total_time += end - start
        print('Quant mode: ', quant_mode)
        print('Time(s): ', total_time)
        print('Iteration: ', test_iter)
        print('Time(us) per iteration: ', total_time / test_iter * 1000000)
        print('Bandwidth: ', hidden_size * intermediate_size * 3 * n_routed_experts * bytes_per_elem * test_iter / total_time / 1000 / 1000 / 1000, 'GB/s')
        print('')

bench_moe("fp32")
bench_moe("fp16")
bench_moe("bf16")
bench_moe("qint8")
/**
* @Description :
* @Author : chenht2022
* @Date : 2024-07-22 02:03:05
* @Version : 1.0.0
* @LastEditors : chenht2022
* @LastEditTime : 2024-07-25 10:33:34
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include "backend.h"
Backend::Backend(int thread_num) {
    thread_num_ = thread_num;
    thread_state_.resize(thread_num);
    for (int i = 0; i < thread_num; i++) {
        thread_state_[i].curr = std::make_unique<std::atomic<int>>();
        thread_state_[i].status = std::make_unique<std::atomic<ThreadStatus>>(ThreadStatus::WAITING);
    }
    workers_.resize(thread_num);
    for (int i = 1; i < thread_num; i++) {
        workers_[i] = std::thread(&Backend::worker_thread, this, i);
    }
}

Backend::~Backend() {
    for (int i = 0; i < thread_num_; i++) {
        thread_state_[i].status->store(ThreadStatus::EXIT, std::memory_order_release);
    }
    for (int i = 1; i < thread_num_; i++) {
        if (workers_[i].joinable()) {
            workers_[i].join();
        }
    }
}

int Backend::get_thread_num() {
    return thread_num_;
}

void Backend::do_work_stealing_job(int task_num, std::function<void(int)> func) {
    func_ = func;
    // Split task_num into contiguous ranges, one per thread; thread 0 is the calling thread.
    int base = task_num / thread_num_;
    int remain = task_num % thread_num_;
    thread_state_[0].end = base + (0 < remain);
    for (int i = 1; i < thread_num_; i++) {
        thread_state_[i].curr->store(thread_state_[i - 1].end, std::memory_order_relaxed);
        thread_state_[i].end = thread_state_[i - 1].end + base + (i < remain);
        thread_state_[i].status->store(ThreadStatus::WORKING, std::memory_order_release);
    }
    thread_state_[0].curr->store(0, std::memory_order_relaxed);
    thread_state_[0].status->store(ThreadStatus::WORKING, std::memory_order_release);
    process_tasks(0);
    // Spin until every worker has drained its range (and any stolen work).
    for (int i = 1; i < thread_num_; i++) {
        while (thread_state_[i].status->load(std::memory_order_acquire) == ThreadStatus::WORKING) {
        }
    }
}

void Backend::process_tasks(int thread_id) {
    // Drain this thread's own range first.
    while (true) {
        int task_id = thread_state_[thread_id].curr->fetch_add(1, std::memory_order_acq_rel);
        if (task_id >= thread_state_[thread_id].end) {
            break;
        }
        func_(task_id);
    }
    // Then steal remaining tasks from the other threads' ranges.
    for (int t_offset = 1; t_offset < thread_num_; t_offset++) {
        int t_i = (thread_id + t_offset) % thread_num_;
        if (thread_state_[t_i].status->load(std::memory_order_acquire) != ThreadStatus::WORKING) {
            continue;
        }
        while (true) {
            int task_id = thread_state_[t_i].curr->fetch_add(1, std::memory_order_acq_rel);
            if (task_id >= thread_state_[t_i].end) {
                break;
            }
            func_(task_id);
        }
    }
    thread_state_[thread_id].status->store(ThreadStatus::WAITING, std::memory_order_release);
}

void Backend::worker_thread(int thread_id) {
    auto start = std::chrono::steady_clock::now();
    while (true) {
        ThreadStatus status = thread_state_[thread_id].status->load(std::memory_order_acquire);
        if (status == ThreadStatus::WORKING) {
            process_tasks(thread_id);
            start = std::chrono::steady_clock::now();
        } else if (status == ThreadStatus::WAITING) {
            // Busy-poll for ~50 ms after the last job, then back off with 1 ms sleeps.
            auto now = std::chrono::steady_clock::now();
            auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(now - start).count();
            if (duration > 50) {
                std::this_thread::sleep_for(std::chrono::milliseconds(1));
            }
        } else if (status == ThreadStatus::EXIT) {
            return;
        }
    }
}
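For context, here is a minimal usage sketch of the work-stealing pool above; it is my own illustration (the array names and sizes are not part of the commit). The caller participates as thread 0, and the lambda is invoked concurrently, so each task should only touch data owned by its task_id.

#include "backend.h"
#include <vector>

int main() {
    Backend backend(8);  // thread 0 is the calling thread, plus 7 worker threads
    std::vector<float> a(1024, 1.0f), b(1024, 2.0f), c(1024);
    // One task per element; tasks left over in other threads' ranges get stolen.
    backend.do_work_stealing_job(1024, [&](int task_id) {
        c[task_id] = a[task_id] + b[task_id];
    });
    return 0;
}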
/**
* @Description :
* @Author : chenht2022
* @Date : 2024-07-22 02:03:05
* @Version : 1.0.0
* @LastEditors : chenht2022
* @LastEditTime : 2024-07-25 10:33:38
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#ifndef CPUINFER_BACKEND_H
#define CPUINFER_BACKEND_H
#include <atomic>
#include <condition_variable>
#include <cstdio>
#include <functional>
#include <mutex>
#include <thread>
#include <vector>
enum ThreadStatus {
    WORKING,
    WAITING,
    EXIT,
};

struct ThreadState {
    std::unique_ptr<std::atomic<ThreadStatus>> status;
    std::unique_ptr<std::atomic<int>> curr;
    int end;
};

class Backend {
   public:
    Backend(int);
    ~Backend();
    int get_thread_num();
    void do_work_stealing_job(int, std::function<void(int)>);

   private:
    int thread_num_;
    std::vector<ThreadState> thread_state_;  // [thread_num]
    std::function<void(int)> func_;
    std::vector<std::thread> workers_;
    void process_tasks(int);
    void worker_thread(int);
};
#endif
/**
* @Description :
* @Author : chenht2022
* @Date : 2024-07-16 10:43:18
* @Version : 1.0.0
* @LastEditors : chenht2022
* @LastEditTime : 2024-07-25 10:33:42
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#ifndef CPUINFER_CPUINFER_H
#define CPUINFER_CPUINFER_H
#include <atomic>
#include <condition_variable>
#include <functional>
#include <mutex>
#include <queue>
#include <thread>
#include <vector>
#include "backend.h"
#include "task_queue.h"
#include "llama.cpp/ggml-impl.h"
class CPUInfer {
   public:
    CPUInfer(int thread_num) {
        backend_ = new Backend(thread_num - 1);
        task_queue_ = new TaskQueue();
        // Pre-compute ggml's host-side fp16 -> fp32 lookup table.
        for (int i = 0; i < (1 << 16); ++i) {
            ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(i);
        }
    }

    ~CPUInfer() {
        delete backend_;
        delete task_queue_;
    }

    template <typename Func, typename Obj, typename... Args>
    void submit(Func f, Obj* obj, Args... args) {
        task_queue_->enqueue([=]() {
            std::invoke(f, *obj, args..., backend_);
        });
    }

    void sync() {
        task_queue_->sync();
    }

   public:
    Backend* backend_;
    TaskQueue* task_queue_;
};
#endif
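To make the submit/sync contract concrete, a hedged sketch follows; the DummyOp class is hypothetical (the real operators live elsewhere in the repository) and the header name is assumed from its include guard. submit() appends the Backend* as the trailing argument, so the submitted member function must accept it last, and sync() blocks until the queued call has run.

#include "cpuinfer.h"

// Hypothetical operator class, for illustration only.
class DummyOp {
   public:
    void forward(const float* input, float* output, Backend* backend) {
        // Parallelize over elements using the work-stealing backend.
        backend->do_work_stealing_job(1024, [&](int i) { output[i] = input[i] * 2.0f; });
    }
};

void example() {
    CPUInfer cpu_infer(48);  // spawns the task-queue thread plus Backend(47)
    DummyOp op;
    static float in[1024], out[1024];
    cpu_infer.submit(&DummyOp::forward, &op, in, out);  // backend_ is appended automatically
    cpu_infer.sync();                                   // wait for the queued call to finish
}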
/**
* @Description :
* @Author : chenht2022
* @Date : 2024-07-17 12:25:51
* @Version : 1.0.0
* @LastEditors : chenht2022
* @LastEditTime : 2024-07-25 10:33:44
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include "task_queue.h"
TaskQueue::TaskQueue() {
    worker = std::thread(&TaskQueue::processTasks, this);
    sync_flag.store(true, std::memory_order_seq_cst);
    exit_flag.store(false, std::memory_order_seq_cst);
}

TaskQueue::~TaskQueue() {
    exit_flag.store(true, std::memory_order_seq_cst);
    if (worker.joinable()) {
        worker.join();
    }
}

void TaskQueue::enqueue(std::function<void()> task) {
    mutex.lock();
    tasks.push(task);
    sync_flag.store(false, std::memory_order_seq_cst);
    mutex.unlock();
}

void TaskQueue::sync() {
    // Spin until the worker reports that the queue has been drained.
    while (!sync_flag.load(std::memory_order_seq_cst))
        ;
}

void TaskQueue::processTasks() {
    while (true) {
        mutex.lock();
        if (tasks.empty()) {
            if (exit_flag.load(std::memory_order_seq_cst)) {
                mutex.unlock();  // release the lock before the worker exits
                return;
            }
            mutex.unlock();
            continue;
        }
        std::function<void()> task = tasks.front();
        mutex.unlock();
        task();
        mutex.lock();
        tasks.pop();
        if (tasks.empty()) {
            sync_flag.store(true, std::memory_order_seq_cst);
        }
        mutex.unlock();
    }
}
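A small standalone sketch of the queue's contract (illustrative, not from the commit): tasks run one at a time on the single worker thread in FIFO order, and sync() spin-waits until the worker has drained the queue.

#include "task_queue.h"
#include <cstdio>

int main() {
    TaskQueue queue;
    int counter = 0;
    // Tasks execute sequentially on the worker thread, so mutating a plain int
    // from consecutive tasks is safe here.
    for (int i = 0; i < 4; i++) {
        queue.enqueue([&counter, i]() { counter += i; });
    }
    queue.sync();  // busy-waits until sync_flag is set by the worker
    std::printf("counter = %d\n", counter);  // 0 + 1 + 2 + 3 = 6
    return 0;
}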
/**
* @Description :
* @Author : chenht2022
* @Date : 2024-07-16 10:43:18
* @Version : 1.0.0
* @LastEditors : chenht2022
* @LastEditTime : 2024-07-25 10:33:47
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#ifndef CPUINFER_TASKQUEUE_H
#define CPUINFER_TASKQUEUE_H
#include <atomic>
#include <condition_variable>
#include <functional>
#include <mutex>
#include <queue>
#include <thread>
#include <vector>
class TaskQueue {
   public:
    TaskQueue();
    ~TaskQueue();
    void enqueue(std::function<void()>);
    void sync();

   private:
    void processTasks();
    std::queue<std::function<void()>> tasks;
    std::thread worker;
    std::mutex mutex;
    std::atomic<bool> sync_flag;
    std::atomic<bool> exit_flag;
};
#endif
/**
* @Description :
* @Author : Azure-Tang
* @Date : 2024-07-25 13:38:30
* @Version : 1.0.0
* @LastEditors : Azure
* @LastEditTime : 2024-07-26 08:36:03
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include "custom_gguf/ops.h"
#include "gptq_marlin/ops.h"
// Python bindings
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <torch/library.h>
#include <torch/extension.h>
#include <torch/torch.h>
// namespace py = pybind11;
PYBIND11_MODULE(KTransformersOps, m) {
    m.def("dequantize_q8_0", &dequantize_q8_0, "Function to dequantize q8_0 data.",
          py::arg("data"), py::arg("blk_size"), py::arg("device"));
    m.def("dequantize_q6_k", &dequantize_q6_k, "Function to dequantize q6_k data.",
          py::arg("data"), py::arg("blk_size"), py::arg("device"));
    m.def("dequantize_q4_k", &dequantize_q4_k, "Function to dequantize q4_k data.",
          py::arg("data"), py::arg("blk_size"), py::arg("device"));
    m.def("gptq_marlin_gemm", &gptq_marlin_gemm, "Function to perform GEMM using Marlin quantization.",
          py::arg("a"), py::arg("b_q_weight"), py::arg("b_scales"), py::arg("g_idx"),
          py::arg("perm"), py::arg("workspace"), py::arg("num_bits"), py::arg("size_m"),
          py::arg("size_n"), py::arg("size_k"), py::arg("is_k_full"));
}
#include "ops.h"
// Python bindings
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <torch/library.h>
#include <torch/extension.h>
#include <torch/torch.h>
// namespace py = pybind11;
int test() {
    return 5;
}

torch::Tensor dequantize_q6_k(torch::Tensor data, int blk_size, torch::Device device);

PYBIND11_MODULE(cudaops, m) {
    m.def("dequantize_q8_0", &dequantize_q8_0, "Function to dequantize q8_0 data.",
          py::arg("data"), py::arg("blk_size"), py::arg("device"));
    m.def("dequantize_q6_k", &dequantize_q6_k, "Function to dequantize q6_k data.",
          py::arg("data"), py::arg("blk_size"), py::arg("device"));
    m.def("dequantize_q4_k", &dequantize_q4_k, "Function to dequantize q4_k data.",
          py::arg("data"), py::arg("blk_size"), py::arg("device"));
    m.def("test", &test, "Function to test.");
}
#include <cuda_fp16.h>
// Decode an IEEE-754 binary16 value stored as raw uint16_t bits into a float.
__host__ __device__ static inline float ggml_compute_fp16_to_fp32(uint16_t h) {
    __half tmp;
    memcpy(&tmp, &h, sizeof(tmp));
    return __half2float(tmp);
}
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)

// Global table for fp16 -> fp32 conversion, indexed by the 16 raw bits.
__device__ float ggml_table_f32_f16[1 << 16];
// Declaration for other translation units that reference the table from device code.
extern __device__ float ggml_table_f32_f16[1 << 16];

// CUDA kernel to init the table (grid-stride loop over all 65536 bit patterns).
__global__ void init_fp16_to_fp32_table() {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (auto blk_id = idx; blk_id < (1 << 16); blk_id += blockDim.x * gridDim.x) {
        ggml_table_f32_f16[blk_id] = GGML_COMPUTE_FP16_TO_FP32(blk_id);
    }
}

// Table-lookup variant, designed to be called from within a CUDA kernel.
#if !defined(GGML_FP16_TO_FP32)
__device__ float ggml_lookup_fp16_to_fp32(uint16_t f) {
    return ggml_table_f32_f16[f];
}
#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
#endif
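A short usage sketch of the lookup-table path (my own illustration; the grid/block sizes and the demo kernel are assumptions): the init kernel must be launched once before any kernel that expands fp16 via GGML_FP16_TO_FP32.

// Illustrative only: populate the device-side table once at startup, then any
// kernel in this translation unit can decode fp16 with a single table load.
__global__ void decode_demo(const uint16_t* in, float* out, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        out[i] = GGML_FP16_TO_FP32(in[i]);  // table lookup instead of bit math
    }
}

void init_tables_once() {
    init_fp16_to_fp32_table<<<256, 256>>>();
    cudaDeviceSynchronize();
}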
/*
* @Description :
* @Author : Azure-Tang, Boxin Zhang
* @Date : 2024-07-25 13:38:30
* @Version : 1.0.0
* @LastEditors : Azure
* @LastEditTime : 2024-07-26 11:58:50
* Adapted from https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c
* Copyright (c) 2023-2024 The ggml authors
* Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
*/
#include <cuda_runtime.h>
#include <torch/library.h>
#include <torch/extension.h>
#include <torch/torch.h>
#include <cstdint>
__global__ void dequantize_q8_0_kernel(float* output, const float* scales, const int8_t* qs, int num_blocks, int blk_size) {
    int global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (auto block_id = global_idx; block_id < num_blocks; block_id += blockDim.x * gridDim.x) {
        for (int i = 0; i < blk_size; i++) {
            float scale = scales[block_id];
            output[block_id * blk_size + i] = scale * qs[block_id * blk_size + i];
        }
    }
}

// __device__ void get_scale_min_k4(int j, const uint8_t * __restrict__ q, uint8_t * __restrict__ d, uint8_t * __restrict__ m) {
__device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t * __restrict__ d, uint8_t * __restrict__ m) {
    if (j < 4) {
        *d = q[j] & 63; *m = q[j + 4] & 63;
    } else {
        *d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4);
        *m = (q[j+4] >>  4) | ((q[j-0] >> 6) << 4);
    }
}

__global__ void dequantize_q4_k_kernel(int8_t* data, float* output, int blk_size, int num_blocks) {
    int global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (auto block_id = global_idx; block_id < num_blocks; block_id += blockDim.x * gridDim.x) {
        float* __restrict__ output_blk = (float*)(output + block_id * 256);
        // const uint8_t * q = data[i].qs;
        const uint8_t * q = (uint8_t*)(data + block_id * 144 + 16);
        const float d = __half2float(*(reinterpret_cast<half*>(data + block_id * 144 + 0)));
        const float min = __half2float(*(reinterpret_cast<half*>(data + block_id * 144 + 2)));
        int is = 0;
        uint8_t sc, m;
        for (int j = 0; j < blk_size; j += 64) {
            uint8_t* scales = (uint8_t*)(data + block_id * 144 + 4);
            get_scale_min_k4(is + 0, scales, &sc, &m);
            const float d1 = d * sc; const float m1 = min * m;
            get_scale_min_k4(is + 1, scales, &sc, &m);
            const float d2 = d * sc; const float m2 = min * m;
            for (int l = 0; l < 32; ++l) *output_blk++ = d1 * (q[l] & 0xF) - m1;
            for (int l = 0; l < 32; ++l) *output_blk++ = d2 * (q[l] >>  4) - m2;
            q += 32; is += 2;
        }
    }
}

__global__ void dequantize_q6_k_kernel(int8_t* data, float* output, int blk_size, int num_blocks) {
    int global_idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (auto block_id = global_idx; block_id < num_blocks; block_id += blockDim.x * gridDim.x) {
        float* __restrict__ output_blk = (float*)(output + block_id * 256);
        const float d = __half2float(*(reinterpret_cast<half*>(data + block_id * blk_size + 208)));
        const uint8_t * __restrict__ ql = (uint8_t*)(data + block_id * blk_size);
        const uint8_t * __restrict__ qh = (uint8_t*)(data + block_id * blk_size + 128);
        const int8_t  * __restrict__ sc = (int8_t*)(data + block_id * blk_size + 192);
        //if (blk_size == 256){
        for (int n = 0; n < blk_size; n += 128) {
            for (int l = 0; l < 32; ++l) {
                int is = l/16;
                const int8_t q1 = (int8_t)((ql[l +  0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
                const int8_t q2 = (int8_t)((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
                const int8_t q3 = (int8_t)((ql[l +  0] >>  4) | (((qh[l] >> 4) & 3) << 4)) - 32;
                const int8_t q4 = (int8_t)((ql[l + 32] >>  4) | (((qh[l] >> 6) & 3) << 4)) - 32;
                output_blk[l +  0] = d * sc[is + 0] * q1;
                output_blk[l + 32] = d * sc[is + 2] * q2;
                output_blk[l + 64] = d * sc[is + 4] * q3;
                output_blk[l + 96] = d * sc[is + 6] * q4;
            }
            output_blk += 128;
            ql += 64;
            qh += 32;
            sc += 8;
        }
    }
}
torch::Tensor dequantize_q8_0(torch::Tensor data, int blk_size, torch::Device device) {
    int num_blocks = data.numel() / blk_size;
    // create gpu buffers
    auto options_scales = torch::TensorOptions().dtype(torch::kFloat32).device(device).memory_format(torch::MemoryFormat::Contiguous);
    auto options_qs = torch::TensorOptions().dtype(torch::kInt8).device(device).memory_format(torch::MemoryFormat::Contiguous);
    auto scales_gpu = torch::empty({num_blocks, 1}, options_scales);
    auto qs_gpu = torch::empty({num_blocks, 32}, options_qs);
    // read on cpu
    options_scales = torch::TensorOptions().dtype(torch::kFloat16).device(torch::kCPU);
    options_qs = torch::TensorOptions().dtype(torch::kInt8).device(torch::kCPU);
    // reinterpret the raw bytes as per-block scale and quant views
    auto scales = torch::from_blob(data.data_ptr(), {num_blocks, 1 + 16}, options_scales).slice(1, 0, 1);
    auto qs = torch::from_blob(data.data_ptr(), {num_blocks, 2 + 32}, options_qs).slice(1, 2);
    auto scales_f32 = scales.to(torch::kFloat32);
    scales_gpu.copy_(scales_f32, false);
    qs_gpu.copy_(qs, false);
    // Create output tensor
    auto output = torch::zeros_like(qs, torch::dtype(torch::kFloat32).device(device));
    // Launch kernel
    dequantize_q8_0_kernel<<< 512, 256 >>>(
        output.data_ptr<float>(), scales_gpu.data_ptr<float>(), qs_gpu.data_ptr<int8_t>(), num_blocks, 32);
    cudaDeviceSynchronize();
    return output;
}

torch::Tensor dequantize_q6_k(torch::Tensor data, int blk_size, torch::Device device) {
    // data.numel() % blk_size should be 0, else raise an error
    int num_blocks = data.numel() / blk_size;
    auto options = torch::TensorOptions().dtype(torch::kInt8).device(device).memory_format(torch::MemoryFormat::Contiguous);
    auto data_gpu = torch::empty({data.numel()}, options);
    data_gpu.copy_(data, false);
    // Create output tensor
    auto output = torch::zeros({num_blocks, 256}, torch::dtype(torch::kFloat32).device(device));
    // Launch kernel
    dequantize_q6_k_kernel<<< 512, 256 >>>(data_gpu.data_ptr<int8_t>(), output.data_ptr<float>(), blk_size, num_blocks);
    // dequantize_q6_k_kernel<<< 512, 256 >>>(data_gpu.data_ptr<int8_t>(), output.data_ptr<float>(), 256, num_blocks);
    cudaDeviceSynchronize();
    return output;
}

torch::Tensor dequantize_q4_k(torch::Tensor data, int blk_size, torch::Device device) {
    // data.numel() % blk_size should be 0, else raise an error
    int num_blocks = data.numel() / blk_size;
    auto options = torch::TensorOptions().dtype(torch::kInt8).device(device).memory_format(torch::MemoryFormat::Contiguous);
    auto data_gpu = torch::empty({data.numel()}, options);
    data_gpu.copy_(data, false);
    // Create output tensor
    auto output = torch::zeros({num_blocks, 256}, torch::dtype(torch::kFloat32).device(device));
    // Launch kernel
    dequantize_q4_k_kernel<<< 512, 256 >>>(data_gpu.data_ptr<int8_t>(), output.data_ptr<float>(), 256, num_blocks);
    cudaDeviceSynchronize();
    return output;
}
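For orientation, a hedged C++ sketch of calling the q8_0 path; the 34-byte block layout (one fp16 scale plus 32 int8 quants) is implied by the from_blob views above, but treat the blk_size value and the buffer construction here as assumptions of this example, not a documented API.

// Illustrative only: feed dequantize_q8_0 a raw byte buffer laid out as
// consecutive block_q8_0 records (2-byte fp16 scale + 32 int8 quants = 34 bytes).
void dequant_example() {
    const int num_blocks = 1024;
    auto raw = torch::zeros({num_blocks * 34}, torch::dtype(torch::kInt8).device(torch::kCPU));
    torch::Tensor w = dequantize_q8_0(raw, /*blk_size=*/34, torch::Device(torch::kCUDA, 0));
    // w has shape [num_blocks, 32] and dtype float32 on the GPU.
}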
/**
* @Description :
* @Author : Azure-Tang
* @Date : 2024-07-22 09:27:55
* @Version : 1.0.0
* @LastEditors : Azure
* @LastEditTime : 2024-07-26 08:38:20
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#pragma once
#include <torch/library.h>
#include <torch/extension.h>
#include <torch/torch.h>
torch::Tensor dequantize_q8_0(torch::Tensor data, int blk_size, torch::Device device);
torch::Tensor dequantize_q6_k(torch::Tensor data, int blk_size, torch::Device device);
torch::Tensor dequantize_q4_k(torch::Tensor data, int blk_size, torch::Device device);
// Adapted from
// https://github.com/vllm-project/vllm/tree/main/csrc/quantization/gptq_marlin
// Copyright 2024 The vLLM team.
// Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
#pragma once
#include <torch/all.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <iostream>
namespace gptq_marlin {

// 8 warps are a good choice since every SM has 4 schedulers and having more
// than 1 warp per schedule allows some more latency hiding. At the same time,
// we want relatively few warps to have many registers per warp and small tiles.
static constexpr int default_threads = 256;

static constexpr int pipe_stages = 4;  // 4 pipeline stages fit into shared memory

static constexpr int min_thread_n = 64;
static constexpr int min_thread_k = 64;

static constexpr int tile_size = 16;
static constexpr int max_par = 16;

template <typename T, int n>
struct Vec {
  T elems[n];
  __device__ T& operator[](int i) { return elems[i]; }
};

using I4 = Vec<int, 4>;

constexpr int div_ceil(int a, int b) { return (a + b - 1) / b; }

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
// No support for async
#else

__device__ inline void cp_async4_pred(void* smem_ptr, const void* glob_ptr,
                                      bool pred = true) {
  const int BYTES = 16;
  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
  asm volatile(
      "{\n"
      "   .reg .pred p;\n"
      "   setp.ne.b32 p, %0, 0;\n"
      "   @p cp.async.cg.shared.global [%1], [%2], %3;\n"
      "}\n" ::"r"((int)pred),
      "r"(smem), "l"(glob_ptr), "n"(BYTES));
}

__device__ inline void cp_async4(void* smem_ptr, const void* glob_ptr) {
  const int BYTES = 16;
  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
  asm volatile(
      "{\n"
      "   cp.async.cg.shared.global [%0], [%1], %2;\n"
      "}\n" ::"r"(smem),
      "l"(glob_ptr), "n"(BYTES));
}

__device__ inline void cp_async_fence() {
  asm volatile("cp.async.commit_group;\n" ::);
}

template <int n>
__device__ inline void cp_async_wait() {
  asm volatile("cp.async.wait_group %0;\n" ::"n"(n));
}

#endif

}  // namespace gptq_marlin
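As a reading aid, here is a minimal sketch of how these cp.async helpers are typically combined (my own example, assuming an sm_80+ build and a block size of at most default_threads): issue 16-byte global-to-shared copies, commit them as a group, then wait before consuming the staged data.

#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
__global__ void async_stage_demo(const gptq_marlin::I4* __restrict__ src,
                                 gptq_marlin::I4* __restrict__ dst, int n) {
  __shared__ gptq_marlin::I4 stage[gptq_marlin::default_threads];
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    // One cp.async.cg moves 16 bytes (one I4) from global into shared memory.
    gptq_marlin::cp_async4(&stage[threadIdx.x], &src[i]);
  }
  gptq_marlin::cp_async_fence();    // commit the outstanding copies as one group
  gptq_marlin::cp_async_wait<0>();  // wait until no committed group is pending
  __syncthreads();
  if (i < n) {
    dst[i] = stage[threadIdx.x];
  }
}
#endif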
// Adapted from
// https://github.com/vllm-project/vllm/tree/main/csrc/quantization/gptq_marlin
// Copyright 2024 The vLLM team.
// Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
#ifndef _data_types_cuh
#define _data_types_cuh
#include "gptq_marlin.cuh"
#include <cuda_fp16.h>
#include <cuda_bf16.h>
namespace gptq_marlin {

template <typename scalar_t>
class ScalarType {};

template <>
class ScalarType<half> {
 public:
  using scalar_t = half;
  using scalar_t2 = half2;

  // Matrix fragments for tensor core instructions; their precise layout is
  // documented here:
  // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-floating-point-type
  using FragA = Vec<half2, 4>;
  using FragB = Vec<half2, 2>;
  using FragC = Vec<float, 4>;
  using FragS = Vec<half2, 1>;

  static __device__ float inline num2float(const half x) {
    return __half2float(x);
  }

  static __device__ half2 inline num2num2(const half x) {
    return __half2half2(x);
  }

  static __device__ half2 inline nums2num2(const half x1, const half x2) {
    return __halves2half2(x1, x2);
  }

  static __host__ __device__ half inline float2num(const float x) {
    return __float2half(x);
  }
};

template <>
class ScalarType<nv_bfloat16> {
 public:
  using scalar_t = nv_bfloat16;
  using scalar_t2 = nv_bfloat162;

  using FragA = Vec<nv_bfloat162, 4>;
  using FragB = Vec<nv_bfloat162, 2>;
  using FragC = Vec<float, 4>;
  using FragS = Vec<nv_bfloat162, 1>;

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
  static __device__ float inline num2float(const nv_bfloat16 x) {
    return __bfloat162float(x);
  }

  static __device__ nv_bfloat162 inline num2num2(const nv_bfloat16 x) {
    return __bfloat162bfloat162(x);
  }

  static __device__ nv_bfloat162 inline nums2num2(const nv_bfloat16 x1,
                                                  const nv_bfloat16 x2) {
    return __halves2bfloat162(x1, x2);
  }

  static __host__ __device__ nv_bfloat16 inline float2num(const float x) {
    return __float2bfloat16(x);
  }
#endif
};

}  // namespace gptq_marlin
#endif
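A hedged sketch of the intended dispatch pattern (my own example, not kernel code from this commit): templating a device function on scalar_t lets one body run on half and nv_bfloat16 through the ScalarType traits above.

template <typename scalar_t>
__global__ void axpy_packed(typename gptq_marlin::ScalarType<scalar_t>::scalar_t2* y,
                            const typename gptq_marlin::ScalarType<scalar_t>::scalar_t2* x,
                            float alpha, int n2) {
  using W = gptq_marlin::ScalarType<scalar_t>;
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n2) {
    // Broadcast the scalar into a packed pair, then use packed fma.
    typename W::scalar_t2 a2 = W::num2num2(W::float2num(alpha));
    y[i] = __hfma2(a2, x[i], y[i]);
  }
}
// Callers instantiate explicitly, e.g. axpy_packed<half>; the nv_bfloat16
// instantiation requires an sm_80+ build because of the #if guard above.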
/**
* @Description :
* @Author : Azure
* @Date : 2024-07-22 09:27:55
* @Version : 1.0.0
* @LastEditors : Azure
* @LastEditTime : 2024-07-26 08:35:00
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#pragma once
#include <torch/library.h>
#include <torch/extension.h>
#include <torch/torch.h>
torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
torch::Tensor& b_scales, torch::Tensor& g_idx,
torch::Tensor& perm, torch::Tensor& workspace,
int64_t num_bits, int64_t size_m, int64_t size_n,
int64_t size_k, bool is_k_full);
// torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
// int64_t size_k, int64_t size_n,
// int64_t num_bits);
from setuptools import setup, Extension
from torch.utils import cpp_extension
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
# setup marlin gemm
setup(
    name='KTransformersOps',
    ext_modules=[
        CUDAExtension('KTransformersOps', [
            'custom_gguf/dequant.cu',
            'binding.cpp',
            'gptq_marlin/gptq_marlin.cu',
            # 'gptq_marlin_repack.cu',
        ])
    ],
    cmdclass={'build_ext': BuildExtension}
)
#!/usr/bin/env python
# coding=utf-8
'''
Description :
Author : chenht2022
Date : 2024-07-25 10:32:05
Version : 1.0.0
LastEditors : chenht2022
LastEditTime : 2024-07-25 10:34:00
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
import os, sys
import time
sys.path.append(os.path.dirname(__file__) + '/../build')
import cpuinfer_ext
import torch
with torch.inference_mode(mode=True):
    input_size = 16384
    output_size = 5120
    stride = 32
    proj_type = 1  # ggml_type::GGML_TYPE_F16
    hidden_type = 1  # ggml_type::GGML_TYPE_F16
    layer_num = 10
    CPUInfer = cpuinfer_ext.CPUInfer(48)
    validation_iter = 100
    warm_up_iter = 1000
    test_iter = 10000

    linears = []
    projs = []
    for _ in range(layer_num):
        proj = torch.randn((output_size, input_size), dtype=torch.float16, device="cuda").to("cpu").contiguous()
        config = cpuinfer_ext.linear.LinearConfig(input_size, output_size, stride, proj.data_ptr(), proj_type, hidden_type)
        linear = cpuinfer_ext.linear.Linear(config)
        projs.append(proj)
        linears.append(linear)

    # validation
    for i in range(validation_iter):
        linear = linears[i % layer_num]
        input = torch.randn((1, input_size), dtype=torch.float16).contiguous()
        output = torch.empty((1, output_size), dtype=torch.float16).contiguous()
        input = input / 100
        CPUInfer.submit(linear.forward, input.data_ptr(), output.data_ptr())
        CPUInfer.sync()
        # print('cpuinfer output', output)
        proj = projs[i % layer_num]
        t_output = torch.mm(input, proj.t())
        # print('torch output', t_output)
        diff = torch.mean(torch.abs(output - t_output)) / torch.mean(torch.abs(t_output))
        print('diff = ', diff)
        assert(diff < 0.001)

    # warm up
    for i in range(warm_up_iter):
        linear = linears[i % layer_num]
        input = torch.randn((1, input_size), dtype=torch.float16).contiguous()
        output = torch.empty((1, output_size), dtype=torch.float16).contiguous()
        input = input / 100
        CPUInfer.submit(linear.forward, input.data_ptr(), output.data_ptr())
        CPUInfer.sync()

    # test
    total_time = 0
    for i in range(test_iter):
        linear = linears[i % layer_num]
        input = torch.randn((1, input_size), dtype=torch.float16).contiguous()
        output = torch.empty((1, output_size), dtype=torch.float16).contiguous()
        input = input / 100
        start = time.perf_counter()
        CPUInfer.submit(linear.forward, input.data_ptr(), output.data_ptr())
        CPUInfer.sync()
        end = time.perf_counter()
        total_time += end - start
    print('Time: ', total_time)
    print('Iteration: ', test_iter)
    print('Time per iteration: ', total_time / test_iter)
    print('Bandwidth: ', input_size * output_size * 2 * test_iter / total_time / 1000 / 1000 / 1000, 'GB/s')
    print("All tasks completed.")