"docs/vscode:/vscode.git/clone" did not exist on "9242e7e65d3c1c841333ece7b7469cd677100e34"
Commit f5f79f5c authored by chenxl

[ADD] support multi-gpu qlen>1 q5_k

parent f2938031
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
**/ **/
#include "moe.h" #include "moe.h"
#include <iostream> #include <iostream>
#include "unistd.h" #include <cstdint>
MOE::MOE(MOEConfig config) { MOE::MOE(MOEConfig config) {
config_ = config; config_ = config;
...@@ -60,7 +60,7 @@ MOE::MOE(MOEConfig config) { ...@@ -60,7 +60,7 @@ MOE::MOE(MOEConfig config) {
m_local_pos_.resize(config_.group_max_len); m_local_pos_.resize(config_.group_max_len);
for (int i = 0; i < config_.group_max_len; i++) { for (int i = 0; i < config_.group_max_len; i++) {
m_local_pos_[i].reserve(config_.expert_num); m_local_pos_[i].resize(config_.routed_expert_num);
} }
m_local_num_.resize(config_.expert_num); m_local_num_.resize(config_.expert_num);
m_local_gate_input_ptr_.resize(config_.expert_num); m_local_gate_input_ptr_.resize(config_.expert_num);
...@@ -125,10 +125,10 @@ void MOE::forward_one(int k, const uint64_t* expert_ids, const float* weights, c ...@@ -125,10 +125,10 @@ void MOE::forward_one(int k, const uint64_t* expert_ids, const float* weights, c
int expert_idx = task_id / nth; int expert_idx = task_id / nth;
uint64_t expert_id = expert_ids[expert_idx]; uint64_t expert_id = expert_ids[expert_idx];
int ith = task_id % nth; int ith = task_id % nth;
void* gate_proj_ptr = gate_proj_ + (expert_id * config_.intermediate_size + ith * config_.stride) * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type); void* gate_proj_ptr = (uint8_t*)gate_proj_ + (expert_id * config_.intermediate_size + ith * config_.stride) * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type);
float* gate_output_ptr = s_gate_output_[expert_idx] + ith * config_.stride; float* gate_output_ptr = s_gate_output_[expert_idx] + ith * config_.stride;
llamafile_sgemm(config_.stride, 1, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_proj_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_input_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_output_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.gate_type, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT); llamafile_sgemm(config_.stride, 1, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_proj_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_input_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_output_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.gate_type, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
void* up_proj_ptr = up_proj_ + (expert_id * config_.intermediate_size + ith * config_.stride) * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type); void* up_proj_ptr = (uint8_t*)up_proj_ + (expert_id * config_.intermediate_size + ith * config_.stride) * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type);
float* up_output_ptr = s_up_output_[expert_idx] + ith * config_.stride; float* up_output_ptr = s_up_output_[expert_idx] + ith * config_.stride;
llamafile_sgemm(config_.stride, 1, config_.hidden_size / ggml_blck_size(config_.up_type), up_proj_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_input_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_output_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.up_type, ggml_internal_get_type_traits(config_.up_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT); llamafile_sgemm(config_.stride, 1, config_.hidden_size / ggml_blck_size(config_.up_type), up_proj_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_input_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_output_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.up_type, ggml_internal_get_type_traits(config_.up_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
for (int i = ith * config_.stride; i < (ith + 1) * config_.stride; i++) { for (int i = ith * config_.stride; i < (ith + 1) * config_.stride; i++) {
...@@ -153,7 +153,7 @@ void MOE::forward_one(int k, const uint64_t* expert_ids, const float* weights, c ...@@ -153,7 +153,7 @@ void MOE::forward_one(int k, const uint64_t* expert_ids, const float* weights, c
} }
for (int expert_idx = 0; expert_idx < k; expert_idx++) { for (int expert_idx = 0; expert_idx < k; expert_idx++) {
uint64_t expert_id = expert_ids[expert_idx]; uint64_t expert_id = expert_ids[expert_idx];
void* down_proj_ptr = down_proj_ + (expert_id * config_.hidden_size + ith * config_.stride) * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type); void* down_proj_ptr = (uint8_t*)down_proj_ + (expert_id * config_.hidden_size + ith * config_.stride) * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type);
float* down_output_ptr = s_down_output_[expert_idx] + ith * config_.stride; float* down_output_ptr = s_down_output_[expert_idx] + ith * config_.stride;
llamafile_sgemm(config_.stride, 1, config_.intermediate_size / ggml_blck_size(config_.down_type), down_proj_ptr, config_.intermediate_size / ggml_blck_size(config_.down_type), s_down_input_[expert_idx], config_.intermediate_size / ggml_blck_size(config_.down_type), down_output_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.down_type, ggml_internal_get_type_traits(config_.down_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT); llamafile_sgemm(config_.stride, 1, config_.intermediate_size / ggml_blck_size(config_.down_type), down_proj_ptr, config_.intermediate_size / ggml_blck_size(config_.down_type), s_down_input_[expert_idx], config_.intermediate_size / ggml_blck_size(config_.down_type), down_output_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.down_type, ggml_internal_get_type_traits(config_.down_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
for (int i = ith * config_.stride; i < (ith + 1) * config_.stride; i++) { for (int i = ith * config_.stride; i < (ith + 1) * config_.stride; i++) {
...@@ -162,7 +162,7 @@ void MOE::forward_one(int k, const uint64_t* expert_ids, const float* weights, c ...@@ -162,7 +162,7 @@ void MOE::forward_one(int k, const uint64_t* expert_ids, const float* weights, c
} }
if (config_.stride % ggml_blck_size(config_.hidden_type) == 0) { if (config_.stride % ggml_blck_size(config_.hidden_type) == 0) {
float* output_fp32_ptr = s_output_fp32_ + ith * config_.stride; float* output_fp32_ptr = s_output_fp32_ + ith * config_.stride;
void* output_ptr = output + ith * config_.stride * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type); void* output_ptr = (uint8_t*)output + ith * config_.stride * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
from_float(output_fp32_ptr, output_ptr, config_.stride, config_.hidden_type); from_float(output_fp32_ptr, output_ptr, config_.stride, config_.hidden_type);
} }
}); });
...@@ -195,9 +195,9 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float* ...@@ -195,9 +195,9 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float*
const void* gate_input_ptr; const void* gate_input_ptr;
const void* up_input_ptr; const void* up_input_ptr;
if (config_.hidden_type == ggml_internal_get_type_traits(config_.gate_type).vec_dot_type && config_.hidden_type == ggml_internal_get_type_traits(config_.up_type).vec_dot_type) { if (config_.hidden_type == ggml_internal_get_type_traits(config_.gate_type).vec_dot_type && config_.hidden_type == ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
gate_input_ptr = up_input_ptr = input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type); gate_input_ptr = up_input_ptr = (uint8_t*)input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
} else { } else {
to_float(input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), m_input_fp32_[i], config_.hidden_size, config_.hidden_type); to_float((uint8_t*)input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), m_input_fp32_[i], config_.hidden_size, config_.hidden_type);
if (ggml_internal_get_type_traits(config_.gate_type).vec_dot_type == ggml_internal_get_type_traits(config_.up_type).vec_dot_type) { if (ggml_internal_get_type_traits(config_.gate_type).vec_dot_type == ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
from_float(m_input_fp32_[i], m_gate_input_[i], config_.hidden_size, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type); from_float(m_input_fp32_[i], m_gate_input_[i], config_.hidden_size, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
gate_input_ptr = up_input_ptr = m_gate_input_[i]; gate_input_ptr = up_input_ptr = m_gate_input_[i];
...@@ -206,13 +206,13 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float* ...@@ -206,13 +206,13 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float*
from_float(m_input_fp32_[i], m_gate_input_[i], config_.hidden_size, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type); from_float(m_input_fp32_[i], m_gate_input_[i], config_.hidden_size, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
gate_input_ptr = m_gate_input_[i]; gate_input_ptr = m_gate_input_[i];
} else { } else {
gate_input_ptr = input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type); gate_input_ptr = (uint8_t*)input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
} }
if (config_.hidden_type != ggml_internal_get_type_traits(config_.up_type).vec_dot_type) { if (config_.hidden_type != ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
from_float(m_input_fp32_[i], m_up_input_[i], config_.hidden_size, ggml_internal_get_type_traits(config_.up_type).vec_dot_type); from_float(m_input_fp32_[i], m_up_input_[i], config_.hidden_size, ggml_internal_get_type_traits(config_.up_type).vec_dot_type);
up_input_ptr = m_up_input_[i]; up_input_ptr = m_up_input_[i];
} else { } else {
up_input_ptr = input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type); up_input_ptr = (uint8_t*)input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
} }
} }
} }
...@@ -227,11 +227,11 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float* ...@@ -227,11 +227,11 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float*
int expert_idx = task_id / nth; int expert_idx = task_id / nth;
int ith = task_id % nth; int ith = task_id % nth;
void* gate_input_ptr = m_local_gate_input_ptr_[expert_idx]; void* gate_input_ptr = m_local_gate_input_ptr_[expert_idx];
void* gate_proj_ptr = gate_proj_ + (expert_idx * config_.intermediate_size + ith * stride) * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type); void* gate_proj_ptr = (uint8_t*)gate_proj_ + (expert_idx * config_.intermediate_size + ith * stride) * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type);
float* gate_output_ptr = m_local_gate_output_ptr_[expert_idx] + ith * stride; float* gate_output_ptr = m_local_gate_output_ptr_[expert_idx] + ith * stride;
llamafile_sgemm(stride, m_local_num_[expert_idx], config_.hidden_size / ggml_blck_size(config_.gate_type), gate_proj_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_input_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_output_ptr, config_.intermediate_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.gate_type, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT); llamafile_sgemm(stride, m_local_num_[expert_idx], config_.hidden_size / ggml_blck_size(config_.gate_type), gate_proj_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_input_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_output_ptr, config_.intermediate_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.gate_type, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
void* up_input_ptr = m_local_up_input_ptr_[expert_idx]; void* up_input_ptr = m_local_up_input_ptr_[expert_idx];
void* up_proj_ptr = up_proj_ + (expert_idx * config_.intermediate_size + ith * stride) * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type); void* up_proj_ptr = (uint8_t*)up_proj_ + (expert_idx * config_.intermediate_size + ith * stride) * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type);
float* up_output_ptr = m_local_up_output_ptr_[expert_idx] + ith * stride; float* up_output_ptr = m_local_up_output_ptr_[expert_idx] + ith * stride;
llamafile_sgemm(stride, m_local_num_[expert_idx], config_.hidden_size / ggml_blck_size(config_.up_type), up_proj_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_input_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_output_ptr, config_.intermediate_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.up_type, ggml_internal_get_type_traits(config_.up_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT); llamafile_sgemm(stride, m_local_num_[expert_idx], config_.hidden_size / ggml_blck_size(config_.up_type), up_proj_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_input_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_output_ptr, config_.intermediate_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.up_type, ggml_internal_get_type_traits(config_.up_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
for (int i = 0; i < m_local_num_[expert_idx]; i++) { for (int i = 0; i < m_local_num_[expert_idx]; i++) {
...@@ -249,7 +249,7 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float* ...@@ -249,7 +249,7 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float*
int expert_idx = task_id / nth; int expert_idx = task_id / nth;
int ith = task_id % nth; int ith = task_id % nth;
void* down_input_ptr = m_local_down_input_ptr_[expert_idx]; void* down_input_ptr = m_local_down_input_ptr_[expert_idx];
void* down_proj_ptr = down_proj_ + (expert_idx * config_.hidden_size + ith * stride) * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type); void* down_proj_ptr = (uint8_t*)down_proj_ + (expert_idx * config_.hidden_size + ith * stride) * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type);
float* down_output_ptr = m_local_down_output_ptr_[expert_idx] + ith * stride; float* down_output_ptr = m_local_down_output_ptr_[expert_idx] + ith * stride;
llamafile_sgemm(stride, m_local_num_[expert_idx], config_.intermediate_size / ggml_blck_size(config_.down_type), down_proj_ptr, config_.intermediate_size / ggml_blck_size(config_.down_type), down_input_ptr, config_.intermediate_size / ggml_blck_size(config_.down_type), down_output_ptr, config_.hidden_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.down_type, ggml_internal_get_type_traits(config_.down_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT); llamafile_sgemm(stride, m_local_num_[expert_idx], config_.intermediate_size / ggml_blck_size(config_.down_type), down_proj_ptr, config_.intermediate_size / ggml_blck_size(config_.down_type), down_input_ptr, config_.intermediate_size / ggml_blck_size(config_.down_type), down_output_ptr, config_.hidden_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.down_type, ggml_internal_get_type_traits(config_.down_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
}); });
...@@ -262,18 +262,18 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float* ...@@ -262,18 +262,18 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float*
m_output_fp32_[i][e] += m_local_down_output_ptr_[expert_ids[i * k + j]][m_local_pos_[i][j] * config_.hidden_size + e] * weights[i * k + j]; m_output_fp32_[i][e] += m_local_down_output_ptr_[expert_ids[i * k + j]][m_local_pos_[i][j] * config_.hidden_size + e] * weights[i * k + j];
} }
} }
from_float(m_output_fp32_[i], output + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), config_.hidden_size, config_.hidden_type); from_float(m_output_fp32_[i], (uint8_t*)output + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), config_.hidden_size, config_.hidden_type);
}); });
} }
void MOE::forward(int qlen, int k, const uint64_t* expert_ids, const float* weights, const void* input, void* output, Backend* backend) { void MOE::forward(int qlen, int k, const uint64_t* expert_ids, const float* weights, const void* input, void* output, Backend* backend) {
if (qlen < config_.group_min_len) { if (qlen < config_.group_min_len) {
for (int i = 0; i < qlen; i++) { for (int i = 0; i < qlen; i++) {
forward_one(k, expert_ids + i * k, weights + i * k, input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), output + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), backend); forward_one(k, expert_ids + i * k, weights + i * k, (uint8_t*)input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), (uint8_t*)output + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), backend);
} }
return; return;
} }
int forward_len = std::min(config_.group_max_len, qlen); int forward_len = std::min(config_.group_max_len, qlen);
forward_many(forward_len, k, expert_ids, weights, input, output, backend); forward_many(forward_len, k, expert_ids, weights, input, output, backend);
forward(qlen - forward_len, k, expert_ids + forward_len * k, weights + forward_len * k, input + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), output + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), backend); forward(qlen - forward_len, k, expert_ids + forward_len * k, weights + forward_len * k, (uint8_t*)input + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), (uint8_t*)output + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), backend);
} }
\ No newline at end of file
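The recurring change in moe.cpp is casting the raw weight and output pointers to uint8_t* before adding byte offsets, since arithmetic on void* is a GNU extension rather than standard C++; the offset expressions themselves are unchanged. A minimal Python sketch of that offset arithmetic, with Q5_K-style block parameters that are assumptions rather than values taken from this diff:

def expert_slice_offset(expert_id: int, ith: int, stride: int,
                        intermediate_size: int, hidden_size: int,
                        type_size: int, blck_size: int) -> int:
    # Byte offset of thread `ith`'s slice of expert `expert_id` inside a
    # block-quantized [expert, intermediate, hidden] projection tensor,
    # mirroring (expert_id * intermediate_size + ith * stride) * hidden_size
    # * ggml_type_size / ggml_blck_size from forward_one().
    rows_skipped = expert_id * intermediate_size + ith * stride
    return rows_skipped * hidden_size * type_size // blck_size

# Illustrative Q5_K-like parameters (assumed: 256-element blocks of 176 bytes).
offset = expert_slice_offset(expert_id=3, ith=2, stride=64,
                             intermediate_size=1536, hidden_size=5120,
                             type_size=176, blck_size=256)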
...@@ -49,7 +49,7 @@ void SharedMemBuffer::dealloc(void* object) { ...@@ -49,7 +49,7 @@ void SharedMemBuffer::dealloc(void* object) {
void SharedMemBuffer::arrange(std::vector<std::pair<void**, uint64_t>> requests) { void SharedMemBuffer::arrange(std::vector<std::pair<void**, uint64_t>> requests) {
uint64_t offset = 0; uint64_t offset = 0;
for (auto& request : requests) { for (auto& request : requests) {
*(request.first) = buffer_ + offset; *(request.first) = (uint8_t*)buffer_ + offset;
offset += request.second; offset += request.second;
} }
} }
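SharedMemBuffer::arrange hands each requester a sub-range of one backing allocation by walking a byte offset, which is why the same uint8_t* cast shows up here. A rough Python analogue (hypothetical, not the project's API):

def arrange(buffer: bytearray, sizes: list[int]) -> list[memoryview]:
    # Carve consecutive sub-buffers out of one backing allocation.
    views, offset = [], 0
    for size in sizes:
        views.append(memoryview(buffer)[offset:offset + size])
        offset += size
    assert offset <= len(buffer), "backing buffer too small"
    return views

gate_buf, up_buf, down_buf = arrange(bytearray(4096), [1024, 1024, 2048])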
...@@ -31,18 +31,21 @@ import fire ...@@ -31,18 +31,21 @@ import fire
from ktransformers.optimize.optimize import optimize_and_load_gguf from ktransformers.optimize.optimize import optimize_and_load_gguf
from ktransformers.models.modeling_deepseek import DeepseekV2ForCausalLM from ktransformers.models.modeling_deepseek import DeepseekV2ForCausalLM
from ktransformers.models.modeling_qwen2_moe import Qwen2MoeForCausalLM from ktransformers.models.modeling_qwen2_moe import Qwen2MoeForCausalLM
from ktransformers.models.modeling_mixtral import MixtralForCausalLM
from ktransformers.util.utils import prefill_and_generate from ktransformers.util.utils import prefill_and_generate
from ktransformers.server.config.config import Config from ktransformers.server.config.config import Config
custom_models = { custom_models = {
"DeepseekV2ForCausalLM": DeepseekV2ForCausalLM, "DeepseekV2ForCausalLM": DeepseekV2ForCausalLM,
"Qwen2MoeForCausalLM": Qwen2MoeForCausalLM, "Qwen2MoeForCausalLM": Qwen2MoeForCausalLM,
"MixtralForCausalLM": MixtralForCausalLM,
} }
ktransformer_rules_dir = os.path.dirname(os.path.abspath(__file__)) + "/optimize/optimize_rules/" ktransformer_rules_dir = os.path.dirname(os.path.abspath(__file__)) + "/optimize/optimize_rules/"
default_optimize_rules ={ default_optimize_rules ={
"DeepseekV2ForCausalLM": ktransformer_rules_dir + "DeepSeek-V2-Chat.yaml", "DeepseekV2ForCausalLM": ktransformer_rules_dir + "DeepSeek-V2-Chat.yaml",
"Qwen2MoeForCausalLM": ktransformer_rules_dir + "Qwen2-57B-A14B-Instruct.yaml", "Qwen2MoeForCausalLM": ktransformer_rules_dir + "Qwen2-57B-A14B-Instruct.yaml",
"MixtralForCausalLM": ktransformer_rules_dir + "Mixtral.yaml",
} }
def local_chat( def local_chat(
...@@ -50,7 +53,8 @@ def local_chat( ...@@ -50,7 +53,8 @@ def local_chat(
optimize_rule_path: str = None, optimize_rule_path: str = None,
gguf_path: str = None, gguf_path: str = None,
max_new_tokens: int = 1000, max_new_tokens: int = 1000,
cpu_infer: int = Config().cpu_infer cpu_infer: int = Config().cpu_infer,
use_cuda_graph: bool = True,
): ):
torch.set_grad_enabled(False) torch.set_grad_enabled(False)
...@@ -64,6 +68,8 @@ def local_chat( ...@@ -64,6 +68,8 @@ def local_chat(
print("using custom modeling_xxx.py.") print("using custom modeling_xxx.py.")
if "Qwen2Moe" in config.architectures[0]: # Qwen2Moe must use flash_attention_2 to avoid overflow. if "Qwen2Moe" in config.architectures[0]: # Qwen2Moe must use flash_attention_2 to avoid overflow.
config._attn_implementation = "flash_attention_2" config._attn_implementation = "flash_attention_2"
if "Mixtral" in config.architectures[0]:
config._attn_implementation = "flash_attention_2"
model = custom_models[config.architectures[0]](config) model = custom_models[config.architectures[0]](config)
else: else:
model = AutoModelForCausalLM.from_config( model = AutoModelForCausalLM.from_config(
...@@ -100,7 +106,6 @@ def local_chat( ...@@ -100,7 +106,6 @@ def local_chat(
while True: while True:
content = input("Chat: ") content = input("Chat: ")
# if content is num
if content == "": if content == "":
content = "Please write a piece of quicksort code in C++." content = "Please write a piece of quicksort code in C++."
...@@ -109,7 +114,7 @@ def local_chat( ...@@ -109,7 +114,7 @@ def local_chat(
messages, add_generation_prompt=True, return_tensors="pt" messages, add_generation_prompt=True, return_tensors="pt"
) )
torch.set_default_dtype(torch.bfloat16) # TODO: Remove this, replace dtype using config torch.set_default_dtype(torch.bfloat16) # TODO: Remove this, replace dtype using config
generated = prefill_and_generate(model, tokenizer, input_tensor.cuda(), max_new_tokens) generated = prefill_and_generate(model, tokenizer, input_tensor.cuda(), max_new_tokens, use_cuda_graph)
if __name__ == "__main__": if __name__ == "__main__":
fire.Fire(local_chat) fire.Fire(local_chat)
\ No newline at end of file
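With Mixtral registered in custom_models and default_optimize_rules, and the new use_cuda_graph flag threaded through to prefill_and_generate, the entry point can be driven directly from Python as well as via fire. A hedged usage sketch; the module path and the keyword names outside this hunk (such as model_path) are assumptions:

from ktransformers.local_chat import local_chat  # assumed module location

local_chat(
    model_path="mistralai/Mixtral-8x7B-Instruct-v0.1",  # hypothetical checkpoint path
    gguf_path="/models/mixtral-q5_k.gguf",              # hypothetical GGUF file
    max_new_tokens=512,
    use_cuda_graph=False,  # new flag: disable CUDA-graph decoding, e.g. for debugging
)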
...@@ -22,13 +22,14 @@ class StaticCache(transformers.StaticCache): ...@@ -22,13 +22,14 @@ class StaticCache(transformers.StaticCache):
The maximum batch size with which the model will be used. The maximum batch size with which the model will be used.
max_cache_len (`int`): max_cache_len (`int`):
The maximum sequence length with which the model will be used. The maximum sequence length with which the model will be used.
device (`torch.device`): device (`torch.device` or `dict`):
The device on which the cache should be initialized. Should be the same as the layer. The device on which the cache should be initialized. Should be the same as the layer.
If a `dict`, it should map each layer's key (e.g. `blk.{idx}.self_attn`) to a dict whose `generate_device` entry names that layer's device.
dtype (*optional*, defaults to `torch.float32`): dtype (*optional*, defaults to `torch.float32`):
The default `dtype` to use when initializing the layer. The default `dtype` to use when initializing the layer.
""" """
def __init__(self, config: PretrainedConfig, max_batch_size: int, max_cache_len: int, device, dtype=None) -> None: def __init__(self, config: PretrainedConfig, max_batch_size: int, max_cache_len: int, device: torch.device| dict, dtype=None) -> None:
Cache.__init__(self) Cache.__init__(self)
self.max_batch_size = max_batch_size self.max_batch_size = max_batch_size
self.max_cache_len = config.max_position_embeddings if max_cache_len is None else max_cache_len self.max_cache_len = config.max_position_embeddings if max_cache_len is None else max_cache_len
...@@ -46,6 +47,7 @@ class StaticCache(transformers.StaticCache): ...@@ -46,6 +47,7 @@ class StaticCache(transformers.StaticCache):
self.value_cache: List[torch.Tensor] = [] self.value_cache: List[torch.Tensor] = []
cache_shape = (max_batch_size, self.num_key_value_heads, self.max_cache_len, self.head_dim) cache_shape = (max_batch_size, self.num_key_value_heads, self.max_cache_len, self.head_dim)
if config.architectures[0] == "DeepseekV2ForCausalLM": if config.architectures[0] == "DeepseekV2ForCausalLM":
# TODO: for deepseek, cache_shape is different whether using Absorbed MLA, check it automatically
# key_shape = (max_batch_size, self.num_key_value_heads, self.max_cache_len, config.qk_rope_head_dim + config.qk_nope_head_dim) # key_shape = (max_batch_size, self.num_key_value_heads, self.max_cache_len, config.qk_rope_head_dim + config.qk_nope_head_dim)
# value_shape = (max_batch_size, self.num_key_value_heads, self.max_cache_len, config.v_head_dim) # value_shape = (max_batch_size, self.num_key_value_heads, self.max_cache_len, config.v_head_dim)
key_shape = (max_batch_size, 1, self.max_cache_len, config.qk_rope_head_dim) key_shape = (max_batch_size, 1, self.max_cache_len, config.qk_rope_head_dim)
...@@ -56,11 +58,15 @@ class StaticCache(transformers.StaticCache): ...@@ -56,11 +58,15 @@ class StaticCache(transformers.StaticCache):
self.past_tokens = [] self.past_tokens = []
self.num_hidden_layers = config.num_hidden_layers self.num_hidden_layers = config.num_hidden_layers
for _ in range(self.num_hidden_layers): for idx in range(self.num_hidden_layers):
# Note: `mark_static_address` is used to tag the cache as an fixed data pointer, preventing cuda graph # Note: `mark_static_address` is used to tag the cache as an fixed data pointer, preventing cuda graph
# breaks when updating the cache. # breaks when updating the cache.
new_layer_key_cache = torch.zeros(key_shape, dtype=self.dtype, device=device) if isinstance(device, dict):
new_layer_value_cache = torch.zeros(value_shape, dtype=self.dtype, device=device) target_device = device[f"blk.{idx}.self_attn"]["generate_device"]
else:
target_device = device
new_layer_key_cache = torch.zeros(key_shape, dtype=self.dtype, device=target_device)
new_layer_value_cache = torch.zeros(value_shape, dtype=self.dtype, device=target_device)
torch._dynamo.mark_static_address(new_layer_key_cache) torch._dynamo.mark_static_address(new_layer_key_cache)
torch._dynamo.mark_static_address(new_layer_value_cache) torch._dynamo.mark_static_address(new_layer_value_cache)
self.key_cache.append(new_layer_key_cache) self.key_cache.append(new_layer_key_cache)
......
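StaticCache now accepts either a single device or a per-layer device map keyed like blk.{idx}.self_attn, allocating each layer's KV tensors on that layer's generate_device. A condensed sketch of the selection logic; the shapes are placeholders:

import torch

def alloc_kv_cache(device, num_hidden_layers: int, key_shape, value_shape,
                   dtype=torch.float16):
    # Allocate per-layer KV tensors, honoring a per-layer device map if given.
    key_cache, value_cache = [], []
    for idx in range(num_hidden_layers):
        if isinstance(device, dict):
            target = device[f"blk.{idx}.self_attn"]["generate_device"]
        else:
            target = device
        key_cache.append(torch.zeros(key_shape, dtype=dtype, device=target))
        value_cache.append(torch.zeros(value_shape, dtype=dtype, device=target))
    return key_cache, value_cache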
...@@ -1048,7 +1048,7 @@ class DeepseekV2FlashAttention2(DeepseekV2Attention): ...@@ -1048,7 +1048,7 @@ class DeepseekV2FlashAttention2(DeepseekV2Attention):
""" """
Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
first unpad the input, then computes the attention scores and pad the final attention scores. first unpad the input, then computes the attention scores and pad the final attention scores.
Args: # Args:
query_states (`torch.Tensor`): query_states (`torch.Tensor`):
Input query states to be passed to Flash Attention API Input query states to be passed to Flash Attention API
key_states (`torch.Tensor`): key_states (`torch.Tensor`):
...@@ -1245,12 +1245,14 @@ class DeepseekV2DecoderLayer(nn.Module): ...@@ -1245,12 +1245,14 @@ class DeepseekV2DecoderLayer(nn.Module):
cache_position=cache_position, cache_position=cache_position,
**kwargs, **kwargs,
) )
hidden_states = residual + hidden_states hidden_states = residual + hidden_states
# Fully Connected # Fully Connected
residual = hidden_states residual = hidden_states
hidden_states = self.post_attention_layernorm(hidden_states) hidden_states = self.post_attention_layernorm(hidden_states)
hidden_states = self.mlp(hidden_states) hidden_states = self.mlp(hidden_states)
hidden_states = residual + hidden_states hidden_states = residual + hidden_states
outputs = (hidden_states,) outputs = (hidden_states,)
......
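The touched decoder-layer lines only reflow whitespace around the usual pre-norm residual pattern; for orientation, a generic sketch of that pattern (module names are placeholders, not this model's exact classes):

def decoder_block(hidden_states, self_attn, post_attention_layernorm, mlp):
    # Attention sub-block with residual (the attention input norm is elided here).
    residual = hidden_states
    hidden_states = residual + self_attn(hidden_states)
    # Fully connected sub-block with residual.
    residual = hidden_states
    hidden_states = post_attention_layernorm(hidden_states)
    hidden_states = residual + mlp(hidden_states)
    return hidden_states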
...@@ -10,6 +10,7 @@ from ktransformers.operators.base_operator import BaseInjectedModule ...@@ -10,6 +10,7 @@ from ktransformers.operators.base_operator import BaseInjectedModule
from ktransformers.util.custom_gguf import GGUFLoader from ktransformers.util.custom_gguf import GGUFLoader
from ktransformers.util.utils import InferenceState from ktransformers.util.utils import InferenceState
from transformers.configuration_utils import PretrainedConfig from transformers.configuration_utils import PretrainedConfig
# Copied from transformers.models.mixtral.modeling_mixtral.MixtralRotaryEmbedding with Mixtral->Qwen2Moe # Copied from transformers.models.mixtral.modeling_mixtral.MixtralRotaryEmbedding with Mixtral->Qwen2Moe
class RotaryEmbedding(BaseInjectedModule, DeepseekV2RotaryEmbedding): class RotaryEmbedding(BaseInjectedModule, DeepseekV2RotaryEmbedding):
def __init__(self, def __init__(self,
...@@ -17,12 +18,16 @@ class RotaryEmbedding(BaseInjectedModule, DeepseekV2RotaryEmbedding): ...@@ -17,12 +18,16 @@ class RotaryEmbedding(BaseInjectedModule, DeepseekV2RotaryEmbedding):
gguf_loader : GGUFLoader, gguf_loader : GGUFLoader,
config: PretrainedConfig, config: PretrainedConfig,
orig_module: nn.Module, orig_module: nn.Module,
device: str = "cuda", # device: str = "cuda",
generate_device: str = "cuda",
prefill_device: str = "cuda",
**kwargs): **kwargs):
BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, device, **kwargs) BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs)
self.orig_module.__init__(orig_module.dim, self.orig_module.__init__(orig_module.dim,
orig_module.max_position_embeddings, orig_module.max_position_embeddings,
orig_module.base) orig_module.base)
self.generate_device = generate_device
self.prefill_device = prefill_device
def load(self): def load(self):
self.orig_module.__init__(self.orig_module.dim, self.orig_module.__init__(self.orig_module.dim,
...@@ -36,9 +41,11 @@ class YarnRotaryEmbedding(BaseInjectedModule, DeepseekV2YarnRotaryEmbedding): ...@@ -36,9 +41,11 @@ class YarnRotaryEmbedding(BaseInjectedModule, DeepseekV2YarnRotaryEmbedding):
gguf_loader : GGUFLoader, gguf_loader : GGUFLoader,
config: PretrainedConfig, config: PretrainedConfig,
orig_module: nn.Module, orig_module: nn.Module,
device: str = "cuda", # device: str = "cuda",
generate_device: str = "cuda",
prefill_device: str = "cuda",
**kwargs): **kwargs):
BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, device, **kwargs) BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs)
self.orig_module.__init__(orig_module.dim, self.orig_module.__init__(orig_module.dim,
orig_module.max_position_embeddings, orig_module.max_position_embeddings,
orig_module.base, orig_module.base,
...@@ -49,13 +56,15 @@ class YarnRotaryEmbedding(BaseInjectedModule, DeepseekV2YarnRotaryEmbedding): ...@@ -49,13 +56,15 @@ class YarnRotaryEmbedding(BaseInjectedModule, DeepseekV2YarnRotaryEmbedding):
orig_module.beta_slow, orig_module.beta_slow,
orig_module.mscale, orig_module.mscale,
orig_module.mscale_all_dim) orig_module.mscale_all_dim)
self.generate_device = generate_device
self.prefill_device = prefill_device
def load(self): def load(self):
self.orig_module.__init__(self.orig_module.dim, self.orig_module.__init__(self.orig_module.dim,
self.orig_module.max_position_embeddings, self.orig_module.max_position_embeddings,
self.orig_module.base, self.orig_module.base,
self.device, self.generate_device,
self.orig_module.scaling_factor, self.orig_module.scaling_factor,
self.orig_module.original_max_position_embeddings, self.orig_module.original_max_position_embeddings,
self.orig_module.beta_fast, self.orig_module.beta_fast,
......
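The rotary-embedding wrappers (like the other injected operators in this commit) drop the single device argument in favour of explicit generate_device/prefill_device, initialising the wrapped module on the generate device. A stripped-down sketch of that constructor pattern; the class here is illustrative, not the repo's:

class InjectedOperator:
    # Illustrative generate/prefill device split used by the injected operators.
    def __init__(self, orig_module, generate_device: str = "cuda",
                 prefill_device: str = "cuda", **kwargs):
        self.orig_module = orig_module
        self.generate_device = generate_device  # device used during decoding
        self.prefill_device = prefill_device    # device used during prompt prefill

    def device_for(self, prefilling: bool) -> str:
        return self.prefill_device if prefilling else self.generate_device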
...@@ -6,7 +6,7 @@ Author : Azure-Tang ...@@ -6,7 +6,7 @@ Author : Azure-Tang
Date : 2024-07-25 11:25:24 Date : 2024-07-25 11:25:24
Version : 1.0.0 Version : 1.0.0
LastEditors : Azure LastEditors : Azure
LastEditTime : 2024-07-26 09:27:48 LastEditTime : 2024-08-08 10:09:14
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
''' '''
...@@ -45,6 +45,8 @@ from ktransformers.models.modeling_deepseek import BaseModelOutputWithPast, Deep ...@@ -45,6 +45,8 @@ from ktransformers.models.modeling_deepseek import BaseModelOutputWithPast, Deep
from transformers.models.qwen2_moe.configuration_qwen2_moe import Qwen2MoeConfig from transformers.models.qwen2_moe.configuration_qwen2_moe import Qwen2MoeConfig
from ktransformers.operators.base_operator import BaseInjectedModule from ktransformers.operators.base_operator import BaseInjectedModule
from ktransformers.util.utils import InferenceState from ktransformers.util.utils import InferenceState
from ktransformers.util.custom_gguf import GGUFLoader
from transformers.configuration_utils import PretrainedConfig
if is_flash_attn_2_available(): if is_flash_attn_2_available():
from flash_attn import flash_attn_func, flash_attn_varlen_func from flash_attn import flash_attn_func, flash_attn_varlen_func
...@@ -73,34 +75,6 @@ QWEN2MOE_START_DOCSTRING = r""" ...@@ -73,34 +75,6 @@ QWEN2MOE_START_DOCSTRING = r"""
[`~PreTrainedModel.from_pretrained`] method to load the model weights. [`~PreTrainedModel.from_pretrained`] method to load the model weights.
""" """
@add_start_docstrings(
"The bare Qwen2MoE Model outputting raw hidden-states without any specific head on top.",
QWEN2MOE_START_DOCSTRING,
)
class Qwen2MoePreTrainedModel(PreTrainedModel):
config_class = Qwen2MoeConfig
base_model_prefix = "model"
supports_gradient_checkpointing = True
_no_split_modules = ["Qwen2MoeDecoderLayer"]
_skip_keys_device_placement = "past_key_values"
_supports_flash_attn_2 = True
_supports_sdpa = True
_supports_cache_class = True
_supports_static_cache = True
def _init_weights(self, module):
std = self.config.initializer_range
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
QWEN2MOE_INPUTS_DOCSTRING = r""" QWEN2MOE_INPUTS_DOCSTRING = r"""
Args: Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
...@@ -177,13 +151,11 @@ QWEN2MOE_INPUTS_DOCSTRING = r""" ...@@ -177,13 +151,11 @@ QWEN2MOE_INPUTS_DOCSTRING = r"""
the complete sequence length. the complete sequence length.
""" """
from ktransformers.util.custom_gguf import GGUFLoader
from transformers.configuration_utils import PretrainedConfig
@add_start_docstrings( @add_start_docstrings(
"The bare Qwen2MoE Model outputting raw hidden-states without any specific head on top.", "The bare Qwen2MoE Model outputting raw hidden-states without any specific head on top.",
QWEN2MOE_START_DOCSTRING, QWEN2MOE_START_DOCSTRING,
) )
class Qwen2MoeModelPerLayerPrefill(BaseInjectedModule): class Qwen2MoeModelKTransformers(BaseInjectedModule):
""" """
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Qwen2MoeDecoderLayer`] Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Qwen2MoeDecoderLayer`]
...@@ -198,10 +170,13 @@ class Qwen2MoeModelPerLayerPrefill(BaseInjectedModule): ...@@ -198,10 +170,13 @@ class Qwen2MoeModelPerLayerPrefill(BaseInjectedModule):
orig_module: nn.Module, orig_module: nn.Module,
device: str = "cuda", device: str = "cuda",
per_layer_prefill_intput_threshold: int = 30000, # if None, no per-layer prefill per_layer_prefill_intput_threshold: int = 30000, # if None, no per-layer prefill
transfer_map: dict = None,
**kwargs, **kwargs,
): ):
BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, device, **kwargs) BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, device, **kwargs)
self.per_layer_prefill_intput_threshold = per_layer_prefill_intput_threshold self.per_layer_prefill_intput_threshold = per_layer_prefill_intput_threshold
self.transfer_map = transfer_map
self.stream_device_map = dict()
@add_start_docstrings_to_model_forward(QWEN2MOE_INPUTS_DOCSTRING) @add_start_docstrings_to_model_forward(QWEN2MOE_INPUTS_DOCSTRING)
def forward( def forward(
...@@ -287,7 +262,20 @@ class Qwen2MoeModelPerLayerPrefill(BaseInjectedModule): ...@@ -287,7 +262,20 @@ class Qwen2MoeModelPerLayerPrefill(BaseInjectedModule):
all_router_logits = () if output_router_logits else None all_router_logits = () if output_router_logits else None
next_decoder_cache = None next_decoder_cache = None
for decoder_layer in self.layers: for i, decoder_layer in enumerate(self.layers):
if self.transfer_map is not None and i in self.transfer_map:
prev_stream = torch.cuda.current_stream()
cur_device = self.transfer_map[i]
if cur_device not in self.stream_device_map:
self.stream_device_map[cur_device] = torch.cuda.Stream(cur_device)
torch.cuda.set_device(cur_device)
self.stream_device_map[cur_device].wait_stream(prev_stream)
torch.cuda.set_stream(self.stream_device_map[cur_device])
hidden_states = hidden_states.to(self.transfer_map[i], non_blocking = True)
causal_mask = causal_mask.to(self.transfer_map[i], non_blocking = True) if causal_mask is not None else None
position_ids = position_ids.to(self.transfer_map[i], non_blocking = True) if position_ids is not None else None
cache_position = cache_position.to(self.transfer_map[i], non_blocking = True) if cache_position is not None else None
if output_hidden_states: if output_hidden_states:
all_hidden_states += (hidden_states,) all_hidden_states += (hidden_states,)
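The new transfer_map lets the rewritten Qwen2MoE model (and the DeepseekV2 model below) hop to a different GPU at chosen layer indices: one stream per target device is created lazily, synchronised with the previous stream, and the activations, mask and position tensors are moved over with non-blocking copies. A self-contained sketch of the same idea; the layer loop and map contents are stand-ins:

import torch
import torch.nn as nn

def run_layers(layers: nn.ModuleList, hidden_states: torch.Tensor,
               transfer_map: dict | None = None):
    # Hop devices at the layer indices listed in transfer_map, e.g. {10: "cuda:1"}.
    streams: dict = {}
    for i, layer in enumerate(layers):
        if transfer_map is not None and i in transfer_map:
            prev_stream = torch.cuda.current_stream()
            device = transfer_map[i]
            if device not in streams:
                streams[device] = torch.cuda.Stream(device)
            torch.cuda.set_device(device)
            streams[device].wait_stream(prev_stream)   # order after work on the old GPU
            torch.cuda.set_stream(streams[device])
            hidden_states = hidden_states.to(device, non_blocking=True)
        hidden_states = layer(hidden_states)
    return hidden_states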
...@@ -463,7 +451,7 @@ DeepseekV2_INPUTS_DOCSTRING = r""" ...@@ -463,7 +451,7 @@ DeepseekV2_INPUTS_DOCSTRING = r"""
""" """
class DeepseekV2ModelPerLayerPrefill(BaseInjectedModule): class DeepseekV2ModelKTransformers(BaseInjectedModule):
""" """
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`DeepseekV2DecoderLayer`] Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`DeepseekV2DecoderLayer`]
...@@ -478,10 +466,13 @@ class DeepseekV2ModelPerLayerPrefill(BaseInjectedModule): ...@@ -478,10 +466,13 @@ class DeepseekV2ModelPerLayerPrefill(BaseInjectedModule):
orig_module: nn.Module, orig_module: nn.Module,
device: str = "cuda", device: str = "cuda",
per_layer_prefill_intput_threshold: int = 30000, # if None, no per-layer prefill per_layer_prefill_intput_threshold: int = 30000, # if None, no per-layer prefill
transfer_map: dict = None,
**kwargs, **kwargs,
): ):
BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, device, **kwargs) BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, device, **kwargs)
self.per_layer_prefill_intput_threshold = per_layer_prefill_intput_threshold self.per_layer_prefill_intput_threshold = per_layer_prefill_intput_threshold
self.transfer_map = transfer_map
self.stream_device_map = dict()
@add_start_docstrings_to_model_forward(DeepseekV2_INPUTS_DOCSTRING) @add_start_docstrings_to_model_forward(DeepseekV2_INPUTS_DOCSTRING)
def forward( def forward(
...@@ -584,7 +575,20 @@ class DeepseekV2ModelPerLayerPrefill(BaseInjectedModule): ...@@ -584,7 +575,20 @@ class DeepseekV2ModelPerLayerPrefill(BaseInjectedModule):
t_cpu = 0 t_cpu = 0
t_f = 0 t_f = 0
for decoder_layer in self.layers: for i, decoder_layer in enumerate(self.layers):
if self.transfer_map is not None and i in self.transfer_map:
prev_stream = torch.cuda.current_stream()
cur_device = self.transfer_map[i]
if cur_device not in self.stream_device_map:
self.stream_device_map[cur_device] = torch.cuda.Stream(cur_device)
torch.cuda.set_device(cur_device)
self.stream_device_map[cur_device].wait_stream(prev_stream)
torch.cuda.set_stream(self.stream_device_map[cur_device])
hidden_states = hidden_states.to(self.transfer_map[i], non_blocking = True)
causal_mask = causal_mask.to(self.transfer_map[i], non_blocking = True) if causal_mask is not None else None
position_ids = position_ids.to(self.transfer_map[i], non_blocking = True) if position_ids is not None else None
cache_position = cache_position.to(self.transfer_map[i], non_blocking = True) if cache_position is not None else None
if output_hidden_states: if output_hidden_states:
all_hidden_states += (hidden_states,) all_hidden_states += (hidden_states,)
......
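Both rewritten model wrappers take the same transfer_map, keyed by the first layer index that should run on each subsequent device. A map consistent with the four-GPU layer split in the rules below might look as follows; how the map is actually populated is not shown in this diff, so treat this as an assumption:

# Hypothetical transfer_map for a 60-layer model split across four GPUs,
# matching the cuda:0 / cuda:1 / cuda:2 / cuda:3 layer ranges in the YAML rules.
transfer_map = {10: "cuda:1", 20: "cuda:2", 30: "cuda:3"}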
...@@ -176,7 +176,7 @@ class QuantizedLinearMarlin(QuantizedLinearBase): ...@@ -176,7 +176,7 @@ class QuantizedLinearMarlin(QuantizedLinearBase):
self.act_order = act_order self.act_order = act_order
self.is_k_full = is_k_full self.is_k_full = is_k_full
def load(self, w: dict | nn.Parameter | tuple | None = None, device: str|None = "cuda"): def load(self, w: dict | nn.Parameter | tuple | None = None, device: str|None = None):
if device is None: device = self.device if device is None: device = self.device
assert device.lower() != "cpu", "Marlin quantized linear only supports GPU device" assert device.lower() != "cpu", "Marlin quantized linear only supports GPU device"
if w is None: w = self.load_weight(device=device) if w is None: w = self.load_weight(device=device)
...@@ -200,7 +200,7 @@ class QuantizedLinearMarlin(QuantizedLinearBase): ...@@ -200,7 +200,7 @@ class QuantizedLinearMarlin(QuantizedLinearBase):
weight, self.num_bits, self.group_size, self.act_order weight, self.num_bits, self.group_size, self.act_order
) )
self.workspace = MarlinWorkspace( self.workspace = MarlinWorkspace(
self.out_features, GPTQ_MARLIN_MIN_THREAD_N, GPTQ_MARLIN_MAX_PARALLEL self.out_features, GPTQ_MARLIN_MIN_THREAD_N, GPTQ_MARLIN_MAX_PARALLEL,self.device
) )
self.marlin_q_w = marlin_q_w self.marlin_q_w = marlin_q_w
self.marlin_s = marlin_s self.marlin_s = marlin_s
...@@ -247,7 +247,6 @@ class QuantizedLinearMarlin(QuantizedLinearBase): ...@@ -247,7 +247,6 @@ class QuantizedLinearMarlin(QuantizedLinearBase):
LINEAR_MAP = { LINEAR_MAP = {
"QuantizedLinearMarlin": QuantizedLinearMarlin, "QuantizedLinearMarlin": QuantizedLinearMarlin,
"QuantizedLinearTorch": QuantizedLinearTorch, "QuantizedLinearTorch": QuantizedLinearTorch,
"QuantizedLinearTorch": QuantizedLinearTorch,
} }
class KTransformerLinear(BaseInjectedModule, QuantizedLinearBase): class KTransformerLinear(BaseInjectedModule, QuantizedLinearBase):
...@@ -257,15 +256,15 @@ class KTransformerLinear(BaseInjectedModule, QuantizedLinearBase): ...@@ -257,15 +256,15 @@ class KTransformerLinear(BaseInjectedModule, QuantizedLinearBase):
gguf_loader: GGUFLoader, gguf_loader: GGUFLoader,
config: PretrainedConfig, config: PretrainedConfig,
orig_module: nn.Module, orig_module: nn.Module,
device: str = "cuda", # device: str = "cuda",
generate_device: str = "cuda", generate_device: str = "cuda",
generate_op: str| None = "QuantizedLinearMarlin", generate_op: str| None = "QuantizedLinearMarlin",
prefill_device: str = "cuda", prefill_device: str = "cuda",
prefill_op: str| None = "QuantizedLinearTorch", prefill_op: str| None = "QuantizedLinearTorch",
**kwargs, **kwargs,
): ):
BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, device, **kwargs) BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs)
QuantizedLinearBase.__init__(self, key, gguf_loader, config, orig_module, device, **kwargs) QuantizedLinearBase.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs)
# build all the linear operators # build all the linear operators
if prefill_op is not None: if prefill_op is not None:
assert prefill_op in LINEAR_MAP, f"linear_type {prefill_op} not supported" assert prefill_op in LINEAR_MAP, f"linear_type {prefill_op} not supported"
...@@ -289,7 +288,6 @@ class KTransformerLinear(BaseInjectedModule, QuantizedLinearBase): ...@@ -289,7 +288,6 @@ class KTransformerLinear(BaseInjectedModule, QuantizedLinearBase):
self.generate_linear = LINEAR_MAP[generate_op](key, gguf_loader, config, orig_module, generate_device, **kwargs) self.generate_linear = LINEAR_MAP[generate_op](key, gguf_loader, config, orig_module, generate_device, **kwargs)
else: else:
self.generate_linear = None self.generate_linear = None
self.device = device
self.mode = InferenceState.UNLOAD self.mode = InferenceState.UNLOAD
def forward(self, x): def forward(self, x):
......
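KTransformerLinear now keys everything off generate_device/prefill_device and keeps one backend linear per phase, while the Marlin path pins its workspace to the module's own device. The forward body is elided in this diff, so the dispatch below is only a hypothetical sketch of how such a per-phase wrapper could route calls:

import torch.nn as nn

class PhasedLinear(nn.Module):
    # Hypothetical per-phase dispatch; not the repo's actual forward implementation.
    def __init__(self, generate_linear: nn.Module, prefill_linear: nn.Module):
        super().__init__()
        self.generate_linear = generate_linear
        self.prefill_linear = prefill_linear
        self.prefilling = False  # toggled by the caller when switching phases

    def forward(self, x):
        linear = self.prefill_linear if self.prefilling else self.generate_linear
        return linear(x)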
''' '''
Description : Description :
Author : Boxin Zhang Author : Boxin Zhang, Azure-Tang
Version : 0.1.0 Version : 0.1.0
Copyright (c) 2024 by KVCache.AI, All Rights Reserved. Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
''' '''
...@@ -15,6 +15,7 @@ from transformers.configuration_utils import PretrainedConfig ...@@ -15,6 +15,7 @@ from transformers.configuration_utils import PretrainedConfig
from ktransformers.util.custom_gguf import GGUFLoader, translate_name_to_gguf from ktransformers.util.custom_gguf import GGUFLoader, translate_name_to_gguf
from ktransformers.util.utils import set_module, load_weights from ktransformers.util.utils import set_module, load_weights
import itertools import itertools
import copy
def inject(module, local_optimization_dict, model_config:AutoConfig ,gguf_loader:GGUFLoader, prefix=''): def inject(module, local_optimization_dict, model_config:AutoConfig ,gguf_loader:GGUFLoader, prefix=''):
for name, child in module._modules.items(): for name, child in module._modules.items():
...@@ -22,18 +23,20 @@ def inject(module, local_optimization_dict, model_config:AutoConfig ,gguf_loader ...@@ -22,18 +23,20 @@ def inject(module, local_optimization_dict, model_config:AutoConfig ,gguf_loader
child_prefix = prefix + name child_prefix = prefix + name
if child_prefix in local_optimization_dict: if child_prefix in local_optimization_dict:
inject_module_meta=local_optimization_dict[child_prefix] inject_module_meta=local_optimization_dict[child_prefix]
if isinstance(inject_module_meta, Mapping): if inject_module_meta["class"] != "default":
import_path = inject_module_meta["class"].split(".") import_path = inject_module_meta["class"].split(".")
import_module_name = ".".join(import_path[:-1]) import_module_name = ".".join(import_path[:-1])
gguf_loader.tensor_device_map[inject_module_meta["key"]] = inject_module_meta["kwargs"] if "kwargs" in inject_module_meta else dict()
import_class_name = import_path[-1] import_class_name = import_path[-1]
module_cls=getattr(__import__(import_module_name, fromlist=[""]), import_class_name) module_cls=getattr(__import__(import_module_name, fromlist=[""]), import_class_name)
print(f"Injecting {child_prefix} as", import_module_name, ".", import_class_name) print(f"Injecting {child_prefix} as", import_module_name, ".", import_class_name)
inject_module=module_cls(key = inject_module_meta["key"], gguf_loader = gguf_loader, config = model_config, orig_module=child, device = inject_module_meta["device"], **inject_module_meta["kwargs"]) inject_module=module_cls(key = inject_module_meta["key"], gguf_loader = gguf_loader, config = model_config, orig_module=child, **inject_module_meta["kwargs"])
set_module(module, name, inject_module) set_module(module, name, inject_module)
elif isinstance(inject_module_meta, str): elif inject_module_meta["class"] == "default":
assert inject_module_meta=="default", "for str inject_module_meta, only support \"default\"." print(f"Injecting {child_prefix} as default")
gguf_loader.tensor_device_map[inject_module_meta["key"]] = inject_module_meta["kwargs"] if "kwargs" in inject_module_meta else dict()
else: else:
raise Exception("inject_module_meta must be a dict or str") raise Exception("inject_module_meta[\"class\"] must be \"default\" or a class path")
child_prefix += "." child_prefix += "."
child_optimization_dict = {k: v for k, v in local_optimization_dict.items() if k.startswith(child_prefix)} child_optimization_dict = {k: v for k, v in local_optimization_dict.items() if k.startswith(child_prefix)}
inject(child, child_optimization_dict, model_config, gguf_loader, child_prefix) inject(child, child_optimization_dict, model_config, gguf_loader, child_prefix)
...@@ -57,6 +60,8 @@ def gen_optimize_config(module: nn.Module, out_data: Mapping, rule_list: List, p ...@@ -57,6 +60,8 @@ def gen_optimize_config(module: nn.Module, out_data: Mapping, rule_list: List, p
for rule in rule_list: for rule in rule_list:
#print(rule) #print(rule)
match_meta = rule["match"] match_meta = rule["match"]
if "class" not in match_meta and "name" not in match_meta:
raise Exception("match must have at least one of \"class\" and \"name\"")
if "class" in match_meta: if "class" in match_meta:
import_path = match_meta["class"].split(".") import_path = match_meta["class"].split(".")
import_module_name = ".".join(import_path[:-1]) import_module_name = ".".join(import_path[:-1])
...@@ -67,16 +72,29 @@ def gen_optimize_config(module: nn.Module, out_data: Mapping, rule_list: List, p ...@@ -67,16 +72,29 @@ def gen_optimize_config(module: nn.Module, out_data: Mapping, rule_list: List, p
if "name" in match_meta: if "name" in match_meta:
if re.search(match_meta["name"], module_name) is None: if re.search(match_meta["name"], module_name) is None:
continue continue
replace_meta = rule["replace"] if "replace" not in rule:
out_data[module_name]={"key": translated_name, raise Exception("replace must be in rule")
"class": replace_meta["class"], if "replace" in rule:
"device": replace_meta["device"] if "device" in replace_meta else default_device, replace_meta = rule["replace"]
"kwargs": replace_meta["kwargs"] if "kwargs" in replace_meta else dict()} if module_name not in out_data:
out_data[module_name]={"key": translated_name,
"class": replace_meta["class"] if "class" in replace_meta else "default",
# "device": replace_meta["device"] if "device" in replace_meta else default_device,
"kwargs": copy.deepcopy(replace_meta["kwargs"]) if "kwargs" in replace_meta else dict()}
else:
if out_data[module_name]["class"] == "default":
out_data[module_name]["class"] = replace_meta["class"] if "class" in replace_meta else "default"
out_data[module_name]["kwargs"].update(copy.deepcopy(replace_meta["kwargs"]) if "kwargs" in replace_meta else dict())
if "recursive" in rule: if "recursive" in rule:
recursive = bool(rule["recursive"]) recursive = bool(rule["recursive"])
if module_name not in out_data: if module_name not in out_data:
out_data[module_name]="default" out_data[module_name]= {
"class": "default",
"key": translated_name,
"kwargs": {"generate_device": default_device,
"prefill_device": default_device}
}
#print(out_data[module_name]) #print(out_data[module_name])
#input() #input()
...@@ -88,6 +106,14 @@ def gen_optimize_config(module: nn.Module, out_data: Mapping, rule_list: List, p ...@@ -88,6 +106,14 @@ def gen_optimize_config(module: nn.Module, out_data: Mapping, rule_list: List, p
gen_optimize_config(child, out_data, rule_list, child_prefix) gen_optimize_config(child, out_data, rule_list, child_prefix)
def translate_model_config(model_config: PretrainedConfig):
# for supporting some special model
if model_config.model_type == "mixtral":
model_config.moe_intermediate_size = model_config.intermediate_size
return model_config
def optimize_and_load_gguf(module: nn.Module, rule_file: str, gguf_path: str, model_config: PretrainedConfig, default_device: str = "cuda:0"): def optimize_and_load_gguf(module: nn.Module, rule_file: str, gguf_path: str, model_config: PretrainedConfig, default_device: str = "cuda:0"):
with open(rule_file, 'r', encoding='utf-8') as f: with open(rule_file, 'r', encoding='utf-8') as f:
rule_list = yaml.load(f.read(), Loader=yaml.FullLoader) rule_list = yaml.load(f.read(), Loader=yaml.FullLoader)
...@@ -95,8 +121,11 @@ def optimize_and_load_gguf(module: nn.Module, rule_file: str, gguf_path: str, mo ...@@ -95,8 +121,11 @@ def optimize_and_load_gguf(module: nn.Module, rule_file: str, gguf_path: str, mo
optimize_config = dict() optimize_config = dict()
gen_optimize_config(module, optimize_config, rule_list, default_device = default_device) gen_optimize_config(module, optimize_config, rule_list, default_device = default_device)
model_config = translate_model_config(model_config)
gguf_loader=GGUFLoader(gguf_path) gguf_loader=GGUFLoader(gguf_path)
with torch.device("meta"): with torch.device("meta"):
inject(module, optimize_config, model_config, gguf_loader) inject(module, optimize_config, model_config, gguf_loader)
load_weights(module, gguf_loader) load_weights(module, gguf_loader)
model_config.gguf_loader = gguf_loader
del_meta(module) del_meta(module)
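gen_optimize_config now allows several rules to contribute to one module: the first matching rule creates the entry, later rules may fill in a still-"default" class and have their kwargs merged in, and unmatched modules fall back to a default entry carrying the default generate/prefill device. A condensed sketch of that merge behaviour; the rule list and module names are made up:

import copy
import re

def apply_rules(module_names, rules, default_device: str = "cuda:0"):
    # Condensed version of the rule matching/merging; only name-based matches shown.
    out = {}
    for name in module_names:
        for rule in rules:
            if re.search(rule["match"]["name"], name) is None:
                continue
            replace = rule.get("replace", {})
            if name not in out:
                out[name] = {"class": replace.get("class", "default"),
                             "kwargs": copy.deepcopy(replace.get("kwargs", {}))}
            else:
                if out[name]["class"] == "default":
                    out[name]["class"] = replace.get("class", "default")
                out[name]["kwargs"].update(copy.deepcopy(replace.get("kwargs", {})))
        out.setdefault(name, {"class": "default",
                              "kwargs": {"generate_device": default_device,
                                         "prefill_device": default_device}})
    return out

rules = [{"match": {"name": r"^model\.layers\.([0-9])\."},
          "replace": {"kwargs": {"generate_device": "cuda:0",
                                 "prefill_device": "cuda:0"}}}]
print(apply_rules(["model.layers.3.mlp"], rules))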
- match:
name: "^model\\.layers\\.([0-9])\\."
replace:
class: "default"
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
- match:
name: "(^model\\.layers\\.([1][0-9])\\.)"
replace:
class: "default"
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
- match:
name: "(^model\\.layers\\.([2][0-9])\\.)"
replace:
class: "default"
kwargs:
generate_device: "cuda:2"
prefill_device: "cuda:2"
- match:
name: "(^model\\.layers\\.([345][0-9])\\.)|(^model.norm)|(^lm_head)"
replace:
class: "default"
kwargs:
generate_device: "cuda:3"
prefill_device: "cuda:3"
- match:
name: "^model.embed_tokens"
replace:
class: "default"
kwargs:
generate_device: "cpu"
prefill_device: "cpu"
- match:
name: "^model\\.layers\\.([0-9])\\."
class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
replace:
class: ktransformers.operators.RoPE.YarnRotaryEmbedding
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
- match:
name: "^model\\.layers\\.([1][0-9])\\."
class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
replace:
class: ktransformers.operators.RoPE.YarnRotaryEmbedding
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
- match:
name: "^model\\.layers\\.([2][0-9])\\."
class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
replace:
class: ktransformers.operators.RoPE.YarnRotaryEmbedding
kwargs:
generate_device: "cuda:2"
prefill_device: "cuda:2"
- match:
name: "^model\\.layers\\.([345][0-9])\\."
class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
replace:
class: ktransformers.operators.RoPE.YarnRotaryEmbedding
kwargs:
generate_device: "cuda:3"
prefill_device: "cuda:3"
- match:
name: "^model\\.layers\\.([1][0-9])\\.(?!self_attn).*$" # regular expression
class: torch.nn.Linear # only match modules matching name and class simultaneously
replace:
class: ktransformers.operators.linear.KTransformerLinear # optimized Kernel on quantized data types
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
generate_op: "QuantizedLinearMarlin"
prefill_op: "QuantizedLinearTorch"
- match:
name: "^model\\.layers\\.([1][0-9])\\.(?!self_attn).*$" # regular expression
class: torch.nn.Linear # only match modules matching name and class simultaneously
replace:
class: ktransformers.operators.linear.KTransformerLinear # optimized Kernel on quantized data types
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
generate_op: "QuantizedLinearMarlin"
prefill_op: "QuantizedLinearTorch"
- match:
name: "^model\\.layers\\.([2][0-9])\\.(?!self_attn).*$" # regular expression
class: torch.nn.Linear # only match modules matching name and class simultaneously
replace:
class: ktransformers.operators.linear.KTransformerLinear # optimized Kernel on quantized data types
kwargs:
generate_device: "cuda:2"
prefill_device: "cuda:2"
generate_op: "QuantizedLinearMarlin"
prefill_op: "QuantizedLinearTorch"
- match:
name: "^model\\.layers\\.([345][0-9])\\.(?!self_attn).*$" # regular expression
class: torch.nn.Linear # only match modules matching name and class simultaneously
replace:
class: ktransformers.operators.linear.KTransformerLinear # optimized Kernel on quantized data types
kwargs:
generate_device: "cuda:3"
prefill_device: "cuda:3"
generate_op: "QuantizedLinearMarlin"
prefill_op: "QuantizedLinearTorch"
- match:
name: "^model\\.layers\\.([0-9])\\.mlp$"
class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
replace:
class: ktransformers.operators.experts.DeepseekV2MoEInjected # mlp module with custom forward function
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
- match:
name: "^model\\.layers\\.([1][0-9])\\.mlp$"
class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
replace:
class: ktransformers.operators.experts.DeepseekV2MoEInjected # mlp module with custom forward function
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
- match:
name: "^model\\.layers\\.([2][0-9])\\.mlp$"
class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
replace:
class: ktransformers.operators.experts.DeepseekV2MoEInjected # mlp module with custom forward function
kwargs:
generate_device: "cuda:2"
prefill_device: "cuda:2"
- match:
name: "^model\\.layers\\.([345][0-9])\\.mlp$"
class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
replace:
class: ktransformers.operators.experts.DeepseekV2MoEInjected # mlp module with custom forward function
kwargs:
generate_device: "cuda:3"
prefill_device: "cuda:3"
- match:
name: "^model\\.layers\\.([0-9])\\.mlp\\.experts$"
replace:
class: ktransformers.operators.experts.KTransformersMLPExpert # custom MoE kernel with expert parallelism
kwargs:
prefill_device: "cuda:0"
prefill_mlp_type: "MLPExpertsTorch"
generate_device: "cpu"
generate_mlp_type: "MLPCPUExperts"
out_device: "cuda:0"
recursive: False # don't recursively inject submodules of this module
- match:
name: "^model\\.layers\\.([1][0-9])\\.mlp\\.experts$"
replace:
class: ktransformers.operators.experts.KTransformersMLPExpert # custom MoE kernel with expert parallelism
kwargs:
prefill_device: "cuda:1"
prefill_mlp_type: "MLPExpertsTorch"
generate_device: "cpu"
generate_mlp_type: "MLPCPUExperts"
out_device: "cuda:1"
recursive: False # don't recursively inject submodules of this module
- match:
name: "^model\\.layers\\.([2][0-9])\\.mlp\\.experts$"
replace:
class: ktransformers.operators.experts.KTransformersMLPExpert # custom MoE kernel with expert parallelism
kwargs:
prefill_device: "cuda:2"
prefill_mlp_type: "MLPExpertsTorch"
generate_device: "cpu"
generate_mlp_type: "MLPCPUExperts"
out_device: "cuda:2"
recursive: False # don't recursively inject submodules of this module
- match:
name: "^model\\.layers\\.([345][0-9])\\.mlp\\.experts$"
replace:
class: ktransformers.operators.experts.KTransformersMLPExpert # custom MoE kernel with expert parallelism
kwargs:
prefill_device: "cuda:3"
prefill_mlp_type: "MLPExpertsTorch"
generate_device: "cpu"
generate_mlp_type: "MLPCPUExperts"
out_device: "cuda:3"
recursive: False # don't recursively inject submodules of this module
- match:
name: "^model\\.layers\\.([0-9])\\.self_attn$"
replace:
class: ktransformers.operators.attention.DeepseekV2AttentionInjected # optimized MLA implementation
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
- match:
name: "^model\\.layers\\.([1][0-9])\\.self_attn$"
replace:
class: ktransformers.operators.attention.DeepseekV2AttentionInjected # optimized MLA implementation
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
- match:
name: "^model\\.layers\\.([2][0-9])\\.self_attn$"
replace:
class: ktransformers.operators.attention.DeepseekV2AttentionInjected # optimized MLA implementation
kwargs:
generate_device: "cuda:2"
prefill_device: "cuda:2"
- match:
name: "^model\\.layers\\.([345][0-9])\\.self_attn$"
replace:
class: ktransformers.operators.attention.DeepseekV2AttentionInjected # optimized MLA implementation
kwargs:
generate_device: "cuda:3"
prefill_device: "cuda:3"
- match:
name: "^model$"
replace:
class: "ktransformers.operators.layer_wise_prefill.DeepseekV2ModelKTransformers"
kwargs:
per_layer_prefill_intput_threshold: 0 # 0 disables layer-wise prefill
transfer_map:
10: "cuda:1"
20: "cuda:2"
30: "cuda:3"
\ No newline at end of file
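The file above shards DeepSeek-V2's 60 decoder layers across four GPUs purely through the layer-index regexes in its default rules. A standalone check of that split (the patterns are copied from the file; nothing ktransformers-specific is assumed):

import re

DEVICE_RULES = [            # (regex from the default rules above, target device)
    (r"^model\.layers\.([0-9])\.", "cuda:0"),
    (r"^model\.layers\.([1][0-9])\.", "cuda:1"),
    (r"^model\.layers\.([2][0-9])\.", "cuda:2"),
    (r"^model\.layers\.([345][0-9])\.", "cuda:3"),
]

def device_for_layer(idx):
    name = f"model.layers.{idx}."
    for pattern, device in DEVICE_RULES:
        if re.search(pattern, name):
            return device
    return "unmatched"

split = {}
for idx in range(60):                       # DeepSeek-V2 has 60 decoder layers
    split.setdefault(device_for_layer(idx), []).append(idx)
for device, layers in split.items():
    print(device, f"layers {layers[0]}..{layers[-1]} ({len(layers)} layers)")
# cuda:0 -> 0..9, cuda:1 -> 10..19, cuda:2 -> 20..29, cuda:3 -> 30..59
# cuda:3 also hosts model.norm and lm_head per the fourth default rule.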
- match:
name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\."
replace:
class: "default"
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
- match:
name: "(^model\\.layers\\.([345][0-9])\\.)|(model.norm)|(lm_head)"
replace:
class: "default"
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
- match:
name: "^model.embed_tokens"
replace:
class: "default"
kwargs:
generate_device: "cpu"
prefill_device: "cpu"
- match:
name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\."
class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
replace:
class: ktransformers.operators.RoPE.YarnRotaryEmbedding
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
- match:
name: "^model\\.layers\\.([345][0-9])\\."
class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
replace:
class: ktransformers.operators.RoPE.YarnRotaryEmbedding
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
- match:
name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.(?!self_attn).*$" # regular expression
class: torch.nn.Linear # only match modules matching name and class simultaneously
replace:
class: ktransformers.operators.linear.KTransformerLinear # optimized Kernel on quantized data types
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
generate_op: "QuantizedLinearMarlin"
prefill_op: "QuantizedLinearTorch"
- match:
name: "^model\\.layers\\.([345][0-9])\\.(?!self_attn).*$" # regular expression
class: torch.nn.Linear # only match modules matching name and class simultaneously
replace:
class: ktransformers.operators.linear.KTransformerLinear # optimized Kernel on quantized data types
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
generate_op: "QuantizedLinearMarlin"
prefill_op: "QuantizedLinearTorch"
- match:
name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp$"
class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
replace:
class: ktransformers.operators.experts.DeepseekV2MoEInjected # mlp module with custom forward function
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
- match:
name: "^model\\.layers\\.([345][0-9])\\.mlp$"
class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
replace:
class: ktransformers.operators.experts.DeepseekV2MoEInjected # mlp module with custom forward function
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
- match:
name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp\\.experts$"
replace:
class: ktransformers.operators.experts.KTransformersMLPExpert # custom MoE kernel with expert parallelism
kwargs:
prefill_device: "cuda:0"
prefill_mlp_type: "MLPExpertsTorch"
generate_device: "cpu"
generate_mlp_type: "MLPCPUExperts"
out_device: "cuda:0"
recursive: False # don't recursively inject submodules of this module
- match:
name: "^model\\.layers\\.([345][0-9])\\.mlp\\.experts$"
replace:
class: ktransformers.operators.experts.KTransformersMLPExpert # custom MoE kernel with expert parallelism
kwargs:
prefill_device: "cuda:1"
prefill_mlp_type: "MLPExpertsTorch"
generate_device: "cpu"
generate_mlp_type: "MLPCPUExperts"
out_device: "cuda:1"
recursive: False # don't recursively inject submodules of this module
- match:
name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.self_attn$"
replace:
class: ktransformers.operators.attention.DeepseekV2AttentionInjected # optimized MLA implementation
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
- match:
name: "^model\\.layers\\.([345][0-9])\\.self_attn$"
replace:
class: ktransformers.operators.attention.DeepseekV2AttentionInjected # optimized MLA implementation
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
- match:
name: "^model$"
replace:
class: "ktransformers.operators.layer_wise_prefill.DeepseekV2ModelKTransformers"
kwargs:
per_layer_prefill_intput_threshold: 0 # 0 disables layer-wise prefill
transfer_map:
30: "cuda:1"
\ No newline at end of file
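Both DeepSeek rule files end by replacing the top-level model with DeepseekV2ModelKTransformers and passing a transfer_map (10/20/30 to cuda:1/2/3 in the 4-GPU file, 30 to cuda:1 in the 2-GPU file above). Conceptually, the map says at which layer index the hidden states hop to the next GPU; the sketch below only illustrates that idea and is not the injected class's actual forward:

import torch

def pipelined_forward(layers, hidden_states, transfer_map):
    # transfer_map, e.g. {30: "cuda:1"}: before running layer 30, move the
    # activations to cuda:1; the layer weights already live there per the rules above.
    for idx, layer in enumerate(layers):
        if idx in transfer_map:
            hidden_states = hidden_states.to(transfer_map[idx])
        hidden_states = layer(hidden_states)
    return hidden_states

# e.g. with stand-in layers (needs two visible GPUs to actually run):
# out = pipelined_forward([torch.nn.Identity()] * 60,
#                         torch.zeros(1, 8, 16, device="cuda:0"), {30: "cuda:1"})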
- match:
name: "^model\\.layers\\..*\\.|^lm_head"
replace:
class: "default"
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
- match:
class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
replace:
...@@ -21,12 +28,11 @@
name: "^model\\.layers\\..*\\.mlp\\.experts$"
replace:
class: ktransformers.operators.experts.KTransformersMLPExpert # custom MoE kernel with expert parallelism
# device: "cpu" # which device to load this module on when initializing (removed in this commit)
kwargs:
prefill_device: "cuda"
prefill_mlp_type: "MLPExpertsTorch"
generate_device: "cpu"
generate_mlp_type: "MLPCPUExperts"
out_device: "cuda"
recursive: False # don't recursively inject submodules of this module
- match:
...@@ -36,6 +42,13 @@
- match:
name: "^model$"
replace:
class: "ktransformers.operators.layer_wise_prefill.DeepseekV2ModelKTransformers" # renamed from DeepseekV2ModelPerLayerPrefill in this commit
kwargs:
per_layer_prefill_intput_threshold: 0 # 0 disables layer-wise prefill
- match:
name: "^model.embed_tokens"
replace:
class: "default"
kwargs:
generate_device: "cpu"
prefill_device: "cpu"
\ No newline at end of file
- match:
name: "^model\\.layers\\.(0|[1-9])\\."
replace:
class: "default"
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
- match:
name: "(^model\\.layers\\.([12][0-9])\\.)|(model.norm)|(lm_head)"
replace:
class: "default"
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
- match:
name: "^model.embed_tokens"
replace:
class: "default"
kwargs:
generate_device: "cpu"
prefill_device: "cpu"
- match:
name: "^model\\.layers\\.(0|[1-9])\\."
class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
replace:
class: ktransformers.operators.RoPE.YarnRotaryEmbedding
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
- match:
name: "^model\\.layers\\.([12][0-9])\\."
class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
replace:
class: ktransformers.operators.RoPE.YarnRotaryEmbedding
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
- match:
name: "^model\\.layers\\.(0|[1-9])\\.(?!self_attn).*$" # regular expression
class: torch.nn.Linear # only match modules matching name and class simultaneously
replace:
class: ktransformers.operators.linear.KTransformerLinear # optimized Kernel on quantized data types
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
generate_op: "QuantizedLinearMarlin"
prefill_op: "QuantizedLinearTorch"
- match:
name: "^model\\.layers\\.([12][0-9])\\.(?!self_attn).*$" # regular expression
class: torch.nn.Linear # only match modules matching name and class simultaneously
replace:
class: ktransformers.operators.linear.KTransformerLinear # optimized Kernel on quantized data types
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
generate_op: "QuantizedLinearMarlin"
prefill_op: "QuantizedLinearTorch"
- match:
name: "^model\\.layers\\.(0|[1-9])\\.mlp$"
class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
replace:
class: ktransformers.operators.experts.DeepseekV2MoEInjected # mlp module with custom forward function
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
- match:
name: "^model\\.layers\\.([12][0-9])\\.mlp$"
class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
replace:
class: ktransformers.operators.experts.DeepseekV2MoEInjected # mlp module with custom forward function
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
- match:
name: "^model\\.layers\\.(0|[1-9])\\.mlp\\.experts$"
replace:
class: ktransformers.operators.experts.KTransformersMLPExpert # custom MoE kernel with expert parallelism
kwargs:
prefill_device: "cuda:0"
prefill_mlp_type: "MLPExpertsTorch"
generate_device: "cpu"
generate_mlp_type: "MLPCPUExperts"
out_device: "cuda:0"
recursive: False # don't recursively inject submodules of this module
- match:
name: "^model\\.layers\\.([12][0-9])\\.mlp\\.experts$"
replace:
class: ktransformers.operators.experts.KTransformersMLPExpert # custom MoE kernel with expert parallelism
kwargs:
prefill_device: "cuda:1"
prefill_mlp_type: "MLPExpertsTorch"
generate_device: "cpu"
generate_mlp_type: "MLPCPUExperts"
out_device: "cuda:1"
recursive: False # don't recursively inject submodules of this module
- match:
name: "^model\\.layers\\.(0|[1-9])\\.self_attn$"
replace:
class: ktransformers.operators.attention.DeepseekV2AttentionInjected # optimized MLA implementation
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
- match:
name: "^model\\.layers\\.([12][0-9])\\.self_attn$"
replace:
class: ktransformers.operators.attention.DeepseekV2AttentionInjected # optimized MLA implementation
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
- match:
name: "^model$"
replace:
class: "ktransformers.operators.layer_wise_prefill.DeepseekV2ModelKTransformers"
kwargs:
per_layer_prefill_intput_threshold: 0 # 0 disables layer-wise prefill
transfer_map:
10: "cuda:1"
\ No newline at end of file
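The Linear rules in the DeepSeek files use a negative lookahead, (?!self_attn), so only the MoE/MLP linears are swapped for KTransformerLinear while the attention projections stay with the module handled by the self_attn rule. A quick demonstration with illustrative DeepSeek-V2 submodule paths:

import re

pattern = r"^model\.layers\.(0|[1-9])\.(?!self_attn).*$"   # from the file above
for name in [
    "model.layers.1.mlp.experts.7.down_proj",       # routed-expert linear
    "model.layers.1.mlp.shared_experts.gate_proj",  # shared-expert linear
    "model.layers.1.self_attn.o_proj",              # attention projection
]:
    hit = re.search(pattern, name) is not None
    print(name, "->", "replaced by KTransformerLinear" if hit else "left to the self_attn rule")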
- match:
name: "^model\\.layers\\..*\\."
replace:
class: "default"
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
- match:
class: ktransformers.models.modeling_mixtral.MixtralRotaryEmbedding
replace:
class: ktransformers.operators.RoPE.RotaryEmbedding
- match:
name: "^model\\.layers\\..*$"
class: torch.nn.Linear # only match modules matching name and class simultaneously
replace:
class: ktransformers.operators.linear.KTransformerLinear # optimized Kernel on quantized data types
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
generate_op: "QuantizedLinearMarlin"
prefill_op: "QuantizedLinearTorch"
- match:
name: "^model\\.layers\\..*\\.block_sparse_moe$"
class: ktransformers.models.modeling_mixtral.MixtralSparseMoeBlock
replace:
class: ktransformers.operators.experts.MisrtalSparseMoEBlockInjected
- match:
name: "^model\\.layers\\..*\\.block_sparse_moe\\.experts$"
replace:
class: ktransformers.operators.experts.KTransformersMLPExpert
kwargs:
prefill_device: "cuda"
prefill_mlp_type: "MLPExpertsTorch"
generate_device: "cpu"
generate_mlp_type: "MLPCPUExperts"
out_device: "cuda"
recursive: False # don't recursively inject submodules of this module
- match:
name: "^model.embed_tokens"
replace:
class: "default"
kwargs:
generate_device: "cpu"
prefill_device: "cpu"
- match:
name: "^model\\.layers\\.([012])\\."
replace:
class: "default"
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
- match:
name: "^model\\.layers\\.([012])\\."
class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeRotaryEmbedding
replace:
class: ktransformers.operators.RoPE.RotaryEmbedding
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
- match:
name: "^model\\.layers\\.([012])$" # regular expression
class: torch.nn.Linear # only match modules matching name and class simultaneously
replace:
class: ktransformers.operators.linear.KTransformerLinear # optimized Kernel on quantized data types
kwargs:
generate_device: "cuda:0"
prefill_device: "cuda:0"
generate_op: "QuantizedLinearMarlin"
prefill_op: "QuantizedLinearTorch"
- match:
name: "^model\\.layers\\.([012])\\.mlp$"
class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeSparseMoeBlock
replace:
class: ktransformers.operators.experts.Qwen2MoeSparseMoeBlockInjected # mlp module with custom forward function
- match:
name: "^model\\.layers\\.([012])\\.mlp\\.experts$"
replace:
class: ktransformers.operators.experts.KTransformersMLPExpert # custom MoE kernel with expert parallelism
# device: "cpu" # which devices to load this module when initializing
kwargs:
prefill_device: "cuda:0"
prefill_mlp_type: "MLPExpertsTorch"
generate_device: "cpu"
generate_mlp_type: "MLPCPUExperts"
out_device: "cuda:0"
recursive: False # don't recursively inject submodules of this module
- match:
name: "^model\\.layers\\.([12][0-9]|[3-9])\\."
replace:
class: "default"
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
- match:
name: "^model\\.layers\\.([12][0-9]|[3-9])\\."
class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeRotaryEmbedding
replace:
class: ktransformers.operators.RoPE.RotaryEmbedding
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
- match:
name: "^model\\.layers\\.([12][0-9]|[3-9])$" # regular expression
class: torch.nn.Linear # only match modules matching name and class simultaneously
replace:
class: ktransformers.operators.linear.KTransformerLinear # optimized Kernel on quantized data types
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
generate_op: "QuantizedLinearMarlin"
prefill_op: "QuantizedLinearTorch"
- match:
name: "^model\\.layers\\.([12][0-9]|[3-9])\\.mlp$"
class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeSparseMoeBlock
replace:
class: ktransformers.operators.experts.Qwen2MoeSparseMoeBlockInjected # mlp module with custom forward function
- match:
name: "^model\\.layers\\.([12][0-9]|[3-9])\\.mlp\\.experts$"
replace:
class: ktransformers.operators.experts.KTransformersMLPExpert # custom MoE kernel with expert parallelism
# device: "cpu" # which devices to load this module when initializing
kwargs:
prefill_device: "cuda:1"
prefill_mlp_type: "MLPExpertsTorch"
generate_device: "cpu"
generate_mlp_type: "MLPCPUExperts"
out_device: "cuda:1"
recursive: False # don't recursively inject submodules of this module
- match:
name: "^model.embed_tokens"
replace:
class: "default"
kwargs:
generate_device: "cpu"
prefill_device: "cpu"
- match:
name: "(^model.norm)|(^lm_head)"
replace:
class: "default"
kwargs:
generate_device: "cuda:1"
prefill_device: "cuda:1"
- match:
name: "^model$"
replace:
class: "ktransformers.operators.layer_wise_prefill.Qwen2MoeModelKTransformers"
kwargs:
per_layer_prefill_intput_threshold: 0 # 0 disables layer-wise prefill
transfer_map:
3: "cuda:1"
- match:
name: "^model\\.layers\\..*\\."
replace:
class: "default"
kwargs:
generate_device: "cuda"
prefill_device: "cuda"
- match:
class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeRotaryEmbedding
replace:
...@@ -21,7 +28,7 @@
name: "^model\\.layers\\..*\\.mlp\\.experts$"
replace:
class: ktransformers.operators.experts.KTransformersMLPExpert # custom MoE kernel with expert parallelism
# device: "cpu" # which device to load this module on when initializing (commented out in this commit)
kwargs:
prefill_device: "cuda"
prefill_mlp_type: "MLPExpertsTorch"
...@@ -32,6 +39,13 @@
- match:
name: "^model$"
replace:
class: "ktransformers.operators.layer_wise_prefill.Qwen2MoeModelKTransformers" # renamed from Qwen2MoeModelPerLayerPrefill in this commit
kwargs:
per_layer_prefill_intput_threshold: 0 # 0 disables layer-wise prefill
- match:
name: "^model.embed_tokens"
replace:
class: "default"
kwargs:
generate_device: "cpu"
prefill_device: "cpu"
\ No newline at end of file
...@@ -11,7 +11,7 @@ from ktransformers.operators.linear import KTransformerLinear, QuantizedLinearMa
from ktransformers.operators.experts import KTransformersMLPExpert, MLPExpertsTorch
from ktransformers.util.custom_gguf import GGUFLoader, dequantize_q4_k_gpu, dequantize_q4_k
import torch
import KTransformersOps  # was: import CudaOps
torch.set_default_dtype(torch.bfloat16)
import time
from transformers import (
...