Commit f5f79f5c, authored Aug 12, 2024 by chenxl
[ADD] support multi-gpu qlen>1 q5_k
Parent: f2938031

Showing 20 changed files with 2780 additions and 180 deletions (+2780 / -180)
ktransformers/ktransformers_ext/operators/llamafile/moe.cpp                     +16   -16
ktransformers/ktransformers_ext/operators/llamafile/shared_mem_buffer.cpp        +1    -1
ktransformers/local_chat.py                                                       +9    -4
ktransformers/models/custom_cache.py                                             +11    -5
ktransformers/models/modeling_deepseek.py                                         +3    -1
ktransformers/models/modeling_mixtral.py                                       +1735    -0
ktransformers/operators/RoPE.py                                                  +14    -5
ktransformers/operators/experts.py                                              +220   -72
ktransformers/operators/layer_wise_prefill.py                                    +39   -35
ktransformers/operators/linear.py                                                 +5    -7
ktransformers/optimize/optimize.py                                               +41   -12
ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat-multi-gpu-4.yaml         +228    -0
ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat-multi-gpu.yaml           +126    -0
ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat.yaml                      +16    -3
ktransformers/optimize/optimize_rules/DeepSeek-V2-Lite-Chat-multi-gpu.yaml      +126    -0
ktransformers/optimize/optimize_rules/Mixtral.yaml                               +45    -0
ktransformers/optimize/optimize_rules/Qwen2-57B-A14B-Instruct-multi-gpu.yaml    +111    -0
ktransformers/optimize/optimize_rules/Qwen2-57B-A14B-Instruct.yaml               +16    -2
ktransformers/tests/dequant_gpu.py                                               +17   -16
ktransformers/tests/dequant_gpu_t.py                                              +1    -1
ktransformers/ktransformers_ext/operators/llamafile/moe.cpp

@@ -9,7 +9,7 @@
 **/
 #include "moe.h"
 #include <iostream>
-#include "unistd.h"
+#include <cstdint>

 MOE::MOE(MOEConfig config) {
     config_ = config;
@@ -60,7 +60,7 @@ MOE::MOE(MOEConfig config) {
     m_local_pos_.resize(config_.group_max_len);
     for (int i = 0; i < config_.group_max_len; i++) {
-        m_local_pos_[i].reserve(config_.expert_num);
+        m_local_pos_[i].resize(config_.routed_expert_num);
     }
     m_local_num_.resize(config_.expert_num);
     m_local_gate_input_ptr_.resize(config_.expert_num);
@@ -125,10 +125,10 @@ void MOE::forward_one(int k, const uint64_t* expert_ids, const float* weights, c
         int expert_idx = task_id / nth;
         uint64_t expert_id = expert_ids[expert_idx];
         int ith = task_id % nth;
-        void* gate_proj_ptr = gate_proj_ + (expert_id * config_.intermediate_size + ith * config_.stride) * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type);
+        void* gate_proj_ptr = (uint8_t*)gate_proj_ + (expert_id * config_.intermediate_size + ith * config_.stride) * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type);
         float* gate_output_ptr = s_gate_output_[expert_idx] + ith * config_.stride;
         llamafile_sgemm(config_.stride, 1, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_proj_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_input_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_output_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.gate_type, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
-        void* up_proj_ptr = up_proj_ + (expert_id * config_.intermediate_size + ith * config_.stride) * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type);
+        void* up_proj_ptr = (uint8_t*)up_proj_ + (expert_id * config_.intermediate_size + ith * config_.stride) * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type);
         float* up_output_ptr = s_up_output_[expert_idx] + ith * config_.stride;
         llamafile_sgemm(config_.stride, 1, config_.hidden_size / ggml_blck_size(config_.up_type), up_proj_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_input_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_output_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.up_type, ggml_internal_get_type_traits(config_.up_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
         for (int i = ith * config_.stride; i < (ith + 1) * config_.stride; i++) {
@@ -153,7 +153,7 @@ void MOE::forward_one(int k, const uint64_t* expert_ids, const float* weights, c
     }
     for (int expert_idx = 0; expert_idx < k; expert_idx++) {
         uint64_t expert_id = expert_ids[expert_idx];
-        void* down_proj_ptr = down_proj_ + (expert_id * config_.hidden_size + ith * config_.stride) * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type);
+        void* down_proj_ptr = (uint8_t*)down_proj_ + (expert_id * config_.hidden_size + ith * config_.stride) * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type);
         float* down_output_ptr = s_down_output_[expert_idx] + ith * config_.stride;
         llamafile_sgemm(config_.stride, 1, config_.intermediate_size / ggml_blck_size(config_.down_type), down_proj_ptr, config_.intermediate_size / ggml_blck_size(config_.down_type), s_down_input_[expert_idx], config_.intermediate_size / ggml_blck_size(config_.down_type), down_output_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.down_type, ggml_internal_get_type_traits(config_.down_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
         for (int i = ith * config_.stride; i < (ith + 1) * config_.stride; i++) {
@@ -162,7 +162,7 @@ void MOE::forward_one(int k, const uint64_t* expert_ids, const float* weights, c
     }
     if (config_.stride % ggml_blck_size(config_.hidden_type) == 0) {
         float* output_fp32_ptr = s_output_fp32_ + ith * config_.stride;
-        void* output_ptr = output + ith * config_.stride * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
+        void* output_ptr = (uint8_t*)output + ith * config_.stride * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
         from_float(output_fp32_ptr, output_ptr, config_.stride, config_.hidden_type);
     }
 });
@@ -195,9 +195,9 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float*
         const void* gate_input_ptr;
         const void* up_input_ptr;
         if (config_.hidden_type == ggml_internal_get_type_traits(config_.gate_type).vec_dot_type && config_.hidden_type == ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
-            gate_input_ptr = up_input_ptr = input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
+            gate_input_ptr = up_input_ptr = (uint8_t*)input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
         } else {
-            to_float(input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), m_input_fp32_[i], config_.hidden_size, config_.hidden_type);
+            to_float((uint8_t*)input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), m_input_fp32_[i], config_.hidden_size, config_.hidden_type);
             if (ggml_internal_get_type_traits(config_.gate_type).vec_dot_type == ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
                 from_float(m_input_fp32_[i], m_gate_input_[i], config_.hidden_size, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
                 gate_input_ptr = up_input_ptr = m_gate_input_[i];
@@ -206,13 +206,13 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float*
                 from_float(m_input_fp32_[i], m_gate_input_[i], config_.hidden_size, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
                 gate_input_ptr = m_gate_input_[i];
             } else {
-                gate_input_ptr = input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
+                gate_input_ptr = (uint8_t*)input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
             }
             if (config_.hidden_type != ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
                 from_float(m_input_fp32_[i], m_up_input_[i], config_.hidden_size, ggml_internal_get_type_traits(config_.up_type).vec_dot_type);
                 up_input_ptr = m_up_input_[i];
             } else {
-                up_input_ptr = input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
+                up_input_ptr = (uint8_t*)input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
             }
         }
     }
@@ -227,11 +227,11 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float*
         int expert_idx = task_id / nth;
         int ith = task_id % nth;
         void* gate_input_ptr = m_local_gate_input_ptr_[expert_idx];
-        void* gate_proj_ptr = gate_proj_ + (expert_idx * config_.intermediate_size + ith * stride) * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type);
+        void* gate_proj_ptr = (uint8_t*)gate_proj_ + (expert_idx * config_.intermediate_size + ith * stride) * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type);
         float* gate_output_ptr = m_local_gate_output_ptr_[expert_idx] + ith * stride;
         llamafile_sgemm(stride, m_local_num_[expert_idx], config_.hidden_size / ggml_blck_size(config_.gate_type), gate_proj_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_input_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_output_ptr, config_.intermediate_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.gate_type, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
         void* up_input_ptr = m_local_up_input_ptr_[expert_idx];
-        void* up_proj_ptr = up_proj_ + (expert_idx * config_.intermediate_size + ith * stride) * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type);
+        void* up_proj_ptr = (uint8_t*)up_proj_ + (expert_idx * config_.intermediate_size + ith * stride) * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type);
         float* up_output_ptr = m_local_up_output_ptr_[expert_idx] + ith * stride;
         llamafile_sgemm(stride, m_local_num_[expert_idx], config_.hidden_size / ggml_blck_size(config_.up_type), up_proj_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_input_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_output_ptr, config_.intermediate_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.up_type, ggml_internal_get_type_traits(config_.up_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
         for (int i = 0; i < m_local_num_[expert_idx]; i++) {
@@ -249,7 +249,7 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float*
         int expert_idx = task_id / nth;
         int ith = task_id % nth;
         void* down_input_ptr = m_local_down_input_ptr_[expert_idx];
-        void* down_proj_ptr = down_proj_ + (expert_idx * config_.hidden_size + ith * stride) * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type);
+        void* down_proj_ptr = (uint8_t*)down_proj_ + (expert_idx * config_.hidden_size + ith * stride) * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type);
         float* down_output_ptr = m_local_down_output_ptr_[expert_idx] + ith * stride;
         llamafile_sgemm(stride, m_local_num_[expert_idx], config_.intermediate_size / ggml_blck_size(config_.down_type), down_proj_ptr, config_.intermediate_size / ggml_blck_size(config_.down_type), down_input_ptr, config_.intermediate_size / ggml_blck_size(config_.down_type), down_output_ptr, config_.hidden_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.down_type, ggml_internal_get_type_traits(config_.down_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
     });
@@ -262,18 +262,18 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float*
                 m_output_fp32_[i][e] += m_local_down_output_ptr_[expert_ids[i * k + j]][m_local_pos_[i][j] * config_.hidden_size + e] * weights[i * k + j];
             }
         }
-        from_float(m_output_fp32_[i], output + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), config_.hidden_size, config_.hidden_type);
+        from_float(m_output_fp32_[i], (uint8_t*)output + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), config_.hidden_size, config_.hidden_type);
     });
 }

 void MOE::forward(int qlen, int k, const uint64_t* expert_ids, const float* weights, const void* input, void* output, Backend* backend) {
     if (qlen < config_.group_min_len) {
         for (int i = 0; i < qlen; i++) {
-            forward_one(k, expert_ids + i * k, weights + i * k, input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), output + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), backend);
+            forward_one(k, expert_ids + i * k, weights + i * k, (uint8_t*)input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), (uint8_t*)output + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), backend);
         }
         return;
     }
     int forward_len = std::min(config_.group_max_len, qlen);
     forward_many(forward_len, k, expert_ids, weights, input, output, backend);
-    forward(qlen - forward_len, k, expert_ids + forward_len * k, weights + forward_len * k, input + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), output + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), backend);
+    forward(qlen - forward_len, k, expert_ids + forward_len * k, weights + forward_len * k, (uint8_t*)input + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), (uint8_t*)output + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), backend);
 }
\ No newline at end of file
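The qlen > 1 path is driven by the batching policy in MOE::forward above: batches shorter than group_min_len fall back to token-by-token forward_one calls, larger batches go through forward_many up to group_max_len tokens at a time, and the remainder is handled by the recursive tail call. A minimal Python sketch of that policy, assuming only the names visible in the C++ (group_min_len, group_max_len, forward_one, forward_many); everything else is illustrative:

def moe_forward(tokens, group_min_len, group_max_len, forward_one, forward_many):
    # Sketch of MOE::forward's grouping for qlen > 1.
    if len(tokens) < group_min_len:
        for t in tokens:          # small batches: one token at a time
            forward_one(t)
        return
    head, tail = tokens[:group_max_len], tokens[group_max_len:]
    forward_many(head)            # process up to group_max_len tokens together
    if tail:                      # recurse on the remainder, as the C++ does
        moe_forward(tail, group_min_len, group_max_len, forward_one, forward_many)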
ktransformers/ktransformers_ext/operators/llamafile/shared_mem_buffer.cpp

@@ -49,7 +49,7 @@ void SharedMemBuffer::dealloc(void* object) {
 void SharedMemBuffer::arrange(std::vector<std::pair<void**, uint64_t>> requests) {
     uint64_t offset = 0;
     for (auto& request : requests) {
-        *(request.first) = buffer_ + offset;
+        *(request.first) = (uint8_t*)buffer_ + offset;
         offset += request.second;
     }
 }
ktransformers/local_chat.py (mode 100644 → 100755)

@@ -31,18 +31,21 @@ import fire
 from ktransformers.optimize.optimize import optimize_and_load_gguf
 from ktransformers.models.modeling_deepseek import DeepseekV2ForCausalLM
 from ktransformers.models.modeling_qwen2_moe import Qwen2MoeForCausalLM
 from ktransformers.models.modeling_mixtral import MixtralForCausalLM
 from ktransformers.util.utils import prefill_and_generate
 from ktransformers.server.config.config import Config

 custom_models = {
     "DeepseekV2ForCausalLM": DeepseekV2ForCausalLM,
     "Qwen2MoeForCausalLM": Qwen2MoeForCausalLM,
     "MixtralForCausalLM": MixtralForCausalLM,
 }

 ktransformer_rules_dir = os.path.dirname(os.path.abspath(__file__)) + "/optimize/optimize_rules/"
 default_optimize_rules = {
     "DeepseekV2ForCausalLM": ktransformer_rules_dir + "DeepSeek-V2-Chat.yaml",
     "Qwen2MoeForCausalLM": ktransformer_rules_dir + "Qwen2-57B-A14B-Instruct.yaml",
     "MixtralForCausalLM": ktransformer_rules_dir + "Mixtral.yaml",
 }

 def local_chat(
@@ -50,7 +53,8 @@ def local_chat(
     optimize_rule_path: str = None,
     gguf_path: str = None,
     max_new_tokens: int = 1000,
-    cpu_infer: int = Config().cpu_infer
+    cpu_infer: int = Config().cpu_infer,
+    use_cuda_graph: bool = True,
 ):
     torch.set_grad_enabled(False)
@@ -64,6 +68,8 @@ def local_chat(
         print("using custom modeling_xxx.py.")
         if "Qwen2Moe" in config.architectures[0]:  # Qwen2Moe must use flash_attention_2 to avoid overflow.
             config._attn_implementation = "flash_attention_2"
+        if "Mixtral" in config.architectures[0]:
+            config._attn_implementation = "flash_attention_2"
         model = custom_models[config.architectures[0]](config)
     else:
         model = AutoModelForCausalLM.from_config(
@@ -100,7 +106,6 @@ def local_chat(
     while True:
         content = input("Chat: ")
-        # if content is num
         if content == "":
             content = "Please write a piece of quicksort code in C++."
@@ -109,7 +114,7 @@ def local_chat(
         messages, add_generation_prompt=True, return_tensors="pt"
     )
     torch.set_default_dtype(torch.bfloat16)  # TODO: Remove this, replace dtype using config
-    generated = prefill_and_generate(model, tokenizer, input_tensor.cuda(), max_new_tokens)
+    generated = prefill_and_generate(model, tokenizer, input_tensor.cuda(), max_new_tokens, use_cuda_graph)

 if __name__ == "__main__":
-    fire.Fire(local_chat)
+    fire.Fire(local_chat)
\ No newline at end of file
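With the new table entries, a Mixtral checkpoint resolves to Mixtral.yaml automatically and CUDA-graph decoding can be toggled per call. A hypothetical invocation of the updated entry point; the model and GGUF paths are placeholders, and the first argument name (model_path) is assumed rather than shown in this hunk:

from ktransformers.local_chat import local_chat

local_chat(
    model_path="/models/Mixtral-8x7B-Instruct",    # placeholder HF config/tokenizer dir
    gguf_path="/models/mixtral-8x7b-q5_k_m.gguf",  # placeholder quantized weights
    max_new_tokens=512,
    use_cuda_graph=True,                           # new flag forwarded to prefill_and_generate
)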
ktransformers/models/custom_cache.py

@@ -22,13 +22,14 @@ class StaticCache(transformers.StaticCache):
             The maximum batch size with which the model will be used.
         max_cache_len (`int`):
             The maximum sequence length with which the model will be used.
-        device (`torch.device`):
+        device (`torch.device` or `dict`):
             The device on which the cache should be initialized. Should be the same as the layer.
+            If a `dict`, it should contain the `device` key with the device name as the value.
         dtype (*optional*, defaults to `torch.float32`):
             The default `dtype` to use when initializing the layer.
     """

-    def __init__(self, config: PretrainedConfig, max_batch_size: int, max_cache_len: int, device, dtype=None) -> None:
+    def __init__(self, config: PretrainedConfig, max_batch_size: int, max_cache_len: int, device: torch.device | dict, dtype=None) -> None:
         Cache.__init__(self)
         self.max_batch_size = max_batch_size
         self.max_cache_len = config.max_position_embeddings if max_cache_len is None else max_cache_len
@@ -46,6 +47,7 @@ class StaticCache(transformers.StaticCache):
         self.value_cache: List[torch.Tensor] = []
         cache_shape = (max_batch_size, self.num_key_value_heads, self.max_cache_len, self.head_dim)
         if config.architectures[0] == "DeepseekV2ForCausalLM":
+            # TODO: for deepseek, cache_shape is different whether using Absorbed MLA, check it automatically
             # key_shape = (max_batch_size, self.num_key_value_heads, self.max_cache_len, config.qk_rope_head_dim + config.qk_nope_head_dim)
             # value_shape = (max_batch_size, self.num_key_value_heads, self.max_cache_len, config.v_head_dim)
             key_shape = (max_batch_size, 1, self.max_cache_len, config.qk_rope_head_dim)
@@ -56,11 +58,15 @@ class StaticCache(transformers.StaticCache):
         self.past_tokens = []
         self.num_hidden_layers = config.num_hidden_layers
-        for _ in range(self.num_hidden_layers):
+        for idx in range(self.num_hidden_layers):
             # Note: `mark_static_address` is used to tag the cache as an fixed data pointer, preventing cuda graph
             # breaks when updating the cache.
-            new_layer_key_cache = torch.zeros(key_shape, dtype=self.dtype, device=device)
-            new_layer_value_cache = torch.zeros(value_shape, dtype=self.dtype, device=device)
+            if isinstance(device, dict):
+                target_device = device[f"blk.{idx}.self_attn"]["generate_device"]
+            else:
+                target_device = device
+            new_layer_key_cache = torch.zeros(key_shape, dtype=self.dtype, device=target_device)
+            new_layer_value_cache = torch.zeros(value_shape, dtype=self.dtype, device=target_device)
             torch._dynamo.mark_static_address(new_layer_key_cache)
             torch._dynamo.mark_static_address(new_layer_value_cache)
             self.key_cache.append(new_layer_key_cache)
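The cache now accepts either a single device or a per-layer device map keyed by GGUF-style names, so each layer's KV buffers land on the GPU that runs that layer during generation. A small sketch of such a map and the lookup it enables; the blk.{idx}.self_attn / generate_device keys follow the diff above, while the two-GPU split itself is made up:

import torch

num_layers = 4
device_map = {
    f"blk.{idx}.self_attn": {"generate_device": "cuda:0" if idx < 2 else "cuda:1"}
    for idx in range(num_layers)
}

for idx in range(num_layers):
    target = device_map[f"blk.{idx}.self_attn"]["generate_device"]
    kv = torch.zeros(1, 1, 8, 64, device=target)  # stand-in for a per-layer KV buffer
    print(idx, kv.device)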
ktransformers/models/modeling_deepseek.py

@@ -1048,7 +1048,7 @@ class DeepseekV2FlashAttention2(DeepseekV2Attention):
         """
         Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
         first unpad the input, then computes the attention scores and pad the final attention scores.
-        Args:
+        # Args:
             query_states (`torch.Tensor`):
                 Input query states to be passed to Flash Attention API
             key_states (`torch.Tensor`):
@@ -1245,12 +1245,14 @@ class DeepseekV2DecoderLayer(nn.Module):
             cache_position=cache_position,
             **kwargs,
         )
         hidden_states = residual + hidden_states

         # Fully Connected
         residual = hidden_states
         hidden_states = self.post_attention_layernorm(hidden_states)
         hidden_states = self.mlp(hidden_states)
         hidden_states = residual + hidden_states

         outputs = (hidden_states,)
ktransformers/models/modeling_mixtral.py (new file, +1735 lines; diff collapsed, not shown here)
ktransformers/operators/RoPE.py

@@ -10,6 +10,7 @@ from ktransformers.operators.base_operator import BaseInjectedModule
 from ktransformers.util.custom_gguf import GGUFLoader
 from ktransformers.util.utils import InferenceState
+from transformers.configuration_utils import PretrainedConfig

 # Copied from transformers.models.mixtral.modeling_mixtral.MixtralRotaryEmbedding with Mixtral->Qwen2Moe
 class RotaryEmbedding(BaseInjectedModule, DeepseekV2RotaryEmbedding):
     def __init__(
         self,
@@ -17,12 +18,16 @@ class RotaryEmbedding(BaseInjectedModule, DeepseekV2RotaryEmbedding):
         gguf_loader: GGUFLoader,
         config: PretrainedConfig,
         orig_module: nn.Module,
-        device: str = "cuda",
+        # device: str = "cuda",
+        generate_device: str = "cuda",
+        prefill_device: str = "cuda",
         **kwargs):
-        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, device, **kwargs)
+        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs)
         self.orig_module.__init__(orig_module.dim, orig_module.max_position_embeddings, orig_module.base)
+        self.generate_device = generate_device
+        self.prefill_device = prefill_device

     def load(self):
         self.orig_module.__init__(
             self.orig_module.dim,
@@ -36,9 +41,11 @@ class YarnRotaryEmbedding(BaseInjectedModule, DeepseekV2YarnRotaryEmbedding):
         gguf_loader: GGUFLoader,
         config: PretrainedConfig,
         orig_module: nn.Module,
-        device: str = "cuda",
+        # device: str = "cuda",
+        generate_device: str = "cuda",
+        prefill_device: str = "cuda",
         **kwargs):
-        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, device, **kwargs)
+        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs)
         self.orig_module.__init__(orig_module.dim,
                                   orig_module.max_position_embeddings,
                                   orig_module.base,
@@ -49,13 +56,15 @@ class YarnRotaryEmbedding(BaseInjectedModule, DeepseekV2YarnRotaryEmbedding):
                                   orig_module.beta_slow,
                                   orig_module.mscale,
                                   orig_module.mscale_all_dim)
+        self.generate_device = generate_device
+        self.prefill_device = prefill_device

     def load(self):
         self.orig_module.__init__(
             self.orig_module.dim,
             self.orig_module.max_position_embeddings,
             self.orig_module.base,
-            self.device,
+            self.generate_device,
             self.orig_module.scaling_factor,
             self.orig_module.original_max_position_embeddings,
             self.orig_module.beta_fast,
ktransformers/operators/experts.py (+220 -72; diff collapsed, not shown here)
ktransformers/operators/layer_wise_prefill.py

@@ -6,7 +6,7 @@ Author       : Azure-Tang
 Date         : 2024-07-25 11:25:24
 Version      : 1.0.0
 LastEditors  : Azure
-LastEditTime : 2024-07-26 09:27:48
+LastEditTime : 2024-08-08 10:09:14
 Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 '''
@@ -45,6 +45,8 @@ from ktransformers.models.modeling_deepseek import BaseModelOutputWithPast, Deep
 from transformers.models.qwen2_moe.configuration_qwen2_moe import Qwen2MoeConfig
 from ktransformers.operators.base_operator import BaseInjectedModule
 from ktransformers.util.utils import InferenceState
+from ktransformers.util.custom_gguf import GGUFLoader
+from transformers.configuration_utils import PretrainedConfig

 if is_flash_attn_2_available():
     from flash_attn import flash_attn_func, flash_attn_varlen_func
@@ -73,34 +75,6 @@ QWEN2MOE_START_DOCSTRING = r"""
         [`~PreTrainedModel.from_pretrained`] method to load the model weights.
 """

-@add_start_docstrings(
-    "The bare Qwen2MoE Model outputting raw hidden-states without any specific head on top.",
-    QWEN2MOE_START_DOCSTRING,
-)
-class Qwen2MoePreTrainedModel(PreTrainedModel):
-    config_class = Qwen2MoeConfig
-    base_model_prefix = "model"
-    supports_gradient_checkpointing = True
-    _no_split_modules = ["Qwen2MoeDecoderLayer"]
-    _skip_keys_device_placement = "past_key_values"
-    _supports_flash_attn_2 = True
-    _supports_sdpa = True
-    _supports_cache_class = True
-    _supports_static_cache = True
-
-    def _init_weights(self, module):
-        std = self.config.initializer_range
-        if isinstance(module, nn.Linear):
-            module.weight.data.normal_(mean=0.0, std=std)
-            if module.bias is not None:
-                module.bias.data.zero_()
-        elif isinstance(module, nn.Embedding):
-            module.weight.data.normal_(mean=0.0, std=std)
-            if module.padding_idx is not None:
-                module.weight.data[module.padding_idx].zero_()
-
 QWEN2MOE_INPUTS_DOCSTRING = r"""
     Args:
         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
@@ -177,13 +151,11 @@ QWEN2MOE_INPUTS_DOCSTRING = r"""
         the complete sequence length.
 """

-from ktransformers.util.custom_gguf import GGUFLoader
-from transformers.configuration_utils import PretrainedConfig
 @add_start_docstrings(
     "The bare Qwen2MoE Model outputting raw hidden-states without any specific head on top.",
     QWEN2MOE_START_DOCSTRING,
 )
-class Qwen2MoeModelPerLayerPrefill(BaseInjectedModule):
+class Qwen2MoeModelKTransformers(BaseInjectedModule):
     """
     Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Qwen2MoeDecoderLayer`]
@@ -198,10 +170,13 @@ class Qwen2MoeModelPerLayerPrefill(BaseInjectedModule):
         orig_module: nn.Module,
         device: str = "cuda",
         per_layer_prefill_intput_threshold: int = 30000, # if None, no per-layer prefill
+        transfer_map: dict = None,
         **kwargs,
     ):
         BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, device, **kwargs)
         self.per_layer_prefill_intput_threshold = per_layer_prefill_intput_threshold
+        self.transfer_map = transfer_map
+        self.stream_device_map = dict()

     @add_start_docstrings_to_model_forward(QWEN2MOE_INPUTS_DOCSTRING)
     def forward(
@@ -287,7 +262,20 @@ class Qwen2MoeModelPerLayerPrefill(BaseInjectedModule):
         all_router_logits = () if output_router_logits else None
         next_decoder_cache = None

-        for decoder_layer in self.layers:
+        for i, decoder_layer in enumerate(self.layers):
+            if self.transfer_map is not None and i in self.transfer_map:
+                prev_stream = torch.cuda.current_stream()
+                cur_device = self.transfer_map[i]
+                if cur_device not in self.stream_device_map:
+                    self.stream_device_map[cur_device] = torch.cuda.Stream(cur_device)
+                torch.cuda.set_device(cur_device)
+                self.stream_device_map[cur_device].wait_stream(prev_stream)
+                torch.cuda.set_stream(self.stream_device_map[cur_device])
+                hidden_states = hidden_states.to(self.transfer_map[i], non_blocking=True)
+                causal_mask = causal_mask.to(self.transfer_map[i], non_blocking=True) if causal_mask is not None else None
+                position_ids = position_ids.to(self.transfer_map[i], non_blocking=True) if position_ids is not None else None
+                cache_position = cache_position.to(self.transfer_map[i], non_blocking=True) if cache_position is not None else None
             if output_hidden_states:
                 all_hidden_states += (hidden_states,)
@@ -463,7 +451,7 @@ DeepseekV2_INPUTS_DOCSTRING = r"""
 """

-class DeepseekV2ModelPerLayerPrefill(BaseInjectedModule):
+class DeepseekV2ModelKTransformers(BaseInjectedModule):
     """
     Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`DeepseekV2DecoderLayer`]
@@ -478,10 +466,13 @@ class DeepseekV2ModelPerLayerPrefill(BaseInjectedModule):
         orig_module: nn.Module,
         device: str = "cuda",
         per_layer_prefill_intput_threshold: int = 30000, # if None, no per-layer prefill
+        transfer_map: dict = None,
         **kwargs,
     ):
         BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, device, **kwargs)
         self.per_layer_prefill_intput_threshold = per_layer_prefill_intput_threshold
+        self.transfer_map = transfer_map
+        self.stream_device_map = dict()

     @add_start_docstrings_to_model_forward(DeepseekV2_INPUTS_DOCSTRING)
     def forward(
@@ -584,7 +575,20 @@ class DeepseekV2ModelPerLayerPrefill(BaseInjectedModule):
         t_cpu = 0
         t_f = 0

-        for decoder_layer in self.layers:
+        for i, decoder_layer in enumerate(self.layers):
+            if self.transfer_map is not None and i in self.transfer_map:
+                prev_stream = torch.cuda.current_stream()
+                cur_device = self.transfer_map[i]
+                if cur_device not in self.stream_device_map:
+                    self.stream_device_map[cur_device] = torch.cuda.Stream(cur_device)
+                torch.cuda.set_device(cur_device)
+                self.stream_device_map[cur_device].wait_stream(prev_stream)
+                torch.cuda.set_stream(self.stream_device_map[cur_device])
+                hidden_states = hidden_states.to(self.transfer_map[i], non_blocking=True)
+                causal_mask = causal_mask.to(self.transfer_map[i], non_blocking=True) if causal_mask is not None else None
+                position_ids = position_ids.to(self.transfer_map[i], non_blocking=True) if position_ids is not None else None
+                cache_position = cache_position.to(self.transfer_map[i], non_blocking=True) if cache_position is not None else None
             if output_hidden_states:
                 all_hidden_states += (hidden_states,)
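The transfer_map is what turns layer-wise execution into a multi-GPU pipeline: when the loop reaches a layer index listed in the map, the activations (plus mask and position tensors) are copied to the next GPU and compute switches to a stream on that device. A stripped-down sketch of the same pattern outside the model class, assuming only the {layer_index: "cuda:N"} map format from the diff; the toy layers are illustrative:

import torch

def run_layers(hidden_states, layers, transfer_map=None):
    streams = {}
    for i, layer in enumerate(layers):
        if transfer_map is not None and i in transfer_map:
            prev_stream = torch.cuda.current_stream()
            dev = transfer_map[i]
            if dev not in streams:
                streams[dev] = torch.cuda.Stream(dev)
            torch.cuda.set_device(dev)
            streams[dev].wait_stream(prev_stream)     # keep ordering across devices
            torch.cuda.set_stream(streams[dev])
            hidden_states = hidden_states.to(dev, non_blocking=True)
        hidden_states = layer(hidden_states)
    return hidden_states

# Hypothetical split over two GPUs: layers 0-1 on cuda:0, layers 2-3 on cuda:1.
# layers = [torch.nn.Linear(16, 16, device="cuda:0") for _ in range(2)] + \
#          [torch.nn.Linear(16, 16, device="cuda:1") for _ in range(2)]
# out = run_layers(torch.randn(1, 16, device="cuda:0"), layers, {2: "cuda:1"})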
ktransformers/operators/linear.py

@@ -176,7 +176,7 @@ class QuantizedLinearMarlin(QuantizedLinearBase):
         self.act_order = act_order
         self.is_k_full = is_k_full

-    def load(self, w: dict | nn.Parameter | tuple | None = None, device: str | None = "cuda"):
+    def load(self, w: dict | nn.Parameter | tuple | None = None, device: str | None = None):
         if device is None: device = self.device
         assert device.lower() != "cpu", "Marlin quantized linear only supports GPU device"
         if w is None: w = self.load_weight(device=device)
@@ -200,7 +200,7 @@ class QuantizedLinearMarlin(QuantizedLinearBase):
             weight, self.num_bits, self.group_size, self.act_order)
         self.workspace = MarlinWorkspace(
-            self.out_features, GPTQ_MARLIN_MIN_THREAD_N, GPTQ_MARLIN_MAX_PARALLEL
+            self.out_features, GPTQ_MARLIN_MIN_THREAD_N, GPTQ_MARLIN_MAX_PARALLEL, self.device
         )
         self.marlin_q_w = marlin_q_w
         self.marlin_s = marlin_s
@@ -247,7 +247,6 @@ class QuantizedLinearMarlin(QuantizedLinearBase):
 LINEAR_MAP = {
     "QuantizedLinearMarlin": QuantizedLinearMarlin,
     "QuantizedLinearTorch": QuantizedLinearTorch,
-    "QuantizedLinearTorch": QuantizedLinearTorch,
 }

 class KTransformerLinear(BaseInjectedModule, QuantizedLinearBase):
@@ -257,15 +256,15 @@ class KTransformerLinear(BaseInjectedModule, QuantizedLinearBase):
         gguf_loader: GGUFLoader,
         config: PretrainedConfig,
         orig_module: nn.Module,
-        device: str = "cuda",
+        # device: str = "cuda",
         generate_device: str = "cuda",
         generate_op: str | None = "QuantizedLinearMarlin",
         prefill_device: str = "cuda",
         prefill_op: str | None = "QuantizedLinearTorch",
         **kwargs,
     ):
-        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, device, **kwargs)
-        QuantizedLinearBase.__init__(self, key, gguf_loader, config, orig_module, device, **kwargs)
+        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs)
+        QuantizedLinearBase.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs)
         # build all the linear operators
         if prefill_op is not None:
             assert prefill_op in LINEAR_MAP, f"linear_type {prefill_op} not supported"
@@ -289,7 +288,6 @@ class KTransformerLinear(BaseInjectedModule, QuantizedLinearBase):
             self.generate_linear = LINEAR_MAP[generate_op](key, gguf_loader, config, orig_module, generate_device, **kwargs)
         else:
             self.generate_linear = None
-        self.device = device
         self.mode = InferenceState.UNLOAD

     def forward(self, x):
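KTransformerLinear now keeps one backend per inference phase (generate_op/generate_device and prefill_op/prefill_device) instead of a single device, looking each backend up in LINEAR_MAP. A toy sketch of that dispatch shape with dummy backends; the class names and the prefilling flag here are illustrative, not the repository's API:

class TorchBackend:
    def __init__(self, device): self.device = device
    def forward(self, x): return x           # placeholder compute

class MarlinBackend(TorchBackend):
    pass                                     # placeholder GPU kernel

OP_MAP = {"QuantizedLinearTorch": TorchBackend, "QuantizedLinearMarlin": MarlinBackend}

class PhasedLinear:
    def __init__(self, generate_op="QuantizedLinearMarlin", generate_device="cuda:0",
                 prefill_op="QuantizedLinearTorch", prefill_device="cuda:0"):
        self.generate_linear = OP_MAP[generate_op](generate_device)   # decode-time backend
        self.prefill_linear = OP_MAP[prefill_op](prefill_device)      # prefill-time backend

    def forward(self, x, prefilling=False):
        linear = self.prefill_linear if prefilling else self.generate_linear
        return linear.forward(x)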
ktransformers/optimize/optimize.py

 '''
 Description  :
-Author       : Boxin Zhang
+Author       : Boxin Zhang, Azure-Tang
 Version      : 0.1.0
 Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 '''
@@ -15,6 +15,7 @@ from transformers.configuration_utils import PretrainedConfig
 from ktransformers.util.custom_gguf import GGUFLoader, translate_name_to_gguf
 from ktransformers.util.utils import set_module, load_weights
 import itertools
+import copy

 def inject(module, local_optimization_dict, model_config: AutoConfig, gguf_loader: GGUFLoader, prefix=''):
     for name, child in module._modules.items():
@@ -22,18 +23,20 @@ def inject(module, local_optimization_dict, model_config:AutoConfig ,gguf_loader
         child_prefix = prefix + name
         if child_prefix in local_optimization_dict:
             inject_module_meta = local_optimization_dict[child_prefix]
-            if isinstance(inject_module_meta, Mapping):
+            if inject_module_meta["class"] != "default":
                 import_path = inject_module_meta["class"].split(".")
                 import_module_name = ".".join(import_path[:-1])
+                gguf_loader.tensor_device_map[inject_module_meta["key"]] = inject_module_meta["kwargs"] if "kwargs" in inject_module_meta else dict()
                 import_class_name = import_path[-1]
                 module_cls = getattr(__import__(import_module_name, fromlist=[""]), import_class_name)
                 print(f"Injecting {child_prefix} as", import_module_name, ".", import_class_name)
-                inject_module = module_cls(key = inject_module_meta["key"], gguf_loader = gguf_loader, config = model_config, orig_module = child, device = inject_module_meta["device"], **inject_module_meta["kwargs"])
+                inject_module = module_cls(key = inject_module_meta["key"], gguf_loader = gguf_loader, config = model_config, orig_module = child, **inject_module_meta["kwargs"])
                 set_module(module, name, inject_module)
-            elif isinstance(inject_module_meta, str):
-                assert inject_module_meta == "default", "for str inject_module_meta, only support \"default\"."
+            elif inject_module_meta["class"] == "default":
+                print(f"Injecting {child_prefix} as default")
+                gguf_loader.tensor_device_map[inject_module_meta["key"]] = inject_module_meta["kwargs"] if "kwargs" in inject_module_meta else dict()
             else:
-                raise Exception("inject_module_meta must be a dict or str")
+                raise Exception("inject_module_meta[\"class\"] must be \"default\" or a class path")
             child_prefix += "."
             child_optimization_dict = {k: v for k, v in local_optimization_dict.items() if k.startswith(child_prefix)}
             inject(child, child_optimization_dict, model_config, gguf_loader, child_prefix)
@@ -57,6 +60,8 @@ def gen_optimize_config(module: nn.Module, out_data: Mapping, rule_list: List, p
     for rule in rule_list:
         #print(rule)
         match_meta = rule["match"]
+        if "class" not in match_meta and "name" not in match_meta:
+            raise Exception("match must have at least one of \"class\" and \"name\"")
         if "class" in match_meta:
             import_path = match_meta["class"].split(".")
             import_module_name = ".".join(import_path[:-1])
@@ -67,16 +72,29 @@ def gen_optimize_config(module: nn.Module, out_data: Mapping, rule_list: List, p
         if "name" in match_meta:
             if re.search(match_meta["name"], module_name) is None:
                 continue
-        replace_meta = rule["replace"]
-        out_data[module_name] = {"key": translated_name,
-                                 "class": replace_meta["class"],
-                                 "device": replace_meta["device"] if "device" in replace_meta else default_device,
-                                 "kwargs": replace_meta["kwargs"] if "kwargs" in replace_meta else dict()}
-        if "replace" not in rule:
-            raise Exception("replace must be in rule")
+        if "replace" in rule:
+            replace_meta = rule["replace"]
+            if module_name not in out_data:
+                out_data[module_name] = {"key": translated_name,
+                                         "class": replace_meta["class"] if "class" in replace_meta else "default",
+                                         # "device": replace_meta["device"] if "device" in replace_meta else default_device,
+                                         "kwargs": copy.deepcopy(replace_meta["kwargs"]) if "kwargs" in replace_meta else dict()}
+            else:
+                if out_data[module_name]["class"] == "default":
+                    out_data[module_name]["class"] = replace_meta["class"] if "class" in replace_meta else "default"
+                out_data[module_name]["kwargs"].update(copy.deepcopy(replace_meta["kwargs"]) if "kwargs" in replace_meta else dict())
         if "recursive" in rule:
             recursive = bool(rule["recursive"])

     if module_name not in out_data:
-        out_data[module_name] = "default"
+        out_data[module_name] = {"class": "default",
+                                 "key": translated_name,
+                                 "kwargs": {"generate_device": default_device,
+                                            "prefill_device": default_device}}
         #print(out_data[module_name])
         #input()
@@ -88,6 +106,14 @@ def gen_optimize_config(module: nn.Module, out_data: Mapping, rule_list: List, p
         gen_optimize_config(child, out_data, rule_list, child_prefix)

+def translate_model_config(model_config: PretrainedConfig):
+    # for supporting some special model
+    if model_config.model_type == "mixtral":
+        model_config.moe_intermediate_size = model_config.intermediate_size
+    return model_config

 def optimize_and_load_gguf(module: nn.Module, rule_file: str, gguf_path: str, model_config: PretrainedConfig, default_device: str = "cuda:0"):
     with open(rule_file, 'r', encoding='utf-8') as f:
         rule_list = yaml.load(f.read(), Loader=yaml.FullLoader)
@@ -95,8 +121,11 @@ def optimize_and_load_gguf(module: nn.Module, rule_file: str, gguf_path: str, mo
     optimize_config = dict()
     gen_optimize_config(module, optimize_config, rule_list, default_device = default_device)

+    model_config = translate_model_config(model_config)
     gguf_loader = GGUFLoader(gguf_path)
     with torch.device("meta"):
         inject(module, optimize_config, model_config, gguf_loader)
     load_weights(module, gguf_loader)
+    model_config.gguf_loader = gguf_loader
     del_meta(module)
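gen_optimize_config now lets several YAML rules contribute to the same module: the first matching rule creates the entry, later rules can only replace a "default" class or merge extra kwargs, and per-device settings live in kwargs (generate_device/prefill_device) rather than a single device field. A toy illustration of that merge behaviour; the rule contents are made up and only the merge logic mirrors the diff:

import copy, re

def apply_rules(module_name, rules, default_device="cuda:0"):
    # Toy version of the rule merging in gen_optimize_config.
    out = {}
    for rule in rules:
        if re.search(rule["match"]["name"], module_name) is None:
            continue
        meta = rule.get("replace", {})
        if module_name not in out:
            out[module_name] = {"class": meta.get("class", "default"),
                                "kwargs": copy.deepcopy(meta.get("kwargs", {}))}
        else:
            if out[module_name]["class"] == "default":
                out[module_name]["class"] = meta.get("class", "default")
            out[module_name]["kwargs"].update(copy.deepcopy(meta.get("kwargs", {})))
    if module_name not in out:
        out[module_name] = {"class": "default",
                            "kwargs": {"generate_device": default_device,
                                       "prefill_device": default_device}}
    return out

# A layer-range rule sets the devices; a later class-specific rule only adds its class.
rules = [
    {"match": {"name": r"^model\.layers\.([1][0-9])\."},
     "replace": {"class": "default",
                 "kwargs": {"generate_device": "cuda:1", "prefill_device": "cuda:1"}}},
    {"match": {"name": r"^model\.layers\.([1][0-9])\.self_attn$"},
     "replace": {"class": "ktransformers.operators.attention.DeepseekV2AttentionInjected"}},
]
print(apply_rules("model.layers.12.self_attn", rules))
# -> class from the second rule, devices from the first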
ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat-multi-gpu-4.yaml (new file)

- match:
    name: "^model\\.layers\\.([0-9])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "(^model\\.layers\\.([1][0-9])\\.)"
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
- match:
    name: "(^model\\.layers\\.([2][0-9])\\.)"
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"
- match:
    name: "(^model\\.layers\\.([345][0-9])\\.)|(^model.norm)|(^lm_head)"
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"
- match:
    name: "^model\\.layers\\.([0-9])\\."
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([1][0-9])\\."
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
- match:
    name: "^model\\.layers\\.([2][0-9])\\."
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"
- match:
    name: "^model\\.layers\\.([345][0-9])\\."
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"
- match:
    name: "^model\\.layers\\.([1][0-9])\\.(?!self_attn).*$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformerLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
      generate_op: "QuantizedLinearMarlin"
      prefill_op: "QuantizedLinearTorch"
- match:
    name: "^model\\.layers\\.([1][0-9])\\.(?!self_attn).*$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformerLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
      generate_op: "QuantizedLinearMarlin"
      prefill_op: "QuantizedLinearTorch"
- match:
    name: "^model\\.layers\\.([2][0-9])\\.(?!self_attn).*$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformerLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"
      generate_op: "QuantizedLinearMarlin"
      prefill_op: "QuantizedLinearTorch"
- match:
    name: "^model\\.layers\\.([345][0-9])\\.(?!self_attn).*$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformerLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"
      generate_op: "QuantizedLinearMarlin"
      prefill_op: "QuantizedLinearTorch"
- match:
    name: "^model\\.layers\\.([0-9])\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.DeepseekV2MoEInjected  # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([1][0-9])\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.DeepseekV2MoEInjected  # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
- match:
    name: "^model\\.layers\\.([2][0-9])\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.DeepseekV2MoEInjected  # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"
- match:
    name: "^model\\.layers\\.([345][0-9])\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.DeepseekV2MoEInjected  # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"
- match:
    name: "^model\\.layers\\.([0-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersMLPExpert  # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda:0"
      prefill_mlp_type: "MLPExpertsTorch"
      generate_device: "cpu"
      generate_mlp_type: "MLPCPUExperts"
      out_device: "cuda:0"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\.([1][0-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersMLPExpert  # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda:1"
      prefill_mlp_type: "MLPExpertsTorch"
      generate_device: "cpu"
      generate_mlp_type: "MLPCPUExperts"
      out_device: "cuda:1"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\.([2][0-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersMLPExpert  # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda:2"
      prefill_mlp_type: "MLPExpertsTorch"
      generate_device: "cpu"
      generate_mlp_type: "MLPCPUExperts"
      out_device: "cuda:2"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\.([345][0-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersMLPExpert  # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda:3"
      prefill_mlp_type: "MLPExpertsTorch"
      generate_device: "cpu"
      generate_mlp_type: "MLPCPUExperts"
      out_device: "cuda:3"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\.([0-9])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.DeepseekV2AttentionInjected  # optimized MLA implementation
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([1][0-9])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.DeepseekV2AttentionInjected  # optimized MLA implementation
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
- match:
    name: "^model\\.layers\\.([2][0-9])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.DeepseekV2AttentionInjected  # optimized MLA implementation
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"
- match:
    name: "^model\\.layers\\.([345][0-9])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.DeepseekV2AttentionInjected  # optimized MLA implementation
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.layer_wise_prefill.DeepseekV2ModelKTransformers"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
      transfer_map:
        10: "cuda:1"
        20: "cuda:2"
        30: "cuda:3"
\ No newline at end of file
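The four-GPU rule file splits DeepSeek-V2's 60 layers across devices purely with name regexes (single-digit layers on cuda:0, 10-19 on cuda:1, 20-29 on cuda:2, 30-59 on cuda:3). A quick, standalone way to check which layer indices each pattern captures; this checker is not part of the commit:

import re

patterns = {
    "cuda:0": r"^model\.layers\.([0-9])\.",
    "cuda:1": r"^model\.layers\.([1][0-9])\.",
    "cuda:2": r"^model\.layers\.([2][0-9])\.",
    "cuda:3": r"^model\.layers\.([345][0-9])\.",
}

for device, pat in patterns.items():
    layers = [i for i in range(60) if re.search(pat, f"model.layers.{i}.self_attn")]
    print(device, layers)
# Expected: 0-9 on cuda:0, 10-19 on cuda:1, 20-29 on cuda:2, 30-59 on cuda:3.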
ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat-multi-gpu.yaml (new file)

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "(^model\\.layers\\.([345][0-9])\\.)|(model.norm)|(lm_head)"
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"
- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\."
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([345][0-9])\\."
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.(?!self_attn).*$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformerLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
      generate_op: "QuantizedLinearMarlin"
      prefill_op: "QuantizedLinearTorch"
- match:
    name: "^model\\.layers\\.([345][0-9])\\.(?!self_attn).*$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformerLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
      generate_op: "QuantizedLinearMarlin"
      prefill_op: "QuantizedLinearTorch"
- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.DeepseekV2MoEInjected  # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([345][0-9])\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.DeepseekV2MoEInjected  # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersMLPExpert  # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda:0"
      prefill_mlp_type: "MLPExpertsTorch"
      generate_device: "cpu"
      generate_mlp_type: "MLPCPUExperts"
      out_device: "cuda:0"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\.([345][0-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersMLPExpert  # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda:1"
      prefill_mlp_type: "MLPExpertsTorch"
      generate_device: "cpu"
      generate_mlp_type: "MLPCPUExperts"
      out_device: "cuda:1"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.DeepseekV2AttentionInjected  # optimized MLA implementation
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([345][0-9])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.DeepseekV2AttentionInjected  # optimized MLA implementation
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.layer_wise_prefill.DeepseekV2ModelKTransformers"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
      transfer_map:
        30: "cuda:1"
\ No newline at end of file
ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat.yaml

+- match:
+    name: "^model\\.layers\\..*\\.|^lm_head"
+  replace:
+    class: "default"
+    kwargs:
+      generate_device: "cuda"
+      prefill_device: "cuda"
 - match:
     class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
   replace:
@@ -21,12 +28,11 @@
     name: "^model\\.layers\\..*\\.mlp\\.experts$"
   replace:
     class: ktransformers.operators.experts.KTransformersMLPExpert  # custom MoE Kernel with expert paralleism
-    device: "cpu"   # which devices to load this module when initializing
     kwargs:
       prefill_device: "cuda"
       prefill_mlp_type: "MLPExpertsTorch"
       generate_device: "cpu"
-      generate_mlp_type: "MLPCPUExperts"
+      generate_mlp_type: "MLPCPUExperts"
+      out_device: "cuda"
   recursive: False # don't recursively inject submodules of this module
 - match:
@@ -36,6 +42,13 @@
 - match:
     name: "^model$"
   replace:
-    class: "ktransformers.operators.layer_wise_prefill.DeepseekV2ModelPerLayerPrefill"
+    class: "ktransformers.operators.layer_wise_prefill.DeepseekV2ModelKTransformers"
     kwargs:
       per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
+- match:
+    name: "^model.embed_tokens"
+  replace:
+    class: "default"
+    kwargs:
+      generate_device: "cpu"
+      prefill_device: "cpu"
\ No newline at end of file
ktransformers/optimize/optimize_rules/DeepSeek-V2-Lite-Chat-multi-gpu.yaml
0 → 100644
View file @ f5f79f5c
- match:
    name: "^model\\.layers\\.(0|[1-9])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "(^model\\.layers\\.([12][0-9])\\.)|(model.norm)|(lm_head)"
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"
- match:
    name: "^model\\.layers\\.(0|[1-9])\\."
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([12][0-9])\\."
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
- match:
    name: "^model\\.layers\\.(0|[1-9])\\.(?!self_attn).*$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformerLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
      generate_op: "QuantizedLinearMarlin"
      prefill_op: "QuantizedLinearTorch"
- match:
    name: "^model\\.layers\\.([12][0-9])\\.(?!self_attn).*$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformerLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
      generate_op: "QuantizedLinearMarlin"
      prefill_op: "QuantizedLinearTorch"
- match:
    name: "^model\\.layers\\.(0|[1-9])\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.DeepseekV2MoEInjected  # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([12][0-9])\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.DeepseekV2MoEInjected  # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
- match:
    name: "^model\\.layers\\.(0|[1-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersMLPExpert  # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda:0"
      prefill_mlp_type: "MLPExpertsTorch"
      generate_device: "cpu"
      generate_mlp_type: "MLPCPUExperts"
      out_device: "cuda:0"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\.([12][0-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersMLPExpert  # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda:1"
      prefill_mlp_type: "MLPExpertsTorch"
      generate_device: "cpu"
      generate_mlp_type: "MLPCPUExperts"
      out_device: "cuda:1"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\.(0|[1-9])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.DeepseekV2AttentionInjected  # optimized MLA implementation
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([12][0-9])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.DeepseekV2AttentionInjected  # optimized MLA implementation
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.layer_wise_prefill.DeepseekV2ModelKTransformers"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
      transfer_map:
        10: "cuda:1"
\ No newline at end of file
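The two layer regexes in this file split the decoder layers between the GPUs: "(0|[1-9])" catches layers 0-9 on cuda:0, while "([12][0-9])" catches layers 10-29, covering the rest of DeepSeek-V2-Lite's layers on cuda:1, which lines up with transfer_map handing activations to cuda:1 at layer 10. A quick sanity check of that partition (27 decoder layers is our assumption for the Lite model; adjust to the actual config):

# Sanity-check that the two layer regexes partition all layers with no overlap.
import re

GPU0 = re.compile(r"^model\.layers\.(0|[1-9])\.")
GPU1 = re.compile(r"(^model\.layers\.([12][0-9])\.)|(model.norm)|(lm_head)")

for i in range(27):  # assumed layer count for DeepSeek-V2-Lite
    name = f"model.layers.{i}.self_attn.q_proj"
    on0, on1 = bool(GPU0.search(name)), bool(GPU1.search(name))
    assert on0 != on1, f"layer {i} matched {'both' if on0 and on1 else 'neither'} device rules"
print("layers 0-9 -> cuda:0, layers 10-26 -> cuda:1")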
ktransformers/optimize/optimize_rules/Mixtral.yaml
0 → 100644
View file @ f5f79f5c
- match:
    name: "^model\\.layers\\..*\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    class: ktransformers.models.modeling_mixtral.MixtralRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.RotaryEmbedding
- match:
    name: "^model\\.layers\\..*$"
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformerLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "QuantizedLinearMarlin"
      prefill_op: "QuantizedLinearTorch"
- match:
    name: "^model\\.layers\\..*\\.block_sparse_moe$"
    class: ktransformers.models.modeling_mixtral.MixtralSparseMoeBlock
  replace:
    class: ktransformers.operators.experts.MisrtalSparseMoEBlockInjected
- match:
    name: "^model\\.layers\\..*\\.block_sparse_moe\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersMLPExpert
    kwargs:
      prefill_device: "cuda"
      prefill_mlp_type: "MLPExpertsTorch"
      generate_device: "cpu"
      generate_mlp_type: "MLPCPUExperts"
      out_device: "cuda"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"
ktransformers/optimize/optimize_rules/Qwen2-57B-A14B-Instruct-multi-gpu.yaml
0 → 100644
View file @ f5f79f5c
- match:
    name: "^model\\.layers\\.([012])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([012])\\."
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.RotaryEmbedding
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([012])$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformerLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
      generate_op: "QuantizedLinearMarlin"
      prefill_op: "QuantizedLinearTorch"
- match:
    name: "^model\\.layers\\.([012])\\.mlp$"
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeSparseMoeBlock
  replace:
    class: ktransformers.operators.experts.Qwen2MoeSparseMoeBlockInjected  # mlp module with custom forward function
- match:
    name: "^model\\.layers\\.([012])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersMLPExpert  # custom MoE Kernel with expert paralleism
    # device: "cpu" # which devices to load this module when initializing
    kwargs:
      prefill_device: "cuda:0"
      prefill_mlp_type: "MLPExpertsTorch"
      generate_device: "cpu"
      generate_mlp_type: "MLPCPUExperts"
      out_device: "cuda:0"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\.([12][0-9]|[3-9])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
- match:
    name: "^model\\.layers\\.([12][0-9]|[3-9])\\."
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.RotaryEmbedding
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
- match:
    name: "^model\\.layers\\.([12][0-9]|[3-9])$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformerLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
      generate_op: "QuantizedLinearMarlin"
      prefill_op: "QuantizedLinearTorch"
- match:
    name: "^model\\.layers\\.([12][0-9]|[3-9])\\.mlp$"
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeSparseMoeBlock
  replace:
    class: ktransformers.operators.experts.Qwen2MoeSparseMoeBlockInjected  # mlp module with custom forward function
- match:
    name: "^model\\.layers\\.([12][0-9]|[3-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersMLPExpert  # custom MoE Kernel with expert paralleism
    # device: "cpu" # which devices to load this module when initializing
    kwargs:
      prefill_device: "cuda:1"
      prefill_mlp_type: "MLPExpertsTorch"
      generate_device: "cpu"
      generate_mlp_type: "MLPCPUExperts"
      out_device: "cuda:1"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"
- match:
    name: "(^model.norm)|(^lm_head)"
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.layer_wise_prefill.Qwen2MoeModelKTransformers"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
      transfer_map:
        3: "cuda:1"
ktransformers/optimize/optimize_rules/Qwen2-57B-A14B-Instruct.yaml
View file @ f5f79f5c
- match:
    name: "^model\\.layers\\..*\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeRotaryEmbedding
  replace:
...
@@ -21,7 +28,7 @@
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersMLPExpert  # custom MoE Kernel with expert paralleism
-   device: "cpu" # which devices to load this module when initializing
+   # device: "cpu" # which devices to load this module when initializing
    kwargs:
      prefill_device: "cuda"
      prefill_mlp_type: "MLPExpertsTorch"
...
@@ -32,6 +39,13 @@
- match:
    name: "^model$"
  replace:
-   class: "ktransformers.operators.layer_wise_prefill.Qwen2MoeModelPerLayerPrefill"
+   class: "ktransformers.operators.layer_wise_prefill.Qwen2MoeModelKTransformers"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"
\ No newline at end of file
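The per_layer_prefill_intput_threshold kwarg (spelled this way in the code) reads as a prompt-length cutoff: layer-wise prefill is used only for prompts longer than the threshold, and 0 turns it off entirely, as the "# 0 is close layer wise prefill" comment suggests. A minimal sketch of that decision under this assumed interpretation:

# Assumed semantics of per_layer_prefill_intput_threshold (0 disables the feature).
def use_layer_wise_prefill(threshold: int, prompt_tokens: int) -> bool:
    return threshold > 0 and prompt_tokens >= threshold

assert not use_layer_wise_prefill(0, 8192)      # 0 closes layer-wise prefill
assert use_layer_wise_prefill(4096, 8192)       # long prompt: prefill layer by layer
assert not use_layer_wise_prefill(4096, 512)    # short prompt: normal prefill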
ktransformers/tests/dequant_gpu.py
View file @ f5f79f5c
This diff is collapsed.
ktransformers/tests/dequant_gpu_t.py
View file @ f5f79f5c
...
@@ -11,7 +11,7 @@ from ktransformers.operators.linear import KTransformerLinear, QuantizedLinearMa
from ktransformers.operators.experts import KTransformersMLPExpert, MLPExpertsTorch
from ktransformers.util.custom_gguf import GGUFLoader, dequantize_q4_k_gpu, dequantize_q4_k
import torch
-import CudaOps
+import KTransformersOps
torch.set_default_dtype(torch.bfloat16)
import time
from transformers import (
...
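This test imports both dequantize_q4_k (CPU reference) and dequantize_q4_k_gpu, alongside the renamed KTransformersOps extension. A comparison in the same spirit might look like the sketch below; the exact signatures of the two helpers are assumptions here, so treat it as a template for the check rather than the committed test.

# Hedged sketch: compare CPU and GPU Q4_K dequantization of the same raw block.
# The signatures of cpu_fn / gpu_fn (i.e. dequantize_q4_k / dequantize_q4_k_gpu) are assumptions.
import torch

def dequant_close(raw: bytes, cpu_fn, gpu_fn, atol: float = 1e-2) -> bool:
    ref = torch.as_tensor(cpu_fn(raw)).float()             # CPU reference result
    out = gpu_fn(raw, device="cuda").to("cpu").float()     # GPU kernel result, moved back to host
    return torch.allclose(ref, out, atol=atol)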