OpenDAS / ktransformers

Commit f5f79f5c authored Aug 12, 2024 by chenxl

[ADD] support multi-gpu qlen>1 q5_k

Parent: f2938031
Showing 20 changed files with 2780 additions and 180 deletions
ktransformers/ktransformers_ext/operators/llamafile/moe.cpp (+16 −16)
ktransformers/ktransformers_ext/operators/llamafile/shared_mem_buffer.cpp (+1 −1)
ktransformers/local_chat.py (+9 −4)
ktransformers/models/custom_cache.py (+11 −5)
ktransformers/models/modeling_deepseek.py (+3 −1)
ktransformers/models/modeling_mixtral.py (+1735 −0)
ktransformers/operators/RoPE.py (+14 −5)
ktransformers/operators/experts.py (+220 −72)
ktransformers/operators/layer_wise_prefill.py (+39 −35)
ktransformers/operators/linear.py (+5 −7)
ktransformers/optimize/optimize.py (+41 −12)
ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat-multi-gpu-4.yaml (+228 −0)
ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat-multi-gpu.yaml (+126 −0)
ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat.yaml (+16 −3)
ktransformers/optimize/optimize_rules/DeepSeek-V2-Lite-Chat-multi-gpu.yaml (+126 −0)
ktransformers/optimize/optimize_rules/Mixtral.yaml (+45 −0)
ktransformers/optimize/optimize_rules/Qwen2-57B-A14B-Instruct-multi-gpu.yaml (+111 −0)
ktransformers/optimize/optimize_rules/Qwen2-57B-A14B-Instruct.yaml (+16 −2)
ktransformers/tests/dequant_gpu.py (+17 −16)
ktransformers/tests/dequant_gpu_t.py (+1 −1)
ktransformers/ktransformers_ext/operators/llamafile/moe.cpp

@@ -9,7 +9,7 @@
 **/
 #include "moe.h"
 #include <iostream>
-#include "unistd.h"
+#include <cstdint>
 
 MOE::MOE(MOEConfig config) {
     config_ = config;
@@ -60,7 +60,7 @@ MOE::MOE(MOEConfig config) {
     m_local_pos_.resize(config_.group_max_len);
     for (int i = 0; i < config_.group_max_len; i++) {
-        m_local_pos_[i].reserve(config_.expert_num);
+        m_local_pos_[i].resize(config_.routed_expert_num);
     }
     m_local_num_.resize(config_.expert_num);
     m_local_gate_input_ptr_.resize(config_.expert_num);
@@ -125,10 +125,10 @@ void MOE::forward_one(int k, const uint64_t* expert_ids, const float* weights, c
        int expert_idx = task_id / nth;
        uint64_t expert_id = expert_ids[expert_idx];
        int ith = task_id % nth;
-       void* gate_proj_ptr = gate_proj_ + (expert_id * config_.intermediate_size + ith * config_.stride) * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type);
+       void* gate_proj_ptr = (uint8_t*)gate_proj_ + (expert_id * config_.intermediate_size + ith * config_.stride) * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type);
        float* gate_output_ptr = s_gate_output_[expert_idx] + ith * config_.stride;
        llamafile_sgemm(config_.stride, 1, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_proj_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_input_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_output_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.gate_type, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
-       void* up_proj_ptr = up_proj_ + (expert_id * config_.intermediate_size + ith * config_.stride) * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type);
+       void* up_proj_ptr = (uint8_t*)up_proj_ + (expert_id * config_.intermediate_size + ith * config_.stride) * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type);
        float* up_output_ptr = s_up_output_[expert_idx] + ith * config_.stride;
        llamafile_sgemm(config_.stride, 1, config_.hidden_size / ggml_blck_size(config_.up_type), up_proj_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_input_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_output_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.up_type, ggml_internal_get_type_traits(config_.up_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
        for (int i = ith * config_.stride; i < (ith + 1) * config_.stride; i++) {
@@ -153,7 +153,7 @@ void MOE::forward_one(int k, const uint64_t* expert_ids, const float* weights, c
        }
        for (int expert_idx = 0; expert_idx < k; expert_idx++) {
            uint64_t expert_id = expert_ids[expert_idx];
-           void* down_proj_ptr = down_proj_ + (expert_id * config_.hidden_size + ith * config_.stride) * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type);
+           void* down_proj_ptr = (uint8_t*)down_proj_ + (expert_id * config_.hidden_size + ith * config_.stride) * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type);
            float* down_output_ptr = s_down_output_[expert_idx] + ith * config_.stride;
            llamafile_sgemm(config_.stride, 1, config_.intermediate_size / ggml_blck_size(config_.down_type), down_proj_ptr, config_.intermediate_size / ggml_blck_size(config_.down_type), s_down_input_[expert_idx], config_.intermediate_size / ggml_blck_size(config_.down_type), down_output_ptr, config_.stride, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.down_type, ggml_internal_get_type_traits(config_.down_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
            for (int i = ith * config_.stride; i < (ith + 1) * config_.stride; i++) {
@@ -162,7 +162,7 @@ void MOE::forward_one(int k, const uint64_t* expert_ids, const float* weights, c
        }
        if (config_.stride % ggml_blck_size(config_.hidden_type) == 0) {
            float* output_fp32_ptr = s_output_fp32_ + ith * config_.stride;
-           void* output_ptr = output + ith * config_.stride * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
+           void* output_ptr = (uint8_t*)output + ith * config_.stride * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
            from_float(output_fp32_ptr, output_ptr, config_.stride, config_.hidden_type);
        }
    });
@@ -195,9 +195,9 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float*
        const void* gate_input_ptr;
        const void* up_input_ptr;
        if (config_.hidden_type == ggml_internal_get_type_traits(config_.gate_type).vec_dot_type && config_.hidden_type == ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
-           gate_input_ptr = up_input_ptr = input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
+           gate_input_ptr = up_input_ptr = (uint8_t*)input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
        } else {
-           to_float(input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), m_input_fp32_[i], config_.hidden_size, config_.hidden_type);
+           to_float((uint8_t*)input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), m_input_fp32_[i], config_.hidden_size, config_.hidden_type);
            if (ggml_internal_get_type_traits(config_.gate_type).vec_dot_type == ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
                from_float(m_input_fp32_[i], m_gate_input_[i], config_.hidden_size, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
                gate_input_ptr = up_input_ptr = m_gate_input_[i];
@@ -206,13 +206,13 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float*
                from_float(m_input_fp32_[i], m_gate_input_[i], config_.hidden_size, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type);
                gate_input_ptr = m_gate_input_[i];
            } else {
-               gate_input_ptr = input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
+               gate_input_ptr = (uint8_t*)input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
            }
            if (config_.hidden_type != ggml_internal_get_type_traits(config_.up_type).vec_dot_type) {
                from_float(m_input_fp32_[i], m_up_input_[i], config_.hidden_size, ggml_internal_get_type_traits(config_.up_type).vec_dot_type);
                up_input_ptr = m_up_input_[i];
            } else {
-               up_input_ptr = input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
+               up_input_ptr = (uint8_t*)input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type);
            }
        }
    }
@@ -227,11 +227,11 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float*
        int expert_idx = task_id / nth;
        int ith = task_id % nth;
        void* gate_input_ptr = m_local_gate_input_ptr_[expert_idx];
-       void* gate_proj_ptr = gate_proj_ + (expert_idx * config_.intermediate_size + ith * stride) * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type);
+       void* gate_proj_ptr = (uint8_t*)gate_proj_ + (expert_idx * config_.intermediate_size + ith * stride) * config_.hidden_size * ggml_type_size(config_.gate_type) / ggml_blck_size(config_.gate_type);
        float* gate_output_ptr = m_local_gate_output_ptr_[expert_idx] + ith * stride;
        llamafile_sgemm(stride, m_local_num_[expert_idx], config_.hidden_size / ggml_blck_size(config_.gate_type), gate_proj_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_input_ptr, config_.hidden_size / ggml_blck_size(config_.gate_type), gate_output_ptr, config_.intermediate_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.gate_type, ggml_internal_get_type_traits(config_.gate_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
        void* up_input_ptr = m_local_up_input_ptr_[expert_idx];
-       void* up_proj_ptr = up_proj_ + (expert_idx * config_.intermediate_size + ith * stride) * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type);
+       void* up_proj_ptr = (uint8_t*)up_proj_ + (expert_idx * config_.intermediate_size + ith * stride) * config_.hidden_size * ggml_type_size(config_.up_type) / ggml_blck_size(config_.up_type);
        float* up_output_ptr = m_local_up_output_ptr_[expert_idx] + ith * stride;
        llamafile_sgemm(stride, m_local_num_[expert_idx], config_.hidden_size / ggml_blck_size(config_.up_type), up_proj_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_input_ptr, config_.hidden_size / ggml_blck_size(config_.up_type), up_output_ptr, config_.intermediate_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.up_type, ggml_internal_get_type_traits(config_.up_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
        for (int i = 0; i < m_local_num_[expert_idx]; i++) {
@@ -249,7 +249,7 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float*
        int expert_idx = task_id / nth;
        int ith = task_id % nth;
        void* down_input_ptr = m_local_down_input_ptr_[expert_idx];
-       void* down_proj_ptr = down_proj_ + (expert_idx * config_.hidden_size + ith * stride) * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type);
+       void* down_proj_ptr = (uint8_t*)down_proj_ + (expert_idx * config_.hidden_size + ith * stride) * config_.intermediate_size * ggml_type_size(config_.down_type) / ggml_blck_size(config_.down_type);
        float* down_output_ptr = m_local_down_output_ptr_[expert_idx] + ith * stride;
        llamafile_sgemm(stride, m_local_num_[expert_idx], config_.intermediate_size / ggml_blck_size(config_.down_type), down_proj_ptr, config_.intermediate_size / ggml_blck_size(config_.down_type), down_input_ptr, config_.intermediate_size / ggml_blck_size(config_.down_type), down_output_ptr, config_.hidden_size, 0, 1, GGML_TASK_TYPE_COMPUTE, config_.down_type, ggml_internal_get_type_traits(config_.down_type).vec_dot_type, GGML_TYPE_F32, GGML_PREC_DEFAULT);
    });
@@ -262,18 +262,18 @@ void MOE::forward_many(int qlen, int k, const uint64_t* expert_ids, const float*
                m_output_fp32_[i][e] += m_local_down_output_ptr_[expert_ids[i * k + j]][m_local_pos_[i][j] * config_.hidden_size + e] * weights[i * k + j];
            }
        }
-       from_float(m_output_fp32_[i], output + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), config_.hidden_size, config_.hidden_type);
+       from_float(m_output_fp32_[i], (uint8_t*)output + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), config_.hidden_size, config_.hidden_type);
    });
 }
 
 void MOE::forward(int qlen, int k, const uint64_t* expert_ids, const float* weights, const void* input, void* output, Backend* backend) {
    if (qlen < config_.group_min_len) {
        for (int i = 0; i < qlen; i++) {
-           forward_one(k, expert_ids + i * k, weights + i * k, input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), output + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), backend);
+           forward_one(k, expert_ids + i * k, weights + i * k, (uint8_t*)input + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), (uint8_t*)output + i * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), backend);
        }
        return;
    }
    int forward_len = std::min(config_.group_max_len, qlen);
    forward_many(forward_len, k, expert_ids, weights, input, output, backend);
-   forward(qlen - forward_len, k, expert_ids + forward_len * k, weights + forward_len * k, input + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), output + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), backend);
+   forward(qlen - forward_len, k, expert_ids + forward_len * k, weights + forward_len * k, (uint8_t*)input + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), (uint8_t*)output + forward_len * config_.hidden_size * ggml_type_size(config_.hidden_type) / ggml_blck_size(config_.hidden_type), backend);
 }
\ No newline at end of file
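A note on the change that dominates this file: every weight-pointer computation now casts to `uint8_t*` before adding a byte offset. Arithmetic on `void*` is not valid standard C++ (GCC accepts it only as an extension that treats `void*` like `char*`), so the casts make the byte-granular offsets into the quantized expert buffers portable. A minimal self-contained sketch of the idiom (illustrative only, not repository code):

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    // Advance an opaque buffer by a byte count. Casting to uint8_t*
    // (1-byte elements) makes the pointer arithmetic well-defined.
    static void* byte_offset(void* base, std::size_t nbytes) {
        return (uint8_t*)base + nbytes;
    }

    int main() {
        uint8_t buf[64] = {0};
        // e.g. row 3 of a table whose packed rows are 16 bytes wide:
        void* row3 = byte_offset(buf, 3 * 16);
        std::printf("offset = %td bytes\n", (uint8_t*)row3 - buf);  // prints 48
    }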
ktransformers/ktransformers_ext/operators/llamafile/shared_mem_buffer.cpp

@@ -49,7 +49,7 @@ void SharedMemBuffer::dealloc(void* object) {
 void SharedMemBuffer::arrange(std::vector<std::pair<void**, uint64_t>> requests) {
     uint64_t offset = 0;
     for (auto& request : requests) {
-        *(request.first) = buffer_ + offset;
+        *(request.first) = (uint8_t*)buffer_ + offset;
         offset += request.second;
     }
 }
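The one-line change above is the same `void*` cast fix; the surrounding `arrange` is a plain bump allocation over a shared arena: each requester registers the address of its pointer plus a size, and receives consecutive byte ranges of `buffer_`. A self-contained sketch of the scheme (names are illustrative, not the ktransformers API):

    #include <cstdint>
    #include <cstdlib>
    #include <utility>
    #include <vector>

    // Hand out consecutive byte ranges of one arena. Each request is
    // (address of the caller's pointer, size in bytes).
    void arrange(void* arena, std::vector<std::pair<void**, uint64_t>>& requests) {
        uint64_t offset = 0;
        for (auto& req : requests) {
            *req.first = (uint8_t*)arena + offset;  // cast before byte arithmetic
            offset += req.second;
        }
    }

    int main() {
        void *gate, *up, *down;
        std::vector<std::pair<void**, uint64_t>> reqs = {
            {&gate, 1024}, {&up, 1024}, {&down, 4096}};
        void* arena = std::malloc(1024 + 1024 + 4096);
        arrange(arena, reqs);  // all three now point into one buffer
        std::free(arena);
    }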
ktransformers/local_chat.py (mode changed from 100644 to 100755)

@@ -31,18 +31,21 @@ import fire
 from ktransformers.optimize.optimize import optimize_and_load_gguf
 from ktransformers.models.modeling_deepseek import DeepseekV2ForCausalLM
 from ktransformers.models.modeling_qwen2_moe import Qwen2MoeForCausalLM
+from ktransformers.models.modeling_mixtral import MixtralForCausalLM
 from ktransformers.util.utils import prefill_and_generate
 from ktransformers.server.config.config import Config
 
 custom_models = {
     "DeepseekV2ForCausalLM": DeepseekV2ForCausalLM,
     "Qwen2MoeForCausalLM": Qwen2MoeForCausalLM,
+    "MixtralForCausalLM": MixtralForCausalLM,
 }
 
 ktransformer_rules_dir = os.path.dirname(os.path.abspath(__file__)) + "/optimize/optimize_rules/"
 default_optimize_rules = {
     "DeepseekV2ForCausalLM": ktransformer_rules_dir + "DeepSeek-V2-Chat.yaml",
     "Qwen2MoeForCausalLM": ktransformer_rules_dir + "Qwen2-57B-A14B-Instruct.yaml",
+    "MixtralForCausalLM": ktransformer_rules_dir + "Mixtral.yaml",
 }
 
 def local_chat(
@@ -50,7 +53,8 @@ def local_chat(
     optimize_rule_path: str = None,
     gguf_path: str = None,
     max_new_tokens: int = 1000,
-    cpu_infer: int = Config().cpu_infer
+    cpu_infer: int = Config().cpu_infer,
+    use_cuda_graph: bool = True,
 ):
     torch.set_grad_enabled(False)
@@ -64,6 +68,8 @@ def local_chat(
         print("using custom modeling_xxx.py.")
         if "Qwen2Moe" in config.architectures[0]:  # Qwen2Moe must use flash_attention_2 to avoid overflow.
             config._attn_implementation = "flash_attention_2"
+        if "Mixtral" in config.architectures[0]:
+            config._attn_implementation = "flash_attention_2"
         model = custom_models[config.architectures[0]](config)
     else:
         model = AutoModelForCausalLM.from_config(
@@ -100,7 +106,6 @@ def local_chat(
     while True:
         content = input("Chat: ")
-        # if content is num
         if content == "":
             content = "Please write a piece of quicksort code in C++."
@@ -109,7 +114,7 @@ def local_chat(
         messages, add_generation_prompt=True, return_tensors="pt"
     )
     torch.set_default_dtype(torch.bfloat16)  # TODO: Remove this, replace dtype using config
-    generated = prefill_and_generate(model, tokenizer, input_tensor.cuda(), max_new_tokens)
+    generated = prefill_and_generate(model, tokenizer, input_tensor.cuda(), max_new_tokens, use_cuda_graph)
 
 if __name__ == "__main__":
     fire.Fire(local_chat)
\ No newline at end of file
ktransformers/models/custom_cache.py

@@ -22,13 +22,14 @@ class StaticCache(transformers.StaticCache):
             The maximum batch size with which the model will be used.
         max_cache_len (`int`):
             The maximum sequence length with which the model will be used.
-        device (`torch.device`):
+        device (`torch.device` or `dict`):
             The device on which the cache should be initialized. Should be the same as the layer.
+            If a `dict`, it should contain the `device` key with the device name as the value.
         dtype (*optional*, defaults to `torch.float32`):
             The default `dtype` to use when initializing the layer.
     """
 
-    def __init__(self, config: PretrainedConfig, max_batch_size: int, max_cache_len: int, device, dtype=None) -> None:
+    def __init__(self, config: PretrainedConfig, max_batch_size: int, max_cache_len: int, device: torch.device | dict, dtype=None) -> None:
         Cache.__init__(self)
         self.max_batch_size = max_batch_size
         self.max_cache_len = config.max_position_embeddings if max_cache_len is None else max_cache_len
@@ -46,6 +47,7 @@ class StaticCache(transformers.StaticCache):
         self.value_cache: List[torch.Tensor] = []
         cache_shape = (max_batch_size, self.num_key_value_heads, self.max_cache_len, self.head_dim)
         if config.architectures[0] == "DeepseekV2ForCausalLM":
+            # TODO: for deepseek, cache_shape is different whether using Absorbed MLA, check it automatically
             # key_shape = (max_batch_size, self.num_key_value_heads, self.max_cache_len, config.qk_rope_head_dim + config.qk_nope_head_dim)
             # value_shape = (max_batch_size, self.num_key_value_heads, self.max_cache_len, config.v_head_dim)
             key_shape = (max_batch_size, 1, self.max_cache_len, config.qk_rope_head_dim)
@@ -56,11 +58,15 @@ class StaticCache(transformers.StaticCache):
         self.past_tokens = []
         self.num_hidden_layers = config.num_hidden_layers
-        for _ in range(self.num_hidden_layers):
+        for idx in range(self.num_hidden_layers):
             # Note: `mark_static_address` is used to tag the cache as an fixed data pointer, preventing cuda graph
             # breaks when updating the cache.
-            new_layer_key_cache = torch.zeros(key_shape, dtype=self.dtype, device=device)
-            new_layer_value_cache = torch.zeros(value_shape, dtype=self.dtype, device=device)
+            if isinstance(device, dict):
+                target_device = device[f"blk.{idx}.self_attn"]["generate_device"]
+            else:
+                target_device = device
+            new_layer_key_cache = torch.zeros(key_shape, dtype=self.dtype, device=target_device)
+            new_layer_value_cache = torch.zeros(value_shape, dtype=self.dtype, device=target_device)
             torch._dynamo.mark_static_address(new_layer_key_cache)
             torch._dynamo.mark_static_address(new_layer_value_cache)
             self.key_cache.append(new_layer_key_cache)
ktransformers/models/modeling_deepseek.py

@@ -1048,7 +1048,7 @@ class DeepseekV2FlashAttention2(DeepseekV2Attention):
         """
         Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
         first unpad the input, then computes the attention scores and pad the final attention scores.
-        Args:
+        # Args:
             query_states (`torch.Tensor`):
                 Input query states to be passed to Flash Attention API
             key_states (`torch.Tensor`):
@@ -1245,12 +1245,14 @@ class DeepseekV2DecoderLayer(nn.Module):
             cache_position=cache_position,
             **kwargs,
         )
         hidden_states = residual + hidden_states
 
         # Fully Connected
         residual = hidden_states
         hidden_states = self.post_attention_layernorm(hidden_states)
         hidden_states = self.mlp(hidden_states)
         hidden_states = residual + hidden_states
 
         outputs = (hidden_states,)
ktransformers/models/modeling_mixtral.py (new file, 0 → 100644)

This diff is collapsed.
ktransformers/operators/RoPE.py

@@ -10,6 +10,7 @@ from ktransformers.operators.base_operator import BaseInjectedModule
 from ktransformers.util.custom_gguf import GGUFLoader
 from ktransformers.util.utils import InferenceState
 from transformers.configuration_utils import PretrainedConfig
 
 # Copied from transformers.models.mixtral.modeling_mixtral.MixtralRotaryEmbedding with Mixtral->Qwen2Moe
 class RotaryEmbedding(BaseInjectedModule, DeepseekV2RotaryEmbedding):
     def __init__(self,
@@ -17,12 +18,16 @@ class RotaryEmbedding(BaseInjectedModule, DeepseekV2RotaryEmbedding):
                  gguf_loader: GGUFLoader,
                  config: PretrainedConfig,
                  orig_module: nn.Module,
-                 device: str = "cuda",
+                 #  device: str = "cuda",
+                 generate_device: str = "cuda",
+                 prefill_device: str = "cuda",
                  **kwargs):
-        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, device, **kwargs)
+        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs)
         self.orig_module.__init__(orig_module.dim,
                                   orig_module.max_position_embeddings,
                                   orig_module.base)
+        self.generate_device = generate_device
+        self.prefill_device = prefill_device
 
     def load(self):
         self.orig_module.__init__(self.orig_module.dim,
@@ -36,9 +41,11 @@ class YarnRotaryEmbedding(BaseInjectedModule, DeepseekV2YarnRotaryEmbedding):
                  gguf_loader: GGUFLoader,
                  config: PretrainedConfig,
                  orig_module: nn.Module,
-                 device: str = "cuda",
+                 #  device: str = "cuda",
+                 generate_device: str = "cuda",
+                 prefill_device: str = "cuda",
                  **kwargs):
-        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, device, **kwargs)
+        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs)
         self.orig_module.__init__(orig_module.dim,
                                   orig_module.max_position_embeddings,
                                   orig_module.base,
@@ -49,13 +56,15 @@ class YarnRotaryEmbedding(BaseInjectedModule, DeepseekV2YarnRotaryEmbedding):
                                   orig_module.beta_slow,
                                   orig_module.mscale,
                                   orig_module.mscale_all_dim)
+        self.generate_device = generate_device
+        self.prefill_device = prefill_device
 
     def load(self):
         self.orig_module.__init__(self.orig_module.dim,
                                   self.orig_module.max_position_embeddings,
                                   self.orig_module.base,
-                                  self.device,
+                                  self.generate_device,
                                   self.orig_module.scaling_factor,
                                   self.orig_module.original_max_position_embeddings,
                                   self.orig_module.beta_fast,
ktransformers/operators/experts.py

This diff is collapsed.
ktransformers/operators/layer_wise_prefill.py

@@ -6,7 +6,7 @@ Author : Azure-Tang
 Date : 2024-07-25 11:25:24
 Version : 1.0.0
 LastEditors : Azure
-LastEditTime : 2024-07-26 09:27:48
+LastEditTime : 2024-08-08 10:09:14
 Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 '''
@@ -45,6 +45,8 @@ from ktransformers.models.modeling_deepseek import BaseModelOutputWithPast, Deep
 from transformers.models.qwen2_moe.configuration_qwen2_moe import Qwen2MoeConfig
 from ktransformers.operators.base_operator import BaseInjectedModule
 from ktransformers.util.utils import InferenceState
+from ktransformers.util.custom_gguf import GGUFLoader
+from transformers.configuration_utils import PretrainedConfig
 
 if is_flash_attn_2_available():
     from flash_attn import flash_attn_func, flash_attn_varlen_func
@@ -73,34 +75,6 @@ QWEN2MOE_START_DOCSTRING = r"""
         [`~PreTrainedModel.from_pretrained`] method to load the model weights.
 """
 
-@add_start_docstrings(
-    "The bare Qwen2MoE Model outputting raw hidden-states without any specific head on top.",
-    QWEN2MOE_START_DOCSTRING,
-)
-class Qwen2MoePreTrainedModel(PreTrainedModel):
-    config_class = Qwen2MoeConfig
-    base_model_prefix = "model"
-    supports_gradient_checkpointing = True
-    _no_split_modules = ["Qwen2MoeDecoderLayer"]
-    _skip_keys_device_placement = "past_key_values"
-    _supports_flash_attn_2 = True
-    _supports_sdpa = True
-    _supports_cache_class = True
-    _supports_static_cache = True
-
-    def _init_weights(self, module):
-        std = self.config.initializer_range
-        if isinstance(module, nn.Linear):
-            module.weight.data.normal_(mean=0.0, std=std)
-            if module.bias is not None:
-                module.bias.data.zero_()
-        elif isinstance(module, nn.Embedding):
-            module.weight.data.normal_(mean=0.0, std=std)
-            if module.padding_idx is not None:
-                module.weight.data[module.padding_idx].zero_()
-
 QWEN2MOE_INPUTS_DOCSTRING = r"""
     Args:
         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
@@ -177,13 +151,11 @@ QWEN2MOE_INPUTS_DOCSTRING = r"""
             the complete sequence length.
 """
 
-from ktransformers.util.custom_gguf import GGUFLoader
-from transformers.configuration_utils import PretrainedConfig
 @add_start_docstrings(
     "The bare Qwen2MoE Model outputting raw hidden-states without any specific head on top.",
     QWEN2MOE_START_DOCSTRING,
 )
-class Qwen2MoeModelPerLayerPrefill(BaseInjectedModule):
+class Qwen2MoeModelKTransformers(BaseInjectedModule):
     """
     Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Qwen2MoeDecoderLayer`]
@@ -198,10 +170,13 @@ class Qwen2MoeModelPerLayerPrefill(BaseInjectedModule):
         orig_module: nn.Module,
         device: str = "cuda",
         per_layer_prefill_intput_threshold: int = 30000, # if None, no per-layer prefill
+        transfer_map: dict = None,
         **kwargs,
     ):
         BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, device, **kwargs)
         self.per_layer_prefill_intput_threshold = per_layer_prefill_intput_threshold
+        self.transfer_map = transfer_map
+        self.stream_device_map = dict()
 
     @add_start_docstrings_to_model_forward(QWEN2MOE_INPUTS_DOCSTRING)
     def forward(
@@ -287,7 +262,20 @@ class Qwen2MoeModelPerLayerPrefill(BaseInjectedModule):
         all_router_logits = () if output_router_logits else None
         next_decoder_cache = None
 
-        for decoder_layer in self.layers:
+        for i, decoder_layer in enumerate(self.layers):
+            if self.transfer_map is not None and i in self.transfer_map:
+                prev_stream = torch.cuda.current_stream()
+                cur_device = self.transfer_map[i]
+                if cur_device not in self.stream_device_map:
+                    self.stream_device_map[cur_device] = torch.cuda.Stream(cur_device)
+                torch.cuda.set_device(cur_device)
+                self.stream_device_map[cur_device].wait_stream(prev_stream)
+                torch.cuda.set_stream(self.stream_device_map[cur_device])
+                hidden_states = hidden_states.to(self.transfer_map[i], non_blocking=True)
+                causal_mask = causal_mask.to(self.transfer_map[i], non_blocking=True) if causal_mask is not None else None
+                position_ids = position_ids.to(self.transfer_map[i], non_blocking=True) if position_ids is not None else None
+                cache_position = cache_position.to(self.transfer_map[i], non_blocking=True) if cache_position is not None else None
             if output_hidden_states:
                 all_hidden_states += (hidden_states,)
@@ -463,7 +451,7 @@ DeepseekV2_INPUTS_DOCSTRING = r"""
 """
 
-class DeepseekV2ModelPerLayerPrefill(BaseInjectedModule):
+class DeepseekV2ModelKTransformers(BaseInjectedModule):
     """
     Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`DeepseekV2DecoderLayer`]
@@ -478,10 +466,13 @@ class DeepseekV2ModelPerLayerPrefill(BaseInjectedModule):
         orig_module: nn.Module,
         device: str = "cuda",
         per_layer_prefill_intput_threshold: int = 30000, # if None, no per-layer prefill
+        transfer_map: dict = None,
         **kwargs,
     ):
         BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, device, **kwargs)
         self.per_layer_prefill_intput_threshold = per_layer_prefill_intput_threshold
+        self.transfer_map = transfer_map
+        self.stream_device_map = dict()
 
     @add_start_docstrings_to_model_forward(DeepseekV2_INPUTS_DOCSTRING)
     def forward(
@@ -584,7 +575,20 @@ class DeepseekV2ModelPerLayerPrefill(BaseInjectedModule):
         t_cpu = 0
         t_f = 0
-        for decoder_layer in self.layers:
+        for i, decoder_layer in enumerate(self.layers):
+            if self.transfer_map is not None and i in self.transfer_map:
+                prev_stream = torch.cuda.current_stream()
+                cur_device = self.transfer_map[i]
+                if cur_device not in self.stream_device_map:
+                    self.stream_device_map[cur_device] = torch.cuda.Stream(cur_device)
+                torch.cuda.set_device(cur_device)
+                self.stream_device_map[cur_device].wait_stream(prev_stream)
+                torch.cuda.set_stream(self.stream_device_map[cur_device])
+                hidden_states = hidden_states.to(self.transfer_map[i], non_blocking=True)
+                causal_mask = causal_mask.to(self.transfer_map[i], non_blocking=True) if causal_mask is not None else None
+                position_ids = position_ids.to(self.transfer_map[i], non_blocking=True) if position_ids is not None else None
+                cache_position = cache_position.to(self.transfer_map[i], non_blocking=True) if cache_position is not None else None
             if output_hidden_states:
                 all_hidden_states += (hidden_states,)
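The `transfer_map` additions are the heart of the multi-GPU support in this commit: when layer `i` is mapped to a new device, the loop lazily creates a stream on that device, makes it wait on the stream that produced the previous layer's activations, switches the current stream, and moves the hidden states and attention metadata across with non-blocking copies. For intuition, a rough CUDA-runtime sketch of the same producer/consumer handoff (illustrative only; the repository does this through `torch.cuda` streams, and a direct copy additionally assumes peer access is enabled):

    #include <cuda_runtime.h>

    // Move an activation buffer from src_dev to dst_dev without a host sync:
    // record an event after the producer's queued work, make the consumer's
    // stream wait on it on the GPU side, then issue an async peer copy.
    void handoff(const void* src, int src_dev, void* dst, int dst_dev,
                 size_t nbytes, cudaStream_t src_stream, cudaStream_t dst_stream) {
        cudaEvent_t ready;
        cudaSetDevice(src_dev);
        cudaEventCreateWithFlags(&ready, cudaEventDisableTiming);
        cudaEventRecord(ready, src_stream);         // producer done up to here
        cudaSetDevice(dst_dev);
        cudaStreamWaitEvent(dst_stream, ready, 0);  // device-side wait, no CPU stall
        cudaMemcpyPeerAsync(dst, dst_dev, src, src_dev, nbytes, dst_stream);
        cudaEventDestroy(ready);                    // release deferred until event completes
    }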
ktransformers/operators/linear.py

@@ -176,7 +176,7 @@ class QuantizedLinearMarlin(QuantizedLinearBase):
         self.act_order = act_order
         self.is_k_full = is_k_full
 
-    def load(self, w: dict | nn.Parameter | tuple | None = None, device: str | None = "cuda"):
+    def load(self, w: dict | nn.Parameter | tuple | None = None, device: str | None = None):
         if device is None: device = self.device
         assert device.lower() != "cpu", "Marlin quantized linear only supports GPU device"
         if w is None: w = self.load_weight(device=device)
@@ -200,7 +200,7 @@ class QuantizedLinearMarlin(QuantizedLinearBase):
             weight, self.num_bits, self.group_size, self.act_order
         )
         self.workspace = MarlinWorkspace(
-            self.out_features, GPTQ_MARLIN_MIN_THREAD_N, GPTQ_MARLIN_MAX_PARALLEL
+            self.out_features, GPTQ_MARLIN_MIN_THREAD_N, GPTQ_MARLIN_MAX_PARALLEL, self.device
         )
         self.marlin_q_w = marlin_q_w
         self.marlin_s = marlin_s
@@ -247,7 +247,6 @@ class QuantizedLinearMarlin(QuantizedLinearBase):
 LINEAR_MAP = {
     "QuantizedLinearMarlin": QuantizedLinearMarlin,
     "QuantizedLinearTorch": QuantizedLinearTorch,
-    "QuantizedLinearTorch": QuantizedLinearTorch,
 }
 
 class KTransformerLinear(BaseInjectedModule, QuantizedLinearBase):
@@ -257,15 +256,15 @@ class KTransformerLinear(BaseInjectedModule, QuantizedLinearBase):
         gguf_loader: GGUFLoader,
         config: PretrainedConfig,
         orig_module: nn.Module,
-        device: str = "cuda",
+        #  device: str = "cuda",
         generate_device: str = "cuda",
         generate_op: str | None = "QuantizedLinearMarlin",
         prefill_device: str = "cuda",
         prefill_op: str | None = "QuantizedLinearTorch",
         **kwargs,
     ):
-        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, device, **kwargs)
-        QuantizedLinearBase.__init__(self, key, gguf_loader, config, orig_module, device, **kwargs)
+        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs)
+        QuantizedLinearBase.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs)
         # build all the linear operators
         if prefill_op is not None:
             assert prefill_op in LINEAR_MAP, f"linear_type {prefill_op} not supported"
@@ -289,7 +288,6 @@ class KTransformerLinear(BaseInjectedModule, QuantizedLinearBase):
             self.generate_linear = LINEAR_MAP[generate_op](key, gguf_loader, config, orig_module, generate_device, **kwargs)
         else:
             self.generate_linear = None
-        self.device = device
         self.mode = InferenceState.UNLOAD
 
     def forward(self, x):
ktransformers/optimize/optimize.py
View file @
f5f79f5c
'''
'''
Description :
Description :
Author : Boxin Zhang
Author : Boxin Zhang
, Azure-Tang
Version : 0.1.0
Version : 0.1.0
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
'''
'''
...
@@ -15,6 +15,7 @@ from transformers.configuration_utils import PretrainedConfig
...
@@ -15,6 +15,7 @@ from transformers.configuration_utils import PretrainedConfig
from
ktransformers.util.custom_gguf
import
GGUFLoader
,
translate_name_to_gguf
from
ktransformers.util.custom_gguf
import
GGUFLoader
,
translate_name_to_gguf
from
ktransformers.util.utils
import
set_module
,
load_weights
from
ktransformers.util.utils
import
set_module
,
load_weights
import
itertools
import
itertools
import
copy
def
inject
(
module
,
local_optimization_dict
,
model_config
:
AutoConfig
,
gguf_loader
:
GGUFLoader
,
prefix
=
''
):
def
inject
(
module
,
local_optimization_dict
,
model_config
:
AutoConfig
,
gguf_loader
:
GGUFLoader
,
prefix
=
''
):
for
name
,
child
in
module
.
_modules
.
items
():
for
name
,
child
in
module
.
_modules
.
items
():
...
@@ -22,18 +23,20 @@ def inject(module, local_optimization_dict, model_config:AutoConfig ,gguf_loader
...
@@ -22,18 +23,20 @@ def inject(module, local_optimization_dict, model_config:AutoConfig ,gguf_loader
child_prefix
=
prefix
+
name
child_prefix
=
prefix
+
name
if
child_prefix
in
local_optimization_dict
:
if
child_prefix
in
local_optimization_dict
:
inject_module_meta
=
local_optimization_dict
[
child_prefix
]
inject_module_meta
=
local_optimization_dict
[
child_prefix
]
if
isinstance
(
inject_module_meta
,
Mapping
)
:
if
inject_module_meta
[
"class"
]
!=
"default"
:
import_path
=
inject_module_meta
[
"class"
].
split
(
"."
)
import_path
=
inject_module_meta
[
"class"
].
split
(
"."
)
import_module_name
=
"."
.
join
(
import_path
[:
-
1
])
import_module_name
=
"."
.
join
(
import_path
[:
-
1
])
gguf_loader
.
tensor_device_map
[
inject_module_meta
[
"key"
]]
=
inject_module_meta
[
"kwargs"
]
if
"kwargs"
in
inject_module_meta
else
dict
()
import_class_name
=
import_path
[
-
1
]
import_class_name
=
import_path
[
-
1
]
module_cls
=
getattr
(
__import__
(
import_module_name
,
fromlist
=
[
""
]),
import_class_name
)
module_cls
=
getattr
(
__import__
(
import_module_name
,
fromlist
=
[
""
]),
import_class_name
)
print
(
f
"Injecting
{
child_prefix
}
as"
,
import_module_name
,
"."
,
import_class_name
)
print
(
f
"Injecting
{
child_prefix
}
as"
,
import_module_name
,
"."
,
import_class_name
)
inject_module
=
module_cls
(
key
=
inject_module_meta
[
"key"
],
gguf_loader
=
gguf_loader
,
config
=
model_config
,
orig_module
=
child
,
device
=
inject_module_meta
[
"device"
],
**
inject_module_meta
[
"kwargs"
])
inject_module
=
module_cls
(
key
=
inject_module_meta
[
"key"
],
gguf_loader
=
gguf_loader
,
config
=
model_config
,
orig_module
=
child
,
**
inject_module_meta
[
"kwargs"
])
set_module
(
module
,
name
,
inject_module
)
set_module
(
module
,
name
,
inject_module
)
elif
isinstance
(
inject_module_meta
,
str
):
elif
inject_module_meta
[
"class"
]
==
"default"
:
assert
inject_module_meta
==
"default"
,
"for str inject_module_meta, only support
\"
default
\"
."
print
(
f
"Injecting
{
child_prefix
}
as default"
)
gguf_loader
.
tensor_device_map
[
inject_module_meta
[
"key"
]]
=
inject_module_meta
[
"kwargs"
]
if
"kwargs"
in
inject_module_meta
else
dict
()
else
:
else
:
raise
Exception
(
"inject_module_meta
must be a dict or str
"
)
raise
Exception
(
"inject_module_meta
[
\"
class
\"
] must be
\"
default
\"
or a class path
"
)
child_prefix
+=
"."
child_prefix
+=
"."
child_optimization_dict
=
{
k
:
v
for
k
,
v
in
local_optimization_dict
.
items
()
if
k
.
startswith
(
child_prefix
)}
child_optimization_dict
=
{
k
:
v
for
k
,
v
in
local_optimization_dict
.
items
()
if
k
.
startswith
(
child_prefix
)}
inject
(
child
,
child_optimization_dict
,
model_config
,
gguf_loader
,
child_prefix
)
inject
(
child
,
child_optimization_dict
,
model_config
,
gguf_loader
,
child_prefix
)
...
@@ -57,6 +60,8 @@ def gen_optimize_config(module: nn.Module, out_data: Mapping, rule_list: List, p
     for rule in rule_list:
         #print(rule)
         match_meta = rule["match"]
+        if "class" not in match_meta and "name" not in match_meta:
+            raise Exception("match must have at least one of \"class\" and \"name\"")
         if "class" in match_meta:
             import_path = match_meta["class"].split(".")
             import_module_name = ".".join(import_path[:-1])
...
@@ -67,16 +72,29 @@ def gen_optimize_config(module: nn.Module, out_data: Mapping, rule_list: List, p
         if "name" in match_meta:
             if re.search(match_meta["name"], module_name) is None:
                 continue
-        replace_meta = rule["replace"]
-        out_data[module_name] = {"key": translated_name,
-                                 "class": replace_meta["class"],
-                                 "device": replace_meta["device"] if "device" in replace_meta else default_device,
-                                 "kwargs": replace_meta["kwargs"] if "kwargs" in replace_meta else dict()}
+        if "replace" not in rule:
+            raise Exception("replace must be in rule")
+        if "replace" in rule:
+            replace_meta = rule["replace"]
+            if module_name not in out_data:
+                out_data[module_name] = {"key": translated_name,
+                                         "class": replace_meta["class"] if "class" in replace_meta else "default",
+                                         # "device": replace_meta["device"] if "device" in replace_meta else default_device,
+                                         "kwargs": copy.deepcopy(replace_meta["kwargs"]) if "kwargs" in replace_meta else dict()}
+            else:
+                if out_data[module_name]["class"] == "default":
+                    out_data[module_name]["class"] = replace_meta["class"] if "class" in replace_meta else "default"
+                out_data[module_name]["kwargs"].update(copy.deepcopy(replace_meta["kwargs"]) if "kwargs" in replace_meta else dict())
         if "recursive" in rule:
             recursive = bool(rule["recursive"])
     if module_name not in out_data:
-        out_data[module_name] = "default"
+        out_data[module_name] = {"class": "default", "key": translated_name, "kwargs": {"generate_device": default_device, "prefill_device": default_device}}
     #print(out_data[module_name])
     #input()
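The merge semantics above are worth spelling out: the first rule to name a module fixes its class unless that class is "default", later matching rules can only fill the class in or extend the kwargs, and copy.deepcopy keeps rules from sharing one mutable kwargs dict. A small self-contained sketch of the same logic (module and rule contents are hypothetical):

    import copy

    out_data = {}
    # first matching rule assigns placement only
    out_data["model.layers.0.mlp"] = {"key": "blk.0.ffn", "class": "default",
                                      "kwargs": {"generate_device": "cuda:0"}}
    # a later rule supplies the real class plus extra kwargs
    replace_meta = {"class": "ktransformers.operators.experts.DeepseekV2MoEInjected",
                    "kwargs": {"prefill_device": "cuda:0"}}
    entry = out_data["model.layers.0.mlp"]
    if entry["class"] == "default":
        entry["class"] = replace_meta["class"]
    entry["kwargs"].update(copy.deepcopy(replace_meta["kwargs"]))
    # entry now holds the injected class and the union of both kwargs dicts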
...
@@ -88,6 +106,14 @@ def gen_optimize_config(module: nn.Module, out_data: Mapping, rule_list: List, p
             gen_optimize_config(child, out_data, rule_list, child_prefix)
 
+def translate_model_config(model_config: PretrainedConfig):
+    # for supporting some special models
+    if model_config.model_type == "mixtral":
+        model_config.moe_intermediate_size = model_config.intermediate_size
+    return model_config
+
 def optimize_and_load_gguf(module: nn.Module, rule_file: str, gguf_path: str, model_config: PretrainedConfig, default_device: str = "cuda:0"):
     with open(rule_file, 'r', encoding='utf-8') as f:
         rule_list = yaml.load(f.read(), Loader=yaml.FullLoader)
...
@@ -95,8 +121,11 @@ def optimize_and_load_gguf(module: nn.Module, rule_file: str, gguf_path: str, mo
     optimize_config = dict()
     gen_optimize_config(module, optimize_config, rule_list, default_device = default_device)
 
+    model_config = translate_model_config(model_config)
+
     gguf_loader = GGUFLoader(gguf_path)
     with torch.device("meta"):
         inject(module, optimize_config, model_config, gguf_loader)
     load_weights(module, gguf_loader)
+    model_config.gguf_loader = gguf_loader
     del_meta(module)
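For anyone exercising the new rule files below, the entry point is unchanged. A sketch of the expected call sequence with placeholder paths, under the assumption (consistent with the meta-device handling above) that the model skeleton is built without real weights first:

    import torch
    from transformers import AutoConfig, AutoModelForCausalLM
    from ktransformers.optimize.optimize import optimize_and_load_gguf

    config = AutoConfig.from_pretrained("/path/to/DeepSeek-V2-Chat", trust_remote_code=True)
    with torch.device("meta"):  # build the skeleton without allocating real weights
        model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)
    optimize_and_load_gguf(model,
                           "ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat-multi-gpu.yaml",
                           "/path/to/gguf_dir", config)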
ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat-multi-gpu-4.yaml (new file, mode 100644)
View file @ f5f79f5c
- match:
    name: "^model\\.layers\\.([0-9])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "(^model\\.layers\\.([1][0-9])\\.)"
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
- match:
    name: "(^model\\.layers\\.([2][0-9])\\.)"
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"
- match:
    name: "(^model\\.layers\\.([345][0-9])\\.)|(^model.norm)|(^lm_head)"
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"
- match:
    name: "^model\\.layers\\.([0-9])\\."
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([1][0-9])\\."
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
- match:
    name: "^model\\.layers\\.([2][0-9])\\."
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"
- match:
    name: "^model\\.layers\\.([345][0-9])\\."
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"
- match:
    name: "^model\\.layers\\.([0-9])\\.(?!self_attn).*$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformerLinear  # optimized kernel on quantized data types
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
      generate_op: "QuantizedLinearMarlin"
      prefill_op: "QuantizedLinearTorch"
- match:
    name: "^model\\.layers\\.([1][0-9])\\.(?!self_attn).*$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformerLinear  # optimized kernel on quantized data types
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
      generate_op: "QuantizedLinearMarlin"
      prefill_op: "QuantizedLinearTorch"
- match:
    name: "^model\\.layers\\.([2][0-9])\\.(?!self_attn).*$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformerLinear  # optimized kernel on quantized data types
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"
      generate_op: "QuantizedLinearMarlin"
      prefill_op: "QuantizedLinearTorch"
- match:
    name: "^model\\.layers\\.([345][0-9])\\.(?!self_attn).*$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformerLinear  # optimized kernel on quantized data types
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"
      generate_op: "QuantizedLinearMarlin"
      prefill_op: "QuantizedLinearTorch"
- match:
    name: "^model\\.layers\\.([0-9])\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.DeepseekV2MoEInjected  # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([1][0-9])\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.DeepseekV2MoEInjected  # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
- match:
    name: "^model\\.layers\\.([2][0-9])\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.DeepseekV2MoEInjected  # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"
- match:
    name: "^model\\.layers\\.([345][0-9])\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.DeepseekV2MoEInjected  # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"
- match:
    name: "^model\\.layers\\.([0-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersMLPExpert  # custom MoE kernel with expert parallelism
    kwargs:
      prefill_device: "cuda:0"
      prefill_mlp_type: "MLPExpertsTorch"
      generate_device: "cpu"
      generate_mlp_type: "MLPCPUExperts"
      out_device: "cuda:0"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\.([1][0-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersMLPExpert  # custom MoE kernel with expert parallelism
    kwargs:
      prefill_device: "cuda:1"
      prefill_mlp_type: "MLPExpertsTorch"
      generate_device: "cpu"
      generate_mlp_type: "MLPCPUExperts"
      out_device: "cuda:1"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\.([2][0-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersMLPExpert  # custom MoE kernel with expert parallelism
    kwargs:
      prefill_device: "cuda:2"
      prefill_mlp_type: "MLPExpertsTorch"
      generate_device: "cpu"
      generate_mlp_type: "MLPCPUExperts"
      out_device: "cuda:2"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\.([345][0-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersMLPExpert  # custom MoE kernel with expert parallelism
    kwargs:
      prefill_device: "cuda:3"
      prefill_mlp_type: "MLPExpertsTorch"
      generate_device: "cpu"
      generate_mlp_type: "MLPCPUExperts"
      out_device: "cuda:3"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\.([0-9])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.DeepseekV2AttentionInjected  # optimized MLA implementation
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([1][0-9])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.DeepseekV2AttentionInjected  # optimized MLA implementation
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
- match:
    name: "^model\\.layers\\.([2][0-9])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.DeepseekV2AttentionInjected  # optimized MLA implementation
    kwargs:
      generate_device: "cuda:2"
      prefill_device: "cuda:2"
- match:
    name: "^model\\.layers\\.([345][0-9])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.DeepseekV2AttentionInjected  # optimized MLA implementation
    kwargs:
      generate_device: "cuda:3"
      prefill_device: "cuda:3"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.layer_wise_prefill.DeepseekV2ModelKTransformers"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 disables layer-wise prefill
      transfer_map:
        10: "cuda:1"
        20: "cuda:2"
        30: "cuda:3"
\ No newline at end of file
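The transfer_map at the end is what stitches the four-way split together: the injected model forward is expected to move activations to the mapped device right before the first layer that lives there. A rough, self-contained sketch of that dispatch (our reading of the mechanism; the real loop lives in ktransformers.operators.layer_wise_prefill and may differ in detail):

    # transfer_map from the rule above: first layer index served by each new GPU
    transfer_map = {10: "cuda:1", 20: "cuda:2", 30: "cuda:3"}

    def run_layers(layers, hidden_states):
        for i, layer in enumerate(layers):
            if i in transfer_map:  # crossing a device boundary
                hidden_states = hidden_states.to(transfer_map[i])
            hidden_states = layer(hidden_states)
        return hidden_states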
ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat-multi-gpu.yaml (new file, mode 100644)
View file @ f5f79f5c
- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "(^model\\.layers\\.([345][0-9])\\.)|(model.norm)|(lm_head)"
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"
- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\."
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([345][0-9])\\."
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.(?!self_attn).*$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformerLinear  # optimized kernel on quantized data types
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
      generate_op: "QuantizedLinearMarlin"
      prefill_op: "QuantizedLinearTorch"
- match:
    name: "^model\\.layers\\.([345][0-9])\\.(?!self_attn).*$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformerLinear  # optimized kernel on quantized data types
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
      generate_op: "QuantizedLinearMarlin"
      prefill_op: "QuantizedLinearTorch"
- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.DeepseekV2MoEInjected  # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([345][0-9])\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.DeepseekV2MoEInjected  # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersMLPExpert  # custom MoE kernel with expert parallelism
    kwargs:
      prefill_device: "cuda:0"
      prefill_mlp_type: "MLPExpertsTorch"
      generate_device: "cpu"
      generate_mlp_type: "MLPCPUExperts"
      out_device: "cuda:0"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\.([345][0-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersMLPExpert  # custom MoE kernel with expert parallelism
    kwargs:
      prefill_device: "cuda:1"
      prefill_mlp_type: "MLPExpertsTorch"
      generate_device: "cpu"
      generate_mlp_type: "MLPCPUExperts"
      out_device: "cuda:1"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.DeepseekV2AttentionInjected  # optimized MLA implementation
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([345][0-9])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.DeepseekV2AttentionInjected  # optimized MLA implementation
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.layer_wise_prefill.DeepseekV2ModelKTransformers"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 disables layer-wise prefill
      transfer_map:
        30: "cuda:1"
\ No newline at end of file
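The two layer groups in this file must cover DeepSeek-V2's 60 layers exactly once, and gen_optimize_config matches them with re.search; a short check like the following (ours, not part of the commit) is a cheap guard when editing these patterns:

    import re

    pats = [r"^model\.layers\.(0|[1-9]|[12][0-9])\.", r"^model\.layers\.([345][0-9])\."]
    for i in range(60):
        hits = [p for p in pats if re.search(p, f"model.layers.{i}.")]
        assert len(hits) == 1, (i, hits)  # every layer lands on exactly one GPU group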
ktransformers/optimize/optimize_rules/DeepSeek-V2-Chat.yaml
View file @ f5f79f5c
+- match:
+    name: "^model\\.layers\\..*\\.|^lm_head"
+  replace:
+    class: "default"
+    kwargs:
+      generate_device: "cuda"
+      prefill_device: "cuda"
 - match:
     class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
   replace:
...
@@ -21,12 +28,11 @@
     name: "^model\\.layers\\..*\\.mlp\\.experts$"
   replace:
     class: ktransformers.operators.experts.KTransformersMLPExpert  # custom MoE kernel with expert parallelism
-    device: "cpu"  # which devices to load this module when initializing
     kwargs:
       prefill_device: "cuda"
       prefill_mlp_type: "MLPExpertsTorch"
       generate_device: "cpu"
       generate_mlp_type: "MLPCPUExperts"
       out_device: "cuda"
   recursive: False # don't recursively inject submodules of this module
 - match:
...
@@ -36,6 +42,13 @@
 - match:
     name: "^model$"
   replace:
-    class: "ktransformers.operators.layer_wise_prefill.DeepseekV2ModelPerLayerPrefill"
+    class: "ktransformers.operators.layer_wise_prefill.DeepseekV2ModelKTransformers"
     kwargs:
       per_layer_prefill_intput_threshold: 0 # 0 disables layer-wise prefill
+- match:
+    name: "^model.embed_tokens"
+  replace:
+    class: "default"
+    kwargs:
+      generate_device: "cpu"
+      prefill_device: "cpu"
\ No newline at end of file
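On the class rename above: per_layer_prefill_intput_threshold keeps its misspelled key because that is the kwarg name the code reads. It presumably gates layer-wise prefill on prompt length, with 0 leaving the feature off, roughly as in the sketch below (our reading, not code from this commit):

    def use_layer_wise_prefill(threshold: int, prompt_tokens: int) -> bool:
        # 0 disables the feature; otherwise only prompts longer than the
        # threshold justify streaming weights through the GPU layer by layer.
        return threshold > 0 and prompt_tokens > threshold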
ktransformers/optimize/optimize_rules/DeepSeek-V2-Lite-Chat-multi-gpu.yaml (new file, mode 100644)
View file @ f5f79f5c
- match:
    name: "^model\\.layers\\.(0|[1-9])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "(^model\\.layers\\.([12][0-9])\\.)|(model.norm)|(lm_head)"
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"
- match:
    name: "^model\\.layers\\.(0|[1-9])\\."
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([12][0-9])\\."
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
- match:
    name: "^model\\.layers\\.(0|[1-9])\\.(?!self_attn).*$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformerLinear  # optimized kernel on quantized data types
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
      generate_op: "QuantizedLinearMarlin"
      prefill_op: "QuantizedLinearTorch"
- match:
    name: "^model\\.layers\\.([12][0-9])\\.(?!self_attn).*$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformerLinear  # optimized kernel on quantized data types
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
      generate_op: "QuantizedLinearMarlin"
      prefill_op: "QuantizedLinearTorch"
- match:
    name: "^model\\.layers\\.(0|[1-9])\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.DeepseekV2MoEInjected  # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([12][0-9])\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.DeepseekV2MoEInjected  # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
- match:
    name: "^model\\.layers\\.(0|[1-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersMLPExpert  # custom MoE kernel with expert parallelism
    kwargs:
      prefill_device: "cuda:0"
      prefill_mlp_type: "MLPExpertsTorch"
      generate_device: "cpu"
      generate_mlp_type: "MLPCPUExperts"
      out_device: "cuda:0"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\.([12][0-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersMLPExpert  # custom MoE kernel with expert parallelism
    kwargs:
      prefill_device: "cuda:1"
      prefill_mlp_type: "MLPExpertsTorch"
      generate_device: "cpu"
      generate_mlp_type: "MLPCPUExperts"
      out_device: "cuda:1"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\.(0|[1-9])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.DeepseekV2AttentionInjected  # optimized MLA implementation
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([12][0-9])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.DeepseekV2AttentionInjected  # optimized MLA implementation
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.layer_wise_prefill.DeepseekV2ModelKTransformers"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 disables layer-wise prefill
      transfer_map:
        10: "cuda:1"
\ No newline at end of file
ktransformers/optimize/optimize_rules/Mixtral.yaml (new file, mode 100644)
View file @ f5f79f5c
- match:
    name: "^model\\.layers\\..*\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    class: ktransformers.models.modeling_mixtral.MixtralRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.RotaryEmbedding
- match:
    name: "^model\\.layers\\..*$"
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformerLinear  # optimized kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "QuantizedLinearMarlin"
      prefill_op: "QuantizedLinearTorch"
- match:
    name: "^model\\.layers\\..*\\.block_sparse_moe$"
    class: ktransformers.models.modeling_mixtral.MixtralSparseMoeBlock
  replace:
    class: ktransformers.operators.experts.MisrtalSparseMoEBlockInjected
- match:
    name: "^model\\.layers\\..*\\.block_sparse_moe\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersMLPExpert
    kwargs:
      prefill_device: "cuda"
      prefill_mlp_type: "MLPExpertsTorch"
      generate_device: "cpu"
      generate_mlp_type: "MLPCPUExperts"
      out_device: "cuda"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"
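This file is the reason translate_model_config appeared in optimize.py above: Mixtral's HF config exposes only intermediate_size, while the shared expert machinery used here expects a moe_intermediate_size, so the field is aliased before injection. The same aliasing in isolation (placeholder path; the hasattr guard is our addition):

    from transformers import AutoConfig

    config = AutoConfig.from_pretrained("/path/to/Mixtral-8x7B", trust_remote_code=True)
    if config.model_type == "mixtral" and not hasattr(config, "moe_intermediate_size"):
        # mirror translate_model_config: experts reuse the dense FFN width
        config.moe_intermediate_size = config.intermediate_size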
ktransformers/optimize/optimize_rules/Qwen2-57B-A14B-Instruct-multi-gpu.yaml (new file, mode 100644)
View file @ f5f79f5c
- match:
    name: "^model\\.layers\\.([012])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([012])\\."
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.RotaryEmbedding
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([012])$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformerLinear  # optimized kernel on quantized data types
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
      generate_op: "QuantizedLinearMarlin"
      prefill_op: "QuantizedLinearTorch"
- match:
    name: "^model\\.layers\\.([012])\\.mlp$"
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeSparseMoeBlock
  replace:
    class: ktransformers.operators.experts.Qwen2MoeSparseMoeBlockInjected  # mlp module with custom forward function
- match:
    name: "^model\\.layers\\.([012])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersMLPExpert  # custom MoE kernel with expert parallelism
    # device: "cpu" # which devices to load this module when initializing
    kwargs:
      prefill_device: "cuda:0"
      prefill_mlp_type: "MLPExpertsTorch"
      generate_device: "cpu"
      generate_mlp_type: "MLPCPUExperts"
      out_device: "cuda:0"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\.([12][0-9]|[3-9])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
- match:
    name: "^model\\.layers\\.([12][0-9]|[3-9])\\."
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.RotaryEmbedding
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
- match:
    name: "^model\\.layers\\.([12][0-9]|[3-9])$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformerLinear  # optimized kernel on quantized data types
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
      generate_op: "QuantizedLinearMarlin"
      prefill_op: "QuantizedLinearTorch"
- match:
    name: "^model\\.layers\\.([12][0-9]|[3-9])\\.mlp$"
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeSparseMoeBlock
  replace:
    class: ktransformers.operators.experts.Qwen2MoeSparseMoeBlockInjected  # mlp module with custom forward function
- match:
    name: "^model\\.layers\\.([12][0-9]|[3-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersMLPExpert  # custom MoE kernel with expert parallelism
    # device: "cpu" # which devices to load this module when initializing
    kwargs:
      prefill_device: "cuda:1"
      prefill_mlp_type: "MLPExpertsTorch"
      generate_device: "cpu"
      generate_mlp_type: "MLPCPUExperts"
      out_device: "cuda:1"
  recursive: False # don't recursively inject submodules of this module
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"
- match:
    name: "(^model.norm)|(^lm_head)"
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.layer_wise_prefill.Qwen2MoeModelKTransformers"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 disables layer-wise prefill
      transfer_map:
        3: "cuda:1"
ktransformers/optimize/optimize_rules/Qwen2-57B-A14B-Instruct.yaml
View file @ f5f79f5c
+- match:
+    name: "^model\\.layers\\..*\\."
+  replace:
+    class: "default"
+    kwargs:
+      generate_device: "cuda"
+      prefill_device: "cuda"
 - match:
     class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeRotaryEmbedding
   replace:
...
@@ -21,7 +28,7 @@
     name: "^model\\.layers\\..*\\.mlp\\.experts$"
   replace:
     class: ktransformers.operators.experts.KTransformersMLPExpert  # custom MoE kernel with expert parallelism
-    device: "cpu"  # which devices to load this module when initializing
+    # device: "cpu" # which devices to load this module when initializing
     kwargs:
       prefill_device: "cuda"
       prefill_mlp_type: "MLPExpertsTorch"
...
@@ -32,6 +39,13 @@
 - match:
     name: "^model$"
   replace:
-    class: "ktransformers.operators.layer_wise_prefill.Qwen2MoeModelPerLayerPrefill"
+    class: "ktransformers.operators.layer_wise_prefill.Qwen2MoeModelKTransformers"
     kwargs:
       per_layer_prefill_intput_threshold: 0 # 0 disables layer-wise prefill
+- match:
+    name: "^model.embed_tokens"
+  replace:
+    class: "default"
+    kwargs:
+      generate_device: "cpu"
+      prefill_device: "cpu"
\ No newline at end of file
ktransformers/tests/dequant_gpu.py
View file @ f5f79f5c
(This diff is collapsed.)
ktransformers/tests/dequant_gpu_t.py
View file @ f5f79f5c
...
@@ -11,7 +11,7 @@ from ktransformers.operators.linear import KTransformerLinear, QuantizedLinearMa
 from ktransformers.operators.experts import KTransformersMLPExpert, MLPExpertsTorch
 from ktransformers.util.custom_gguf import GGUFLoader, dequantize_q4_k_gpu, dequantize_q4_k
 import torch
-import CudaOps
+import KTransformersOps
 torch.set_default_dtype(torch.bfloat16)
 import time
 from transformers import (
...
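The only change in this test is the extension module's rename from CudaOps to KTransformersOps. To run the test against both pre- and post-rename builds, a guarded import works; the fallback alias is our suggestion, not part of the commit:

    try:
        import KTransformersOps  # extension name after this commit
    except ImportError:
        import CudaOps as KTransformersOps  # builds predating the rename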