ox696c / ktransformers · Commits

Commit 476b1d8d, authored Jan 31, 2025 by Azure
Parent: de7e892f

    support deepseekv3; runable but have precition problem

Showing 13 changed files with 2178 additions and 24 deletions (+2178 -24).
Changed files:

    ktransformers/local_chat.py                                              +6     -2
    ktransformers/models/configuration_deepseekv3.py                         +231   -0
    ktransformers/models/custom_cache.py                                     +7     -4
    ktransformers/models/modeling_deepseekv3.py                              +1216  -0
    ktransformers/operators/attention.py                                     +201   -0
    ktransformers/operators/experts.py                                       +101   -0
    ktransformers/operators/gate.py                                          +128   -0
    ktransformers/operators/linear.py                                        +9     -9
    ktransformers/operators/models.py                                        +4     -2
    ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu.yaml    +143   -0
    ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml              +56    -0
    ktransformers/server/backend/interfaces/ktransformers.py                 +74    -5
    ktransformers/server/backend/interfaces/transformers.py                  +2     -2
File: ktransformers/local_chat.py (+6 -2)

@@ -15,6 +15,7 @@ from ktransformers.server.args import ArgumentParser
 from ktransformers.models.modeling_deepseek import DeepseekV2ForCausalLM
+from ktransformers.models.modeling_deepseekv3 import DeepseekV3ForCausalLM
 from ktransformers.models.modeling_qwen2_moe import Qwen2MoeForCausalLM
 from ktransformers.models.modeling_llama import LlamaForCausalLM
 from ktransformers.models.modeling_mixtral import MixtralForCausalLM

@@ -22,6 +23,7 @@ from ktransformers.server.config.config import Config
 custom_models = {
     "DeepseekV2ForCausalLM": DeepseekV2ForCausalLM,
+    "DeepseekV3ForCausalLM": DeepseekV3ForCausalLM,
     "Qwen2MoeForCausalLM": Qwen2MoeForCausalLM,
     "LlamaForCausalLM": LlamaForCausalLM,
     "MixtralForCausalLM": MixtralForCausalLM,

@@ -30,6 +32,8 @@ custom_models = {
 ktransformer_rules_dir = os.path.dirname(os.path.abspath(__file__)) + "/optimize/optimize_rules/"
 default_optimize_rules = {
     "DeepseekV2ForCausalLM": ktransformer_rules_dir + "DeepSeek-V2-Chat.yaml",
+    # "DeepseekV3ForCausalLM": ktransformer_rules_dir + "DeepSeek-V2-Chat.yaml",
+    "DeepseekV3ForCausalLM": ktransformer_rules_dir + "DeepSeek-V3-Chat-multi-gpu.yaml",
     "Qwen2MoeForCausalLM": ktransformer_rules_dir + "Qwen2-57B-A14B-Instruct.yaml",
     "LlamaForCausalLM": ktransformer_rules_dir + "Internlm2_5-7b-Chat-1m.yaml",
     "MixtralForCausalLM": ktransformer_rules_dir + "Mixtral.yaml",

@@ -74,8 +78,8 @@ def local_chat():
         else:
             content += line + "\n"
     if content == "":
-        if config.prompt_file == None or config.prompt_file == "":
-            content = "Please write a piece of quicksort code in C++."
+        if True:  # config.prompt_file == None or config.prompt_file == "":
+            content = "hi"
         else:
             content = open(config.prompt_file, "r").read()
     elif os.path.isfile(content):
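These hunks register the new model class in both dispatch tables. A small sketch of what the added entries resolve to at runtime (local_chat selects the entry from the model config's architecture name; the snippet below restates the dict from the hunk purely for illustration, with a placeholder path):

import os

# Restates the lookup added above; in the real code the rules directory is
# resolved relative to the installed ktransformers package, not a fixed path.
ktransformer_rules_dir = "/path/to/ktransformers/optimize/optimize_rules/"
default_optimize_rules = {
    "DeepseekV2ForCausalLM": ktransformer_rules_dir + "DeepSeek-V2-Chat.yaml",
    "DeepseekV3ForCausalLM": ktransformer_rules_dir + "DeepSeek-V3-Chat-multi-gpu.yaml",
}

architecture = "DeepseekV3ForCausalLM"   # taken from the model config's architectures[0]
print(default_optimize_rules[architecture])
# /path/to/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu.yaml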
File: ktransformers/models/configuration_deepseekv3.py (new file, +231 lines)
# coding=utf-8
# Copyright 2025 bzantium and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on the DeepSeekV3 implementations from the DeepSeek AI team. (https://huggingface.co/deepseek-ai/DeepSeek-V3)
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" DeepSeekV3 model configuration """
from transformers.configuration_utils import PretrainedConfig

DEEPSEEK_PRETRAINED_CONFIG_ARCHIVE_MAP = {}


class DeepseekV3Config(PretrainedConfig):
    r"""
This is the configuration class to store the configuration of a [`DeepseekV3Model`]. It is used to instantiate an DeepSeek
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the DeepSeek-V3.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 129280):
Vocabulary size of the DeepSeek model. Defines the number of different tokens that can be represented by the
`inputs_ids` passed when calling [`DeepseekV3Model`]
hidden_size (`int`, *optional*, defaults to 7168):
Dimension of the hidden representations.
intermediate_size (`int`, *optional*, defaults to 18432):
Dimension of the MLP representations.
moe_intermediate_size (`int`, *optional*, defaults to 2048):
Dimension of the MoE representations.
num_hidden_layers (`int`, *optional*, defaults to 61):
Number of hidden layers in the Transformer decoder.
num_nextn_predict_layers (`int`, *optional*, defaults to 1):
Number of nextn predict layers in the DeepSeekV3 Model.
num_attention_heads (`int`, *optional*, defaults to 128):
Number of attention heads for each attention layer in the Transformer decoder.
num_key_value_heads (`int`, *optional*, defaults to 128):
This is the number of key_value heads that should be used to implement Grouped Query Attention. If
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
`num_key_value_heads=1`, the model will use Multi Query Attention (MQA); otherwise GQA is used. When
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
by meanpooling all the original heads within that group. For more details checkout [this
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
`num_attention_heads`.
n_shared_experts (`int`, *optional*, defaults to 1):
Number of shared experts, None means dense model.
n_routed_experts (`int`, *optional*, defaults to 256):
Number of routed experts, None means dense model.
ep_size (`<fill_type>`, *optional*, defaults to 1): <fill_docstring>
routed_scaling_factor (`float`, *optional*, defaults to 2.5):
Scaling factor for routed experts.
kv_lora_rank (`<fill_type>`, *optional*, defaults to 512): <fill_docstring>
q_lora_rank (`<fill_type>`, *optional*, defaults to 1536): <fill_docstring>
qk_rope_head_dim (`<fill_type>`, *optional*, defaults to 64): <fill_docstring>
v_head_dim (`<fill_type>`, *optional*, defaults to 128): <fill_docstring>
qk_nope_head_dim (`<fill_type>`, *optional*, defaults to 128): <fill_docstring>
topk_method (`str`, *optional*, defaults to `"noaux_tc"`):
Topk method used in routed gate.
n_group (`int`, *optional*, defaults to 8):
Number of groups for routed experts.
topk_group (`int`, *optional*, defaults to 4):
Number of selected groups for each token (ensuring that the selected experts are only within `topk_group` groups).
num_experts_per_tok (`int`, *optional*, defaults to 8):
Number of selected experts, None means dense model.
moe_layer_freq (`int`, *optional*, defaults to 1):
The frequency of the MoE layer: one expert layer for every `moe_layer_freq - 1` dense layers.
first_k_dense_replace (`int`, *optional*, defaults to 3):
Number of dense layers in shallow layers(embed->dense->dense->...->dense->moe->moe...->lm_head).
\--k dense layers--/
norm_topk_prob (`bool`, *optional*, defaults to `True`):
Whether to normalize the weights of the routed experts.
scoring_func (`str`, *optional*, defaults to `"sigmoid"`):
Method of computing expert weights.
aux_loss_alpha (`float`, *optional*, defaults to 0.001):
Auxiliary loss weight coefficient.
seq_aux (`<fill_type>`, *optional*, defaults to `True`):
Whether to compute the auxiliary loss for each individual sample.
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
The non-linear activation function (function or string) in the decoder.
max_position_embeddings (`int`, *optional*, defaults to 4096):
The maximum sequence length that this model might ever be used with.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
rms_norm_eps (`float`, *optional*, defaults to 1e-06):
The epsilon used by the rms normalization layers.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
relevant if `config.is_decoder=True`.
pad_token_id (`int`, *optional*):
Padding token id.
bos_token_id (`int`, *optional*, defaults to 0):
Beginning of stream token id.
eos_token_id (`int`, *optional*, defaults to 1):
End of stream token id.
pretraining_tp (`int`, *optional*, defaults to 1):
Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is
necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
issue](https://github.com/pytorch/pytorch/issues/76232).
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
Whether to tie weight embeddings
rope_theta (`float`, *optional*, defaults to 10000.0):
The base period of the RoPE embeddings.
rope_scaling (`Dict`, *optional*):
Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
`{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
`max_position_embeddings` to the expected new maximum.
attention_bias (`bool`, *optional*, defaults to `False`):
Whether to use a bias in the query, key, value and output projection layers during self-attention.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
```python
>>> from transformers import DeepseekV3Model, DeepseekV3Config
>>> # Initializing a Deepseek-V3 style configuration
>>> configuration = DeepseekV3Config()
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
    model_type = "deepseek_v3"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=129280,
        hidden_size=7168,
        intermediate_size=18432,
        moe_intermediate_size=2048,
        num_hidden_layers=61,
        num_nextn_predict_layers=1,
        num_attention_heads=128,
        num_key_value_heads=128,
        n_shared_experts=1,
        n_routed_experts=256,
        ep_size=1,
        routed_scaling_factor=2.5,
        kv_lora_rank=512,
        q_lora_rank=1536,
        qk_rope_head_dim=64,
        v_head_dim=128,
        qk_nope_head_dim=128,
        topk_method='noaux_tc',
        n_group=8,
        topk_group=4,
        num_experts_per_tok=8,
        moe_layer_freq=1,
        first_k_dense_replace=3,
        norm_topk_prob=True,
        scoring_func='sigmoid',
        aux_loss_alpha=0.001,
        seq_aux=True,
        hidden_act="silu",
        max_position_embeddings=4096,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=0,
        eos_token_id=1,
        pretraining_tp=1,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        mlp_bias=False,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.moe_intermediate_size = moe_intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_nextn_predict_layers = num_nextn_predict_layers
        self.num_attention_heads = num_attention_heads
        self.n_shared_experts = n_shared_experts
        self.n_routed_experts = n_routed_experts
        self.ep_size = ep_size
        self.routed_scaling_factor = routed_scaling_factor
        self.kv_lora_rank = kv_lora_rank
        self.q_lora_rank = q_lora_rank
        self.qk_rope_head_dim = qk_rope_head_dim
        self.v_head_dim = v_head_dim
        self.qk_nope_head_dim = qk_nope_head_dim
        self.topk_method = topk_method
        self.n_group = n_group
        self.topk_group = topk_group
        self.num_experts_per_tok = num_experts_per_tok
        self.moe_layer_freq = moe_layer_freq
        self.first_k_dense_replace = first_k_dense_replace
        self.norm_topk_prob = norm_topk_prob
        self.scoring_func = scoring_func
        self.aux_loss_alpha = aux_loss_alpha
        self.seq_aux = seq_aux
        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.pretraining_tp = pretraining_tp
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.mlp_bias = mlp_bias

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )


__all__ = ["DeepseekV3Config"]
\ No newline at end of file
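As a quick orientation for the new configuration class above, a minimal usage sketch (it assumes the ktransformers package is importable; the overridden values in the second instance are arbitrary toy numbers, not anything used by the commit):

from ktransformers.models.configuration_deepseekv3 import DeepseekV3Config

# Instantiate with the DeepSeek-V3 defaults defined above ...
config = DeepseekV3Config()
print(config.model_type)            # "deepseek_v3"
print(config.n_routed_experts)      # 256 routed experts
print(config.num_experts_per_tok)   # 8 experts selected per token

# ... or override a few fields for a toy-sized variant (illustrative values only).
tiny = DeepseekV3Config(num_hidden_layers=2, hidden_size=256, vocab_size=1000)
print(tiny.num_hidden_layers, tiny.hidden_size)  # 2 256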
File: ktransformers/models/custom_cache.py (+7 -4)

@@ -34,9 +34,12 @@ class StaticCache(transformers.StaticCache):
         self.max_batch_size = max_batch_size
         self.max_cache_len = config.max_position_embeddings if max_cache_len is None else max_cache_len
         # Some model define a custom `head_dim` != config.hidden_size // config.num_attention_heads
-        self.head_dim = (
-            config.head_dim if hasattr(config, "head_dim") else config.hidden_size // config.num_attention_heads
-        )
+        if config.architectures[0] == "DeepseekV3ForCausalLM":
+            self.head_dim = config.qk_rope_head_dim
+        else:
+            self.head_dim = (
+                config.head_dim if hasattr(config, "head_dim") else config.hidden_size // config.num_attention_heads
+            )
         self.dtype = dtype if dtype is not None else torch.float32
         self.num_key_value_heads = (

@@ -46,7 +49,7 @@ class StaticCache(transformers.StaticCache):
         self.key_cache: List[torch.Tensor] = []
         self.value_cache: List[torch.Tensor] = []
         cache_shape = (max_batch_size, self.num_key_value_heads, self.max_cache_len, self.head_dim)
-        if config.architectures[0] == "DeepseekV2ForCausalLM":
+        if config.architectures[0] == "DeepseekV2ForCausalLM" or config.architectures[0] == "DeepseekV3ForCausalLM":
             # TODO: for deepseek, cache_shape is different whether using Absorbed MLA, check it automatically
             # key_shape = (max_batch_size, self.num_key_value_heads, self.max_cache_len, config.qk_rope_head_dim + config.qk_nope_head_dim)
             # value_shape = (max_batch_size, self.num_key_value_heads, self.max_cache_len, config.v_head_dim)
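A standalone sketch of the head-dim selection added above, with stand-in config objects (the helper function and the toy configs are illustrative and not part of the commit; the exact key/value shapes for the DeepSeek branch are handled in the truncated part of the hunk):

from types import SimpleNamespace

def pick_head_dim(config) -> int:
    # For DeepSeek-V3 the static cache is sized with qk_rope_head_dim,
    # as in the hunk above; other models fall back to the usual derivation.
    if config.architectures[0] == "DeepseekV3ForCausalLM":
        return config.qk_rope_head_dim
    return getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)

v3 = SimpleNamespace(architectures=["DeepseekV3ForCausalLM"], qk_rope_head_dim=64,
                     hidden_size=7168, num_attention_heads=128)
llama_like = SimpleNamespace(architectures=["LlamaForCausalLM"],
                             hidden_size=4096, num_attention_heads=32)

print(pick_head_dim(v3))          # 64  (qk_rope_head_dim)
print(pick_head_dim(llama_like))  # 128 (hidden_size // num_attention_heads)
# Cache tensors are then allocated as
# (max_batch_size, num_key_value_heads, max_cache_len, head_dim).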
File: ktransformers/models/modeling_deepseekv3.py (new file, +1216 lines; diff collapsed on the original page)
File: ktransformers/operators/attention.py (+201 -0)

@@ -13,6 +13,7 @@ from ktransformers.models.configuration_deepseek import DeepseekV2Config
 from ktransformers.models.configuration_llama import LlamaConfig
 from ktransformers.models.modeling_llama import LlamaRotaryEmbedding
 from ktransformers.models.modeling_deepseek import DeepseekV2Attention, apply_rotary_pos_emb
+from ktransformers.models.modeling_deepseekv3 import DeepseekV3Attention, apply_rotary_pos_emb
 from typing import Optional, Tuple
 from ktransformers.operators.base_operator import BaseInjectedModule
 from ktransformers.util.custom_gguf import GGUFLoader

@@ -20,6 +21,206 @@ import logging
 from transformers.configuration_utils import PretrainedConfig
 from transformers.cache_utils import Cache
 logger = logging.getLogger("attention")
+
+class KDeepseekV3Attention(BaseInjectedModule, DeepseekV3Attention):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    attn_mask: Optional[torch.Tensor] = None
+
+    def __init__(self,
+                 key: str,
+                 gguf_loader: GGUFLoader,
+                 config: PretrainedConfig,
+                 orig_module: nn.Module,
+                 device: str = "cuda",
+                 chunck_size: int = 1000,
+                 **kwargs):
+        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, device, **kwargs)
+        self.orig_module.__init__(orig_module.config, orig_module.layer_idx)
+        self.chunck_size = chunck_size  # TODO, generate chunck_size automatically.
+        self.softmax_scale = self.q_head_dim ** (-0.5)
+
+    def get_absorbed(self) -> Tuple[torch.Tensor, torch.Tensor]:
+        if not (hasattr(self, 'q_absorb') and hasattr(self, 'out_absorb')):
+            kv_b_proj = self.kv_b_proj.weight.view(self.num_heads, -1, self.kv_lora_rank)
+            q_absorb = kv_b_proj[:, :self.qk_nope_head_dim, :].reshape(-1, self.kv_lora_rank)
+            out_absorb = kv_b_proj[:, self.qk_nope_head_dim:, :].reshape(-1, self.kv_lora_rank)
+            self.q_absorb = nn.Linear(self.kv_lora_rank, self.num_heads * self.qk_nope_head_dim,
+                                      bias=False, dtype=q_absorb.dtype, device=q_absorb.device)
+            self.q_absorb.weight.data = q_absorb
+            self.out_absorb = nn.Linear(self.kv_lora_rank, self.num_heads * self.v_head_dim,
+                                        bias=False, dtype=out_absorb.dtype, device=out_absorb.device)
+            self.out_absorb.weight.data = out_absorb
+            del self.orig_module.kv_b_proj
+        q_absorb = self.q_absorb.weight.view(self.num_heads, self.qk_nope_head_dim, self.kv_lora_rank)
+        out_absorb = self.out_absorb.weight.view(self.num_heads, self.v_head_dim, self.kv_lora_rank)
+        return q_absorb, out_absorb
+
+    def forward_chunck(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        bsz, q_len, _ = hidden_states.size()
+
+        if self.q_lora_rank is None:
+            q = self.q_proj(hidden_states)
+        else:
+            q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states)))
+        q = q.view(bsz, q_len, self.num_heads, self.q_head_dim).transpose(1, 2)
+        q_nope, q_pe = torch.split(q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)
+
+        compressed_kv = self.kv_a_proj_with_mqa(hidden_states)
+        compressed_kv, k_pe = torch.split(compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1)
+        compressed_kv = self.kv_a_layernorm(compressed_kv)
+        k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim).transpose(1, 2)
+
+        kv_seq_len = k_pe.shape[-2]
+        if past_key_value is not None:
+            if self.layer_idx is None:
+                raise ValueError(
+                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
+                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
+                    "with a layer index."
+                )
+            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+
+        cos, sin = self.rotary_emb(q_pe, position_ids)
+        q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin)
+
+        if past_key_value is not None:
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
+            compressed_kv = compressed_kv.unsqueeze(1)
+            k_pe, compressed_kv = past_key_value.update(k_pe, compressed_kv, self.layer_idx, cache_kwargs)
+            compressed_kv = compressed_kv.squeeze(1)
+            # if cache_position is not None:
+            #     compressed_kv = compressed_kv[:, :cache_position[-1] + 1, :]
+            #     k_pe = k_pe[:, :, :cache_position[-1] + 1, :]
+
+        q_absorb, out_absorb = self.get_absorbed()
+
+        q_nope = torch.matmul(q_nope, q_absorb)
+        attn_weights = (torch.matmul(q_pe, k_pe.mT)
+                        + torch.matmul(q_nope, compressed_kv.unsqueeze(-3).mT)) * self.softmax_scale
+        """
+        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
+            raise ValueError(
+                f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
+                f" {attn_weights.size()}"
+            )
+        assert attention_mask is not None
+        """
+        if attention_mask is not None:
+            """
+            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
+                )
+            """
+            # causal_mask = attention_mask[:, :, :, :kv_seq_len]
+            attn_weights = attn_weights + attention_mask
+
+        # upcast attention to fp32
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(q_pe.dtype)
+        attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
+        attn_output = torch.einsum('bhql,blc->bhqc', attn_weights, compressed_kv)
+        attn_output = torch.matmul(attn_output, out_absorb.mT)
+
+        if attn_output.size() != (bsz, self.num_heads, q_len, self.v_head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.v_head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.v_head_dim)
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        if "padding_mask" in kwargs:
+            warnings.warn(
+                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
+            )
+        bsz, q_len, _ = hidden_states.size()
+
+        if q_len <= self.chunck_size:
+            return self.forward_chunck(
+                hidden_states,
+                attention_mask,
+                position_ids,
+                past_key_value,
+                output_attentions,
+                use_cache,
+                cache_position,
+                **kwargs
+            )
+
+        assert output_attentions == False, "output_attentions is not supported when using chunked attention"
+        attn_output = None
+        attn_weight = None
+        cur_idx = 0
+        while cur_idx < q_len:
+            if attention_mask is not None:
+                chunk_mask = attention_mask[:, :, cur_idx:min(cur_idx + self.chunck_size, q_len), ...]
+            else:
+                # generate chunk_mask automatically.
+                self.attn_mask = \
+                    torch.zeros(1, 1, self.chunck_size, past_key_value.max_cache_len, device=hidden_states.device) \
+                    if self.attn_mask is None \
+                    else self.attn_mask
+                self.attn_mask[:, :, :, cur_idx:min(cur_idx + self.chunck_size, past_key_value.max_cache_len)] = \
+                    -1e+38 * torch.triu(torch.ones(self.chunck_size, self.chunck_size, device=hidden_states.device), diagonal=1) \
+                    [:, :min(self.chunck_size, min(past_key_value.max_cache_len - cur_idx, self.chunck_size))]
+                self.attn_mask[:, :, :, cur_idx + self.chunck_size:] = -1e+38
+                self.attn_mask[:, :, :, :cur_idx] = 0
+                chunk_mask = torch.narrow(self.attn_mask, 2, 0, min(self.chunck_size, q_len - cur_idx))
+
+            cur_output, cur_attn_weight = self.forward_chunck(
+                hidden_states[:, cur_idx:min(cur_idx + self.chunck_size, q_len), ...],
+                chunk_mask,
+                position_ids[:, cur_idx:min(cur_idx + self.chunck_size, q_len)],
+                past_key_value,
+                output_attentions,
+                use_cache,
+                cache_position[cur_idx:min(cur_idx + self.chunck_size, q_len)],
+                **kwargs
+            )
+            cur_idx += self.chunck_size
+            if attn_output is None:
+                attn_output = cur_output
+                attn_weight = cur_attn_weight
+            else:
+                attn_output = torch.cat((attn_output, cur_output), dim=-2)
+                attn_weight = torch.cat((attn_weight, cur_attn_weight), dim=-2)
+
+        return attn_output, attn_weight
+
 class KDeepseekV2Attention(BaseInjectedModule, DeepseekV2Attention):
     """Multi-headed attention from 'Attention Is All You Need' paper"""

     attn_mask: Optional[torch.Tensor] = None
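The get_absorbed helper above folds kv_b_proj into the query and output paths so that attention scores can be computed directly against the compressed latent cache. A small self-contained check of why that is algebraically legitimate, with toy dimensions standing in for num_heads, qk_nope_head_dim, v_head_dim, and kv_lora_rank (this mirrors the algebra only, not the module code):

import torch

# Toy sizes standing in for the real config values.
num_heads, qk_nope, v_head, kv_rank, seq = 2, 4, 4, 8, 5

# kv_b_proj maps the compressed latent (kv_rank) to per-head nope-keys and values.
kv_b = torch.randn(num_heads * (qk_nope + v_head), kv_rank).view(num_heads, qk_nope + v_head, kv_rank)
w_uk = kv_b[:, :qk_nope, :]   # (heads, qk_nope, kv_rank) -> "q_absorb" in the diff
w_uv = kv_b[:, qk_nope:, :]   # (heads, v_head, kv_rank)  -> "out_absorb"

q_nope = torch.randn(num_heads, 1, qk_nope)   # one query position per head
c_kv = torch.randn(seq, kv_rank)              # cached compressed latents

# Path 1: expand the latent into full nope-keys, then score (naive decode).
k_nope = torch.einsum('hdk,sk->hsd', w_uk, c_kv)
scores1 = torch.einsum('hqd,hsd->hqs', q_nope, k_nope)

# Path 2: absorb w_uk into the query and score against the latent directly,
# which is what q_nope = torch.matmul(q_nope, q_absorb) does above.
q_abs = torch.einsum('hqd,hdk->hqk', q_nope, w_uk)
scores2 = torch.einsum('hqk,sk->hqs', q_abs, c_kv)

print(torch.allclose(scores1, scores2, atol=1e-5))  # True: the two paths match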
File: ktransformers/operators/experts.py (+101 -0)

@@ -519,6 +519,7 @@ class KTransformersExperts(BaseInjectedModule, KExpertsBase):
 from ktransformers.models.modeling_deepseek import DeepseekV2MoE
+from ktransformers.models.modeling_deepseekv3 import DeepseekV3MoE
 from ktransformers.models.modeling_qwen2_moe import Qwen2MoeSparseMoeBlock
 from ktransformers.models.modeling_mixtral import MixtralSparseMoeBlock

@@ -727,6 +728,106 @@ class KDeepseekV2MoE(BaseInjectedModule, DeepseekV2MoE):
         )
         return final_out
+
+class KDeepseekV3MoE(BaseInjectedModule, DeepseekV3MoE):
+    def forward(self, hidden_states):
+        identity = hidden_states
+        orig_shape = hidden_states.shape
+        sequence_length = orig_shape[1]
+        topk_idx, topk_weight = self.gate(hidden_states)
+        hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
+
+        if sequence_length == 1 and hasattr(self.experts.generate_experts, "submit_for_one_decode") and torch.cuda.is_current_stream_capturing():
+            self.experts.generate_experts.submit_for_one_decode(hidden_states[0], topk_idx[0], topk_weight[0])
+            if self.config.n_shared_experts is not None:
+                y_ = self.shared_experts(identity).squeeze(0)
+            y = self.experts.generate_experts.sync_for_one_decode().unsqueeze(0)
+            y += y_
+            y.resize_(*orig_shape)
+            return y
+
+        if self.config.n_shared_experts is not None:
+            y_ = self.shared_experts(identity).squeeze(0)
+
+        if isinstance(self.experts, KExpertsBase):
+            y = self.moe_on_cpuinfer(hidden_states, topk_idx, topk_weight).view(*orig_shape).to(device=hidden_states.device)
+        elif hidden_states.size(0) > 10:
+            # TODO may bugs here
+            y = (
+                self.moe_infer(hidden_states, topk_idx, topk_weight)
+                .view(*orig_shape)
+                .to(device=hidden_states.device)
+            )
+        else:
+            # TODO may bugs here
+            y = (
+                self.moe_infer_simple(hidden_states, topk_idx, topk_weight)
+                .view(*orig_shape)
+                .to(device=hidden_states.device)
+            )
+        if self.config.n_shared_experts is not None:
+            y += y_
+        return y
+
+    @torch.no_grad()
+    def moe_on_cpuinfer(self, x: torch.Tensor, topk_ids: torch.Tensor, topk_weight: torch.Tensor) -> torch.Tensor:
+        outs = torch.empty_like(x)
+        outs = self.experts(x, topk_ids, topk_weight)
+        return outs
+
+    @torch.no_grad()
+    # TODO may bugs here
+    def moe_infer_simple(self, x: torch.Tensor, topk_ids: torch.Tensor, topk_weight: torch.Tensor) -> torch.Tensor:
+        """
+        x: [num_tokens, hidden_size]
+        topk_ids, topk_weight: [num_tokens, num_selected_experts]
+        """
+        outs = torch.zeros_like(x)
+        for token_idx in range(topk_ids.size(0)):
+            for expert_idx in range(topk_ids.size(1)):
+                expert = self.experts[topk_ids[token_idx, expert_idx]]
+                outs[token_idx] += (
+                    expert.forward(x[token_idx]) * topk_weight[token_idx, expert_idx]
+                )
+        return outs
+
+    @torch.no_grad()
+    # TODO may bugs here
+    def moe_infer(self, x, topk_ids, topk_weight):
+        cnts = topk_ids.new_zeros((topk_ids.shape[0], len(self.experts)))
+        cnts.scatter_(1, topk_ids, 1)
+        tokens_per_expert = cnts.sum(dim=0)
+        idxs = topk_ids.view(-1).argsort()
+        sorted_tokens = x[idxs // topk_ids.shape[1]]
+        tokens_per_expert = tokens_per_expert.cpu().numpy()
+
+        outputs = []
+        start_idx = 0
+        for i, num_tokens in enumerate(tokens_per_expert):
+            end_idx = start_idx + num_tokens
+            if num_tokens == 0:
+                continue
+            expert = self.experts[i + self.ep_rank * self.experts_per_rank]
+            tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
+            expert_out = expert.forward(tokens_for_this_expert)
+            outputs.append(expert_out)
+            start_idx = end_idx
+
+        outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0)
+
+        new_x = torch.empty_like(outs)
+        new_x[idxs] = outs
+        final_out = (
+            new_x.view(*topk_ids.shape, -1)
+            .type(topk_weight.dtype)
+            .mul_(topk_weight.unsqueeze(dim=-1))
+            .sum(dim=1)
+            .type(new_x.dtype)
+        )
+        return final_out
+
 class KMistralSparseMoEBlock(BaseInjectedModule, MixtralSparseMoeBlock):
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
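moe_infer above batches tokens per expert by sorting the flattened top-k assignments and then scattering the expert outputs back. A reduced standalone sketch of that dispatch pattern, with plain nn.Linear modules standing in for the real experts:

import torch
from torch import nn

torch.manual_seed(0)
num_tokens, hidden, num_experts, top_k = 6, 8, 4, 2
experts = nn.ModuleList(nn.Linear(hidden, hidden, bias=False) for _ in range(num_experts))

x = torch.randn(num_tokens, hidden)
topk_ids = torch.randint(0, num_experts, (num_tokens, top_k))
topk_weight = torch.softmax(torch.randn(num_tokens, top_k), dim=-1)

# Sort (token, slot) pairs by expert id so each expert sees one contiguous batch,
# mirroring the cnts / scatter_ / argsort logic in moe_infer above.
idxs = topk_ids.view(-1).argsort()
sorted_tokens = x[idxs // top_k]
tokens_per_expert = torch.bincount(topk_ids.view(-1), minlength=num_experts)

outputs, start = [], 0
for eid, n in enumerate(tokens_per_expert.tolist()):
    if n == 0:
        continue
    outputs.append(experts[eid](sorted_tokens[start:start + n]))
    start += n
outs = torch.cat(outputs, dim=0)

# Scatter results back to their original positions and mix them by gate weight.
new_x = torch.empty_like(outs)
new_x[idxs] = outs
y = (new_x.view(num_tokens, top_k, hidden) * topk_weight.unsqueeze(-1)).sum(dim=1)
print(y.shape)  # torch.Size([6, 8])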
File: ktransformers/operators/gate.py (new file, +128 lines)

from typing import Any, Union
import numpy as np
import numpy.typing as npt
from torch import Tensor, nn
import torch.nn.functional as F
import torch
import sys, os
from ktransformers.operators.base_operator import BaseInjectedModule

sys.path.append(os.path.join(os.path.dirname(__file__), "..", "ktransformers_ext", "build"))
sys.path.append(os.path.join(os.path.dirname(__file__), "..", "ktransformers_ext", "build", "Release"))
sys.path.append(os.path.join(os.path.dirname(__file__), "..", "ktransformers_ext", "build", "Debug"))
import cpuinfer_ext
from cpuinfer_ext.moe import MOEConfig, MOE
import ctypes
from ktransformers.operators.base_operator import BaseInjectedModule
from ktransformers.util.custom_gguf import GGUFLoader
from ktransformers.models.modeling_deepseekv3 import MoEGate
from ktransformers.util.utils import InferenceState
from ktransformers.server.config.config import Config
from transformers.activations import ACT2FN
from transformers.configuration_utils import PretrainedConfig
from abc import ABC, abstractmethod
import time


# class Base(BaseInjectedModule, ABC):
class KMoEGateBase(ABC):
    def __init__(self,
                 key: str,
                 gguf_loader: GGUFLoader,
                 config: PretrainedConfig,
                 orig_module: nn.Module,
                 device: str = "cuda",
                 **kwargs):
        # super().__init__(key, gguf_loader, config, orig_module, device, **kwargs)
        super().__init__()
        self.key = key
        self.gguf_loader = gguf_loader
        self.config = config
        self.device = device
        self.orig_module = orig_module

    @abstractmethod
    def forward(self, input_tensor, expert_ids, weights):
        pass

    @abstractmethod
    def load(self, w: dict | nn.Parameter | tuple | None = None, device: str = "cpu", warmup: bool = False):
        pass

    @abstractmethod
    def unload():
        pass

    def load_weights(self, override_key: str | None = None, device: str = "cpu"):
        res = {}
        if override_key is not None:
            keys = override_key
        else:
            keys = [self.key]

        gate = None
        up = None
        down = None
        gate_type = None
        up_type = None
        down_type = None

        for key in keys:
            key = ".".join(key.split(".")[:-1])
            if key + ".ffn_gate_inp.weight" in self.gguf_loader.tensor_info:
                targets = [".ffn_gate_inp.weight", ".exp_probs_b.bias"]
                tensors = self.load_multi(key, targets, device=device)
                weight = tensors[".ffn_gate_inp.weight"]
                e_score_correction_bias = tensors[".exp_probs_b.bias"]
                weight_type = self.gguf_loader.tensor_info[key + ".ffn_gate_inp.weight"]["ggml_type"]
                e_score_correction_bias_type = self.gguf_loader.tensor_info[key + ".exp_probs_b.bias"]["ggml_type"]
            else:
                raise ValueError(f"Experts {key} not found in gguf_loader")
            res = {"weight": weight, "e_score_correction_bias": e_score_correction_bias, "weight_type": weight_type, "e_score_correction_bias_type": e_score_correction_bias_type}
        return res

    def load_multi(self, key: str, keys: list[str], device: str = "cpu"):
        tensors = {}
        for k in keys:
            tensors[k] = self.gguf_loader.load_gguf_tensor(key + k, device=device)
        return tensors


class KMoEGate(BaseInjectedModule, KMoEGateBase):
    def __init__(
        self,
        key: str,
        gguf_loader: GGUFLoader,
        config: PretrainedConfig,
        orig_module: nn.Module = None,
        generate_device: str = "cuda",
        prefill_device: str = "cuda",
        **kwargs,
    ):
        BaseInjectedModule.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs)
        KMoEGateBase.__init__(self, key, gguf_loader, config, orig_module, generate_device, **kwargs)

    def forward(self, hidden_states) -> torch.Tensor:
        return self.orig_module.forward(hidden_states)

    def load(self, w: dict | nn.Parameter | tuple | None = None, device: str | None = None):
        if device is None:
            device = self.device
        if w is None:
            w = self.load_weights(device=device)

        if isinstance(w, dict):
            self.weight_type = w["weight_type"]
            self.e_score_correction_bias_type = w["e_score_correction_bias_type"]
            self.orig_module.weight = nn.Parameter(w["weight"])
            self.orig_module.e_score_correction_bias = nn.Parameter(w["e_score_correction_bias"])
        else:
            raise ValueError("Invalid weight type")
        self.orig_module.weight = self.orig_module.weight.to(device)
        if self.topk_method == "noaux_tc":
            self.orig_module.e_score_correction_bias = self.orig_module.e_score_correction_bias.to(device)

    def unload(self):
        if self.weight is not None:
            self.weight = None
        if self.topk_method == "noaux_tc":
            self.e_score_correction_bias = None
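KMoEGate only wraps weight loading and delegates scoring to the original MoEGate module. For orientation, a simplified sketch of the noaux_tc group-limited routing that the new config describes (sigmoid scoring, e_score_correction_bias applied to selection only, topk_group of n_group expert groups kept per token); this paraphrases the DeepSeek-V3 gating scheme and is not the code in this commit:

import torch

def noaux_tc_route(router_logits, bias, n_group=8, topk_group=4, top_k=8,
                   routed_scaling_factor=2.5, norm_topk_prob=True):
    """Simplified group-limited top-k routing (sketch only).

    router_logits: (tokens, n_routed_experts); bias: (n_routed_experts,)."""
    scores = router_logits.sigmoid()                      # scoring_func = "sigmoid"
    scores_for_choice = scores + bias                     # bias shifts selection, not the weights
    tokens, n_experts = scores.shape
    group_scores = (scores_for_choice.view(tokens, n_group, -1)
                    .topk(2, dim=-1)[0].sum(dim=-1))      # score each group by its best experts
    group_idx = group_scores.topk(topk_group, dim=-1)[1]  # keep topk_group groups per token
    group_mask = torch.zeros_like(group_scores).scatter_(1, group_idx, 1)
    expert_mask = group_mask.unsqueeze(-1).expand(tokens, n_group, n_experts // n_group).reshape(tokens, n_experts)
    masked = scores_for_choice.masked_fill(expert_mask == 0, float("-inf"))
    topk_idx = masked.topk(top_k, dim=-1)[1]
    topk_weight = scores.gather(1, topk_idx)              # weights come from the unbiased scores
    if norm_topk_prob:
        topk_weight = topk_weight / (topk_weight.sum(dim=-1, keepdim=True) + 1e-20)
    return topk_idx, topk_weight * routed_scaling_factor

idx, w = noaux_tc_route(torch.randn(2, 256), torch.zeros(256))
print(idx.shape, w.shape)  # torch.Size([2, 8]) torch.Size([2, 8])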
File: ktransformers/operators/linear.py (+9 -9)

@@ -54,15 +54,15 @@ class KLinearBase(ABC):
         self.has_bias = False
         self.dtype = torch.get_default_dtype()
-        if orig_module is not None:
-            self.in_features = orig_module.in_features
-            self.out_features = orig_module.out_features
-        else:
-            shape = self.gguf_loader.tensor_info[key + ".weight"]["shape"]
-            if len(shape) == 1:
-                print("Warning: orig_module is not set, but has in_features or out_features equals to 1, can't get in_features and out_features from GGUF")
-            self.in_features = self.gguf_loader.tensor_info[key + ".weight"]["shape"][0]
-            self.out_features = self.gguf_loader.tensor_info[key + ".weight"]["shape"][1]
+        # if orig_module is not None:
+        #     self.in_features = orig_module.in_features
+        #     self.out_features = orig_module.out_features
+        # else:
+        shape = self.gguf_loader.tensor_info[key + ".weight"]["shape"]
+        if len(shape) == 1:
+            print("Warning: orig_module is not set, but has in_features or out_features equals to 1, can't get in_features and out_features from GGUF")
+        self.in_features = self.gguf_loader.tensor_info[key + ".weight"]["shape"][0]
+        self.out_features = self.gguf_loader.tensor_info[key + ".weight"]["shape"][1]

     @abstractmethod
     def forward(self, x: torch.Tensor) -> torch.Tensor:
File: ktransformers/operators/models.py (+4 -2)

@@ -641,6 +641,7 @@ class KDeepseekV2Model(BaseInjectedModule):
         if inputs_embeds is None:
             org_device = input_ids.device
+            # TODO move to embed_tokens's device, not hard code to cpu
             input_ids = input_ids.to("cpu")
             inputs_embeds = self.embed_tokens(input_ids)
             input_ids = input_ids.to(org_device)

@@ -737,8 +738,9 @@ class KDeepseekV2Model(BaseInjectedModule):
             hidden_states = layer_outputs[0]
-            if use_cache:
-                next_decoder_cache = layer_outputs[2 if output_attentions else 1]
+            # @@@@@@@ TODO open this notes, tmp close to fit deepseekv3
+            # if use_cache:
+            #     next_decoder_cache = layer_outputs[2 if output_attentions else 1]

             if output_attentions:
                 all_self_attns += (layer_outputs[1],)
File: ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu.yaml (new file, +143 lines)

- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\."
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([3456][0-9])\\."
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.(?!self_attn\\.kv_b_proj).*$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\.([3456][0-9])\\.(?!self_attn\\.kv_b_proj).*$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp$"
    class: ktransformers.models.modeling_deepseekv3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE  # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([3456][0-9])\\.mlp$"
    class: ktransformers.models.modeling_deepseekv3.DeepseekV3MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV3MoE  # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp\\.gate$"
    class: ktransformers.models.modeling_deepseekv3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([3456][0-9])\\.mlp\\.gate$"
    class: ktransformers.models.modeling_deepseekv3.MoEGate
  replace:
    class: ktransformers.operators.gate.KMoEGate  # mlp module with custom forward function
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts  # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda:0"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda:0"
  recursive: False  # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\.([3456][0-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts  # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda:1"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda:1"
  recursive: False  # don't recursively inject submodules of this module

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV3Attention  # optimized MLA implementation
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([3456][0-9])\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV3Attention  # optimized MLA implementation
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"

- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0  # 0 is close layer wise prefill
      transfer_map:
        30: "cuda:1"

- match:
    name: "^model\\.layers\\.(0|[1-9]|[12][0-9])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "(^model\\.layers\\.([3456][0-9])\\.)|(model.norm)|(lm_head)"
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
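Each rule above selects modules by a regex on the module name and, optionally, by class, then swaps in the replacement operator with the listed kwargs. A rough sketch of that matching step, as a simplification of what optimize_and_load_gguf does (the real injection also rebuilds the replacement module from GGUF weights, and may compare classes more strictly than isinstance):

import importlib
import re
import torch.nn as nn

def _resolve(path: str):
    """Import a class from a dotted path such as 'torch.nn.Linear'."""
    module_path, _, cls_name = path.rpartition(".")
    return getattr(importlib.import_module(module_path), cls_name)

def iter_matching_modules(model: nn.Module, rule: dict):
    """Yield (name, module) pairs selected by one optimize rule (sketch only)."""
    match = rule["match"]
    name_re = re.compile(match["name"]) if "name" in match else None
    target_cls = _resolve(match["class"]) if "class" in match else None
    for name, module in model.named_modules():
        if name_re is not None and not name_re.match(name):
            continue
        if target_cls is not None and not isinstance(module, target_cls):
            continue
        yield name, module

# Usage (loading elided; the rules come from the YAML above):
#   rules = yaml.safe_load(open(".../DeepSeek-V3-Chat-multi-gpu.yaml"))
#   for name, module in iter_matching_modules(model, rules[0]):
#       ...  # replace `module` with the operator named in rules[0]["replace"]["class"]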
File: ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml (new file, +56 lines)

- match:
    class: ktransformers.models.modeling_deepseek.DeepseekV3YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV2MoE  # mlp module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts  # custom MoE Kernel with expert paralleism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda"
  recursive: False  # don't recursively inject submodules of this module
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention  # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0  # 0 is close layer wise prefill
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"
\ No newline at end of file
File: ktransformers/server/backend/interfaces/ktransformers.py (+74 -5)

@@ -46,17 +46,26 @@ class KTransformersInterface(TransformersInterface):
         )
         optimize_and_load_gguf(self.model, optimize_rule_path, gguf_path, config)
-        device_map = self.model.gguf_loader.tensor_device_map
-        logger.info(f"{args.model_name} loaded from {args.model_dir} to {device_map}")
+        self.device_map = self.model.gguf_loader.tensor_device_map
+        # logger.info(f"{args.model_name} loaded from {args.model_dir} to {self.device_map}")
         self.cache = StaticCache(
             config=self.model.config,
             max_batch_size=args.batch_size,
             max_cache_len=args.cache_lens,
-            device=device_map,
+            device=self.device_map,
             dtype=self.model.dtype,
         )
-        logger.info(f"StaticCache (length={args.cache_lens}) created at {device_map}, batch size: {args.batch_size}")
-        self.model.generation_config = GenerationConfig.from_pretrained(args.model_dir)
+        # logger.info(f"StaticCache (length={args.cache_lens}), batch size:{args.batch_size}")
+        try:
+            self.model.generation_config = GenerationConfig.from_pretrained(args.model_dir)
+        except:
+            gen_config = GenerationConfig(
+                max_length=128,
+                temperature=0.7,
+                top_p=0.9,
+                do_sample=True
+            )
+            self.model.generation_config = gen_config
         if self.model.generation_config.pad_token_id is None:
             self.model.generation_config.pad_token_id = self.model.generation_config.eos_token_id
         self.streamer = TextStreamer(self.tokenizer)

@@ -102,3 +111,63 @@ class KTransformersInterface(TransformersInterface):
         logits = logits[0, -1, :]
         return self.logits_to_token(logits)
+
+    @torch.no_grad
+    def prefill(self, input_ids: torch.Tensor, is_new: bool):
+        input_ids_length = input_ids.shape[-1]
+        self.profiler.set_counter("prefill", input_ids_length)
+        logger.debug(f"input_ids: {input_ids.shape}")
+
+        device = self.device_map.get("blk.0.self_attn", {}).get("generate_device", "cuda:0")
+        if is_new:
+            self.cache.reset()
+            self.ever_generated_ids.clear()
+            former_seq_length = 0
+            self.seq_length = input_ids_length
+            self.generated_ids = torch.zeros(
+                self.args.batch_size,
+                self.seq_length + self.args.max_new_tokens + 1,
+                dtype=torch.int,
+                device=self.args.device,
+            )
+        else:
+            logger.debug(f"generate_ids: {self.generated_ids.shape}")
+            former_seq_length = self.seq_length
+            self.seq_length += input_ids_length
+            expected_length = self.seq_length + self.args.max_new_tokens + 1
+            delta_length = expected_length - self.generated_ids.shape[-1]
+            if delta_length > 0:
+                new_generate_ids = torch.zeros(
+                    self.args.batch_size, delta_length, dtype=torch.int, device=self.args.device
+                )
+                self.generated_ids = torch.cat([self.generated_ids, new_generate_ids], dim=-1)
+        logger.debug(f"cache position: {former_seq_length} to {self.seq_length}")
+        cache_position = torch.arange(former_seq_length, self.seq_length, device=device)
+        self.generated_ids[:, cache_position] = input_ids.to(self.args.device).to(torch.int)
+
+        mask = torch.ones((1, self.seq_length)).to(device)
+        if not (type(self) is TransformersInterface):
+            input_ids = input_ids.to("cpu")
+        inputs_embeds = self.model.model.embed_tokens(input_ids).to(device)
+        if self.use_static_cache:
+            logits = self.model(
+                inputs_embeds=inputs_embeds,
+                cache_position=cache_position,
+                past_key_values=self.cache,
+                return_dict=False,
+                use_cache=True,
+                attention_mask=mask,
+            )[0]
+        else:
+            logits = self.model(inputs_embeds=inputs_embeds, return_dict=False)[0]
+
+        next_token = self.logits_to_token(logits[0, -1, :])
+        yield self.append_new_tokens(next_token)
+
+    @property
+    def active_cache_position(self):
+        device = self.device_map.get("blk.0.self_attn", {}).get("generate_device", "cuda:0")
+        return torch.tensor([self.seq_length - 1], device=device)
\ No newline at end of file
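The new prefill override keeps a growing generated_ids buffer and derives cache_position from the previous and new sequence lengths. A stripped-down sketch of just that bookkeeping (tensors only, no model call; names follow the diff):

import torch

batch_size, max_new_tokens = 1, 4
generated_ids = torch.zeros(batch_size, 0, dtype=torch.int)
seq_length = 0

def prefill_bookkeeping(input_ids: torch.Tensor):
    """Grow the token buffer and compute cache positions for this chunk (sketch)."""
    global generated_ids, seq_length
    former_seq_length = seq_length
    seq_length += input_ids.shape[-1]
    expected_length = seq_length + max_new_tokens + 1
    delta = expected_length - generated_ids.shape[-1]
    if delta > 0:  # extend the buffer so the generated tokens always fit
        generated_ids = torch.cat(
            [generated_ids, torch.zeros(batch_size, delta, dtype=torch.int)], dim=-1)
    cache_position = torch.arange(former_seq_length, seq_length)
    generated_ids[:, cache_position] = input_ids.to(torch.int)
    return cache_position

print(prefill_bookkeeping(torch.tensor([[11, 12, 13]])))  # tensor([0, 1, 2])
print(prefill_bookkeeping(torch.tensor([[14, 15]])))      # tensor([3, 4])
print(generated_ids[:, :5])                               # the five prompt tokens so far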
File: ktransformers/server/backend/interfaces/transformers.py (+2 -2)

@@ -134,7 +134,7 @@ class TransformersInterface(BackendInterfaceBase):
         self.tokenizer = AutoTokenizer.from_pretrained(args.model_dir)
         self.model = AutoModelForCausalLM.from_pretrained(args.model_dir, device_map=args.device, use_safetensors=True)
-        logger.info(f"{args.model_name} loaded from {args.model_dir} to {args.device}")
+        # logger.info(f"{args.model_name} loaded from {args.model_dir} to {args.device}")
         self.cache = StaticCache(
             config=self.model.config,

@@ -143,7 +143,7 @@ class TransformersInterface(BackendInterfaceBase):
             device=args.device,
             dtype=self.model.dtype,
         )
-        logger.info(f"StaticCache (length={args.cache_lens}) created at {args.device}, batch size:{args.batch_size}")
+        # logger.info(f"StaticCache (length={args.cache_lens}) created at {args.device}, batch size:{args.batch_size}")
         self.streamer = TextStreamer(self.tokenizer)