# DeepSeek-V2-Chat.yaml
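#
# Injection rule file for running DeepSeek-V2-Chat with KTransformers. Each
# entry matches modules of the original HuggingFace model by name (a regular
# expression over the module path) and/or class, and replaces them with an
# optimized operator. The generate_* / prefill_* kwargs pick the device and
# kernel used during token generation and prompt prefill respectively. The
# file is applied by the injection framework's loader (optimize_and_load_gguf
# in the KTransformers injection tutorial).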
- match:
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
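# Replace every torch.nn.Linear under model.layers with the quantized kernel,
# except self_attn.kv_b_proj (excluded by the negative lookahead): the
# optimized MLA attention injected below uses the kv_b_proj weights directly,
# so that projection is left for the attention operator to consume.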
- match:
    name: "^model\\.layers\\.(?!.*self_attn\\.kv_b_proj).*$"  # regular expression 
    class: torch.nn.Linear  # match only modules whose name and class both match
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized kernel for quantized weights
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

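# lm_head sits outside model.layers, so it needs its own rule to get the same
# quantized Marlin generate kernel as the layer Linears above.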
- match:
    name: "^lm_head"
    class: torch.nn.Linear
  replace:
    class: ktransformers.operators.linear.KTransformersLinear
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"

- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV2MoE     # MoE module with a custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
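# Run the routed experts on different devices per phase: prefill executes on
# the GPU (KExpertsTorch), while generation runs on the CPU (KExpertsCPU,
# which computes directly on the quantized expert weights in host memory);
# out_device sends the expert outputs back to the CUDA-resident layers.
# recursive: False keeps the injector from descending into the per-expert
# submodules, which would otherwise be re-matched by the torch.nn.Linear rule
# above.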
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE kernel with expert parallelism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda"
  recursive: False # don't recursively inject submodules of this module
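# Swap in the optimized Multi-head Latent Attention (MLA) implementation; this
# operator is the consumer of the kv_b_proj weights excluded from quantization
# earlier.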
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
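# Replace the whole decoder with KDeepseekV2Model, which supports layer-wise
# prefill. The threshold below appears to be the prompt length (in tokens)
# above which layer-wise prefill activates; note the key's upstream spelling
# "intput".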
- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KDeepseekV2Model"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 disables layer-wise prefill
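# "default" keeps the original embedding module and only pins its devices;
# the token-embedding lookup is cheap, so holding it on CPU saves VRAM.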
- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"