# DeepSeek-V2-Chat.yaml
# Rule: replace every module of class DeepseekV2YarnRotaryEmbedding with the
# ktransformers RoPE operator. There is no `name` filter, so matching is by
# class alone. Both the generate and prefill devices are set to CUDA.
- match:
    class: ktransformers.models.modeling_deepseek.DeepseekV2YarnRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.YarnRotaryEmbedding
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
# Rule: replace torch.nn.Linear modules under model.layers with
# KTransformersLinear. The negative lookahead `(?!.*self_attn)` excludes any
# module whose name contains "self_attn" after the "model.layers." prefix.
# Different ops are selected for the generate and prefill phases.
- match:
    name: "^model\\.layers\\.(?!.*self_attn).*$"  # regular expression
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
# Rule: replace each DeepseekV2MoE module whose name ends in ".mlp" under
# model.layers with KDeepseekV2MoE. Both `name` and `class` are given, so a
# module must satisfy both to be replaced. Both devices are set to CUDA.
- match:
    name: "^model\\.layers\\..*\\.mlp$"
    class: ktransformers.models.modeling_deepseek.DeepseekV2MoE
  replace:
    class: ktransformers.operators.experts.KDeepseekV2MoE     # mlp module with custom forward function
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
# Rule: replace the ".mlp.experts" container of each layer with
# KTransformersExperts. Matching is by name only. Prefill is configured for
# CUDA (KExpertsTorch) and generate for CPU (KExpertsCPU); `out_device` is
# set to CUDA.
# NOTE(review): `out_device` presumably names where the result is placed
# after the CPU generate path -- confirm against the operator.
- match:
    name: "^model\\.layers\\..*\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert parallelism
    kwargs:
      prefill_device: "cuda"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda"
  recursive: false  # don't recursively inject submodules of this module
# Rule: replace each layer's ".self_attn" module with KDeepseekV2Attention.
# Matching is by name only. Both devices are set to CUDA.
- match:
    name: "^model\\.layers\\..*\\.self_attn$"
  replace:
    class: ktransformers.operators.attention.KDeepseekV2Attention  # optimized MLA implementation
    kwargs:
      generate_device: "cuda"
      prefill_device: "cuda"
# Rule: configure the token-embedding module for CPU in both phases.
# The dot is escaped so the pattern matches the literal name prefix
# "model.embed_tokens" rather than "model" + any character + "embed_tokens".
# NOTE(review): `class: "default"` presumably keeps the module's original
# class and only applies the kwargs -- confirm against the injection framework.
- match:
    name: "^model\\.embed_tokens"
  replace:
    class: "default"
    kwargs:
      generate_device: "cpu"
      prefill_device: "cpu"