# Qwen2-57B-A14B-Instruct-multi-gpu.yaml
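# Optimization rules for running Qwen2-57B-A14B-Instruct across two GPUs with
# ktransformers. Each list entry pairs a match (module-name regex and/or
# module class) with a replace (the class to inject plus per-device kwargs).
# Split used here: layers 0-2 stay on cuda:0, layers 3 and above go to cuda:1
# (see the transfer_map on the KQwen2MoeModel rule near the end), while the
# MoE expert weights are kept on CPU for the generate phase.
# The first four rules below handle layers 0-2 on cuda:0, starting with the
# rotary-embedding replacement.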
- match:
    name: "^model\\.layers\\.([012])\\."
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.RotaryEmbedding
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
- match:
    name: "^model\\.layers\\.([012])$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
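# Replace the stock sparse-MoE block of layers 0-2 with ktransformers'
# KQwen2MoeSparseMoeBlock, whose custom forward lets the experts underneath
# be served by the backends configured in the next rule.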
- match:
    name: "^model\\.layers\\.([012])\\.mlp$"
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeSparseMoeBlock
  replace:
    class: ktransformers.operators.experts.KQwen2MoeSparseMoeBlock     # mlp module with custom forward function
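# Expert weights of layers 0-2: prefill runs on cuda:0 with KExpertsTorch,
# generation runs on CPU with KExpertsCPU, and out_device sends the expert
# output back to cuda:0 where the rest of the layer lives.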
- match:
    name: "^model\\.layers\\.([012])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert parallelism
    # device: "cpu"   # which device to load this module onto when initializing
    kwargs:
      prefill_device: "cuda:0"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda:0"
  recursive: False # don't recursively inject submodules of this module
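
# The next four rules mirror the ones above for the remaining layers
# (regex "[12][0-9]|[3-9]", i.e. layer 3 onwards), placed on cuda:1.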

- match:
    name: "^model\\.layers\\.([12][0-9]|[3-9])\\."
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeRotaryEmbedding
  replace:
    class: ktransformers.operators.RoPE.RotaryEmbedding
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
- match:
    name: "^model\\.layers\\.([12][0-9]|[3-9])$"  # regular expression 
    class: torch.nn.Linear  # only match modules matching name and class simultaneously
  replace:
    class: ktransformers.operators.linear.KTransformersLinear  # optimized Kernel on quantized data types
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
      generate_op: "KLinearMarlin"
      prefill_op: "KLinearTorch"
- match:
    name: "^model\\.layers\\.([12][0-9]|[3-9])\\.mlp$"
    class: ktransformers.models.modeling_qwen2_moe.Qwen2MoeSparseMoeBlock
  replace:
    class: ktransformers.operators.experts.KQwen2MoeSparseMoeBlock     # mlp module with custom forward function
- match:
    name: "^model\\.layers\\.([12][0-9]|[3-9])\\.mlp\\.experts$"
  replace:
    class: ktransformers.operators.experts.KTransformersExperts     # custom MoE Kernel with expert parallelism
    # device: "cpu"   # which device to load this module onto when initializing
    kwargs:
      prefill_device: "cuda:1"
      prefill_op: "KExpertsTorch"
      generate_device: "cpu"
      generate_op: "KExpertsCPU"
      out_device: "cuda:1"
  recursive: False # don't recursively inject submodules of this module
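
# Non-layer modules: the embedding table stays on CPU, while the final norm
# and lm_head are placed on cuda:1 next to the upper decoder layers.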

- match:
    name: "^model.embed_tokens"
  replace:
    class: "default"
    kwargs:
        generate_device: "cpu"
        prefill_device: "cpu"

- match:
    name: "(^model.norm)|(^lm_head)"
  replace:
    class: "default"
    kwargs:
        generate_device: "cuda:1"
        prefill_device: "cuda:1"
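
# Replace the top-level model with KQwen2MoeModel so the forward pass is aware
# of the device split: transfer_map moves the hidden states to cuda:1 when
# reaching layer 3, matching the per-layer rules above.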

- match:
    name: "^model$"
  replace:
    class: "ktransformers.operators.models.KQwen2MoeModel"
    kwargs:
      per_layer_prefill_intput_threshold: 0 # 0 disables layer-wise prefill
      transfer_map: 
        3: "cuda:1"

- match:
    name: "^model\\.layers\\.([012])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:0"
      prefill_device: "cuda:0"

- match:
    name: "^model\\.layers\\.([12][0-9]|[3-9])\\."
  replace:
    class: "default"
    kwargs:
      generate_device: "cuda:1"
      prefill_device: "cuda:1"
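
# Usage note: a rules file like this is passed to ktransformers as the
# optimize-rule/config path when loading the model; the exact CLI flag or
# function argument depends on the ktransformers version you are running.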