Unverified Commit e72275cf authored by William, committed by GitHub

Support MiniCPM3 (#1371)

parent fec2d122
@@ -259,6 +259,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - ChatGLM
 - InternLM 2
 - Exaone 3
+- MiniCPM / MiniCPM 3

 **Embedding Models**
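For reference, a MiniCPM3 server is launched the same way as the other supported models. The model path below (openbmb/MiniCPM3-4B) and the --trust-remote-code flag are illustrative assumptions, not taken from this diff:

python -m sglang.launch_server --model-path openbmb/MiniCPM3-4B --trust-remote-code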
@@ -483,11 +483,14 @@ def _decode_grouped_att_m_fwd(
     # shape constraints
     Lq, Lk = q.shape[-1], k_buffer.shape[-1]
     assert Lq == Lk
-    assert Lk in {16, 32, 64, 96, 128, 256, 576}
+    assert Lk in {16, 32, 64, 96, 128, 256, 576, 288}
     if Lk == 576:
         BLOCK_DMODEL = 512
         BLOCK_DPE = 64
+    elif Lk == 288:
+        BLOCK_DMODEL = 256
+        BLOCK_DPE = 32
     else:
         BLOCK_DMODEL = triton.next_power_of_2(Lk)
         BLOCK_DPE = 0
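The branch added above splits the MLA KV head dim into a power-of-two latent block plus a RoPE tail: 576 = 512 + 64 (DeepSeek-V2 style) and now 288 = 256 + 32 (MiniCPM3). A minimal sketch of that selection logic follows; the helper name pick_blocks is hypothetical and not part of the kernel code:

import triton


def pick_blocks(head_dim: int) -> tuple[int, int]:
    """Return (BLOCK_DMODEL, BLOCK_DPE) for a given KV head dim, mirroring the diff."""
    if head_dim == 576:
        return 512, 64      # DeepSeek-V2 style MLA: kv_lora_rank 512 + rope dim 64
    if head_dim == 288:
        return 256, 32      # MiniCPM3 MLA: kv_lora_rank 256 + rope dim 32
    # Non-MLA head dims use a single power-of-two block and no RoPE tail.
    return triton.next_power_of_2(head_dim), 0


assert pick_blocks(288) == (256, 32)   # MiniCPM3 MLA KV head dim
assert pick_blocks(128) == (128, 0)    # ordinary MHA head dim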
@@ -280,12 +280,15 @@ def extend_attention_fwd(
     assert Lq == Lk and Lv == Lo
     # TODO: is the assertion necessary?
-    assert Lq in {16, 32, 64, 96, 128, 256, 576}
+    assert Lq in {16, 32, 64, 96, 128, 256, 576, 288}
     assert Lv in {16, 32, 64, 96, 128, 256, 512}
     if Lq == 576:
         BLOCK_DMODEL = 512
         BLOCK_DPE = 64
+    elif Lq == 288:
+        BLOCK_DMODEL = 256
+        BLOCK_DPE = 32
     else:
         BLOCK_DMODEL = triton.next_power_of_2(Lq)
         BLOCK_DPE = 0
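The same split is valid for the extend (prefill) path because a dot product over the concatenated head dim equals the sum of the dot products over its two parts, so the kernel can process the latent block and the RoPE tail separately and add their contributions. A hedged PyTorch reference of that identity; shapes and names are illustrative, not the Triton kernel:

import torch

kv_lora_rank, rope_dim = 256, 32               # MiniCPM3-style split: 256 + 32 = 288
q = torch.randn(8, kv_lora_rank + rope_dim)    # 8 query heads, dim 288
k = torch.randn(16, kv_lora_rank + rope_dim)   # 16 cached KV entries

q_nope, q_pe = q.split([kv_lora_rank, rope_dim], dim=-1)
k_nope, k_pe = k.split([kv_lora_rank, rope_dim], dim=-1)

# Summing the partial dot products reproduces the full-dim attention logits.
logits_split = q_nope @ k_nope.T + q_pe @ k_pe.T
logits_full = q @ k.T
assert torch.allclose(logits_split, logits_full, atol=1e-4)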
@@ -64,6 +64,11 @@ class ModelConfig:
             self.attention_arch = AttentionArch.MLA
             self.kv_lora_rank = self.hf_config.kv_lora_rank
             self.qk_rope_head_dim = self.hf_config.qk_rope_head_dim
+        elif "MiniCPM3ForCausalLM" in self.hf_config.architectures:
+            self.head_dim = 128
+            self.attention_arch = AttentionArch.MLA
+            self.kv_lora_rank = self.hf_config.kv_lora_rank
+            self.qk_rope_head_dim = self.hf_config.qk_rope_head_dim
         else:
             self.attention_arch = AttentionArch.MHA
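The 288 value accepted by the kernel assertions should equal kv_lora_rank + qk_rope_head_dim from the MiniCPM3 Hugging Face config, which this hunk copies onto ModelConfig. A quick sanity-check sketch; the model id openbmb/MiniCPM3-4B is an assumption and not taken from this diff:

from transformers import AutoConfig

# Assumed model id; the attribute names come from the ModelConfig hunk above.
cfg = AutoConfig.from_pretrained("openbmb/MiniCPM3-4B", trust_remote_code=True)
kv_cache_head_dim = cfg.kv_lora_rank + cfg.qk_rope_head_dim
print(kv_cache_head_dim)  # expected: 256 + 32 = 288, matching the kernel assertions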