"examples/sampling/vscode:/vscode.git/clone" did not exist on "f5b045587bd6ca3c32934d7b8398a364b7bbae46"
Unverified commit e72275cf, authored by William, committed by GitHub

Support MiniCPM3 (#1371)

parent fec2d122
@@ -259,6 +259,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - ChatGLM
 - InternLM 2
 - Exaone 3
+- MiniCPM / MiniCPM 3
 
 **Embedding Models**
@@ -483,11 +483,14 @@ def _decode_grouped_att_m_fwd(
     # shape constraints
     Lq, Lk = q.shape[-1], k_buffer.shape[-1]
     assert Lq == Lk
-    assert Lk in {16, 32, 64, 96, 128, 256, 576}
+    assert Lk in {16, 32, 64, 96, 128, 256, 576, 288}
     if Lk == 576:
         BLOCK_DMODEL = 512
         BLOCK_DPE = 64
+    elif Lk == 288:
+        BLOCK_DMODEL = 256
+        BLOCK_DPE = 32
     else:
         BLOCK_DMODEL = triton.next_power_of_2(Lk)
         BLOCK_DPE = 0
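Context for the new 288 entry (explanatory note, not part of the diff): MLA packs the compressed KV vector and the rotary slice into a single head dimension of kv_lora_rank + qk_rope_head_dim. For DeepSeek-V2 that is 512 + 64 = 576; for MiniCPM3 it is 256 + 32 = 288. The kernel then splits the packed dimension into a no-PE block (BLOCK_DMODEL) and a RoPE block (BLOCK_DPE). A minimal standalone sketch of this dispatch, using a pure-Python stand-in for triton.next_power_of_2:

```python
def next_power_of_2(n: int) -> int:
    # Pure-Python stand-in for triton.next_power_of_2.
    return 1 << (n - 1).bit_length()


def split_head_dim(head_dim: int) -> tuple[int, int]:
    """Return (BLOCK_DMODEL, BLOCK_DPE) for a packed attention head dim."""
    if head_dim == 576:  # DeepSeek-V2 MLA: kv_lora_rank=512 + qk_rope_head_dim=64
        return 512, 64
    if head_dim == 288:  # MiniCPM3 MLA: kv_lora_rank=256 + qk_rope_head_dim=32
        return 256, 32
    # Plain MHA head dims use a single power-of-2 block and no positional split.
    return next_power_of_2(head_dim), 0


assert split_head_dim(288) == (256, 32)
assert split_head_dim(96) == (128, 0)
```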
@@ -280,12 +280,15 @@ def extend_attention_fwd(
     assert Lq == Lk and Lv == Lo
     # TODO: is the assertion necessary?
-    assert Lq in {16, 32, 64, 96, 128, 256, 576}
+    assert Lq in {16, 32, 64, 96, 128, 256, 576, 288}
     assert Lv in {16, 32, 64, 96, 128, 256, 512}
     if Lq == 576:
         BLOCK_DMODEL = 512
         BLOCK_DPE = 64
+    elif Lq == 288:
+        BLOCK_DMODEL = 256
+        BLOCK_DPE = 32
     else:
         BLOCK_DMODEL = triton.next_power_of_2(Lq)
         BLOCK_DPE = 0
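A plausible reason the Lv assertion needs no new entry here: under MLA the value vectors carry only the compressed portion, so Lv = kv_lora_rank (512 for DeepSeek-V2, 256 for MiniCPM3, both already in the set), while Lq = Lk additionally pack the rotary dims on top (512 + 64 = 576, 256 + 32 = 288).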
@@ -64,6 +64,11 @@ class ModelConfig:
             self.attention_arch = AttentionArch.MLA
             self.kv_lora_rank = self.hf_config.kv_lora_rank
             self.qk_rope_head_dim = self.hf_config.qk_rope_head_dim
+        elif "MiniCPM3ForCausalLM" in self.hf_config.architectures:
+            self.head_dim = 128
+            self.attention_arch = AttentionArch.MLA
+            self.kv_lora_rank = self.hf_config.kv_lora_rank
+            self.qk_rope_head_dim = self.hf_config.qk_rope_head_dim
         else:
             self.attention_arch = AttentionArch.MHA
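For orientation (the class and field names below are hypothetical, not from this commit): the attention_arch flag primarily drives KV-cache layout. An MLA model caches one shared vector of kv_lora_rank + qk_rope_head_dim elements per token per layer, rather than separate K and V tensors per KV head as in MHA. A hedged sketch of the resulting per-token cache size:

```python
from dataclasses import dataclass
from enum import Enum, auto


class AttentionArch(Enum):
    MHA = auto()
    MLA = auto()


@dataclass
class KVCacheSpec:
    num_layers: int
    num_kv_heads: int
    head_dim: int
    attention_arch: AttentionArch
    kv_lora_rank: int = 0
    qk_rope_head_dim: int = 0

    def elems_per_token(self) -> int:
        # Cached elements per token across all layers (dtype-agnostic count).
        if self.attention_arch is AttentionArch.MLA:
            # One compressed-KV + rope vector per layer, shared by all heads.
            return self.num_layers * (self.kv_lora_rank + self.qk_rope_head_dim)
        # Classic MHA/GQA: separate K and V per KV head.
        return self.num_layers * 2 * self.num_kv_heads * self.head_dim


# Illustrative MiniCPM3-4B-like numbers: 62 layers, 256 + 32 packed dim.
spec = KVCacheSpec(62, 1, 128, AttentionArch.MLA, 256, 32)
print(spec.elems_per_token())  # 62 * 288 = 17856
```

The MLA count is independent of the number of query heads, which is where the cache savings over MHA come from.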
(A further file diff in this commit is collapsed and not shown here.)