Unverified Commit e72275cf authored by William, committed by GitHub

Support MiniCPM3 (#1371)

parent fec2d122
@@ -259,6 +259,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - ChatGLM
 - InternLM 2
 - Exaone 3
+- MiniCPM / MiniCPM 3

 **Embedding Models**
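For reference, a MiniCPM3 server is launched the same way as the other supported models. The model path below (openbmb/MiniCPM3-4B) and the --trust-remote-code flag are illustrative assumptions, not taken from this diff:

python -m sglang.launch_server --model-path openbmb/MiniCPM3-4B --trust-remote-code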
@@ -483,11 +483,14 @@ def _decode_grouped_att_m_fwd(
     # shape constraints
     Lq, Lk = q.shape[-1], k_buffer.shape[-1]
     assert Lq == Lk
-    assert Lk in {16, 32, 64, 96, 128, 256, 576}
+    assert Lk in {16, 32, 64, 96, 128, 256, 576, 288}
     if Lk == 576:
         BLOCK_DMODEL = 512
         BLOCK_DPE = 64
+    elif Lk == 288:
+        BLOCK_DMODEL = 256
+        BLOCK_DPE = 32
     else:
         BLOCK_DMODEL = triton.next_power_of_2(Lk)
         BLOCK_DPE = 0
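The branch added above splits the MLA KV head dim into a power-of-two latent block plus a RoPE tail: 576 = 512 + 64 (DeepSeek-V2 style) and now 288 = 256 + 32 (MiniCPM3). A minimal sketch of that selection logic follows; the helper name pick_blocks is hypothetical and not part of the kernel code:

import triton


def pick_blocks(head_dim: int) -> tuple[int, int]:
    """Return (BLOCK_DMODEL, BLOCK_DPE) for a given KV head dim, mirroring the diff."""
    if head_dim == 576:
        return 512, 64      # DeepSeek-V2 style MLA: kv_lora_rank 512 + rope dim 64
    if head_dim == 288:
        return 256, 32      # MiniCPM3 MLA: kv_lora_rank 256 + rope dim 32
    # Non-MLA head dims use a single power-of-two block and no RoPE tail.
    return triton.next_power_of_2(head_dim), 0


assert pick_blocks(288) == (256, 32)   # MiniCPM3 MLA KV head dim
assert pick_blocks(128) == (128, 0)    # ordinary MHA head dim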
@@ -280,12 +280,15 @@ def extend_attention_fwd(
     assert Lq == Lk and Lv == Lo
     # TODO: is the assertion necessary?
-    assert Lq in {16, 32, 64, 96, 128, 256, 576}
+    assert Lq in {16, 32, 64, 96, 128, 256, 576, 288}
     assert Lv in {16, 32, 64, 96, 128, 256, 512}
     if Lq == 576:
         BLOCK_DMODEL = 512
         BLOCK_DPE = 64
+    elif Lq == 288:
+        BLOCK_DMODEL = 256
+        BLOCK_DPE = 32
     else:
         BLOCK_DMODEL = triton.next_power_of_2(Lq)
         BLOCK_DPE = 0
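The same split is valid for the extend (prefill) path because a dot product over the concatenated head dim equals the sum of the dot products over its two parts, so the kernel can process the latent block and the RoPE tail separately and add their contributions. A hedged PyTorch reference of that identity; shapes and names are illustrative, not the Triton kernel:

import torch

kv_lora_rank, rope_dim = 256, 32               # MiniCPM3-style split: 256 + 32 = 288
q = torch.randn(8, kv_lora_rank + rope_dim)    # 8 query heads, dim 288
k = torch.randn(16, kv_lora_rank + rope_dim)   # 16 cached KV entries

q_nope, q_pe = q.split([kv_lora_rank, rope_dim], dim=-1)
k_nope, k_pe = k.split([kv_lora_rank, rope_dim], dim=-1)

# Summing the partial dot products reproduces the full-dim attention logits.
logits_split = q_nope @ k_nope.T + q_pe @ k_pe.T
logits_full = q @ k.T
assert torch.allclose(logits_split, logits_full, atol=1e-4)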
@@ -64,6 +64,11 @@ class ModelConfig:
             self.attention_arch = AttentionArch.MLA
             self.kv_lora_rank = self.hf_config.kv_lora_rank
             self.qk_rope_head_dim = self.hf_config.qk_rope_head_dim
+        elif "MiniCPM3ForCausalLM" in self.hf_config.architectures:
+            self.head_dim = 128
+            self.attention_arch = AttentionArch.MLA
+            self.kv_lora_rank = self.hf_config.kv_lora_rank
+            self.qk_rope_head_dim = self.hf_config.qk_rope_head_dim
         else:
             self.attention_arch = AttentionArch.MHA
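The 288 value accepted by the kernel assertions should equal kv_lora_rank + qk_rope_head_dim from the MiniCPM3 Hugging Face config, which this hunk copies onto ModelConfig. A quick sanity-check sketch; the model id openbmb/MiniCPM3-4B is an assumption and not taken from this diff:

from transformers import AutoConfig

# Assumed model id; the attribute names come from the ModelConfig hunk above.
cfg = AutoConfig.from_pretrained("openbmb/MiniCPM3-4B", trust_remote_code=True)
kv_cache_head_dim = cfg.kv_lora_rank + cfg.qk_rope_head_dim
print(kv_cache_head_dim)  # expected: 256 + 32 = 288, matching the kernel assertions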