[Bugfix] Fix GLM4 model

9e14887f · zhuwenwen · d1ea1295 · 9e14887f · 9e14887f · 9e14887f
Commit 9e14887f authored Apr 19, 2025 by zhuwenwen
Showing 3 changed files with 7 additions and 7 deletions

docs/source/models/supported_models.md +1 -1

tests/models/registry.py +1 -1

vllm/model_executor/models/glm4.py +5 -5

No files found.
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -334,7 +334,7 @@ See [this page](#generative-models) for more information on how to use generativ
  * ✅︎
 - * `Glm4ForCausalLM`
  * GLM-4-0414
-  * `THUDM/GLM-4-32B-Chat-0414`, etc.
+  * `THUDM/GLM-4-32B-0414`, etc.
  * ✅︎
  * ✅︎
 - * `GPT2LMHeadModel`

--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -147,7 +147,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
                                         min_transformers_version="4.50"),
    "GlmForCausalLM": _HfExamplesInfo("THUDM/glm-4-9b-chat-hf"),
    "Glm4ForCausalLM": _HfExamplesInfo(
-        "THUDM/GLM-4-32B-Chat-0414",
+        "THUDM/GLM-4-32B-0414",
        is_available_online=False,
        min_transformers_version="4.52.dev0"
    ),

--- a/vllm/model_executor/models/glm4.py
+++ b/vllm/model_executor/models/glm4.py
@@ -82,7 +82,7 @@ class Glm4Attention(nn.Module):
        partial_rotary_factor = getattr(config, "partial_rotary_factor", 0.5)
        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
        self.head_dim = head_dim or hidden_size // self.total_num_heads
-        self.rotary_dim = int(partial_rotary_factor * self.head_dim)
+        self.rotary_dim = self.head_dim
        self.q_size = self.num_heads * self.head_dim
        self.kv_size = self.num_kv_heads * self.head_dim
        self.scaling = self.head_dim**-0.5
@@ -110,6 +110,7 @@ class Glm4Attention(nn.Module):
            base=self.rope_theta,
            rope_scaling=rope_scaling,
            partial_rotary_factor=partial_rotary_factor,
+            is_neox_style=False,
        )
        self.attn = Attention(self.num_heads,
                              self.head_dim,
@@ -197,13 +198,12 @@ class Glm4DecoderLayer(nn.Module):
        )

        hidden_states = self.post_self_attn_layernorm(hidden_states)
-        hidden_states = residual + hidden_states

        # Fully Connected
-        hidden_states = self.post_attention_layernorm(hidden_states, residual)
+        hidden_states, residual = self.post_attention_layernorm(
+             hidden_states, residual)
        hidden_states = self.mlp(hidden_states)
        hidden_states = self.post_mlp_layernorm(hidden_states)
-        hidden_states = residual + hidden_states

        return hidden_states, residual