[Refactor] GLM-ASR Modeling (#31779)

Signed-off-by: JaredforReal <w13431838023@gmail.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>

[Refactor] GLM-ASR Modeling (#31779)
Signed-off-by: JaredforReal <w13431838023@gmail.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
97413875 · Jared Wen · GitHub · 41cfa506 · 97413875 · 97413875
Unverified commit 97413875, authored Jan 07, 2026 by Jared Wen; committed by GitHub on Jan 07, 2026.
Showing 2 changed files with 672 additions and 41 deletions

vllm/model_executor/models/glmasr.py vllm/model_executor/models/glmasr.py +644 -36

vllm/model_executor/models/glmasr_utils.py vllm/model_executor/models/glmasr_utils.py +28 -5

No files found.
--- a/vllm/model_executor/models/glmasr.py
+++ b/vllm/model_executor/models/glmasr.py
--- a/vllm/model_executor/models/glmasr_utils.py
+++ b/vllm/model_executor/models/glmasr_utils.py
@@ -71,15 +71,38 @@ def _get_audio_output_lengths_for_tower(
    merge_factor: int,
    conv_params: list[tuple[int, int, int]],
 ) -> torch.Tensor:
+    """
+    Calculate the output lengths after audio processing.
+    The output length accounts for:
+    1. Convolution layers (downsampling)
+    2. Merge factor (further downsampling during projection)
+    Args:
+        audio_tower: The audio encoder module
+        audio_lengths: Input feature lengths [batch_size]
+        merge_factor: Factor for merging adjacent features
+        conv_params: List of (padding, kernel_size, stride) for each conv layer
+    Returns:
+        Output lengths after all processing [batch_size]
+    """
+    # First, calculate the output length after convolutions
    if hasattr(audio_tower, "_get_feat_extract_output_lengths"):
-        _, audio_output_lengths = audio_tower._get_feat_extract_output_lengths(
+        _, conv_output_lengths = audio_tower._get_feat_extract_output_lengths(
            audio_lengths
        )
-        return audio_output_lengths
+    else:
-    return _get_audio_output_lengths_from_lengths(
+        conv_output_lengths = audio_lengths
-        audio_lengths, merge_factor, conv_params
+        for padding, kernel_size, stride in conv_params:
+            conv_output_lengths = _calculate_conv_output_length(
+                conv_output_lengths, padding, kernel_size, stride
            )
+    # Then, apply merge_factor to get final output length
+    # Formula: (conv_output_lengths - merge_factor) // merge_factor + 1
+    return (conv_output_lengths - merge_factor) // merge_factor + 1
 def _flatten_audio_features_by_length(
    audio_features: torch.Tensor,