Merge tag 'v0.13.0rc1' into v0.13.0rc1-ori

8d75f22e · zhuwenwen · ce888aa4 · 7d80c73d · 8d75f22e · 8d75f22e
Commit 8d75f22e authored Dec 13, 2025 by zhuwenwen
20 changed files
--- a/vllm/model_executor/models/bamba.py
+++ b/vllm/model_executor/models/bamba.py
@@ -178,9 +178,7 @@ class BambaAttentionDecoderLayer(nn.Module):
        self.scaling = self.head_dim**-0.5
        self.max_position_embeddings = max_position_embeddings

-        if hasattr(config, "partial_rotary_factor"):
-            rotary_dim = int(self.head_dim * config.partial_rotary_factor)
-        elif hasattr(config, "attn_rotary_emb"):
+        if hasattr(config, "attn_rotary_emb"):
            rotary_dim = config.attn_rotary_emb  # for backward compatibility
        else:
            rotary_dim = self.head_dim  # default

--- a/vllm/model_executor/models/blip2.py
+++ b/vllm/model_executor/models/blip2.py
@@ -523,8 +523,6 @@ class Blip2MultiModalProcessor(BaseMultiModalProcessor[Blip2ProcessingInfo]):
 class Blip2ForConditionalGeneration(
    nn.Module, SupportsMultiModal, SupportsPP, SupportsQuant
 ):
-    merge_by_field_config = True
-
    @classmethod
    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
        if modality.startswith("image"):

--- a/vllm/model_executor/models/chameleon.py
+++ b/vllm/model_executor/models/chameleon.py
@@ -918,8 +918,6 @@ class ChameleonModel(nn.Module):
 class ChameleonForConditionalGeneration(
    nn.Module, SupportsMultiModal, SupportsPP, SupportsQuant
 ):
-    merge_by_field_config = True
-
    packed_modules_mapping = {
        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
        "gate_up_proj": ["gate_proj", "up_proj"],

--- a/vllm/model_executor/models/clip.py
+++ b/vllm/model_executor/models/clip.py
@@ -784,7 +784,6 @@ class CLIPEmbeddingModel(nn.Module, SupportsMultiModal, SupportsQuant):
    is_pooling_model = True

    packed_modules_mapping = {"qkv_proj": ["q_proj", "k_proj", "v_proj"]}
-    merge_by_field_config = True

    @classmethod
    def get_placeholder_str(cls, modality: str, i: int) -> str | None:

--- a/vllm/model_executor/models/cohere2_vision.py
+++ b/vllm/model_executor/models/cohere2_vision.py
@@ -331,8 +331,6 @@ class Cohere2VisionMultiModalProcessor(
    dummy_inputs=Cohere2VisionDummyInputsBuilder,
 )
 class Cohere2VisionForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
    hf_to_vllm_mapper = WeightsMapper(
        orig_to_new_prefix={
            "model.vision_tower.": "vision_tower.",

--- a/vllm/model_executor/models/config.py
+++ b/vllm/model_executor/models/config.py
@@ -4,11 +4,10 @@ from copy import deepcopy
 from math import lcm
 from typing import TYPE_CHECKING

-import vllm.envs as envs
+from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.logger import init_logger
 from vllm.model_executor.models import ModelRegistry
 from vllm.platforms import current_platform
-from vllm.transformers_utils.config import set_default_rope_theta
 from vllm.utils.math_utils import cdiv, round_up
 from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
 from vllm.v1.kv_cache_interface import FullAttentionSpec, MambaSpec, MLAAttentionSpec
@@ -78,8 +77,6 @@ class JinaRobertaModelConfig(VerifyAndUpdateConfig):
            if not model_config.enforce_eager:
                max_position = round_up(max_position, 8)

-            set_default_rope_theta(config, default_theta=config.rotary_emb_base)
-
            config.rotary_kwargs = {
                "head_size": head_dim,
                "rotary_dim": getattr(config, "rotary_emb_dim", head_dim),
@@ -119,8 +116,6 @@ class NomicBertModelConfig(VerifyAndUpdateConfig):
        rotary_emb_dim = int(head_dim * config.rotary_emb_fraction)
        max_trained_positions = getattr(config, "max_trained_positions", 2048)

-        set_default_rope_theta(config, default_theta=config.rotary_emb_base)
-
        config.rotary_kwargs = {
            "head_size": head_dim,
            "rotary_dim": rotary_emb_dim,
@@ -336,6 +331,7 @@ class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig):
        # Enable FULL_AND_PIECEWISE by default
        MambaModelConfig.verify_and_update_config(vllm_config)

+        attention_config = vllm_config.attention_config
        cache_config = vllm_config.cache_config
        model_config = vllm_config.model_config
        parallel_config = vllm_config.parallel_config
@@ -352,7 +348,9 @@ class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig):
        #   * CUTLASS_MLA backend: kernel_block_size 128 alignment
        #   * Other MLA backends: kernel_block_size 64 alignment
        if model_config.use_mla:
-            use_cutlass_mla = envs.VLLM_ATTENTION_BACKEND == "CUTLASS_MLA"
+            use_cutlass_mla = (
+                attention_config.backend == AttentionBackendEnum.CUTLASS_MLA
+            )
            kernel_block_alignment_size = 128 if use_cutlass_mla else 64
            attn_page_size_1_token = MLAAttentionSpec(
                block_size=1,
@@ -366,8 +364,8 @@ class HybridAttentionMambaModelConfig(VerifyAndUpdateConfig):
                current_platform.is_device_capability(100)
                and model_config.get_head_size() == 256
                and (
-                    envs.VLLM_ATTENTION_BACKEND is None
-                    or envs.VLLM_ATTENTION_BACKEND == "FLASHINFER"
+                    attention_config.backend is None
+                    or attention_config.backend == AttentionBackendEnum.FLASHINFER
                )
            ):
                # https://github.com/flashinfer-ai/flashinfer/issues/1993 reports that`
@@ -490,29 +488,24 @@ class DeepseekV32ForCausalLM(VerifyAndUpdateConfig):
            logger.info("Using bfloat16 kv-cache for DeepSeekV3.2")


-class DeepseekV32ForCausalLM(VerifyAndUpdateConfig):
-
-    @classmethod
-    def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
-        """
-        Updated fp8 cache to custom "fp8_ds_mla" format for DeepSeekV32
+class NemotronHForCausalLMConfig(VerifyAndUpdateConfig):
+    @staticmethod
+    def verify_and_update_config(vllm_config: "VllmConfig") -> None:
+        """Update mamba_ssm_cache_dtype for NemotronH models when set to 'auto'
+        (or not explicitly set), to the value specified in the HF config, or to
+        float16 if not specified.
        """
-        hf_config = vllm_config.model_config.hf_config
-
-        # Mirror the check in vllm/model_executor/models/deepseek_v2.py
-        # is_v32 = hasattr(hf_config, "index_topk")
-        # assert is_v32
-
-        # For DeepSeekV3.2, we use a custom fp8 format as default (i.e.
-        #   "auto")
        cache_config = vllm_config.cache_config
-        if cache_config.cache_dtype == "auto" or \
-            cache_config.cache_dtype.startswith("fp8"):
-            cache_config.cache_dtype = "fp8_ds_mla"
-            logger.info("Using custom fp8 kv-cache format for DeepSeekV3.2")
-        if cache_config.cache_dtype == "bfloat16":
-            cache_config.cache_dtype = "auto"
-            logger.info("Using bfloat16 kv-cache for DeepSeekV3.2")
+        if cache_config.mamba_ssm_cache_dtype == "auto":
+            hf_config = vllm_config.model_config.hf_config
+            mamba_ssm_cache_dtype = getattr(
+                hf_config, "mamba_ssm_cache_dtype", "float16"
+            )
+            logger.info(
+                "Updating mamba_ssm_cache_dtype to '%s' for NemotronH model",
+                mamba_ssm_cache_dtype,
+            )
+            cache_config.mamba_ssm_cache_dtype = mamba_ssm_cache_dtype


 MODELS_CONFIG_MAP: dict[str, type[VerifyAndUpdateConfig]] = {
@@ -532,4 +525,5 @@ MODELS_CONFIG_MAP: dict[str, type[VerifyAndUpdateConfig]] = {
    "Mamba2ForCausalLM": MambaModelConfig,
    "FalconMambaForCausalLM": MambaModelConfig,
    "DeepseekV32ForCausalLM": DeepseekV32ForCausalLM,
+    "NemotronHForCausalLM": NemotronHForCausalLMConfig,
 }
--- a/vllm/model_executor/models/deepseek_ocr.py
+++ b/vllm/model_executor/models/deepseek_ocr.py
@@ -27,7 +27,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (
    MultiModalDataDict,
    MultiModalFieldConfig,
-    MultiModalKwargs,
+    MultiModalKwargsItems,
    NestedTensors,
 )
 from vllm.multimodal.parse import (
@@ -305,7 +305,7 @@ class DeepseekOCRMultiModalProcessor(
        self,
        mm_items: MultiModalDataItems,
        hf_processor_mm_kwargs: Mapping[str, object],
-        out_mm_kwargs: MultiModalKwargs,
+        out_mm_kwargs: MultiModalKwargsItems,
    ) -> Sequence[PromptUpdate]:
        hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)

@@ -344,8 +344,6 @@ class DeepseekOCRMultiModalProcessor(
    dummy_inputs=DeepseekOCRDummyInputsBuilder,
 )
 class DeepseekOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
    hf_to_vllm_mapper = WeightsMapper(
        orig_to_new_prefix={
            # map prefix for language backbone

--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -686,11 +686,10 @@ def sparse_attn_indexer(
                chunk.cu_seqlen_ke,
            )
            num_rows = logits.shape[0]
-            assert topk_tokens == 2048, "top_k_per_row assumes size 2048"
            topk_indices = topk_indices_buffer[
                chunk.token_start : chunk.token_end, :topk_tokens
            ]
-            torch.ops._C.top_k_per_row(
+            torch.ops._C.top_k_per_row_prefill(
                logits,
                chunk.cu_seqlen_ks,
                chunk.cu_seqlen_ke,
@@ -698,6 +697,7 @@ def sparse_attn_indexer(
                num_rows,
                logits.stride(0),
                logits.stride(1),
+                topk_tokens,
            )

    if has_decode:
@@ -740,7 +740,6 @@ def sparse_attn_indexer(
            max_model_len=max_model_len,
        )
        num_rows = logits.shape[0]
-        assert topk_tokens == 2048, "top_k_per_row assumes size 2048"
        topk_indices = topk_indices_buffer[:num_decode_tokens, :topk_tokens]

        torch.ops._C.top_k_per_row_decode(
@@ -751,6 +750,7 @@ def sparse_attn_indexer(
            num_rows,
            logits.stride(0),
            logits.stride(1),
+            topk_tokens,
        )
        if decode_metadata.requires_padding:
            # if padded, we need to unpack

--- a/vllm/model_executor/models/deepseek_vl2.py
+++ b/vllm/model_executor/models/deepseek_vl2.py
@@ -344,8 +344,6 @@ class DeepseekVL2MultiModalProcessor(
    dummy_inputs=DeepseekVL2DummyInputsBuilder,
 )
 class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
    hf_to_vllm_mapper = WeightsMapper(
        orig_to_new_prefix={
            "language.": "language_model.",

--- a/vllm/model_executor/models/dots_ocr.py
+++ b/vllm/model_executor/models/dots_ocr.py
@@ -690,8 +690,6 @@ class DotsVisionTransformer(nn.Module):
    dummy_inputs=DotsOCRDummyInputsBuilder,
 )
 class DotsOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
-    merge_by_field_config = True
-
    hf_to_vllm_mapper = WeightsMapper(
        orig_to_new_substr={
            ".attn.qkv_proj.": ".attn.qkv.",

--- a/vllm/model_executor/models/ernie45_vl.py
+++ b/vllm/model_executor/models/ernie45_vl.py
@@ -289,12 +289,12 @@ class Ernie4_5_VisionAttention(nn.Module):
        elif self.attn_backend == AttentionBackendEnum.TORCH_SDPA:
            # Execute attention entry by entry for speed & less VRAM.
            outputs = []
-            for i in range(1, len(cu_seqlens)):
-                start_idx = cu_seqlens[i - 1]
-                end_idx = cu_seqlens[i]
-                q_i = q[:, start_idx:end_idx]
-                k_i = k[:, start_idx:end_idx]
-                v_i = v[:, start_idx:end_idx]
+
+            lens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
+            q_chunks = torch.split(q, lens, dim=1)
+            k_chunks = torch.split(k, lens, dim=1)
+            v_chunks = torch.split(v, lens, dim=1)
+            for q_i, k_i, v_i in zip(q_chunks, k_chunks, v_chunks):
                q_i, k_i, v_i = (
                    rearrange(x, "b s h d -> b h s d") for x in [q_i, k_i, v_i]
                )
@@ -1254,8 +1254,6 @@ class Ernie4_5_VLDummyInputsBuilder(BaseDummyInputsBuilder[Ernie4_5_VLProcessing
 class Ernie4_5_VLMoeForConditionalGeneration(
    nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE
 ):
-    merge_by_field_config = True
-
    packed_modules_mapping = {
        "qkv_proj": [
            "q_proj",

--- a/vllm/model_executor/models/falcon_h1.py
+++ b/vllm/model_executor/models/falcon_h1.py
@@ -242,9 +242,7 @@ class FalconH1AttentionDecoderLayer(nn.Module):
        self.scaling = self.head_dim**-0.5
        self.max_position_embeddings = max_position_embeddings

-        if hasattr(config, "partial_rotary_factor"):
-            rotary_dim = self.head_dim * config.partial_rotary_factor
-        elif hasattr(config, "attn_rotary_emb"):
+        if hasattr(config, "attn_rotary_emb"):
            rotary_dim = config.attn_rotary_emb  # for backward compatibility
        else:
            rotary_dim = self.head_dim  # default

--- a/vllm/model_executor/models/fuyu.py
+++ b/vllm/model_executor/models/fuyu.py
@@ -260,8 +260,6 @@ class FuyuMultiModalProcessor(BaseMultiModalProcessor[FuyuProcessingInfo]):
    dummy_inputs=FuyuDummyInputsBuilder,
 )
 class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
-    merge_by_field_config = True
-
    hf_to_vllm_mapper = WeightsMapper(
        orig_to_new_prefix={
            "model.vision_embed_tokens.": "vision_embed_tokens.",

--- a/vllm/model_executor/models/gemma3_mm.py
+++ b/vllm/model_executor/models/gemma3_mm.py
@@ -483,8 +483,6 @@ class Gemma3MultiModalProjector(nn.Module):
 class Gemma3ForConditionalGeneration(
    nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA
 ):
-    merge_by_field_config = True
-
    packed_modules_mapping = {
        "qkv_proj": [
            "q_proj",

--- a/vllm/model_executor/models/gemma3n_mm.py
+++ b/vllm/model_executor/models/gemma3n_mm.py
@@ -463,7 +463,6 @@ class Gemma3nMultimodalEmbedder(nn.Module):
 class Gemma3nForConditionalGeneration(
    nn.Module, SupportsMultiModal, SupportsTranscription
 ):
-    merge_by_field_config = True
    supported_languages = ISO639_1_SUPPORTED_LANGS

    packed_modules_mapping = {

--- a/vllm/model_executor/models/glm.py
+++ b/vllm/model_executor/models/glm.py
@@ -10,7 +10,8 @@ from .utils import PPMissingLayer

 class GlmForCausalLM(LlamaForCausalLM):
    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
-        vllm_config.model_config.hf_config.partial_rotary_factor = 0.5
+        hf_config = vllm_config.model_config.hf_config
+        hf_config.rope_parameters["partial_rotary_factor"] = 0.5
        super().__init__(vllm_config=vllm_config, prefix=prefix)
        # Hack Llama model to fit HF format GLM implementation
        # Attention difference between GLM and Llama:

--- a/vllm/model_executor/models/glm4.py
+++ b/vllm/model_executor/models/glm4.py
@@ -78,7 +78,7 @@ class Glm4Attention(nn.Module):
            # Number of KV heads is less than TP size, so we replicate
            # the KV heads across multiple tensor parallel GPUs.
            assert tp_size % self.total_num_kv_heads == 0
-        partial_rotary_factor = getattr(config, "partial_rotary_factor", 0.5)
+        config.rope_parameters.setdefault("partial_rotary_factor", 0.5)
        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
        self.head_dim = head_dim or hidden_size // self.total_num_heads
        self.rotary_dim = self.head_dim
@@ -106,7 +106,6 @@ class Glm4Attention(nn.Module):
            rotary_dim=self.rotary_dim,
            max_position=max_position,
            rope_parameters=config.rope_parameters,
-            partial_rotary_factor=partial_rotary_factor,
            is_neox_style=False,
        )
        self.attn = Attention(

--- a/vllm/model_executor/models/glm4_1v.py
+++ b/vllm/model_executor/models/glm4_1v.py
@@ -377,12 +377,12 @@ class Glm4vVisionAttention(nn.Module):
        elif self.attn_backend == AttentionBackendEnum.TORCH_SDPA:
            # Execute attention entry by entry for speed & less VRAM.
            outputs = []
-            for i in range(1, len(cu_seqlens)):
-                start_idx = cu_seqlens[i - 1]
-                end_idx = cu_seqlens[i]
-                q_i = q[:, start_idx:end_idx]
-                k_i = k[:, start_idx:end_idx]
-                v_i = v[:, start_idx:end_idx]
+
+            lens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
+            q_chunks = torch.split(q, lens, dim=1)
+            k_chunks = torch.split(k, lens, dim=1)
+            v_chunks = torch.split(v, lens, dim=1)
+            for q_i, k_i, v_i in zip(q_chunks, k_chunks, v_chunks):
                q_i, k_i, v_i = (
                    rearrange(x, "b s h d -> b h s d") for x in [q_i, k_i, v_i]
                )
@@ -787,10 +787,10 @@ class Glm4vVisionTransformer(nn.Module):
    def forward(
        self,
        x: torch.Tensor,
-        grid_thw: list[list[int]],
+        grid_thw: torch.Tensor | list[list[int]],
    ) -> torch.Tensor:
-        # Convert grid_thw to tensor (always expecting list format now)
-        grid_thw = torch.tensor(grid_thw, device=x.device, dtype=torch.long)
+        if isinstance(grid_thw, list):
+            grid_thw = torch.tensor(grid_thw, dtype=torch.int32)

        # patchify
        x = x.to(device=self.device, dtype=self.dtype)
@@ -805,7 +805,8 @@ class Glm4vVisionTransformer(nn.Module):
        cu_seqlens = torch.repeat_interleave(
            grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]
        ).cumsum(dim=0, dtype=torch.int32)
-        cu_seqlens = F.pad(cu_seqlens, (1, 0), "constant", 0)
+        cu_seqlens = torch.cat([cu_seqlens.new_zeros(1), cu_seqlens])
+        cu_seqlens = cu_seqlens.to(self.device, non_blocking=True)

        # pre-compute max_seqlen for attn mask to reduce cuMemcpy operations
        max_seqlen = self.compute_attn_mask_seqlen(cu_seqlens)
@@ -1424,8 +1425,6 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]):
 class Glm4vForConditionalGeneration(
    nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE
 ):
-    merge_by_field_config = True
-
    packed_modules_mapping = {
        "qkv_proj": [
            "q_proj",
@@ -1550,7 +1549,6 @@ class Glm4vForConditionalGeneration(
    ) -> tuple[torch.Tensor, ...]:
        grid_thw = image_input["image_grid_thw"]
        assert grid_thw.ndim == 2
-        grid_thw_list = grid_thw.tolist()

        if image_input["type"] == "image_embeds":
            image_embeds = image_input["image_embeds"].type(self.visual.dtype)
@@ -1561,12 +1559,10 @@ class Glm4vForConditionalGeneration(
                    self.visual, pixel_values, grid_thw.tolist(), rope_type="rope_3d"
                )
            else:
-                image_embeds = self.visual(pixel_values, grid_thw=grid_thw.tolist())
+                image_embeds = self.visual(pixel_values, grid_thw=grid_thw)
+
        merge_size = self.visual.spatial_merge_size
-        sizes = (
-            torch.tensor(grid_thw_list, dtype=torch.long).prod(-1)
-            // (merge_size * merge_size)
-        ).tolist()
+        sizes = (grid_thw.prod(-1) // merge_size // merge_size).tolist()
        return image_embeds.split(sizes)

    def _process_video_input(
@@ -1574,7 +1570,6 @@ class Glm4vForConditionalGeneration(
    ) -> tuple[torch.Tensor, ...]:
        grid_thw = video_input["video_grid_thw"]
        assert grid_thw.ndim == 2
-        grid_thw_list = grid_thw.tolist()

        if video_input["type"] == "video_embeds":
            video_embeds = video_input["video_embeds"].type(self.visual.dtype)
@@ -1590,15 +1585,11 @@ class Glm4vForConditionalGeneration(
                    rope_type="rope_3d",
                )
            else:
-                video_embeds = self.visual(
-                    pixel_values_videos, grid_thw=grid_thw.tolist()
-                )
+                video_embeds = self.visual(pixel_values_videos, grid_thw=grid_thw)
+
        # Split concatenated embeddings for each video item.
        merge_size = self.visual.spatial_merge_size
-        sizes = (
-            torch.tensor(grid_thw_list, dtype=torch.long).prod(-1)
-            // (merge_size * merge_size)
-        ).tolist()
+        sizes = (grid_thw.prod(-1) // merge_size // merge_size).tolist()
        return video_embeds.split(sizes)

    def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict:

--- a/vllm/model_executor/models/glm4_moe.py
+++ b/vllm/model_executor/models/glm4_moe.py
@@ -282,13 +282,12 @@ class Glm4MoeAttention(nn.Module):
            prefix=f"{prefix}.o_proj",
        )

-        partial_rotary_factor = getattr(config, "partial_rotary_factor", 0.5)
+        config.rope_parameters.setdefault("partial_rotary_factor", 0.5)
        self.rotary_emb = get_rope(
            self.head_dim,
            rotary_dim=self.head_dim,
            max_position=max_position_embeddings,
            rope_parameters=config.rope_parameters,
-            partial_rotary_factor=partial_rotary_factor,
        )
        self.attn = Attention(
            self.num_heads,

--- a/vllm/model_executor/models/glm4v.py
+++ b/vllm/model_executor/models/glm4v.py
@@ -561,8 +561,6 @@ class GLM4VMultiModalProcessor(BaseMultiModalProcessor[GLM4VProcessingInfo]):
 class GLM4VForCausalLM(
    ChatGLMBaseModel, SupportsMultiModal, SupportsLoRA, SupportsPP, SupportsMRoPE
 ):
-    merge_by_field_config = True
-
    packed_modules_mapping = {
        "query_key_value": ["query_key_value"],
        "dense_h_to_4h": ["dense_h_to_4h"],