[Bugfix] Fix score layer quantization for sequence classification models -...

[Bugfix] Fix score layer quantization for sequence classification models - Qwen3 (VL) Reranker (#35849)

Signed-off-by: Hanjun Cho <gkswns0531@gmail.com>
Co-authored-by: wang.yuqi <yuqi.wang@daocloud.io>

[Bugfix] Fix score layer quantization for sequence classification models -...
[Bugfix] Fix score layer quantization for sequence classification models - Qwen3 (VL) Reranker (#35849)

Signed-off-by: Hanjun Cho <gkswns0531@gmail.com>
Co-authored-by: wang.yuqi <yuqi.wang@daocloud.io>
f600d519 · Hanjun Cho · GitHub · 8e782013 · f600d519
Unverified commit f600d519, authored Mar 05, 2026 by Hanjun Cho; committed by GitHub Mar 04, 2026
Show whitespace changes
Inline Side-by-side

Showing with 29 additions and 7 deletions

vllm/model_executor/models/adapters.py vllm/model_executor/models/adapters.py +29 -7

No files found.
--- a/vllm/model_executor/models/adapters.py
+++ b/vllm/model_executor/models/adapters.py
@@ -288,15 +288,37 @@ def as_seq_cls_model(cls: _T) -> _T:
            vllm_config: "VllmConfig",
            prefix: str = "",
        ) -> "Pooler":
-            text_config = vllm_config.model_config.hf_config.get_text_config()
+            hf_config = vllm_config.model_config.hf_config
+            text_config = hf_config.get_text_config()
            model_config = vllm_config.model_config
-            quant_config = vllm_config.quant_config
+
+            # Check if score weights are derived online from LM head
+            # (same condition as load_weights branch)
+            tokens = getattr(
+                hf_config,
+                "classifier_from_token",
+                getattr(text_config, "classifier_from_token", None),
+            )
+            method = getattr(
+                hf_config,
+                "method",
+                getattr(text_config, "method", None),
+            )
+
+            # Online conversion: no score weights in checkpoint, don't
+            # quantize (small output_dim breaks FP8/Marlin tile alignment).
+            # Checkpoint-based: respect the model's quant_config.
+            quant_config = (
+                None
+                if (tokens is not None or method is not None)
+                else vllm_config.quant_config
+            )

            self.score = ReplicatedLinear(
                model_config.get_hidden_size(),
                text_config.num_labels,
                bias=False,
-                params_dtype=vllm_config.model_config.head_dtype,
+                params_dtype=model_config.head_dtype,
                quant_config=quant_config,
                return_bias=False,
                prefix=maybe_prefix(prefix, "score"),
@@ -452,7 +474,6 @@ def load_weights_using_from_2_way_softmax(
    from vllm.model_executor.model_loader.weight_utils import default_weight_loader

    model_config = model.vllm_config.model_config
-    quant_config = model.vllm_config.quant_config
    hf_config = model.config
    text_config = hf_config.get_text_config()

@@ -469,7 +490,8 @@ def load_weights_using_from_2_way_softmax(
    using_vlm_head = is_vlm and hasattr(language_model, "score")

    language_model.lm_head = ParallelLMHead(
-        text_config.vocab_size, text_config.hidden_size, quant_config=quant_config
+        text_config.vocab_size,
+        text_config.hidden_size,
    )
    if text_config.tie_word_embeddings:
        # embed_tokens is the assumed name for input embeddings. If the model does not
@@ -531,7 +553,6 @@ def load_weights_no_post_processing(model, weights: Iterable[tuple[str, torch.Te
    from vllm.model_executor.model_loader.weight_utils import default_weight_loader

    model_config = model.vllm_config.model_config
-    quant_config = model.vllm_config.quant_config
    text_config = model.config.get_text_config()

    tokens = getattr(text_config, "classifier_from_token", [])
@@ -543,7 +564,8 @@ def load_weights_no_post_processing(model, weights: Iterable[tuple[str, torch.Te
    using_vlm_head = is_vlm and hasattr(language_model, "score")

    language_model.lm_head = ParallelLMHead(
-        text_config.vocab_size, text_config.hidden_size, quant_config=quant_config
+        text_config.vocab_size,
+        text_config.hidden_size,
    )
    if text_config.tie_word_embeddings:
        # embed_tokens is the assumed name for input embeddings. If the model does not