[Model] Classification models support logit_bias / sigmoid_normalize (#24031)

Signed-off-by: wang.yuqi <noooop@126.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>

[Model] Classification models support logit_bias / sigmoid_normalize (#24031)
Signed-off-by: wang.yuqi <noooop@126.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
e0653f6c · wang.yuqi · GitHub · 38ba061f · e0653f6c · e0653f6c
Unverified Commit e0653f6c authored Sep 03, 2025 by wang.yuqi Committed by GitHub Sep 02, 2025
4 changed files
--- a/vllm/config/__init__.py
+++ b/vllm/config/__init__.py
@@ -2651,24 +2651,46 @@ class PoolerConfig:
    ## for embeddings models
    normalize: Optional[bool] = None
    """
-    Whether to normalize the embeddings outputs.
+    Whether to normalize the embeddings outputs. Defaults to True.
    """
    dimensions: Optional[int] = None
    """
    Reduce the dimensions of embeddings if model
-    support matryoshka representation.
+    support matryoshka representation. Defaults to None.
+    """
+    enable_chunked_processing: Optional[bool] = None
+    """
+    Whether to enable chunked processing for long inputs that exceed the model's
+    maximum position embeddings. When enabled, long inputs will be split into
+    chunks, processed separately, and then aggregated using weighted averaging.
+    This allows embedding models to handle arbitrarily long text without CUDA
+    errors. Defaults to False.
+    """
+    max_embed_len: Optional[int] = None
+    """
+    Maximum input length allowed for embedding generation. When set, allows
+    inputs longer than max_embed_len to be accepted for embedding models.
+    When an input exceeds max_embed_len, it will be handled according to 
+    the original max_model_len validation logic. 
+    Defaults to None (i.e. set to max_model_len).
    """

    ## for classification models
    activation: Optional[bool] = None
    """
    Whether to apply activation function to the classification outputs.
+    Defaults to True.
+    """
+    logit_bias: Optional[float] = None
+    """
+    If provided, apply classification logit biases. Defaults to None.
    """

    ## for reward models
    softmax: Optional[bool] = None
    """
    Whether to apply softmax to the reward outputs.
+    Defaults to True.
    """
    step_tag_id: Optional[int] = None
    """
@@ -2683,25 +2705,6 @@ class PoolerConfig:
    ``math-shepherd-mistral-7b-prm`` model.
    """

-    enable_chunked_processing: Optional[bool] = None
-    """
-    Whether to enable chunked processing for long inputs that exceed the model's
-    maximum position embeddings. When enabled, long inputs will be split into
-    chunks, processed separately, and then aggregated using weighted averaging.
-    This allows embedding models to handle arbitrarily long text without CUDA
-    errors. Defaults to False.
-    """
-
-    max_embed_len: Optional[int] = None
-    """
-    Maximum input length allowed for embedding generation. When set, allows
-    inputs longer than max_embed_len to be accepted for embedding models.
-    This parameter enables accepting long inputs without requiring
-    VLLM_ALLOW_LONG_MAX_MODEL_LEN environment variable. When an input exceeds
-    max_embed_len, it will be handled according to the original max_model_len
-    validation logic. Defaults to None (i.e. set to max_model_len).
-    """
-
    def compute_hash(self) -> str:
        """
        WARNING: Whenever a new field is added to this config,

--- a/vllm/model_executor/layers/pooler.py
+++ b/vllm/model_executor/layers/pooler.py
@@ -633,9 +633,14 @@ class ClassifierPooler(Pooler):
    ) -> None:
        super().__init__()

+        from vllm.config import get_current_vllm_config
+        vllm_config = get_current_vllm_config()
+
        self.pooling = pooling
        self.classifier = classifier
        self.act_fn = act_fn or PoolerClassify()
+        self.logit_bias: Optional[
+            float] = vllm_config.model_config.pooler_config.logit_bias

    def get_supported_tasks(self) -> Set[PoolingTask]:
        return {"classify", "score"}
@@ -654,6 +659,9 @@ class ClassifierPooler(Pooler):
            pooled_data = self.classifier(pooled_data)
        # pooled_data shape: [batchsize, num_labels]

+        if self.logit_bias is not None:
+            pooled_data -= self.logit_bias
+
        pooling_params = get_pooling_params(pooling_metadata)
        flags = [p.activation for p in pooling_params]


--- a/vllm/model_executor/models/config.py
+++ b/vllm/model_executor/models/config.py
@@ -210,8 +210,10 @@ class JinaVLForSequenceClassificationConfig(VerifyAndUpdateConfig):
    @staticmethod
    def verify_and_update_config(vllm_config: "VllmConfig") -> None:
        config = vllm_config.model_config.hf_config
-
        config.num_labels = 1
+        pooler_config = vllm_config.model_config.pooler_config
+        if pooler_config.logit_bias is None:
+            pooler_config.logit_bias = 2.65


 class SnowflakeGteNewModelConfig(VerifyAndUpdateConfig):

--- a/vllm/model_executor/models/jina_vl.py
+++ b/vllm/model_executor/models/jina_vl.py
@@ -92,17 +92,14 @@ class JinaVLForSequenceClassification(Qwen2VLForConditionalGeneration,
        pooler_config = vllm_config.model_config.pooler_config
        assert pooler_config is not None

-        # logit bias for sigmoid normalization
-        self.LOGIT_BIAS = 2.65
-
        self.score = JinaVLScorer(config)
        self.pooler = DispatchPooler({
            "encode":
            Pooler.for_encode(pooler_config),
            "classify":
-            Pooler.for_classify(pooler_config, classifier=None),
+            Pooler.for_classify(pooler_config, classifier=self.score),
            "score":
-            Pooler.for_classify(pooler_config, classifier=None),
+            Pooler.for_classify(pooler_config, classifier=self.score),
        })

    @classmethod
@@ -137,9 +134,7 @@ class JinaVLForSequenceClassification(Qwen2VLForConditionalGeneration,
            inputs_embeds=inputs_embeds,
            **kwargs,
        )
-
-        logits = self.score(hidden_states) - self.LOGIT_BIAS
-        return logits
+        return hidden_states

    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
        loader = AutoWeightsLoader(self)