[New Model]: Support Qwen3 Embedding & Reranker

9bcbaafc · zhuwenwen · 2f6f5bb3 · 9bcbaafc · 9bcbaafc · 9bcbaafc
Commit 9bcbaafc authored Jun 13, 2025 by zhuwenwen
5 changed files
--- a/README.md
+++ b/README.md
@@ -15,7 +15,7 @@ vLLM是一个快速且易于使用的LLM推理和服务库,使用PageAttention
 | Llama4ForConditionalGeneration | Llama 4                                                                               | No/Yes | -  | - | v0.8.5.post1  | No |
 | QWenLMHeadModel                | QWen,Qwen-VL                                                                          | Yes | Yes | Yes | v0.5.0，Qwen-VL>=v0.6.2 | Yes |
 | Qwen2ForCausalLM               | QWen2,QWen1.5,CodeQwen1.5,DeepSeek-R1-Distill-Qwen,gte_Qwen2-1.5B-instruct            | Yes | Yes | Yes | v0.5.0，gte>=v0.7.2   | Yes |
-| Qwen3ForCausalLM               | QWen3,Qwen3-Embedding                                                                 | Yes | - | - | v0.8.4   | Yes |
+| Qwen3ForCausalLM               | QWen3,Qwen3-Embedding,Qwen3-Reranker                                                  | Yes | - | - | v0.8.4   | Yes |
 | Qwen3MoeForCausalLM            | QWen3MoE                                                    | Yes | - | - | v0.8.4   | Yes |
 | ChatGLMModel                   | glm-4v-9b,chatglm3,chatglm2                                 | Yes | No  | Yes | v0.5.0   | Yes |
 | Glm4ForCausalLM                | GLM-4-0414                                                  | No/Yes | -  | - | v0.8.5.post1   | Yes |
@@ -36,6 +36,7 @@ vLLM是一个快速且易于使用的LLM推理和服务库,使用PageAttention
 | LlavaForConditionalGeneration       | LLaMA,LLaMA-2,LLaMA-3                         | Yes | No  | -   | v0.6.2   | No |
 | Qwen2VLForConditionalGeneration     | Qwen2-VL                                      | Yes | No  | Yes | v0.6.2   | No |
 | Qwen2_5_VLForConditionalGeneration  | Qwen.5-VL                                     | Yes | No  | Yes | v0.7.2   | No |
+| Mistral3ForConditionalGeneration    | Mistral3                                      | Yes | No  | -   | v0.8.5.post1   | No |
 | Gemma3ForConditionalGeneration      | Gemma 3                                       | Yes | -   | -   | v0.8.5.post1   | No |
 | MiniCPMV                            | MiniCPM-V                                     | Yes | No  | -   | v0.6.2  | No |
 | Phi3VForCausalLM                    | Phi-3.5-vision                                | Yes | No  | -   | v0.6.2  | No |

--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -636,6 +636,11 @@ you should explicitly specify the task type to ensure that the model is used in
  * `ssmits/Qwen2-7B-Instruct-embed-base` (see note), `Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc.
  * ✅︎
  * ✅︎
+- * `Qwen3Model`, `Qwen3ForCausalLM` 
+  * Qwen3-based
+  * `Qwen/Qwen3-Embedding-0.6B`, etc. 
+  * ✅︎
+  * ✅︎
 - * `RobertaModel`, `RobertaForMaskedLM`
  * RoBERTa-based
  * `sentence-transformers/all-roberta-large-v1`, `sentence-transformers/all-roberta-large-v1`, etc.

--- a/examples/offline_inference/qwen3_reranker.py
+++ b/examples/offline_inference/qwen3_reranker.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# ruff: noqa: E501
+from vllm import LLM
+model_name = "Qwen3-Reranker-0.6B"
+# What is the difference between the official original version and one
+# that has been converted into a sequence classification model?
+# Qwen3-Reranker is a language model that doing reranker by using the
+# logits of "no" and "yes" tokens.
+# It needs to computing 151669 tokens logits, making this method extremely
+# inefficient, not to mention incompatible with the vllm score API.
+# A method for converting the original model into a sequence classification
+# model was proposed. See：https://huggingface.co/Qwen/Qwen3-Reranker-0.6B/discussions/3
+# Models converted offline using this method can not only be more efficient
+# and support the vllm score API, but also make the init parameters more
+# concise, for example.
+# model = LLM(model="tomaarsen/Qwen3-Reranker-0.6B-seq-cls", task="score")
+# If you want to load the official original version, the init parameters are
+# as follows.
+model = LLM(
+    model=model_name,
+    task="score",
+    hf_overrides={
+        "architectures": ["Qwen3ForSequenceClassification"],
+        "classifier_from_token": ["no", "yes"],
+        "is_original_qwen3_reranker": True,
+    },
+)
+# Why do we need hf_overrides for the official original version:
+# vllm converts it to Qwen3ForSequenceClassification when loaded for
+# better performance.
+# - Firstly, we need using `"architectures": ["Qwen3ForSequenceClassification"],`
+# to manually route to Qwen3ForSequenceClassification.
+# - Then, we will extract the vector corresponding to classifier_from_token
+# from lm_head using `"classifier_from_token": ["no", "yes"]`.
+# - Third, we will convert these two vectors into one vector.  The use of
+# conversion logic is controlled by `using "is_original_qwen3_reranker": True`.
+# Please use the query_template and document_template to format the query and
+# document for better reranker results.
+prefix = '<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be "yes" or "no".<|im_end|>\n<|im_start|>user\n'
+suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
+query_template = "{prefix}<Instruct>: {instruction}\n<Query>: {query}\n"
+document_template = "<Document>: {doc}{suffix}"
+if __name__ == "__main__":
+    instruction = (
+        "Given a web search query, retrieve relevant passages that answer the query"
+    )
+    queries = [
+        "What is the capital of China?",
+        "Explain gravity",
+    ]
+    documents = [
+        "The capital of China is Beijing.",
+        "Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun.",
+    ]
+    queries = [
+        query_template.format(prefix=prefix, instruction=instruction, query=query)
+        for query in queries
+    ]
+    documents = [document_template.format(doc=doc, suffix=suffix) for doc in documents]
+    outputs = model.score(queries, documents)
+    print([output.outputs.score for output in outputs])
\ No newline at end of file
--- a/vllm/model_executor/models/qwen3.py
+++ b/vllm/model_executor/models/qwen3.py
@@ -36,13 +36,15 @@ from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (QKVParallelLinear,
                                               RowParallelLinear)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.pooler import Pooler, PoolingType
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
+from vllm.model_executor.pooling_metadata import PoolingMetadata
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.sequence import IntermediateTensors
+from vllm.sequence import IntermediateTensors, PoolerOutput
-from .interfaces import SupportsLoRA, SupportsPP
+from .interfaces import SupportsCrossEncoding, SupportsLoRA, SupportsPP
 from .qwen2 import Qwen2MLP as Qwen3MLP
 from .qwen2 import Qwen2Model
 from .utils import AutoWeightsLoader, PPMissingLayer, maybe_prefix
@@ -317,3 +319,122 @@ class Qwen3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
                           if self.config.tie_word_embeddings else None),
        )
        return loader.load_weights(weights)
+class Qwen3ForSequenceClassification(nn.Module, SupportsLoRA,
+                                     SupportsCrossEncoding):
+    def __init__(
+        self,
+        vllm_config: "VllmConfig",
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        config = vllm_config.model_config.hf_config
+        quant_config = vllm_config.quant_config
+        pooler_config = vllm_config.model_config.pooler_config
+        self.vllm_config = vllm_config
+        self.config = config
+        self.quant_config = quant_config
+        self.prefix = prefix
+        self.model = Qwen3Model(vllm_config=vllm_config,
+                                prefix=maybe_prefix(prefix, "model"))
+        self.score = RowParallelLinear(config.hidden_size,
+                                       config.num_labels,
+                                       quant_config=quant_config,
+                                       input_is_parallel=False,
+                                       bias=False,
+                                       prefix=maybe_prefix(prefix, "score"))
+        self._pooler = Pooler.from_config_with_defaults(
+            pooler_config,
+            pooling_type=PoolingType.LAST,
+            normalize=False,
+            softmax=True)
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        return self.model(input_ids=input_ids,
+                          positions=positions,
+                          inputs_embeds=inputs_embeds,
+                          intermediate_tensors=intermediate_tensors)
+    def pooler(
+        self,
+        hidden_states: torch.Tensor,
+        pooling_metadata: PoolingMetadata,
+    ) -> Optional[PoolerOutput]:
+        hidden_states = self._pooler.extract_states(hidden_states,
+                                                    pooling_metadata)
+        logits, _ = self.score(hidden_states)
+        pooled_data = self._pooler.head(logits, pooling_metadata)
+        pooled_outputs = [
+            self._pooler.build_output(data.squeeze(-1)) for data in pooled_data
+        ]
+        return PoolerOutput(outputs=pooled_outputs)
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
+        is_original_qwen3_reranker = getattr(self.config,
+                                             "is_original_qwen3_reranker",
+                                             False)
+        if not is_original_qwen3_reranker:
+            loader = AutoWeightsLoader(self)
+            return loader.load_weights(weights)
+        return self.load_weights_from_original_qwen3_reranker(weights)
+    def load_weights_from_original_qwen3_reranker(
+            self, weights: Iterable[tuple[str, torch.Tensor]]):
+        tokens = getattr(self.config, "classifier_from_token", None)
+        assert tokens is not None and len(tokens) == 2, \
+            ("Try loading the original Qwen3 Reranker?, see: "
+             "https://github.com/vllm-project/vllm/tree/main/examples/offline_inference/qwen3_reranker.py")
+        self.config.num_labels = 1
+        model_config = self.vllm_config.model_config
+        device = self.score.weight.device
+        self.score = RowParallelLinear(self.config.hidden_size,
+                                       self.config.num_labels,
+                                       quant_config=self.quant_config,
+                                       input_is_parallel=False,
+                                       bias=False,
+                                       prefix=maybe_prefix(
+                                           self.prefix, "score")).to(device)
+        if self.config.tie_word_embeddings:
+            self.lm_head = self.model.embed_tokens
+        else:
+            self.lm_head = ParallelLMHead(self.config.vocab_size,
+                                          self.config.hidden_size,
+                                          quant_config=self.quant_config,
+                                          prefix=maybe_prefix(
+                                              self.prefix, "lm_head"))
+        loader = AutoWeightsLoader(self)
+        loaded_weights = loader.load_weights(weights)
+        from vllm.transformers_utils.tokenizer import get_tokenizer
+        tokenizer = get_tokenizer(
+            model_config.tokenizer,
+            revision=model_config.tokenizer_revision,
+            tokenizer_mode=model_config.tokenizer_mode,
+            trust_remote_code=model_config.trust_remote_code)
+        a = tokenizer.convert_tokens_to_ids(tokens[0])
+        b = tokenizer.convert_tokens_to_ids(tokens[1])
+        weight = self.lm_head.weight.data[b].to(
+            device) - self.lm_head.weight.data[a].to(device)
+        self.score.weight.data.copy_(weight)
+        del self.lm_head
+        loaded_weights.add("classifier.weight")
+        loaded_weights.discard("lm_head.weight")
\ No newline at end of file
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -168,6 +168,7 @@ _CROSS_ENCODER_MODELS = {
                                            "RobertaForSequenceClassification"),
    "ModernBertForSequenceClassification": ("modernbert",
                                            "ModernBertForSequenceClassification"),
+    "Qwen3ForSequenceClassification": ("qwen3", "Qwen3ForSequenceClassification"), # noqa: E501
 }
 _MULTIMODAL_MODELS = {