[Model][2/N] Improve all pooling task | Support multi-vector retrieval (#25370)

Signed-off-by: wang.yuqi <noooop@126.com>

[Model][2/N] Improve all pooling task | Support multi-vector retrieval (#25370)
Signed-off-by: wang.yuqi <noooop@126.com>
f54f8512 · wang.yuqi · GitHub · d4d1a602 · f54f8512 · f54f8512
Unverified Commit f54f8512 authored Oct 15, 2025 by wang.yuqi Committed by GitHub Oct 15, 2025
20 changed files
--- a/examples/offline_inference/pooling/README.md
+++ b/examples/offline_inference/pooling/README.md
@@ -26,6 +26,12 @@ python examples/offline_inference/pooling/embed_jina_embeddings_v3.py
 python examples/offline_inference/pooling/embed_matryoshka_fy.py
 ```

+## Multi vector retrieval usage
+
+```bash
+python examples/offline_inference/pooling/multi_vector_retrieval.py
+```
+
 ## Named Entity Recognition (NER) usage

 ```bash

--- a/examples/offline_inference/pooling/multi_vector_retrieval.py
+++ b/examples/offline_inference/pooling/multi_vector_retrieval.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from argparse import Namespace
+
+from vllm import LLM, EngineArgs
+from vllm.utils import FlexibleArgumentParser
+
+
+def parse_args():
+    parser = FlexibleArgumentParser()
+    parser = EngineArgs.add_cli_args(parser)
+    # Set example specific arguments
+    parser.set_defaults(
+        model="BAAI/bge-m3",
+        runner="pooling",
+        enforce_eager=True,
+    )
+    return parser.parse_args()
+
+
+def main(args: Namespace):
+    # Sample prompts.
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+
+    # Create an LLM.
+    # You should pass runner="pooling" for embedding models
+    llm = LLM(**vars(args))
+
+    # Generate embedding. The output is a list of EmbeddingRequestOutputs.
+    outputs = llm.embed(prompts)
+
+    # Print the outputs.
+    print("\nGenerated Outputs:\n" + "-" * 60)
+    for prompt, output in zip(prompts, outputs):
+        embeds = output.outputs.embedding
+        print(len(embeds))
+
+    # Generate embedding for each token. The output is a list of PoolingRequestOutput.
+    outputs = llm.encode(prompts, pooling_task="token_embed")
+
+    # Print the outputs.
+    print("\nGenerated Outputs:\n" + "-" * 60)
+    for prompt, output in zip(prompts, outputs):
+        multi_vector = output.outputs.data
+        print(multi_vector.shape)
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
--- a/examples/offline_inference/prithvi_geospatial_mae_io_processor.py
+++ b/examples/offline_inference/prithvi_geospatial_mae_io_processor.py
@@ -40,7 +40,7 @@ def main():
        model_impl="terratorch",
    )

-    pooling_params = PoolingParams(task="encode", softmax=False)
+    pooling_params = PoolingParams(task="token_classify", activation=False)
    pooler_output = llm.encode(
        img_prompt,
        pooling_params=pooling_params,

--- a/examples/online_serving/pooling/README.md
+++ b/examples/online_serving/pooling/README.md
@@ -18,6 +18,12 @@ python examples/online_serving/pooling/embedding_embed_dtype_client.py
 python examples/online_serving/pooling/jinaai_rerank_client.py
 ```

+## Multi vector retrieval usage
+
+```bash
+python examples/online_serving/pooling/multi_vector_retrieval_client.py
+```
+
 ## Named Entity Recognition (NER) usage

 ```bash

--- a/examples/online_serving/pooling/multi_vector_retrieval_client.py
+++ b/examples/online_serving/pooling/multi_vector_retrieval_client.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""
+Example online usage of Pooling API for multi vector retrieval.
+
+Run `vllm serve <model> --runner pooling`
+to start up the server in vLLM. e.g.
+
+vllm serve BAAI/bge-m3
+"""
+
+import argparse
+
+import requests
+import torch
+
+
+def post_http_request(prompt: dict, api_url: str) -> requests.Response:
+    headers = {"User-Agent": "Test Client"}
+    response = requests.post(api_url, headers=headers, json=prompt)
+    return response
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--host", type=str, default="localhost")
+    parser.add_argument("--port", type=int, default=8000)
+    parser.add_argument("--model", type=str, default="BAAI/bge-m3")
+
+    return parser.parse_args()
+
+
+def main(args):
+    api_url = f"http://{args.host}:{args.port}/pooling"
+    model_name = args.model
+
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+    prompt = {"model": model_name, "input": prompts}
+
+    pooling_response = post_http_request(prompt=prompt, api_url=api_url)
+    for output in pooling_response.json()["data"]:
+        multi_vector = torch.tensor(output["data"])
+        print(multi_vector.shape)
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1011,8 +1011,12 @@ class VllmRunner:
        req_outputs = self.llm.embed(inputs, *args, **kwargs)
        return [req_output.outputs.embedding for req_output in req_outputs]

-    def encode(self, prompts: list[str]) -> list[list[float]]:
-        req_outputs = self.llm.encode(prompts)
+    def token_embed(self, prompts: list[str]) -> list[list[float]]:
+        req_outputs = self.llm.encode(prompts, pooling_task="token_embed")
+        return [req_output.outputs.data for req_output in req_outputs]
+
+    def token_classify(self, prompts: list[str]) -> list[list[float]]:
+        req_outputs = self.llm.encode(prompts, pooling_task="token_classify")
        return [req_output.outputs.data for req_output in req_outputs]

    def reward(self, prompts: list[str]) -> list[list[float]]:

--- a/tests/entrypoints/pooling/llm/test_classify.py
+++ b/tests/entrypoints/pooling/llm/test_classify.py
@@ -63,7 +63,7 @@ def test_encode_api(llm: LLM):
    # chunked prefill does not support all pooling
    err_msg = "pooling_task must be one of.+"
    with pytest.raises(ValueError, match=err_msg):
-        llm.encode(prompts, use_tqdm=False)
+        llm.encode(prompts, pooling_task="token_classify", use_tqdm=False)


 def test_score_api(llm: LLM):

--- a/tests/entrypoints/pooling/llm/test_embedding.py
+++ b/tests/entrypoints/pooling/llm/test_embedding.py
@@ -35,6 +35,13 @@ def llm():
    cleanup_dist_env_and_memory()


+@pytest.mark.skip_global_cleanup
+def test_encode_api(llm: LLM):
+    outputs = llm.encode(prompts, pooling_task="token_embed", use_tqdm=False)
+    multi_vector = outputs[0].outputs.data
+    assert multi_vector.shape == (11, 384)
+
+
 def test_pooling_params(llm: LLM):
    def get_outputs(normalize):
        outputs = llm.embed(

--- a/tests/entrypoints/pooling/llm/test_encode.py
+++ b/tests/entrypoints/pooling/llm/test_encode.py
@@ -57,20 +57,24 @@ def test_multiple_pooling_params(llm: LLM):
    ]

    # Multiple PoolingParams should be matched with each prompt
-    outputs = llm.encode(PROMPTS, pooling_params=pooling_params)
+    outputs = llm.encode(PROMPTS, pooling_params=pooling_params, pooling_task="embed")
    assert len(PROMPTS) == len(outputs)

    # Exception raised, if the size of params does not match the size of prompts
    with pytest.raises(ValueError):
-        outputs = llm.encode(PROMPTS, pooling_params=pooling_params[:3])
+        outputs = llm.encode(
+            PROMPTS, pooling_params=pooling_params[:3], pooling_task="embed"
+        )

    # Single PoolingParams should be applied to every prompt
    single_pooling_params = PoolingParams()
-    outputs = llm.encode(PROMPTS, pooling_params=single_pooling_params)
+    outputs = llm.encode(
+        PROMPTS, pooling_params=single_pooling_params, pooling_task="embed"
+    )
    assert len(PROMPTS) == len(outputs)

    # pooling_params is None, default params should be applied
-    outputs = llm.encode(PROMPTS, pooling_params=None)
+    outputs = llm.encode(PROMPTS, pooling_params=None, pooling_task="embed")
    assert len(PROMPTS) == len(outputs)



--- a/tests/entrypoints/pooling/llm/test_reward.py
+++ b/tests/entrypoints/pooling/llm/test_reward.py
@@ -36,22 +36,23 @@ def llm():
    cleanup_dist_env_and_memory()


-@pytest.mark.skip_global_cleanup
 def test_pooling_params(llm: LLM):
-    def get_outputs(softmax):
+    def get_outputs(activation):
        outputs = llm.reward(
-            prompts, pooling_params=PoolingParams(softmax=softmax), use_tqdm=False
+            prompts, pooling_params=PoolingParams(activation=activation), use_tqdm=False
        )
        return torch.cat([x.outputs.data for x in outputs])

-    default = get_outputs(softmax=None)
-    w_softmax = get_outputs(softmax=True)
-    wo_softmax = get_outputs(softmax=False)
+    default = get_outputs(activation=None)
+    w_activation = get_outputs(activation=True)
+    wo_activation = get_outputs(activation=False)

-    assert torch.allclose(default, w_softmax, atol=1e-2), "Default should use softmax."
-    assert not torch.allclose(w_softmax, wo_softmax, atol=1e-2), (
-        "wo_softmax should not use softmax."
+    assert torch.allclose(default, w_activation, atol=1e-2), (
+        "Default should use activation."
    )
-    assert torch.allclose(softmax(wo_softmax), w_softmax, atol=1e-2), (
-        "w_softmax should be close to softmax(wo_softmax)."
+    assert not torch.allclose(w_activation, wo_activation, atol=1e-2), (
+        "wo_activation should not use activation."
+    )
+    assert torch.allclose(softmax(wo_activation), w_activation, atol=1e-2), (
+        "w_activation should be close to activation(wo_activation)."
    )
--- a/tests/entrypoints/pooling/openai/test_embedding.py
+++ b/tests/entrypoints/pooling/openai/test_embedding.py
@@ -17,6 +17,7 @@ from tests.utils import RemoteOpenAIServer
 from vllm.entrypoints.openai.protocol import (
    EMBED_DTYPE_TO_TORCH_DTYPE,
    EmbeddingResponse,
+    PoolingResponse,
 )
 from vllm.transformers_utils.tokenizer import get_tokenizer

@@ -509,3 +510,20 @@ async def test_normalize(server: RemoteOpenAIServer, model_name: str):
    assert torch.allclose(w_normal, F.normalize(wo_normal, p=2, dim=-1), atol=1e-2), (
        "w_normal should be close to normal(wo_normal)."
    )
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_pooling(server: RemoteOpenAIServer, model_name: str):
+    input_text = ["The chef prepared a delicious meal."]
+
+    response = requests.post(
+        server.url_for("pooling"),
+        json={"model": model_name, "input": input_text, "encoding_format": "float"},
+    )
+
+    poolings = PoolingResponse.model_validate(response.json())
+
+    assert len(poolings.data) == 1
+    assert len(poolings.data[0].data) == 11
+    assert len(poolings.data[0].data[0]) == 384
--- a/tests/entrypoints/pooling/openai/test_rerank.py
+++ b/tests/entrypoints/pooling/openai/test_rerank.py
@@ -7,7 +7,7 @@ import torch
 import torch.nn.functional as F

 from tests.utils import RemoteOpenAIServer
-from vllm.entrypoints.openai.protocol import RerankResponse
+from vllm.entrypoints.openai.protocol import PoolingResponse, RerankResponse

 MODEL_NAME = "BAAI/bge-reranker-base"
 DTYPE = "bfloat16"
@@ -159,3 +159,20 @@ async def test_activation(server: RemoteOpenAIServer, model_name: str):
    assert torch.allclose(F.sigmoid(wo_activation), w_activation, atol=1e-2), (
        "w_activation should be close to activation(wo_activation)."
    )
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_pooling(server: RemoteOpenAIServer, model_name: str):
+    input_text = ["The chef prepared a delicious meal."]
+
+    response = requests.post(
+        server.url_for("pooling"),
+        json={"model": model_name, "input": input_text, "encoding_format": "float"},
+    )
+
+    poolings = PoolingResponse.model_validate(response.json())
+
+    assert len(poolings.data) == 1
+    assert len(poolings.data[0].data) == 11
+    assert len(poolings.data[0].data[0]) == 1
--- a/tests/models/language/pooling/test_multi_vector_retrieval.py
+++ b/tests/models/language/pooling/test_multi_vector_retrieval.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+import torch
+from transformers import AutoModel
+
+from tests.models.utils import check_embeddings_close
+
+
+@pytest.mark.parametrize(
+    "model",
+    ["BAAI/bge-m3"],
+)
+@pytest.mark.parametrize("dtype", ["half"])
+@torch.inference_mode
+def test_embed_models(hf_runner, vllm_runner, example_prompts, model: str, dtype: str):
+    with vllm_runner(
+        model,
+        runner="pooling",
+        max_model_len=None,
+    ) as vllm_model:
+        vllm_outputs = vllm_model.token_embed(example_prompts)
+
+    with hf_runner(
+        model,
+        auto_cls=AutoModel,
+    ) as hf_model:
+        tokenizer = hf_model.tokenizer
+        hf_outputs = []
+        for prompt in example_prompts:
+            inputs = tokenizer([prompt], return_tensors="pt")
+            inputs = hf_model.wrap_device(inputs)
+            output = hf_model.model(**inputs)
+            embedding = output.last_hidden_state[0].float()
+            # normal
+            hf_outputs.append(embedding.cpu())
+
+    for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
+        check_embeddings_close(
+            embeddings_0_lst=hf_output,
+            embeddings_1_lst=vllm_output,
+            name_0="hf",
+            name_1="vllm",
+            tol=1e-2,
+        )
--- a/tests/models/language/pooling/test_pooler_config_init_behaviour.py
+++ b/tests/models/language/pooling/test_pooler_config_init_behaviour.py
@@ -93,7 +93,7 @@ def test_embed_models_using_normalize(
    ],
 )
 @pytest.mark.parametrize("dtype", ["half"])
-def test_reward_models_using_softmax(
+def test_reward_models_using_activation(
    hf_runner,
    vllm_runner,
    example_prompts,
@@ -104,22 +104,64 @@ def test_reward_models_using_softmax(
        model,
        max_model_len=1024,
        dtype=dtype,
-        pooler_config=PoolerConfig(softmax=False),
+        pooler_config=PoolerConfig(activation=False),
    ) as vllm_model:
-        wo_softmax = vllm_model.encode(example_prompts)
+        wo_activation = vllm_model.reward(example_prompts)

    with vllm_runner(
-        model, max_model_len=1024, dtype=dtype, pooler_config=PoolerConfig(softmax=True)
+        model,
+        max_model_len=1024,
+        dtype=dtype,
+        pooler_config=PoolerConfig(activation=True),
    ) as vllm_model:
-        w_softmax = vllm_model.encode(example_prompts)
+        w_activation = vllm_model.reward(example_prompts)

-    for wo, w in zip(wo_softmax, w_softmax):
+    for wo, w in zip(wo_activation, w_activation):
        wo = torch.tensor(wo)
        w = torch.tensor(w)

        assert not torch.allclose(wo, w, atol=1e-2), (
-            "pooler_config softmax is not working"
+            "pooler_config activation is not working"
        )
        assert torch.allclose(softmax(wo), w, atol=1e-2), (
-            "w_softmax should be close to softmax(wo_softmax)."
+            "w_activation should be close to activation(wo_activation)."
+        )
+
+
+@pytest.mark.parametrize(
+    "model",
+    [
+        "intfloat/multilingual-e5-small",
+    ],
+)
+@pytest.mark.parametrize("dtype", ["half"])
+def test_multi_vector_retrieval_models_using_normalize(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+) -> None:
+    with vllm_runner(
+        model,
+        max_model_len=512,
+        dtype=dtype,
+        pooler_config=PoolerConfig(normalize=False),
+    ) as vllm_model:
+        wo_normalize = vllm_model.token_embed(example_prompts)
+
+    with vllm_runner(
+        model,
+        max_model_len=512,
+        dtype=dtype,
+        pooler_config=PoolerConfig(normalize=True),
+    ) as vllm_model:
+        w_normalize = vllm_model.token_embed(example_prompts)
+
+    for wo, w in zip(wo_normalize, w_normalize):
+        assert not torch.allclose(wo, w, atol=1e-2), (
+            "pooler_config normalize is not working"
+        )
+        assert torch.allclose(F.normalize(wo, p=2, dim=-1), w, atol=1e-2), (
+            "w_normal should be close to normal(wo_normal)."
        )
--- a/tests/models/language/pooling/test_token_classification.py
+++ b/tests/models/language/pooling/test_token_classification.py
@@ -19,7 +19,7 @@ def test_bert_models(
    dtype: str,
 ) -> None:
    with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model:
-        vllm_outputs = vllm_model.encode(example_prompts)
+        vllm_outputs = vllm_model.token_classify(example_prompts)

    with hf_runner(
        model, dtype=dtype, auto_cls=AutoModelForTokenClassification
@@ -50,7 +50,7 @@ def test_modernbert_models(
    dtype: str,
 ) -> None:
    with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model:
-        vllm_outputs = vllm_model.encode(example_prompts)
+        vllm_outputs = vllm_model.token_classify(example_prompts)

    with hf_runner(
        model, dtype=dtype, auto_cls=AutoModelForTokenClassification

--- a/tests/models/multimodal/pooling/test_prithvi_mae.py
+++ b/tests/models/multimodal/pooling/test_prithvi_mae.py
@@ -39,7 +39,7 @@ def _run_test(
        max_num_seqs=32,
        default_torch_num_threads=1,
    ) as vllm_model:
-        vllm_model.encode(prompt)
+        vllm_model.llm.encode(prompt, pooling_task="token_classify")


 MODELS = ["mgazz/Prithvi-EO-2.0-300M-TL-Sen1Floods11"]

--- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
+++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
@@ -30,7 +30,7 @@ class MyGemma2Embedding(nn.Module):

        self.pooler = DispatchPooler(
            {
-                "encode": Pooler.for_encode(pooler_config),
+                "token_embed": Pooler.for_token_embed(pooler_config),
                "embed": Pooler.for_embed(pooler_config),
            }
        )

--- a/tests/plugins_tests/test_io_processor_plugins.py
+++ b/tests/plugins_tests/test_io_processor_plugins.py
@@ -93,7 +93,7 @@ def test_prithvi_mae_plugin_offline(vllm_runner, model_name: str):
        out_data_format="b64_json",
    )

-    pooling_params = PoolingParams(task="encode", softmax=False)
+    pooling_params = PoolingParams(activation=False)

    with vllm_runner(
        model_name,
@@ -108,8 +108,7 @@ def test_prithvi_mae_plugin_offline(vllm_runner, model_name: str):
        io_processor_plugin="prithvi_to_tiff",
    ) as llm_runner:
        pooler_output = llm_runner.get_llm().encode(
-            img_prompt,
-            pooling_params=pooling_params,
+            img_prompt, pooling_params=pooling_params, pooling_task="token_classify"
        )
    output = pooler_output[0].outputs


--- a/tests/test_pooling_params.py
+++ b/tests/test_pooling_params.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from dataclasses import dataclass
+
 import pytest

 from tests.models.utils import EmbedModelInfo
 from vllm import PoolingParams
-from vllm.config import ModelConfig
+from vllm.config import ModelConfig, PoolerConfig

 EMBEDDING_MODELS = [
    EmbedModelInfo("intfloat/multilingual-e5-small", is_matryoshka=False),
@@ -15,6 +17,15 @@ EMBEDDING_MODELS = [
    ),
 ]

+classify_parameters = ["activation"]
+embed_parameters = ["dimensions", "normalize"]
+step_pooling_parameters = ["step_tag_id", "returned_token_ids"]
+
+
+@dataclass()
+class MockModelConfig:
+    pooler_config: PoolerConfig
+

 def test_task():
    pooling_params = PoolingParams()
@@ -24,25 +35,27 @@ def test_task():
    pooling_params.verify(task="score")

    with pytest.raises(ValueError):
-        pooling_params.verify(task="encode")
+        pooling_params.verify(task="classify")


 def test_embed():
    task = "embed"
+    model_config = MockModelConfig(pooler_config=PoolerConfig(pooling_type="CLS"))
+
    pooling_params = PoolingParams(normalize=None)
-    pooling_params.verify(task=task)
+    pooling_params.verify(task=task, model_config=model_config)

    pooling_params = PoolingParams(normalize=True)
-    pooling_params.verify(task=task)
+    pooling_params.verify(task=task, model_config=model_config)

    pooling_params = PoolingParams(normalize=False)
-    pooling_params.verify(task=task)
+    pooling_params.verify(task=task, model_config=model_config)

-    invalid_parameters = ["activation", "softmax"]
+    invalid_parameters = classify_parameters + step_pooling_parameters
    for p in invalid_parameters:
        with pytest.raises(ValueError):
            pooling_params = PoolingParams(**{p: True})
-            pooling_params.verify(task=task)
+            pooling_params.verify(task=task, model_config=model_config)


 @pytest.mark.parametrize("model_info", EMBEDDING_MODELS)
@@ -73,35 +86,71 @@ def test_embed_dimensions(model_info: EmbedModelInfo):

 @pytest.mark.parametrize("task", ["score", "classify"])
 def test_classify(task):
+    model_config = MockModelConfig(pooler_config=PoolerConfig(pooling_type="CLS"))
+
    pooling_params = PoolingParams(activation=None)
-    pooling_params.verify(task=task)
+    pooling_params.verify(task=task, model_config=model_config)

    pooling_params = PoolingParams(activation=True)
-    pooling_params.verify(task=task)
+    pooling_params.verify(task=task, model_config=model_config)

    pooling_params = PoolingParams(activation=False)
-    pooling_params.verify(task=task)
+    pooling_params.verify(task=task, model_config=model_config)
+
+    invalid_parameters = embed_parameters + step_pooling_parameters
+    for p in invalid_parameters:
+        with pytest.raises(ValueError):
+            pooling_params = PoolingParams(**{p: True})
+            pooling_params.verify(task=task, model_config=model_config)
+
+
+@pytest.mark.parametrize("pooling_type", ["ALL", "STEP"])
+def test_token_embed(pooling_type: str):
+    task = "token_embed"
+    model_config = MockModelConfig(
+        pooler_config=PoolerConfig(pooling_type=pooling_type)
+    )
+
+    pooling_params = PoolingParams(normalize=None)
+    pooling_params.verify(task=task, model_config=model_config)
+
+    pooling_params = PoolingParams(normalize=True)
+    pooling_params.verify(task=task, model_config=model_config)
+
+    pooling_params = PoolingParams(normalize=False)
+    pooling_params.verify(task=task, model_config=model_config)
+
+    invalid_parameters = classify_parameters
+    if pooling_type != "STEP":
+        invalid_parameters = classify_parameters + step_pooling_parameters

-    invalid_parameters = ["dimensions", "normalize", "softmax"]
    for p in invalid_parameters:
        with pytest.raises(ValueError):
            pooling_params = PoolingParams(**{p: True})
-            pooling_params.verify(task=task)
+            pooling_params.verify(task=task, model_config=model_config)


-def test_encode():
-    task = "encode"
-    pooling_params = PoolingParams(softmax=None)
-    pooling_params.verify(task=task)
+@pytest.mark.parametrize("pooling_type", ["ALL", "STEP"])
+def test_token_classify(pooling_type: str):
+    task = "token_classify"
+    model_config = MockModelConfig(
+        pooler_config=PoolerConfig(pooling_type=pooling_type)
+    )
+
+    pooling_params = PoolingParams(activation=None)
+    pooling_params.verify(task=task, model_config=model_config)
+
+    pooling_params = PoolingParams(activation=True)
+    pooling_params.verify(task=task, model_config=model_config)

-    pooling_params = PoolingParams(softmax=True)
-    pooling_params.verify(task=task)
+    pooling_params = PoolingParams(activation=False)
+    pooling_params.verify(task=task, model_config=model_config)

-    pooling_params = PoolingParams(softmax=False)
-    pooling_params.verify(task=task)
+    invalid_parameters = embed_parameters
+    if pooling_type != "STEP":
+        invalid_parameters = embed_parameters + step_pooling_parameters

-    invalid_parameters = ["dimensions", "normalize", "activation"]
    for p in invalid_parameters:
        with pytest.raises(ValueError):
            pooling_params = PoolingParams(**{p: True})
-            pooling_params.verify(task=task)
+            pooling_params.verify(task=task, model_config=model_config)
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -951,7 +951,7 @@ class LLM:
        truncate_prompt_tokens: int | None = None,
        use_tqdm: bool | Callable[..., tqdm] = True,
        lora_request: list[LoRARequest] | LoRARequest | None = None,
-        pooling_task: PoolingTask = "encode",
+        pooling_task: PoolingTask | None = None,
        tokenization_kwargs: dict[str, Any] | None = None,
    ) -> list[PoolingRequestOutput]:
        """Apply pooling to the hidden states corresponding to the input
@@ -986,26 +986,25 @@ class LLM:
            instead pass them via the `inputs` parameter.
        """

-        if self.supported_tasks == ["encode"] and pooling_task is None:
-            pooling_task = "encode"
-
-        if pooling_task is None:
-            pooling_task = "embed" if "embed" in self.supported_tasks else "encode"
-
-            logger.warning_once(
-                "`LLM.encode` is currently using `pooling_task = %s`.\n"
+        error_str = (
+            "pooling_task required for `LLM.encode`\n"
            "Please use one of the more specific methods or set the "
-                "task directly when using `LLM.encode`:\n"
+            "pooling_task when using `LLM.encode`:\n"
            "  - For embeddings, use `LLM.embed(...)` "
            'or `pooling_task="embed"`.\n'
            "  - For classification logits, use `LLM.classify(...)` "
            'or `pooling_task="classify"`.\n'
+            "  - For similarity scores, use `LLM.score(...)`.\n"
            "  - For rewards, use `LLM.reward(...)` "
-                'or `pooling_task="reward"`\n'
-                "  - For similarity scores, use `LLM.score(...)`.",
-                pooling_task,
+            'or `pooling_task="token_classify"`\n'
+            "  - For token classification, "
+            'use `pooling_task="token_classify"`\n'
+            '  - For multi-vector retrieval, use `pooling_task="token_embed"`'
        )

+        if pooling_task is None:
+            raise ValueError(error_str)
+
        model_config = self.model_config
        runner_type = model_config.runner_type
        if runner_type != "pooling":
@@ -1206,7 +1205,7 @@ class LLM:
            lora_request=lora_request,
            pooling_params=pooling_params,
            truncate_prompt_tokens=truncate_prompt_tokens,
-            pooling_task="encode",
+            pooling_task="token_classify",
        )

    def _embedding_score(