Unverified commit fff3711a, authored by wang.yuqi, committed by GitHub
Browse files

[Frontend][2/n] Improve pooling entrypoints | embed. (#36110)


Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
Signed-off-by: wang.yuqi <noooop@126.com>
parent c4d859c2
...@@ -25,7 +25,7 @@ ROCM_ATTN_BACKENDS = [ ...@@ -25,7 +25,7 @@ ROCM_ATTN_BACKENDS = [
"FLEX_ATTENTION", "FLEX_ATTENTION",
] ]
ATTN_BACKENDS = ROCM_ATTN_BACKENDS if current_platform.is_rocm() else [] ATTN_BACKENDS = ROCM_ATTN_BACKENDS if current_platform.is_rocm() else ["auto"]
# Per-backend tolerance with explicit entries; "default" is the fallback # Per-backend tolerance with explicit entries; "default" is the fallback
BACKEND_TOL: dict[str, float] = { BACKEND_TOL: dict[str, float] = {
...@@ -105,13 +105,16 @@ def server(request): ...@@ -105,13 +105,16 @@ def server(request):
"8192", "8192",
"--chat-template", "--chat-template",
str(VLLM_PATH / "examples/pooling/score/template/qwen3_vl_reranker.jinja"), str(VLLM_PATH / "examples/pooling/score/template/qwen3_vl_reranker.jinja"),
"--attention-config", ]
json.dumps({"backend": backend}),
] + ROCM_EXTRA_ARGS
env = dict(ROCM_ENV_OVERRIDES) env = dict()
if backend != "ROCM_AITER_FA": if backend != "auto":
env["VLLM_ROCM_USE_AITER"] = "0" args += ["--attention-config", json.dumps({"backend": backend})]
args += ROCM_EXTRA_ARGS
env = dict(ROCM_ENV_OVERRIDES)
if backend != "ROCM_AITER_FA":
env["VLLM_ROCM_USE_AITER"] = "0"
with RemoteOpenAIServer( with RemoteOpenAIServer(
MODEL_NAME, args, override_hf_configs=HF_OVERRIDES, env_dict=env MODEL_NAME, args, override_hf_configs=HF_OVERRIDES, env_dict=env
......
...@@ -397,7 +397,7 @@ class LLM: ...@@ -397,7 +397,7 @@ class LLM:
self.io_processor = self.llm_engine.io_processor self.io_processor = self.llm_engine.io_processor
self.input_processor = self.llm_engine.input_processor self.input_processor = self.llm_engine.input_processor
self.chat_template_config = ChatTemplateConfig(chat_template=self.chat_template) self.chat_template_config = ChatTemplateConfig(chat_template=self.chat_template)
self.init_pooling_io_processors = init_pooling_io_processors( self.pooling_io_processors = init_pooling_io_processors(
supported_tasks=supported_tasks, supported_tasks=supported_tasks,
model_config=self.model_config, model_config=self.model_config,
renderer=self.renderer, renderer=self.renderer,
...@@ -1174,8 +1174,8 @@ class LLM: ...@@ -1174,8 +1174,8 @@ class LLM:
) )
raise ValueError(msg) raise ValueError(msg)
if pooling_task in self.init_pooling_io_processors: if pooling_task in self.pooling_io_processors:
io_processor = self.init_pooling_io_processors[pooling_task] io_processor = self.pooling_io_processors[pooling_task]
processor_inputs = io_processor.pre_process_offline( processor_inputs = io_processor.pre_process_offline(
prompts_seq, tokenization_kwargs prompts_seq, tokenization_kwargs
) )
...@@ -1194,7 +1194,7 @@ class LLM: ...@@ -1194,7 +1194,7 @@ class LLM:
outputs = self._run_engine( outputs = self._run_engine(
use_tqdm=use_tqdm, output_type=PoolingRequestOutput use_tqdm=use_tqdm, output_type=PoolingRequestOutput
) )
outputs = io_processor.post_process(outputs) outputs = io_processor.post_process_offline(outputs)
else: else:
outputs = self._run_completion( outputs = self._run_completion(
prompts=prompts_seq, prompts=prompts_seq,
......
...@@ -60,12 +60,6 @@ from vllm.entrypoints.openai.speech_to_text.protocol import ( ...@@ -60,12 +60,6 @@ from vllm.entrypoints.openai.speech_to_text.protocol import (
TranscriptionResponse, TranscriptionResponse,
TranslationRequest, TranslationRequest,
) )
from vllm.entrypoints.pooling.embed.protocol import (
EmbeddingBytesResponse,
EmbeddingChatRequest,
EmbeddingCompletionRequest,
EmbeddingResponse,
)
from vllm.entrypoints.pooling.pooling.protocol import ( from vllm.entrypoints.pooling.pooling.protocol import (
IOProcessorRequest, IOProcessorRequest,
PoolingChatRequest, PoolingChatRequest,
...@@ -144,17 +138,13 @@ CompletionLikeRequest: TypeAlias = ( ...@@ -144,17 +138,13 @@ CompletionLikeRequest: TypeAlias = (
CompletionRequest CompletionRequest
| TokenizeCompletionRequest | TokenizeCompletionRequest
| DetokenizeRequest | DetokenizeRequest
| EmbeddingCompletionRequest
| RerankRequest | RerankRequest
| ScoreRequest | ScoreRequest
| PoolingCompletionRequest | PoolingCompletionRequest
) )
ChatLikeRequest: TypeAlias = ( ChatLikeRequest: TypeAlias = (
ChatCompletionRequest ChatCompletionRequest | TokenizeChatRequest | PoolingChatRequest
| TokenizeChatRequest
| EmbeddingChatRequest
| PoolingChatRequest
) )
SpeechToTextRequest: TypeAlias = TranscriptionRequest | TranslationRequest SpeechToTextRequest: TypeAlias = TranscriptionRequest | TranslationRequest
...@@ -171,8 +161,6 @@ AnyRequest: TypeAlias = ( ...@@ -171,8 +161,6 @@ AnyRequest: TypeAlias = (
AnyResponse: TypeAlias = ( AnyResponse: TypeAlias = (
CompletionResponse CompletionResponse
| ChatCompletionResponse | ChatCompletionResponse
| EmbeddingResponse
| EmbeddingBytesResponse
| TranscriptionResponse | TranscriptionResponse
| TokenizeResponse | TokenizeResponse
| PoolingResponse | PoolingResponse
...@@ -203,8 +191,7 @@ class ServeContext(Generic[RequestT]): ...@@ -203,8 +191,7 @@ class ServeContext(Generic[RequestT]):
class OpenAIServing: class OpenAIServing:
request_id_prefix: ClassVar[str] = """ request_id_prefix: ClassVar[str] = """
A short string prepended to every request’s ID (e.g. "embd") A short string prepended to every request’s ID.
so you can easily tell “this ID came from Embedding.”
""" """
def __init__( def __init__(
...@@ -432,8 +419,7 @@ class OpenAIServing: ...@@ -432,8 +419,7 @@ class OpenAIServing:
ctx: ServeContext, ctx: ServeContext,
) -> ErrorResponse | None: ) -> ErrorResponse | None:
""" """
Default preprocessing hook. Subclasses may override Default preprocessing hook. Subclasses may override to prepare `ctx`.
to prepare `ctx` (embedding, etc.).
""" """
return None return None
...@@ -730,13 +716,10 @@ class OpenAIServing: ...@@ -730,13 +716,10 @@ class OpenAIServing:
token_num = len(input_ids) token_num = len(input_ids)
max_model_len = self.model_config.max_model_len max_model_len = self.model_config.max_model_len
# Note: EmbeddingRequest, # Note: ScoreRequest doesn't have max_tokens
# and ScoreRequest doesn't have max_tokens
if isinstance( if isinstance(
request, request,
( (
EmbeddingChatRequest,
EmbeddingCompletionRequest,
ScoreDataRequest, ScoreDataRequest,
ScoreTextRequest, ScoreTextRequest,
ScoreQueriesDocumentsRequest, ScoreQueriesDocumentsRequest,
......
...@@ -53,6 +53,7 @@ from vllm.entrypoints.pooling.score.protocol import ( ...@@ -53,6 +53,7 @@ from vllm.entrypoints.pooling.score.protocol import (
ScoreRequest, ScoreRequest,
ScoreResponse, ScoreResponse,
) )
from vllm.entrypoints.utils import create_error_response
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.reasoning import ReasoningParserManager from vllm.reasoning import ReasoningParserManager
from vllm.utils import random_uuid from vllm.utils import random_uuid
...@@ -503,7 +504,10 @@ async def run_request( ...@@ -503,7 +504,10 @@ async def run_request(
request: BatchRequestInput, request: BatchRequestInput,
tracker: BatchProgressTracker, tracker: BatchProgressTracker,
) -> BatchRequestOutput: ) -> BatchRequestOutput:
response = await serving_engine_func(request.body) try:
response = await serving_engine_func(request.body)
except Exception as e:
response = create_error_response(e)
if isinstance( if isinstance(
response, response,
...@@ -678,10 +682,10 @@ async def build_endpoint_registry( ...@@ -678,10 +682,10 @@ async def build_endpoint_registry(
# Get serving objects from state (defaulting to None if not set) # Get serving objects from state (defaulting to None if not set)
openai_serving_chat = getattr(state, "openai_serving_chat", None) openai_serving_chat = getattr(state, "openai_serving_chat", None)
openai_serving_embedding = getattr(state, "openai_serving_embedding", None)
openai_serving_scores = getattr(state, "openai_serving_scores", None)
openai_serving_transcription = getattr(state, "openai_serving_transcription", None) openai_serving_transcription = getattr(state, "openai_serving_transcription", None)
openai_serving_translation = getattr(state, "openai_serving_translation", None) openai_serving_translation = getattr(state, "openai_serving_translation", None)
serving_embedding = getattr(state, "serving_embedding", None)
serving_scores = getattr(state, "serving_scores", None)
# Registry of endpoint configurations # Registry of endpoint configurations
endpoint_registry: dict[str, dict[str, Any]] = { endpoint_registry: dict[str, dict[str, Any]] = {
...@@ -697,27 +701,21 @@ async def build_endpoint_registry( ...@@ -697,27 +701,21 @@ async def build_endpoint_registry(
"embeddings": { "embeddings": {
"url_matcher": lambda url: url == "/v1/embeddings", "url_matcher": lambda url: url == "/v1/embeddings",
"handler_getter": lambda: ( "handler_getter": lambda: (
openai_serving_embedding.create_embedding serving_embedding if serving_embedding is not None else None
if openai_serving_embedding is not None
else None
), ),
"wrapper_fn": None, "wrapper_fn": None,
}, },
"score": { "score": {
"url_matcher": lambda url: url.endswith("/score"), "url_matcher": lambda url: url.endswith("/score"),
"handler_getter": lambda: ( "handler_getter": lambda: (
openai_serving_scores.create_score serving_scores.create_score if serving_scores is not None else None
if openai_serving_scores is not None
else None
), ),
"wrapper_fn": None, "wrapper_fn": None,
}, },
"rerank": { "rerank": {
"url_matcher": lambda url: url.endswith("/rerank"), "url_matcher": lambda url: url.endswith("/rerank"),
"handler_getter": lambda: ( "handler_getter": lambda: (
openai_serving_scores.do_rerank serving_scores.do_rerank if serving_scores is not None else None
if openai_serving_scores is not None
else None
), ),
"wrapper_fn": None, "wrapper_fn": None,
}, },
......
...@@ -56,14 +56,14 @@ def init_pooling_state( ...@@ -56,14 +56,14 @@ def init_pooling_state(
): ):
from vllm.entrypoints.chat_utils import load_chat_template from vllm.entrypoints.chat_utils import load_chat_template
from vllm.entrypoints.pooling.classify.serving import ServingClassification from vllm.entrypoints.pooling.classify.serving import ServingClassification
from vllm.entrypoints.pooling.embed.serving import OpenAIServingEmbedding from vllm.entrypoints.pooling.embed.serving import ServingEmbedding
from vllm.entrypoints.pooling.pooling.serving import OpenAIServingPooling from vllm.entrypoints.pooling.pooling.serving import OpenAIServingPooling
from vllm.entrypoints.pooling.score.serving import ServingScores from vllm.entrypoints.pooling.score.serving import ServingScores
from vllm.tasks import POOLING_TASKS from vllm.tasks import POOLING_TASKS
resolved_chat_template = load_chat_template(args.chat_template) resolved_chat_template = load_chat_template(args.chat_template)
state.openai_serving_pooling = ( state.serving_pooling = (
( (
OpenAIServingPooling( OpenAIServingPooling(
engine_client, engine_client,
...@@ -77,8 +77,8 @@ def init_pooling_state( ...@@ -77,8 +77,8 @@ def init_pooling_state(
if any(t in supported_tasks for t in POOLING_TASKS) if any(t in supported_tasks for t in POOLING_TASKS)
else None else None
) )
state.openai_serving_embedding = ( state.serving_embedding = (
OpenAIServingEmbedding( ServingEmbedding(
engine_client, engine_client,
state.openai_serving_models, state.openai_serving_models,
request_logger=request_logger, request_logger=request_logger,
...@@ -89,7 +89,7 @@ def init_pooling_state( ...@@ -89,7 +89,7 @@ def init_pooling_state(
if "embed" in supported_tasks if "embed" in supported_tasks
else None else None
) )
state.openai_serving_classification = ( state.serving_classification = (
ServingClassification( ServingClassification(
engine_client, engine_client,
state.openai_serving_models, state.openai_serving_models,
...@@ -105,7 +105,7 @@ def init_pooling_state( ...@@ -105,7 +105,7 @@ def init_pooling_state(
# - "score" task (cross-encoder models) # - "score" task (cross-encoder models)
# - "embed" task (bi-encoder models) # - "embed" task (bi-encoder models)
# - "token_embed" task (late interaction models like ColBERT) # - "token_embed" task (late interaction models like ColBERT)
state.openai_serving_scores = ( state.serving_scores = (
ServingScores( ServingScores(
engine_client, engine_client,
state.openai_serving_models, state.openai_serving_models,
......
...@@ -2,7 +2,6 @@ ...@@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Callable, Sequence from collections.abc import Callable, Sequence
from concurrent.futures import ThreadPoolExecutor
from typing import Any, Final from typing import Any, Final
from vllm import PoolingRequestOutput, PromptType from vllm import PoolingRequestOutput, PromptType
...@@ -14,9 +13,13 @@ from vllm.entrypoints.chat_utils import ( ...@@ -14,9 +13,13 @@ from vllm.entrypoints.chat_utils import (
ConversationMessage, ConversationMessage,
) )
from vllm.entrypoints.openai.engine.serving import RendererChatRequest, RendererRequest from vllm.entrypoints.openai.engine.serving import RendererChatRequest, RendererRequest
from vllm.inputs import ProcessorInputs, SingletonPrompt from vllm.entrypoints.pooling.typing import (
PoolingChatLikeRequest,
PoolingCompletionLikeRequest,
PoolingServeContext,
)
from vllm.inputs.data import ProcessorInputs, SingletonPrompt
from vllm.renderers import BaseRenderer, merge_kwargs from vllm.renderers import BaseRenderer, merge_kwargs
from vllm.renderers.inputs import TokPrompt
from vllm.renderers.inputs.preprocess import parse_model_prompt, prompt_to_seq from vllm.renderers.inputs.preprocess import parse_model_prompt, prompt_to_seq
from vllm.tokenizers import TokenizerLike from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers import ToolParser from vllm.tool_parsers import ToolParser
...@@ -24,14 +27,14 @@ from vllm.utils.mistral import is_mistral_tokenizer ...@@ -24,14 +27,14 @@ from vllm.utils.mistral import is_mistral_tokenizer
class PoolingIOProcessor: class PoolingIOProcessor:
name: str
def __init__( def __init__(
self, self,
model_config: ModelConfig, model_config: ModelConfig,
renderer: BaseRenderer, renderer: BaseRenderer,
chat_template_config: ChatTemplateConfig, chat_template_config: ChatTemplateConfig,
): ):
self._tokenizer_executor = ThreadPoolExecutor(max_workers=1)
self.model_config = model_config self.model_config = model_config
self.renderer = renderer self.renderer = renderer
...@@ -43,37 +46,90 @@ class PoolingIOProcessor: ...@@ -43,37 +46,90 @@ class PoolingIOProcessor:
chat_template_config.trust_request_chat_template chat_template_config.trust_request_chat_template
) )
def pre_process_online(self, *args, **kwargs): def create_pooling_params(self, request):
raise NotImplementedError return request.to_pooling_params()
#######################################
# online APIs
def pre_process_online(self, ctx: PoolingServeContext):
request = ctx.request
if isinstance(ctx.request, PoolingChatLikeRequest):
self._validate_chat_template(
request_chat_template=request.chat_template,
chat_template_kwargs=request.chat_template_kwargs,
trust_request_chat_template=self.trust_request_chat_template,
)
_, engine_prompts = self._preprocess_chat_online(
request,
request.messages,
default_template=self.chat_template,
default_template_content_format=self.chat_template_content_format,
default_template_kwargs=None,
)
elif isinstance(request, PoolingCompletionLikeRequest):
engine_prompts = self._preprocess_completion_online(
request,
prompt_input=request.input,
prompt_embeds=None,
)
else:
raise ValueError(f"Invalid {self.name} request type")
ctx.engine_prompts = engine_prompts
async def pre_process_online_async(self, ctx: PoolingServeContext):
self.pre_process_online(ctx)
def post_process_online(
self,
ctx: PoolingServeContext,
):
pass
async def pre_process_online_async(self, *args, **kwargs): async def post_process_online_async(
return self.pre_process_online(*args, **kwargs) self,
ctx: PoolingServeContext,
):
self.post_process_online(ctx)
def pre_process_offline(self, *args, **kwargs): #######################################
raise NotImplementedError # offline APIs
def pre_process_offline(
self,
prompts: PromptType | Sequence[PromptType],
tokenization_kwargs: dict[str, Any] | None = None,
) -> Sequence[ProcessorInputs]:
return self._preprocess_completion_offline(
prompts=prompts, tokenization_kwargs=tokenization_kwargs
)
async def pre_process_offline_async(self, *args, **kwargs): async def pre_process_offline_async(self, *args, **kwargs):
return self.pre_process_offline(*args, **kwargs) return self.pre_process_offline(*args, **kwargs)
def post_process( def post_process_offline(
self, outputs: list[PoolingRequestOutput] self,
outputs: list[PoolingRequestOutput],
) -> list[PoolingRequestOutput]: ) -> list[PoolingRequestOutput]:
return outputs return outputs
async def post_process_async( async def post_process_offline_async(
self, outputs: list[PoolingRequestOutput] self,
outputs: list[PoolingRequestOutput],
) -> list[PoolingRequestOutput]: ) -> list[PoolingRequestOutput]:
return self.post_process(outputs) return self.post_process_offline(outputs)
def create_pooling_params(self, request): #######################################
return request.to_pooling_params() # helpers
def _preprocess_completion_online( def _preprocess_completion_online(
self, self,
request: RendererRequest, request: RendererRequest,
prompt_input: str | list[str] | list[int] | list[list[int]] | None, prompt_input: str | list[str] | list[int] | list[list[int]] | None,
prompt_embeds: bytes | list[bytes] | None, prompt_embeds: bytes | list[bytes] | None,
) -> list[TokPrompt]: ) -> list[ProcessorInputs]:
renderer = self.renderer renderer = self.renderer
model_config = self.model_config model_config = self.model_config
...@@ -112,7 +168,7 @@ class PoolingIOProcessor: ...@@ -112,7 +168,7 @@ class PoolingIOProcessor:
default_template_kwargs: dict[str, Any] | None, default_template_kwargs: dict[str, Any] | None,
tool_dicts: list[dict[str, Any]] | None = None, tool_dicts: list[dict[str, Any]] | None = None,
tool_parser: Callable[[TokenizerLike], ToolParser] | None = None, tool_parser: Callable[[TokenizerLike], ToolParser] | None = None,
) -> tuple[list[ConversationMessage], list[TokPrompt]]: ) -> tuple[list[ConversationMessage], list[ProcessorInputs]]:
renderer = self.renderer renderer = self.renderer
default_template_kwargs = merge_kwargs( default_template_kwargs = merge_kwargs(
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import time
from collections.abc import AsyncGenerator, Mapping from collections.abc import AsyncGenerator, Mapping
from dataclasses import dataclass, field
from http import HTTPStatus from http import HTTPStatus
from typing import ClassVar, Generic, TypeVar from typing import ClassVar
from fastapi import Request from fastapi import Request
from pydantic import ConfigDict from fastapi.responses import Response
from starlette.datastructures import Headers from starlette.datastructures import Headers
from starlette.responses import JSONResponse
from vllm import PoolingParams, PoolingRequestOutput, envs
from vllm import (
PoolingParams,
PoolingRequestOutput,
PromptType,
SamplingParams,
envs,
)
from vllm.config import ModelConfig from vllm.config import ModelConfig
from vllm.engine.protocol import EngineClient from vllm.engine.protocol import EngineClient
from vllm.entrypoints.chat_utils import ( from vllm.entrypoints.chat_utils import (
...@@ -27,12 +18,12 @@ from vllm.entrypoints.chat_utils import ( ...@@ -27,12 +18,12 @@ from vllm.entrypoints.chat_utils import (
from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.engine.protocol import ErrorResponse from vllm.entrypoints.openai.engine.protocol import ErrorResponse
from vllm.entrypoints.openai.models.serving import OpenAIServingModels from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.pooling.typing import AnyPoolingRequest, AnyPoolingResponse from vllm.entrypoints.pooling.typing import AnyPoolingRequest, PoolingServeContext
from vllm.inputs import ProcessorInputs from vllm.exceptions import VLLMNotFoundError
from vllm.inputs.data import ProcessorInputs
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.renderers import BaseRenderer from vllm.renderers.base import BaseRenderer
from vllm.renderers.inputs.preprocess import extract_prompt_components from vllm.renderers.inputs.preprocess import extract_prompt_components
from vllm.sampling_params import BeamSearchParams
from vllm.tracing import ( from vllm.tracing import (
contains_trace_headers, contains_trace_headers,
extract_trace_headers, extract_trace_headers,
...@@ -43,26 +34,6 @@ from vllm.utils.async_utils import merge_async_iterators ...@@ -43,26 +34,6 @@ from vllm.utils.async_utils import merge_async_iterators
from .io_processor import PoolingIOProcessor from .io_processor import PoolingIOProcessor
PoolingRequestT = TypeVar("PoolingRequestT", bound=AnyPoolingRequest)
@dataclass(kw_only=True)
class PoolingServeContext(Generic[PoolingRequestT]):
request: PoolingRequestT
raw_request: Request | None = None
model_name: str
request_id: str
created_time: int = field(default_factory=lambda: int(time.time()))
lora_request: LoRARequest | None = None
engine_prompts: list[ProcessorInputs] | None = None
result_generator: AsyncGenerator[tuple[int, PoolingRequestOutput], None] | None = (
None
)
final_res_batch: list[PoolingRequestOutput] = field(default_factory=list)
model_config = ConfigDict(arbitrary_types_allowed=True)
class PoolingServing: class PoolingServing:
request_id_prefix: ClassVar[str] request_id_prefix: ClassVar[str]
...@@ -109,8 +80,8 @@ class PoolingServing: ...@@ -109,8 +80,8 @@ class PoolingServing:
async def __call__( async def __call__(
self, self,
request: AnyPoolingRequest, request: AnyPoolingRequest,
raw_request: Request, raw_request: Request | None = None,
) -> JSONResponse: ) -> Response:
model_name = self.models.model_name() model_name = self.models.model_name()
request_id = f"{self.request_id_prefix}-{self._base_request_id(raw_request)}" request_id = f"{self.request_id_prefix}-{self._base_request_id(raw_request)}"
...@@ -125,19 +96,11 @@ class PoolingServing: ...@@ -125,19 +96,11 @@ class PoolingServing:
self._validate_request(ctx) self._validate_request(ctx)
self._maybe_get_adapters(ctx) self._maybe_get_adapters(ctx)
await self._preprocess(ctx) await self.io_processor.pre_process_online_async(ctx)
await self._prepare_generators(ctx) await self._prepare_generators(ctx)
await self._collect_batch(ctx) await self._collect_batch(ctx)
response = await self._build_response(ctx) await self.io_processor.post_process_online_async(ctx)
return JSONResponse(content=response.model_dump()) return await self._build_response(ctx)
async def _preprocess(
self,
ctx: PoolingServeContext,
):
ctx.engine_prompts = await self.io_processor.pre_process_online_async(
ctx.request
)
async def _prepare_generators( async def _prepare_generators(
self, self,
...@@ -157,10 +120,14 @@ class PoolingServing: ...@@ -157,10 +120,14 @@ class PoolingServing:
pooling_params = self.io_processor.create_pooling_params(ctx.request) pooling_params = self.io_processor.create_pooling_params(ctx.request)
for i, engine_prompt in enumerate(ctx.engine_prompts): for i, engine_prompt in enumerate(ctx.engine_prompts):
request_id_item = f"{ctx.request_id}-{i}" prompt_request_id = (
f"{ctx.request_id}-{i}"
if ctx.prompt_request_ids is None
else ctx.prompt_request_ids[i]
)
self._log_inputs( self._log_inputs(
request_id_item, prompt_request_id,
engine_prompt, engine_prompt,
params=pooling_params, params=pooling_params,
lora_request=ctx.lora_request, lora_request=ctx.lora_request,
...@@ -169,7 +136,7 @@ class PoolingServing: ...@@ -169,7 +136,7 @@ class PoolingServing:
generator = self.engine_client.encode( generator = self.engine_client.encode(
engine_prompt, engine_prompt,
pooling_params, pooling_params,
request_id_item, prompt_request_id,
lora_request=ctx.lora_request, lora_request=ctx.lora_request,
trace_headers=trace_headers, trace_headers=trace_headers,
priority=getattr(ctx.request, "priority", 0), priority=getattr(ctx.request, "priority", 0),
...@@ -189,9 +156,9 @@ class PoolingServing: ...@@ -189,9 +156,9 @@ class PoolingServing:
if ctx.result_generator is None: if ctx.result_generator is None:
raise ValueError("Result generator not available") raise ValueError("Result generator not available")
num_prompts = len(ctx.engine_prompts) num_inputs = len(ctx.engine_prompts)
final_res_batch: list[PoolingRequestOutput | None] final_res_batch: list[PoolingRequestOutput | None]
final_res_batch = [None] * num_prompts final_res_batch = [None] * num_inputs
async for i, res in ctx.result_generator: async for i, res in ctx.result_generator:
final_res_batch[i] = res final_res_batch[i] = res
...@@ -204,7 +171,7 @@ class PoolingServing: ...@@ -204,7 +171,7 @@ class PoolingServing:
async def _build_response( async def _build_response(
self, self,
ctx: PoolingServeContext, ctx: PoolingServeContext,
) -> AnyPoolingResponse: ) -> Response:
raise NotImplementedError raise NotImplementedError
@staticmethod @staticmethod
...@@ -294,7 +261,7 @@ class PoolingServing: ...@@ -294,7 +261,7 @@ class PoolingServing:
return None return None
# if _check_model has been called earlier, this will be unreachable # if _check_model has been called earlier, this will be unreachable
raise ValueError(f"The model `{request.model}` does not exist.") raise VLLMNotFoundError(f"The model `{request.model}` does not exist.")
def _get_active_default_mm_loras( def _get_active_default_mm_loras(
self, request: AnyPoolingRequest self, request: AnyPoolingRequest
...@@ -349,8 +316,8 @@ class PoolingServing: ...@@ -349,8 +316,8 @@ class PoolingServing:
def _log_inputs( def _log_inputs(
self, self,
request_id: str, request_id: str,
inputs: PromptType | ProcessorInputs, inputs: ProcessorInputs,
params: SamplingParams | PoolingParams | BeamSearchParams | None, params: PoolingParams,
lora_request: LoRARequest | None, lora_request: LoRARequest | None,
) -> None: ) -> None:
if self.request_logger is None: if self.request_logger is None:
......
...@@ -2,12 +2,10 @@ ...@@ -2,12 +2,10 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from fastapi import APIRouter, Depends, Request from fastapi import APIRouter, Depends, Request
from starlette.responses import JSONResponse from fastapi.responses import JSONResponse, Response
from vllm.entrypoints.openai.utils import validate_json_request from vllm.entrypoints.openai.utils import validate_json_request
from vllm.entrypoints.pooling.classify.protocol import ( from vllm.entrypoints.pooling.classify.protocol import ClassificationRequest
ClassificationRequest,
)
from vllm.entrypoints.pooling.classify.serving import ServingClassification from vllm.entrypoints.pooling.classify.serving import ServingClassification
from vllm.entrypoints.utils import ( from vllm.entrypoints.utils import (
create_error_response, create_error_response,
...@@ -19,7 +17,7 @@ router = APIRouter() ...@@ -19,7 +17,7 @@ router = APIRouter()
def classify(request: Request) -> ServingClassification | None: def classify(request: Request) -> ServingClassification | None:
return request.app.state.openai_serving_classification return request.app.state.serving_classification
@router.post("/classify", dependencies=[Depends(validate_json_request)]) @router.post("/classify", dependencies=[Depends(validate_json_request)])
...@@ -27,7 +25,7 @@ def classify(request: Request) -> ServingClassification | None: ...@@ -27,7 +25,7 @@ def classify(request: Request) -> ServingClassification | None:
@load_aware_call @load_aware_call
async def create_classify( async def create_classify(
request: ClassificationRequest, raw_request: Request request: ClassificationRequest, raw_request: Request
) -> JSONResponse: ) -> Response:
handler = classify(raw_request) handler = classify(raw_request)
if handler is None: if handler is None:
error_response = create_error_response( error_response = create_error_response(
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Sequence
from typing import Any
from vllm import PromptType
from vllm.entrypoints.pooling.base.io_processor import PoolingIOProcessor from vllm.entrypoints.pooling.base.io_processor import PoolingIOProcessor
from vllm.entrypoints.pooling.classify.protocol import (
ClassificationChatRequest,
ClassificationCompletionRequest,
)
from vllm.inputs import ProcessorInputs
from vllm.renderers.inputs import TokPrompt
class ClassifyIOProcessor(PoolingIOProcessor): class ClassifyIOProcessor(PoolingIOProcessor):
def pre_process_online( name = "classification"
self, request: ClassificationCompletionRequest | ClassificationChatRequest
) -> list[TokPrompt] | None:
if isinstance(request, ClassificationChatRequest):
self._validate_chat_template(
request_chat_template=request.chat_template,
chat_template_kwargs=request.chat_template_kwargs,
trust_request_chat_template=self.trust_request_chat_template,
)
_, engine_prompts = self._preprocess_chat_online(
request,
request.messages,
default_template=self.chat_template,
default_template_content_format=self.chat_template_content_format,
default_template_kwargs=None,
)
elif isinstance(request, ClassificationCompletionRequest):
engine_prompts = self._preprocess_completion_online(
request,
prompt_input=request.input,
prompt_embeds=None,
)
else:
raise ValueError("Invalid classification request type")
return engine_prompts
def pre_process_offline(
self,
prompts: PromptType | Sequence[PromptType],
tokenization_kwargs: dict[str, Any] | None = None,
) -> Sequence[ProcessorInputs]:
return self._preprocess_completion_offline(
prompts=prompts, tokenization_kwargs=tokenization_kwargs
)
...@@ -4,13 +4,15 @@ ...@@ -4,13 +4,15 @@
from typing import TypeAlias from typing import TypeAlias
import numpy as np import numpy as np
from fastapi.responses import JSONResponse
from vllm import ClassificationOutput
from vllm.config import ModelConfig from vllm.config import ModelConfig
from vllm.entrypoints.chat_utils import ChatTemplateConfig from vllm.entrypoints.chat_utils import ChatTemplateConfig
from vllm.entrypoints.openai.engine.protocol import UsageInfo from vllm.entrypoints.openai.engine.protocol import UsageInfo
from vllm.entrypoints.pooling.base.serving import PoolingServeContext, PoolingServing from vllm.entrypoints.pooling.base.serving import PoolingServing
from vllm.entrypoints.pooling.typing import PoolingServeContext
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.outputs import ClassificationOutput
from vllm.renderers import BaseRenderer from vllm.renderers import BaseRenderer
from .io_processor import ClassifyIOProcessor from .io_processor import ClassifyIOProcessor
...@@ -44,15 +46,11 @@ class ServingClassification(PoolingServing): ...@@ -44,15 +46,11 @@ class ServingClassification(PoolingServing):
async def _build_response( async def _build_response(
self, self,
ctx: ClassificationServeContext, ctx: ClassificationServeContext,
) -> ClassificationResponse: ) -> JSONResponse:
final_res_batch_checked = await self.io_processor.post_process_async(
ctx.final_res_batch
)
id2label = getattr(self.model_config.hf_config, "id2label", {}) id2label = getattr(self.model_config.hf_config, "id2label", {})
num_prompt_tokens = 0 num_prompt_tokens = 0
items: list[ClassificationData] = [] items: list[ClassificationData] = []
for idx, final_res in enumerate(final_res_batch_checked): for idx, final_res in enumerate(ctx.final_res_batch):
classify_res = ClassificationOutput.from_base(final_res.outputs) classify_res = ClassificationOutput.from_base(final_res.outputs)
probs = classify_res.probs probs = classify_res.probs
...@@ -75,10 +73,12 @@ class ServingClassification(PoolingServing): ...@@ -75,10 +73,12 @@ class ServingClassification(PoolingServing):
total_tokens=num_prompt_tokens, total_tokens=num_prompt_tokens,
) )
return ClassificationResponse( response = ClassificationResponse(
id=ctx.request_id, id=ctx.request_id,
created=ctx.created_time, created=ctx.created_time,
model=ctx.model_name, model=ctx.model_name,
data=items, data=items,
usage=usage, usage=usage,
) )
return JSONResponse(content=response.model_dump())
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import importlib.util
from functools import lru_cache
from http import HTTPStatus from http import HTTPStatus
from fastapi import APIRouter, Depends, Request from fastapi import APIRouter, Depends, Request
from fastapi.responses import JSONResponse, StreamingResponse from fastapi.responses import JSONResponse
from typing_extensions import assert_never
from vllm.entrypoints.openai.engine.protocol import ErrorResponse from vllm.entrypoints.openai.engine.protocol import ErrorResponse
from vllm.entrypoints.openai.utils import validate_json_request from vllm.entrypoints.openai.utils import validate_json_request
from vllm.entrypoints.pooling.embed.protocol import ( from vllm.entrypoints.pooling.embed.protocol import EmbeddingRequest
EmbeddingBytesResponse, from vllm.entrypoints.pooling.embed.serving import ServingEmbedding
EmbeddingRequest, from vllm.entrypoints.utils import (
EmbeddingResponse, create_error_response,
load_aware_call,
with_cancellation,
) )
from vllm.entrypoints.pooling.embed.serving import OpenAIServingEmbedding
from vllm.entrypoints.utils import load_aware_call, with_cancellation
from vllm.logger import init_logger
router = APIRouter() router = APIRouter()
logger = init_logger(__name__)
@lru_cache(maxsize=1)
def _get_json_response_cls():
    """Pick the JSON response class for the embeddings endpoint.

    Returns ``ORJSONResponse`` when the ``orjson`` package can be found,
    otherwise logs a one-time hint and returns the plain ``JSONResponse``.
    The choice is cached, so the lookup runs only once per process.
    """
    has_orjson = importlib.util.find_spec("orjson") is not None
    if not has_orjson:
        logger.warning_once(
            "To make v1/embeddings API fast, please install orjson by `pip install orjson`"
        )
        return JSONResponse

    # Imported lazily so it is only touched when orjson is available.
    from fastapi.responses import ORJSONResponse

    return ORJSONResponse
def embedding(request: Request) -> OpenAIServingEmbedding | None: def embedding(request: Request) -> ServingEmbedding | None:
return request.app.state.openai_serving_embedding return request.app.state.serving_embedding
@router.post( @router.post(
...@@ -56,24 +39,11 @@ async def create_embedding( ...@@ -56,24 +39,11 @@ async def create_embedding(
): ):
handler = embedding(raw_request) handler = embedding(raw_request)
if handler is None: if handler is None:
base_server = raw_request.app.state.openai_serving_tokenization error_response = create_error_response(
return base_server.create_error_response(
message="The model does not support Embeddings API" message="The model does not support Embeddings API"
) )
generator = await handler.create_embedding(request, raw_request)
if isinstance(generator, ErrorResponse):
return JSONResponse( return JSONResponse(
content=generator.model_dump(), status_code=generator.error.code content=error_response.model_dump(),
status_code=error_response.error.code,
) )
elif isinstance(generator, EmbeddingResponse): return await handler(request, raw_request)
return _get_json_response_cls()(content=generator.model_dump())
elif isinstance(generator, EmbeddingBytesResponse):
return StreamingResponse(
content=generator.content,
headers=generator.headers,
media_type=generator.media_type,
)
assert_never(generator)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any, cast
import torch
from vllm.entrypoints.pooling.base.io_processor import PoolingIOProcessor
from vllm.entrypoints.pooling.typing import PoolingServeContext
from vllm.inputs.data import ProcessorInputs, token_inputs
from vllm.outputs import PoolingOutput, PoolingRequestOutput
from vllm.utils.collection_utils import chunk_list
class EmbedIOProcessor(PoolingIOProcessor):
    """Pooling IO processor for embedding requests.

    On top of the base-class pre/post-processing, this processor supports
    "chunked processing" for long inputs: when
    ``pooler_config.enable_chunked_processing`` is set, every prompt is split
    with ``chunk_list(token_ids, max_model_len)``, each chunk is sent to the
    engine as its own request, and the per-chunk embeddings are merged back
    into a single embedding per prompt using a mean weighted by the number
    of tokens in each chunk.
    """

    name = "embedding"

    def __init__(self, *args, **kwargs):
        """Forward all arguments to the base class and cache pooler settings."""
        super().__init__(*args, **kwargs)

        assert self.model_config.pooler_config is not None
        self.pooler_config = self.model_config.pooler_config
        self.enable_chunked_processing = self.pooler_config.enable_chunked_processing

    #################################################################
    # Long Text Embedding with Chunked Processing
    # PTAL: examples/pooling/embed/openai_embedding_long_text

    def pre_process_online(self, ctx: PoolingServeContext):
        """Run base pre-processing, then split prompts into chunks if enabled.

        When chunked processing is enabled, the original engine prompts are
        stashed in ``ctx.intermediates`` (needed again in
        ``post_process_online``), ``ctx.engine_prompts`` is replaced by one
        token prompt per chunk, and ``ctx.prompt_request_ids`` receives one id
        per chunk of the form
        ``"<request_id>-prompt-<prompt_idx>-chunk-<chunk_idx>"``.

        Raises:
            ValueError: If ``ctx.engine_prompts`` is ``None``.
            NotImplementedError: If a prompt has no ``prompt_token_ids``.
        """
        super().pre_process_online(ctx)

        if not self.enable_chunked_processing:
            return None

        if ctx.engine_prompts is None:
            raise ValueError("Engine prompts not available")

        # Keep the un-chunked prompts so the final outputs can report the
        # original token ids and the original number of prompts.
        ctx.intermediates = ctx.engine_prompts

        request_id = ctx.request_id
        max_model_len = self.model_config.max_model_len
        chunked_engine_prompts: list[ProcessorInputs] = []
        prompt_request_ids: list[str] = []

        for prompt_idx, engine_prompt in enumerate(ctx.engine_prompts):
            token_ids = engine_prompt.get("prompt_token_ids", None)
            if token_ids is None:
                raise NotImplementedError(
                    "Long Text Embedding with Chunked Processing does "
                    "not support EmbedsPrompt and EncoderDecoderInputs."
                )

            prompt_token_ids = cast(list[int], token_ids)
            for chunk_idx, chunk_tokens in enumerate(
                chunk_list(prompt_token_ids, max_model_len)
            ):
                chunked_engine_prompts.append(
                    token_inputs(prompt_token_ids=chunk_tokens)
                )
                prompt_request_ids.append(
                    f"{request_id}-prompt-{prompt_idx}-chunk-{chunk_idx}"
                )

        ctx.engine_prompts = chunked_engine_prompts
        ctx.prompt_request_ids = prompt_request_ids
        return None

    @staticmethod
    def _split_chunk_request_id(request_id: str) -> tuple[str, int | None]:
        """Split a chunk request id into its prompt-level id and prompt index.

        Chunk ids are built by ``pre_process_online`` as
        ``"<request_id>-prompt-<prompt_idx>-chunk-<chunk_idx>"``. The id is
        parsed from the right so that a base request id which itself contains
        ``"-prompt-"`` or ``"-chunk-"`` cannot shadow the generated suffix.

        Args:
            request_id: A request id that contains ``"-chunk-"``.

        Returns:
            ``(prompt_request_id, prompt_idx)``. ``prompt_request_id`` is
            everything before the last ``"-chunk-"``; ``prompt_idx`` is the
            integer after the last ``"-prompt-"`` in it, or ``None`` when no
            such integer can be parsed.
        """
        prompt_request_id = request_id.rpartition("-chunk-")[0]
        _, separator, idx_text = prompt_request_id.rpartition("-prompt-")
        if not separator:
            return prompt_request_id, None
        try:
            return prompt_request_id, int(idx_text)
        except ValueError:
            return prompt_request_id, None

    def post_process_online(
        self,
        ctx: PoolingServeContext,
    ):
        """Merge per-chunk results back into one result per original prompt.

        Without chunked processing this defers to the base class. Otherwise
        each chunk embedding is accumulated into a running weighted sum for
        its prompt (weight = number of tokens in the chunk), and
        ``ctx.final_res_batch`` is replaced by one ``PoolingRequestOutput``
        per original prompt, in prompt order.

        Raises:
            ValueError: If the final batch or the stashed original prompts
                are missing, if a chunk result has an unexpected type or no
                ``prompt_token_ids``, if aggregation yields no usable sum, or
                if a prompt has no result at all.
            NotImplementedError: If an original prompt has no
                ``prompt_token_ids``.
        """
        if ctx.final_res_batch is None:
            raise ValueError("Final response batch not available")

        if not self.enable_chunked_processing:
            return super().post_process_online(ctx)

        # Online aggregation for chunked requests to minimize memory usage:
        # only a running sum per prompt is kept, not every chunk embedding.
        prompt_aggregators: dict[int, dict[str, Any]] = {}
        short_prompts_results: dict[int, PoolingRequestOutput] = {}

        for result_idx, result in enumerate(ctx.final_res_batch):
            if "-chunk-" not in result.request_id:
                # Non-chunked result - the last dash-separated part of the
                # request id should be the prompt index.
                try:
                    prompt_idx = int(result.request_id.split("-")[-1])
                except (ValueError, IndexError):
                    prompt_idx = result_idx  # Fallback to result_idx
                short_prompts_results[prompt_idx] = result
                continue

            prompt_request_id, parsed_idx = self._split_chunk_request_id(
                result.request_id
            )
            # Best-effort fallback for ids that do not follow the generated
            # "...-prompt-<idx>-chunk-<idx>" layout.
            prompt_idx = result_idx if parsed_idx is None else parsed_idx

            # Ensure result is PoolingRequestOutput for embedding processing
            if not isinstance(result, PoolingRequestOutput):
                raise ValueError(
                    f"Expected PoolingRequestOutput for "
                    f"chunked embedding, got "
                    f"{type(result).__name__}"
                )

            if result.prompt_token_ids is None:
                raise ValueError(
                    "prompt_token_ids cannot be None for chunked processing"
                )

            # Create the aggregation state for this prompt on first use.
            aggregator = prompt_aggregators.setdefault(
                prompt_idx,
                {
                    "weighted_sum": None,
                    "total_weight": 0,
                    "request_id": prompt_request_id,
                },
            )

            # MEAN pooling with online weighted averaging: accumulate in
            # float32, weighting each chunk by its token count.
            weight = len(result.prompt_token_ids)
            weighted_embedding = result.outputs.data.to(dtype=torch.float32) * weight

            if aggregator["weighted_sum"] is None:
                # First chunk
                aggregator["weighted_sum"] = weighted_embedding
            else:
                # Accumulate
                aggregator["weighted_sum"] += weighted_embedding

            aggregator["total_weight"] += weight

        if ctx.intermediates is None:
            raise ValueError("Original prompts inputs not available")

        original_engine_prompts = cast(list[ProcessorInputs], ctx.intermediates)
        num_prompts = len(original_engine_prompts)

        # Finalize aggregated results, one entry per original prompt.
        final_res_batch: list[PoolingRequestOutput] = []
        for prompt_idx in range(num_prompts):
            if prompt_idx in prompt_aggregators:
                # Finalize MEAN aggregation for this chunked prompt
                aggregator = prompt_aggregators[prompt_idx]
                weighted_sum = aggregator["weighted_sum"]
                total_weight = aggregator["total_weight"]

                can_finalize = (
                    weighted_sum is not None
                    and isinstance(weighted_sum, torch.Tensor)
                    and isinstance(total_weight, (int, float))
                    and total_weight > 0
                )
                if not can_finalize:
                    raise ValueError(
                        f"Failed to aggregate chunks for prompt {prompt_idx}"
                    )

                # Compute final mean embedding
                final_embedding = weighted_sum / total_weight

                # Report the original (un-chunked) token ids for this prompt.
                original_prompt = original_engine_prompts[prompt_idx]
                token_ids = original_prompt.get("prompt_token_ids", None)
                if token_ids is None:
                    raise NotImplementedError(
                        "Long Text Embedding with Chunked Processing does "
                        "not support EmbedsPrompt and EncoderDecoderInputs."
                    )

                final_res_batch.append(
                    PoolingRequestOutput(
                        request_id=aggregator["request_id"],
                        prompt_token_ids=cast(list[int], token_ids),
                        outputs=PoolingOutput(data=final_embedding),
                        num_cached_tokens=0,
                        finished=True,
                    )
                )
            elif prompt_idx in short_prompts_results:
                final_res_batch.append(short_prompts_results[prompt_idx])
            else:
                raise ValueError(f"Result not found for prompt {prompt_idx}")

        ctx.final_res_batch = final_res_batch

        return None
This diff is collapsed.
...@@ -15,17 +15,21 @@ def init_pooling_io_processors( ...@@ -15,17 +15,21 @@ def init_pooling_io_processors(
renderer: BaseRenderer, renderer: BaseRenderer,
chat_template_config: ChatTemplateConfig, chat_template_config: ChatTemplateConfig,
) -> dict[str, PoolingIOProcessor]: ) -> dict[str, PoolingIOProcessor]:
pooling_io_processors: dict[str, PoolingIOProcessor] = {} processors: list[tuple[str, type[PoolingIOProcessor]]] = []
if "classify" in supported_tasks: if "classify" in supported_tasks:
from vllm.entrypoints.pooling.classify.io_processor import ( from vllm.entrypoints.pooling.classify.io_processor import ClassifyIOProcessor
ClassifyIOProcessor,
) processors.append(("classify", ClassifyIOProcessor))
if "embed" in supported_tasks:
from vllm.entrypoints.pooling.embed.io_processor import EmbedIOProcessor
pooling_io_processors["classify"] = ClassifyIOProcessor( processors.append(("embed", EmbedIOProcessor))
return {
task: processor_cls(
model_config=model_config, model_config=model_config,
renderer=renderer, renderer=renderer,
chat_template_config=chat_template_config, chat_template_config=chat_template_config,
) )
for task, processor_cls in processors
return pooling_io_processors }
...@@ -21,7 +21,7 @@ router = APIRouter() ...@@ -21,7 +21,7 @@ router = APIRouter()
def pooling(request: Request) -> OpenAIServingPooling | None: def pooling(request: Request) -> OpenAIServingPooling | None:
return request.app.state.openai_serving_pooling return request.app.state.serving_pooling
@router.post( @router.post(
......
...@@ -24,11 +24,11 @@ logger = init_logger(__name__) ...@@ -24,11 +24,11 @@ logger = init_logger(__name__)
def score(request: Request) -> ServingScores | None: def score(request: Request) -> ServingScores | None:
return request.app.state.openai_serving_scores return request.app.state.serving_scores
def rerank(request: Request) -> ServingScores | None: def rerank(request: Request) -> ServingScores | None:
return request.app.state.openai_serving_scores return request.app.state.serving_scores
@router.post( @router.post(
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import time
from collections.abc import AsyncGenerator
from dataclasses import dataclass, field
from typing import Any, Generic, TypeAlias, TypeVar
from typing import TypeAlias from fastapi import Request
from pydantic import ConfigDict
from vllm import PoolingRequestOutput
from vllm.entrypoints.pooling.classify.protocol import ( from vllm.entrypoints.pooling.classify.protocol import (
ClassificationChatRequest, ClassificationChatRequest,
ClassificationCompletionRequest, ClassificationCompletionRequest,
...@@ -25,12 +31,12 @@ from vllm.entrypoints.pooling.score.protocol import ( ...@@ -25,12 +31,12 @@ from vllm.entrypoints.pooling.score.protocol import (
ScoreRequest, ScoreRequest,
ScoreResponse, ScoreResponse,
) )
from vllm.inputs import ProcessorInputs
from vllm.lora.request import LoRARequest
PoolingCompletionLikeRequest: TypeAlias = ( PoolingCompletionLikeRequest: TypeAlias = (
EmbeddingCompletionRequest EmbeddingCompletionRequest
| ClassificationCompletionRequest | ClassificationCompletionRequest
| RerankRequest
| ScoreRequest
| PoolingCompletionRequest | PoolingCompletionRequest
) )
...@@ -39,7 +45,11 @@ PoolingChatLikeRequest: TypeAlias = ( ...@@ -39,7 +45,11 @@ PoolingChatLikeRequest: TypeAlias = (
) )
AnyPoolingRequest: TypeAlias = ( AnyPoolingRequest: TypeAlias = (
PoolingCompletionLikeRequest | PoolingChatLikeRequest | IOProcessorRequest PoolingCompletionLikeRequest
| PoolingChatLikeRequest
| IOProcessorRequest
| RerankRequest
| ScoreRequest
) )
AnyPoolingResponse: TypeAlias = ( AnyPoolingResponse: TypeAlias = (
...@@ -49,3 +59,26 @@ AnyPoolingResponse: TypeAlias = ( ...@@ -49,3 +59,26 @@ AnyPoolingResponse: TypeAlias = (
| PoolingResponse | PoolingResponse
| ScoreResponse | ScoreResponse
) )
PoolingRequestT = TypeVar("PoolingRequestT", bound=AnyPoolingRequest)
@dataclass(kw_only=True)
class PoolingServeContext(Generic[PoolingRequestT]):
    """Mutable per-request state used while serving a pooling request.

    All fields are keyword-only (``kw_only=True``), which is what allows
    required fields such as ``model_name`` to follow fields with defaults.
    """

    # The API request being served; its type is bound to AnyPoolingRequest.
    request: PoolingRequestT
    # The underlying FastAPI request, when one is available.
    raw_request: Request | None = None
    model_name: str
    request_id: str
    # Unix timestamp in whole seconds, taken when the context is constructed.
    created_time: int = field(default_factory=lambda: int(time.time()))
    lora_request: LoRARequest | None = None
    # Inputs to submit to the engine; None until they have been produced.
    engine_prompts: list[ProcessorInputs] | None = None
    # Optional per-prompt request ids.
    # NOTE(review): presumably parallel to engine_prompts - confirm.
    prompt_request_ids: list[str] | None = None
    # Free-form scratch slot for data a processor needs to carry from
    # pre-processing to post-processing.
    intermediates: Any | None = None

    # Async stream of (int, PoolingRequestOutput) pairs.
    # NOTE(review): the int looks like the index of the prompt - confirm.
    result_generator: AsyncGenerator[tuple[int, PoolingRequestOutput], None] | None = (
        None
    )
    # Collected outputs; starts as a fresh empty list for every context.
    final_res_batch: list[PoolingRequestOutput] = field(default_factory=list)

    # Unannotated, so this is a plain class attribute and not a dataclass field.
    # NOTE(review): pydantic-style config on a stdlib dataclass - confirm it
    # is still needed.
    model_config = ConfigDict(arbitrary_types_allowed=True)
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import importlib.util
import math import math
from dataclasses import dataclass from dataclasses import dataclass
from functools import lru_cache
from typing import Any from typing import Any
import pybase64 import pybase64
import torch import torch
from fastapi.responses import JSONResponse
from vllm.logger import init_logger
from vllm.outputs import PoolingRequestOutput from vllm.outputs import PoolingRequestOutput
from vllm.utils.serial_utils import ( from vllm.utils.serial_utils import (
EMBED_DTYPES, EMBED_DTYPES,
...@@ -16,6 +21,8 @@ from vllm.utils.serial_utils import ( ...@@ -16,6 +21,8 @@ from vllm.utils.serial_utils import (
tensor2binary, tensor2binary,
) )
logger = init_logger(__name__)
@dataclass @dataclass
class MetadataItem: class MetadataItem:
...@@ -122,3 +129,15 @@ def decode_pooling_output(items: list[MetadataItem], body: bytes) -> list[torch. ...@@ -122,3 +129,15 @@ def decode_pooling_output(items: list[MetadataItem], body: bytes) -> list[torch.
) )
for item in sorted(items, key=lambda x: x.index) for item in sorted(items, key=lambda x: x.index)
] ]
@lru_cache(maxsize=1)
def get_json_response_cls() -> type[JSONResponse]:
    """Return the JSON response class to use for pooling responses.

    ``ORJSONResponse`` is chosen when the ``orjson`` package can be located;
    otherwise a one-time warning is logged and ``JSONResponse`` is returned.
    The result is cached, so the package lookup happens at most once.
    """
    orjson_missing = importlib.util.find_spec("orjson") is None
    if orjson_missing:
        logger.warning_once(
            "To make v1/embeddings API fast, please install orjson by `pip install orjson`"
        )
        return JSONResponse

    # Deferred import: only reached when orjson is installed.
    from fastapi.responses import ORJSONResponse

    return ORJSONResponse
...@@ -303,12 +303,16 @@ def create_error_response( ...@@ -303,12 +303,16 @@ def create_error_response(
if isinstance(message, Exception): if isinstance(message, Exception):
exc = message exc = message
from vllm.exceptions import VLLMValidationError from vllm.exceptions import VLLMNotFoundError, VLLMValidationError
if isinstance(exc, VLLMValidationError): if isinstance(exc, VLLMValidationError):
err_type = "BadRequestError" err_type = "BadRequestError"
status_code = HTTPStatus.BAD_REQUEST status_code = HTTPStatus.BAD_REQUEST
param = exc.parameter param = exc.parameter
elif isinstance(exc, VLLMNotFoundError):
err_type = "NotFoundError"
status_code = HTTPStatus.NOT_FOUND
param = None
elif isinstance(exc, (ValueError, TypeError, OverflowError)): elif isinstance(exc, (ValueError, TypeError, OverflowError)):
# Common validation errors from user input # Common validation errors from user input
err_type = "BadRequestError" err_type = "BadRequestError"
......
...@@ -34,3 +34,9 @@ class VLLMValidationError(ValueError): ...@@ -34,3 +34,9 @@ class VLLMValidationError(ValueError):
if self.value is not None: if self.value is not None:
extras.append(f"value={self.value}") extras.append(f"value={self.value}")
return f"{base} ({', '.join(extras)})" if extras else base return f"{base} ({', '.join(extras)})" if extras else base
class VLLMNotFoundError(ValueError):
    """vLLM-specific NotFoundError.

    Subclasses ``ValueError`` and adds no behavior of its own; it exists so
    that "not found" failures can be told apart from other value errors.
    """
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment