Unverified commit fff3711a, authored by wang.yuqi, committed by GitHub
Browse files

[Frontend][2/n] Improve pooling entrypoints | embed. (#36110)


Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
Signed-off-by: wang.yuqi <noooop@126.com>
parent c4d859c2
......@@ -25,7 +25,7 @@ ROCM_ATTN_BACKENDS = [
"FLEX_ATTENTION",
]
ATTN_BACKENDS = ROCM_ATTN_BACKENDS if current_platform.is_rocm() else []
ATTN_BACKENDS = ROCM_ATTN_BACKENDS if current_platform.is_rocm() else ["auto"]
# Per-backend tolerance with explicit entries; "default" is the fallback
BACKEND_TOL: dict[str, float] = {
......@@ -105,9 +105,12 @@ def server(request):
"8192",
"--chat-template",
str(VLLM_PATH / "examples/pooling/score/template/qwen3_vl_reranker.jinja"),
"--attention-config",
json.dumps({"backend": backend}),
] + ROCM_EXTRA_ARGS
]
env = dict()
if backend != "auto":
args += ["--attention-config", json.dumps({"backend": backend})]
args += ROCM_EXTRA_ARGS
env = dict(ROCM_ENV_OVERRIDES)
if backend != "ROCM_AITER_FA":
......
......@@ -397,7 +397,7 @@ class LLM:
self.io_processor = self.llm_engine.io_processor
self.input_processor = self.llm_engine.input_processor
self.chat_template_config = ChatTemplateConfig(chat_template=self.chat_template)
self.init_pooling_io_processors = init_pooling_io_processors(
self.pooling_io_processors = init_pooling_io_processors(
supported_tasks=supported_tasks,
model_config=self.model_config,
renderer=self.renderer,
......@@ -1174,8 +1174,8 @@ class LLM:
)
raise ValueError(msg)
if pooling_task in self.init_pooling_io_processors:
io_processor = self.init_pooling_io_processors[pooling_task]
if pooling_task in self.pooling_io_processors:
io_processor = self.pooling_io_processors[pooling_task]
processor_inputs = io_processor.pre_process_offline(
prompts_seq, tokenization_kwargs
)
......@@ -1194,7 +1194,7 @@ class LLM:
outputs = self._run_engine(
use_tqdm=use_tqdm, output_type=PoolingRequestOutput
)
outputs = io_processor.post_process(outputs)
outputs = io_processor.post_process_offline(outputs)
else:
outputs = self._run_completion(
prompts=prompts_seq,
......
......@@ -60,12 +60,6 @@ from vllm.entrypoints.openai.speech_to_text.protocol import (
TranscriptionResponse,
TranslationRequest,
)
from vllm.entrypoints.pooling.embed.protocol import (
EmbeddingBytesResponse,
EmbeddingChatRequest,
EmbeddingCompletionRequest,
EmbeddingResponse,
)
from vllm.entrypoints.pooling.pooling.protocol import (
IOProcessorRequest,
PoolingChatRequest,
......@@ -144,17 +138,13 @@ CompletionLikeRequest: TypeAlias = (
CompletionRequest
| TokenizeCompletionRequest
| DetokenizeRequest
| EmbeddingCompletionRequest
| RerankRequest
| ScoreRequest
| PoolingCompletionRequest
)
ChatLikeRequest: TypeAlias = (
ChatCompletionRequest
| TokenizeChatRequest
| EmbeddingChatRequest
| PoolingChatRequest
ChatCompletionRequest | TokenizeChatRequest | PoolingChatRequest
)
SpeechToTextRequest: TypeAlias = TranscriptionRequest | TranslationRequest
......@@ -171,8 +161,6 @@ AnyRequest: TypeAlias = (
AnyResponse: TypeAlias = (
CompletionResponse
| ChatCompletionResponse
| EmbeddingResponse
| EmbeddingBytesResponse
| TranscriptionResponse
| TokenizeResponse
| PoolingResponse
......@@ -203,8 +191,7 @@ class ServeContext(Generic[RequestT]):
class OpenAIServing:
request_id_prefix: ClassVar[str] = """
A short string prepended to every request’s ID (e.g. "embd")
so you can easily tell “this ID came from Embedding.”
A short string prepended to every request’s ID.
"""
def __init__(
......@@ -432,8 +419,7 @@ class OpenAIServing:
ctx: ServeContext,
) -> ErrorResponse | None:
"""
Default preprocessing hook. Subclasses may override
to prepare `ctx` (embedding, etc.).
Default preprocessing hook. Subclasses may override to prepare `ctx`.
"""
return None
......@@ -730,13 +716,10 @@ class OpenAIServing:
token_num = len(input_ids)
max_model_len = self.model_config.max_model_len
# Note: EmbeddingRequest,
# and ScoreRequest doesn't have max_tokens
# Note: ScoreRequest doesn't have max_tokens
if isinstance(
request,
(
EmbeddingChatRequest,
EmbeddingCompletionRequest,
ScoreDataRequest,
ScoreTextRequest,
ScoreQueriesDocumentsRequest,
......
......@@ -53,6 +53,7 @@ from vllm.entrypoints.pooling.score.protocol import (
ScoreRequest,
ScoreResponse,
)
from vllm.entrypoints.utils import create_error_response
from vllm.logger import init_logger
from vllm.reasoning import ReasoningParserManager
from vllm.utils import random_uuid
......@@ -503,7 +504,10 @@ async def run_request(
request: BatchRequestInput,
tracker: BatchProgressTracker,
) -> BatchRequestOutput:
try:
response = await serving_engine_func(request.body)
except Exception as e:
response = create_error_response(e)
if isinstance(
response,
......@@ -678,10 +682,10 @@ async def build_endpoint_registry(
# Get serving objects from state (defaulting to None if not set)
openai_serving_chat = getattr(state, "openai_serving_chat", None)
openai_serving_embedding = getattr(state, "openai_serving_embedding", None)
openai_serving_scores = getattr(state, "openai_serving_scores", None)
openai_serving_transcription = getattr(state, "openai_serving_transcription", None)
openai_serving_translation = getattr(state, "openai_serving_translation", None)
serving_embedding = getattr(state, "serving_embedding", None)
serving_scores = getattr(state, "serving_scores", None)
# Registry of endpoint configurations
endpoint_registry: dict[str, dict[str, Any]] = {
......@@ -697,27 +701,21 @@ async def build_endpoint_registry(
"embeddings": {
"url_matcher": lambda url: url == "/v1/embeddings",
"handler_getter": lambda: (
openai_serving_embedding.create_embedding
if openai_serving_embedding is not None
else None
serving_embedding if serving_embedding is not None else None
),
"wrapper_fn": None,
},
"score": {
"url_matcher": lambda url: url.endswith("/score"),
"handler_getter": lambda: (
openai_serving_scores.create_score
if openai_serving_scores is not None
else None
serving_scores.create_score if serving_scores is not None else None
),
"wrapper_fn": None,
},
"rerank": {
"url_matcher": lambda url: url.endswith("/rerank"),
"handler_getter": lambda: (
openai_serving_scores.do_rerank
if openai_serving_scores is not None
else None
serving_scores.do_rerank if serving_scores is not None else None
),
"wrapper_fn": None,
},
......
......@@ -56,14 +56,14 @@ def init_pooling_state(
):
from vllm.entrypoints.chat_utils import load_chat_template
from vllm.entrypoints.pooling.classify.serving import ServingClassification
from vllm.entrypoints.pooling.embed.serving import OpenAIServingEmbedding
from vllm.entrypoints.pooling.embed.serving import ServingEmbedding
from vllm.entrypoints.pooling.pooling.serving import OpenAIServingPooling
from vllm.entrypoints.pooling.score.serving import ServingScores
from vllm.tasks import POOLING_TASKS
resolved_chat_template = load_chat_template(args.chat_template)
state.openai_serving_pooling = (
state.serving_pooling = (
(
OpenAIServingPooling(
engine_client,
......@@ -77,8 +77,8 @@ def init_pooling_state(
if any(t in supported_tasks for t in POOLING_TASKS)
else None
)
state.openai_serving_embedding = (
OpenAIServingEmbedding(
state.serving_embedding = (
ServingEmbedding(
engine_client,
state.openai_serving_models,
request_logger=request_logger,
......@@ -89,7 +89,7 @@ def init_pooling_state(
if "embed" in supported_tasks
else None
)
state.openai_serving_classification = (
state.serving_classification = (
ServingClassification(
engine_client,
state.openai_serving_models,
......@@ -105,7 +105,7 @@ def init_pooling_state(
# - "score" task (cross-encoder models)
# - "embed" task (bi-encoder models)
# - "token_embed" task (late interaction models like ColBERT)
state.openai_serving_scores = (
state.serving_scores = (
ServingScores(
engine_client,
state.openai_serving_models,
......
......@@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Callable, Sequence
from concurrent.futures import ThreadPoolExecutor
from typing import Any, Final
from vllm import PoolingRequestOutput, PromptType
......@@ -14,9 +13,13 @@ from vllm.entrypoints.chat_utils import (
ConversationMessage,
)
from vllm.entrypoints.openai.engine.serving import RendererChatRequest, RendererRequest
from vllm.inputs import ProcessorInputs, SingletonPrompt
from vllm.entrypoints.pooling.typing import (
PoolingChatLikeRequest,
PoolingCompletionLikeRequest,
PoolingServeContext,
)
from vllm.inputs.data import ProcessorInputs, SingletonPrompt
from vllm.renderers import BaseRenderer, merge_kwargs
from vllm.renderers.inputs import TokPrompt
from vllm.renderers.inputs.preprocess import parse_model_prompt, prompt_to_seq
from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers import ToolParser
......@@ -24,14 +27,14 @@ from vllm.utils.mistral import is_mistral_tokenizer
class PoolingIOProcessor:
name: str
def __init__(
self,
model_config: ModelConfig,
renderer: BaseRenderer,
chat_template_config: ChatTemplateConfig,
):
self._tokenizer_executor = ThreadPoolExecutor(max_workers=1)
self.model_config = model_config
self.renderer = renderer
......@@ -43,37 +46,90 @@ class PoolingIOProcessor:
chat_template_config.trust_request_chat_template
)
def pre_process_online(self, *args, **kwargs):
raise NotImplementedError
def create_pooling_params(self, request):
return request.to_pooling_params()
#######################################
# online APIs
def pre_process_online(self, ctx: PoolingServeContext):
request = ctx.request
if isinstance(ctx.request, PoolingChatLikeRequest):
self._validate_chat_template(
request_chat_template=request.chat_template,
chat_template_kwargs=request.chat_template_kwargs,
trust_request_chat_template=self.trust_request_chat_template,
)
_, engine_prompts = self._preprocess_chat_online(
request,
request.messages,
default_template=self.chat_template,
default_template_content_format=self.chat_template_content_format,
default_template_kwargs=None,
)
elif isinstance(request, PoolingCompletionLikeRequest):
engine_prompts = self._preprocess_completion_online(
request,
prompt_input=request.input,
prompt_embeds=None,
)
else:
raise ValueError(f"Invalid {self.name} request type")
ctx.engine_prompts = engine_prompts
async def pre_process_online_async(self, ctx: PoolingServeContext):
self.pre_process_online(ctx)
def post_process_online(
self,
ctx: PoolingServeContext,
):
pass
async def pre_process_online_async(self, *args, **kwargs):
return self.pre_process_online(*args, **kwargs)
async def post_process_online_async(
self,
ctx: PoolingServeContext,
):
self.post_process_online(ctx)
def pre_process_offline(self, *args, **kwargs):
raise NotImplementedError
#######################################
# offline APIs
def pre_process_offline(
self,
prompts: PromptType | Sequence[PromptType],
tokenization_kwargs: dict[str, Any] | None = None,
) -> Sequence[ProcessorInputs]:
return self._preprocess_completion_offline(
prompts=prompts, tokenization_kwargs=tokenization_kwargs
)
async def pre_process_offline_async(self, *args, **kwargs):
return self.pre_process_offline(*args, **kwargs)
def post_process(
self, outputs: list[PoolingRequestOutput]
def post_process_offline(
self,
outputs: list[PoolingRequestOutput],
) -> list[PoolingRequestOutput]:
return outputs
async def post_process_async(
self, outputs: list[PoolingRequestOutput]
async def post_process_offline_async(
self,
outputs: list[PoolingRequestOutput],
) -> list[PoolingRequestOutput]:
return self.post_process(outputs)
return self.post_process_offline(outputs)
def create_pooling_params(self, request):
return request.to_pooling_params()
#######################################
# helpers
def _preprocess_completion_online(
self,
request: RendererRequest,
prompt_input: str | list[str] | list[int] | list[list[int]] | None,
prompt_embeds: bytes | list[bytes] | None,
) -> list[TokPrompt]:
) -> list[ProcessorInputs]:
renderer = self.renderer
model_config = self.model_config
......@@ -112,7 +168,7 @@ class PoolingIOProcessor:
default_template_kwargs: dict[str, Any] | None,
tool_dicts: list[dict[str, Any]] | None = None,
tool_parser: Callable[[TokenizerLike], ToolParser] | None = None,
) -> tuple[list[ConversationMessage], list[TokPrompt]]:
) -> tuple[list[ConversationMessage], list[ProcessorInputs]]:
renderer = self.renderer
default_template_kwargs = merge_kwargs(
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import time
from collections.abc import AsyncGenerator, Mapping
from dataclasses import dataclass, field
from http import HTTPStatus
from typing import ClassVar, Generic, TypeVar
from typing import ClassVar
from fastapi import Request
from pydantic import ConfigDict
from fastapi.responses import Response
from starlette.datastructures import Headers
from starlette.responses import JSONResponse
from vllm import (
PoolingParams,
PoolingRequestOutput,
PromptType,
SamplingParams,
envs,
)
from vllm import PoolingParams, PoolingRequestOutput, envs
from vllm.config import ModelConfig
from vllm.engine.protocol import EngineClient
from vllm.entrypoints.chat_utils import (
......@@ -27,12 +18,12 @@ from vllm.entrypoints.chat_utils import (
from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.engine.protocol import ErrorResponse
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.pooling.typing import AnyPoolingRequest, AnyPoolingResponse
from vllm.inputs import ProcessorInputs
from vllm.entrypoints.pooling.typing import AnyPoolingRequest, PoolingServeContext
from vllm.exceptions import VLLMNotFoundError
from vllm.inputs.data import ProcessorInputs
from vllm.lora.request import LoRARequest
from vllm.renderers import BaseRenderer
from vllm.renderers.base import BaseRenderer
from vllm.renderers.inputs.preprocess import extract_prompt_components
from vllm.sampling_params import BeamSearchParams
from vllm.tracing import (
contains_trace_headers,
extract_trace_headers,
......@@ -43,26 +34,6 @@ from vllm.utils.async_utils import merge_async_iterators
from .io_processor import PoolingIOProcessor
PoolingRequestT = TypeVar("PoolingRequestT", bound=AnyPoolingRequest)
@dataclass(kw_only=True)
class PoolingServeContext(Generic[PoolingRequestT]):
request: PoolingRequestT
raw_request: Request | None = None
model_name: str
request_id: str
created_time: int = field(default_factory=lambda: int(time.time()))
lora_request: LoRARequest | None = None
engine_prompts: list[ProcessorInputs] | None = None
result_generator: AsyncGenerator[tuple[int, PoolingRequestOutput], None] | None = (
None
)
final_res_batch: list[PoolingRequestOutput] = field(default_factory=list)
model_config = ConfigDict(arbitrary_types_allowed=True)
class PoolingServing:
request_id_prefix: ClassVar[str]
......@@ -109,8 +80,8 @@ class PoolingServing:
async def __call__(
self,
request: AnyPoolingRequest,
raw_request: Request,
) -> JSONResponse:
raw_request: Request | None = None,
) -> Response:
model_name = self.models.model_name()
request_id = f"{self.request_id_prefix}-{self._base_request_id(raw_request)}"
......@@ -125,19 +96,11 @@ class PoolingServing:
self._validate_request(ctx)
self._maybe_get_adapters(ctx)
await self._preprocess(ctx)
await self.io_processor.pre_process_online_async(ctx)
await self._prepare_generators(ctx)
await self._collect_batch(ctx)
response = await self._build_response(ctx)
return JSONResponse(content=response.model_dump())
async def _preprocess(
self,
ctx: PoolingServeContext,
):
ctx.engine_prompts = await self.io_processor.pre_process_online_async(
ctx.request
)
await self.io_processor.post_process_online_async(ctx)
return await self._build_response(ctx)
async def _prepare_generators(
self,
......@@ -157,10 +120,14 @@ class PoolingServing:
pooling_params = self.io_processor.create_pooling_params(ctx.request)
for i, engine_prompt in enumerate(ctx.engine_prompts):
request_id_item = f"{ctx.request_id}-{i}"
prompt_request_id = (
f"{ctx.request_id}-{i}"
if ctx.prompt_request_ids is None
else ctx.prompt_request_ids[i]
)
self._log_inputs(
request_id_item,
prompt_request_id,
engine_prompt,
params=pooling_params,
lora_request=ctx.lora_request,
......@@ -169,7 +136,7 @@ class PoolingServing:
generator = self.engine_client.encode(
engine_prompt,
pooling_params,
request_id_item,
prompt_request_id,
lora_request=ctx.lora_request,
trace_headers=trace_headers,
priority=getattr(ctx.request, "priority", 0),
......@@ -189,9 +156,9 @@ class PoolingServing:
if ctx.result_generator is None:
raise ValueError("Result generator not available")
num_prompts = len(ctx.engine_prompts)
num_inputs = len(ctx.engine_prompts)
final_res_batch: list[PoolingRequestOutput | None]
final_res_batch = [None] * num_prompts
final_res_batch = [None] * num_inputs
async for i, res in ctx.result_generator:
final_res_batch[i] = res
......@@ -204,7 +171,7 @@ class PoolingServing:
async def _build_response(
self,
ctx: PoolingServeContext,
) -> AnyPoolingResponse:
) -> Response:
raise NotImplementedError
@staticmethod
......@@ -294,7 +261,7 @@ class PoolingServing:
return None
# if _check_model has been called earlier, this will be unreachable
raise ValueError(f"The model `{request.model}` does not exist.")
raise VLLMNotFoundError(f"The model `{request.model}` does not exist.")
def _get_active_default_mm_loras(
self, request: AnyPoolingRequest
......@@ -349,8 +316,8 @@ class PoolingServing:
def _log_inputs(
self,
request_id: str,
inputs: PromptType | ProcessorInputs,
params: SamplingParams | PoolingParams | BeamSearchParams | None,
inputs: ProcessorInputs,
params: PoolingParams,
lora_request: LoRARequest | None,
) -> None:
if self.request_logger is None:
......
......@@ -2,12 +2,10 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from fastapi import APIRouter, Depends, Request
from starlette.responses import JSONResponse
from fastapi.responses import JSONResponse, Response
from vllm.entrypoints.openai.utils import validate_json_request
from vllm.entrypoints.pooling.classify.protocol import (
ClassificationRequest,
)
from vllm.entrypoints.pooling.classify.protocol import ClassificationRequest
from vllm.entrypoints.pooling.classify.serving import ServingClassification
from vllm.entrypoints.utils import (
create_error_response,
......@@ -19,7 +17,7 @@ router = APIRouter()
def classify(request: Request) -> ServingClassification | None:
return request.app.state.openai_serving_classification
return request.app.state.serving_classification
@router.post("/classify", dependencies=[Depends(validate_json_request)])
......@@ -27,7 +25,7 @@ def classify(request: Request) -> ServingClassification | None:
@load_aware_call
async def create_classify(
request: ClassificationRequest, raw_request: Request
) -> JSONResponse:
) -> Response:
handler = classify(raw_request)
if handler is None:
error_response = create_error_response(
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Sequence
from typing import Any
from vllm import PromptType
from vllm.entrypoints.pooling.base.io_processor import PoolingIOProcessor
from vllm.entrypoints.pooling.classify.protocol import (
ClassificationChatRequest,
ClassificationCompletionRequest,
)
from vllm.inputs import ProcessorInputs
from vllm.renderers.inputs import TokPrompt
class ClassifyIOProcessor(PoolingIOProcessor):
def pre_process_online(
self, request: ClassificationCompletionRequest | ClassificationChatRequest
) -> list[TokPrompt] | None:
if isinstance(request, ClassificationChatRequest):
self._validate_chat_template(
request_chat_template=request.chat_template,
chat_template_kwargs=request.chat_template_kwargs,
trust_request_chat_template=self.trust_request_chat_template,
)
_, engine_prompts = self._preprocess_chat_online(
request,
request.messages,
default_template=self.chat_template,
default_template_content_format=self.chat_template_content_format,
default_template_kwargs=None,
)
elif isinstance(request, ClassificationCompletionRequest):
engine_prompts = self._preprocess_completion_online(
request,
prompt_input=request.input,
prompt_embeds=None,
)
else:
raise ValueError("Invalid classification request type")
return engine_prompts
def pre_process_offline(
self,
prompts: PromptType | Sequence[PromptType],
tokenization_kwargs: dict[str, Any] | None = None,
) -> Sequence[ProcessorInputs]:
return self._preprocess_completion_offline(
prompts=prompts, tokenization_kwargs=tokenization_kwargs
)
name = "classification"
......@@ -4,13 +4,15 @@
from typing import TypeAlias
import numpy as np
from fastapi.responses import JSONResponse
from vllm import ClassificationOutput
from vllm.config import ModelConfig
from vllm.entrypoints.chat_utils import ChatTemplateConfig
from vllm.entrypoints.openai.engine.protocol import UsageInfo
from vllm.entrypoints.pooling.base.serving import PoolingServeContext, PoolingServing
from vllm.entrypoints.pooling.base.serving import PoolingServing
from vllm.entrypoints.pooling.typing import PoolingServeContext
from vllm.logger import init_logger
from vllm.outputs import ClassificationOutput
from vllm.renderers import BaseRenderer
from .io_processor import ClassifyIOProcessor
......@@ -44,15 +46,11 @@ class ServingClassification(PoolingServing):
async def _build_response(
self,
ctx: ClassificationServeContext,
) -> ClassificationResponse:
final_res_batch_checked = await self.io_processor.post_process_async(
ctx.final_res_batch
)
) -> JSONResponse:
id2label = getattr(self.model_config.hf_config, "id2label", {})
num_prompt_tokens = 0
items: list[ClassificationData] = []
for idx, final_res in enumerate(final_res_batch_checked):
for idx, final_res in enumerate(ctx.final_res_batch):
classify_res = ClassificationOutput.from_base(final_res.outputs)
probs = classify_res.probs
......@@ -75,10 +73,12 @@ class ServingClassification(PoolingServing):
total_tokens=num_prompt_tokens,
)
return ClassificationResponse(
response = ClassificationResponse(
id=ctx.request_id,
created=ctx.created_time,
model=ctx.model_name,
data=items,
usage=usage,
)
return JSONResponse(content=response.model_dump())
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import importlib.util
from functools import lru_cache
from http import HTTPStatus
from fastapi import APIRouter, Depends, Request
from fastapi.responses import JSONResponse, StreamingResponse
from typing_extensions import assert_never
from fastapi.responses import JSONResponse
from vllm.entrypoints.openai.engine.protocol import ErrorResponse
from vllm.entrypoints.openai.utils import validate_json_request
from vllm.entrypoints.pooling.embed.protocol import (
EmbeddingBytesResponse,
EmbeddingRequest,
EmbeddingResponse,
from vllm.entrypoints.pooling.embed.protocol import EmbeddingRequest
from vllm.entrypoints.pooling.embed.serving import ServingEmbedding
from vllm.entrypoints.utils import (
create_error_response,
load_aware_call,
with_cancellation,
)
from vllm.entrypoints.pooling.embed.serving import OpenAIServingEmbedding
from vllm.entrypoints.utils import load_aware_call, with_cancellation
from vllm.logger import init_logger
router = APIRouter()
logger = init_logger(__name__)
@lru_cache(maxsize=1)
def _get_json_response_cls():
if importlib.util.find_spec("orjson") is not None:
from fastapi.responses import ORJSONResponse
return ORJSONResponse
logger.warning_once(
"To make v1/embeddings API fast, please install orjson by `pip install orjson`"
)
return JSONResponse
def embedding(request: Request) -> OpenAIServingEmbedding | None:
return request.app.state.openai_serving_embedding
def embedding(request: Request) -> ServingEmbedding | None:
return request.app.state.serving_embedding
@router.post(
......@@ -56,24 +39,11 @@ async def create_embedding(
):
handler = embedding(raw_request)
if handler is None:
base_server = raw_request.app.state.openai_serving_tokenization
return base_server.create_error_response(
error_response = create_error_response(
message="The model does not support Embeddings API"
)
generator = await handler.create_embedding(request, raw_request)
if isinstance(generator, ErrorResponse):
return JSONResponse(
content=generator.model_dump(), status_code=generator.error.code
content=error_response.model_dump(),
status_code=error_response.error.code,
)
elif isinstance(generator, EmbeddingResponse):
return _get_json_response_cls()(content=generator.model_dump())
elif isinstance(generator, EmbeddingBytesResponse):
return StreamingResponse(
content=generator.content,
headers=generator.headers,
media_type=generator.media_type,
)
assert_never(generator)
return await handler(request, raw_request)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any, cast
import torch
from vllm.entrypoints.pooling.base.io_processor import PoolingIOProcessor
from vllm.entrypoints.pooling.typing import PoolingServeContext
from vllm.inputs.data import ProcessorInputs, token_inputs
from vllm.outputs import PoolingOutput, PoolingRequestOutput
from vllm.utils.collection_utils import chunk_list
class EmbedIOProcessor(PoolingIOProcessor):
    """IO processor for the "embedding" pooling task.

    When ``pooler_config.enable_chunked_processing`` is set, online requests
    are split into chunks of at most ``model_config.max_model_len`` tokens
    before being sent to the engine, and the per-chunk embeddings are merged
    back into one embedding per original prompt using a token-count-weighted
    mean. When the flag is off, the base-class behaviour is used unchanged.
    """

    name = "embedding"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        assert self.model_config.pooler_config is not None
        self.pooler_config = self.model_config.pooler_config
        self.enable_chunked_processing = self.pooler_config.enable_chunked_processing

    #################################################################
    # Long Text Embedding with Chunked Processing
    # PTAL: examples/pooling/embed/openai_embedding_long_text

    @staticmethod
    def _parse_index(text: str, fallback: int) -> int:
        """Return ``int(text)``, or ``fallback`` when ``text`` is not an integer."""
        try:
            return int(text)
        except ValueError:
            return fallback

    def pre_process_online(self, ctx: PoolingServeContext):
        """Render the request and, if chunking is enabled, split long prompts.

        The un-chunked engine prompts are stashed in ``ctx.intermediates`` so
        that ``post_process_online`` can recover the original token ids.
        ``ctx.engine_prompts`` and ``ctx.prompt_request_ids`` are replaced by
        the per-chunk prompts and their
        ``{request_id}-prompt-{i}-chunk-{j}`` ids.

        Raises:
            ValueError: if the base pre-processing produced no engine prompts.
            NotImplementedError: if a prompt has no ``prompt_token_ids``.
        """
        super().pre_process_online(ctx)

        if not self.enable_chunked_processing:
            return None

        if ctx.engine_prompts is None:
            raise ValueError("Engine prompts not available")

        # Keep the original prompts; they are needed when re-assembling.
        ctx.intermediates = ctx.engine_prompts

        request_id = ctx.request_id
        max_model_len = self.model_config.max_model_len
        chunked_engine_prompts: list[ProcessorInputs] = []
        prompt_request_ids: list[str] = []

        for prompt_idx, engine_prompt in enumerate(ctx.engine_prompts):
            token_ids = engine_prompt.get("prompt_token_ids", None)
            if token_ids is None:
                raise NotImplementedError(
                    "Long Text Embedding with Chunked Processing does "
                    "not support EmbedsPrompt and EncoderDecoderInputs."
                )

            prompt_token_ids = cast(list[int], token_ids)
            for chunk_idx, chunk_tokens in enumerate(
                chunk_list(prompt_token_ids, max_model_len)
            ):
                chunked_engine_prompts.append(
                    token_inputs(prompt_token_ids=chunk_tokens)
                )
                prompt_request_ids.append(
                    f"{request_id}-prompt-{prompt_idx}-chunk-{chunk_idx}"
                )

        ctx.engine_prompts = chunked_engine_prompts
        ctx.prompt_request_ids = prompt_request_ids
        return None

    def post_process_online(
        self,
        ctx: PoolingServeContext,
    ):
        """Merge per-chunk results back into one result per original prompt.

        Each chunk embedding is weighted by its number of prompt tokens and
        accumulated in float32; the final embedding is the weighted sum
        divided by the total token count. ``ctx.final_res_batch`` is replaced
        with the merged results, ordered by original prompt index.

        Raises:
            ValueError: if results or original prompts are missing, a result
                has an unexpected type or no ``prompt_token_ids``, or a prompt
                has no (or an un-aggregatable) result.
            NotImplementedError: if an original prompt has no
                ``prompt_token_ids``.
        """
        if ctx.final_res_batch is None:
            raise ValueError("Final response batch not available")

        if not self.enable_chunked_processing:
            return super().post_process_online(ctx)

        # Online aggregation for chunked requests to
        # minimize memory usage
        # Track aggregation state for each prompt
        prompt_aggregators: dict[int, dict[str, Any]] = {}
        short_prompts_results: dict[int, PoolingRequestOutput] = {}

        for result_idx, result in enumerate(ctx.final_res_batch):
            # Parse from the RIGHT: the "-prompt-{i}-chunk-{j}" suffix is the
            # part generated by pre_process_online, while the prefix is the
            # request id, which may itself contain "prompt" or "-chunk-"
            # segments (NOTE(review): presumably it can originate from the
            # client; confirm against _base_request_id).
            base_request_id, chunk_sep, _ = result.request_id.rpartition("-chunk-")

            if not chunk_sep:
                # Non-chunked result - the last dash-separated part should be
                # the prompt index; fall back to the result position.
                tail = result.request_id.rpartition("-")[2]
                prompt_idx = self._parse_index(tail, result_idx)
                short_prompts_results[prompt_idx] = result
                continue

            _, prompt_sep, idx_text = base_request_id.rpartition("-prompt-")
            if prompt_sep:
                prompt_idx = self._parse_index(idx_text, result_idx)
            else:
                # Fallback: use the result position if parsing fails
                prompt_idx = result_idx

            # Initialize aggregator for this prompt if needed
            if prompt_idx not in prompt_aggregators:
                prompt_aggregators[prompt_idx] = {
                    "weighted_sum": None,
                    "total_weight": 0,
                    "chunk_count": 0,
                    "request_id": base_request_id,
                }

            aggregator = prompt_aggregators[prompt_idx]

            # MEAN pooling with online weighted averaging
            # Ensure result is PoolingRequestOutput
            # for embedding processing
            if not isinstance(result, PoolingRequestOutput):
                raise ValueError(
                    f"Expected PoolingRequestOutput for "
                    f"chunked embedding, got "
                    f"{type(result).__name__}"
                )

            if result.prompt_token_ids is None:
                raise ValueError(
                    "prompt_token_ids cannot be None for chunked processing"
                )

            # Weight every chunk by the number of tokens it covers.
            weight = len(result.prompt_token_ids)
            embedding_data = result.outputs.data
            weighted_embedding = embedding_data.to(dtype=torch.float32) * weight

            if aggregator["weighted_sum"] is None:
                # First chunk
                aggregator["weighted_sum"] = weighted_embedding
            else:
                # Accumulate
                aggregator["weighted_sum"] += weighted_embedding

            aggregator["total_weight"] += weight
            aggregator["chunk_count"] += 1

        if ctx.intermediates is None:
            raise ValueError("Original prompts inputs not available")

        original_engine_prompts = cast(list[ProcessorInputs], ctx.intermediates)
        num_prompts = len(original_engine_prompts)

        # Finalize aggregated results
        final_res_batch: list[PoolingRequestOutput] = []
        for prompt_idx in range(num_prompts):
            if prompt_idx in prompt_aggregators:
                # Finalize MEAN aggregation for this chunked prompt
                aggregator = prompt_aggregators[prompt_idx]
                weighted_sum = aggregator["weighted_sum"]
                total_weight = aggregator["total_weight"]

                can_finalize = (
                    weighted_sum is not None
                    and isinstance(weighted_sum, torch.Tensor)
                    and isinstance(total_weight, (int, float))
                    and total_weight > 0
                )
                if not can_finalize:
                    raise ValueError(
                        f"Failed to aggregate chunks for prompt {prompt_idx}"
                    )

                # Compute final mean embedding
                final_embedding = weighted_sum / total_weight

                # Create a PoolingRequestOutput
                # for the aggregated result
                pooling_output_data = PoolingOutput(data=final_embedding)

                # Get original prompt token IDs for this prompt
                original_prompt = original_engine_prompts[prompt_idx]
                token_ids = original_prompt.get("prompt_token_ids", None)
                if token_ids is None:
                    raise NotImplementedError(
                        "Long Text Embedding with Chunked Processing does "
                        "not support EmbedsPrompt and EncoderDecoderInputs."
                    )

                original_token_ids = cast(list[int], token_ids)
                pooling_request_output = PoolingRequestOutput(
                    request_id=aggregator["request_id"],
                    prompt_token_ids=original_token_ids,
                    outputs=pooling_output_data,
                    num_cached_tokens=0,
                    finished=True,
                )
                final_res_batch.append(pooling_request_output)
            elif prompt_idx in short_prompts_results:
                final_res_batch.append(short_prompts_results[prompt_idx])
            else:
                raise ValueError(f"Result not found for prompt {prompt_idx}")

        ctx.final_res_batch = final_res_batch
        return None
This diff is collapsed.
......@@ -15,17 +15,21 @@ def init_pooling_io_processors(
renderer: BaseRenderer,
chat_template_config: ChatTemplateConfig,
) -> dict[str, PoolingIOProcessor]:
pooling_io_processors: dict[str, PoolingIOProcessor] = {}
processors: list[tuple[str, type[PoolingIOProcessor]]] = []
if "classify" in supported_tasks:
from vllm.entrypoints.pooling.classify.io_processor import (
ClassifyIOProcessor,
)
from vllm.entrypoints.pooling.classify.io_processor import ClassifyIOProcessor
processors.append(("classify", ClassifyIOProcessor))
if "embed" in supported_tasks:
from vllm.entrypoints.pooling.embed.io_processor import EmbedIOProcessor
pooling_io_processors["classify"] = ClassifyIOProcessor(
# Register under "embed" (the task checked by the enclosing `if`); using
# "classify" here would overwrite the ClassifyIOProcessor entry in the
# returned dict and leave the "embed" task without a processor.
processors.append(("embed", EmbedIOProcessor))
return {
task: processor_cls(
model_config=model_config,
renderer=renderer,
chat_template_config=chat_template_config,
)
return pooling_io_processors
for task, processor_cls in processors
}
......@@ -21,7 +21,7 @@ router = APIRouter()
def pooling(request: Request) -> OpenAIServingPooling | None:
return request.app.state.openai_serving_pooling
return request.app.state.serving_pooling
@router.post(
......
......@@ -24,11 +24,11 @@ logger = init_logger(__name__)
def score(request: Request) -> ServingScores | None:
return request.app.state.openai_serving_scores
return request.app.state.serving_scores
def rerank(request: Request) -> ServingScores | None:
    """Return the handler used for rerank requests from the application state.

    Reads the same ``request.app.state.serving_scores`` attribute as the
    score accessor. The annotated return type allows ``None``.
    """
    return request.app.state.serving_scores
@router.post(
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import time
from collections.abc import AsyncGenerator
from dataclasses import dataclass, field
from typing import Any, Generic, TypeAlias, TypeVar
from typing import TypeAlias
from fastapi import Request
from pydantic import ConfigDict
from vllm import PoolingRequestOutput
from vllm.entrypoints.pooling.classify.protocol import (
ClassificationChatRequest,
ClassificationCompletionRequest,
......@@ -25,12 +31,12 @@ from vllm.entrypoints.pooling.score.protocol import (
ScoreRequest,
ScoreResponse,
)
from vllm.inputs import ProcessorInputs
from vllm.lora.request import LoRARequest
# Union of the completion-style pooling request models.
# NOTE(review): RerankRequest and ScoreRequest are also listed directly in
# AnyPoolingRequest further down — confirm whether they still belong here.
PoolingCompletionLikeRequest: TypeAlias = (
EmbeddingCompletionRequest
| ClassificationCompletionRequest
| RerankRequest
| ScoreRequest
| PoolingCompletionRequest
)
......@@ -39,7 +45,11 @@ PoolingChatLikeRequest: TypeAlias = (
)
# Union of every pooling request model: the completion-like and chat-like
# groups, IOProcessorRequest, and the rerank/score requests listed explicitly.
AnyPoolingRequest: TypeAlias = (
    PoolingCompletionLikeRequest
    | PoolingChatLikeRequest
    | IOProcessorRequest
    | RerankRequest
    | ScoreRequest
)
AnyPoolingResponse: TypeAlias = (
......@@ -49,3 +59,26 @@ AnyPoolingResponse: TypeAlias = (
| PoolingResponse
| ScoreResponse
)
# Type variable for a concrete pooling request model; bounded by
# AnyPoolingRequest, so it can only be bound to one of those request types.
PoolingRequestT = TypeVar("PoolingRequestT", bound=AnyPoolingRequest)
@dataclass(kw_only=True)
class PoolingServeContext(Generic[PoolingRequestT]):
    """State carried for a single pooling request.

    Every field is keyword-only (``kw_only=True``), which is what allows the
    required ``model_name`` and ``request_id`` to be declared after the
    defaulted ``raw_request``.
    """

    # The parsed request model; its concrete type is the class type parameter.
    request: PoolingRequestT
    # The FastAPI request object, when one is available.
    raw_request: Request | None = None
    model_name: str
    request_id: str
    # Unix time in whole seconds, captured when the context is constructed.
    created_time: int = field(default_factory=lambda: int(time.time()))
    lora_request: LoRARequest | None = None

    # NOTE(review): the fields below start out as None / empty; presumably
    # they are filled in as the request is processed — confirm against the
    # serving code that uses this context.
    engine_prompts: list[ProcessorInputs] | None = None
    prompt_request_ids: list[str] | None = None
    intermediates: Any | None = None
    result_generator: AsyncGenerator[tuple[int, PoolingRequestOutput], None] | None = (
        None
    )
    # default_factory gives each instance its own list.
    final_res_batch: list[PoolingRequestOutput] = field(default_factory=list)

    # Unannotated, so this is a plain class attribute, not a dataclass field.
    # NOTE(review): ConfigDict comes from pydantic; confirm it has any effect
    # on a class decorated with the standard-library ``dataclass``.
    model_config = ConfigDict(arbitrary_types_allowed=True)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import importlib.util
import math
from dataclasses import dataclass
from functools import lru_cache
from typing import Any
import pybase64
import torch
from fastapi.responses import JSONResponse
from vllm.logger import init_logger
from vllm.outputs import PoolingRequestOutput
from vllm.utils.serial_utils import (
EMBED_DTYPES,
......@@ -16,6 +21,8 @@ from vllm.utils.serial_utils import (
tensor2binary,
)
logger = init_logger(__name__)
@dataclass
class MetadataItem:
......@@ -122,3 +129,15 @@ def decode_pooling_output(items: list[MetadataItem], body: bytes) -> list[torch.
)
for item in sorted(items, key=lambda x: x.index)
]
@lru_cache(maxsize=1)
def get_json_response_cls() -> type[JSONResponse]:
    """Pick the JSON response class to use for API responses.

    Returns FastAPI's ``ORJSONResponse`` when the ``orjson`` package can be
    found, otherwise the plain ``JSONResponse`` after logging a warning that
    suggests installing ``orjson``. The result is memoized by ``lru_cache``,
    so the lookup is not repeated on later calls.
    """
    orjson_available = importlib.util.find_spec("orjson") is not None
    if not orjson_available:
        logger.warning_once(
            "To make v1/embeddings API fast, please install orjson by `pip install orjson`"
        )
        return JSONResponse

    # Imported only on this path, i.e. once orjson is known to be installed.
    from fastapi.responses import ORJSONResponse

    return ORJSONResponse
......@@ -303,12 +303,16 @@ def create_error_response(
if isinstance(message, Exception):
exc = message
from vllm.exceptions import VLLMValidationError
from vllm.exceptions import VLLMNotFoundError, VLLMValidationError
if isinstance(exc, VLLMValidationError):
err_type = "BadRequestError"
status_code = HTTPStatus.BAD_REQUEST
param = exc.parameter
elif isinstance(exc, VLLMNotFoundError):
err_type = "NotFoundError"
status_code = HTTPStatus.NOT_FOUND
param = None
elif isinstance(exc, (ValueError, TypeError, OverflowError)):
# Common validation errors from user input
err_type = "BadRequestError"
......
......@@ -34,3 +34,9 @@ class VLLMValidationError(ValueError):
if self.value is not None:
extras.append(f"value={self.value}")
return f"{base} ({', '.join(extras)})" if extras else base
class VLLMNotFoundError(ValueError):
    """Raised when something a request refers to cannot be found.

    This subclasses ``ValueError``, so any handler that wants to treat it
    differently from a generic ``ValueError`` must check for it first.
    """
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment