Unverified commit fff3711a, authored by wang.yuqi, committed by GitHub
Browse files

[Frontend][2/n] Improve pooling entrypoints | embed. (#36110)


Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
Signed-off-by: wang.yuqi <noooop@126.com>
parent c4d859c2
......@@ -25,7 +25,7 @@ ROCM_ATTN_BACKENDS = [
"FLEX_ATTENTION",
]
ATTN_BACKENDS = ROCM_ATTN_BACKENDS if current_platform.is_rocm() else []
ATTN_BACKENDS = ROCM_ATTN_BACKENDS if current_platform.is_rocm() else ["auto"]
# Per-backend tolerance with explicit entries; "default" is the fallback
BACKEND_TOL: dict[str, float] = {
......@@ -105,13 +105,16 @@ def server(request):
"8192",
"--chat-template",
str(VLLM_PATH / "examples/pooling/score/template/qwen3_vl_reranker.jinja"),
"--attention-config",
json.dumps({"backend": backend}),
] + ROCM_EXTRA_ARGS
]
env = dict(ROCM_ENV_OVERRIDES)
if backend != "ROCM_AITER_FA":
env["VLLM_ROCM_USE_AITER"] = "0"
env = dict()
if backend != "auto":
args += ["--attention-config", json.dumps({"backend": backend})]
args += ROCM_EXTRA_ARGS
env = dict(ROCM_ENV_OVERRIDES)
if backend != "ROCM_AITER_FA":
env["VLLM_ROCM_USE_AITER"] = "0"
with RemoteOpenAIServer(
MODEL_NAME, args, override_hf_configs=HF_OVERRIDES, env_dict=env
......
......@@ -397,7 +397,7 @@ class LLM:
self.io_processor = self.llm_engine.io_processor
self.input_processor = self.llm_engine.input_processor
self.chat_template_config = ChatTemplateConfig(chat_template=self.chat_template)
self.init_pooling_io_processors = init_pooling_io_processors(
self.pooling_io_processors = init_pooling_io_processors(
supported_tasks=supported_tasks,
model_config=self.model_config,
renderer=self.renderer,
......@@ -1174,8 +1174,8 @@ class LLM:
)
raise ValueError(msg)
if pooling_task in self.init_pooling_io_processors:
io_processor = self.init_pooling_io_processors[pooling_task]
if pooling_task in self.pooling_io_processors:
io_processor = self.pooling_io_processors[pooling_task]
processor_inputs = io_processor.pre_process_offline(
prompts_seq, tokenization_kwargs
)
......@@ -1194,7 +1194,7 @@ class LLM:
outputs = self._run_engine(
use_tqdm=use_tqdm, output_type=PoolingRequestOutput
)
outputs = io_processor.post_process(outputs)
outputs = io_processor.post_process_offline(outputs)
else:
outputs = self._run_completion(
prompts=prompts_seq,
......
......@@ -60,12 +60,6 @@ from vllm.entrypoints.openai.speech_to_text.protocol import (
TranscriptionResponse,
TranslationRequest,
)
from vllm.entrypoints.pooling.embed.protocol import (
EmbeddingBytesResponse,
EmbeddingChatRequest,
EmbeddingCompletionRequest,
EmbeddingResponse,
)
from vllm.entrypoints.pooling.pooling.protocol import (
IOProcessorRequest,
PoolingChatRequest,
......@@ -144,17 +138,13 @@ CompletionLikeRequest: TypeAlias = (
CompletionRequest
| TokenizeCompletionRequest
| DetokenizeRequest
| EmbeddingCompletionRequest
| RerankRequest
| ScoreRequest
| PoolingCompletionRequest
)
ChatLikeRequest: TypeAlias = (
ChatCompletionRequest
| TokenizeChatRequest
| EmbeddingChatRequest
| PoolingChatRequest
ChatCompletionRequest | TokenizeChatRequest | PoolingChatRequest
)
SpeechToTextRequest: TypeAlias = TranscriptionRequest | TranslationRequest
......@@ -171,8 +161,6 @@ AnyRequest: TypeAlias = (
AnyResponse: TypeAlias = (
CompletionResponse
| ChatCompletionResponse
| EmbeddingResponse
| EmbeddingBytesResponse
| TranscriptionResponse
| TokenizeResponse
| PoolingResponse
......@@ -203,8 +191,7 @@ class ServeContext(Generic[RequestT]):
class OpenAIServing:
request_id_prefix: ClassVar[str] = """
A short string prepended to every request’s ID (e.g. "embd")
so you can easily tell “this ID came from Embedding.”
A short string prepended to every request’s ID.
"""
def __init__(
......@@ -432,8 +419,7 @@ class OpenAIServing:
ctx: ServeContext,
) -> ErrorResponse | None:
"""
Default preprocessing hook. Subclasses may override
to prepare `ctx` (embedding, etc.).
Default preprocessing hook. Subclasses may override to prepare `ctx`.
"""
return None
......@@ -730,13 +716,10 @@ class OpenAIServing:
token_num = len(input_ids)
max_model_len = self.model_config.max_model_len
# Note: EmbeddingRequest,
# and ScoreRequest doesn't have max_tokens
# Note: ScoreRequest doesn't have max_tokens
if isinstance(
request,
(
EmbeddingChatRequest,
EmbeddingCompletionRequest,
ScoreDataRequest,
ScoreTextRequest,
ScoreQueriesDocumentsRequest,
......
......@@ -53,6 +53,7 @@ from vllm.entrypoints.pooling.score.protocol import (
ScoreRequest,
ScoreResponse,
)
from vllm.entrypoints.utils import create_error_response
from vllm.logger import init_logger
from vllm.reasoning import ReasoningParserManager
from vllm.utils import random_uuid
......@@ -503,7 +504,10 @@ async def run_request(
request: BatchRequestInput,
tracker: BatchProgressTracker,
) -> BatchRequestOutput:
response = await serving_engine_func(request.body)
try:
response = await serving_engine_func(request.body)
except Exception as e:
response = create_error_response(e)
if isinstance(
response,
......@@ -678,10 +682,10 @@ async def build_endpoint_registry(
# Get serving objects from state (defaulting to None if not set)
openai_serving_chat = getattr(state, "openai_serving_chat", None)
openai_serving_embedding = getattr(state, "openai_serving_embedding", None)
openai_serving_scores = getattr(state, "openai_serving_scores", None)
openai_serving_transcription = getattr(state, "openai_serving_transcription", None)
openai_serving_translation = getattr(state, "openai_serving_translation", None)
serving_embedding = getattr(state, "serving_embedding", None)
serving_scores = getattr(state, "serving_scores", None)
# Registry of endpoint configurations
endpoint_registry: dict[str, dict[str, Any]] = {
......@@ -697,27 +701,21 @@ async def build_endpoint_registry(
"embeddings": {
"url_matcher": lambda url: url == "/v1/embeddings",
"handler_getter": lambda: (
openai_serving_embedding.create_embedding
if openai_serving_embedding is not None
else None
serving_embedding if serving_embedding is not None else None
),
"wrapper_fn": None,
},
"score": {
"url_matcher": lambda url: url.endswith("/score"),
"handler_getter": lambda: (
openai_serving_scores.create_score
if openai_serving_scores is not None
else None
serving_scores.create_score if serving_scores is not None else None
),
"wrapper_fn": None,
},
"rerank": {
"url_matcher": lambda url: url.endswith("/rerank"),
"handler_getter": lambda: (
openai_serving_scores.do_rerank
if openai_serving_scores is not None
else None
serving_scores.do_rerank if serving_scores is not None else None
),
"wrapper_fn": None,
},
......
......@@ -56,14 +56,14 @@ def init_pooling_state(
):
from vllm.entrypoints.chat_utils import load_chat_template
from vllm.entrypoints.pooling.classify.serving import ServingClassification
from vllm.entrypoints.pooling.embed.serving import OpenAIServingEmbedding
from vllm.entrypoints.pooling.embed.serving import ServingEmbedding
from vllm.entrypoints.pooling.pooling.serving import OpenAIServingPooling
from vllm.entrypoints.pooling.score.serving import ServingScores
from vllm.tasks import POOLING_TASKS
resolved_chat_template = load_chat_template(args.chat_template)
state.openai_serving_pooling = (
state.serving_pooling = (
(
OpenAIServingPooling(
engine_client,
......@@ -77,8 +77,8 @@ def init_pooling_state(
if any(t in supported_tasks for t in POOLING_TASKS)
else None
)
state.openai_serving_embedding = (
OpenAIServingEmbedding(
state.serving_embedding = (
ServingEmbedding(
engine_client,
state.openai_serving_models,
request_logger=request_logger,
......@@ -89,7 +89,7 @@ def init_pooling_state(
if "embed" in supported_tasks
else None
)
state.openai_serving_classification = (
state.serving_classification = (
ServingClassification(
engine_client,
state.openai_serving_models,
......@@ -105,7 +105,7 @@ def init_pooling_state(
# - "score" task (cross-encoder models)
# - "embed" task (bi-encoder models)
# - "token_embed" task (late interaction models like ColBERT)
state.openai_serving_scores = (
state.serving_scores = (
ServingScores(
engine_client,
state.openai_serving_models,
......
......@@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Callable, Sequence
from concurrent.futures import ThreadPoolExecutor
from typing import Any, Final
from vllm import PoolingRequestOutput, PromptType
......@@ -14,9 +13,13 @@ from vllm.entrypoints.chat_utils import (
ConversationMessage,
)
from vllm.entrypoints.openai.engine.serving import RendererChatRequest, RendererRequest
from vllm.inputs import ProcessorInputs, SingletonPrompt
from vllm.entrypoints.pooling.typing import (
PoolingChatLikeRequest,
PoolingCompletionLikeRequest,
PoolingServeContext,
)
from vllm.inputs.data import ProcessorInputs, SingletonPrompt
from vllm.renderers import BaseRenderer, merge_kwargs
from vllm.renderers.inputs import TokPrompt
from vllm.renderers.inputs.preprocess import parse_model_prompt, prompt_to_seq
from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers import ToolParser
......@@ -24,14 +27,14 @@ from vllm.utils.mistral import is_mistral_tokenizer
class PoolingIOProcessor:
name: str
def __init__(
self,
model_config: ModelConfig,
renderer: BaseRenderer,
chat_template_config: ChatTemplateConfig,
):
self._tokenizer_executor = ThreadPoolExecutor(max_workers=1)
self.model_config = model_config
self.renderer = renderer
......@@ -43,37 +46,90 @@ class PoolingIOProcessor:
chat_template_config.trust_request_chat_template
)
def pre_process_online(self, *args, **kwargs):
raise NotImplementedError
def create_pooling_params(self, request):
return request.to_pooling_params()
#######################################
# online APIs
def pre_process_online(self, ctx: PoolingServeContext):
request = ctx.request
if isinstance(ctx.request, PoolingChatLikeRequest):
self._validate_chat_template(
request_chat_template=request.chat_template,
chat_template_kwargs=request.chat_template_kwargs,
trust_request_chat_template=self.trust_request_chat_template,
)
_, engine_prompts = self._preprocess_chat_online(
request,
request.messages,
default_template=self.chat_template,
default_template_content_format=self.chat_template_content_format,
default_template_kwargs=None,
)
elif isinstance(request, PoolingCompletionLikeRequest):
engine_prompts = self._preprocess_completion_online(
request,
prompt_input=request.input,
prompt_embeds=None,
)
else:
raise ValueError(f"Invalid {self.name} request type")
ctx.engine_prompts = engine_prompts
async def pre_process_online_async(self, ctx: PoolingServeContext):
self.pre_process_online(ctx)
def post_process_online(
self,
ctx: PoolingServeContext,
):
pass
async def pre_process_online_async(self, *args, **kwargs):
return self.pre_process_online(*args, **kwargs)
async def post_process_online_async(
self,
ctx: PoolingServeContext,
):
self.post_process_online(ctx)
def pre_process_offline(self, *args, **kwargs):
raise NotImplementedError
#######################################
# offline APIs
def pre_process_offline(
self,
prompts: PromptType | Sequence[PromptType],
tokenization_kwargs: dict[str, Any] | None = None,
) -> Sequence[ProcessorInputs]:
return self._preprocess_completion_offline(
prompts=prompts, tokenization_kwargs=tokenization_kwargs
)
async def pre_process_offline_async(self, *args, **kwargs):
return self.pre_process_offline(*args, **kwargs)
def post_process(
self, outputs: list[PoolingRequestOutput]
def post_process_offline(
self,
outputs: list[PoolingRequestOutput],
) -> list[PoolingRequestOutput]:
return outputs
async def post_process_async(
self, outputs: list[PoolingRequestOutput]
async def post_process_offline_async(
self,
outputs: list[PoolingRequestOutput],
) -> list[PoolingRequestOutput]:
return self.post_process(outputs)
return self.post_process_offline(outputs)
def create_pooling_params(self, request):
return request.to_pooling_params()
#######################################
# helpers
def _preprocess_completion_online(
self,
request: RendererRequest,
prompt_input: str | list[str] | list[int] | list[list[int]] | None,
prompt_embeds: bytes | list[bytes] | None,
) -> list[TokPrompt]:
) -> list[ProcessorInputs]:
renderer = self.renderer
model_config = self.model_config
......@@ -112,7 +168,7 @@ class PoolingIOProcessor:
default_template_kwargs: dict[str, Any] | None,
tool_dicts: list[dict[str, Any]] | None = None,
tool_parser: Callable[[TokenizerLike], ToolParser] | None = None,
) -> tuple[list[ConversationMessage], list[TokPrompt]]:
) -> tuple[list[ConversationMessage], list[ProcessorInputs]]:
renderer = self.renderer
default_template_kwargs = merge_kwargs(
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import time
from collections.abc import AsyncGenerator, Mapping
from dataclasses import dataclass, field
from http import HTTPStatus
from typing import ClassVar, Generic, TypeVar
from typing import ClassVar
from fastapi import Request
from pydantic import ConfigDict
from fastapi.responses import Response
from starlette.datastructures import Headers
from starlette.responses import JSONResponse
from vllm import (
PoolingParams,
PoolingRequestOutput,
PromptType,
SamplingParams,
envs,
)
from vllm import PoolingParams, PoolingRequestOutput, envs
from vllm.config import ModelConfig
from vllm.engine.protocol import EngineClient
from vllm.entrypoints.chat_utils import (
......@@ -27,12 +18,12 @@ from vllm.entrypoints.chat_utils import (
from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.engine.protocol import ErrorResponse
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.pooling.typing import AnyPoolingRequest, AnyPoolingResponse
from vllm.inputs import ProcessorInputs
from vllm.entrypoints.pooling.typing import AnyPoolingRequest, PoolingServeContext
from vllm.exceptions import VLLMNotFoundError
from vllm.inputs.data import ProcessorInputs
from vllm.lora.request import LoRARequest
from vllm.renderers import BaseRenderer
from vllm.renderers.base import BaseRenderer
from vllm.renderers.inputs.preprocess import extract_prompt_components
from vllm.sampling_params import BeamSearchParams
from vllm.tracing import (
contains_trace_headers,
extract_trace_headers,
......@@ -43,26 +34,6 @@ from vllm.utils.async_utils import merge_async_iterators
from .io_processor import PoolingIOProcessor
PoolingRequestT = TypeVar("PoolingRequestT", bound=AnyPoolingRequest)
@dataclass(kw_only=True)
class PoolingServeContext(Generic[PoolingRequestT]):
request: PoolingRequestT
raw_request: Request | None = None
model_name: str
request_id: str
created_time: int = field(default_factory=lambda: int(time.time()))
lora_request: LoRARequest | None = None
engine_prompts: list[ProcessorInputs] | None = None
result_generator: AsyncGenerator[tuple[int, PoolingRequestOutput], None] | None = (
None
)
final_res_batch: list[PoolingRequestOutput] = field(default_factory=list)
model_config = ConfigDict(arbitrary_types_allowed=True)
class PoolingServing:
request_id_prefix: ClassVar[str]
......@@ -109,8 +80,8 @@ class PoolingServing:
async def __call__(
self,
request: AnyPoolingRequest,
raw_request: Request,
) -> JSONResponse:
raw_request: Request | None = None,
) -> Response:
model_name = self.models.model_name()
request_id = f"{self.request_id_prefix}-{self._base_request_id(raw_request)}"
......@@ -125,19 +96,11 @@ class PoolingServing:
self._validate_request(ctx)
self._maybe_get_adapters(ctx)
await self._preprocess(ctx)
await self.io_processor.pre_process_online_async(ctx)
await self._prepare_generators(ctx)
await self._collect_batch(ctx)
response = await self._build_response(ctx)
return JSONResponse(content=response.model_dump())
async def _preprocess(
self,
ctx: PoolingServeContext,
):
ctx.engine_prompts = await self.io_processor.pre_process_online_async(
ctx.request
)
await self.io_processor.post_process_online_async(ctx)
return await self._build_response(ctx)
async def _prepare_generators(
self,
......@@ -157,10 +120,14 @@ class PoolingServing:
pooling_params = self.io_processor.create_pooling_params(ctx.request)
for i, engine_prompt in enumerate(ctx.engine_prompts):
request_id_item = f"{ctx.request_id}-{i}"
prompt_request_id = (
f"{ctx.request_id}-{i}"
if ctx.prompt_request_ids is None
else ctx.prompt_request_ids[i]
)
self._log_inputs(
request_id_item,
prompt_request_id,
engine_prompt,
params=pooling_params,
lora_request=ctx.lora_request,
......@@ -169,7 +136,7 @@ class PoolingServing:
generator = self.engine_client.encode(
engine_prompt,
pooling_params,
request_id_item,
prompt_request_id,
lora_request=ctx.lora_request,
trace_headers=trace_headers,
priority=getattr(ctx.request, "priority", 0),
......@@ -189,9 +156,9 @@ class PoolingServing:
if ctx.result_generator is None:
raise ValueError("Result generator not available")
num_prompts = len(ctx.engine_prompts)
num_inputs = len(ctx.engine_prompts)
final_res_batch: list[PoolingRequestOutput | None]
final_res_batch = [None] * num_prompts
final_res_batch = [None] * num_inputs
async for i, res in ctx.result_generator:
final_res_batch[i] = res
......@@ -204,7 +171,7 @@ class PoolingServing:
async def _build_response(
self,
ctx: PoolingServeContext,
) -> AnyPoolingResponse:
) -> Response:
raise NotImplementedError
@staticmethod
......@@ -294,7 +261,7 @@ class PoolingServing:
return None
# if _check_model has been called earlier, this will be unreachable
raise ValueError(f"The model `{request.model}` does not exist.")
raise VLLMNotFoundError(f"The model `{request.model}` does not exist.")
def _get_active_default_mm_loras(
self, request: AnyPoolingRequest
......@@ -349,8 +316,8 @@ class PoolingServing:
def _log_inputs(
self,
request_id: str,
inputs: PromptType | ProcessorInputs,
params: SamplingParams | PoolingParams | BeamSearchParams | None,
inputs: ProcessorInputs,
params: PoolingParams,
lora_request: LoRARequest | None,
) -> None:
if self.request_logger is None:
......
......@@ -2,12 +2,10 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from fastapi import APIRouter, Depends, Request
from starlette.responses import JSONResponse
from fastapi.responses import JSONResponse, Response
from vllm.entrypoints.openai.utils import validate_json_request
from vllm.entrypoints.pooling.classify.protocol import (
ClassificationRequest,
)
from vllm.entrypoints.pooling.classify.protocol import ClassificationRequest
from vllm.entrypoints.pooling.classify.serving import ServingClassification
from vllm.entrypoints.utils import (
create_error_response,
......@@ -19,7 +17,7 @@ router = APIRouter()
def classify(request: Request) -> ServingClassification | None:
return request.app.state.openai_serving_classification
return request.app.state.serving_classification
@router.post("/classify", dependencies=[Depends(validate_json_request)])
......@@ -27,7 +25,7 @@ def classify(request: Request) -> ServingClassification | None:
@load_aware_call
async def create_classify(
request: ClassificationRequest, raw_request: Request
) -> JSONResponse:
) -> Response:
handler = classify(raw_request)
if handler is None:
error_response = create_error_response(
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Sequence
from typing import Any
from vllm import PromptType
from vllm.entrypoints.pooling.base.io_processor import PoolingIOProcessor
from vllm.entrypoints.pooling.classify.protocol import (
ClassificationChatRequest,
ClassificationCompletionRequest,
)
from vllm.inputs import ProcessorInputs
from vllm.renderers.inputs import TokPrompt
class ClassifyIOProcessor(PoolingIOProcessor):
def pre_process_online(
self, request: ClassificationCompletionRequest | ClassificationChatRequest
) -> list[TokPrompt] | None:
if isinstance(request, ClassificationChatRequest):
self._validate_chat_template(
request_chat_template=request.chat_template,
chat_template_kwargs=request.chat_template_kwargs,
trust_request_chat_template=self.trust_request_chat_template,
)
_, engine_prompts = self._preprocess_chat_online(
request,
request.messages,
default_template=self.chat_template,
default_template_content_format=self.chat_template_content_format,
default_template_kwargs=None,
)
elif isinstance(request, ClassificationCompletionRequest):
engine_prompts = self._preprocess_completion_online(
request,
prompt_input=request.input,
prompt_embeds=None,
)
else:
raise ValueError("Invalid classification request type")
return engine_prompts
def pre_process_offline(
self,
prompts: PromptType | Sequence[PromptType],
tokenization_kwargs: dict[str, Any] | None = None,
) -> Sequence[ProcessorInputs]:
return self._preprocess_completion_offline(
prompts=prompts, tokenization_kwargs=tokenization_kwargs
)
name = "classification"
......@@ -4,13 +4,15 @@
from typing import TypeAlias
import numpy as np
from fastapi.responses import JSONResponse
from vllm import ClassificationOutput
from vllm.config import ModelConfig
from vllm.entrypoints.chat_utils import ChatTemplateConfig
from vllm.entrypoints.openai.engine.protocol import UsageInfo
from vllm.entrypoints.pooling.base.serving import PoolingServeContext, PoolingServing
from vllm.entrypoints.pooling.base.serving import PoolingServing
from vllm.entrypoints.pooling.typing import PoolingServeContext
from vllm.logger import init_logger
from vllm.outputs import ClassificationOutput
from vllm.renderers import BaseRenderer
from .io_processor import ClassifyIOProcessor
......@@ -44,15 +46,11 @@ class ServingClassification(PoolingServing):
async def _build_response(
self,
ctx: ClassificationServeContext,
) -> ClassificationResponse:
final_res_batch_checked = await self.io_processor.post_process_async(
ctx.final_res_batch
)
) -> JSONResponse:
id2label = getattr(self.model_config.hf_config, "id2label", {})
num_prompt_tokens = 0
items: list[ClassificationData] = []
for idx, final_res in enumerate(final_res_batch_checked):
for idx, final_res in enumerate(ctx.final_res_batch):
classify_res = ClassificationOutput.from_base(final_res.outputs)
probs = classify_res.probs
......@@ -75,10 +73,12 @@ class ServingClassification(PoolingServing):
total_tokens=num_prompt_tokens,
)
return ClassificationResponse(
response = ClassificationResponse(
id=ctx.request_id,
created=ctx.created_time,
model=ctx.model_name,
data=items,
usage=usage,
)
return JSONResponse(content=response.model_dump())
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import importlib.util
from functools import lru_cache
from http import HTTPStatus
from fastapi import APIRouter, Depends, Request
from fastapi.responses import JSONResponse, StreamingResponse
from typing_extensions import assert_never
from fastapi.responses import JSONResponse
from vllm.entrypoints.openai.engine.protocol import ErrorResponse
from vllm.entrypoints.openai.utils import validate_json_request
from vllm.entrypoints.pooling.embed.protocol import (
EmbeddingBytesResponse,
EmbeddingRequest,
EmbeddingResponse,
from vllm.entrypoints.pooling.embed.protocol import EmbeddingRequest
from vllm.entrypoints.pooling.embed.serving import ServingEmbedding
from vllm.entrypoints.utils import (
create_error_response,
load_aware_call,
with_cancellation,
)
from vllm.entrypoints.pooling.embed.serving import OpenAIServingEmbedding
from vllm.entrypoints.utils import load_aware_call, with_cancellation
from vllm.logger import init_logger
router = APIRouter()
logger = init_logger(__name__)
@lru_cache(maxsize=1)
def _get_json_response_cls():
if importlib.util.find_spec("orjson") is not None:
from fastapi.responses import ORJSONResponse
return ORJSONResponse
logger.warning_once(
"To make v1/embeddings API fast, please install orjson by `pip install orjson`"
)
return JSONResponse
def embedding(request: Request) -> OpenAIServingEmbedding | None:
return request.app.state.openai_serving_embedding
def embedding(request: Request) -> ServingEmbedding | None:
return request.app.state.serving_embedding
@router.post(
......@@ -56,24 +39,11 @@ async def create_embedding(
):
handler = embedding(raw_request)
if handler is None:
base_server = raw_request.app.state.openai_serving_tokenization
return base_server.create_error_response(
error_response = create_error_response(
message="The model does not support Embeddings API"
)
generator = await handler.create_embedding(request, raw_request)
if isinstance(generator, ErrorResponse):
return JSONResponse(
content=generator.model_dump(), status_code=generator.error.code
content=error_response.model_dump(),
status_code=error_response.error.code,
)
elif isinstance(generator, EmbeddingResponse):
return _get_json_response_cls()(content=generator.model_dump())
elif isinstance(generator, EmbeddingBytesResponse):
return StreamingResponse(
content=generator.content,
headers=generator.headers,
media_type=generator.media_type,
)
assert_never(generator)
return await handler(request, raw_request)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any, cast
import torch
from vllm.entrypoints.pooling.base.io_processor import PoolingIOProcessor
from vllm.entrypoints.pooling.typing import PoolingServeContext
from vllm.inputs.data import ProcessorInputs, token_inputs
from vllm.outputs import PoolingOutput, PoolingRequestOutput
from vllm.utils.collection_utils import chunk_list
class EmbedIOProcessor(PoolingIOProcessor):
    """IO processor for the embedding task, with optional long-text chunking.

    When ``pooler_config.enable_chunked_processing`` is set, every tokenized
    prompt is split into chunks of at most ``max_model_len`` tokens before it
    is handed to the engine, and the per-chunk embeddings are merged back into
    a single embedding per original prompt using a mean weighted by each
    chunk's token count.
    """

    name = "embedding"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        assert self.model_config.pooler_config is not None
        self.pooler_config = self.model_config.pooler_config
        self.enable_chunked_processing = self.pooler_config.enable_chunked_processing

    #################################################################
    # Long Text Embedding with Chunked Processing
    # PTAL: examples/pooling/embed/openai_embedding_long_text

    @staticmethod
    def _parse_chunk_request_id(request_id: str) -> tuple[str, int | None]:
        """Split a chunk request ID into ``(prompt_request_id, prompt_idx)``.

        Chunk IDs are built by ``pre_process_online`` as
        ``{request_id}-prompt-{prompt_idx}-chunk-{chunk_idx}``. The ID is
        parsed from the right so that only the suffix appended there is
        interpreted; a base request ID that happens to contain ``prompt`` or
        ``-chunk-`` (presumably possible when the ID is client-supplied --
        TODO confirm) cannot shadow the real markers.

        Returns:
            The ID with the ``-chunk-{chunk_idx}`` suffix removed, and the
            prompt index, or ``None`` when the index cannot be parsed.
        """
        prompt_request_id, _, _ = request_id.rpartition("-chunk-")
        _, marker, prompt_part = prompt_request_id.rpartition("-prompt-")
        if not marker:
            return prompt_request_id, None
        try:
            return prompt_request_id, int(prompt_part)
        except ValueError:
            return prompt_request_id, None

    def pre_process_online(self, ctx: PoolingServeContext):
        """Render the request into engine prompts, chunking them if enabled.

        With chunked processing enabled, ``ctx.engine_prompts`` is replaced by
        one token-only prompt per chunk, ``ctx.prompt_request_ids`` receives
        the matching per-chunk request IDs, and the un-chunked prompts are
        kept in ``ctx.intermediates`` for ``post_process_online``.

        Raises:
            ValueError: If the base preprocessing produced no engine prompts.
            NotImplementedError: If a prompt has no ``prompt_token_ids``.
        """
        super().pre_process_online(ctx)

        if not self.enable_chunked_processing:
            return None

        if ctx.engine_prompts is None:
            raise ValueError("Engine prompts not available")

        # Keep the original prompts: post-processing needs their count and
        # their full token IDs to rebuild one output per prompt.
        ctx.intermediates = ctx.engine_prompts

        request_id = ctx.request_id
        max_model_len = self.model_config.max_model_len
        chunked_engine_prompts: list[ProcessorInputs] = []
        prompt_request_ids: list[str] = []
        for prompt_idx, engine_prompt in enumerate(ctx.engine_prompts):
            token_ids = engine_prompt.get("prompt_token_ids", None)
            if token_ids is None:
                raise NotImplementedError(
                    "Long Text Embedding with Chunked Processing does "
                    "not support EmbedsPrompt and EncoderDecoderInputs."
                )

            prompt_token_ids = cast(list[int], token_ids)
            for chunk_idx, chunk_tokens in enumerate(
                chunk_list(prompt_token_ids, max_model_len)
            ):
                chunked_engine_prompts.append(
                    token_inputs(prompt_token_ids=chunk_tokens)
                )
                prompt_request_ids.append(
                    f"{request_id}-prompt-{prompt_idx}-chunk-{chunk_idx}"
                )

        ctx.engine_prompts = chunked_engine_prompts
        ctx.prompt_request_ids = prompt_request_ids

        return None

    def post_process_online(
        self,
        ctx: PoolingServeContext,
    ):
        """Merge per-chunk results back into one result per original prompt.

        Without chunked processing this defers to the base implementation.
        Otherwise ``ctx.final_res_batch`` is replaced by a list holding, for
        each original prompt in order, either its token-count-weighted mean
        embedding (accumulated in float32) or its un-chunked result.

        Raises:
            ValueError: If results or original prompts are missing, a chunk
                result has an unexpected type or no token IDs, or a prompt
                ends up without any usable result.
            NotImplementedError: If an original prompt has no
                ``prompt_token_ids``.
        """
        if ctx.final_res_batch is None:
            raise ValueError("Final response batch not available")

        if not self.enable_chunked_processing:
            return super().post_process_online(ctx)

        # Online aggregation for chunked requests to
        # minimize memory usage
        # Track aggregation state for each prompt
        prompt_aggregators: dict[int, dict[str, Any]] = {}
        short_prompts_results: dict[int, PoolingRequestOutput] = {}

        for result_idx, result in enumerate(ctx.final_res_batch):
            if "-chunk-" not in result.request_id:
                # Non-chunked result: the last dash-separated part should be
                # the prompt index; otherwise fall back to the result index.
                try:
                    prompt_idx = int(result.request_id.rsplit("-", 1)[-1])
                except ValueError:
                    prompt_idx = result_idx
                short_prompts_results[prompt_idx] = result
                continue

            prompt_request_id, parsed_idx = self._parse_chunk_request_id(
                result.request_id
            )
            # Fallback kept for IDs that were not produced by
            # pre_process_online; IDs built there always parse.
            prompt_idx = result_idx if parsed_idx is None else parsed_idx

            # Initialize aggregator for this prompt if needed
            aggregator = prompt_aggregators.setdefault(
                prompt_idx,
                {
                    "weighted_sum": None,
                    "total_weight": 0,
                    "request_id": prompt_request_id,
                },
            )

            # MEAN pooling with online weighted averaging
            if not isinstance(result, PoolingRequestOutput):
                raise ValueError(
                    f"Expected PoolingRequestOutput for "
                    f"chunked embedding, got "
                    f"{type(result).__name__}"
                )

            if result.prompt_token_ids is None:
                raise ValueError(
                    "prompt_token_ids cannot be None for chunked processing"
                )

            # Each chunk contributes in proportion to its token count.
            weight = len(result.prompt_token_ids)
            embedding_data = result.outputs.data
            weighted_embedding = embedding_data.to(dtype=torch.float32) * weight

            if aggregator["weighted_sum"] is None:
                # First chunk
                aggregator["weighted_sum"] = weighted_embedding
            else:
                # Accumulate
                aggregator["weighted_sum"] += weighted_embedding

            aggregator["total_weight"] += weight

        if ctx.intermediates is None:
            raise ValueError("Original prompts inputs not available")

        original_engine_prompts = cast(list[ProcessorInputs], ctx.intermediates)

        # Finalize aggregated results, one per original prompt, in order
        final_res_batch: list[PoolingRequestOutput] = []
        for prompt_idx, original_prompt in enumerate(original_engine_prompts):
            if prompt_idx in prompt_aggregators:
                # Finalize MEAN aggregation for this chunked prompt
                aggregator = prompt_aggregators[prompt_idx]
                weighted_sum = aggregator["weighted_sum"]
                total_weight = aggregator["total_weight"]

                if not (
                    isinstance(weighted_sum, torch.Tensor)
                    and isinstance(total_weight, (int, float))
                    and total_weight > 0
                ):
                    raise ValueError(
                        f"Failed to aggregate chunks for prompt {prompt_idx}"
                    )

                # Compute final mean embedding
                final_embedding = weighted_sum / total_weight

                # Report the original (un-chunked) token IDs for this prompt
                token_ids = original_prompt.get("prompt_token_ids", None)
                if token_ids is None:
                    raise NotImplementedError(
                        "Long Text Embedding with Chunked Processing does "
                        "not support EmbedsPrompt and EncoderDecoderInputs."
                    )

                final_res_batch.append(
                    PoolingRequestOutput(
                        request_id=aggregator["request_id"],
                        prompt_token_ids=cast(list[int], token_ids),
                        outputs=PoolingOutput(data=final_embedding),
                        num_cached_tokens=0,
                        finished=True,
                    )
                )
            elif prompt_idx in short_prompts_results:
                final_res_batch.append(short_prompts_results[prompt_idx])
            else:
                raise ValueError(f"Result not found for prompt {prompt_idx}")

        ctx.final_res_batch = final_res_batch

        return None
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
from collections.abc import AsyncGenerator, Callable, Mapping
from collections.abc import Callable
from functools import partial
from typing import Any, Final, Literal, TypeAlias, cast
from typing import Literal, TypeAlias, cast
import torch
from fastapi import Request
from fastapi.responses import JSONResponse, StreamingResponse
from typing_extensions import assert_never
from vllm.engine.protocol import EngineClient
from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption
from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.engine.protocol import ErrorResponse, UsageInfo
from vllm.entrypoints.openai.engine.serving import OpenAIServing, ServeContext
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.config import ModelConfig
from vllm.entrypoints.chat_utils import ChatTemplateConfig
from vllm.entrypoints.openai.engine.protocol import UsageInfo
from vllm.entrypoints.pooling.base.serving import PoolingServing
from vllm.entrypoints.pooling.embed.io_processor import EmbedIOProcessor
from vllm.entrypoints.pooling.embed.protocol import (
EmbeddingBytesResponse,
EmbeddingChatRequest,
EmbeddingCompletionRequest,
EmbeddingRequest,
EmbeddingResponse,
EmbeddingResponseData,
)
from vllm.entrypoints.pooling.typing import PoolingServeContext
from vllm.entrypoints.pooling.utils import (
encode_pooling_bytes,
encode_pooling_output_base64,
encode_pooling_output_float,
get_json_response_cls,
)
from vllm.inputs.data import ProcessorInputs, TokensPrompt, token_inputs
from vllm.logger import init_logger
from vllm.outputs import PoolingOutput, PoolingRequestOutput
from vllm.pooling_params import PoolingParams
from vllm.utils.async_utils import merge_async_iterators
from vllm.utils.collection_utils import chunk_list
from vllm.outputs import PoolingRequestOutput
from vllm.renderers import BaseRenderer
from vllm.utils.serial_utils import EmbedDType, Endianness
logger = init_logger(__name__)
JSONResponseCLS = get_json_response_cls()
EmbeddingServeContext: TypeAlias = PoolingServeContext[EmbeddingRequest]
EmbeddingServeContext: TypeAlias = ServeContext[EmbeddingRequest]
class ServingEmbedding(PoolingServing):
"""
Embedding API similar to OpenAI's API.
See https://platform.openai.com/docs/api-reference/embeddings/create
for the API specification. This API mimics the OpenAI Embedding API.
"""
class OpenAIServingEmbedding(OpenAIServing):
request_id_prefix = "embd"
def __init__(
def init_io_processor(
self,
engine_client: EngineClient,
models: OpenAIServingModels,
*,
request_logger: RequestLogger | None,
chat_template: str | None,
chat_template_content_format: ChatTemplateContentFormatOption,
trust_request_chat_template: bool = False,
) -> None:
super().__init__(
engine_client=engine_client,
models=models,
request_logger=request_logger,
model_config: ModelConfig,
renderer: BaseRenderer,
chat_template_config: ChatTemplateConfig,
) -> EmbedIOProcessor:
return EmbedIOProcessor(
model_config=model_config,
renderer=renderer,
chat_template_config=chat_template_config,
)
self.chat_template = chat_template
self.chat_template_content_format: Final = chat_template_content_format
self.trust_request_chat_template = trust_request_chat_template
pooler_config = self.model_config.pooler_config
assert pooler_config is not None
self.pooler_config = pooler_config
async def _preprocess(
async def _build_response(
self,
ctx: EmbeddingServeContext,
) -> ErrorResponse | None:
ctx.lora_request = self._maybe_get_adapters(ctx.request)
) -> JSONResponse | StreamingResponse:
encoding_format = ctx.request.encoding_format
embed_dtype = ctx.request.embed_dtype
endianness = ctx.request.endianness
if isinstance(ctx.request, EmbeddingChatRequest):
error_check_ret = self._validate_chat_template(
request_chat_template=ctx.request.chat_template,
chat_template_kwargs=ctx.request.chat_template_kwargs,
trust_request_chat_template=self.trust_request_chat_template,
if encoding_format == "float" or encoding_format == "base64":
return self._request_output_to_embed_json_response(
ctx.final_res_batch,
ctx.request_id,
ctx.created_time,
ctx.model_name,
encoding_format,
embed_dtype,
endianness,
)
if error_check_ret is not None:
return error_check_ret
_, ctx.engine_prompts = await self._preprocess_chat(
ctx.request,
ctx.request.messages,
default_template=self.chat_template,
default_template_content_format=self.chat_template_content_format,
default_template_kwargs=None,
)
elif isinstance(ctx.request, EmbeddingCompletionRequest):
ctx.engine_prompts = await self._preprocess_completion(
ctx.request,
prompt_input=ctx.request.input,
prompt_embeds=None,
if encoding_format == "bytes" or encoding_format == "bytes_only":
return self._request_output_to_to_embed_bytes_response(
ctx.final_res_batch,
ctx.request_id,
ctx.created_time,
ctx.model_name,
encoding_format,
embed_dtype,
endianness,
)
else:
return self.create_error_response("Invalid classification request type")
return None
assert_never(encoding_format)
def request_output_to_embed_json_response(
def _request_output_to_embed_json_response(
self,
final_res_batch: list[PoolingRequestOutput],
request_id: str,
......@@ -111,7 +98,7 @@ class OpenAIServingEmbedding(OpenAIServing):
encoding_format: Literal["float", "base64"],
embed_dtype: EmbedDType,
endianness: Endianness,
) -> EmbeddingResponse:
) -> JSONResponse:
encode_fn = cast(
Callable[[PoolingRequestOutput], list[float] | str],
(
......@@ -143,15 +130,16 @@ class OpenAIServingEmbedding(OpenAIServing):
total_tokens=num_prompt_tokens,
)
return EmbeddingResponse(
response = EmbeddingResponse(
id=request_id,
created=created_time,
model=model_name,
data=items,
usage=usage,
)
return JSONResponseCLS(content=response.model_dump())
def request_output_to_embed_bytes_response(
def _request_output_to_to_embed_bytes_response(
self,
final_res_batch: list[PoolingRequestOutput],
request_id: str,
......@@ -160,7 +148,7 @@ class OpenAIServingEmbedding(OpenAIServing):
encoding_format: Literal["bytes", "bytes_only"],
embed_dtype: EmbedDType,
endianness: Endianness,
) -> EmbeddingBytesResponse:
) -> StreamingResponse:
content, items, usage = encode_pooling_bytes(
pooling_outputs=final_res_batch,
embed_dtype=embed_dtype,
......@@ -183,441 +171,9 @@ class OpenAIServingEmbedding(OpenAIServing):
}
)
return EmbeddingBytesResponse(content=content, headers=headers)
def _build_response(
self,
ctx: EmbeddingServeContext,
) -> EmbeddingResponse | EmbeddingBytesResponse | ErrorResponse:
encoding_format = ctx.request.encoding_format
embed_dtype = ctx.request.embed_dtype
endianness = ctx.request.endianness
if encoding_format == "float" or encoding_format == "base64":
return self.request_output_to_embed_json_response(
ctx.final_res_batch,
ctx.request_id,
ctx.created_time,
ctx.model_name,
encoding_format,
embed_dtype,
endianness,
)
if encoding_format == "bytes" or encoding_format == "bytes_only":
return self.request_output_to_embed_bytes_response(
ctx.final_res_batch,
ctx.request_id,
ctx.created_time,
ctx.model_name,
encoding_format,
embed_dtype,
endianness,
)
assert_never(encoding_format)
def _get_max_position_embeddings(self) -> int:
"""Get the model's effective maximum sequence length for chunking."""
return self.model_config.max_model_len
def _should_use_chunked_processing(self, request) -> bool:
    """Tell whether chunked processing applies to ``request``.

    Only embedding completion/chat requests are eligible; for those the
    answer is the pooler config's ``enable_chunked_processing`` setting.
    """
    eligible_types = (EmbeddingCompletionRequest, EmbeddingChatRequest)
    if not isinstance(request, eligible_types):
        return False
    return self.pooler_config.enable_chunked_processing
async def _process_chunked_request(
    self,
    ctx: EmbeddingServeContext,
    token_ids: list[int],
    pooling_params: PoolingParams,
    trace_headers: Mapping[str, str] | None,
    prompt_idx: int,
) -> list[AsyncGenerator[PoolingRequestOutput, None]]:
    """Split one prompt's tokens into chunks and submit each chunk to the engine.

    Chunks are produced by ``chunk_list`` with the value returned by
    ``_get_max_position_embeddings`` as the chunk size. Each chunk is logged
    and encoded under the id
    ``"{ctx.request_id}-prompt-{prompt_idx}-chunk-{chunk_idx}"``.

    Returns:
        One ``engine_client.encode`` result per chunk, in chunk order.
    """
    chunk_size = self._get_max_position_embeddings()
    chunk_streams: list[AsyncGenerator[PoolingRequestOutput, None]] = []

    for chunk_idx, chunk_tokens in enumerate(chunk_list(token_ids, chunk_size)):
        # The id encodes both the prompt and the chunk position.
        chunk_request_id = f"{ctx.request_id}-prompt-{prompt_idx}-chunk-{chunk_idx}"
        chunk_prompt = token_inputs(chunk_tokens)

        self._log_inputs(
            chunk_request_id,
            chunk_prompt,
            params=pooling_params,
            lora_request=ctx.lora_request,
        )

        chunk_streams.append(
            self.engine_client.encode(
                chunk_prompt,
                pooling_params,
                chunk_request_id,
                lora_request=ctx.lora_request,
                trace_headers=trace_headers,
                priority=ctx.request.priority,
            )
        )

    return chunk_streams
def _validate_input(
    self,
    request: object,
    input_ids: list[int],
    input_text: str,
) -> TokensPrompt:
    """Length-check an embedding prompt, allowing long inputs when chunking is on.

    Non-embedding requests are passed to the parent implementation. For
    embedding completion/chat requests:

    * the hard cap is ``pooler_config.max_embed_len`` when set, otherwise
      ``model_config.max_model_len``; exceeding it raises ``ValueError``;
    * an input longer than ``_get_max_position_embeddings()`` is accepted
      (and logged) only when chunked processing is enabled, otherwise it
      raises ``ValueError``.

    Returns:
        A ``TokensPrompt`` built from ``input_text`` and ``input_ids``.
    """
    token_num = len(input_ids)

    # Note: EmbeddingRequest doesn't have max_tokens
    if not isinstance(request, (EmbeddingCompletionRequest, EmbeddingChatRequest)):
        # Other request types keep the parent's validation.
        return super()._validate_input(request, input_ids, input_text)

    enable_chunked = self._should_use_chunked_processing(request)
    max_pos_embeddings = self._get_max_position_embeddings()

    # max_embed_len, when configured, replaces max_model_len as the hard cap.
    if self.pooler_config.max_embed_len:
        length_type = "maximum embedding input length"
        max_length_value = self.pooler_config.max_embed_len
    else:
        length_type = "maximum context length"
        max_length_value = self.model_config.max_model_len

    if token_num > max_length_value:
        validation_error_msg = (
            "This model's {length_type} is {max_length_value} tokens. "
            "However, you requested {token_num} tokens in the input for "
            "embedding generation. Please reduce the length of the input."
        )
        raise ValueError(
            validation_error_msg.format(
                length_type=length_type,
                max_length_value=max_length_value,
                token_num=token_num,
            )
        )

    if token_num > max_pos_embeddings:
        if not enable_chunked:
            chunked_processing_error_msg = (
                "This model's {length_type} is {max_length_value} tokens. "
                "However, you requested {token_num} tokens in the input for "
                "embedding generation. Please reduce the length of the input "
                "or enable chunked processing."
            )
            raise ValueError(
                chunked_processing_error_msg.format(
                    length_type="maximum position embeddings length",
                    max_length_value=max_pos_embeddings,
                    token_num=token_num,
                )
            )
        # Long input is acceptable because it will be split into chunks.
        logger.info(
            "Input length %s exceeds max_position_embeddings "
            "%s, will use chunked processing",
            token_num,
            max_pos_embeddings,
        )

    return TokensPrompt(prompt=input_text, prompt_token_ids=input_ids)
async def _create_single_prompt_generator(
    self,
    ctx: EmbeddingServeContext,
    engine_prompt: ProcessorInputs,
    pooling_params: PoolingParams,
    trace_headers: Mapping[str, str] | None,
    prompt_index: int,
) -> AsyncGenerator[PoolingRequestOutput, None]:
    """Log and submit one whole (unchunked) prompt to the engine.

    The per-prompt request id is ``"{ctx.request_id}-{prompt_index}"``.

    Returns:
        The value returned by ``engine_client.encode``, handed back unwrapped.
    """
    item_request_id = f"{ctx.request_id}-{prompt_index}"

    self._log_inputs(
        item_request_id,
        engine_prompt,
        params=pooling_params,
        lora_request=ctx.lora_request,
    )

    encode_stream = self.engine_client.encode(
        engine_prompt,
        pooling_params,
        item_request_id,
        lora_request=ctx.lora_request,
        trace_headers=trace_headers,
        priority=ctx.request.priority,
    )
    return encode_stream
async def _prepare_generators(
self,
ctx: EmbeddingServeContext,
) -> ErrorResponse | None:
"""Override to support chunked processing."""
# Check if we should use chunked processing
use_chunked = self._should_use_chunked_processing(ctx.request)
# If no chunked processing needed, delegate to parent class
if not use_chunked:
return await super()._prepare_generators(ctx)
# Custom logic for chunked processing
generators: list[AsyncGenerator[PoolingRequestOutput, None]] = []
trace_headers = (
None
if ctx.raw_request is None
else await self._get_trace_headers(ctx.raw_request.headers)
response = EmbeddingBytesResponse(content=content, headers=headers)
return StreamingResponse(
content=response.content,
headers=response.headers,
media_type=response.media_type,
)
pooling_params = self._create_pooling_params(ctx)
if isinstance(pooling_params, ErrorResponse):
return pooling_params
if ctx.engine_prompts is None:
return self.create_error_response("Engine prompts not available")
max_pos_embeddings = self._get_max_position_embeddings()
for i, engine_prompt in enumerate(ctx.engine_prompts):
# Check if this specific prompt needs chunked processing
if "prompt_token_ids" in engine_prompt:
prompt_token_ids = engine_prompt["prompt_token_ids"] # type: ignore[typeddict-item]
if len(prompt_token_ids) > max_pos_embeddings:
# Use chunked processing for this prompt
chunk_generators = await self._process_chunked_request(
ctx,
prompt_token_ids,
pooling_params,
trace_headers,
i,
)
generators.extend(chunk_generators)
continue
# Normal processing for short prompts or non-token prompts
generator = await self._create_single_prompt_generator(
ctx, engine_prompt, pooling_params, trace_headers, i
)
generators.append(generator)
ctx.result_generator = merge_async_iterators(*generators)
return None
async def _collect_batch(
    self,
    ctx: EmbeddingServeContext,
) -> ErrorResponse | None:
    """Collect engine results into ``ctx.final_res_batch``, merging chunks.

    When chunked processing does not apply to the request, this defers to
    the parent implementation. Otherwise, results whose request id contains
    ``-chunk-`` are folded into a running token-count-weighted sum per
    prompt (so individual chunk embeddings are not kept around), and all
    other results are stored as-is under their prompt index.

    Returns:
        ``None`` once ``ctx.final_res_batch`` has been populated, or the
        value of ``create_error_response`` when something is missing or
        has an unexpected type.
    """
    if ctx.engine_prompts is None:
        return self.create_error_response("Engine prompts not available")

    # Check if we used chunked processing
    use_chunked = self._should_use_chunked_processing(ctx.request)

    if not use_chunked:
        return await super()._collect_batch(ctx=ctx)

    if ctx.result_generator is None:
        return self.create_error_response("Result generator not available")

    # Online aggregation for chunked requests to
    # minimize memory usage
    # Track aggregation state for each prompt
    prompt_aggregators: dict[int, dict[str, Any]] = {}
    short_prompts_results: dict[int, PoolingRequestOutput] = {}

    async for result_idx, result in ctx.result_generator:
        if "-chunk-" in result.request_id:
            # Extract prompt_idx from chunked request_id
            # NOTE(review): `index` finds the FIRST "prompt" segment, so a base
            # request id that itself contains a "-prompt-" segment would be
            # misparsed; confirm ids cannot contain it.
            parts = result.request_id.split("-")
            try:
                prompt_idx = int(parts[parts.index("prompt") + 1])
            except (ValueError, IndexError):
                # Fallback: extract from result_idx if parsing fails
                prompt_idx = result_idx

            # Initialize aggregator for this prompt if needed
            if prompt_idx not in prompt_aggregators:
                prompt_aggregators[prompt_idx] = {
                    "weighted_sum": None,
                    "total_weight": 0,
                    # Incremented below but not read anywhere in this method.
                    "chunk_count": 0,
                    # Everything before the first "-chunk-" in the id.
                    "request_id": result.request_id.split("-chunk-")[0],
                }

            aggregator = prompt_aggregators[prompt_idx]

            # MEAN pooling with online weighted averaging
            # Ensure result is PoolingRequestOutput
            # for embedding processing
            if not isinstance(result, PoolingRequestOutput):
                return self.create_error_response(
                    f"Expected PoolingRequestOutput for "
                    f"chunked embedding, got "
                    f"{type(result).__name__}"
                )

            # Handle both PoolingOutput and
            # EmbeddingOutput types
            if hasattr(result.outputs, "data"):
                # PoolingOutput case
                embedding_data = result.outputs.data
            elif hasattr(result.outputs, "embedding"):
                # EmbeddingOutput case -
                # convert embedding list to tensor
                embedding_data = result.outputs.embedding
            else:
                return self.create_error_response(
                    f"Unsupported output type: {type(result.outputs).__name__}"
                )

            if not isinstance(embedding_data, torch.Tensor):
                embedding_data = torch.tensor(embedding_data, dtype=torch.float32)

            if result.prompt_token_ids is None:
                return self.create_error_response(
                    "prompt_token_ids cannot be None for chunked processing"
                )

            # Each chunk is weighted by its token count.
            weight = len(result.prompt_token_ids)

            # Accumulate in float32 regardless of the embedding's dtype.
            weighted_embedding = embedding_data.to(dtype=torch.float32) * weight

            if aggregator["weighted_sum"] is None:
                # First chunk
                aggregator["weighted_sum"] = weighted_embedding
            else:
                # Accumulate
                aggregator["weighted_sum"] += weighted_embedding

            aggregator["total_weight"] += weight
            aggregator["chunk_count"] += 1
        else:
            # Non-chunked result - extract prompt_idx from request_id
            parts = result.request_id.split("-")
            try:
                # Last part should be prompt index
                prompt_idx = int(parts[-1])
            except (ValueError, IndexError):
                prompt_idx = result_idx  # Fallback to result_idx

            short_prompts_results[prompt_idx] = result

    # Finalize aggregated results
    # The output order follows the prompt order in ctx.engine_prompts.
    final_res_batch: list[PoolingRequestOutput] = []
    num_prompts = len(ctx.engine_prompts)

    for prompt_idx in range(num_prompts):
        if prompt_idx in prompt_aggregators:
            # Finalize MEAN aggregation for this chunked prompt
            aggregator = prompt_aggregators[prompt_idx]

            weighted_sum = aggregator["weighted_sum"]
            total_weight = aggregator["total_weight"]

            if (
                weighted_sum is not None
                and isinstance(weighted_sum, torch.Tensor)
                and isinstance(total_weight, (int, float))
                and total_weight > 0
            ):
                # Compute final mean embedding
                final_embedding = weighted_sum / total_weight

                # Create a PoolingRequestOutput
                # for the aggregated result
                pooling_output_data = PoolingOutput(data=final_embedding)

                # Get original prompt token IDs for this prompt
                original_prompt = ctx.engine_prompts[prompt_idx]
                if "prompt_token_ids" not in original_prompt:
                    return self.create_error_response(
                        f"Chunked prompt {prompt_idx} does not contain token IDs"
                    )

                original_token_ids = original_prompt["prompt_token_ids"]  # type: ignore[typeddict-item]

                # Report the full, unchunked token ids on the merged output.
                pooling_request_output = PoolingRequestOutput(
                    request_id=aggregator["request_id"],
                    prompt_token_ids=original_token_ids,
                    outputs=pooling_output_data,
                    num_cached_tokens=0,
                    finished=True,
                )

                final_res_batch.append(pooling_request_output)
            else:
                return self.create_error_response(
                    f"Failed to aggregate chunks for prompt {prompt_idx}"
                )
        elif prompt_idx in short_prompts_results:
            final_res_batch.append(short_prompts_results[prompt_idx])
        else:
            return self.create_error_response(
                f"Result not found for prompt {prompt_idx}"
            )

    ctx.final_res_batch = final_res_batch

    return None
async def create_embedding(
    self,
    request: EmbeddingRequest,
    raw_request: Request | None = None,
) -> EmbeddingResponse | ErrorResponse:
    """Serve an embedding request in the style of the OpenAI Embedding API.

    See https://platform.openai.com/docs/api-reference/embeddings/create
    for the API specification that this endpoint mimics.

    Builds an ``EmbeddingServeContext`` whose request id is
    ``request_id_prefix`` joined by ``-`` to the value returned by
    ``_base_request_id``, then awaits ``self.handle`` on it.
    """
    model_name = self.models.model_name()
    base_request_id = self._base_request_id(raw_request, request.request_id)

    ctx = EmbeddingServeContext(
        request=request,
        raw_request=raw_request,
        model_name=model_name,
        request_id=f"{self.request_id_prefix}-{base_request_id}",
    )

    return await self.handle(ctx)  # type: ignore[return-value]
......@@ -15,17 +15,21 @@ def init_pooling_io_processors(
renderer: BaseRenderer,
chat_template_config: ChatTemplateConfig,
) -> dict[str, PoolingIOProcessor]:
pooling_io_processors: dict[str, PoolingIOProcessor] = {}
processors: list[tuple[str, type[PoolingIOProcessor]]] = []
if "classify" in supported_tasks:
from vllm.entrypoints.pooling.classify.io_processor import (
ClassifyIOProcessor,
)
from vllm.entrypoints.pooling.classify.io_processor import ClassifyIOProcessor
processors.append(("classify", ClassifyIOProcessor))
if "embed" in supported_tasks:
from vllm.entrypoints.pooling.embed.io_processor import EmbedIOProcessor
pooling_io_processors["classify"] = ClassifyIOProcessor(
# Register under the "embed" task key: the returned dict is keyed by task, so
# reusing "classify" here would replace the classify processor and leave the
# "embed" task without one.
processors.append(("embed", EmbedIOProcessor))
return {
task: processor_cls(
model_config=model_config,
renderer=renderer,
chat_template_config=chat_template_config,
)
return pooling_io_processors
for task, processor_cls in processors
}
......@@ -21,7 +21,7 @@ router = APIRouter()
def pooling(request: Request) -> OpenAIServingPooling | None:
return request.app.state.openai_serving_pooling
return request.app.state.serving_pooling
@router.post(
......
......@@ -24,11 +24,11 @@ logger = init_logger(__name__)
def score(request: Request) -> ServingScores | None:
return request.app.state.openai_serving_scores
return request.app.state.serving_scores
def rerank(request: Request) -> ServingScores | None:
return request.app.state.openai_serving_scores
return request.app.state.serving_scores
@router.post(
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import time
from collections.abc import AsyncGenerator
from dataclasses import dataclass, field
from typing import Any, Generic, TypeAlias, TypeVar
from typing import TypeAlias
from fastapi import Request
from pydantic import ConfigDict
from vllm import PoolingRequestOutput
from vllm.entrypoints.pooling.classify.protocol import (
ClassificationChatRequest,
ClassificationCompletionRequest,
......@@ -25,12 +31,12 @@ from vllm.entrypoints.pooling.score.protocol import (
ScoreRequest,
ScoreResponse,
)
from vllm.inputs import ProcessorInputs
from vllm.lora.request import LoRARequest
PoolingCompletionLikeRequest: TypeAlias = (
EmbeddingCompletionRequest
| ClassificationCompletionRequest
| RerankRequest
| ScoreRequest
| PoolingCompletionRequest
)
......@@ -39,7 +45,11 @@ PoolingChatLikeRequest: TypeAlias = (
)
AnyPoolingRequest: TypeAlias = (
PoolingCompletionLikeRequest | PoolingChatLikeRequest | IOProcessorRequest
PoolingCompletionLikeRequest
| PoolingChatLikeRequest
| IOProcessorRequest
| RerankRequest
| ScoreRequest
)
AnyPoolingResponse: TypeAlias = (
......@@ -49,3 +59,26 @@ AnyPoolingResponse: TypeAlias = (
| PoolingResponse
| ScoreResponse
)
PoolingRequestT = TypeVar("PoolingRequestT", bound=AnyPoolingRequest)
@dataclass(kw_only=True)
class PoolingServeContext(Generic[PoolingRequestT]):
    """Keyword-only bundle of state for one pooling request.

    Generic over the request type, which is bounded by ``AnyPoolingRequest``.
    ``request``, ``model_name`` and ``request_id`` have no defaults and must
    be supplied; every other field has a default.
    """

    # The request object being served.
    request: PoolingRequestT
    # The FastAPI request object, or None when there is none.
    raw_request: Request | None = None
    model_name: str
    request_id: str
    # Defaults to the current time in whole seconds, taken at construction.
    created_time: int = field(default_factory=lambda: int(time.time()))
    lora_request: LoRARequest | None = None
    engine_prompts: list[ProcessorInputs] | None = None
    # presumably one id per entry of engine_prompts; TODO confirm
    prompt_request_ids: list[str] | None = None
    intermediates: Any | None = None
    # Async stream of (int, PoolingRequestOutput) pairs; None until assigned.
    result_generator: AsyncGenerator[tuple[int, PoolingRequestOutput], None] | None = (
        None
    )
    # Each instance starts with its own empty list.
    final_res_batch: list[PoolingRequestOutput] = field(default_factory=list)
    # Unannotated, so `dataclass` treats this as a plain class attribute, not a
    # field. NOTE(review): ConfigDict is a pydantic setting; confirm it has any
    # effect on a `dataclasses.dataclass` and is still needed.
    model_config = ConfigDict(arbitrary_types_allowed=True)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import importlib.util
import math
from dataclasses import dataclass
from functools import lru_cache
from typing import Any
import pybase64
import torch
from fastapi.responses import JSONResponse
from vllm.logger import init_logger
from vllm.outputs import PoolingRequestOutput
from vllm.utils.serial_utils import (
EMBED_DTYPES,
......@@ -16,6 +21,8 @@ from vllm.utils.serial_utils import (
tensor2binary,
)
logger = init_logger(__name__)
@dataclass
class MetadataItem:
......@@ -122,3 +129,15 @@ def decode_pooling_output(items: list[MetadataItem], body: bytes) -> list[torch.
)
for item in sorted(items, key=lambda x: x.index)
]
@lru_cache(maxsize=1)
def get_json_response_cls() -> type[JSONResponse]:
    """Pick the JSON response class, preferring the orjson-backed one.

    Returns FastAPI's ``ORJSONResponse`` when an ``orjson`` module spec can
    be found; otherwise logs a one-time hint to install it and returns the
    plain ``JSONResponse``. The result is cached after the first call.
    """
    orjson_available = importlib.util.find_spec("orjson") is not None
    if not orjson_available:
        logger.warning_once(
            "To make v1/embeddings API fast, please install orjson by `pip install orjson`"
        )
        return JSONResponse

    # Imported lazily so the orjson-backed class is only touched when usable.
    from fastapi.responses import ORJSONResponse

    return ORJSONResponse
......@@ -303,12 +303,16 @@ def create_error_response(
if isinstance(message, Exception):
exc = message
from vllm.exceptions import VLLMValidationError
from vllm.exceptions import VLLMNotFoundError, VLLMValidationError
if isinstance(exc, VLLMValidationError):
err_type = "BadRequestError"
status_code = HTTPStatus.BAD_REQUEST
param = exc.parameter
elif isinstance(exc, VLLMNotFoundError):
err_type = "NotFoundError"
status_code = HTTPStatus.NOT_FOUND
param = None
elif isinstance(exc, (ValueError, TypeError, OverflowError)):
# Common validation errors from user input
err_type = "BadRequestError"
......
......@@ -34,3 +34,9 @@ class VLLMValidationError(ValueError):
if self.value is not None:
extras.append(f"value={self.value}")
return f"{base} ({', '.join(extras)})" if extras else base
class VLLMNotFoundError(ValueError):
    """vLLM-specific error for something that could not be found.

    It is a ``ValueError`` subclass, so handlers that want to treat it
    separately must test for it before testing for ``ValueError``.
    """
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment