Unverified commit cf069aa8, authored by Harry Mellor, committed by GitHub
Browse files

Update deprecated Python 3.8 typing (#13971)

parent bf33700e
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
from typing import List, Optional, Union from typing import Optional, Union
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
...@@ -22,7 +22,7 @@ class RequestLogger: ...@@ -22,7 +22,7 @@ class RequestLogger:
self, self,
request_id: str, request_id: str,
prompt: Optional[str], prompt: Optional[str],
prompt_token_ids: Optional[List[int]], prompt_token_ids: Optional[list[int]],
params: Optional[Union[SamplingParams, PoolingParams, params: Optional[Union[SamplingParams, PoolingParams,
BeamSearchParams]], BeamSearchParams]],
lora_request: Optional[LoRARequest], lora_request: Optional[LoRARequest],
......
...@@ -13,10 +13,11 @@ import socket ...@@ -13,10 +13,11 @@ import socket
import tempfile import tempfile
import uuid import uuid
from argparse import Namespace from argparse import Namespace
from collections.abc import AsyncIterator
from contextlib import asynccontextmanager from contextlib import asynccontextmanager
from functools import partial from functools import partial
from http import HTTPStatus from http import HTTPStatus
from typing import Annotated, AsyncIterator, Dict, Optional, Set, Tuple, Union from typing import Annotated, Optional, Union
import uvloop import uvloop
from fastapi import APIRouter, Depends, FastAPI, Form, HTTPException, Request from fastapi import APIRouter, Depends, FastAPI, Form, HTTPException, Request
...@@ -93,7 +94,7 @@ prometheus_multiproc_dir: tempfile.TemporaryDirectory ...@@ -93,7 +94,7 @@ prometheus_multiproc_dir: tempfile.TemporaryDirectory
# Cannot use __name__ (https://github.com/vllm-project/vllm/pull/4765) # Cannot use __name__ (https://github.com/vllm-project/vllm/pull/4765)
logger = init_logger('vllm.entrypoints.openai.api_server') logger = init_logger('vllm.entrypoints.openai.api_server')
_running_tasks: Set[asyncio.Task] = set() _running_tasks: set[asyncio.Task] = set()
@asynccontextmanager @asynccontextmanager
...@@ -587,7 +588,7 @@ async def do_rerank_v2(request: RerankRequest, raw_request: Request): ...@@ -587,7 +588,7 @@ async def do_rerank_v2(request: RerankRequest, raw_request: Request):
return await do_rerank(request, raw_request) return await do_rerank(request, raw_request)
TASK_HANDLERS: Dict[str, Dict[str, tuple]] = { TASK_HANDLERS: dict[str, dict[str, tuple]] = {
"generate": { "generate": {
"messages": (ChatCompletionRequest, create_chat_completion), "messages": (ChatCompletionRequest, create_chat_completion),
"default": (CompletionRequest, create_completion), "default": (CompletionRequest, create_completion),
...@@ -894,7 +895,7 @@ async def init_app_state( ...@@ -894,7 +895,7 @@ async def init_app_state(
state.task = model_config.task state.task = model_config.task
def create_server_socket(addr: Tuple[str, int]) -> socket.socket: def create_server_socket(addr: tuple[str, int]) -> socket.socket:
family = socket.AF_INET family = socket.AF_INET
if is_valid_ipv6_address(addr[0]): if is_valid_ipv6_address(addr[0]):
family = socket.AF_INET6 family = socket.AF_INET6
......
...@@ -8,7 +8,8 @@ purposes. ...@@ -8,7 +8,8 @@ purposes.
import argparse import argparse
import json import json
import ssl import ssl
from typing import List, Optional, Sequence, Union, get_args from collections.abc import Sequence
from typing import Optional, Union, get_args
from vllm.engine.arg_utils import AsyncEngineArgs, nullable_str from vllm.engine.arg_utils import AsyncEngineArgs, nullable_str
from vllm.entrypoints.chat_utils import (ChatTemplateContentFormatOption, from vllm.entrypoints.chat_utils import (ChatTemplateContentFormatOption,
...@@ -33,7 +34,7 @@ class LoRAParserAction(argparse.Action): ...@@ -33,7 +34,7 @@ class LoRAParserAction(argparse.Action):
if isinstance(values, str): if isinstance(values, str):
raise TypeError("Expected values to be a list") raise TypeError("Expected values to be a list")
lora_list: List[LoRAModulePath] = [] lora_list: list[LoRAModulePath] = []
for item in values: for item in values:
if item in [None, '']: # Skip if item is None or empty string if item in [None, '']: # Skip if item is None or empty string
continue continue
...@@ -69,7 +70,7 @@ class PromptAdapterParserAction(argparse.Action): ...@@ -69,7 +70,7 @@ class PromptAdapterParserAction(argparse.Action):
if isinstance(values, str): if isinstance(values, str):
raise TypeError("Expected values to be a list") raise TypeError("Expected values to be a list")
adapter_list: List[PromptAdapterPath] = [] adapter_list: list[PromptAdapterPath] = []
for item in values: for item in values:
name, path = item.split('=') name, path = item.split('=')
adapter_list.append(PromptAdapterPath(name, path)) adapter_list.append(PromptAdapterPath(name, path))
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
from collections.abc import Iterable
from functools import lru_cache, partial from functools import lru_cache, partial
from typing import Dict, FrozenSet, Iterable, List, Optional, Union from typing import Optional, Union
import torch import torch
...@@ -14,10 +15,10 @@ class AllowedTokenIdsLogitsProcessor: ...@@ -14,10 +15,10 @@ class AllowedTokenIdsLogitsProcessor:
specific set of token ids.""" specific set of token ids."""
def __init__(self, allowed_ids: Iterable[int]): def __init__(self, allowed_ids: Iterable[int]):
self.allowed_ids: Optional[List[int]] = list(allowed_ids) self.allowed_ids: Optional[list[int]] = list(allowed_ids)
self.mask: Optional[torch.Tensor] = None self.mask: Optional[torch.Tensor] = None
def __call__(self, token_ids: List[int], def __call__(self, token_ids: list[int],
logits: torch.Tensor) -> torch.Tensor: logits: torch.Tensor) -> torch.Tensor:
if self.mask is None: if self.mask is None:
self.mask = torch.ones((logits.shape[-1], ), self.mask = torch.ones((logits.shape[-1], ),
...@@ -31,7 +32,7 @@ class AllowedTokenIdsLogitsProcessor: ...@@ -31,7 +32,7 @@ class AllowedTokenIdsLogitsProcessor:
@lru_cache(maxsize=32) @lru_cache(maxsize=32)
def _get_allowed_token_ids_logits_processor( def _get_allowed_token_ids_logits_processor(
allowed_token_ids: FrozenSet[int], allowed_token_ids: frozenset[int],
vocab_size: int, vocab_size: int,
) -> LogitsProcessor: ) -> LogitsProcessor:
if not allowed_token_ids: if not allowed_token_ids:
...@@ -43,8 +44,8 @@ def _get_allowed_token_ids_logits_processor( ...@@ -43,8 +44,8 @@ def _get_allowed_token_ids_logits_processor(
def logit_bias_logits_processor( def logit_bias_logits_processor(
logit_bias: Dict[int, float], logit_bias: dict[int, float],
token_ids: List[int], token_ids: list[int],
logits: torch.Tensor, logits: torch.Tensor,
) -> torch.Tensor: ) -> torch.Tensor:
for token_id, bias in logit_bias.items(): for token_id, bias in logit_bias.items():
...@@ -53,16 +54,16 @@ def logit_bias_logits_processor( ...@@ -53,16 +54,16 @@ def logit_bias_logits_processor(
def get_logits_processors( def get_logits_processors(
logit_bias: Optional[Union[Dict[int, float], Dict[str, float]]], logit_bias: Optional[Union[dict[int, float], dict[str, float]]],
allowed_token_ids: Optional[List[int]], allowed_token_ids: Optional[list[int]],
tokenizer: AnyTokenizer, tokenizer: AnyTokenizer,
) -> List[LogitsProcessor]: ) -> list[LogitsProcessor]:
logits_processors: List[LogitsProcessor] = [] logits_processors: list[LogitsProcessor] = []
if logit_bias: if logit_bias:
try: try:
# Convert token_id to integer # Convert token_id to integer
# Clamp the bias between -100 and 100 per OpenAI API spec # Clamp the bias between -100 and 100 per OpenAI API spec
clamped_logit_bias: Dict[int, float] = { clamped_logit_bias: dict[int, float] = {
int(token_id): min(100.0, max(-100.0, bias)) int(token_id): min(100.0, max(-100.0, bias))
for token_id, bias in logit_bias.items() for token_id, bias in logit_bias.items()
} }
......
...@@ -5,13 +5,13 @@ ...@@ -5,13 +5,13 @@
import re import re
import time import time
from argparse import Namespace from argparse import Namespace
from typing import Any, ClassVar, Dict, List, Literal, Optional, Set, Union from typing import Annotated, Any, ClassVar, Literal, Optional, Union
import torch import torch
from fastapi import UploadFile from fastapi import UploadFile
from pydantic import (BaseModel, ConfigDict, Field, TypeAdapter, from pydantic import (BaseModel, ConfigDict, Field, TypeAdapter,
ValidationInfo, field_validator, model_validator) ValidationInfo, field_validator, model_validator)
from typing_extensions import Annotated, TypeAlias from typing_extensions import TypeAlias
from vllm.entrypoints.chat_utils import ChatCompletionMessageParam from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
from vllm.logger import init_logger from vllm.logger import init_logger
...@@ -47,7 +47,7 @@ class OpenAIBaseModel(BaseModel): ...@@ -47,7 +47,7 @@ class OpenAIBaseModel(BaseModel):
model_config = ConfigDict(extra="allow") model_config = ConfigDict(extra="allow")
# Cache class field names # Cache class field names
field_names: ClassVar[Optional[Set[str]]] = None field_names: ClassVar[Optional[set[str]]] = None
@model_validator(mode="wrap") @model_validator(mode="wrap")
@classmethod @classmethod
...@@ -105,12 +105,12 @@ class ModelCard(OpenAIBaseModel): ...@@ -105,12 +105,12 @@ class ModelCard(OpenAIBaseModel):
root: Optional[str] = None root: Optional[str] = None
parent: Optional[str] = None parent: Optional[str] = None
max_model_len: Optional[int] = None max_model_len: Optional[int] = None
permission: List[ModelPermission] = Field(default_factory=list) permission: list[ModelPermission] = Field(default_factory=list)
class ModelList(OpenAIBaseModel): class ModelList(OpenAIBaseModel):
object: str = "list" object: str = "list"
data: List[ModelCard] = Field(default_factory=list) data: list[ModelCard] = Field(default_factory=list)
class PromptTokenUsageInfo(OpenAIBaseModel): class PromptTokenUsageInfo(OpenAIBaseModel):
...@@ -134,7 +134,7 @@ class JsonSchemaResponseFormat(OpenAIBaseModel): ...@@ -134,7 +134,7 @@ class JsonSchemaResponseFormat(OpenAIBaseModel):
description: Optional[str] = None description: Optional[str] = None
# schema is the field in openai but that causes conflicts with pydantic so # schema is the field in openai but that causes conflicts with pydantic so
# instead use json_schema with an alias # instead use json_schema with an alias
json_schema: Optional[Dict[str, Any]] = Field(default=None, alias='schema') json_schema: Optional[dict[str, Any]] = Field(default=None, alias='schema')
strict: Optional[bool] = None strict: Optional[bool] = None
...@@ -152,7 +152,7 @@ class StreamOptions(OpenAIBaseModel): ...@@ -152,7 +152,7 @@ class StreamOptions(OpenAIBaseModel):
class FunctionDefinition(OpenAIBaseModel): class FunctionDefinition(OpenAIBaseModel):
name: str name: str
description: Optional[str] = None description: Optional[str] = None
parameters: Optional[Dict[str, Any]] = None parameters: Optional[dict[str, Any]] = None
class ChatCompletionToolsParam(OpenAIBaseModel): class ChatCompletionToolsParam(OpenAIBaseModel):
...@@ -171,15 +171,15 @@ class ChatCompletionNamedToolChoiceParam(OpenAIBaseModel): ...@@ -171,15 +171,15 @@ class ChatCompletionNamedToolChoiceParam(OpenAIBaseModel):
class LogitsProcessorConstructor(BaseModel): class LogitsProcessorConstructor(BaseModel):
qualname: str qualname: str
args: Optional[List[Any]] = None args: Optional[list[Any]] = None
kwargs: Optional[Dict[str, Any]] = None kwargs: Optional[dict[str, Any]] = None
LogitsProcessors = List[Union[str, LogitsProcessorConstructor]] LogitsProcessors = list[Union[str, LogitsProcessorConstructor]]
def get_logits_processors(processors: Optional[LogitsProcessors], def get_logits_processors(processors: Optional[LogitsProcessors],
pattern: Optional[str]) -> Optional[List[Any]]: pattern: Optional[str]) -> Optional[list[Any]]:
if processors and pattern: if processors and pattern:
logits_processors = [] logits_processors = []
for processor in processors: for processor in processors:
...@@ -212,10 +212,10 @@ def get_logits_processors(processors: Optional[LogitsProcessors], ...@@ -212,10 +212,10 @@ def get_logits_processors(processors: Optional[LogitsProcessors],
class ChatCompletionRequest(OpenAIBaseModel): class ChatCompletionRequest(OpenAIBaseModel):
# Ordered by official OpenAI API documentation # Ordered by official OpenAI API documentation
# https://platform.openai.com/docs/api-reference/chat/create # https://platform.openai.com/docs/api-reference/chat/create
messages: List[ChatCompletionMessageParam] messages: list[ChatCompletionMessageParam]
model: Optional[str] = None model: Optional[str] = None
frequency_penalty: Optional[float] = 0.0 frequency_penalty: Optional[float] = 0.0
logit_bias: Optional[Dict[str, float]] = None logit_bias: Optional[dict[str, float]] = None
logprobs: Optional[bool] = False logprobs: Optional[bool] = False
top_logprobs: Optional[int] = 0 top_logprobs: Optional[int] = 0
# TODO(#9845): remove max_tokens when field is removed from OpenAI API # TODO(#9845): remove max_tokens when field is removed from OpenAI API
...@@ -228,12 +228,12 @@ class ChatCompletionRequest(OpenAIBaseModel): ...@@ -228,12 +228,12 @@ class ChatCompletionRequest(OpenAIBaseModel):
presence_penalty: Optional[float] = 0.0 presence_penalty: Optional[float] = 0.0
response_format: Optional[ResponseFormat] = None response_format: Optional[ResponseFormat] = None
seed: Optional[int] = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max) seed: Optional[int] = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
stop: Optional[Union[str, List[str]]] = Field(default_factory=list) stop: Optional[Union[str, list[str]]] = Field(default_factory=list)
stream: Optional[bool] = False stream: Optional[bool] = False
stream_options: Optional[StreamOptions] = None stream_options: Optional[StreamOptions] = None
temperature: Optional[float] = None temperature: Optional[float] = None
top_p: Optional[float] = None top_p: Optional[float] = None
tools: Optional[List[ChatCompletionToolsParam]] = None tools: Optional[list[ChatCompletionToolsParam]] = None
tool_choice: Optional[Union[Literal["none"], Literal["auto"], tool_choice: Optional[Union[Literal["none"], Literal["auto"],
ChatCompletionNamedToolChoiceParam]] = "none" ChatCompletionNamedToolChoiceParam]] = "none"
...@@ -248,7 +248,7 @@ class ChatCompletionRequest(OpenAIBaseModel): ...@@ -248,7 +248,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
min_p: Optional[float] = None min_p: Optional[float] = None
repetition_penalty: Optional[float] = None repetition_penalty: Optional[float] = None
length_penalty: float = 1.0 length_penalty: float = 1.0
stop_token_ids: Optional[List[int]] = Field(default_factory=list) stop_token_ids: Optional[list[int]] = Field(default_factory=list)
include_stop_str_in_output: bool = False include_stop_str_in_output: bool = False
ignore_eos: bool = False ignore_eos: bool = False
min_tokens: int = 0 min_tokens: int = 0
...@@ -290,7 +290,7 @@ class ChatCompletionRequest(OpenAIBaseModel): ...@@ -290,7 +290,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
"special tokens so this should be set to false (as is the " "special tokens so this should be set to false (as is the "
"default)."), "default)."),
) )
documents: Optional[List[Dict[str, str]]] = Field( documents: Optional[list[dict[str, str]]] = Field(
default=None, default=None,
description= description=
("A list of dicts representing documents that will be accessible to " ("A list of dicts representing documents that will be accessible to "
...@@ -307,12 +307,12 @@ class ChatCompletionRequest(OpenAIBaseModel): ...@@ -307,12 +307,12 @@ class ChatCompletionRequest(OpenAIBaseModel):
"allowed, so you must provide a chat template if the tokenizer " "allowed, so you must provide a chat template if the tokenizer "
"does not define one."), "does not define one."),
) )
chat_template_kwargs: Optional[Dict[str, Any]] = Field( chat_template_kwargs: Optional[dict[str, Any]] = Field(
default=None, default=None,
description=("Additional kwargs to pass to the template renderer. " description=("Additional kwargs to pass to the template renderer. "
"Will be accessible by the chat template."), "Will be accessible by the chat template."),
) )
mm_processor_kwargs: Optional[Dict[str, Any]] = Field( mm_processor_kwargs: Optional[dict[str, Any]] = Field(
default=None, default=None,
description=("Additional kwargs to pass to the HF processor."), description=("Additional kwargs to pass to the HF processor."),
) )
...@@ -325,7 +325,7 @@ class ChatCompletionRequest(OpenAIBaseModel): ...@@ -325,7 +325,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
description=( description=(
"If specified, the output will follow the regex pattern."), "If specified, the output will follow the regex pattern."),
) )
guided_choice: Optional[List[str]] = Field( guided_choice: Optional[list[str]] = Field(
default=None, default=None,
description=( description=(
"If specified, the output will be exactly one of the choices."), "If specified, the output will be exactly one of the choices."),
...@@ -643,17 +643,17 @@ class CompletionRequest(OpenAIBaseModel): ...@@ -643,17 +643,17 @@ class CompletionRequest(OpenAIBaseModel):
# Ordered by official OpenAI API documentation # Ordered by official OpenAI API documentation
# https://platform.openai.com/docs/api-reference/completions/create # https://platform.openai.com/docs/api-reference/completions/create
model: Optional[str] = None model: Optional[str] = None
prompt: Union[List[int], List[List[int]], str, List[str]] prompt: Union[list[int], list[list[int]], str, list[str]]
best_of: Optional[int] = None best_of: Optional[int] = None
echo: Optional[bool] = False echo: Optional[bool] = False
frequency_penalty: Optional[float] = 0.0 frequency_penalty: Optional[float] = 0.0
logit_bias: Optional[Dict[str, float]] = None logit_bias: Optional[dict[str, float]] = None
logprobs: Optional[int] = None logprobs: Optional[int] = None
max_tokens: Optional[int] = 16 max_tokens: Optional[int] = 16
n: int = 1 n: int = 1
presence_penalty: Optional[float] = 0.0 presence_penalty: Optional[float] = 0.0
seed: Optional[int] = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max) seed: Optional[int] = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
stop: Optional[Union[str, List[str]]] = Field(default_factory=list) stop: Optional[Union[str, list[str]]] = Field(default_factory=list)
stream: Optional[bool] = False stream: Optional[bool] = False
stream_options: Optional[StreamOptions] = None stream_options: Optional[StreamOptions] = None
suffix: Optional[str] = None suffix: Optional[str] = None
...@@ -667,14 +667,14 @@ class CompletionRequest(OpenAIBaseModel): ...@@ -667,14 +667,14 @@ class CompletionRequest(OpenAIBaseModel):
min_p: Optional[float] = None min_p: Optional[float] = None
repetition_penalty: Optional[float] = None repetition_penalty: Optional[float] = None
length_penalty: float = 1.0 length_penalty: float = 1.0
stop_token_ids: Optional[List[int]] = Field(default_factory=list) stop_token_ids: Optional[list[int]] = Field(default_factory=list)
include_stop_str_in_output: bool = False include_stop_str_in_output: bool = False
ignore_eos: bool = False ignore_eos: bool = False
min_tokens: int = 0 min_tokens: int = 0
skip_special_tokens: bool = True skip_special_tokens: bool = True
spaces_between_special_tokens: bool = True spaces_between_special_tokens: bool = True
truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None
allowed_token_ids: Optional[List[int]] = None allowed_token_ids: Optional[list[int]] = None
prompt_logprobs: Optional[int] = None prompt_logprobs: Optional[int] = None
# doc: end-completion-sampling-params # doc: end-completion-sampling-params
...@@ -701,7 +701,7 @@ class CompletionRequest(OpenAIBaseModel): ...@@ -701,7 +701,7 @@ class CompletionRequest(OpenAIBaseModel):
description=( description=(
"If specified, the output will follow the regex pattern."), "If specified, the output will follow the regex pattern."),
) )
guided_choice: Optional[List[str]] = Field( guided_choice: Optional[list[str]] = Field(
default=None, default=None,
description=( description=(
"If specified, the output will be exactly one of the choices."), "If specified, the output will be exactly one of the choices."),
...@@ -908,7 +908,7 @@ class EmbeddingCompletionRequest(OpenAIBaseModel): ...@@ -908,7 +908,7 @@ class EmbeddingCompletionRequest(OpenAIBaseModel):
# Ordered by official OpenAI API documentation # Ordered by official OpenAI API documentation
# https://platform.openai.com/docs/api-reference/embeddings # https://platform.openai.com/docs/api-reference/embeddings
model: Optional[str] = None model: Optional[str] = None
input: Union[List[int], List[List[int]], str, List[str]] input: Union[list[int], list[list[int]], str, list[str]]
encoding_format: Literal["float", "base64"] = "float" encoding_format: Literal["float", "base64"] = "float"
dimensions: Optional[int] = None dimensions: Optional[int] = None
user: Optional[str] = None user: Optional[str] = None
...@@ -940,7 +940,7 @@ class EmbeddingCompletionRequest(OpenAIBaseModel): ...@@ -940,7 +940,7 @@ class EmbeddingCompletionRequest(OpenAIBaseModel):
class EmbeddingChatRequest(OpenAIBaseModel): class EmbeddingChatRequest(OpenAIBaseModel):
model: Optional[str] = None model: Optional[str] = None
messages: List[ChatCompletionMessageParam] messages: list[ChatCompletionMessageParam]
encoding_format: Literal["float", "base64"] = "float" encoding_format: Literal["float", "base64"] = "float"
dimensions: Optional[int] = None dimensions: Optional[int] = None
...@@ -969,12 +969,12 @@ class EmbeddingChatRequest(OpenAIBaseModel): ...@@ -969,12 +969,12 @@ class EmbeddingChatRequest(OpenAIBaseModel):
"allowed, so you must provide a chat template if the tokenizer " "allowed, so you must provide a chat template if the tokenizer "
"does not define one."), "does not define one."),
) )
chat_template_kwargs: Optional[Dict[str, Any]] = Field( chat_template_kwargs: Optional[dict[str, Any]] = Field(
default=None, default=None,
description=("Additional kwargs to pass to the template renderer. " description=("Additional kwargs to pass to the template renderer. "
"Will be accessible by the chat template."), "Will be accessible by the chat template."),
) )
mm_processor_kwargs: Optional[Dict[str, Any]] = Field( mm_processor_kwargs: Optional[dict[str, Any]] = Field(
default=None, default=None,
description=("Additional kwargs to pass to the HF processor."), description=("Additional kwargs to pass to the HF processor."),
) )
...@@ -1008,8 +1008,8 @@ PoolingRequest = Union[PoolingCompletionRequest, PoolingChatRequest] ...@@ -1008,8 +1008,8 @@ PoolingRequest = Union[PoolingCompletionRequest, PoolingChatRequest]
class ScoreRequest(OpenAIBaseModel): class ScoreRequest(OpenAIBaseModel):
model: Optional[str] = None model: Optional[str] = None
text_1: Union[List[str], str] text_1: Union[list[str], str]
text_2: Union[List[str], str] text_2: Union[list[str], str]
truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None
# doc: begin-score-pooling-params # doc: begin-score-pooling-params
...@@ -1033,7 +1033,7 @@ class ScoreRequest(OpenAIBaseModel): ...@@ -1033,7 +1033,7 @@ class ScoreRequest(OpenAIBaseModel):
class RerankRequest(OpenAIBaseModel): class RerankRequest(OpenAIBaseModel):
model: Optional[str] = None model: Optional[str] = None
query: str query: str
documents: List[str] documents: list[str]
top_n: int = Field(default_factory=lambda: 0) top_n: int = Field(default_factory=lambda: 0)
truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None
...@@ -1073,14 +1073,14 @@ class RerankResponse(OpenAIBaseModel): ...@@ -1073,14 +1073,14 @@ class RerankResponse(OpenAIBaseModel):
id: str id: str
model: str model: str
usage: RerankUsage usage: RerankUsage
results: List[RerankResult] results: list[RerankResult]
class CompletionLogProbs(OpenAIBaseModel): class CompletionLogProbs(OpenAIBaseModel):
text_offset: List[int] = Field(default_factory=list) text_offset: list[int] = Field(default_factory=list)
token_logprobs: List[Optional[float]] = Field(default_factory=list) token_logprobs: list[Optional[float]] = Field(default_factory=list)
tokens: List[str] = Field(default_factory=list) tokens: list[str] = Field(default_factory=list)
top_logprobs: List[Optional[Dict[str, top_logprobs: list[Optional[dict[str,
float]]] = Field(default_factory=list) float]]] = Field(default_factory=list)
...@@ -1096,7 +1096,7 @@ class CompletionResponseChoice(OpenAIBaseModel): ...@@ -1096,7 +1096,7 @@ class CompletionResponseChoice(OpenAIBaseModel):
"to stop, None if the completion finished for some other reason " "to stop, None if the completion finished for some other reason "
"including encountering the EOS token"), "including encountering the EOS token"),
) )
prompt_logprobs: Optional[List[Optional[Dict[int, Logprob]]]] = None prompt_logprobs: Optional[list[Optional[dict[int, Logprob]]]] = None
class CompletionResponse(OpenAIBaseModel): class CompletionResponse(OpenAIBaseModel):
...@@ -1104,7 +1104,7 @@ class CompletionResponse(OpenAIBaseModel): ...@@ -1104,7 +1104,7 @@ class CompletionResponse(OpenAIBaseModel):
object: str = "text_completion" object: str = "text_completion"
created: int = Field(default_factory=lambda: int(time.time())) created: int = Field(default_factory=lambda: int(time.time()))
model: str model: str
choices: List[CompletionResponseChoice] choices: list[CompletionResponseChoice]
usage: UsageInfo usage: UsageInfo
...@@ -1127,14 +1127,14 @@ class CompletionStreamResponse(OpenAIBaseModel): ...@@ -1127,14 +1127,14 @@ class CompletionStreamResponse(OpenAIBaseModel):
object: str = "text_completion" object: str = "text_completion"
created: int = Field(default_factory=lambda: int(time.time())) created: int = Field(default_factory=lambda: int(time.time()))
model: str model: str
choices: List[CompletionResponseStreamChoice] choices: list[CompletionResponseStreamChoice]
usage: Optional[UsageInfo] = Field(default=None) usage: Optional[UsageInfo] = Field(default=None)
class EmbeddingResponseData(OpenAIBaseModel): class EmbeddingResponseData(OpenAIBaseModel):
index: int index: int
object: str = "embedding" object: str = "embedding"
embedding: Union[List[float], str] embedding: Union[list[float], str]
class EmbeddingResponse(OpenAIBaseModel): class EmbeddingResponse(OpenAIBaseModel):
...@@ -1142,14 +1142,14 @@ class EmbeddingResponse(OpenAIBaseModel): ...@@ -1142,14 +1142,14 @@ class EmbeddingResponse(OpenAIBaseModel):
object: str = "list" object: str = "list"
created: int = Field(default_factory=lambda: int(time.time())) created: int = Field(default_factory=lambda: int(time.time()))
model: str model: str
data: List[EmbeddingResponseData] data: list[EmbeddingResponseData]
usage: UsageInfo usage: UsageInfo
class PoolingResponseData(OpenAIBaseModel): class PoolingResponseData(OpenAIBaseModel):
index: int index: int
object: str = "pooling" object: str = "pooling"
data: Union[List[List[float]], List[float], str] data: Union[list[list[float]], list[float], str]
class PoolingResponse(OpenAIBaseModel): class PoolingResponse(OpenAIBaseModel):
...@@ -1157,7 +1157,7 @@ class PoolingResponse(OpenAIBaseModel): ...@@ -1157,7 +1157,7 @@ class PoolingResponse(OpenAIBaseModel):
object: str = "list" object: str = "list"
created: int = Field(default_factory=lambda: int(time.time())) created: int = Field(default_factory=lambda: int(time.time()))
model: str model: str
data: List[PoolingResponseData] data: list[PoolingResponseData]
usage: UsageInfo usage: UsageInfo
...@@ -1172,7 +1172,7 @@ class ScoreResponse(OpenAIBaseModel): ...@@ -1172,7 +1172,7 @@ class ScoreResponse(OpenAIBaseModel):
object: str = "list" object: str = "list"
created: int = Field(default_factory=lambda: int(time.time())) created: int = Field(default_factory=lambda: int(time.time()))
model: str model: str
data: List[ScoreResponseData] data: list[ScoreResponseData]
usage: UsageInfo usage: UsageInfo
...@@ -1205,7 +1205,7 @@ class ExtractedToolCallInformation(BaseModel): ...@@ -1205,7 +1205,7 @@ class ExtractedToolCallInformation(BaseModel):
tools_called: bool tools_called: bool
# extracted tool calls # extracted tool calls
tool_calls: List[ToolCall] tool_calls: list[ToolCall]
# content - per OpenAI spec, content AND tool calls can be returned rarely # content - per OpenAI spec, content AND tool calls can be returned rarely
# But some models will do this intentionally # But some models will do this intentionally
...@@ -1216,21 +1216,21 @@ class ChatMessage(OpenAIBaseModel): ...@@ -1216,21 +1216,21 @@ class ChatMessage(OpenAIBaseModel):
role: str role: str
reasoning_content: Optional[str] = None reasoning_content: Optional[str] = None
content: Optional[str] = None content: Optional[str] = None
tool_calls: List[ToolCall] = Field(default_factory=list) tool_calls: list[ToolCall] = Field(default_factory=list)
class ChatCompletionLogProb(OpenAIBaseModel): class ChatCompletionLogProb(OpenAIBaseModel):
token: str token: str
logprob: float = -9999.0 logprob: float = -9999.0
bytes: Optional[List[int]] = None bytes: Optional[list[int]] = None
class ChatCompletionLogProbsContent(ChatCompletionLogProb): class ChatCompletionLogProbsContent(ChatCompletionLogProb):
top_logprobs: List[ChatCompletionLogProb] = Field(default_factory=list) top_logprobs: list[ChatCompletionLogProb] = Field(default_factory=list)
class ChatCompletionLogProbs(OpenAIBaseModel): class ChatCompletionLogProbs(OpenAIBaseModel):
content: Optional[List[ChatCompletionLogProbsContent]] = None content: Optional[list[ChatCompletionLogProbsContent]] = None
class ChatCompletionResponseChoice(OpenAIBaseModel): class ChatCompletionResponseChoice(OpenAIBaseModel):
...@@ -1248,16 +1248,16 @@ class ChatCompletionResponse(OpenAIBaseModel): ...@@ -1248,16 +1248,16 @@ class ChatCompletionResponse(OpenAIBaseModel):
object: Literal["chat.completion"] = "chat.completion" object: Literal["chat.completion"] = "chat.completion"
created: int = Field(default_factory=lambda: int(time.time())) created: int = Field(default_factory=lambda: int(time.time()))
model: str model: str
choices: List[ChatCompletionResponseChoice] choices: list[ChatCompletionResponseChoice]
usage: UsageInfo usage: UsageInfo
prompt_logprobs: Optional[List[Optional[Dict[int, Logprob]]]] = None prompt_logprobs: Optional[list[Optional[dict[int, Logprob]]]] = None
class DeltaMessage(OpenAIBaseModel): class DeltaMessage(OpenAIBaseModel):
role: Optional[str] = None role: Optional[str] = None
content: Optional[str] = None content: Optional[str] = None
reasoning_content: Optional[str] = None reasoning_content: Optional[str] = None
tool_calls: List[DeltaToolCall] = Field(default_factory=list) tool_calls: list[DeltaToolCall] = Field(default_factory=list)
class ChatCompletionResponseStreamChoice(OpenAIBaseModel): class ChatCompletionResponseStreamChoice(OpenAIBaseModel):
...@@ -1273,7 +1273,7 @@ class ChatCompletionStreamResponse(OpenAIBaseModel): ...@@ -1273,7 +1273,7 @@ class ChatCompletionStreamResponse(OpenAIBaseModel):
object: Literal["chat.completion.chunk"] = "chat.completion.chunk" object: Literal["chat.completion.chunk"] = "chat.completion.chunk"
created: int = Field(default_factory=lambda: int(time.time())) created: int = Field(default_factory=lambda: int(time.time()))
model: str model: str
choices: List[ChatCompletionResponseStreamChoice] choices: list[ChatCompletionResponseStreamChoice]
usage: Optional[UsageInfo] = Field(default=None) usage: Optional[UsageInfo] = Field(default=None)
...@@ -1358,7 +1358,7 @@ class TokenizeCompletionRequest(OpenAIBaseModel): ...@@ -1358,7 +1358,7 @@ class TokenizeCompletionRequest(OpenAIBaseModel):
class TokenizeChatRequest(OpenAIBaseModel): class TokenizeChatRequest(OpenAIBaseModel):
model: Optional[str] = None model: Optional[str] = None
messages: List[ChatCompletionMessageParam] messages: list[ChatCompletionMessageParam]
add_generation_prompt: bool = Field( add_generation_prompt: bool = Field(
default=True, default=True,
...@@ -1393,12 +1393,12 @@ class TokenizeChatRequest(OpenAIBaseModel): ...@@ -1393,12 +1393,12 @@ class TokenizeChatRequest(OpenAIBaseModel):
"allowed, so you must provide a chat template if the tokenizer " "allowed, so you must provide a chat template if the tokenizer "
"does not define one."), "does not define one."),
) )
chat_template_kwargs: Optional[Dict[str, Any]] = Field( chat_template_kwargs: Optional[dict[str, Any]] = Field(
default=None, default=None,
description=("Additional kwargs to pass to the template renderer. " description=("Additional kwargs to pass to the template renderer. "
"Will be accessible by the chat template."), "Will be accessible by the chat template."),
) )
mm_processor_kwargs: Optional[Dict[str, Any]] = Field( mm_processor_kwargs: Optional[dict[str, Any]] = Field(
default=None, default=None,
description=("Additional kwargs to pass to the HF processor."), description=("Additional kwargs to pass to the HF processor."),
) )
...@@ -1419,12 +1419,12 @@ TokenizeRequest = Union[TokenizeCompletionRequest, TokenizeChatRequest] ...@@ -1419,12 +1419,12 @@ TokenizeRequest = Union[TokenizeCompletionRequest, TokenizeChatRequest]
class TokenizeResponse(OpenAIBaseModel): class TokenizeResponse(OpenAIBaseModel):
count: int count: int
max_model_len: int max_model_len: int
tokens: List[int] tokens: list[int]
class DetokenizeRequest(OpenAIBaseModel): class DetokenizeRequest(OpenAIBaseModel):
model: Optional[str] = None model: Optional[str] = None
tokens: List[int] tokens: list[int]
class DetokenizeResponse(OpenAIBaseModel): class DetokenizeResponse(OpenAIBaseModel):
...@@ -1492,7 +1492,7 @@ class TranscriptionRequest(OpenAIBaseModel): ...@@ -1492,7 +1492,7 @@ class TranscriptionRequest(OpenAIBaseModel):
to automatically increase the temperature until certain thresholds are hit. to automatically increase the temperature until certain thresholds are hit.
""" """
timestamp_granularities: List[Literal["word", "segment"]] = Field( timestamp_granularities: list[Literal["word", "segment"]] = Field(
alias="timestamp_granularities[]", default=[]) alias="timestamp_granularities[]", default=[])
"""The timestamp granularities to populate for this transcription. """The timestamp granularities to populate for this transcription.
...@@ -1580,7 +1580,7 @@ class TranscriptionSegment(OpenAIBaseModel): ...@@ -1580,7 +1580,7 @@ class TranscriptionSegment(OpenAIBaseModel):
text: str text: str
"""Text content of the segment.""" """Text content of the segment."""
tokens: List[int] tokens: list[int]
"""Array of token IDs for the text content.""" """Array of token IDs for the text content."""
...@@ -1594,8 +1594,8 @@ class TranscriptionResponseVerbose(OpenAIBaseModel): ...@@ -1594,8 +1594,8 @@ class TranscriptionResponseVerbose(OpenAIBaseModel):
text: str text: str
"""The transcribed text.""" """The transcribed text."""
segments: Optional[List[TranscriptionSegment]] = None segments: Optional[list[TranscriptionSegment]] = None
"""Segments of the transcribed text and their corresponding details.""" """Segments of the transcribed text and their corresponding details."""
words: Optional[List[TranscriptionWord]] = None words: Optional[list[TranscriptionWord]] = None
"""Extracted words and their corresponding timestamps.""" """Extracted words and their corresponding timestamps."""
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import os import os
from collections.abc import Sequence
from functools import cached_property from functools import cached_property
from typing import Callable, Dict, List, Optional, Sequence, Tuple, Type, Union from typing import Callable, Optional, Union
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
DeltaMessage) DeltaMessage)
...@@ -25,14 +26,14 @@ class ReasoningParser: ...@@ -25,14 +26,14 @@ class ReasoningParser:
self.model_tokenizer = tokenizer self.model_tokenizer = tokenizer
@cached_property @cached_property
def vocab(self) -> Dict[str, int]: def vocab(self) -> dict[str, int]:
# NOTE: Only PreTrainedTokenizerFast is guaranteed to have .vocab # NOTE: Only PreTrainedTokenizerFast is guaranteed to have .vocab
# whereas all tokenizers have .get_vocab() # whereas all tokenizers have .get_vocab()
return self.model_tokenizer.get_vocab() return self.model_tokenizer.get_vocab()
def extract_reasoning_content( def extract_reasoning_content(
self, model_output: str, request: ChatCompletionRequest self, model_output: str, request: ChatCompletionRequest
) -> Tuple[Optional[str], Optional[str]]: ) -> tuple[Optional[str], Optional[str]]:
""" """
Extract reasoning content from a complete model-generated string. Extract reasoning content from a complete model-generated string.
...@@ -47,7 +48,7 @@ class ReasoningParser: ...@@ -47,7 +48,7 @@ class ReasoningParser:
The request object that was used to generate the model_output. The request object that was used to generate the model_output.
Returns: Returns:
Tuple[Optional[str], Optional[str]] tuple[Optional[str], Optional[str]]
A tuple containing the reasoning content and the content. A tuple containing the reasoning content and the content.
""" """
...@@ -77,10 +78,10 @@ class ReasoningParser: ...@@ -77,10 +78,10 @@ class ReasoningParser:
class ReasoningParserManager: class ReasoningParserManager:
reasoning_parsers: Dict[str, Type] = {} reasoning_parsers: dict[str, type] = {}
@classmethod @classmethod
def get_reasoning_parser(cls, name) -> Type: def get_reasoning_parser(cls, name) -> type:
""" """
Get reasoning parser by name which is registered by `register_module`. Get reasoning parser by name which is registered by `register_module`.
...@@ -94,8 +95,8 @@ class ReasoningParserManager: ...@@ -94,8 +95,8 @@ class ReasoningParserManager:
@classmethod @classmethod
def _register_module(cls, def _register_module(cls,
module: Type, module: type,
module_name: Optional[Union[str, List[str]]] = None, module_name: Optional[Union[str, list[str]]] = None,
force: bool = True) -> None: force: bool = True) -> None:
if not issubclass(module, ReasoningParser): if not issubclass(module, ReasoningParser):
raise TypeError("module must be subclass of ReasoningParser, " raise TypeError("module must be subclass of ReasoningParser, "
...@@ -114,9 +115,9 @@ class ReasoningParserManager: ...@@ -114,9 +115,9 @@ class ReasoningParserManager:
@classmethod @classmethod
def register_module( def register_module(
cls, cls,
name: Optional[Union[str, List[str]]] = None, name: Optional[Union[str, list[str]]] = None,
force: bool = True, force: bool = True,
module: Union[Type, None] = None) -> Union[type, Callable]: module: Union[type, None] = None) -> Union[type, Callable]:
""" """
Register module with the given name or name list. it can be used as a Register module with the given name or name list. it can be used as a
decoder(with module as None) or normal function(with module as not decoder(with module as None) or normal function(with module as not
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import re import re
from typing import Optional, Sequence, Tuple, Union from collections.abc import Sequence
from typing import Optional, Union
from transformers import PreTrainedTokenizerBase from transformers import PreTrainedTokenizerBase
...@@ -122,7 +123,7 @@ class DeepSeekR1ReasoningParser(ReasoningParser): ...@@ -122,7 +123,7 @@ class DeepSeekR1ReasoningParser(ReasoningParser):
def extract_reasoning_content( def extract_reasoning_content(
self, model_output: str, request: ChatCompletionRequest self, model_output: str, request: ChatCompletionRequest
) -> Tuple[Optional[str], Optional[str]]: ) -> tuple[Optional[str], Optional[str]]:
# DeepSeek R1 doesn't generate <think> now. # DeepSeek R1 doesn't generate <think> now.
# Thus we assume the reasoning content is always at the start. # Thus we assume the reasoning content is always at the start.
......
...@@ -2,9 +2,10 @@ ...@@ -2,9 +2,10 @@
import asyncio import asyncio
import tempfile import tempfile
from collections.abc import Awaitable
from http import HTTPStatus from http import HTTPStatus
from io import StringIO from io import StringIO
from typing import Awaitable, Callable, List, Optional from typing import Callable, Optional
import aiohttp import aiohttp
import torch import torch
...@@ -143,7 +144,7 @@ async def read_file(path_or_url: str) -> str: ...@@ -143,7 +144,7 @@ async def read_file(path_or_url: str) -> str:
async def write_local_file(output_path: str, async def write_local_file(output_path: str,
batch_outputs: List[BatchRequestOutput]) -> None: batch_outputs: list[BatchRequestOutput]) -> None:
""" """
Write the responses to a local file. Write the responses to a local file.
output_path: The path to write the responses to. output_path: The path to write the responses to.
...@@ -204,7 +205,7 @@ async def upload_data(output_url: str, data_or_file: str, ...@@ -204,7 +205,7 @@ async def upload_data(output_url: str, data_or_file: str,
f"Error message: {str(e)}.") from e f"Error message: {str(e)}.") from e
async def write_file(path_or_url: str, batch_outputs: List[BatchRequestOutput], async def write_file(path_or_url: str, batch_outputs: list[BatchRequestOutput],
output_tmp_dir: str) -> None: output_tmp_dir: str) -> None:
""" """
Write batch_outputs to a file or upload to a URL. Write batch_outputs to a file or upload to a URL.
...@@ -353,7 +354,7 @@ async def main(args): ...@@ -353,7 +354,7 @@ async def main(args):
logger.info("Reading batch from %s...", args.input_file) logger.info("Reading batch from %s...", args.input_file)
# Submit all requests in the file to the engine "concurrently". # Submit all requests in the file to the engine "concurrently".
response_futures: List[Awaitable[BatchRequestOutput]] = [] response_futures: list[Awaitable[BatchRequestOutput]] = []
for request_json in (await read_file(args.input_file)).strip().split("\n"): for request_json in (await read_file(args.input_file)).strip().split("\n"):
# Skip empty lines. # Skip empty lines.
request_json = request_json.strip() request_json = request_json.strip()
......
...@@ -3,10 +3,9 @@ ...@@ -3,10 +3,9 @@
import asyncio import asyncio
import json import json
import time import time
from typing import (AsyncGenerator, AsyncIterator, Callable, Dict, Final, List, from collections.abc import AsyncGenerator, AsyncIterator
Optional) from collections.abc import Sequence as GenericSequence
from typing import Sequence as GenericSequence from typing import Callable, Final, Optional, Union
from typing import Union
from fastapi import Request from fastapi import Request
...@@ -205,7 +204,7 @@ class OpenAIServingChat(OpenAIServing): ...@@ -205,7 +204,7 @@ class OpenAIServingChat(OpenAIServing):
raw_request.state.request_metadata = request_metadata raw_request.state.request_metadata = request_metadata
# Schedule the request and get the result generator. # Schedule the request and get the result generator.
generators: List[AsyncGenerator[RequestOutput, None]] = [] generators: list[AsyncGenerator[RequestOutput, None]] = []
try: try:
for i, engine_prompt in enumerate(engine_prompts): for i, engine_prompt in enumerate(engine_prompts):
sampling_params: Union[SamplingParams, BeamSearchParams] sampling_params: Union[SamplingParams, BeamSearchParams]
...@@ -282,7 +281,7 @@ class OpenAIServingChat(OpenAIServing): ...@@ -282,7 +281,7 @@ class OpenAIServingChat(OpenAIServing):
result_generator: AsyncIterator[RequestOutput], result_generator: AsyncIterator[RequestOutput],
request_id: str, request_id: str,
model_name: str, model_name: str,
conversation: List[ConversationMessage], conversation: list[ConversationMessage],
tokenizer: AnyTokenizer, tokenizer: AnyTokenizer,
request_metadata: RequestResponseMetadata, request_metadata: RequestResponseMetadata,
) -> AsyncGenerator[str, None]: ) -> AsyncGenerator[str, None]:
...@@ -310,7 +309,7 @@ class OpenAIServingChat(OpenAIServing): ...@@ -310,7 +309,7 @@ class OpenAIServingChat(OpenAIServing):
should_stream_with_reasoning_parsing = ( should_stream_with_reasoning_parsing = (
self._should_stream_with_reasoning_parsing(request)) self._should_stream_with_reasoning_parsing(request))
all_previous_token_ids: Optional[List[List[int]]] all_previous_token_ids: Optional[list[list[int]]]
# Only one of these will be used, thus previous_texts and # Only one of these will be used, thus previous_texts and
# all_previous_token_ids will not be used twice in the same iteration. # all_previous_token_ids will not be used twice in the same iteration.
...@@ -339,7 +338,7 @@ class OpenAIServingChat(OpenAIServing): ...@@ -339,7 +338,7 @@ class OpenAIServingChat(OpenAIServing):
# Prepare the tool parser if it's needed # Prepare the tool parser if it's needed
try: try:
if tool_choice_auto and self.tool_parser: if tool_choice_auto and self.tool_parser:
tool_parsers: List[Optional[ToolParser]] = [ tool_parsers: list[Optional[ToolParser]] = [
self.tool_parser(tokenizer) self.tool_parser(tokenizer)
] * num_choices ] * num_choices
else: else:
...@@ -406,7 +405,7 @@ class OpenAIServingChat(OpenAIServing): ...@@ -406,7 +405,7 @@ class OpenAIServingChat(OpenAIServing):
# Send response to echo the input portion of the # Send response to echo the input portion of the
# last message # last message
if request.echo: if request.echo:
last_msg_content: Union[str, List[Dict[str, str]]] = "" last_msg_content: Union[str, list[dict[str, str]]] = ""
if conversation and "content" in conversation[ if conversation and "content" in conversation[
-1] and conversation[-1].get("role") == role: -1] and conversation[-1].get("role") == role:
last_msg_content = conversation[-1]["content"] or "" last_msg_content = conversation[-1]["content"] or ""
...@@ -674,7 +673,7 @@ class OpenAIServingChat(OpenAIServing): ...@@ -674,7 +673,7 @@ class OpenAIServingChat(OpenAIServing):
result_generator: AsyncIterator[RequestOutput], result_generator: AsyncIterator[RequestOutput],
request_id: str, request_id: str,
model_name: str, model_name: str,
conversation: List[ConversationMessage], conversation: list[ConversationMessage],
tokenizer: AnyTokenizer, tokenizer: AnyTokenizer,
request_metadata: RequestResponseMetadata, request_metadata: RequestResponseMetadata,
) -> Union[ErrorResponse, ChatCompletionResponse]: ) -> Union[ErrorResponse, ChatCompletionResponse]:
...@@ -693,7 +692,7 @@ class OpenAIServingChat(OpenAIServing): ...@@ -693,7 +692,7 @@ class OpenAIServingChat(OpenAIServing):
assert final_res is not None assert final_res is not None
choices: List[ChatCompletionResponseChoice] = [] choices: list[ChatCompletionResponseChoice] = []
role = self.get_chat_request_role(request) role = self.get_chat_request_role(request)
for output in final_res.outputs: for output in final_res.outputs:
...@@ -812,7 +811,7 @@ class OpenAIServingChat(OpenAIServing): ...@@ -812,7 +811,7 @@ class OpenAIServingChat(OpenAIServing):
choices.append(choice_data) choices.append(choice_data)
if request.echo: if request.echo:
last_msg_content: Union[str, List[Dict[str, str]]] = "" last_msg_content: Union[str, list[dict[str, str]]] = ""
if conversation and "content" in conversation[-1] and conversation[ if conversation and "content" in conversation[-1] and conversation[
-1].get("role") == role: -1].get("role") == role:
last_msg_content = conversation[-1]["content"] or "" last_msg_content = conversation[-1]["content"] or ""
...@@ -853,8 +852,8 @@ class OpenAIServingChat(OpenAIServing): ...@@ -853,8 +852,8 @@ class OpenAIServingChat(OpenAIServing):
return response return response
def _get_top_logprobs( def _get_top_logprobs(
self, logprobs: Dict[int, Logprob], top_logprobs: Optional[int], self, logprobs: dict[int, Logprob], top_logprobs: Optional[int],
tokenizer: AnyTokenizer) -> List[ChatCompletionLogProb]: tokenizer: AnyTokenizer) -> list[ChatCompletionLogProb]:
return [ return [
ChatCompletionLogProb(token=(token := self._get_decoded_token( ChatCompletionLogProb(token=(token := self._get_decoded_token(
p[1], p[1],
...@@ -871,12 +870,12 @@ class OpenAIServingChat(OpenAIServing): ...@@ -871,12 +870,12 @@ class OpenAIServingChat(OpenAIServing):
def _create_chat_logprobs( def _create_chat_logprobs(
self, self,
token_ids: GenericSequence[int], token_ids: GenericSequence[int],
top_logprobs: GenericSequence[Optional[Dict[int, Logprob]]], top_logprobs: GenericSequence[Optional[dict[int, Logprob]]],
tokenizer: AnyTokenizer, tokenizer: AnyTokenizer,
num_output_top_logprobs: Optional[int] = None, num_output_top_logprobs: Optional[int] = None,
) -> ChatCompletionLogProbs: ) -> ChatCompletionLogProbs:
"""Create OpenAI-style logprobs.""" """Create OpenAI-style logprobs."""
logprobs_content: List[ChatCompletionLogProbsContent] = [] logprobs_content: list[ChatCompletionLogProbsContent] = []
for i, token_id in enumerate(token_ids): for i, token_id in enumerate(token_ids):
step_top_logprobs = top_logprobs[i] step_top_logprobs = top_logprobs[i]
......
...@@ -2,9 +2,9 @@ ...@@ -2,9 +2,9 @@
import asyncio import asyncio
import time import time
from typing import AsyncGenerator, AsyncIterator, Dict, List, Optional from collections.abc import AsyncGenerator, AsyncIterator
from typing import Sequence as GenericSequence from collections.abc import Sequence as GenericSequence
from typing import Tuple, Union, cast from typing import Optional, Union, cast
from fastapi import Request from fastapi import Request
...@@ -113,7 +113,7 @@ class OpenAIServingCompletion(OpenAIServing): ...@@ -113,7 +113,7 @@ class OpenAIServingCompletion(OpenAIServing):
return self.create_error_response(str(e)) return self.create_error_response(str(e))
# Schedule the request and get the result generator. # Schedule the request and get the result generator.
generators: List[AsyncGenerator[RequestOutput, None]] = [] generators: list[AsyncGenerator[RequestOutput, None]] = []
try: try:
for i, engine_prompt in enumerate(engine_prompts): for i, engine_prompt in enumerate(engine_prompts):
sampling_params: Union[SamplingParams, BeamSearchParams] sampling_params: Union[SamplingParams, BeamSearchParams]
...@@ -189,7 +189,7 @@ class OpenAIServingCompletion(OpenAIServing): ...@@ -189,7 +189,7 @@ class OpenAIServingCompletion(OpenAIServing):
request_metadata=request_metadata) request_metadata=request_metadata)
# Non-streaming response # Non-streaming response
final_res_batch: List[Optional[RequestOutput]] = [None] * num_prompts final_res_batch: list[Optional[RequestOutput]] = [None] * num_prompts
try: try:
async for i, res in result_generator: async for i, res in result_generator:
final_res_batch[i] = res final_res_batch[i] = res
...@@ -203,7 +203,7 @@ class OpenAIServingCompletion(OpenAIServing): ...@@ -203,7 +203,7 @@ class OpenAIServingCompletion(OpenAIServing):
if final_res.prompt is None: if final_res.prompt is None:
final_res.prompt = request_prompts[i]["prompt"] final_res.prompt = request_prompts[i]["prompt"]
final_res_batch_checked = cast(List[RequestOutput], final_res_batch_checked = cast(list[RequestOutput],
final_res_batch) final_res_batch)
response = self.request_output_to_completion_response( response = self.request_output_to_completion_response(
...@@ -237,7 +237,7 @@ class OpenAIServingCompletion(OpenAIServing): ...@@ -237,7 +237,7 @@ class OpenAIServingCompletion(OpenAIServing):
async def completion_stream_generator( async def completion_stream_generator(
self, self,
request: CompletionRequest, request: CompletionRequest,
result_generator: AsyncIterator[Tuple[int, RequestOutput]], result_generator: AsyncIterator[tuple[int, RequestOutput]],
request_id: str, request_id: str,
created_time: int, created_time: int,
model_name: str, model_name: str,
...@@ -270,7 +270,7 @@ class OpenAIServingCompletion(OpenAIServing): ...@@ -270,7 +270,7 @@ class OpenAIServingCompletion(OpenAIServing):
num_prompt_tokens[prompt_idx] = len(res.prompt_token_ids) num_prompt_tokens[prompt_idx] = len(res.prompt_token_ids)
delta_token_ids: GenericSequence[int] delta_token_ids: GenericSequence[int]
out_logprobs: Optional[GenericSequence[Optional[Dict[ out_logprobs: Optional[GenericSequence[Optional[dict[
int, Logprob]]]] int, Logprob]]]]
for output in res.outputs: for output in res.outputs:
...@@ -381,7 +381,7 @@ class OpenAIServingCompletion(OpenAIServing): ...@@ -381,7 +381,7 @@ class OpenAIServingCompletion(OpenAIServing):
def request_output_to_completion_response( def request_output_to_completion_response(
self, self,
final_res_batch: List[RequestOutput], final_res_batch: list[RequestOutput],
request: CompletionRequest, request: CompletionRequest,
request_id: str, request_id: str,
created_time: int, created_time: int,
...@@ -389,7 +389,7 @@ class OpenAIServingCompletion(OpenAIServing): ...@@ -389,7 +389,7 @@ class OpenAIServingCompletion(OpenAIServing):
tokenizer: AnyTokenizer, tokenizer: AnyTokenizer,
request_metadata: RequestResponseMetadata, request_metadata: RequestResponseMetadata,
) -> CompletionResponse: ) -> CompletionResponse:
choices: List[CompletionResponseChoice] = [] choices: list[CompletionResponseChoice] = []
num_prompt_tokens = 0 num_prompt_tokens = 0
num_generated_tokens = 0 num_generated_tokens = 0
...@@ -406,7 +406,7 @@ class OpenAIServingCompletion(OpenAIServing): ...@@ -406,7 +406,7 @@ class OpenAIServingCompletion(OpenAIServing):
prompt_text = final_res.prompt prompt_text = final_res.prompt
token_ids: GenericSequence[int] token_ids: GenericSequence[int]
out_logprobs: Optional[GenericSequence[Optional[Dict[int, out_logprobs: Optional[GenericSequence[Optional[dict[int,
Logprob]]]] Logprob]]]]
for output in final_res.outputs: for output in final_res.outputs:
...@@ -480,16 +480,16 @@ class OpenAIServingCompletion(OpenAIServing): ...@@ -480,16 +480,16 @@ class OpenAIServingCompletion(OpenAIServing):
def _create_completion_logprobs( def _create_completion_logprobs(
self, self,
token_ids: GenericSequence[int], token_ids: GenericSequence[int],
top_logprobs: GenericSequence[Optional[Dict[int, Logprob]]], top_logprobs: GenericSequence[Optional[dict[int, Logprob]]],
num_output_top_logprobs: int, num_output_top_logprobs: int,
tokenizer: AnyTokenizer, tokenizer: AnyTokenizer,
initial_text_offset: int = 0, initial_text_offset: int = 0,
) -> CompletionLogProbs: ) -> CompletionLogProbs:
"""Create logprobs for OpenAI Completion API.""" """Create logprobs for OpenAI Completion API."""
out_text_offset: List[int] = [] out_text_offset: list[int] = []
out_token_logprobs: List[Optional[float]] = [] out_token_logprobs: list[Optional[float]] = []
out_tokens: List[str] = [] out_tokens: list[str] = []
out_top_logprobs: List[Optional[Dict[str, float]]] = [] out_top_logprobs: list[Optional[dict[str, float]]] = []
last_token_len = 0 last_token_len = 0
......
...@@ -3,7 +3,8 @@ ...@@ -3,7 +3,8 @@
import asyncio import asyncio
import base64 import base64
import time import time
from typing import AsyncGenerator, Final, List, Literal, Optional, Union, cast from collections.abc import AsyncGenerator
from typing import Final, Literal, Optional, Union, cast
import numpy as np import numpy as np
from fastapi import Request from fastapi import Request
...@@ -31,7 +32,7 @@ logger = init_logger(__name__) ...@@ -31,7 +32,7 @@ logger = init_logger(__name__)
def _get_embedding( def _get_embedding(
output: EmbeddingOutput, output: EmbeddingOutput,
encoding_format: Literal["float", "base64"], encoding_format: Literal["float", "base64"],
) -> Union[List[float], str]: ) -> Union[list[float], str]:
if encoding_format == "float": if encoding_format == "float":
return output.embedding return output.embedding
elif encoding_format == "base64": elif encoding_format == "base64":
...@@ -143,7 +144,7 @@ class OpenAIServingEmbedding(OpenAIServing): ...@@ -143,7 +144,7 @@ class OpenAIServingEmbedding(OpenAIServing):
return self.create_error_response(str(e)) return self.create_error_response(str(e))
# Schedule the request and get the result generator. # Schedule the request and get the result generator.
generators: List[AsyncGenerator[PoolingRequestOutput, None]] = [] generators: list[AsyncGenerator[PoolingRequestOutput, None]] = []
try: try:
pooling_params = request.to_pooling_params() pooling_params = request.to_pooling_params()
...@@ -178,7 +179,7 @@ class OpenAIServingEmbedding(OpenAIServing): ...@@ -178,7 +179,7 @@ class OpenAIServingEmbedding(OpenAIServing):
num_prompts = len(engine_prompts) num_prompts = len(engine_prompts)
# Non-streaming response # Non-streaming response
final_res_batch: List[Optional[PoolingRequestOutput]] final_res_batch: list[Optional[PoolingRequestOutput]]
final_res_batch = [None] * num_prompts final_res_batch = [None] * num_prompts
try: try:
async for i, res in result_generator: async for i, res in result_generator:
...@@ -186,7 +187,7 @@ class OpenAIServingEmbedding(OpenAIServing): ...@@ -186,7 +187,7 @@ class OpenAIServingEmbedding(OpenAIServing):
assert all(final_res is not None for final_res in final_res_batch) assert all(final_res is not None for final_res in final_res_batch)
final_res_batch_checked = cast(List[PoolingRequestOutput], final_res_batch_checked = cast(list[PoolingRequestOutput],
final_res_batch) final_res_batch)
response = self.request_output_to_embedding_response( response = self.request_output_to_embedding_response(
...@@ -206,13 +207,13 @@ class OpenAIServingEmbedding(OpenAIServing): ...@@ -206,13 +207,13 @@ class OpenAIServingEmbedding(OpenAIServing):
def request_output_to_embedding_response( def request_output_to_embedding_response(
self, self,
final_res_batch: List[PoolingRequestOutput], final_res_batch: list[PoolingRequestOutput],
request_id: str, request_id: str,
created_time: int, created_time: int,
model_name: str, model_name: str,
encoding_format: Literal["float", "base64"], encoding_format: Literal["float", "base64"],
) -> EmbeddingResponse: ) -> EmbeddingResponse:
items: List[EmbeddingResponseData] = [] items: list[EmbeddingResponseData] = []
num_prompt_tokens = 0 num_prompt_tokens = 0
for idx, final_res in enumerate(final_res_batch): for idx, final_res in enumerate(final_res_batch):
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import json import json
from collections.abc import Iterable, Iterator, Mapping, Sequence
from concurrent.futures.thread import ThreadPoolExecutor from concurrent.futures.thread import ThreadPoolExecutor
from http import HTTPStatus from http import HTTPStatus
from typing import (Any, Callable, Dict, Iterable, Iterator, List, Mapping, from typing import Annotated, Any, Callable, Optional, TypedDict, Union
Optional, Sequence, Tuple, TypedDict, Union)
from fastapi import Request from fastapi import Request
from pydantic import Field from pydantic import Field
from starlette.datastructures import Headers from starlette.datastructures import Headers
from typing_extensions import Annotated
from vllm.config import ModelConfig from vllm.config import ModelConfig
from vllm.engine.protocol import EngineClient from vllm.engine.protocol import EngineClient
...@@ -64,10 +63,10 @@ AnyRequest = Union[CompletionLikeRequest, ChatLikeRequest, ...@@ -64,10 +63,10 @@ AnyRequest = Union[CompletionLikeRequest, ChatLikeRequest,
class TextTokensPrompt(TypedDict): class TextTokensPrompt(TypedDict):
prompt: str prompt: str
prompt_token_ids: List[int] prompt_token_ids: list[int]
RequestPrompt = Union[List[int], str, TextTokensPrompt] RequestPrompt = Union[list[int], str, TextTokensPrompt]
class OpenAIServing: class OpenAIServing:
...@@ -144,7 +143,7 @@ class OpenAIServing: ...@@ -144,7 +143,7 @@ class OpenAIServing:
def _maybe_get_adapters( def _maybe_get_adapters(
self, request: AnyRequest self, request: AnyRequest
) -> Union[Tuple[None, None], Tuple[LoRARequest, None], Tuple[ ) -> Union[tuple[None, None], tuple[LoRARequest, None], tuple[
None, PromptAdapterRequest]]: None, PromptAdapterRequest]]:
if self._is_model_supported(request.model): if self._is_model_supported(request.model):
return None, None return None, None
...@@ -188,7 +187,7 @@ class OpenAIServing: ...@@ -188,7 +187,7 @@ class OpenAIServing:
self, self,
request: AnyRequest, request: AnyRequest,
tokenizer: AnyTokenizer, tokenizer: AnyTokenizer,
prompt_ids: List[int], prompt_ids: list[int],
truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]], truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]],
) -> TextTokensPrompt: ) -> TextTokensPrompt:
if truncate_prompt_tokens is None: if truncate_prompt_tokens is None:
...@@ -203,7 +202,7 @@ class OpenAIServing: ...@@ -203,7 +202,7 @@ class OpenAIServing:
def _validate_input( def _validate_input(
self, self,
request: AnyRequest, request: AnyRequest,
input_ids: List[int], input_ids: list[int],
input_text: str, input_text: str,
) -> TextTokensPrompt: ) -> TextTokensPrompt:
token_num = len(input_ids) token_num = len(input_ids)
...@@ -259,7 +258,7 @@ class OpenAIServing: ...@@ -259,7 +258,7 @@ class OpenAIServing:
self, self,
request: AnyRequest, request: AnyRequest,
tokenizer: AnyTokenizer, tokenizer: AnyTokenizer,
prompt_input: Union[str, List[int]], prompt_input: Union[str, list[int]],
truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None, truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None,
add_special_tokens: bool = True, add_special_tokens: bool = True,
) -> TextTokensPrompt: ) -> TextTokensPrompt:
...@@ -280,7 +279,7 @@ class OpenAIServing: ...@@ -280,7 +279,7 @@ class OpenAIServing:
self, self,
request: AnyRequest, request: AnyRequest,
tokenizer: AnyTokenizer, tokenizer: AnyTokenizer,
prompt_inputs: Iterable[Union[str, List[int]]], prompt_inputs: Iterable[Union[str, list[int]]],
truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None, truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None,
add_special_tokens: bool = True, add_special_tokens: bool = True,
) -> Iterator[TextTokensPrompt]: ) -> Iterator[TextTokensPrompt]:
...@@ -309,10 +308,10 @@ class OpenAIServing: ...@@ -309,10 +308,10 @@ class OpenAIServing:
self, self,
request: AnyRequest, request: AnyRequest,
tokenizer: AnyTokenizer, tokenizer: AnyTokenizer,
input_or_inputs: Union[str, List[str], List[int], List[List[int]]], input_or_inputs: Union[str, list[str], list[int], list[list[int]]],
truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None, truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None,
add_special_tokens: bool = True, add_special_tokens: bool = True,
) -> List[TextTokensPrompt]: ) -> list[TextTokensPrompt]:
""" """
Tokenize/detokenize depending on the input format. Tokenize/detokenize depending on the input format.
...@@ -344,10 +343,10 @@ class OpenAIServing: ...@@ -344,10 +343,10 @@ class OpenAIServing:
self, self,
request: CompletionLikeRequest, request: CompletionLikeRequest,
tokenizer: AnyTokenizer, tokenizer: AnyTokenizer,
input_or_inputs: Union[str, List[str], List[int], List[List[int]]], input_or_inputs: Union[str, list[str], list[int], list[list[int]]],
truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None, truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None,
add_special_tokens: bool = True, add_special_tokens: bool = True,
) -> Tuple[List[TextTokensPrompt], List[TokensPrompt]]: ) -> tuple[list[TextTokensPrompt], list[TokensPrompt]]:
request_prompts = await self._tokenize_prompt_input_or_inputs_async( request_prompts = await self._tokenize_prompt_input_or_inputs_async(
request, request,
tokenizer, tokenizer,
...@@ -367,19 +366,19 @@ class OpenAIServing: ...@@ -367,19 +366,19 @@ class OpenAIServing:
self, self,
request: ChatLikeRequest, request: ChatLikeRequest,
tokenizer: AnyTokenizer, tokenizer: AnyTokenizer,
messages: List[ChatCompletionMessageParam], messages: list[ChatCompletionMessageParam],
chat_template: Optional[str], chat_template: Optional[str],
chat_template_content_format: ChatTemplateContentFormatOption, chat_template_content_format: ChatTemplateContentFormatOption,
add_generation_prompt: bool = True, add_generation_prompt: bool = True,
continue_final_message: bool = False, continue_final_message: bool = False,
tool_dicts: Optional[List[Dict[str, Any]]] = None, tool_dicts: Optional[list[dict[str, Any]]] = None,
documents: Optional[List[Dict[str, str]]] = None, documents: Optional[list[dict[str, str]]] = None,
chat_template_kwargs: Optional[Dict[str, Any]] = None, chat_template_kwargs: Optional[dict[str, Any]] = None,
tool_parser: Optional[Callable[[AnyTokenizer], ToolParser]] = None, tool_parser: Optional[Callable[[AnyTokenizer], ToolParser]] = None,
truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None, truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None,
add_special_tokens: bool = False, add_special_tokens: bool = False,
) -> Tuple[List[ConversationMessage], Sequence[RequestPrompt], ) -> tuple[list[ConversationMessage], Sequence[RequestPrompt],
List[TokensPrompt]]: list[TokensPrompt]]:
resolved_content_format = resolve_chat_template_content_format( resolved_content_format = resolve_chat_template_content_format(
chat_template, chat_template,
chat_template_content_format, chat_template_content_format,
...@@ -392,7 +391,7 @@ class OpenAIServing: ...@@ -392,7 +391,7 @@ class OpenAIServing:
content_format=resolved_content_format, content_format=resolved_content_format,
) )
_chat_template_kwargs: Dict[str, Any] = dict( _chat_template_kwargs: dict[str, Any] = dict(
chat_template=chat_template, chat_template=chat_template,
add_generation_prompt=add_generation_prompt, add_generation_prompt=add_generation_prompt,
continue_final_message=continue_final_message, continue_final_message=continue_final_message,
...@@ -401,7 +400,7 @@ class OpenAIServing: ...@@ -401,7 +400,7 @@ class OpenAIServing:
) )
_chat_template_kwargs.update(chat_template_kwargs or {}) _chat_template_kwargs.update(chat_template_kwargs or {})
request_prompt: Union[str, List[int]] request_prompt: Union[str, list[int]]
if isinstance(tokenizer, MistralTokenizer): if isinstance(tokenizer, MistralTokenizer):
request_prompt = apply_mistral_chat_template( request_prompt = apply_mistral_chat_template(
tokenizer, tokenizer,
......
...@@ -4,7 +4,7 @@ import json ...@@ -4,7 +4,7 @@ import json
import pathlib import pathlib
from dataclasses import dataclass from dataclasses import dataclass
from http import HTTPStatus from http import HTTPStatus
from typing import List, Optional, Union from typing import Optional, Union
from vllm.config import ModelConfig from vllm.config import ModelConfig
from vllm.engine.protocol import EngineClient from vllm.engine.protocol import EngineClient
...@@ -53,10 +53,10 @@ class OpenAIServingModels: ...@@ -53,10 +53,10 @@ class OpenAIServingModels:
self, self,
engine_client: EngineClient, engine_client: EngineClient,
model_config: ModelConfig, model_config: ModelConfig,
base_model_paths: List[BaseModelPath], base_model_paths: list[BaseModelPath],
*, *,
lora_modules: Optional[List[LoRAModulePath]] = None, lora_modules: Optional[list[LoRAModulePath]] = None,
prompt_adapters: Optional[List[PromptAdapterPath]] = None, prompt_adapters: Optional[list[PromptAdapterPath]] = None,
): ):
super().__init__() super().__init__()
...@@ -65,7 +65,7 @@ class OpenAIServingModels: ...@@ -65,7 +65,7 @@ class OpenAIServingModels:
self.engine_client = engine_client self.engine_client = engine_client
self.static_lora_modules = lora_modules self.static_lora_modules = lora_modules
self.lora_requests: List[LoRARequest] = [] self.lora_requests: list[LoRARequest] = []
self.lora_id_counter = AtomicCounter(0) self.lora_id_counter = AtomicCounter(0)
self.prompt_adapter_requests = [] self.prompt_adapter_requests = []
......
...@@ -3,7 +3,8 @@ ...@@ -3,7 +3,8 @@
import asyncio import asyncio
import base64 import base64
import time import time
from typing import AsyncGenerator, Final, List, Literal, Optional, Union, cast from collections.abc import AsyncGenerator
from typing import Final, Literal, Optional, Union, cast
import numpy as np import numpy as np
from fastapi import Request from fastapi import Request
...@@ -29,7 +30,7 @@ logger = init_logger(__name__) ...@@ -29,7 +30,7 @@ logger = init_logger(__name__)
def _get_data( def _get_data(
output: PoolingOutput, output: PoolingOutput,
encoding_format: Literal["float", "base64"], encoding_format: Literal["float", "base64"],
) -> Union[List[float], str]: ) -> Union[list[float], str]:
if encoding_format == "float": if encoding_format == "float":
return output.data.tolist() return output.data.tolist()
elif encoding_format == "base64": elif encoding_format == "base64":
...@@ -139,7 +140,7 @@ class OpenAIServingPooling(OpenAIServing): ...@@ -139,7 +140,7 @@ class OpenAIServingPooling(OpenAIServing):
return self.create_error_response(str(e)) return self.create_error_response(str(e))
# Schedule the request and get the result generator. # Schedule the request and get the result generator.
generators: List[AsyncGenerator[PoolingRequestOutput, None]] = [] generators: list[AsyncGenerator[PoolingRequestOutput, None]] = []
try: try:
pooling_params = request.to_pooling_params() pooling_params = request.to_pooling_params()
...@@ -174,7 +175,7 @@ class OpenAIServingPooling(OpenAIServing): ...@@ -174,7 +175,7 @@ class OpenAIServingPooling(OpenAIServing):
num_prompts = len(engine_prompts) num_prompts = len(engine_prompts)
# Non-streaming response # Non-streaming response
final_res_batch: List[Optional[PoolingRequestOutput]] final_res_batch: list[Optional[PoolingRequestOutput]]
final_res_batch = [None] * num_prompts final_res_batch = [None] * num_prompts
try: try:
async for i, res in result_generator: async for i, res in result_generator:
...@@ -182,7 +183,7 @@ class OpenAIServingPooling(OpenAIServing): ...@@ -182,7 +183,7 @@ class OpenAIServingPooling(OpenAIServing):
assert all(final_res is not None for final_res in final_res_batch) assert all(final_res is not None for final_res in final_res_batch)
final_res_batch_checked = cast(List[PoolingRequestOutput], final_res_batch_checked = cast(list[PoolingRequestOutput],
final_res_batch) final_res_batch)
response = self.request_output_to_pooling_response( response = self.request_output_to_pooling_response(
...@@ -202,13 +203,13 @@ class OpenAIServingPooling(OpenAIServing): ...@@ -202,13 +203,13 @@ class OpenAIServingPooling(OpenAIServing):
def request_output_to_pooling_response( def request_output_to_pooling_response(
self, self,
final_res_batch: List[PoolingRequestOutput], final_res_batch: list[PoolingRequestOutput],
request_id: str, request_id: str,
created_time: int, created_time: int,
model_name: str, model_name: str,
encoding_format: Literal["float", "base64"], encoding_format: Literal["float", "base64"],
) -> PoolingResponse: ) -> PoolingResponse:
items: List[PoolingResponseData] = [] items: list[PoolingResponseData] = []
num_prompt_tokens = 0 num_prompt_tokens = 0
for idx, final_res in enumerate(final_res_batch): for idx, final_res in enumerate(final_res_batch):
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import asyncio import asyncio
import time import time
from typing import Any, AsyncGenerator, Dict, List, Mapping, Optional, Union from collections.abc import AsyncGenerator, Mapping
from typing import Any, Optional, Union
from fastapi import Request from fastapi import Request
...@@ -48,8 +49,8 @@ class ServingScores(OpenAIServing): ...@@ -48,8 +49,8 @@ class ServingScores(OpenAIServing):
async def _embedding_score( async def _embedding_score(
self, self,
tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
texts_1: List[str], texts_1: list[str],
texts_2: List[str], texts_2: list[str],
request: Union[RerankRequest, ScoreRequest], request: Union[RerankRequest, ScoreRequest],
request_id=str, request_id=str,
tokenization_kwargs: Optional[dict[str, Any]] = None, tokenization_kwargs: Optional[dict[str, Any]] = None,
...@@ -57,11 +58,11 @@ class ServingScores(OpenAIServing): ...@@ -57,11 +58,11 @@ class ServingScores(OpenAIServing):
prompt_adapter_request: Optional[Union[PromptAdapterRequest, prompt_adapter_request: Optional[Union[PromptAdapterRequest,
None]] = None, None]] = None,
trace_headers: Optional[Mapping[str, str]] = None, trace_headers: Optional[Mapping[str, str]] = None,
) -> List[PoolingRequestOutput]: ) -> list[PoolingRequestOutput]:
input_texts = texts_1 + texts_2 input_texts = texts_1 + texts_2
engine_prompts: List[TokensPrompt] = [] engine_prompts: list[TokensPrompt] = []
tokenize_async = make_async(tokenizer.__call__, tokenize_async = make_async(tokenizer.__call__,
executor=self._tokenizer_executor) executor=self._tokenizer_executor)
...@@ -82,7 +83,7 @@ class ServingScores(OpenAIServing): ...@@ -82,7 +83,7 @@ class ServingScores(OpenAIServing):
prompt_token_ids=text_token_prompt["prompt_token_ids"])) prompt_token_ids=text_token_prompt["prompt_token_ids"]))
# Schedule the request and get the result generator. # Schedule the request and get the result generator.
generators: List[AsyncGenerator[PoolingRequestOutput, None]] = [] generators: list[AsyncGenerator[PoolingRequestOutput, None]] = []
pooling_params = request.to_pooling_params() pooling_params = request.to_pooling_params()
for i, engine_prompt in enumerate(engine_prompts): for i, engine_prompt in enumerate(engine_prompts):
...@@ -108,16 +109,16 @@ class ServingScores(OpenAIServing): ...@@ -108,16 +109,16 @@ class ServingScores(OpenAIServing):
result_generator = merge_async_iterators(*generators) result_generator = merge_async_iterators(*generators)
# Non-streaming response # Non-streaming response
final_res_batch: List[PoolingRequestOutput] = [] final_res_batch: list[PoolingRequestOutput] = []
embeddings: List[Optional[PoolingRequestOutput]] =\ embeddings: list[Optional[PoolingRequestOutput]] =\
[None] * len(engine_prompts) [None] * len(engine_prompts)
async for i, res in result_generator: async for i, res in result_generator:
embeddings[i] = res embeddings[i] = res
emb_texts_1: List[PoolingRequestOutput] = [] emb_texts_1: list[PoolingRequestOutput] = []
emb_texts_2: List[PoolingRequestOutput] = [] emb_texts_2: list[PoolingRequestOutput] = []
for i in range(0, len(texts_1)): for i in range(0, len(texts_1)):
assert (emb := embeddings[i]) is not None assert (emb := embeddings[i]) is not None
...@@ -139,8 +140,8 @@ class ServingScores(OpenAIServing): ...@@ -139,8 +140,8 @@ class ServingScores(OpenAIServing):
async def _cross_encoding_score( async def _cross_encoding_score(
self, self,
tokenizer: Union[AnyTokenizer], tokenizer: Union[AnyTokenizer],
texts_1: List[str], texts_1: list[str],
texts_2: List[str], texts_2: list[str],
request: Union[RerankRequest, ScoreRequest], request: Union[RerankRequest, ScoreRequest],
request_id=str, request_id=str,
tokenization_kwargs: Optional[dict[str, Any]] = None, tokenization_kwargs: Optional[dict[str, Any]] = None,
...@@ -148,10 +149,10 @@ class ServingScores(OpenAIServing): ...@@ -148,10 +149,10 @@ class ServingScores(OpenAIServing):
prompt_adapter_request: Optional[Union[PromptAdapterRequest, prompt_adapter_request: Optional[Union[PromptAdapterRequest,
None]] = None, None]] = None,
trace_headers: Optional[Mapping[str, str]] = None, trace_headers: Optional[Mapping[str, str]] = None,
) -> List[PoolingRequestOutput]: ) -> list[PoolingRequestOutput]:
request_prompts: List[str] = [] request_prompts: list[str] = []
engine_prompts: List[TokensPrompt] = [] engine_prompts: list[TokensPrompt] = []
if len(texts_1) == 1: if len(texts_1) == 1:
texts_1 = texts_1 * len(texts_2) texts_1 = texts_1 * len(texts_2)
...@@ -185,7 +186,7 @@ class ServingScores(OpenAIServing): ...@@ -185,7 +186,7 @@ class ServingScores(OpenAIServing):
engine_prompts.append(engine_prompt) engine_prompts.append(engine_prompt)
# Schedule the request and get the result generator. # Schedule the request and get the result generator.
generators: List[AsyncGenerator[PoolingRequestOutput, None]] = [] generators: list[AsyncGenerator[PoolingRequestOutput, None]] = []
pooling_params = request.to_pooling_params() pooling_params = request.to_pooling_params()
...@@ -212,7 +213,7 @@ class ServingScores(OpenAIServing): ...@@ -212,7 +213,7 @@ class ServingScores(OpenAIServing):
result_generator = merge_async_iterators(*generators) result_generator = merge_async_iterators(*generators)
# Non-streaming response # Non-streaming response
final_res_batch: List[ final_res_batch: list[
Optional[PoolingRequestOutput]] = [None] * len(engine_prompts) Optional[PoolingRequestOutput]] = [None] * len(engine_prompts)
async for i, res in result_generator: async for i, res in result_generator:
...@@ -228,9 +229,9 @@ class ServingScores(OpenAIServing): ...@@ -228,9 +229,9 @@ class ServingScores(OpenAIServing):
request_id: str, request_id: str,
raw_request: Optional[Request] = None, raw_request: Optional[Request] = None,
truncate_prompt_tokens: Optional[int] = None, truncate_prompt_tokens: Optional[int] = None,
) -> List[PoolingRequestOutput]: ) -> list[PoolingRequestOutput]:
tokenization_kwargs: Dict[str, Any] = {} tokenization_kwargs: dict[str, Any] = {}
if truncate_prompt_tokens is not None: if truncate_prompt_tokens is not None:
tokenization_kwargs["truncation"] = True tokenization_kwargs["truncation"] = True
tokenization_kwargs["max_length"] = truncate_prompt_tokens tokenization_kwargs["max_length"] = truncate_prompt_tokens
...@@ -372,12 +373,12 @@ class ServingScores(OpenAIServing): ...@@ -372,12 +373,12 @@ class ServingScores(OpenAIServing):
def request_output_to_score_response( def request_output_to_score_response(
self, self,
final_res_batch: List[PoolingRequestOutput], final_res_batch: list[PoolingRequestOutput],
request_id: str, request_id: str,
created_time: int, created_time: int,
model_name: str, model_name: str,
) -> ScoreResponse: ) -> ScoreResponse:
items: List[ScoreResponseData] = [] items: list[ScoreResponseData] = []
num_prompt_tokens = 0 num_prompt_tokens = 0
for idx, final_res in enumerate(final_res_batch): for idx, final_res in enumerate(final_res_batch):
...@@ -406,13 +407,13 @@ class ServingScores(OpenAIServing): ...@@ -406,13 +407,13 @@ class ServingScores(OpenAIServing):
) )
def request_output_to_rerank_response( def request_output_to_rerank_response(
self, final_res_batch: List[PoolingRequestOutput], request_id: str, self, final_res_batch: list[PoolingRequestOutput], request_id: str,
model_name: str, documents: List[str], model_name: str, documents: list[str],
top_n: int) -> RerankResponse: top_n: int) -> RerankResponse:
""" """
Convert the output of do_rank to a RerankResponse Convert the output of do_rank to a RerankResponse
""" """
results: List[RerankResult] = [] results: list[RerankResult] = []
num_prompt_tokens = 0 num_prompt_tokens = 0
for idx, final_res in enumerate(final_res_batch): for idx, final_res in enumerate(final_res_batch):
classify_res = ScoringRequestOutput.from_base(final_res) classify_res = ScoringRequestOutput.from_base(final_res)
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
from typing import Final, List, Optional, Union from typing import Final, Optional, Union
from fastapi import Request from fastapi import Request
...@@ -92,7 +92,7 @@ class OpenAIServingTokenization(OpenAIServing): ...@@ -92,7 +92,7 @@ class OpenAIServingTokenization(OpenAIServing):
logger.exception("Error in preprocessing prompt inputs") logger.exception("Error in preprocessing prompt inputs")
return self.create_error_response(str(e)) return self.create_error_response(str(e))
input_ids: List[int] = [] input_ids: list[int] = []
for i, engine_prompt in enumerate(engine_prompts): for i, engine_prompt in enumerate(engine_prompts):
self._log_inputs(request_id, self._log_inputs(request_id,
request_prompts[i], request_prompts[i],
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import asyncio import asyncio
import io import io
from typing import AsyncGenerator, Optional, Union, cast from collections.abc import AsyncGenerator
from typing import Optional, Union, cast
from fastapi import Request from fastapi import Request
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import os import os
from collections.abc import Sequence
from functools import cached_property from functools import cached_property
from typing import Callable, Dict, List, Optional, Sequence, Type, Union from typing import Callable, Optional, Union
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
DeltaMessage, DeltaMessage,
...@@ -22,16 +23,16 @@ class ToolParser: ...@@ -22,16 +23,16 @@ class ToolParser:
""" """
def __init__(self, tokenizer: AnyTokenizer): def __init__(self, tokenizer: AnyTokenizer):
self.prev_tool_call_arr: List[Dict] = [] self.prev_tool_call_arr: list[dict] = []
# the index of the tool call that is currently being parsed # the index of the tool call that is currently being parsed
self.current_tool_id: int = -1 self.current_tool_id: int = -1
self.current_tool_name_sent: bool = False self.current_tool_name_sent: bool = False
self.streamed_args_for_tool: List[str] = [] self.streamed_args_for_tool: list[str] = []
self.model_tokenizer = tokenizer self.model_tokenizer = tokenizer
@cached_property @cached_property
def vocab(self) -> Dict[str, int]: def vocab(self) -> dict[str, int]:
# NOTE: Only PreTrainedTokenizerFast is guaranteed to have .vocab # NOTE: Only PreTrainedTokenizerFast is guaranteed to have .vocab
# whereas all tokenizers have .get_vocab() # whereas all tokenizers have .get_vocab()
return self.model_tokenizer.get_vocab() return self.model_tokenizer.get_vocab()
...@@ -79,10 +80,10 @@ class ToolParser: ...@@ -79,10 +80,10 @@ class ToolParser:
class ToolParserManager: class ToolParserManager:
tool_parsers: Dict[str, Type] = {} tool_parsers: dict[str, type] = {}
@classmethod @classmethod
def get_tool_parser(cls, name) -> Type: def get_tool_parser(cls, name) -> type:
""" """
Get tool parser by name which is registered by `register_module`. Get tool parser by name which is registered by `register_module`.
...@@ -95,8 +96,8 @@ class ToolParserManager: ...@@ -95,8 +96,8 @@ class ToolParserManager:
@classmethod @classmethod
def _register_module(cls, def _register_module(cls,
module: Type, module: type,
module_name: Optional[Union[str, List[str]]] = None, module_name: Optional[Union[str, list[str]]] = None,
force: bool = True) -> None: force: bool = True) -> None:
if not issubclass(module, ToolParser): if not issubclass(module, ToolParser):
raise TypeError( raise TypeError(
...@@ -116,9 +117,9 @@ class ToolParserManager: ...@@ -116,9 +117,9 @@ class ToolParserManager:
@classmethod @classmethod
def register_module( def register_module(
cls, cls,
name: Optional[Union[str, List[str]]] = None, name: Optional[Union[str, list[str]]] = None,
force: bool = True, force: bool = True,
module: Union[Type, None] = None) -> Union[type, Callable]: module: Union[type, None] = None) -> Union[type, Callable]:
""" """
Register module with the given name or name list. it can be used as a Register module with the given name or name list. it can be used as a
decoder(with module as None) or normal function(with module as not decoder(with module as None) or normal function(with module as not
......
...@@ -2,8 +2,9 @@ ...@@ -2,8 +2,9 @@
import json import json
import re import re
from collections.abc import Sequence
from json import JSONDecoder from json import JSONDecoder
from typing import Dict, Sequence, Union from typing import Union
import partial_json_parser import partial_json_parser
from partial_json_parser.core.options import Allow from partial_json_parser.core.options import Allow
...@@ -145,7 +146,7 @@ class Granite20bFCToolParser(ToolParser): ...@@ -145,7 +146,7 @@ class Granite20bFCToolParser(ToolParser):
return None return None
# select as the current tool call the one we're on the state at # select as the current tool call the one we're on the state at
current_tool_call: Dict = tool_call_arr[self.current_tool_id] \ current_tool_call: dict = tool_call_arr[self.current_tool_id] \
if len(tool_call_arr) > 0 else {} if len(tool_call_arr) > 0 else {}
# case -- if no tokens have been streamed for the tool, e.g. # case -- if no tokens have been streamed for the tool, e.g.
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import json import json
from typing import Dict, Sequence, Union from collections.abc import Sequence
from typing import Union
import partial_json_parser import partial_json_parser
from partial_json_parser.core.options import Allow from partial_json_parser.core.options import Allow
...@@ -136,7 +137,7 @@ class GraniteToolParser(ToolParser): ...@@ -136,7 +137,7 @@ class GraniteToolParser(ToolParser):
return None return None
# select as the current tool call the one we're on the state at # select as the current tool call the one we're on the state at
current_tool_call: Dict = tool_call_arr[self.current_tool_id] current_tool_call: dict = tool_call_arr[self.current_tool_id]
delta = None delta = None
# case: we are starting a new tool in the array # case: we are starting a new tool in the array
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment