Unverified commit cf069aa8, authored by Harry Mellor, committed by GitHub
Browse files

Update deprecated Python 3.8 typing (#13971)

parent bf33700e
# SPDX-License-Identifier: Apache-2.0
from typing import List, Optional, Union
from typing import Optional, Union
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
......@@ -22,7 +22,7 @@ class RequestLogger:
self,
request_id: str,
prompt: Optional[str],
prompt_token_ids: Optional[List[int]],
prompt_token_ids: Optional[list[int]],
params: Optional[Union[SamplingParams, PoolingParams,
BeamSearchParams]],
lora_request: Optional[LoRARequest],
......
......@@ -13,10 +13,11 @@ import socket
import tempfile
import uuid
from argparse import Namespace
from collections.abc import AsyncIterator
from contextlib import asynccontextmanager
from functools import partial
from http import HTTPStatus
from typing import Annotated, AsyncIterator, Dict, Optional, Set, Tuple, Union
from typing import Annotated, Optional, Union
import uvloop
from fastapi import APIRouter, Depends, FastAPI, Form, HTTPException, Request
......@@ -93,7 +94,7 @@ prometheus_multiproc_dir: tempfile.TemporaryDirectory
# Cannot use __name__ (https://github.com/vllm-project/vllm/pull/4765)
logger = init_logger('vllm.entrypoints.openai.api_server')
_running_tasks: Set[asyncio.Task] = set()
_running_tasks: set[asyncio.Task] = set()
@asynccontextmanager
......@@ -587,7 +588,7 @@ async def do_rerank_v2(request: RerankRequest, raw_request: Request):
return await do_rerank(request, raw_request)
TASK_HANDLERS: Dict[str, Dict[str, tuple]] = {
TASK_HANDLERS: dict[str, dict[str, tuple]] = {
"generate": {
"messages": (ChatCompletionRequest, create_chat_completion),
"default": (CompletionRequest, create_completion),
......@@ -894,7 +895,7 @@ async def init_app_state(
state.task = model_config.task
def create_server_socket(addr: Tuple[str, int]) -> socket.socket:
def create_server_socket(addr: tuple[str, int]) -> socket.socket:
family = socket.AF_INET
if is_valid_ipv6_address(addr[0]):
family = socket.AF_INET6
......
......@@ -8,7 +8,8 @@ purposes.
import argparse
import json
import ssl
from typing import List, Optional, Sequence, Union, get_args
from collections.abc import Sequence
from typing import Optional, Union, get_args
from vllm.engine.arg_utils import AsyncEngineArgs, nullable_str
from vllm.entrypoints.chat_utils import (ChatTemplateContentFormatOption,
......@@ -33,7 +34,7 @@ class LoRAParserAction(argparse.Action):
if isinstance(values, str):
raise TypeError("Expected values to be a list")
lora_list: List[LoRAModulePath] = []
lora_list: list[LoRAModulePath] = []
for item in values:
if item in [None, '']: # Skip if item is None or empty string
continue
......@@ -69,7 +70,7 @@ class PromptAdapterParserAction(argparse.Action):
if isinstance(values, str):
raise TypeError("Expected values to be a list")
adapter_list: List[PromptAdapterPath] = []
adapter_list: list[PromptAdapterPath] = []
for item in values:
name, path = item.split('=')
adapter_list.append(PromptAdapterPath(name, path))
......
# SPDX-License-Identifier: Apache-2.0
from collections.abc import Iterable
from functools import lru_cache, partial
from typing import Dict, FrozenSet, Iterable, List, Optional, Union
from typing import Optional, Union
import torch
......@@ -14,10 +15,10 @@ class AllowedTokenIdsLogitsProcessor:
specific set of token ids."""
def __init__(self, allowed_ids: Iterable[int]):
self.allowed_ids: Optional[List[int]] = list(allowed_ids)
self.allowed_ids: Optional[list[int]] = list(allowed_ids)
self.mask: Optional[torch.Tensor] = None
def __call__(self, token_ids: List[int],
def __call__(self, token_ids: list[int],
logits: torch.Tensor) -> torch.Tensor:
if self.mask is None:
self.mask = torch.ones((logits.shape[-1], ),
......@@ -31,7 +32,7 @@ class AllowedTokenIdsLogitsProcessor:
@lru_cache(maxsize=32)
def _get_allowed_token_ids_logits_processor(
allowed_token_ids: FrozenSet[int],
allowed_token_ids: frozenset[int],
vocab_size: int,
) -> LogitsProcessor:
if not allowed_token_ids:
......@@ -43,8 +44,8 @@ def _get_allowed_token_ids_logits_processor(
def logit_bias_logits_processor(
logit_bias: Dict[int, float],
token_ids: List[int],
logit_bias: dict[int, float],
token_ids: list[int],
logits: torch.Tensor,
) -> torch.Tensor:
for token_id, bias in logit_bias.items():
......@@ -53,16 +54,16 @@ def logit_bias_logits_processor(
def get_logits_processors(
logit_bias: Optional[Union[Dict[int, float], Dict[str, float]]],
allowed_token_ids: Optional[List[int]],
logit_bias: Optional[Union[dict[int, float], dict[str, float]]],
allowed_token_ids: Optional[list[int]],
tokenizer: AnyTokenizer,
) -> List[LogitsProcessor]:
logits_processors: List[LogitsProcessor] = []
) -> list[LogitsProcessor]:
logits_processors: list[LogitsProcessor] = []
if logit_bias:
try:
# Convert token_id to integer
# Clamp the bias between -100 and 100 per OpenAI API spec
clamped_logit_bias: Dict[int, float] = {
clamped_logit_bias: dict[int, float] = {
int(token_id): min(100.0, max(-100.0, bias))
for token_id, bias in logit_bias.items()
}
......
......@@ -5,13 +5,13 @@
import re
import time
from argparse import Namespace
from typing import Any, ClassVar, Dict, List, Literal, Optional, Set, Union
from typing import Annotated, Any, ClassVar, Literal, Optional, Union
import torch
from fastapi import UploadFile
from pydantic import (BaseModel, ConfigDict, Field, TypeAdapter,
ValidationInfo, field_validator, model_validator)
from typing_extensions import Annotated, TypeAlias
from typing_extensions import TypeAlias
from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
from vllm.logger import init_logger
......@@ -47,7 +47,7 @@ class OpenAIBaseModel(BaseModel):
model_config = ConfigDict(extra="allow")
# Cache class field names
field_names: ClassVar[Optional[Set[str]]] = None
field_names: ClassVar[Optional[set[str]]] = None
@model_validator(mode="wrap")
@classmethod
......@@ -105,12 +105,12 @@ class ModelCard(OpenAIBaseModel):
root: Optional[str] = None
parent: Optional[str] = None
max_model_len: Optional[int] = None
permission: List[ModelPermission] = Field(default_factory=list)
permission: list[ModelPermission] = Field(default_factory=list)
class ModelList(OpenAIBaseModel):
object: str = "list"
data: List[ModelCard] = Field(default_factory=list)
data: list[ModelCard] = Field(default_factory=list)
class PromptTokenUsageInfo(OpenAIBaseModel):
......@@ -134,7 +134,7 @@ class JsonSchemaResponseFormat(OpenAIBaseModel):
description: Optional[str] = None
# schema is the field in openai but that causes conflicts with pydantic so
# instead use json_schema with an alias
json_schema: Optional[Dict[str, Any]] = Field(default=None, alias='schema')
json_schema: Optional[dict[str, Any]] = Field(default=None, alias='schema')
strict: Optional[bool] = None
......@@ -152,7 +152,7 @@ class StreamOptions(OpenAIBaseModel):
class FunctionDefinition(OpenAIBaseModel):
name: str
description: Optional[str] = None
parameters: Optional[Dict[str, Any]] = None
parameters: Optional[dict[str, Any]] = None
class ChatCompletionToolsParam(OpenAIBaseModel):
......@@ -171,15 +171,15 @@ class ChatCompletionNamedToolChoiceParam(OpenAIBaseModel):
class LogitsProcessorConstructor(BaseModel):
qualname: str
args: Optional[List[Any]] = None
kwargs: Optional[Dict[str, Any]] = None
args: Optional[list[Any]] = None
kwargs: Optional[dict[str, Any]] = None
LogitsProcessors = List[Union[str, LogitsProcessorConstructor]]
LogitsProcessors = list[Union[str, LogitsProcessorConstructor]]
def get_logits_processors(processors: Optional[LogitsProcessors],
pattern: Optional[str]) -> Optional[List[Any]]:
pattern: Optional[str]) -> Optional[list[Any]]:
if processors and pattern:
logits_processors = []
for processor in processors:
......@@ -212,10 +212,10 @@ def get_logits_processors(processors: Optional[LogitsProcessors],
class ChatCompletionRequest(OpenAIBaseModel):
# Ordered by official OpenAI API documentation
# https://platform.openai.com/docs/api-reference/chat/create
messages: List[ChatCompletionMessageParam]
messages: list[ChatCompletionMessageParam]
model: Optional[str] = None
frequency_penalty: Optional[float] = 0.0
logit_bias: Optional[Dict[str, float]] = None
logit_bias: Optional[dict[str, float]] = None
logprobs: Optional[bool] = False
top_logprobs: Optional[int] = 0
# TODO(#9845): remove max_tokens when field is removed from OpenAI API
......@@ -228,12 +228,12 @@ class ChatCompletionRequest(OpenAIBaseModel):
presence_penalty: Optional[float] = 0.0
response_format: Optional[ResponseFormat] = None
seed: Optional[int] = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
stop: Optional[Union[str, List[str]]] = Field(default_factory=list)
stop: Optional[Union[str, list[str]]] = Field(default_factory=list)
stream: Optional[bool] = False
stream_options: Optional[StreamOptions] = None
temperature: Optional[float] = None
top_p: Optional[float] = None
tools: Optional[List[ChatCompletionToolsParam]] = None
tools: Optional[list[ChatCompletionToolsParam]] = None
tool_choice: Optional[Union[Literal["none"], Literal["auto"],
ChatCompletionNamedToolChoiceParam]] = "none"
......@@ -248,7 +248,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
min_p: Optional[float] = None
repetition_penalty: Optional[float] = None
length_penalty: float = 1.0
stop_token_ids: Optional[List[int]] = Field(default_factory=list)
stop_token_ids: Optional[list[int]] = Field(default_factory=list)
include_stop_str_in_output: bool = False
ignore_eos: bool = False
min_tokens: int = 0
......@@ -290,7 +290,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
"special tokens so this should be set to false (as is the "
"default)."),
)
documents: Optional[List[Dict[str, str]]] = Field(
documents: Optional[list[dict[str, str]]] = Field(
default=None,
description=
("A list of dicts representing documents that will be accessible to "
......@@ -307,12 +307,12 @@ class ChatCompletionRequest(OpenAIBaseModel):
"allowed, so you must provide a chat template if the tokenizer "
"does not define one."),
)
chat_template_kwargs: Optional[Dict[str, Any]] = Field(
chat_template_kwargs: Optional[dict[str, Any]] = Field(
default=None,
description=("Additional kwargs to pass to the template renderer. "
"Will be accessible by the chat template."),
)
mm_processor_kwargs: Optional[Dict[str, Any]] = Field(
mm_processor_kwargs: Optional[dict[str, Any]] = Field(
default=None,
description=("Additional kwargs to pass to the HF processor."),
)
......@@ -325,7 +325,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
description=(
"If specified, the output will follow the regex pattern."),
)
guided_choice: Optional[List[str]] = Field(
guided_choice: Optional[list[str]] = Field(
default=None,
description=(
"If specified, the output will be exactly one of the choices."),
......@@ -643,17 +643,17 @@ class CompletionRequest(OpenAIBaseModel):
# Ordered by official OpenAI API documentation
# https://platform.openai.com/docs/api-reference/completions/create
model: Optional[str] = None
prompt: Union[List[int], List[List[int]], str, List[str]]
prompt: Union[list[int], list[list[int]], str, list[str]]
best_of: Optional[int] = None
echo: Optional[bool] = False
frequency_penalty: Optional[float] = 0.0
logit_bias: Optional[Dict[str, float]] = None
logit_bias: Optional[dict[str, float]] = None
logprobs: Optional[int] = None
max_tokens: Optional[int] = 16
n: int = 1
presence_penalty: Optional[float] = 0.0
seed: Optional[int] = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
stop: Optional[Union[str, List[str]]] = Field(default_factory=list)
stop: Optional[Union[str, list[str]]] = Field(default_factory=list)
stream: Optional[bool] = False
stream_options: Optional[StreamOptions] = None
suffix: Optional[str] = None
......@@ -667,14 +667,14 @@ class CompletionRequest(OpenAIBaseModel):
min_p: Optional[float] = None
repetition_penalty: Optional[float] = None
length_penalty: float = 1.0
stop_token_ids: Optional[List[int]] = Field(default_factory=list)
stop_token_ids: Optional[list[int]] = Field(default_factory=list)
include_stop_str_in_output: bool = False
ignore_eos: bool = False
min_tokens: int = 0
skip_special_tokens: bool = True
spaces_between_special_tokens: bool = True
truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None
allowed_token_ids: Optional[List[int]] = None
allowed_token_ids: Optional[list[int]] = None
prompt_logprobs: Optional[int] = None
# doc: end-completion-sampling-params
......@@ -701,7 +701,7 @@ class CompletionRequest(OpenAIBaseModel):
description=(
"If specified, the output will follow the regex pattern."),
)
guided_choice: Optional[List[str]] = Field(
guided_choice: Optional[list[str]] = Field(
default=None,
description=(
"If specified, the output will be exactly one of the choices."),
......@@ -908,7 +908,7 @@ class EmbeddingCompletionRequest(OpenAIBaseModel):
# Ordered by official OpenAI API documentation
# https://platform.openai.com/docs/api-reference/embeddings
model: Optional[str] = None
input: Union[List[int], List[List[int]], str, List[str]]
input: Union[list[int], list[list[int]], str, list[str]]
encoding_format: Literal["float", "base64"] = "float"
dimensions: Optional[int] = None
user: Optional[str] = None
......@@ -940,7 +940,7 @@ class EmbeddingCompletionRequest(OpenAIBaseModel):
class EmbeddingChatRequest(OpenAIBaseModel):
model: Optional[str] = None
messages: List[ChatCompletionMessageParam]
messages: list[ChatCompletionMessageParam]
encoding_format: Literal["float", "base64"] = "float"
dimensions: Optional[int] = None
......@@ -969,12 +969,12 @@ class EmbeddingChatRequest(OpenAIBaseModel):
"allowed, so you must provide a chat template if the tokenizer "
"does not define one."),
)
chat_template_kwargs: Optional[Dict[str, Any]] = Field(
chat_template_kwargs: Optional[dict[str, Any]] = Field(
default=None,
description=("Additional kwargs to pass to the template renderer. "
"Will be accessible by the chat template."),
)
mm_processor_kwargs: Optional[Dict[str, Any]] = Field(
mm_processor_kwargs: Optional[dict[str, Any]] = Field(
default=None,
description=("Additional kwargs to pass to the HF processor."),
)
......@@ -1008,8 +1008,8 @@ PoolingRequest = Union[PoolingCompletionRequest, PoolingChatRequest]
class ScoreRequest(OpenAIBaseModel):
model: Optional[str] = None
text_1: Union[List[str], str]
text_2: Union[List[str], str]
text_1: Union[list[str], str]
text_2: Union[list[str], str]
truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None
# doc: begin-score-pooling-params
......@@ -1033,7 +1033,7 @@ class ScoreRequest(OpenAIBaseModel):
class RerankRequest(OpenAIBaseModel):
model: Optional[str] = None
query: str
documents: List[str]
documents: list[str]
top_n: int = Field(default_factory=lambda: 0)
truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None
......@@ -1073,14 +1073,14 @@ class RerankResponse(OpenAIBaseModel):
id: str
model: str
usage: RerankUsage
results: List[RerankResult]
results: list[RerankResult]
class CompletionLogProbs(OpenAIBaseModel):
text_offset: List[int] = Field(default_factory=list)
token_logprobs: List[Optional[float]] = Field(default_factory=list)
tokens: List[str] = Field(default_factory=list)
top_logprobs: List[Optional[Dict[str,
text_offset: list[int] = Field(default_factory=list)
token_logprobs: list[Optional[float]] = Field(default_factory=list)
tokens: list[str] = Field(default_factory=list)
top_logprobs: list[Optional[dict[str,
float]]] = Field(default_factory=list)
......@@ -1096,7 +1096,7 @@ class CompletionResponseChoice(OpenAIBaseModel):
"to stop, None if the completion finished for some other reason "
"including encountering the EOS token"),
)
prompt_logprobs: Optional[List[Optional[Dict[int, Logprob]]]] = None
prompt_logprobs: Optional[list[Optional[dict[int, Logprob]]]] = None
class CompletionResponse(OpenAIBaseModel):
......@@ -1104,7 +1104,7 @@ class CompletionResponse(OpenAIBaseModel):
object: str = "text_completion"
created: int = Field(default_factory=lambda: int(time.time()))
model: str
choices: List[CompletionResponseChoice]
choices: list[CompletionResponseChoice]
usage: UsageInfo
......@@ -1127,14 +1127,14 @@ class CompletionStreamResponse(OpenAIBaseModel):
object: str = "text_completion"
created: int = Field(default_factory=lambda: int(time.time()))
model: str
choices: List[CompletionResponseStreamChoice]
choices: list[CompletionResponseStreamChoice]
usage: Optional[UsageInfo] = Field(default=None)
class EmbeddingResponseData(OpenAIBaseModel):
index: int
object: str = "embedding"
embedding: Union[List[float], str]
embedding: Union[list[float], str]
class EmbeddingResponse(OpenAIBaseModel):
......@@ -1142,14 +1142,14 @@ class EmbeddingResponse(OpenAIBaseModel):
object: str = "list"
created: int = Field(default_factory=lambda: int(time.time()))
model: str
data: List[EmbeddingResponseData]
data: list[EmbeddingResponseData]
usage: UsageInfo
class PoolingResponseData(OpenAIBaseModel):
index: int
object: str = "pooling"
data: Union[List[List[float]], List[float], str]
data: Union[list[list[float]], list[float], str]
class PoolingResponse(OpenAIBaseModel):
......@@ -1157,7 +1157,7 @@ class PoolingResponse(OpenAIBaseModel):
object: str = "list"
created: int = Field(default_factory=lambda: int(time.time()))
model: str
data: List[PoolingResponseData]
data: list[PoolingResponseData]
usage: UsageInfo
......@@ -1172,7 +1172,7 @@ class ScoreResponse(OpenAIBaseModel):
object: str = "list"
created: int = Field(default_factory=lambda: int(time.time()))
model: str
data: List[ScoreResponseData]
data: list[ScoreResponseData]
usage: UsageInfo
......@@ -1205,7 +1205,7 @@ class ExtractedToolCallInformation(BaseModel):
tools_called: bool
# extracted tool calls
tool_calls: List[ToolCall]
tool_calls: list[ToolCall]
# content - per OpenAI spec, content AND tool calls can be returned rarely
# But some models will do this intentionally
......@@ -1216,21 +1216,21 @@ class ChatMessage(OpenAIBaseModel):
role: str
reasoning_content: Optional[str] = None
content: Optional[str] = None
tool_calls: List[ToolCall] = Field(default_factory=list)
tool_calls: list[ToolCall] = Field(default_factory=list)
class ChatCompletionLogProb(OpenAIBaseModel):
token: str
logprob: float = -9999.0
bytes: Optional[List[int]] = None
bytes: Optional[list[int]] = None
class ChatCompletionLogProbsContent(ChatCompletionLogProb):
top_logprobs: List[ChatCompletionLogProb] = Field(default_factory=list)
top_logprobs: list[ChatCompletionLogProb] = Field(default_factory=list)
class ChatCompletionLogProbs(OpenAIBaseModel):
content: Optional[List[ChatCompletionLogProbsContent]] = None
content: Optional[list[ChatCompletionLogProbsContent]] = None
class ChatCompletionResponseChoice(OpenAIBaseModel):
......@@ -1248,16 +1248,16 @@ class ChatCompletionResponse(OpenAIBaseModel):
object: Literal["chat.completion"] = "chat.completion"
created: int = Field(default_factory=lambda: int(time.time()))
model: str
choices: List[ChatCompletionResponseChoice]
choices: list[ChatCompletionResponseChoice]
usage: UsageInfo
prompt_logprobs: Optional[List[Optional[Dict[int, Logprob]]]] = None
prompt_logprobs: Optional[list[Optional[dict[int, Logprob]]]] = None
class DeltaMessage(OpenAIBaseModel):
role: Optional[str] = None
content: Optional[str] = None
reasoning_content: Optional[str] = None
tool_calls: List[DeltaToolCall] = Field(default_factory=list)
tool_calls: list[DeltaToolCall] = Field(default_factory=list)
class ChatCompletionResponseStreamChoice(OpenAIBaseModel):
......@@ -1273,7 +1273,7 @@ class ChatCompletionStreamResponse(OpenAIBaseModel):
object: Literal["chat.completion.chunk"] = "chat.completion.chunk"
created: int = Field(default_factory=lambda: int(time.time()))
model: str
choices: List[ChatCompletionResponseStreamChoice]
choices: list[ChatCompletionResponseStreamChoice]
usage: Optional[UsageInfo] = Field(default=None)
......@@ -1358,7 +1358,7 @@ class TokenizeCompletionRequest(OpenAIBaseModel):
class TokenizeChatRequest(OpenAIBaseModel):
model: Optional[str] = None
messages: List[ChatCompletionMessageParam]
messages: list[ChatCompletionMessageParam]
add_generation_prompt: bool = Field(
default=True,
......@@ -1393,12 +1393,12 @@ class TokenizeChatRequest(OpenAIBaseModel):
"allowed, so you must provide a chat template if the tokenizer "
"does not define one."),
)
chat_template_kwargs: Optional[Dict[str, Any]] = Field(
chat_template_kwargs: Optional[dict[str, Any]] = Field(
default=None,
description=("Additional kwargs to pass to the template renderer. "
"Will be accessible by the chat template."),
)
mm_processor_kwargs: Optional[Dict[str, Any]] = Field(
mm_processor_kwargs: Optional[dict[str, Any]] = Field(
default=None,
description=("Additional kwargs to pass to the HF processor."),
)
......@@ -1419,12 +1419,12 @@ TokenizeRequest = Union[TokenizeCompletionRequest, TokenizeChatRequest]
class TokenizeResponse(OpenAIBaseModel):
count: int
max_model_len: int
tokens: List[int]
tokens: list[int]
class DetokenizeRequest(OpenAIBaseModel):
model: Optional[str] = None
tokens: List[int]
tokens: list[int]
class DetokenizeResponse(OpenAIBaseModel):
......@@ -1492,7 +1492,7 @@ class TranscriptionRequest(OpenAIBaseModel):
to automatically increase the temperature until certain thresholds are hit.
"""
timestamp_granularities: List[Literal["word", "segment"]] = Field(
timestamp_granularities: list[Literal["word", "segment"]] = Field(
alias="timestamp_granularities[]", default=[])
"""The timestamp granularities to populate for this transcription.
......@@ -1580,7 +1580,7 @@ class TranscriptionSegment(OpenAIBaseModel):
text: str
"""Text content of the segment."""
tokens: List[int]
tokens: list[int]
"""Array of token IDs for the text content."""
......@@ -1594,8 +1594,8 @@ class TranscriptionResponseVerbose(OpenAIBaseModel):
text: str
"""The transcribed text."""
segments: Optional[List[TranscriptionSegment]] = None
segments: Optional[list[TranscriptionSegment]] = None
"""Segments of the transcribed text and their corresponding details."""
words: Optional[List[TranscriptionWord]] = None
words: Optional[list[TranscriptionWord]] = None
"""Extracted words and their corresponding timestamps."""
# SPDX-License-Identifier: Apache-2.0
import os
from collections.abc import Sequence
from functools import cached_property
from typing import Callable, Dict, List, Optional, Sequence, Tuple, Type, Union
from typing import Callable, Optional, Union
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
DeltaMessage)
......@@ -25,14 +26,14 @@ class ReasoningParser:
self.model_tokenizer = tokenizer
@cached_property
def vocab(self) -> Dict[str, int]:
def vocab(self) -> dict[str, int]:
# NOTE: Only PreTrainedTokenizerFast is guaranteed to have .vocab
# whereas all tokenizers have .get_vocab()
return self.model_tokenizer.get_vocab()
def extract_reasoning_content(
self, model_output: str, request: ChatCompletionRequest
) -> Tuple[Optional[str], Optional[str]]:
) -> tuple[Optional[str], Optional[str]]:
"""
Extract reasoning content from a complete model-generated string.
......@@ -47,7 +48,7 @@ class ReasoningParser:
The request object that was used to generate the model_output.
Returns:
Tuple[Optional[str], Optional[str]]
tuple[Optional[str], Optional[str]]
A tuple containing the reasoning content and the content.
"""
......@@ -77,10 +78,10 @@ class ReasoningParser:
class ReasoningParserManager:
reasoning_parsers: Dict[str, Type] = {}
reasoning_parsers: dict[str, type] = {}
@classmethod
def get_reasoning_parser(cls, name) -> Type:
def get_reasoning_parser(cls, name) -> type:
"""
Get reasoning parser by name which is registered by `register_module`.
......@@ -94,8 +95,8 @@ class ReasoningParserManager:
@classmethod
def _register_module(cls,
module: Type,
module_name: Optional[Union[str, List[str]]] = None,
module: type,
module_name: Optional[Union[str, list[str]]] = None,
force: bool = True) -> None:
if not issubclass(module, ReasoningParser):
raise TypeError("module must be subclass of ReasoningParser, "
......@@ -114,9 +115,9 @@ class ReasoningParserManager:
@classmethod
def register_module(
cls,
name: Optional[Union[str, List[str]]] = None,
name: Optional[Union[str, list[str]]] = None,
force: bool = True,
module: Union[Type, None] = None) -> Union[type, Callable]:
module: Union[type, None] = None) -> Union[type, Callable]:
"""
Register module with the given name or name list. It can be used as a
decorator (with module as None) or normal function (with module as not
......
# SPDX-License-Identifier: Apache-2.0
import re
from typing import Optional, Sequence, Tuple, Union
from collections.abc import Sequence
from typing import Optional, Union
from transformers import PreTrainedTokenizerBase
......@@ -122,7 +123,7 @@ class DeepSeekR1ReasoningParser(ReasoningParser):
def extract_reasoning_content(
self, model_output: str, request: ChatCompletionRequest
) -> Tuple[Optional[str], Optional[str]]:
) -> tuple[Optional[str], Optional[str]]:
# DeepSeek R1 doesn't generate <think> now.
# Thus we assume the reasoning content is always at the start.
......
......@@ -2,9 +2,10 @@
import asyncio
import tempfile
from collections.abc import Awaitable
from http import HTTPStatus
from io import StringIO
from typing import Awaitable, Callable, List, Optional
from typing import Callable, Optional
import aiohttp
import torch
......@@ -143,7 +144,7 @@ async def read_file(path_or_url: str) -> str:
async def write_local_file(output_path: str,
batch_outputs: List[BatchRequestOutput]) -> None:
batch_outputs: list[BatchRequestOutput]) -> None:
"""
Write the responses to a local file.
output_path: The path to write the responses to.
......@@ -204,7 +205,7 @@ async def upload_data(output_url: str, data_or_file: str,
f"Error message: {str(e)}.") from e
async def write_file(path_or_url: str, batch_outputs: List[BatchRequestOutput],
async def write_file(path_or_url: str, batch_outputs: list[BatchRequestOutput],
output_tmp_dir: str) -> None:
"""
Write batch_outputs to a file or upload to a URL.
......@@ -353,7 +354,7 @@ async def main(args):
logger.info("Reading batch from %s...", args.input_file)
# Submit all requests in the file to the engine "concurrently".
response_futures: List[Awaitable[BatchRequestOutput]] = []
response_futures: list[Awaitable[BatchRequestOutput]] = []
for request_json in (await read_file(args.input_file)).strip().split("\n"):
# Skip empty lines.
request_json = request_json.strip()
......
......@@ -3,10 +3,9 @@
import asyncio
import json
import time
from typing import (AsyncGenerator, AsyncIterator, Callable, Dict, Final, List,
Optional)
from typing import Sequence as GenericSequence
from typing import Union
from collections.abc import AsyncGenerator, AsyncIterator
from collections.abc import Sequence as GenericSequence
from typing import Callable, Final, Optional, Union
from fastapi import Request
......@@ -205,7 +204,7 @@ class OpenAIServingChat(OpenAIServing):
raw_request.state.request_metadata = request_metadata
# Schedule the request and get the result generator.
generators: List[AsyncGenerator[RequestOutput, None]] = []
generators: list[AsyncGenerator[RequestOutput, None]] = []
try:
for i, engine_prompt in enumerate(engine_prompts):
sampling_params: Union[SamplingParams, BeamSearchParams]
......@@ -282,7 +281,7 @@ class OpenAIServingChat(OpenAIServing):
result_generator: AsyncIterator[RequestOutput],
request_id: str,
model_name: str,
conversation: List[ConversationMessage],
conversation: list[ConversationMessage],
tokenizer: AnyTokenizer,
request_metadata: RequestResponseMetadata,
) -> AsyncGenerator[str, None]:
......@@ -310,7 +309,7 @@ class OpenAIServingChat(OpenAIServing):
should_stream_with_reasoning_parsing = (
self._should_stream_with_reasoning_parsing(request))
all_previous_token_ids: Optional[List[List[int]]]
all_previous_token_ids: Optional[list[list[int]]]
# Only one of these will be used, thus previous_texts and
# all_previous_token_ids will not be used twice in the same iteration.
......@@ -339,7 +338,7 @@ class OpenAIServingChat(OpenAIServing):
# Prepare the tool parser if it's needed
try:
if tool_choice_auto and self.tool_parser:
tool_parsers: List[Optional[ToolParser]] = [
tool_parsers: list[Optional[ToolParser]] = [
self.tool_parser(tokenizer)
] * num_choices
else:
......@@ -406,7 +405,7 @@ class OpenAIServingChat(OpenAIServing):
# Send response to echo the input portion of the
# last message
if request.echo:
last_msg_content: Union[str, List[Dict[str, str]]] = ""
last_msg_content: Union[str, list[dict[str, str]]] = ""
if conversation and "content" in conversation[
-1] and conversation[-1].get("role") == role:
last_msg_content = conversation[-1]["content"] or ""
......@@ -674,7 +673,7 @@ class OpenAIServingChat(OpenAIServing):
result_generator: AsyncIterator[RequestOutput],
request_id: str,
model_name: str,
conversation: List[ConversationMessage],
conversation: list[ConversationMessage],
tokenizer: AnyTokenizer,
request_metadata: RequestResponseMetadata,
) -> Union[ErrorResponse, ChatCompletionResponse]:
......@@ -693,7 +692,7 @@ class OpenAIServingChat(OpenAIServing):
assert final_res is not None
choices: List[ChatCompletionResponseChoice] = []
choices: list[ChatCompletionResponseChoice] = []
role = self.get_chat_request_role(request)
for output in final_res.outputs:
......@@ -812,7 +811,7 @@ class OpenAIServingChat(OpenAIServing):
choices.append(choice_data)
if request.echo:
last_msg_content: Union[str, List[Dict[str, str]]] = ""
last_msg_content: Union[str, list[dict[str, str]]] = ""
if conversation and "content" in conversation[-1] and conversation[
-1].get("role") == role:
last_msg_content = conversation[-1]["content"] or ""
......@@ -853,8 +852,8 @@ class OpenAIServingChat(OpenAIServing):
return response
def _get_top_logprobs(
self, logprobs: Dict[int, Logprob], top_logprobs: Optional[int],
tokenizer: AnyTokenizer) -> List[ChatCompletionLogProb]:
self, logprobs: dict[int, Logprob], top_logprobs: Optional[int],
tokenizer: AnyTokenizer) -> list[ChatCompletionLogProb]:
return [
ChatCompletionLogProb(token=(token := self._get_decoded_token(
p[1],
......@@ -871,12 +870,12 @@ class OpenAIServingChat(OpenAIServing):
def _create_chat_logprobs(
self,
token_ids: GenericSequence[int],
top_logprobs: GenericSequence[Optional[Dict[int, Logprob]]],
top_logprobs: GenericSequence[Optional[dict[int, Logprob]]],
tokenizer: AnyTokenizer,
num_output_top_logprobs: Optional[int] = None,
) -> ChatCompletionLogProbs:
"""Create OpenAI-style logprobs."""
logprobs_content: List[ChatCompletionLogProbsContent] = []
logprobs_content: list[ChatCompletionLogProbsContent] = []
for i, token_id in enumerate(token_ids):
step_top_logprobs = top_logprobs[i]
......
......@@ -2,9 +2,9 @@
import asyncio
import time
from typing import AsyncGenerator, AsyncIterator, Dict, List, Optional
from typing import Sequence as GenericSequence
from typing import Tuple, Union, cast
from collections.abc import AsyncGenerator, AsyncIterator
from collections.abc import Sequence as GenericSequence
from typing import Optional, Union, cast
from fastapi import Request
......@@ -113,7 +113,7 @@ class OpenAIServingCompletion(OpenAIServing):
return self.create_error_response(str(e))
# Schedule the request and get the result generator.
generators: List[AsyncGenerator[RequestOutput, None]] = []
generators: list[AsyncGenerator[RequestOutput, None]] = []
try:
for i, engine_prompt in enumerate(engine_prompts):
sampling_params: Union[SamplingParams, BeamSearchParams]
......@@ -189,7 +189,7 @@ class OpenAIServingCompletion(OpenAIServing):
request_metadata=request_metadata)
# Non-streaming response
final_res_batch: List[Optional[RequestOutput]] = [None] * num_prompts
final_res_batch: list[Optional[RequestOutput]] = [None] * num_prompts
try:
async for i, res in result_generator:
final_res_batch[i] = res
......@@ -203,7 +203,7 @@ class OpenAIServingCompletion(OpenAIServing):
if final_res.prompt is None:
final_res.prompt = request_prompts[i]["prompt"]
final_res_batch_checked = cast(List[RequestOutput],
final_res_batch_checked = cast(list[RequestOutput],
final_res_batch)
response = self.request_output_to_completion_response(
......@@ -237,7 +237,7 @@ class OpenAIServingCompletion(OpenAIServing):
async def completion_stream_generator(
self,
request: CompletionRequest,
result_generator: AsyncIterator[Tuple[int, RequestOutput]],
result_generator: AsyncIterator[tuple[int, RequestOutput]],
request_id: str,
created_time: int,
model_name: str,
......@@ -270,7 +270,7 @@ class OpenAIServingCompletion(OpenAIServing):
num_prompt_tokens[prompt_idx] = len(res.prompt_token_ids)
delta_token_ids: GenericSequence[int]
out_logprobs: Optional[GenericSequence[Optional[Dict[
out_logprobs: Optional[GenericSequence[Optional[dict[
int, Logprob]]]]
for output in res.outputs:
......@@ -381,7 +381,7 @@ class OpenAIServingCompletion(OpenAIServing):
def request_output_to_completion_response(
self,
final_res_batch: List[RequestOutput],
final_res_batch: list[RequestOutput],
request: CompletionRequest,
request_id: str,
created_time: int,
......@@ -389,7 +389,7 @@ class OpenAIServingCompletion(OpenAIServing):
tokenizer: AnyTokenizer,
request_metadata: RequestResponseMetadata,
) -> CompletionResponse:
choices: List[CompletionResponseChoice] = []
choices: list[CompletionResponseChoice] = []
num_prompt_tokens = 0
num_generated_tokens = 0
......@@ -406,7 +406,7 @@ class OpenAIServingCompletion(OpenAIServing):
prompt_text = final_res.prompt
token_ids: GenericSequence[int]
out_logprobs: Optional[GenericSequence[Optional[Dict[int,
out_logprobs: Optional[GenericSequence[Optional[dict[int,
Logprob]]]]
for output in final_res.outputs:
......@@ -480,16 +480,16 @@ class OpenAIServingCompletion(OpenAIServing):
def _create_completion_logprobs(
self,
token_ids: GenericSequence[int],
top_logprobs: GenericSequence[Optional[Dict[int, Logprob]]],
top_logprobs: GenericSequence[Optional[dict[int, Logprob]]],
num_output_top_logprobs: int,
tokenizer: AnyTokenizer,
initial_text_offset: int = 0,
) -> CompletionLogProbs:
"""Create logprobs for OpenAI Completion API."""
out_text_offset: List[int] = []
out_token_logprobs: List[Optional[float]] = []
out_tokens: List[str] = []
out_top_logprobs: List[Optional[Dict[str, float]]] = []
out_text_offset: list[int] = []
out_token_logprobs: list[Optional[float]] = []
out_tokens: list[str] = []
out_top_logprobs: list[Optional[dict[str, float]]] = []
last_token_len = 0
......
......@@ -3,7 +3,8 @@
import asyncio
import base64
import time
from typing import AsyncGenerator, Final, List, Literal, Optional, Union, cast
from collections.abc import AsyncGenerator
from typing import Final, Literal, Optional, Union, cast
import numpy as np
from fastapi import Request
......@@ -31,7 +32,7 @@ logger = init_logger(__name__)
def _get_embedding(
output: EmbeddingOutput,
encoding_format: Literal["float", "base64"],
) -> Union[List[float], str]:
) -> Union[list[float], str]:
if encoding_format == "float":
return output.embedding
elif encoding_format == "base64":
......@@ -143,7 +144,7 @@ class OpenAIServingEmbedding(OpenAIServing):
return self.create_error_response(str(e))
# Schedule the request and get the result generator.
generators: List[AsyncGenerator[PoolingRequestOutput, None]] = []
generators: list[AsyncGenerator[PoolingRequestOutput, None]] = []
try:
pooling_params = request.to_pooling_params()
......@@ -178,7 +179,7 @@ class OpenAIServingEmbedding(OpenAIServing):
num_prompts = len(engine_prompts)
# Non-streaming response
final_res_batch: List[Optional[PoolingRequestOutput]]
final_res_batch: list[Optional[PoolingRequestOutput]]
final_res_batch = [None] * num_prompts
try:
async for i, res in result_generator:
......@@ -186,7 +187,7 @@ class OpenAIServingEmbedding(OpenAIServing):
assert all(final_res is not None for final_res in final_res_batch)
final_res_batch_checked = cast(List[PoolingRequestOutput],
final_res_batch_checked = cast(list[PoolingRequestOutput],
final_res_batch)
response = self.request_output_to_embedding_response(
......@@ -206,13 +207,13 @@ class OpenAIServingEmbedding(OpenAIServing):
def request_output_to_embedding_response(
self,
final_res_batch: List[PoolingRequestOutput],
final_res_batch: list[PoolingRequestOutput],
request_id: str,
created_time: int,
model_name: str,
encoding_format: Literal["float", "base64"],
) -> EmbeddingResponse:
items: List[EmbeddingResponseData] = []
items: list[EmbeddingResponseData] = []
num_prompt_tokens = 0
for idx, final_res in enumerate(final_res_batch):
......
# SPDX-License-Identifier: Apache-2.0
import json
from collections.abc import Iterable, Iterator, Mapping, Sequence
from concurrent.futures.thread import ThreadPoolExecutor
from http import HTTPStatus
from typing import (Any, Callable, Dict, Iterable, Iterator, List, Mapping,
Optional, Sequence, Tuple, TypedDict, Union)
from typing import Annotated, Any, Callable, Optional, TypedDict, Union
from fastapi import Request
from pydantic import Field
from starlette.datastructures import Headers
from typing_extensions import Annotated
from vllm.config import ModelConfig
from vllm.engine.protocol import EngineClient
......@@ -64,10 +63,10 @@ AnyRequest = Union[CompletionLikeRequest, ChatLikeRequest,
class TextTokensPrompt(TypedDict):
prompt: str
prompt_token_ids: List[int]
prompt_token_ids: list[int]
RequestPrompt = Union[List[int], str, TextTokensPrompt]
RequestPrompt = Union[list[int], str, TextTokensPrompt]
class OpenAIServing:
......@@ -144,7 +143,7 @@ class OpenAIServing:
def _maybe_get_adapters(
self, request: AnyRequest
) -> Union[Tuple[None, None], Tuple[LoRARequest, None], Tuple[
) -> Union[tuple[None, None], tuple[LoRARequest, None], tuple[
None, PromptAdapterRequest]]:
if self._is_model_supported(request.model):
return None, None
......@@ -188,7 +187,7 @@ class OpenAIServing:
self,
request: AnyRequest,
tokenizer: AnyTokenizer,
prompt_ids: List[int],
prompt_ids: list[int],
truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]],
) -> TextTokensPrompt:
if truncate_prompt_tokens is None:
......@@ -203,7 +202,7 @@ class OpenAIServing:
def _validate_input(
self,
request: AnyRequest,
input_ids: List[int],
input_ids: list[int],
input_text: str,
) -> TextTokensPrompt:
token_num = len(input_ids)
......@@ -259,7 +258,7 @@ class OpenAIServing:
self,
request: AnyRequest,
tokenizer: AnyTokenizer,
prompt_input: Union[str, List[int]],
prompt_input: Union[str, list[int]],
truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None,
add_special_tokens: bool = True,
) -> TextTokensPrompt:
......@@ -280,7 +279,7 @@ class OpenAIServing:
self,
request: AnyRequest,
tokenizer: AnyTokenizer,
prompt_inputs: Iterable[Union[str, List[int]]],
prompt_inputs: Iterable[Union[str, list[int]]],
truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None,
add_special_tokens: bool = True,
) -> Iterator[TextTokensPrompt]:
......@@ -309,10 +308,10 @@ class OpenAIServing:
self,
request: AnyRequest,
tokenizer: AnyTokenizer,
input_or_inputs: Union[str, List[str], List[int], List[List[int]]],
input_or_inputs: Union[str, list[str], list[int], list[list[int]]],
truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None,
add_special_tokens: bool = True,
) -> List[TextTokensPrompt]:
) -> list[TextTokensPrompt]:
"""
Tokenize/detokenize depending on the input format.
......@@ -344,10 +343,10 @@ class OpenAIServing:
self,
request: CompletionLikeRequest,
tokenizer: AnyTokenizer,
input_or_inputs: Union[str, List[str], List[int], List[List[int]]],
input_or_inputs: Union[str, list[str], list[int], list[list[int]]],
truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None,
add_special_tokens: bool = True,
) -> Tuple[List[TextTokensPrompt], List[TokensPrompt]]:
) -> tuple[list[TextTokensPrompt], list[TokensPrompt]]:
request_prompts = await self._tokenize_prompt_input_or_inputs_async(
request,
tokenizer,
......@@ -367,19 +366,19 @@ class OpenAIServing:
self,
request: ChatLikeRequest,
tokenizer: AnyTokenizer,
messages: List[ChatCompletionMessageParam],
messages: list[ChatCompletionMessageParam],
chat_template: Optional[str],
chat_template_content_format: ChatTemplateContentFormatOption,
add_generation_prompt: bool = True,
continue_final_message: bool = False,
tool_dicts: Optional[List[Dict[str, Any]]] = None,
documents: Optional[List[Dict[str, str]]] = None,
chat_template_kwargs: Optional[Dict[str, Any]] = None,
tool_dicts: Optional[list[dict[str, Any]]] = None,
documents: Optional[list[dict[str, str]]] = None,
chat_template_kwargs: Optional[dict[str, Any]] = None,
tool_parser: Optional[Callable[[AnyTokenizer], ToolParser]] = None,
truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None,
add_special_tokens: bool = False,
) -> Tuple[List[ConversationMessage], Sequence[RequestPrompt],
List[TokensPrompt]]:
) -> tuple[list[ConversationMessage], Sequence[RequestPrompt],
list[TokensPrompt]]:
resolved_content_format = resolve_chat_template_content_format(
chat_template,
chat_template_content_format,
......@@ -392,7 +391,7 @@ class OpenAIServing:
content_format=resolved_content_format,
)
_chat_template_kwargs: Dict[str, Any] = dict(
_chat_template_kwargs: dict[str, Any] = dict(
chat_template=chat_template,
add_generation_prompt=add_generation_prompt,
continue_final_message=continue_final_message,
......@@ -401,7 +400,7 @@ class OpenAIServing:
)
_chat_template_kwargs.update(chat_template_kwargs or {})
request_prompt: Union[str, List[int]]
request_prompt: Union[str, list[int]]
if isinstance(tokenizer, MistralTokenizer):
request_prompt = apply_mistral_chat_template(
tokenizer,
......
......@@ -4,7 +4,7 @@ import json
import pathlib
from dataclasses import dataclass
from http import HTTPStatus
from typing import List, Optional, Union
from typing import Optional, Union
from vllm.config import ModelConfig
from vllm.engine.protocol import EngineClient
......@@ -53,10 +53,10 @@ class OpenAIServingModels:
self,
engine_client: EngineClient,
model_config: ModelConfig,
base_model_paths: List[BaseModelPath],
base_model_paths: list[BaseModelPath],
*,
lora_modules: Optional[List[LoRAModulePath]] = None,
prompt_adapters: Optional[List[PromptAdapterPath]] = None,
lora_modules: Optional[list[LoRAModulePath]] = None,
prompt_adapters: Optional[list[PromptAdapterPath]] = None,
):
super().__init__()
......@@ -65,7 +65,7 @@ class OpenAIServingModels:
self.engine_client = engine_client
self.static_lora_modules = lora_modules
self.lora_requests: List[LoRARequest] = []
self.lora_requests: list[LoRARequest] = []
self.lora_id_counter = AtomicCounter(0)
self.prompt_adapter_requests = []
......
......@@ -3,7 +3,8 @@
import asyncio
import base64
import time
from typing import AsyncGenerator, Final, List, Literal, Optional, Union, cast
from collections.abc import AsyncGenerator
from typing import Final, Literal, Optional, Union, cast
import numpy as np
from fastapi import Request
......@@ -29,7 +30,7 @@ logger = init_logger(__name__)
def _get_data(
output: PoolingOutput,
encoding_format: Literal["float", "base64"],
) -> Union[List[float], str]:
) -> Union[list[float], str]:
if encoding_format == "float":
return output.data.tolist()
elif encoding_format == "base64":
......@@ -139,7 +140,7 @@ class OpenAIServingPooling(OpenAIServing):
return self.create_error_response(str(e))
# Schedule the request and get the result generator.
generators: List[AsyncGenerator[PoolingRequestOutput, None]] = []
generators: list[AsyncGenerator[PoolingRequestOutput, None]] = []
try:
pooling_params = request.to_pooling_params()
......@@ -174,7 +175,7 @@ class OpenAIServingPooling(OpenAIServing):
num_prompts = len(engine_prompts)
# Non-streaming response
final_res_batch: List[Optional[PoolingRequestOutput]]
final_res_batch: list[Optional[PoolingRequestOutput]]
final_res_batch = [None] * num_prompts
try:
async for i, res in result_generator:
......@@ -182,7 +183,7 @@ class OpenAIServingPooling(OpenAIServing):
assert all(final_res is not None for final_res in final_res_batch)
final_res_batch_checked = cast(List[PoolingRequestOutput],
final_res_batch_checked = cast(list[PoolingRequestOutput],
final_res_batch)
response = self.request_output_to_pooling_response(
......@@ -202,13 +203,13 @@ class OpenAIServingPooling(OpenAIServing):
def request_output_to_pooling_response(
self,
final_res_batch: List[PoolingRequestOutput],
final_res_batch: list[PoolingRequestOutput],
request_id: str,
created_time: int,
model_name: str,
encoding_format: Literal["float", "base64"],
) -> PoolingResponse:
items: List[PoolingResponseData] = []
items: list[PoolingResponseData] = []
num_prompt_tokens = 0
for idx, final_res in enumerate(final_res_batch):
......
# SPDX-License-Identifier: Apache-2.0
import asyncio
import time
from typing import Any, AsyncGenerator, Dict, List, Mapping, Optional, Union
from collections.abc import AsyncGenerator, Mapping
from typing import Any, Optional, Union
from fastapi import Request
......@@ -48,8 +49,8 @@ class ServingScores(OpenAIServing):
async def _embedding_score(
self,
tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
texts_1: List[str],
texts_2: List[str],
texts_1: list[str],
texts_2: list[str],
request: Union[RerankRequest, ScoreRequest],
request_id=str,
tokenization_kwargs: Optional[dict[str, Any]] = None,
......@@ -57,11 +58,11 @@ class ServingScores(OpenAIServing):
prompt_adapter_request: Optional[Union[PromptAdapterRequest,
None]] = None,
trace_headers: Optional[Mapping[str, str]] = None,
) -> List[PoolingRequestOutput]:
) -> list[PoolingRequestOutput]:
input_texts = texts_1 + texts_2
engine_prompts: List[TokensPrompt] = []
engine_prompts: list[TokensPrompt] = []
tokenize_async = make_async(tokenizer.__call__,
executor=self._tokenizer_executor)
......@@ -82,7 +83,7 @@ class ServingScores(OpenAIServing):
prompt_token_ids=text_token_prompt["prompt_token_ids"]))
# Schedule the request and get the result generator.
generators: List[AsyncGenerator[PoolingRequestOutput, None]] = []
generators: list[AsyncGenerator[PoolingRequestOutput, None]] = []
pooling_params = request.to_pooling_params()
for i, engine_prompt in enumerate(engine_prompts):
......@@ -108,16 +109,16 @@ class ServingScores(OpenAIServing):
result_generator = merge_async_iterators(*generators)
# Non-streaming response
final_res_batch: List[PoolingRequestOutput] = []
final_res_batch: list[PoolingRequestOutput] = []
embeddings: List[Optional[PoolingRequestOutput]] =\
embeddings: list[Optional[PoolingRequestOutput]] =\
[None] * len(engine_prompts)
async for i, res in result_generator:
embeddings[i] = res
emb_texts_1: List[PoolingRequestOutput] = []
emb_texts_2: List[PoolingRequestOutput] = []
emb_texts_1: list[PoolingRequestOutput] = []
emb_texts_2: list[PoolingRequestOutput] = []
for i in range(0, len(texts_1)):
assert (emb := embeddings[i]) is not None
......@@ -139,8 +140,8 @@ class ServingScores(OpenAIServing):
async def _cross_encoding_score(
self,
tokenizer: Union[AnyTokenizer],
texts_1: List[str],
texts_2: List[str],
texts_1: list[str],
texts_2: list[str],
request: Union[RerankRequest, ScoreRequest],
request_id=str,
tokenization_kwargs: Optional[dict[str, Any]] = None,
......@@ -148,10 +149,10 @@ class ServingScores(OpenAIServing):
prompt_adapter_request: Optional[Union[PromptAdapterRequest,
None]] = None,
trace_headers: Optional[Mapping[str, str]] = None,
) -> List[PoolingRequestOutput]:
) -> list[PoolingRequestOutput]:
request_prompts: List[str] = []
engine_prompts: List[TokensPrompt] = []
request_prompts: list[str] = []
engine_prompts: list[TokensPrompt] = []
if len(texts_1) == 1:
texts_1 = texts_1 * len(texts_2)
......@@ -185,7 +186,7 @@ class ServingScores(OpenAIServing):
engine_prompts.append(engine_prompt)
# Schedule the request and get the result generator.
generators: List[AsyncGenerator[PoolingRequestOutput, None]] = []
generators: list[AsyncGenerator[PoolingRequestOutput, None]] = []
pooling_params = request.to_pooling_params()
......@@ -212,7 +213,7 @@ class ServingScores(OpenAIServing):
result_generator = merge_async_iterators(*generators)
# Non-streaming response
final_res_batch: List[
final_res_batch: list[
Optional[PoolingRequestOutput]] = [None] * len(engine_prompts)
async for i, res in result_generator:
......@@ -228,9 +229,9 @@ class ServingScores(OpenAIServing):
request_id: str,
raw_request: Optional[Request] = None,
truncate_prompt_tokens: Optional[int] = None,
) -> List[PoolingRequestOutput]:
) -> list[PoolingRequestOutput]:
tokenization_kwargs: Dict[str, Any] = {}
tokenization_kwargs: dict[str, Any] = {}
if truncate_prompt_tokens is not None:
tokenization_kwargs["truncation"] = True
tokenization_kwargs["max_length"] = truncate_prompt_tokens
......@@ -372,12 +373,12 @@ class ServingScores(OpenAIServing):
def request_output_to_score_response(
self,
final_res_batch: List[PoolingRequestOutput],
final_res_batch: list[PoolingRequestOutput],
request_id: str,
created_time: int,
model_name: str,
) -> ScoreResponse:
items: List[ScoreResponseData] = []
items: list[ScoreResponseData] = []
num_prompt_tokens = 0
for idx, final_res in enumerate(final_res_batch):
......@@ -406,13 +407,13 @@ class ServingScores(OpenAIServing):
)
def request_output_to_rerank_response(
self, final_res_batch: List[PoolingRequestOutput], request_id: str,
model_name: str, documents: List[str],
self, final_res_batch: list[PoolingRequestOutput], request_id: str,
model_name: str, documents: list[str],
top_n: int) -> RerankResponse:
"""
Convert the output of do_rank to a RerankResponse
"""
results: List[RerankResult] = []
results: list[RerankResult] = []
num_prompt_tokens = 0
for idx, final_res in enumerate(final_res_batch):
classify_res = ScoringRequestOutput.from_base(final_res)
......
# SPDX-License-Identifier: Apache-2.0
from typing import Final, List, Optional, Union
from typing import Final, Optional, Union
from fastapi import Request
......@@ -92,7 +92,7 @@ class OpenAIServingTokenization(OpenAIServing):
logger.exception("Error in preprocessing prompt inputs")
return self.create_error_response(str(e))
input_ids: List[int] = []
input_ids: list[int] = []
for i, engine_prompt in enumerate(engine_prompts):
self._log_inputs(request_id,
request_prompts[i],
......
# SPDX-License-Identifier: Apache-2.0
import asyncio
import io
from typing import AsyncGenerator, Optional, Union, cast
from collections.abc import AsyncGenerator
from typing import Optional, Union, cast
from fastapi import Request
......
# SPDX-License-Identifier: Apache-2.0
import os
from collections.abc import Sequence
from functools import cached_property
from typing import Callable, Dict, List, Optional, Sequence, Type, Union
from typing import Callable, Optional, Union
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
DeltaMessage,
......@@ -22,16 +23,16 @@ class ToolParser:
"""
def __init__(self, tokenizer: AnyTokenizer):
self.prev_tool_call_arr: List[Dict] = []
self.prev_tool_call_arr: list[dict] = []
# the index of the tool call that is currently being parsed
self.current_tool_id: int = -1
self.current_tool_name_sent: bool = False
self.streamed_args_for_tool: List[str] = []
self.streamed_args_for_tool: list[str] = []
self.model_tokenizer = tokenizer
@cached_property
def vocab(self) -> Dict[str, int]:
def vocab(self) -> dict[str, int]:
# NOTE: Only PreTrainedTokenizerFast is guaranteed to have .vocab
# whereas all tokenizers have .get_vocab()
return self.model_tokenizer.get_vocab()
......@@ -79,10 +80,10 @@ class ToolParser:
class ToolParserManager:
tool_parsers: Dict[str, Type] = {}
tool_parsers: dict[str, type] = {}
@classmethod
def get_tool_parser(cls, name) -> Type:
def get_tool_parser(cls, name) -> type:
"""
Get tool parser by name which is registered by `register_module`.
......@@ -95,8 +96,8 @@ class ToolParserManager:
@classmethod
def _register_module(cls,
module: Type,
module_name: Optional[Union[str, List[str]]] = None,
module: type,
module_name: Optional[Union[str, list[str]]] = None,
force: bool = True) -> None:
if not issubclass(module, ToolParser):
raise TypeError(
......@@ -116,9 +117,9 @@ class ToolParserManager:
@classmethod
def register_module(
cls,
name: Optional[Union[str, List[str]]] = None,
name: Optional[Union[str, list[str]]] = None,
force: bool = True,
module: Union[Type, None] = None) -> Union[type, Callable]:
module: Union[type, None] = None) -> Union[type, Callable]:
"""
Register module with the given name or name list. it can be used as a
decoder(with module as None) or normal function(with module as not
......
......@@ -2,8 +2,9 @@
import json
import re
from collections.abc import Sequence
from json import JSONDecoder
from typing import Dict, Sequence, Union
from typing import Union
import partial_json_parser
from partial_json_parser.core.options import Allow
......@@ -145,7 +146,7 @@ class Granite20bFCToolParser(ToolParser):
return None
# select as the current tool call the one we're on the state at
current_tool_call: Dict = tool_call_arr[self.current_tool_id] \
current_tool_call: dict = tool_call_arr[self.current_tool_id] \
if len(tool_call_arr) > 0 else {}
# case -- if no tokens have been streamed for the tool, e.g.
......
# SPDX-License-Identifier: Apache-2.0
import json
from typing import Dict, Sequence, Union
from collections.abc import Sequence
from typing import Union
import partial_json_parser
from partial_json_parser.core.options import Allow
......@@ -136,7 +137,7 @@ class GraniteToolParser(ToolParser):
return None
# select as the current tool call the one we're on the state at
current_tool_call: Dict = tool_call_arr[self.current_tool_id]
current_tool_call: dict = tool_call_arr[self.current_tool_id]
delta = None
# case: we are starting a new tool in the array
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment