"tests/vscode:/vscode.git/clone" did not exist on "a4c29e6e823dc2a4b8362ad29d8e7b0b7693dc5f"
Unverified commit f399182e, authored by Chenheli Hua and committed by GitHub
Browse files

Run ruff format on a few files. (#24075)


Signed-off-by: Chenheli Hua <huachenheli@outlook.com>
parent 1c413105
This diff is collapsed.
This diff is collapsed.
...@@ -82,16 +82,26 @@ from vllm.utils import (AsyncMicrobatchTokenizer, is_list_of, ...@@ -82,16 +82,26 @@ from vllm.utils import (AsyncMicrobatchTokenizer, is_list_of,
logger = init_logger(__name__) logger = init_logger(__name__)
CompletionLikeRequest = Union[CompletionRequest, DetokenizeRequest, CompletionLikeRequest = Union[
EmbeddingCompletionRequest, RerankRequest, CompletionRequest,
ClassificationRequest, ScoreRequest, DetokenizeRequest,
TokenizeCompletionRequest] EmbeddingCompletionRequest,
RerankRequest,
ClassificationRequest,
ScoreRequest,
TokenizeCompletionRequest,
]
ChatLikeRequest = Union[ChatCompletionRequest, EmbeddingChatRequest, ChatLikeRequest = Union[ChatCompletionRequest, EmbeddingChatRequest,
TokenizeChatRequest] TokenizeChatRequest]
SpeechToTextRequest = Union[TranscriptionRequest, TranslationRequest] SpeechToTextRequest = Union[TranscriptionRequest, TranslationRequest]
AnyRequest = Union[CompletionLikeRequest, ChatLikeRequest, SpeechToTextRequest, AnyRequest = Union[
ResponsesRequest, IOProcessorRequest] CompletionLikeRequest,
ChatLikeRequest,
SpeechToTextRequest,
ResponsesRequest,
IOProcessorRequest,
]
AnyResponse = Union[ AnyResponse = Union[
CompletionResponse, CompletionResponse,
...@@ -135,6 +145,7 @@ class RequestProcessingMixin(BaseModel): ...@@ -135,6 +145,7 @@ class RequestProcessingMixin(BaseModel):
Mixin for request processing, Mixin for request processing,
handling prompt preparation and engine input. handling prompt preparation and engine input.
""" """
request_prompts: Optional[Sequence[RequestPrompt]] = [] request_prompts: Optional[Sequence[RequestPrompt]] = []
engine_prompts: Optional[Union[list[EngineTokensPrompt], engine_prompts: Optional[Union[list[EngineTokensPrompt],
list[EngineEmbedsPrompt]]] = [] list[EngineEmbedsPrompt]]] = []
...@@ -147,6 +158,7 @@ class ResponseGenerationMixin(BaseModel): ...@@ -147,6 +158,7 @@ class ResponseGenerationMixin(BaseModel):
Mixin for response generation, Mixin for response generation,
managing result generators and final batch results. managing result generators and final batch results.
""" """
result_generator: Optional[AsyncGenerator[tuple[int, Union[ result_generator: Optional[AsyncGenerator[tuple[int, Union[
RequestOutput, PoolingRequestOutput]], None]] = None RequestOutput, PoolingRequestOutput]], None]] = None
final_res_batch: list[Union[RequestOutput, PoolingRequestOutput]] = Field( final_res_batch: list[Union[RequestOutput, PoolingRequestOutput]] = Field(
...@@ -155,8 +167,12 @@ class ResponseGenerationMixin(BaseModel): ...@@ -155,8 +167,12 @@ class ResponseGenerationMixin(BaseModel):
model_config = ConfigDict(arbitrary_types_allowed=True) model_config = ConfigDict(arbitrary_types_allowed=True)
class ServeContext(RequestProcessingMixin, ResponseGenerationMixin, BaseModel, class ServeContext(
Generic[RequestT]): RequestProcessingMixin,
ResponseGenerationMixin,
BaseModel,
Generic[RequestT],
):
# Shared across all requests # Shared across all requests
request: RequestT request: RequestT
raw_request: Optional[Request] = None raw_request: Optional[Request] = None
...@@ -298,8 +314,8 @@ class OpenAIServing: ...@@ -298,8 +314,8 @@ class OpenAIServing:
truncate_prompt_tokens = getattr(ctx.request, "truncate_prompt_tokens", truncate_prompt_tokens = getattr(ctx.request, "truncate_prompt_tokens",
None) None)
if truncate_prompt_tokens is not None and \ if (truncate_prompt_tokens is not None
truncate_prompt_tokens > self.max_model_len: and truncate_prompt_tokens > self.max_model_len):
return self.create_error_response( return self.create_error_response(
"truncate_prompt_tokens value is " "truncate_prompt_tokens value is "
"greater than max_model_len." "greater than max_model_len."
...@@ -344,10 +360,12 @@ class OpenAIServing: ...@@ -344,10 +360,12 @@ class OpenAIServing:
return self.create_error_response( return self.create_error_response(
"Request prompts not available") "Request prompts not available")
self._log_inputs(request_id_item, self._log_inputs(
request_id_item,
ctx.request_prompts[i], ctx.request_prompts[i],
params=pooling_params, params=pooling_params,
lora_request=ctx.lora_request) lora_request=ctx.lora_request,
)
# Mypy has an existing bug related to inferring the variance of # Mypy has an existing bug related to inferring the variance of
# TypedDicts with `builtins.enumerate`: # TypedDicts with `builtins.enumerate`:
...@@ -413,7 +431,8 @@ class OpenAIServing: ...@@ -413,7 +431,8 @@ class OpenAIServing:
self, self,
message: str, message: str,
err_type: str = "BadRequestError", err_type: str = "BadRequestError",
status_code: HTTPStatus = HTTPStatus.BAD_REQUEST) -> ErrorResponse: status_code: HTTPStatus = HTTPStatus.BAD_REQUEST,
) -> ErrorResponse:
if self.log_error_stack: if self.log_error_stack:
exc_type, _, _ = sys.exc_info() exc_type, _, _ = sys.exc_info()
if exc_type is not None: if exc_type is not None:
...@@ -427,7 +446,8 @@ class OpenAIServing: ...@@ -427,7 +446,8 @@ class OpenAIServing:
self, self,
message: str, message: str,
err_type: str = "BadRequestError", err_type: str = "BadRequestError",
status_code: HTTPStatus = HTTPStatus.BAD_REQUEST) -> str: status_code: HTTPStatus = HTTPStatus.BAD_REQUEST,
) -> str:
json_str = json.dumps( json_str = json.dumps(
self.create_error_response(message=message, self.create_error_response(message=message,
err_type=err_type, err_type=err_type,
...@@ -438,25 +458,25 @@ class OpenAIServing: ...@@ -438,25 +458,25 @@ class OpenAIServing:
self, self,
request: AnyRequest, request: AnyRequest,
) -> Optional[ErrorResponse]: ) -> Optional[ErrorResponse]:
error_response = None error_response = None
if self._is_model_supported(request.model): if self._is_model_supported(request.model):
return None return None
if request.model in self.models.lora_requests: if request.model in self.models.lora_requests:
return None return None
if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING and request.model and ( if (envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING and request.model and
load_result := await self.models.resolve_lora(request.model)): (load_result := await self.models.resolve_lora(request.model))):
if isinstance(load_result, LoRARequest): if isinstance(load_result, LoRARequest):
return None return None
if isinstance(load_result, ErrorResponse) and \ if (isinstance(load_result, ErrorResponse) and
load_result.error.code == HTTPStatus.BAD_REQUEST.value: load_result.error.code == HTTPStatus.BAD_REQUEST.value):
error_response = load_result error_response = load_result
return error_response or self.create_error_response( return error_response or self.create_error_response(
message=f"The model `{request.model}` does not exist.", message=f"The model `{request.model}` does not exist.",
err_type="NotFoundError", err_type="NotFoundError",
status_code=HTTPStatus.NOT_FOUND) status_code=HTTPStatus.NOT_FOUND,
)
def _get_active_default_mm_loras( def _get_active_default_mm_loras(
self, request: AnyRequest) -> Optional[LoRARequest]: self, request: AnyRequest) -> Optional[LoRARequest]:
...@@ -487,7 +507,6 @@ class OpenAIServing: ...@@ -487,7 +507,6 @@ class OpenAIServing:
request: AnyRequest, request: AnyRequest,
supports_default_mm_loras: bool = False, supports_default_mm_loras: bool = False,
) -> Optional[LoRARequest]: ) -> Optional[LoRARequest]:
if request.model in self.models.lora_requests: if request.model in self.models.lora_requests:
return self.models.lora_requests[request.model] return self.models.lora_requests[request.model]
...@@ -548,13 +567,15 @@ class OpenAIServing: ...@@ -548,13 +567,15 @@ class OpenAIServing:
prompt, prompt,
add_special_tokens=add_special_tokens, add_special_tokens=add_special_tokens,
truncation=True, truncation=True,
max_length=self.max_model_len) max_length=self.max_model_len,
)
else: else:
encoded = await async_tokenizer( encoded = await async_tokenizer(
prompt, prompt,
add_special_tokens=add_special_tokens, add_special_tokens=add_special_tokens,
truncation=True, truncation=True,
max_length=truncate_prompt_tokens) max_length=truncate_prompt_tokens,
)
input_ids = encoded.input_ids input_ids = encoded.input_ids
input_text = prompt input_text = prompt
...@@ -595,16 +616,22 @@ class OpenAIServing: ...@@ -595,16 +616,22 @@ class OpenAIServing:
# Note: EmbeddingRequest, ClassificationRequest, # Note: EmbeddingRequest, ClassificationRequest,
# and ScoreRequest doesn't have max_tokens # and ScoreRequest doesn't have max_tokens
if isinstance(request, if isinstance(
(EmbeddingChatRequest, EmbeddingCompletionRequest, request,
ScoreRequest, RerankRequest, ClassificationRequest)): (
EmbeddingChatRequest,
EmbeddingCompletionRequest,
ScoreRequest,
RerankRequest,
ClassificationRequest,
),
):
# Note: input length can be up to the entire model context length # Note: input length can be up to the entire model context length
# since these requests don't generate tokens. # since these requests don't generate tokens.
if token_num > self.max_model_len: if token_num > self.max_model_len:
operations: dict[type[AnyRequest], str] = { operations: dict[type[AnyRequest], str] = {
ScoreRequest: "score", ScoreRequest: "score",
ClassificationRequest: "classification" ClassificationRequest: "classification",
} }
operation = operations.get(type(request), operation = operations.get(type(request),
"embedding generation") "embedding generation")
...@@ -618,8 +645,11 @@ class OpenAIServing: ...@@ -618,8 +645,11 @@ class OpenAIServing:
# Note: TokenizeRequest and DetokenizeRequest doesn't have max_tokens # Note: TokenizeRequest and DetokenizeRequest doesn't have max_tokens
# and does not require model context length validation # and does not require model context length validation
if isinstance(request, (TokenizeCompletionRequest, TokenizeChatRequest, if isinstance(
DetokenizeRequest)): request,
(TokenizeCompletionRequest, TokenizeChatRequest,
DetokenizeRequest),
):
return TextTokensPrompt(prompt=input_text, return TextTokensPrompt(prompt=input_text,
prompt_token_ids=input_ids) prompt_token_ids=input_ids)
...@@ -639,8 +669,8 @@ class OpenAIServing: ...@@ -639,8 +669,8 @@ class OpenAIServing:
f"{token_num} input tokens. Please reduce the length of " f"{token_num} input tokens. Please reduce the length of "
"the input messages.") "the input messages.")
if max_tokens is not None and \ if (max_tokens is not None
token_num + max_tokens > self.max_model_len: and token_num + max_tokens > self.max_model_len):
raise ValueError( raise ValueError(
"'max_tokens' or 'max_completion_tokens' is too large: " "'max_tokens' or 'max_completion_tokens' is too large: "
f"{max_tokens}. This model's maximum context length is " f"{max_tokens}. This model's maximum context length is "
...@@ -745,13 +775,14 @@ class OpenAIServing: ...@@ -745,13 +775,14 @@ class OpenAIServing:
tasks = [] tasks = []
for prompt_input in batch_inputs: for prompt_input in batch_inputs:
if prompt_input["is_tokens"] is False: if prompt_input["is_tokens"] is False:
assert tokenizer is not None, \ assert tokenizer is not None, (
"Tokenizer is required for text prompts" "Tokenizer is required for text prompts")
task = self._normalize_prompt_text_to_input( task = self._normalize_prompt_text_to_input(
request, request,
prompt_input["content"], prompt_input["content"],
tokenizer=tokenizer, tokenizer=tokenizer,
add_special_tokens=add_special_tokens) add_special_tokens=add_special_tokens,
)
else: else:
task = self._normalize_prompt_tokens_to_input( task = self._normalize_prompt_tokens_to_input(
request, prompt_input["content"], tokenizer=tokenizer) request, prompt_input["content"], tokenizer=tokenizer)
...@@ -766,9 +797,14 @@ class OpenAIServing: ...@@ -766,9 +797,14 @@ class OpenAIServing:
@overload @overload
async def _preprocess_completion( async def _preprocess_completion(
self, self,
request: Union[DetokenizeRequest, EmbeddingCompletionRequest, request: Union[
RerankRequest, ClassificationRequest, ScoreRequest, DetokenizeRequest,
TokenizeCompletionRequest], EmbeddingCompletionRequest,
RerankRequest,
ClassificationRequest,
ScoreRequest,
TokenizeCompletionRequest,
],
tokenizer: Optional[AnyTokenizer], tokenizer: Optional[AnyTokenizer],
input_or_inputs: Union[str, list[str], list[int], list[list[int]]], input_or_inputs: Union[str, list[str], list[int], list[list[int]]],
add_special_tokens: bool = ..., add_special_tokens: bool = ...,
...@@ -783,8 +819,10 @@ class OpenAIServing: ...@@ -783,8 +819,10 @@ class OpenAIServing:
input_or_inputs: Optional[Union[str, list[str], list[int], input_or_inputs: Optional[Union[str, list[str], list[int],
list[list[int]]]], list[list[int]]]],
add_special_tokens: bool = ..., add_special_tokens: bool = ...,
) -> tuple[list[Union[TextTokensPrompt, EmbedsPrompt]], list[Union[ ) -> tuple[
EngineTokensPrompt, EngineEmbedsPrompt]]]: list[Union[TextTokensPrompt, EmbedsPrompt]],
list[Union[EngineTokensPrompt, EngineEmbedsPrompt]],
]:
... ...
async def _preprocess_completion( async def _preprocess_completion(
...@@ -794,17 +832,23 @@ class OpenAIServing: ...@@ -794,17 +832,23 @@ class OpenAIServing:
input_or_inputs: Optional[Union[str, list[str], list[int], input_or_inputs: Optional[Union[str, list[str], list[int],
list[list[int]]]], list[list[int]]]],
add_special_tokens: bool = True, add_special_tokens: bool = True,
) -> tuple[Union[list[TextTokensPrompt], list[Union[ ) -> tuple[
TextTokensPrompt, EmbedsPrompt]]], Union[ Union[list[TextTokensPrompt], list[Union[TextTokensPrompt,
list[EngineTokensPrompt], list[Union[EngineTokensPrompt, EmbedsPrompt]]],
EngineEmbedsPrompt]]]]: Union[
if not isinstance(request, list[EngineTokensPrompt],
CompletionRequest) and input_or_inputs is None: list[Union[EngineTokensPrompt, EngineEmbedsPrompt]],
],
]:
if (not isinstance(request, CompletionRequest)
and input_or_inputs is None):
raise ValueError( raise ValueError(
"Prompt embeds with non-completion requests is not" "Prompt embeds with non-completion requests is not"
" currently supported.") " currently supported.")
(request_prompts_text, request_prompts_embeds (
request_prompts_text,
request_prompts_embeds,
) = await self._tokenize_prompt_input_or_inputs_async( ) = await self._tokenize_prompt_input_or_inputs_async(
request, request,
tokenizer, tokenizer,
...@@ -817,9 +861,9 @@ class OpenAIServing: ...@@ -817,9 +861,9 @@ class OpenAIServing:
prompt_token_ids=request_prompt_text["prompt_token_ids"]) prompt_token_ids=request_prompt_text["prompt_token_ids"])
for request_prompt_text in request_prompts_text for request_prompt_text in request_prompts_text
] ]
cache_salt = request.cache_salt if ( cache_salt = (request.cache_salt if
hasattr(request, "cache_salt") (hasattr(request, "cache_salt")
and request.cache_salt is not None) else None and request.cache_salt is not None) else None)
if cache_salt: if cache_salt:
for prompt_text in engine_prompts_text: for prompt_text in engine_prompts_text:
prompt_text["cache_salt"] = cache_salt prompt_text["cache_salt"] = cache_salt
...@@ -831,8 +875,8 @@ class OpenAIServing: ...@@ -831,8 +875,8 @@ class OpenAIServing:
# non-completion requests and if we don't add the overload here, # non-completion requests and if we don't add the overload here,
# everywhere this function is used outside of serving_completion will # everywhere this function is used outside of serving_completion will
# need logic asserting that only text prompts are in the request. # need logic asserting that only text prompts are in the request.
if not isinstance(request, if (not isinstance(request, CompletionRequest)
CompletionRequest) and input_or_inputs is not None: and input_or_inputs is not None):
return request_prompts_text, engine_prompts_text return request_prompts_text, engine_prompts_text
engine_prompts_embeds = [ engine_prompts_embeds = [
...@@ -862,8 +906,11 @@ class OpenAIServing: ...@@ -862,8 +906,11 @@ class OpenAIServing:
chat_template_kwargs: Optional[dict[str, Any]] = None, chat_template_kwargs: Optional[dict[str, Any]] = None,
tool_parser: Optional[Callable[[AnyTokenizer], ToolParser]] = None, tool_parser: Optional[Callable[[AnyTokenizer], ToolParser]] = None,
add_special_tokens: bool = False, add_special_tokens: bool = False,
) -> tuple[list[ConversationMessage], Sequence[RequestPrompt], ) -> tuple[
list[EngineTokensPrompt]]: list[ConversationMessage],
Sequence[RequestPrompt],
list[EngineTokensPrompt],
]:
model_config = self.model_config model_config = self.model_config
resolved_content_format = resolve_chat_template_content_format( resolved_content_format = resolve_chat_template_content_format(
...@@ -925,8 +972,8 @@ class OpenAIServing: ...@@ -925,8 +972,8 @@ class OpenAIServing:
if tokenizer is None: if tokenizer is None:
assert isinstance(request_prompt, str), ( assert isinstance(request_prompt, str), (
"Prompt has to be a string", \ "Prompt has to be a string",
"when the tokenizer is not initialised" "when the tokenizer is not initialised",
) )
prompt_inputs = TextTokensPrompt(prompt=request_prompt, prompt_inputs = TextTokensPrompt(prompt=request_prompt,
prompt_token_ids=[1]) prompt_token_ids=[1])
...@@ -943,7 +990,8 @@ class OpenAIServing: ...@@ -943,7 +990,8 @@ class OpenAIServing:
"Prompt has to be either a string or a list of token ids") "Prompt has to be either a string or a list of token ids")
prompt_inputs = TextTokensPrompt( prompt_inputs = TextTokensPrompt(
prompt=tokenizer.decode(request_prompt), prompt=tokenizer.decode(request_prompt),
prompt_token_ids=request_prompt) prompt_token_ids=request_prompt,
)
engine_prompt = EngineTokensPrompt( engine_prompt = EngineTokensPrompt(
prompt_token_ids=prompt_inputs["prompt_token_ids"]) prompt_token_ids=prompt_inputs["prompt_token_ids"])
...@@ -1007,22 +1055,23 @@ class OpenAIServing: ...@@ -1007,22 +1055,23 @@ class OpenAIServing:
prompt_token_ids=prompt_token_ids) prompt_token_ids=prompt_token_ids)
request_prompt = prompt_token_ids request_prompt = prompt_token_ids
# Update the sampling params. # Update the sampling params.
sampling_params.max_tokens = (self.max_model_len - sampling_params.max_tokens = self.max_model_len - len(
len(prompt_token_ids)) prompt_token_ids)
# OPTIMIZATION # OPTIMIZATION
priority = orig_priority - 1 priority = orig_priority - 1
@staticmethod @staticmethod
def _load_prompt_embeds( def _load_prompt_embeds(
prompt_embeds: Optional[Union[bytes, list[bytes]]], prompt_embeds: Optional[Union[bytes, list[bytes]]],
truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None,
) -> list[EmbedsPrompt]: ) -> list[EmbedsPrompt]:
def _load_and_validate_embed(embed: bytes) -> EmbedsPrompt: def _load_and_validate_embed(embed: bytes) -> EmbedsPrompt:
tensor = torch.load(io.BytesIO( tensor = torch.load(
pybase64.b64decode(embed, validate=True)), io.BytesIO(pybase64.b64decode(embed, validate=True)),
weights_only=True, weights_only=True,
map_location=torch.device("cpu")) map_location=torch.device("cpu"),
)
assert isinstance(tensor, torch.Tensor) and tensor.dtype in ( assert isinstance(tensor, torch.Tensor) and tensor.dtype in (
torch.float32, torch.float32,
torch.bfloat16, torch.bfloat16,
...@@ -1061,7 +1110,7 @@ class OpenAIServing: ...@@ -1061,7 +1110,7 @@ class OpenAIServing:
prompt = inputs prompt = inputs
elif isinstance(inputs, list): elif isinstance(inputs, list):
prompt_token_ids = inputs prompt_token_ids = inputs
elif 'prompt_embeds' in inputs: elif "prompt_embeds" in inputs:
prompt_embeds = inputs.get("prompt_embeds") prompt_embeds = inputs.get("prompt_embeds")
else: else:
prompt = inputs["prompt"] prompt = inputs["prompt"]
...@@ -1101,10 +1150,12 @@ class OpenAIServing: ...@@ -1101,10 +1150,12 @@ class OpenAIServing:
return raw_request.headers.get("X-Request-Id", default) return raw_request.headers.get("X-Request-Id", default)
@staticmethod @staticmethod
def _get_decoded_token(logprob: Logprob, def _get_decoded_token(
logprob: Logprob,
token_id: int, token_id: int,
tokenizer: AnyTokenizer, tokenizer: AnyTokenizer,
return_as_token_id: bool = False) -> str: return_as_token_id: bool = False,
) -> str:
if return_as_token_id: if return_as_token_id:
return f"token_id:{token_id}" return f"token_id:{token_id}"
...@@ -1117,9 +1168,11 @@ class OpenAIServing: ...@@ -1117,9 +1168,11 @@ class OpenAIServing:
return True return True
return self.models.is_base_model(model_name) return self.models.is_base_model(model_name)
def _get_model_name(self, def _get_model_name(
self,
model_name: Optional[str] = None, model_name: Optional[str] = None,
lora_request: Optional[LoRARequest] = None) -> str: lora_request: Optional[LoRARequest] = None,
) -> str:
if lora_request: if lora_request:
return lora_request.lora_name return lora_request.lora_name
if not model_name: if not model_name:
...@@ -1129,7 +1182,7 @@ class OpenAIServing: ...@@ -1129,7 +1182,7 @@ class OpenAIServing:
def clamp_prompt_logprobs( def clamp_prompt_logprobs(
prompt_logprobs: Union[PromptLogprobs, prompt_logprobs: Union[PromptLogprobs,
None]) -> Union[PromptLogprobs, None]: None], ) -> Union[PromptLogprobs, None]:
if prompt_logprobs is None: if prompt_logprobs is None:
return prompt_logprobs return prompt_logprobs
...@@ -1137,6 +1190,6 @@ def clamp_prompt_logprobs( ...@@ -1137,6 +1190,6 @@ def clamp_prompt_logprobs(
if logprob_dict is None: if logprob_dict is None:
continue continue
for logprob_values in logprob_dict.values(): for logprob_values in logprob_dict.values():
if logprob_values.logprob == float('-inf'): if logprob_values.logprob == float("-inf"):
logprob_values.logprob = -9999.0 logprob_values.logprob = -9999.0
return prompt_logprobs return prompt_logprobs
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment