Unverified commit 0ae11f78, authored by SangBin Cho and committed by GitHub
Browse files

[Mypy] Part 3 fix typing for nested directories for most of directory (#4161)

parent 34128a69
......@@ -185,6 +185,7 @@ class OpenAIServingCompletion(OpenAIServing):
model_name: str,
num_prompts: int,
) -> AsyncGenerator[str, None]:
assert request.n is not None
previous_texts = [""] * request.n * num_prompts
previous_num_tokens = [0] * request.n * num_prompts
has_echoed = [False] * request.n * num_prompts
......@@ -202,6 +203,7 @@ class OpenAIServingCompletion(OpenAIServing):
# TODO(simon): optimize the performance by avoiding full
# text O(n^2) sending.
assert request.max_tokens is not None
if request.echo and request.max_tokens == 0:
# only return the prompt
delta_text = res.prompt
......@@ -279,7 +281,7 @@ class OpenAIServingCompletion(OpenAIServing):
created_time: int,
model_name: str,
) -> CompletionResponse:
choices = []
choices: List[CompletionResponseChoice] = []
num_prompt_tokens = 0
num_generated_tokens = 0
for final_res in final_res_batch:
......@@ -289,6 +291,7 @@ class OpenAIServingCompletion(OpenAIServing):
prompt_text = final_res.prompt
for output in final_res.outputs:
assert request.max_tokens is not None
if request.echo and request.max_tokens == 0:
token_ids = prompt_token_ids
top_logprobs = prompt_logprobs
......
......@@ -4,7 +4,9 @@ from dataclasses import dataclass
from http import HTTPStatus
from typing import Dict, List, Optional, Tuple, Union
from pydantic import conint
from pydantic import Field
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
from typing_extensions import Annotated
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
......@@ -45,7 +47,8 @@ class OpenAIServing:
]
self.max_model_len = 0
self.tokenizer = None
# Lazy initialized
self.tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
try:
event_loop = asyncio.get_running_loop()
......@@ -92,7 +95,7 @@ class OpenAIServing:
def _create_logprobs(
self,
token_ids: List[int],
top_logprobs: Optional[List[Optional[Dict[int, Logprob]]]] = None,
top_logprobs: List[Optional[Dict[int, Logprob]]],
num_output_top_logprobs: Optional[int] = None,
initial_text_offset: int = 0,
) -> LogProbs:
......@@ -108,6 +111,7 @@ class OpenAIServing:
token = self.tokenizer.decode(token_id)
logprobs.tokens.append(token)
logprobs.token_logprobs.append(None)
assert logprobs.top_logprobs is not None
logprobs.top_logprobs.append(None)
else:
token_logprob = step_top_logprobs[token_id].logprob
......@@ -116,6 +120,7 @@ class OpenAIServing:
logprobs.token_logprobs.append(token_logprob)
if num_output_top_logprobs:
assert logprobs.top_logprobs is not None
logprobs.top_logprobs.append({
# Convert float("-inf") to the
# JSON-serializable float that OpenAI uses
......@@ -155,9 +160,9 @@ class OpenAIServing:
async def _check_model(self, request) -> Optional[ErrorResponse]:
if request.model in self.served_model_names:
return
return None
if request.model in [lora.lora_name for lora in self.lora_requests]:
return
return None
return self.create_error_response(
message=f"The model `{request.model}` does not exist.",
err_type="NotFoundError",
......@@ -165,7 +170,7 @@ class OpenAIServing:
def _maybe_get_lora(self, request) -> Optional[LoRARequest]:
if request.model in self.served_model_names:
return
return None
for lora in self.lora_requests:
if request.model == lora.lora_name:
return lora
......@@ -177,7 +182,7 @@ class OpenAIServing:
request: Union[ChatCompletionRequest, CompletionRequest],
prompt: Optional[str] = None,
prompt_ids: Optional[List[int]] = None,
truncate_prompt_tokens: Optional[conint(ge=1)] = None
truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None
) -> Tuple[List[int], str]:
if not (prompt or prompt_ids):
raise ValueError("Either prompt or prompt_ids should be provided.")
......
......@@ -33,7 +33,7 @@ class LoRALayerWeights:
def optimize(self) -> "LoRALayerWeights":
"""Optimize the LoRA by merging the scaling into lora_b."""
if self.scaling == 1:
return
return self
self.lora_b *= self.scaling
self.scaling = 1
return self
......
......@@ -29,8 +29,8 @@ def _multi_split_sample(
sampled_tokens_size: Tuple[int, int],
sampled_logprobs_size: Tuple[int, int],
sample_indices: torch.Tensor,
logprobs: torch.Tensor,
*,
logprobs: Optional[torch.Tensor] = None,
modify_greedy_probs: bool = False,
save_logprobs: bool = False,
):
......@@ -167,6 +167,7 @@ def sample(
sampled_logprobs_size = (0, 0)
logprobs = probs
assert logprobs is not None
if _save_modified_probs:
sampled_modified_probs_size = sampled_tokens_size
else:
......
......@@ -108,7 +108,8 @@ class RotaryEmbedding(nn.Module):
query_pass = query[..., self.rotary_dim:]
key_pass = key[..., self.rotary_dim:]
self.cos_sin_cache = self.cos_sin_cache.to(positions.device)
self.cos_sin_cache: torch.Tensor = self.cos_sin_cache.to(
positions.device)
cos_sin = self.cos_sin_cache[torch.add(positions, offsets)
if offsets is not None else positions]
cos, sin = cos_sin.chunk(2, dim=-1)
......
......@@ -222,13 +222,15 @@ class JAISConfig(PretrainedConfig):
f"got {alibi_scaling_type}")
if (alibi_scaling_factor is not None
and not isinstance(alibi_scaling_factor, float)
or alibi_scaling_factor <= 1.0):
or (alibi_scaling_factor is not None
and alibi_scaling_factor <= 1.0)):
raise ValueError(
f"`alibi_scaling`'s factor field must be a float > 1.0,"
f"got {alibi_scaling_factor}")
if (alibi_dynamic_scaling is not None
and not isinstance(alibi_dynamic_scaling, int)
or alibi_dynamic_scaling <= 1):
or (alibi_dynamic_scaling is not None
and alibi_dynamic_scaling <= 1)):
raise ValueError(
f"`alibi_scaling`'s `train_seq_len` field must be an"
f"integer > 1, got {alibi_dynamic_scaling}")
......@@ -11,7 +11,7 @@ if ray:
from vllm.transformers_utils.tokenizer_group.ray_tokenizer_group import (
RayTokenizerGroupPool)
else:
RayTokenizerGroupPool = None
RayTokenizerGroupPool = None # type: ignore
def get_tokenizer_group(tokenizer_pool_config: Optional[TokenizerPoolConfig],
......
......@@ -89,6 +89,7 @@ class RayTokenizerGroupPool(BaseTokenizerGroup):
This is blocking.
"""
self._ensure_queue_initialized()
assert self._idle_actors is not None
if self._idle_actors.empty():
raise RuntimeError("No idle actors available.")
......@@ -120,6 +121,7 @@ class RayTokenizerGroupPool(BaseTokenizerGroup):
This is non-blocking.
"""
self._ensure_queue_initialized()
assert self._idle_actors is not None
actor = await self._idle_actors.get()
try:
......
......@@ -114,9 +114,9 @@ class BaichuanTokenizer(PreTrainedTokenizer):
token = self.sp_model.IdToPiece(index)
return token
def convert_tokens_to_string(self, tokens):
def convert_tokens_to_string(self, tokens: List[str]):
"""Converts a sequence of tokens (string) in a single string."""
current_sub_tokens = []
current_sub_tokens: List[str] = []
out_string = ""
prev_is_special = False
for i, token in enumerate(tokens):
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment