Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
0ae11f78
Unverified
Commit
0ae11f78
authored
Apr 23, 2024
by
SangBin Cho
Committed by
GitHub
Apr 22, 2024
Browse files
[Mypy] Part 3 fix typing for nested directories for most of directory (#4161)
parent
34128a69
Changes
29
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
30 additions
and
16 deletions
+30
-16
vllm/entrypoints/openai/serving_completion.py
vllm/entrypoints/openai/serving_completion.py
+4
-1
vllm/entrypoints/openai/serving_engine.py
vllm/entrypoints/openai/serving_engine.py
+12
-7
vllm/lora/lora.py
vllm/lora/lora.py
+1
-1
vllm/model_executor/layers/ops/sample.py
vllm/model_executor/layers/ops/sample.py
+2
-1
vllm/model_executor/layers/rotary_embedding.py
vllm/model_executor/layers/rotary_embedding.py
+2
-1
vllm/transformers_utils/configs/jais.py
vllm/transformers_utils/configs/jais.py
+4
-2
vllm/transformers_utils/tokenizer_group/__init__.py
vllm/transformers_utils/tokenizer_group/__init__.py
+1
-1
vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py
...transformers_utils/tokenizer_group/ray_tokenizer_group.py
+2
-0
vllm/transformers_utils/tokenizers/baichuan.py
vllm/transformers_utils/tokenizers/baichuan.py
+2
-2
No files found.
vllm/entrypoints/openai/serving_completion.py
View file @
0ae11f78
...
...
@@ -185,6 +185,7 @@ class OpenAIServingCompletion(OpenAIServing):
model_name
:
str
,
num_prompts
:
int
,
)
->
AsyncGenerator
[
str
,
None
]:
assert
request
.
n
is
not
None
previous_texts
=
[
""
]
*
request
.
n
*
num_prompts
previous_num_tokens
=
[
0
]
*
request
.
n
*
num_prompts
has_echoed
=
[
False
]
*
request
.
n
*
num_prompts
...
...
@@ -202,6 +203,7 @@ class OpenAIServingCompletion(OpenAIServing):
# TODO(simon): optimize the performance by avoiding full
# text O(n^2) sending.
assert
request
.
max_tokens
is
not
None
if
request
.
echo
and
request
.
max_tokens
==
0
:
# only return the prompt
delta_text
=
res
.
prompt
...
...
@@ -279,7 +281,7 @@ class OpenAIServingCompletion(OpenAIServing):
created_time
:
int
,
model_name
:
str
,
)
->
CompletionResponse
:
choices
=
[]
choices
:
List
[
CompletionResponseChoice
]
=
[]
num_prompt_tokens
=
0
num_generated_tokens
=
0
for
final_res
in
final_res_batch
:
...
...
@@ -289,6 +291,7 @@ class OpenAIServingCompletion(OpenAIServing):
prompt_text
=
final_res
.
prompt
for
output
in
final_res
.
outputs
:
assert
request
.
max_tokens
is
not
None
if
request
.
echo
and
request
.
max_tokens
==
0
:
token_ids
=
prompt_token_ids
top_logprobs
=
prompt_logprobs
...
...
vllm/entrypoints/openai/serving_engine.py
View file @
0ae11f78
...
...
@@ -4,7 +4,9 @@ from dataclasses import dataclass
from
http
import
HTTPStatus
from
typing
import
Dict
,
List
,
Optional
,
Tuple
,
Union
from
pydantic
import
conint
from
pydantic
import
Field
from
transformers
import
PreTrainedTokenizer
,
PreTrainedTokenizerFast
from
typing_extensions
import
Annotated
from
vllm.engine.async_llm_engine
import
AsyncLLMEngine
from
vllm.entrypoints.openai.protocol
import
(
ChatCompletionRequest
,
...
...
@@ -45,7 +47,8 @@ class OpenAIServing:
]
self
.
max_model_len
=
0
self
.
tokenizer
=
None
# Lazy initialized
self
.
tokenizer
:
Union
[
PreTrainedTokenizer
,
PreTrainedTokenizerFast
]
try
:
event_loop
=
asyncio
.
get_running_loop
()
...
...
@@ -92,7 +95,7 @@ class OpenAIServing:
def
_create_logprobs
(
self
,
token_ids
:
List
[
int
],
top_logprobs
:
Optional
[
List
[
Optional
[
Dict
[
int
,
Logprob
]]]
]
=
None
,
top_logprobs
:
List
[
Optional
[
Dict
[
int
,
Logprob
]]],
num_output_top_logprobs
:
Optional
[
int
]
=
None
,
initial_text_offset
:
int
=
0
,
)
->
LogProbs
:
...
...
@@ -108,6 +111,7 @@ class OpenAIServing:
token
=
self
.
tokenizer
.
decode
(
token_id
)
logprobs
.
tokens
.
append
(
token
)
logprobs
.
token_logprobs
.
append
(
None
)
assert
logprobs
.
top_logprobs
is
not
None
logprobs
.
top_logprobs
.
append
(
None
)
else
:
token_logprob
=
step_top_logprobs
[
token_id
].
logprob
...
...
@@ -116,6 +120,7 @@ class OpenAIServing:
logprobs
.
token_logprobs
.
append
(
token_logprob
)
if
num_output_top_logprobs
:
assert
logprobs
.
top_logprobs
is
not
None
logprobs
.
top_logprobs
.
append
({
# Convert float("-inf") to the
# JSON-serializable float that OpenAI uses
...
...
@@ -155,9 +160,9 @@ class OpenAIServing:
async
def
_check_model
(
self
,
request
)
->
Optional
[
ErrorResponse
]:
if
request
.
model
in
self
.
served_model_names
:
return
return
None
if
request
.
model
in
[
lora
.
lora_name
for
lora
in
self
.
lora_requests
]:
return
return
None
return
self
.
create_error_response
(
message
=
f
"The model `
{
request
.
model
}
` does not exist."
,
err_type
=
"NotFoundError"
,
...
...
@@ -165,7 +170,7 @@ class OpenAIServing:
def
_maybe_get_lora
(
self
,
request
)
->
Optional
[
LoRARequest
]:
if
request
.
model
in
self
.
served_model_names
:
return
return
None
for
lora
in
self
.
lora_requests
:
if
request
.
model
==
lora
.
lora_name
:
return
lora
...
...
@@ -177,7 +182,7 @@ class OpenAIServing:
request
:
Union
[
ChatCompletionRequest
,
CompletionRequest
],
prompt
:
Optional
[
str
]
=
None
,
prompt_ids
:
Optional
[
List
[
int
]]
=
None
,
truncate_prompt_tokens
:
Optional
[
conint
(
ge
=
1
)]
=
None
truncate_prompt_tokens
:
Optional
[
Annotated
[
int
,
Field
(
ge
=
1
)]
]
=
None
)
->
Tuple
[
List
[
int
],
str
]:
if
not
(
prompt
or
prompt_ids
):
raise
ValueError
(
"Either prompt or prompt_ids should be provided."
)
...
...
vllm/lora/lora.py
View file @
0ae11f78
...
...
@@ -33,7 +33,7 @@ class LoRALayerWeights:
def
optimize
(
self
)
->
"LoRALayerWeights"
:
"""Optimize the LoRA by merging the scaling into lora_b."""
if
self
.
scaling
==
1
:
return
return
self
self
.
lora_b
*=
self
.
scaling
self
.
scaling
=
1
return
self
...
...
vllm/model_executor/layers/ops/sample.py
View file @
0ae11f78
...
...
@@ -29,8 +29,8 @@ def _multi_split_sample(
sampled_tokens_size
:
Tuple
[
int
,
int
],
sampled_logprobs_size
:
Tuple
[
int
,
int
],
sample_indices
:
torch
.
Tensor
,
logprobs
:
torch
.
Tensor
,
*
,
logprobs
:
Optional
[
torch
.
Tensor
]
=
None
,
modify_greedy_probs
:
bool
=
False
,
save_logprobs
:
bool
=
False
,
):
...
...
@@ -167,6 +167,7 @@ def sample(
sampled_logprobs_size
=
(
0
,
0
)
logprobs
=
probs
assert
logprobs
is
not
None
if
_save_modified_probs
:
sampled_modified_probs_size
=
sampled_tokens_size
else
:
...
...
vllm/model_executor/layers/rotary_embedding.py
View file @
0ae11f78
...
...
@@ -108,7 +108,8 @@ class RotaryEmbedding(nn.Module):
query_pass
=
query
[...,
self
.
rotary_dim
:]
key_pass
=
key
[...,
self
.
rotary_dim
:]
self
.
cos_sin_cache
=
self
.
cos_sin_cache
.
to
(
positions
.
device
)
self
.
cos_sin_cache
:
torch
.
Tensor
=
self
.
cos_sin_cache
.
to
(
positions
.
device
)
cos_sin
=
self
.
cos_sin_cache
[
torch
.
add
(
positions
,
offsets
)
if
offsets
is
not
None
else
positions
]
cos
,
sin
=
cos_sin
.
chunk
(
2
,
dim
=-
1
)
...
...
vllm/transformers_utils/configs/jais.py
View file @
0ae11f78
...
...
@@ -222,13 +222,15 @@ class JAISConfig(PretrainedConfig):
f
"got
{
alibi_scaling_type
}
"
)
if
(
alibi_scaling_factor
is
not
None
and
not
isinstance
(
alibi_scaling_factor
,
float
)
or
alibi_scaling_factor
<=
1.0
):
or
(
alibi_scaling_factor
is
not
None
and
alibi_scaling_factor
<=
1.0
)):
raise
ValueError
(
f
"`alibi_scaling`'s factor field must be a float > 1.0,"
f
"got
{
alibi_scaling_factor
}
"
)
if
(
alibi_dynamic_scaling
is
not
None
and
not
isinstance
(
alibi_dynamic_scaling
,
int
)
or
alibi_dynamic_scaling
<=
1
):
or
(
alibi_dynamic_scaling
is
not
None
and
alibi_dynamic_scaling
<=
1
)):
raise
ValueError
(
f
"`alibi_scaling`'s `train_seq_len` field must be an"
f
"integer > 1, got
{
alibi_dynamic_scaling
}
"
)
vllm/transformers_utils/tokenizer_group/__init__.py
View file @
0ae11f78
...
...
@@ -11,7 +11,7 @@ if ray:
from
vllm.transformers_utils.tokenizer_group.ray_tokenizer_group
import
(
RayTokenizerGroupPool
)
else
:
RayTokenizerGroupPool
=
None
RayTokenizerGroupPool
=
None
# type: ignore
def
get_tokenizer_group
(
tokenizer_pool_config
:
Optional
[
TokenizerPoolConfig
],
...
...
vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py
View file @
0ae11f78
...
...
@@ -89,6 +89,7 @@ class RayTokenizerGroupPool(BaseTokenizerGroup):
This is blocking.
"""
self
.
_ensure_queue_initialized
()
assert
self
.
_idle_actors
is
not
None
if
self
.
_idle_actors
.
empty
():
raise
RuntimeError
(
"No idle actors available."
)
...
...
@@ -120,6 +121,7 @@ class RayTokenizerGroupPool(BaseTokenizerGroup):
This is non-blocking.
"""
self
.
_ensure_queue_initialized
()
assert
self
.
_idle_actors
is
not
None
actor
=
await
self
.
_idle_actors
.
get
()
try
:
...
...
vllm/transformers_utils/tokenizers/baichuan.py
View file @
0ae11f78
...
...
@@ -114,9 +114,9 @@ class BaichuanTokenizer(PreTrainedTokenizer):
token
=
self
.
sp_model
.
IdToPiece
(
index
)
return
token
def
convert_tokens_to_string
(
self
,
tokens
):
def
convert_tokens_to_string
(
self
,
tokens
:
List
[
str
]
):
"""Converts a sequence of tokens (string) in a single string."""
current_sub_tokens
=
[]
current_sub_tokens
:
List
[
str
]
=
[]
out_string
=
""
prev_is_special
=
False
for
i
,
token
in
enumerate
(
tokens
):
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment