Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
cf069aa8
Unverified
Commit
cf069aa8
authored
Mar 03, 2025
by
Harry Mellor
Committed by
GitHub
Mar 02, 2025
Browse files
Update deprecated Python 3.8 typing (#13971)
parent
bf33700e
Changes
300
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
158 additions
and
168 deletions
+158
-168
setup.py
setup.py
+3
-4
tests/async_engine/api_server_async_engine.py
tests/async_engine/api_server_async_engine.py
+3
-2
tests/async_engine/test_async_llm_engine.py
tests/async_engine/test_async_llm_engine.py
+2
-2
tests/compile/piecewise/test_toy_llama.py
tests/compile/piecewise/test_toy_llama.py
+3
-3
tests/compile/test_basic_correctness.py
tests/compile/test_basic_correctness.py
+4
-4
tests/conftest.py
tests/conftest.py
+79
-80
tests/core/block/e2e/conftest.py
tests/core/block/e2e/conftest.py
+2
-1
tests/core/block/e2e/test_correctness_sliding_window.py
tests/core/block/e2e/test_correctness_sliding_window.py
+5
-6
tests/core/block/test_block_table.py
tests/core/block/test_block_table.py
+3
-5
tests/core/block/test_naive_block.py
tests/core/block/test_naive_block.py
+2
-2
tests/core/block/test_prefix_caching_block.py
tests/core/block/test_prefix_caching_block.py
+8
-8
tests/core/test_chunked_prefill_scheduler.py
tests/core/test_chunked_prefill_scheduler.py
+12
-13
tests/core/test_scheduler.py
tests/core/test_scheduler.py
+9
-10
tests/core/test_scheduler_encoder_decoder.py
tests/core/test_scheduler_encoder_decoder.py
+1
-3
tests/core/utils.py
tests/core/utils.py
+10
-11
tests/distributed/test_expert_parallel.py
tests/distributed/test_expert_parallel.py
+3
-3
tests/distributed/test_pipeline_parallel.py
tests/distributed/test_pipeline_parallel.py
+4
-4
tests/distributed/test_pynccl.py
tests/distributed/test_pynccl.py
+2
-3
tests/distributed/test_shm_broadcast.py
tests/distributed/test_shm_broadcast.py
+1
-2
tests/encoder_decoder/test_e2e_correctness.py
tests/encoder_decoder/test_e2e_correctness.py
+2
-2
No files found.
setup.py
View file @
cf069aa8
...
...
@@ -9,7 +9,6 @@ import subprocess
import
sys
from
pathlib
import
Path
from
shutil
import
which
from
typing
import
Dict
,
List
import
torch
from
packaging.version
import
Version
,
parse
...
...
@@ -78,7 +77,7 @@ class CMakeExtension(Extension):
class
cmake_build_ext
(
build_ext
):
# A dict of extension directories that have been configured.
did_config
:
D
ict
[
str
,
bool
]
=
{}
did_config
:
d
ict
[
str
,
bool
]
=
{}
#
# Determine number of compilation jobs and optionally nvcc compile threads.
...
...
@@ -548,10 +547,10 @@ def get_vllm_version() -> str:
return
version
def
get_requirements
()
->
L
ist
[
str
]:
def
get_requirements
()
->
l
ist
[
str
]:
"""Get Python package dependencies from requirements.txt."""
def
_read_requirements
(
filename
:
str
)
->
L
ist
[
str
]:
def
_read_requirements
(
filename
:
str
)
->
l
ist
[
str
]:
with
open
(
get_path
(
filename
))
as
f
:
requirements
=
f
.
read
().
strip
().
split
(
"
\n
"
)
resolved_requirements
=
[]
...
...
tests/async_engine/api_server_async_engine.py
View file @
cf069aa8
# SPDX-License-Identifier: Apache-2.0
"""vllm.entrypoints.api_server with some extra logging for testing."""
from
typing
import
Any
,
Dict
,
Iterable
from
collections.abc
import
Iterable
from
typing
import
Any
import
uvicorn
from
fastapi.responses
import
JSONResponse
,
Response
...
...
@@ -24,7 +25,7 @@ class AsyncLLMEngineWithStats(AsyncLLMEngine):
self
.
_num_aborts
+=
len
(
ids
)
await
super
().
_engine_abort
(
ids
)
def
testing_stats
(
self
)
->
D
ict
[
str
,
Any
]:
def
testing_stats
(
self
)
->
d
ict
[
str
,
Any
]:
return
{
"num_aborted_requests"
:
self
.
_num_aborts
}
...
...
tests/async_engine/test_async_llm_engine.py
View file @
cf069aa8
...
...
@@ -6,7 +6,7 @@ import uuid
from
asyncio
import
CancelledError
from
copy
import
copy
from
dataclasses
import
dataclass
from
typing
import
List
,
Optional
from
typing
import
Optional
import
pytest
import
pytest_asyncio
...
...
@@ -254,7 +254,7 @@ async def test_output_kinds(async_engine, stop):
params
.
output_kind
=
RequestOutputKind
.
DELTA
prompt_tokens
=
None
output_tokens
:
L
ist
[
int
]
=
[]
output_tokens
:
l
ist
[
int
]
=
[]
output_text
=
""
output_count
=
0
final_output
=
None
...
...
tests/compile/piecewise/test_toy_llama.py
View file @
cf069aa8
...
...
@@ -8,7 +8,7 @@ if the config `tractable_init` is set to True. Otherwise, the weights are
initialized randomly with a fixed seed.
"""
from
dataclasses
import
dataclass
from
typing
import
Any
,
List
,
Optional
,
Tuple
from
typing
import
Any
,
Optional
import
torch
from
torch
import
nn
...
...
@@ -56,7 +56,7 @@ class LlamaConfig:
random_seed
:
int
=
0
def
compute_hash
(
self
)
->
str
:
factors
:
L
ist
[
Any
]
=
[]
factors
:
l
ist
[
Any
]
=
[]
for
k
,
v
in
self
.
__dict__
.
items
():
if
k
==
"random_seed"
:
continue
...
...
@@ -174,7 +174,7 @@ class LlamaDecoderLayer(nn.Module):
positions
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
residual
:
Optional
[
torch
.
Tensor
],
)
->
T
uple
[
torch
.
Tensor
,
torch
.
Tensor
]:
)
->
t
uple
[
torch
.
Tensor
,
torch
.
Tensor
]:
"""
For tractable computation:
- if residual is None, the outputs are:
...
...
tests/compile/test_basic_correctness.py
View file @
cf069aa8
# SPDX-License-Identifier: Apache-2.0
import
dataclasses
from
typing
import
Dict
,
List
,
Optional
from
typing
import
Optional
import
pytest
...
...
@@ -14,7 +14,7 @@ from ..utils import compare_all_settings
@
dataclasses
.
dataclass
class
TestSetting
:
model
:
str
model_args
:
L
ist
[
str
]
model_args
:
l
ist
[
str
]
pp_size
:
int
tp_size
:
int
attn_backend
:
str
...
...
@@ -108,8 +108,8 @@ def test_compile_correctness(test_setting: TestSetting):
final_args
=
[
"--enforce-eager"
]
+
model_args
+
[
"-pp"
,
str
(
pp_size
)]
+
\
[
"-tp"
,
str
(
tp_size
)]
all_args
:
L
ist
[
L
ist
[
str
]]
=
[]
all_envs
:
L
ist
[
Optional
[
D
ict
[
str
,
str
]]]
=
[]
all_args
:
l
ist
[
l
ist
[
str
]]
=
[]
all_envs
:
l
ist
[
Optional
[
d
ict
[
str
,
str
]]]
=
[]
for
level
in
[
CompilationLevel
.
NO_COMPILATION
,
...
...
tests/conftest.py
View file @
cf069aa8
...
...
@@ -5,8 +5,7 @@ import os
import
tempfile
from
collections
import
UserList
from
enum
import
Enum
from
typing
import
(
Any
,
Callable
,
Dict
,
List
,
Optional
,
Tuple
,
Type
,
TypedDict
,
TypeVar
,
Union
)
from
typing
import
Any
,
Callable
,
Optional
,
TypedDict
,
TypeVar
,
Union
import
numpy
as
np
import
pytest
...
...
@@ -47,14 +46,14 @@ _SYS_MSG = os.path.join(_TEST_DIR, "system_messages", "sonnet3.5_nov2024.txt")
_M
=
TypeVar
(
"_M"
)
_PromptMultiModalInput
=
Union
[
L
ist
[
_M
],
L
ist
[
L
ist
[
_M
]]]
_PromptMultiModalInput
=
Union
[
l
ist
[
_M
],
l
ist
[
l
ist
[
_M
]]]
PromptImageInput
=
_PromptMultiModalInput
[
Image
.
Image
]
PromptAudioInput
=
_PromptMultiModalInput
[
T
uple
[
np
.
ndarray
,
int
]]
PromptAudioInput
=
_PromptMultiModalInput
[
t
uple
[
np
.
ndarray
,
int
]]
PromptVideoInput
=
_PromptMultiModalInput
[
np
.
ndarray
]
def
_read_prompts
(
filename
:
str
)
->
L
ist
[
str
]:
def
_read_prompts
(
filename
:
str
)
->
l
ist
[
str
]:
with
open
(
filename
)
as
f
:
prompts
=
f
.
readlines
()
return
prompts
...
...
@@ -77,7 +76,7 @@ class _ImageAssets(_ImageAssetsBase):
ImageAsset
(
"cherry_blossom"
),
])
def
prompts
(
self
,
prompts
:
_ImageAssetPrompts
)
->
L
ist
[
str
]:
def
prompts
(
self
,
prompts
:
_ImageAssetPrompts
)
->
l
ist
[
str
]:
"""
Convenience method to define the prompt for each test image.
...
...
@@ -102,7 +101,7 @@ class _VideoAssets(_VideoAssetsBase):
VideoAsset
(
"sample_demo_1.mp4"
),
])
def
prompts
(
self
,
prompts
:
_VideoAssetPrompts
)
->
L
ist
[
str
]:
def
prompts
(
self
,
prompts
:
_VideoAssetPrompts
)
->
l
ist
[
str
]:
return
[
prompts
[
"sample_demo_1"
]]
...
...
@@ -175,7 +174,7 @@ def dynamo_reset():
@
pytest
.
fixture
def
example_prompts
()
->
L
ist
[
str
]:
def
example_prompts
()
->
l
ist
[
str
]:
prompts
=
[]
for
filename
in
_TEST_PROMPTS
:
prompts
+=
_read_prompts
(
filename
)
...
...
@@ -197,7 +196,7 @@ class DecoderPromptType(Enum):
@
pytest
.
fixture
def
example_encoder_decoder_prompts
(
)
->
D
ict
[
DecoderPromptType
,
L
ist
[
ExplicitEncoderDecoderPrompt
]]:
)
->
d
ict
[
DecoderPromptType
,
l
ist
[
ExplicitEncoderDecoderPrompt
]]:
'''
Returns an encoder prompt list and a decoder prompt list, wherein each pair
of same-index entries in both lists corresponds to an (encoder prompt,
...
...
@@ -229,7 +228,7 @@ def example_encoder_decoder_prompts(
@
pytest
.
fixture
def
example_long_prompts
()
->
L
ist
[
str
]:
def
example_long_prompts
()
->
l
ist
[
str
]:
prompts
=
[]
for
filename
in
_LONG_PROMPTS
:
prompts
+=
_read_prompts
(
filename
)
...
...
@@ -273,11 +272,11 @@ class HfRunner:
model_name
:
str
,
dtype
:
str
=
"half"
,
*
,
model_kwargs
:
Optional
[
D
ict
[
str
,
Any
]]
=
None
,
model_kwargs
:
Optional
[
d
ict
[
str
,
Any
]]
=
None
,
is_sentence_transformer
:
bool
=
False
,
is_cross_encoder
:
bool
=
False
,
skip_tokenizer_init
:
bool
=
False
,
auto_cls
:
T
ype
[
_BaseAutoModelClass
]
=
AutoModelForCausalLM
,
auto_cls
:
t
ype
[
_BaseAutoModelClass
]
=
AutoModelForCausalLM
,
postprocess_inputs
:
Callable
[...,
BatchEncoding
]
=
identity
,
)
->
None
:
torch_dtype
=
STR_DTYPE_TO_TORCH_DTYPE
[
dtype
]
...
...
@@ -334,11 +333,11 @@ class HfRunner:
def
get_inputs
(
self
,
prompts
:
L
ist
[
str
],
prompts
:
l
ist
[
str
],
images
:
Optional
[
PromptImageInput
]
=
None
,
videos
:
Optional
[
PromptVideoInput
]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
)
->
L
ist
[
BatchEncoding
]:
)
->
l
ist
[
BatchEncoding
]:
if
images
is
not
None
:
assert
len
(
prompts
)
==
len
(
images
)
...
...
@@ -348,9 +347,9 @@ class HfRunner:
if
audios
is
not
None
:
assert
len
(
prompts
)
==
len
(
audios
)
all_inputs
:
L
ist
[
BatchEncoding
]
=
[]
all_inputs
:
l
ist
[
BatchEncoding
]
=
[]
for
i
,
prompt
in
enumerate
(
prompts
):
processor_kwargs
:
D
ict
[
str
,
Any
]
=
{
processor_kwargs
:
d
ict
[
str
,
Any
]
=
{
"text"
:
prompt
,
"return_tensors"
:
"pt"
,
}
...
...
@@ -370,7 +369,7 @@ class HfRunner:
return
all_inputs
def
classify
(
self
,
prompts
:
L
ist
[
str
])
->
L
ist
[
str
]:
def
classify
(
self
,
prompts
:
l
ist
[
str
])
->
l
ist
[
str
]:
# output is final logits
all_inputs
=
self
.
get_inputs
(
prompts
)
outputs
=
[]
...
...
@@ -383,18 +382,18 @@ class HfRunner:
def
generate
(
self
,
prompts
:
L
ist
[
str
],
prompts
:
l
ist
[
str
],
images
:
Optional
[
PromptImageInput
]
=
None
,
videos
:
Optional
[
PromptVideoInput
]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
**
kwargs
:
Any
,
)
->
L
ist
[
T
uple
[
L
ist
[
L
ist
[
int
]],
L
ist
[
str
]]]:
)
->
l
ist
[
t
uple
[
l
ist
[
l
ist
[
int
]],
l
ist
[
str
]]]:
all_inputs
=
self
.
get_inputs
(
prompts
,
images
=
images
,
videos
=
videos
,
audios
=
audios
)
outputs
:
L
ist
[
T
uple
[
L
ist
[
L
ist
[
int
]],
L
ist
[
str
]]]
=
[]
outputs
:
l
ist
[
t
uple
[
l
ist
[
l
ist
[
int
]],
l
ist
[
str
]]]
=
[]
for
inputs
in
all_inputs
:
output_ids
=
self
.
model
.
generate
(
**
self
.
wrap_device
(
inputs
,
device
=
self
.
model
.
device
.
type
),
...
...
@@ -412,13 +411,13 @@ class HfRunner:
def
generate_greedy
(
self
,
prompts
:
L
ist
[
str
],
prompts
:
l
ist
[
str
],
max_tokens
:
int
,
images
:
Optional
[
PromptImageInput
]
=
None
,
videos
:
Optional
[
PromptVideoInput
]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
**
kwargs
:
Any
,
)
->
L
ist
[
T
uple
[
L
ist
[
int
],
str
]]:
)
->
l
ist
[
t
uple
[
l
ist
[
int
],
str
]]:
outputs
=
self
.
generate
(
prompts
,
do_sample
=
False
,
max_new_tokens
=
max_tokens
,
...
...
@@ -432,10 +431,10 @@ class HfRunner:
def
generate_beam_search
(
self
,
prompts
:
L
ist
[
str
],
prompts
:
l
ist
[
str
],
beam_width
:
int
,
max_tokens
:
int
,
)
->
L
ist
[
T
uple
[
L
ist
[
L
ist
[
int
]],
L
ist
[
str
]]]:
)
->
l
ist
[
t
uple
[
l
ist
[
l
ist
[
int
]],
l
ist
[
str
]]]:
outputs
=
self
.
generate
(
prompts
,
do_sample
=
False
,
max_new_tokens
=
max_tokens
,
...
...
@@ -453,19 +452,19 @@ class HfRunner:
def
generate_greedy_logprobs
(
self
,
prompts
:
L
ist
[
str
],
prompts
:
l
ist
[
str
],
max_tokens
:
int
,
images
:
Optional
[
PromptImageInput
]
=
None
,
videos
:
Optional
[
PromptVideoInput
]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
**
kwargs
:
Any
,
)
->
L
ist
[
L
ist
[
torch
.
Tensor
]]:
)
->
l
ist
[
l
ist
[
torch
.
Tensor
]]:
all_inputs
=
self
.
get_inputs
(
prompts
,
images
=
images
,
videos
=
videos
,
audios
=
audios
)
all_logprobs
:
L
ist
[
L
ist
[
torch
.
Tensor
]]
=
[]
all_logprobs
:
l
ist
[
l
ist
[
torch
.
Tensor
]]
=
[]
for
inputs
in
all_inputs
:
output
=
self
.
model
.
generate
(
**
self
.
wrap_device
(
inputs
,
device
=
self
.
model
.
device
.
type
),
...
...
@@ -483,11 +482,11 @@ class HfRunner:
def
_hidden_states_to_seq_logprobs
(
self
,
hidden_states
:
T
uple
[
T
uple
[
torch
.
Tensor
,
...],
...],
)
->
L
ist
[
torch
.
Tensor
]:
hidden_states
:
t
uple
[
t
uple
[
torch
.
Tensor
,
...],
...],
)
->
l
ist
[
torch
.
Tensor
]:
output_embeddings
=
self
.
model
.
get_output_embeddings
()
seq_logprobs
:
L
ist
[
torch
.
Tensor
]
=
[]
seq_logprobs
:
l
ist
[
torch
.
Tensor
]
=
[]
for
_
,
hidden_state
in
enumerate
(
hidden_states
):
last_hidden_states
=
hidden_state
[
-
1
][
0
]
logits
=
torch
.
matmul
(
...
...
@@ -503,14 +502,14 @@ class HfRunner:
def
_hidden_states_to_logprobs
(
self
,
hidden_states
:
T
uple
[
T
uple
[
torch
.
Tensor
,
...],
...],
hidden_states
:
t
uple
[
t
uple
[
torch
.
Tensor
,
...],
...],
num_logprobs
:
int
,
)
->
T
uple
[
L
ist
[
D
ict
[
int
,
float
]],
int
]:
)
->
t
uple
[
l
ist
[
d
ict
[
int
,
float
]],
int
]:
seq_logprobs
=
self
.
_hidden_states_to_seq_logprobs
(
hidden_states
)
output_len
=
len
(
hidden_states
)
# convert to dict
seq_logprobs_lst
:
L
ist
[
D
ict
[
int
,
float
]]
=
[]
seq_logprobs_lst
:
l
ist
[
d
ict
[
int
,
float
]]
=
[]
for
tok_idx
,
tok_logprobs
in
enumerate
(
seq_logprobs
):
# drop prompt logprobs
if
tok_idx
==
0
:
...
...
@@ -530,22 +529,22 @@ class HfRunner:
def
generate_greedy_logprobs_limit
(
self
,
prompts
:
L
ist
[
str
],
prompts
:
l
ist
[
str
],
max_tokens
:
int
,
num_logprobs
:
int
,
images
:
Optional
[
PromptImageInput
]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
videos
:
Optional
[
PromptVideoInput
]
=
None
,
**
kwargs
:
Any
,
)
->
L
ist
[
TokensTextLogprobs
]:
)
->
l
ist
[
TokensTextLogprobs
]:
all_inputs
=
self
.
get_inputs
(
prompts
,
images
=
images
,
videos
=
videos
,
audios
=
audios
)
all_logprobs
:
L
ist
[
L
ist
[
D
ict
[
int
,
float
]]]
=
[]
all_output_ids
:
L
ist
[
L
ist
[
int
]]
=
[]
all_output_strs
:
L
ist
[
str
]
=
[]
all_logprobs
:
l
ist
[
l
ist
[
d
ict
[
int
,
float
]]]
=
[]
all_output_ids
:
l
ist
[
l
ist
[
int
]]
=
[]
all_output_strs
:
l
ist
[
str
]
=
[]
for
inputs
in
all_inputs
:
output
=
self
.
model
.
generate
(
...
...
@@ -577,23 +576,23 @@ class HfRunner:
def
generate_encoder_decoder_greedy_logprobs_limit
(
self
,
encoder_decoder_prompts
:
L
ist
[
ExplicitEncoderDecoderPrompt
[
str
,
str
]],
encoder_decoder_prompts
:
l
ist
[
ExplicitEncoderDecoderPrompt
[
str
,
str
]],
max_tokens
:
int
,
num_logprobs
:
int
,
images
:
Optional
[
PromptImageInput
]
=
None
,
**
kwargs
:
Any
,
)
->
L
ist
[
TokensTextLogprobs
]:
)
->
l
ist
[
TokensTextLogprobs
]:
'''
Greedy logprobs generation for vLLM encoder/decoder models
'''
all_logprobs
:
L
ist
[
L
ist
[
D
ict
[
int
,
float
]]]
=
[]
all_output_ids
:
L
ist
[
L
ist
[
int
]]
=
[]
all_output_strs
:
L
ist
[
str
]
=
[]
all_logprobs
:
l
ist
[
l
ist
[
d
ict
[
int
,
float
]]]
=
[]
all_output_ids
:
l
ist
[
l
ist
[
int
]]
=
[]
all_output_strs
:
l
ist
[
str
]
=
[]
for
i
,
(
encoder_prompt
,
decoder_prompt
)
in
enumerate
(
to_enc_dec_tuple_list
(
encoder_decoder_prompts
)):
processor_kwargs
:
D
ict
[
str
,
Any
]
=
{
processor_kwargs
:
d
ict
[
str
,
Any
]
=
{
"text"
:
encoder_prompt
,
"return_tensors"
:
"pt"
,
}
...
...
@@ -641,10 +640,10 @@ class HfRunner:
return
[(
output_ids
,
output_str
,
output_logprobs
)
for
output_ids
,
output_str
,
output_logprobs
in
outputs
]
def
encode
(
self
,
prompts
:
L
ist
[
str
])
->
L
ist
[
L
ist
[
torch
.
Tensor
]]:
def
encode
(
self
,
prompts
:
l
ist
[
str
])
->
l
ist
[
l
ist
[
torch
.
Tensor
]]:
return
self
.
model
.
encode
(
prompts
)
def
predict
(
self
,
prompts
:
L
ist
[
L
ist
[
str
]])
->
torch
.
Tensor
:
def
predict
(
self
,
prompts
:
l
ist
[
l
ist
[
str
]])
->
torch
.
Tensor
:
return
self
.
model
.
predict
(
prompts
,
convert_to_tensor
=
True
)
def
__enter__
(
self
):
...
...
@@ -699,11 +698,11 @@ class VllmRunner:
def
get_inputs
(
self
,
prompts
:
L
ist
[
str
],
prompts
:
l
ist
[
str
],
images
:
Optional
[
PromptImageInput
]
=
None
,
videos
:
Optional
[
PromptVideoInput
]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
)
->
L
ist
[
TextPrompt
]:
)
->
l
ist
[
TextPrompt
]:
if
images
is
not
None
:
assert
len
(
prompts
)
==
len
(
images
)
...
...
@@ -733,13 +732,13 @@ class VllmRunner:
def
generate
(
self
,
prompts
:
L
ist
[
str
],
prompts
:
l
ist
[
str
],
sampling_params
:
SamplingParams
,
images
:
Optional
[
PromptImageInput
]
=
None
,
videos
:
Optional
[
PromptVideoInput
]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
**
kwargs
:
Any
,
)
->
L
ist
[
T
uple
[
L
ist
[
L
ist
[
int
]],
L
ist
[
str
]]]:
)
->
l
ist
[
t
uple
[
l
ist
[
l
ist
[
int
]],
l
ist
[
str
]]]:
inputs
=
self
.
get_inputs
(
prompts
,
images
=
images
,
videos
=
videos
,
...
...
@@ -749,12 +748,12 @@ class VllmRunner:
sampling_params
=
sampling_params
,
**
kwargs
)
outputs
:
L
ist
[
T
uple
[
L
ist
[
L
ist
[
int
]],
L
ist
[
str
]]]
=
[]
outputs
:
l
ist
[
t
uple
[
l
ist
[
l
ist
[
int
]],
l
ist
[
str
]]]
=
[]
for
req_output
in
req_outputs
:
prompt_str
=
req_output
.
prompt
prompt_ids
=
req_output
.
prompt_token_ids
req_sample_output_ids
:
L
ist
[
L
ist
[
int
]]
=
[]
req_sample_output_strs
:
L
ist
[
str
]
=
[]
req_sample_output_ids
:
l
ist
[
l
ist
[
int
]]
=
[]
req_sample_output_strs
:
l
ist
[
str
]
=
[]
for
sample
in
req_output
.
outputs
:
output_str
=
sample
.
text
output_ids
=
list
(
sample
.
token_ids
)
...
...
@@ -765,9 +764,9 @@ class VllmRunner:
@
staticmethod
def
_final_steps_generate_w_logprobs
(
req_outputs
:
L
ist
[
RequestOutput
],
)
->
L
ist
[
TokensTextLogprobsPromptLogprobs
]:
outputs
:
L
ist
[
TokensTextLogprobsPromptLogprobs
]
=
[]
req_outputs
:
l
ist
[
RequestOutput
],
)
->
l
ist
[
TokensTextLogprobsPromptLogprobs
]:
outputs
:
l
ist
[
TokensTextLogprobsPromptLogprobs
]
=
[]
for
req_output
in
req_outputs
:
assert
len
(
req_output
.
outputs
)
>
0
for
sample
in
req_output
.
outputs
:
...
...
@@ -780,14 +779,14 @@ class VllmRunner:
def
generate_w_logprobs
(
self
,
prompts
:
L
ist
[
str
],
prompts
:
l
ist
[
str
],
sampling_params
:
SamplingParams
,
images
:
Optional
[
PromptImageInput
]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
videos
:
Optional
[
PromptVideoInput
]
=
None
,
**
kwargs
:
Any
,
)
->
Union
[
L
ist
[
TokensTextLogprobs
],
L
ist
[
TokensTextLogprobsPromptLogprobs
]]:
)
->
Union
[
l
ist
[
TokensTextLogprobs
],
l
ist
[
TokensTextLogprobsPromptLogprobs
]]:
inputs
=
self
.
get_inputs
(
prompts
,
images
=
images
,
videos
=
videos
,
...
...
@@ -806,10 +805,10 @@ class VllmRunner:
def
generate_encoder_decoder_w_logprobs
(
self
,
encoder_decoder_prompts
:
L
ist
[
ExplicitEncoderDecoderPrompt
[
str
,
str
]],
encoder_decoder_prompts
:
l
ist
[
ExplicitEncoderDecoderPrompt
[
str
,
str
]],
sampling_params
:
SamplingParams
,
)
->
Union
[
L
ist
[
TokensTextLogprobs
],
L
ist
[
TokensTextLogprobsPromptLogprobs
]]:
)
->
Union
[
l
ist
[
TokensTextLogprobs
],
l
ist
[
TokensTextLogprobsPromptLogprobs
]]:
'''
Logprobs generation for vLLM encoder/decoder models
'''
...
...
@@ -826,13 +825,13 @@ class VllmRunner:
def
generate_greedy
(
self
,
prompts
:
L
ist
[
str
],
prompts
:
l
ist
[
str
],
max_tokens
:
int
,
images
:
Optional
[
PromptImageInput
]
=
None
,
videos
:
Optional
[
PromptVideoInput
]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
**
kwargs
:
Any
,
)
->
L
ist
[
T
uple
[
L
ist
[
int
],
str
]]:
)
->
l
ist
[
t
uple
[
l
ist
[
int
],
str
]]:
greedy_params
=
SamplingParams
(
temperature
=
0.0
,
max_tokens
=
max_tokens
)
outputs
=
self
.
generate
(
prompts
,
greedy_params
,
...
...
@@ -845,18 +844,18 @@ class VllmRunner:
def
generate_greedy_logprobs
(
self
,
prompts
:
L
ist
[
str
],
prompts
:
l
ist
[
str
],
max_tokens
:
int
,
num_logprobs
:
int
,
num_prompt_logprobs
:
Optional
[
int
]
=
None
,
images
:
Optional
[
PromptImageInput
]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
videos
:
Optional
[
PromptVideoInput
]
=
None
,
stop_token_ids
:
Optional
[
L
ist
[
int
]]
=
None
,
stop
:
Optional
[
L
ist
[
str
]]
=
None
,
stop_token_ids
:
Optional
[
l
ist
[
int
]]
=
None
,
stop
:
Optional
[
l
ist
[
str
]]
=
None
,
**
kwargs
:
Any
,
)
->
Union
[
L
ist
[
TokensTextLogprobs
],
L
ist
[
TokensTextLogprobsPromptLogprobs
]]:
)
->
Union
[
l
ist
[
TokensTextLogprobs
],
l
ist
[
TokensTextLogprobsPromptLogprobs
]]:
greedy_logprobs_params
=
SamplingParams
(
temperature
=
0.0
,
max_tokens
=
max_tokens
,
...
...
@@ -874,12 +873,12 @@ class VllmRunner:
def
generate_encoder_decoder_greedy_logprobs
(
self
,
encoder_decoder_prompts
:
L
ist
[
ExplicitEncoderDecoderPrompt
[
str
,
str
]],
encoder_decoder_prompts
:
l
ist
[
ExplicitEncoderDecoderPrompt
[
str
,
str
]],
max_tokens
:
int
,
num_logprobs
:
int
,
num_prompt_logprobs
:
Optional
[
int
]
=
None
,
)
->
Union
[
L
ist
[
TokensTextLogprobs
],
L
ist
[
TokensTextLogprobsPromptLogprobs
]]:
)
->
Union
[
l
ist
[
TokensTextLogprobs
],
l
ist
[
TokensTextLogprobsPromptLogprobs
]]:
greedy_logprobs_params
=
SamplingParams
(
temperature
=
0.0
,
max_tokens
=
max_tokens
,
...
...
@@ -895,10 +894,10 @@ class VllmRunner:
def
generate_beam_search
(
self
,
prompts
:
Union
[
L
ist
[
str
],
L
ist
[
L
ist
[
int
]]],
prompts
:
Union
[
l
ist
[
str
],
l
ist
[
l
ist
[
int
]]],
beam_width
:
int
,
max_tokens
:
int
,
)
->
L
ist
[
T
uple
[
L
ist
[
L
ist
[
int
]],
L
ist
[
str
]]]:
)
->
l
ist
[
t
uple
[
l
ist
[
l
ist
[
int
]],
l
ist
[
str
]]]:
if
is_list_of
(
prompts
,
str
,
check
=
"all"
):
prompts
=
[
TextPrompt
(
prompt
=
prompt
)
for
prompt
in
prompts
]
else
:
...
...
@@ -915,17 +914,17 @@ class VllmRunner:
returned_outputs
.
append
((
token_ids
,
texts
))
return
returned_outputs
def
classify
(
self
,
prompts
:
L
ist
[
str
])
->
L
ist
[
L
ist
[
float
]]:
def
classify
(
self
,
prompts
:
l
ist
[
str
])
->
l
ist
[
l
ist
[
float
]]:
req_outputs
=
self
.
model
.
classify
(
prompts
)
return
[
req_output
.
outputs
.
probs
for
req_output
in
req_outputs
]
def
encode
(
self
,
prompts
:
L
ist
[
str
],
prompts
:
l
ist
[
str
],
images
:
Optional
[
PromptImageInput
]
=
None
,
videos
:
Optional
[
PromptVideoInput
]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
)
->
L
ist
[
L
ist
[
float
]]:
)
->
l
ist
[
l
ist
[
float
]]:
inputs
=
self
.
get_inputs
(
prompts
,
images
=
images
,
videos
=
videos
,
...
...
@@ -936,9 +935,9 @@ class VllmRunner:
def
score
(
self
,
text_1
:
Union
[
str
,
L
ist
[
str
]],
text_2
:
Union
[
str
,
L
ist
[
str
]],
)
->
L
ist
[
float
]:
text_1
:
Union
[
str
,
l
ist
[
str
]],
text_2
:
Union
[
str
,
l
ist
[
str
]],
)
->
l
ist
[
float
]:
req_outputs
=
self
.
model
.
score
(
text_1
,
text_2
)
return
[
req_output
.
outputs
.
score
for
req_output
in
req_outputs
]
...
...
tests/core/block/e2e/conftest.py
View file @
cf069aa8
# SPDX-License-Identifier: Apache-2.0
from
typing
import
Callable
,
Iterable
,
Optional
from
collections.abc
import
Iterable
from
typing
import
Callable
,
Optional
import
pytest
...
...
tests/core/block/e2e/test_correctness_sliding_window.py
View file @
cf069aa8
# SPDX-License-Identifier: Apache-2.0
import
random
from
typing
import
List
import
pytest
...
...
@@ -137,9 +136,9 @@ def prep_prompts(batch_size: int):
The prompt is just under 10k tokens; sliding window is 4k
so the answer is outside sliding window, but should still be correct.
"""
prompts
:
L
ist
[
str
]
=
[]
answer
:
L
ist
[
int
]
=
[]
indices
:
L
ist
[
int
]
=
[]
prompts
:
l
ist
[
str
]
=
[]
answer
:
l
ist
[
int
]
=
[]
indices
:
l
ist
[
int
]
=
[]
random
.
seed
(
1
)
for
_
in
range
(
batch_size
):
idx
=
random
.
randint
(
30
,
90
)
...
...
@@ -158,7 +157,7 @@ def prep_prompts(batch_size: int):
return
prompts
,
answer
,
indices
def
check_answers
(
indices
:
L
ist
[
int
],
answer
:
L
ist
[
int
],
outputs
:
L
ist
[
str
]):
def
check_answers
(
indices
:
l
ist
[
int
],
answer
:
l
ist
[
int
],
outputs
:
l
ist
[
str
]):
answer2
=
[
int
(
text
[
0
:
2
].
strip
())
for
text
in
outputs
]
print
(
list
(
zip
(
indices
,
zip
(
answer
,
answer2
))))
numok
=
0
...
...
@@ -170,7 +169,7 @@ def check_answers(indices: List[int], answer: List[int], outputs: List[str]):
assert
frac_ok
>
0.7
def
check_window
(
prompts
:
L
ist
[
str
]):
def
check_window
(
prompts
:
l
ist
[
str
]):
def
inner
(
llm
:
LLM
):
sliding_window
=
llm
.
llm_engine
.
model_config
.
get_sliding_window
()
...
...
tests/core/block/test_block_table.py
View file @
cf069aa8
# SPDX-License-Identifier: Apache-2.0
from
typing
import
List
import
pytest
from
vllm.core.block.block_table
import
BlockTable
...
...
@@ -32,7 +30,7 @@ def test_allocate_naive(block_size: int, sequence_len: int):
token_ids
=
list
(
range
(
sequence_len
))
num_blocks_per_alloc
=
len
(
list
(
chunk_list
(
token_ids
,
block_size
)))
block_tables
:
L
ist
[
BlockTable
]
=
[]
block_tables
:
l
ist
[
BlockTable
]
=
[]
for
i
in
range
(
5
):
assert
allocator
.
get_num_free_blocks
(
device
=
Device
.
GPU
)
==
num_gpu_blocks
-
i
*
num_blocks_per_alloc
...
...
@@ -77,7 +75,7 @@ def test_allocate_prefix_caching(block_size: int, sequence_len: int):
num_immutable_blocks_per_alloc
=
len
(
chunked_tokens
)
-
num_mutable_blocks_per_alloc
block_tables
:
L
ist
[
BlockTable
]
=
[]
block_tables
:
l
ist
[
BlockTable
]
=
[]
for
alloc_i
in
range
(
1
,
6
):
block_tables
.
append
(
...
...
@@ -272,7 +270,7 @@ def test_append_token_ids_correct_content(block_size: int, sequence_len: int,
)
block_table
.
allocate
(
token_ids
=
token_ids
,
device
=
Device
.
GPU
)
appended_so_far
:
L
ist
[
int
]
=
[]
appended_so_far
:
l
ist
[
int
]
=
[]
for
append
in
chunk_list
(
token_ids_to_append
,
append_size
):
block_table
.
append_token_ids
(
append
)
appended_so_far
.
extend
(
append
)
...
...
tests/core/block/test_naive_block.py
View file @
cf069aa8
# SPDX-License-Identifier: Apache-2.0
from
typing
import
List
,
Optional
from
typing
import
Optional
import
pytest
...
...
@@ -14,7 +14,7 @@ class TestNaiveBlockAllocator:
def
create_allocate_lambda
(
allocate_type
:
str
,
allocator
:
NaiveBlockAllocator
,
prev_block
:
Optional
[
Block
],
token_ids
:
L
ist
[
int
]):
token_ids
:
l
ist
[
int
]):
if
allocate_type
==
"immutable"
:
allocate_block
=
lambda
:
allocator
.
allocate_immutable_block
(
prev_block
=
prev_block
,
token_ids
=
token_ids
)
...
...
tests/core/block/test_prefix_caching_block.py
View file @
cf069aa8
...
...
@@ -2,7 +2,7 @@
import
math
import
random
from
typing
import
List
,
Optional
from
typing
import
Optional
from
unittest.mock
import
MagicMock
import
pytest
...
...
@@ -123,11 +123,11 @@ class TestPrefixCachingBlock:
@
staticmethod
def
create_chain
(
block_size
:
int
,
token_ids
:
L
ist
[
int
],
num_empty_trailing_blocks
=
0
)
->
L
ist
[
PrefixCachingBlock
]:
token_ids
:
l
ist
[
int
],
num_empty_trailing_blocks
=
0
)
->
l
ist
[
PrefixCachingBlock
]:
"""Helper method which creates a chain of blocks.
"""
blocks
:
L
ist
[
PrefixCachingBlock
]
=
[]
blocks
:
l
ist
[
PrefixCachingBlock
]
=
[]
num_blocks
=
math
.
ceil
(
len
(
token_ids
)
/
block_size
)
+
num_empty_trailing_blocks
...
...
@@ -161,7 +161,7 @@ class TestPrefixCachingBlockAllocator:
@
staticmethod
def
create_allocate_lambda
(
allocate_type
:
str
,
allocator
:
BlockAllocator
,
prev_block
:
Optional
[
Block
],
token_ids
:
L
ist
[
int
]):
token_ids
:
l
ist
[
int
]):
if
allocate_type
==
"immutable"
:
allocate_block
=
lambda
:
allocator
.
allocate_immutable_block
(
prev_block
=
prev_block
,
token_ids
=
token_ids
)
...
...
@@ -839,13 +839,13 @@ class TestPrefixCachingBlockAllocator:
@
staticmethod
def
create_immutable_chain
(
block_size
:
int
,
token_ids
:
L
ist
[
int
],
token_ids
:
l
ist
[
int
],
allocator
:
PrefixCachingBlockAllocator
,
extra_hash
:
Optional
[
int
]
=
None
,
)
->
L
ist
[
PrefixCachingBlock
]:
)
->
l
ist
[
PrefixCachingBlock
]:
"""Helper method which creates a chain of blocks.
"""
blocks
:
L
ist
[
Block
]
=
[]
blocks
:
l
ist
[
Block
]
=
[]
num_blocks
=
math
.
ceil
(
len
(
token_ids
)
/
block_size
)
if
num_blocks
==
0
:
...
...
tests/core/test_chunked_prefill_scheduler.py
View file @
cf069aa8
# SPDX-License-Identifier: Apache-2.0
from
typing
import
List
from
unittest.mock
import
MagicMock
import
pytest
# noqa
...
...
@@ -46,7 +45,7 @@ def test_simple():
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
8
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
running
:
L
ist
[
SequenceGroup
]
=
[]
running
:
l
ist
[
SequenceGroup
]
=
[]
# Add seq groups to scheduler.
for
i
in
range
(
num_seq_group
):
...
...
@@ -93,7 +92,7 @@ def test_chunk():
cache_config
.
num_cpu_blocks
=
32
cache_config
.
num_gpu_blocks
=
32
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
running
:
L
ist
[
SequenceGroup
]
=
[]
running
:
l
ist
[
SequenceGroup
]
=
[]
# Add seq groups to scheduler.
for
i
in
range
(
2
):
...
...
@@ -145,7 +144,7 @@ def test_concurrent_chunking():
cache_config
.
num_cpu_blocks
=
32
cache_config
.
num_gpu_blocks
=
32
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
running
:
L
ist
[
SequenceGroup
]
=
[]
running
:
l
ist
[
SequenceGroup
]
=
[]
# Add seq groups to scheduler.
for
i
in
range
(
2
):
...
...
@@ -226,8 +225,8 @@ def test_short_prompts_jump_long_prompts_in_queue():
cache_config
.
num_cpu_blocks
=
3200
# large KV cache size for large requests
cache_config
.
num_gpu_blocks
=
3200
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
long_seqs
:
L
ist
[
SequenceGroup
]
=
[]
short_seqs
:
L
ist
[
SequenceGroup
]
=
[]
long_seqs
:
l
ist
[
SequenceGroup
]
=
[]
short_seqs
:
l
ist
[
SequenceGroup
]
=
[]
# Add 2 large seq groups to scheduler.
for
i
in
range
(
2
):
...
...
@@ -368,7 +367,7 @@ def test_complex():
cache_config
.
num_cpu_blocks
=
64
cache_config
.
num_gpu_blocks
=
64
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
running
:
L
ist
[
SequenceGroup
]
=
[]
running
:
l
ist
[
SequenceGroup
]
=
[]
# Add seq groups to scheduler.
for
i
in
range
(
2
):
...
...
@@ -439,7 +438,7 @@ def test_maximal_decoding():
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
8
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
running
:
L
ist
[
SequenceGroup
]
=
[]
running
:
l
ist
[
SequenceGroup
]
=
[]
# Add seq groups to scheduler.
for
i
in
range
(
2
):
...
...
@@ -533,7 +532,7 @@ def test_prompt_limit():
cache_config
.
num_cpu_blocks
=
16
cache_config
.
num_gpu_blocks
=
16
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
running
:
L
ist
[
SequenceGroup
]
=
[]
running
:
l
ist
[
SequenceGroup
]
=
[]
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
48
,
...
...
@@ -565,7 +564,7 @@ def test_prompt_limit_exceed():
cache_config
.
num_cpu_blocks
=
16
cache_config
.
num_gpu_blocks
=
16
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
running
:
L
ist
[
SequenceGroup
]
=
[]
running
:
l
ist
[
SequenceGroup
]
=
[]
_
,
seq_group
=
create_dummy_prompt
(
"2"
,
prompt_length
=
48
,
block_size
=
block_size
)
...
...
@@ -699,7 +698,7 @@ def test_chunked_prefill_max_seqs():
cache_config
.
num_cpu_blocks
=
128
cache_config
.
num_gpu_blocks
=
128
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
running
:
L
ist
[
SequenceGroup
]
=
[]
running
:
l
ist
[
SequenceGroup
]
=
[]
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
65
,
...
...
@@ -758,7 +757,7 @@ def test_prefix_caching():
cache_config
.
num_cpu_blocks
=
0
cache_config
.
num_gpu_blocks
=
32
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
running
:
L
ist
[
SequenceGroup
]
=
[]
running
:
l
ist
[
SequenceGroup
]
=
[]
# Add seq groups to scheduler.
for
i
in
range
(
2
):
...
...
@@ -800,7 +799,7 @@ def test_prefix_caching_with_concurrent_partial_prefills():
cache_config
.
num_cpu_blocks
=
0
cache_config
.
num_gpu_blocks
=
32
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
running
:
L
ist
[
SequenceGroup
]
=
[]
running
:
l
ist
[
SequenceGroup
]
=
[]
# Add seq groups to scheduler.
for
i
in
range
(
2
):
...
...
tests/core/test_scheduler.py
View file @
cf069aa8
...
...
@@ -2,7 +2,6 @@
import
time
from
collections
import
deque
from
typing
import
List
,
Set
,
Tuple
from
unittest.mock
import
MagicMock
import
pytest
# noqa
...
...
@@ -57,7 +56,7 @@ def test_scheduler_abort_seq_group():
# Add multiple seq groups to scheduler.
num_seq_group
=
4
request_ids
:
S
et
[
str
]
=
set
()
request_ids
:
s
et
[
str
]
=
set
()
for
i
in
range
(
num_seq_group
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
...
...
@@ -83,7 +82,7 @@ def test_scheduler_schedule_simple():
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
8
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
running
:
L
ist
[
SequenceGroup
]
=
[]
running
:
l
ist
[
SequenceGroup
]
=
[]
# Add seq groups to scheduler.
for
i
in
range
(
num_seq_group
):
...
...
@@ -221,7 +220,7 @@ def test_scheduler_max_seqs():
cache_config
.
num_gpu_blocks
=
8
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
all_seq_groups
:
L
ist
[
SequenceGroup
]
=
[]
all_seq_groups
:
l
ist
[
SequenceGroup
]
=
[]
# Add seq groups to scheduler.
for
i
in
range
(
num_seq_group
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
...
...
@@ -480,7 +479,7 @@ def test_prefill_schedule_max_lora():
num_cpu_blocks
=
64
,
num_gpu_blocks
=
64
)
budget
=
create_token_budget
(
token_budget
=
120
)
curr_loras
:
S
et
[
int
]
=
set
()
curr_loras
:
s
et
[
int
]
=
set
()
for
i
in
range
(
2
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
...
...
@@ -651,8 +650,8 @@ def test_schedule_swapped_max_loras():
block_size
=
block_size
,
num_cpu_blocks
=
32
,
num_gpu_blocks
=
32
)
curr_loras
:
S
et
[
int
]
=
set
()
blocks_to_swap_out
:
L
ist
[
T
uple
[
int
,
int
]]
=
[]
curr_loras
:
s
et
[
int
]
=
set
()
blocks_to_swap_out
:
l
ist
[
t
uple
[
int
,
int
]]
=
[]
for
i
in
range
(
2
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
...
...
@@ -683,7 +682,7 @@ def test_schedule_swapped_cannot_swap_in():
num_cpu_blocks
=
32
,
num_gpu_blocks
=
32
)
curr_loras
=
None
blocks_to_swap_out
:
L
ist
[
T
uple
[
int
,
int
]]
=
[]
blocks_to_swap_out
:
l
ist
[
t
uple
[
int
,
int
]]
=
[]
for
i
in
range
(
2
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
...
...
@@ -714,7 +713,7 @@ def test_infeasible_swap():
num_cpu_blocks
=
32
,
num_gpu_blocks
=
32
)
curr_loras
=
None
blocks_to_swap_out
:
L
ist
[
T
uple
[
int
,
int
]]
=
[]
blocks_to_swap_out
:
l
ist
[
t
uple
[
int
,
int
]]
=
[]
for
i
in
range
(
2
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
...
...
@@ -752,7 +751,7 @@ def test_schedule_swapped_blocks_to_copy():
block_size
=
block_size
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
append_new_token_seq_group
(
60
,
seq_group
,
1
)
blocks_to_swap_out
:
L
ist
[
T
uple
[
int
,
int
]]
=
[]
blocks_to_swap_out
:
l
ist
[
t
uple
[
int
,
int
]]
=
[]
scheduler
.
_swap_out
(
seq_group
,
blocks_to_swap_out
)
scheduler
.
_add_seq_group_to_swapped
(
seq_group
)
...
...
tests/core/test_scheduler_encoder_decoder.py
View file @
cf069aa8
# SPDX-License-Identifier: Apache-2.0
from
typing
import
List
import
pytest
# noqa
from
vllm.config
import
CacheConfig
,
SchedulerConfig
...
...
@@ -48,7 +46,7 @@ def test_scheduler_schedule_simple_encoder_decoder():
cache_config
.
num_cpu_blocks
=
16
# enc and dec prompts per seq_group
cache_config
.
num_gpu_blocks
=
16
# enc and dec prompts per seq_group
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
running
:
L
ist
[
SequenceGroup
]
=
[]
running
:
l
ist
[
SequenceGroup
]
=
[]
# Add seq groups to scheduler.
req_id_list
=
[]
...
...
tests/core/utils.py
View file @
cf069aa8
...
...
@@ -2,9 +2,8 @@
import
time
from
collections
import
defaultdict
from
typing
import
Any
,
Dict
,
List
,
Optional
from
typing
import
Sequence
as
GenericSequence
from
typing
import
Tuple
from
collections.abc
import
Sequence
as
GenericSequence
from
typing
import
Any
,
Optional
from
vllm
import
SamplingParams
from
vllm.core.scheduler
import
Scheduler
,
SchedulerOutputs
...
...
@@ -20,10 +19,10 @@ def create_dummy_prompt(
block_size
:
Optional
[
int
]
=
None
,
lora_request
:
Optional
[
LoRARequest
]
=
None
,
best_of
:
int
=
1
,
prompt_tokens
:
Optional
[
L
ist
[
int
]]
=
None
,
prompt_tokens
:
Optional
[
l
ist
[
int
]]
=
None
,
min_tokens
:
int
=
0
,
max_tokens
:
int
=
16
,
)
->
T
uple
[
Sequence
,
SequenceGroup
]:
)
->
t
uple
[
Sequence
,
SequenceGroup
]:
if
not
block_size
:
block_size
=
prompt_length
...
...
@@ -48,7 +47,7 @@ def create_dummy_prompt(
return
prompt
,
seq_group
def
create_dummy_lora_sequence
(
request_id
:
int
,
token_ids
:
L
ist
[
int
],
def
create_dummy_lora_sequence
(
request_id
:
int
,
token_ids
:
l
ist
[
int
],
block_size
:
int
,
lora_int_id
:
int
)
->
Sequence
:
return
Sequence
(
seq_id
=
request_id
,
inputs
=
token_inputs
(
token_ids
),
...
...
@@ -58,7 +57,7 @@ def create_dummy_lora_sequence(request_id: int, token_ids: List[int],
lora_int_id
=
lora_int_id
))
def
create_dummy_sequence
(
request_id
:
int
,
token_ids
:
L
ist
[
int
],
def
create_dummy_sequence
(
request_id
:
int
,
token_ids
:
l
ist
[
int
],
block_size
:
int
)
->
Sequence
:
return
Sequence
(
seq_id
=
request_id
,
...
...
@@ -74,7 +73,7 @@ def create_dummy_prompt_encoder_decoder(
block_size
:
Optional
[
int
]
=
None
,
lora_request
:
Optional
[
LoRARequest
]
=
None
,
best_of
:
int
=
1
,
)
->
T
uple
[
Sequence
,
Sequence
,
SequenceGroup
]:
)
->
t
uple
[
Sequence
,
Sequence
,
SequenceGroup
]:
if
not
block_size
:
block_size
=
decoder_prompt_length
...
...
@@ -125,7 +124,7 @@ def create_seq_group(
prompt_token_ids
=
[
0
]
*
seq_prompt_len
seqs
:
L
ist
[
Sequence
]
=
[]
seqs
:
l
ist
[
Sequence
]
=
[]
for
seq_id_offset
,
output_len
in
enumerate
(
seq_output_lens
):
seq
=
Sequence
(
seq_id
=
seq_id_start
+
seq_id_offset
,
...
...
@@ -241,7 +240,7 @@ class SchedulerProxy:
def
__init__
(
self
,
scheduler
:
Scheduler
):
self
.
scheduler_
=
scheduler
self
.
call_history
:
D
ict
[
str
,
L
ist
[
Any
]]
=
defaultdict
(
list
)
self
.
call_history
:
d
ict
[
str
,
l
ist
[
Any
]]
=
defaultdict
(
list
)
def
__getattr__
(
self
,
name
:
str
)
->
Any
:
...
...
@@ -253,6 +252,6 @@ class SchedulerProxy:
return
wrapper
def
last_schedule_ret
(
self
,
)
->
T
uple
[
L
ist
[
SequenceGroupMetadata
],
SchedulerOutputs
,
Any
]:
self
,
)
->
t
uple
[
l
ist
[
SequenceGroupMetadata
],
SchedulerOutputs
,
Any
]:
_
,
_
,
ret
=
self
.
call_history
[
"schedule"
][
-
1
]
return
ret
tests/distributed/test_expert_parallel.py
View file @
cf069aa8
# SPDX-License-Identifier: Apache-2.0
from
dataclasses
import
dataclass
from
typing
import
List
,
Literal
,
NamedTuple
,
Optional
from
typing
import
Literal
,
NamedTuple
,
Optional
import
pytest
...
...
@@ -28,8 +28,8 @@ class EPTestOptions(NamedTuple):
@
dataclass
class
EPTestSettings
:
parallel_setups
:
L
ist
[
ParallelSetup
]
distributed_backends
:
L
ist
[
str
]
parallel_setups
:
l
ist
[
ParallelSetup
]
distributed_backends
:
l
ist
[
str
]
task
:
TaskOption
test_options
:
EPTestOptions
...
...
tests/distributed/test_pipeline_parallel.py
View file @
cf069aa8
...
...
@@ -9,7 +9,7 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node
import
json
import
os
from
dataclasses
import
dataclass
from
typing
import
List
,
Literal
,
NamedTuple
,
Optional
from
typing
import
Literal
,
NamedTuple
,
Optional
import
pytest
...
...
@@ -38,14 +38,14 @@ class PPTestOptions(NamedTuple):
@
dataclass
class
PPTestSettings
:
parallel_setups
:
L
ist
[
ParallelSetup
]
parallel_setups
:
l
ist
[
ParallelSetup
]
# NOTE: the length of distributed_backends and
# vllm_major_versions should be the same, and they
# are first zipped together to iterate over all
# test settings.
distributed_backends
:
L
ist
[
str
]
distributed_backends
:
l
ist
[
str
]
# vllm major version: "0" for V0, "1" for V1
vllm_major_versions
:
L
ist
[
str
]
vllm_major_versions
:
l
ist
[
str
]
task
:
TaskOption
test_options
:
PPTestOptions
...
...
tests/distributed/test_pynccl.py
View file @
cf069aa8
...
...
@@ -2,7 +2,6 @@
import
multiprocessing
import
os
from
typing
import
Dict
,
List
import
pytest
import
torch
...
...
@@ -20,9 +19,9 @@ from vllm.utils import update_environment_variables
def
distributed_run
(
fn
,
world_size
):
number_of_processes
=
world_size
processes
:
L
ist
[
multiprocessing
.
Process
]
=
[]
processes
:
l
ist
[
multiprocessing
.
Process
]
=
[]
for
i
in
range
(
number_of_processes
):
env
:
D
ict
[
str
,
str
]
=
{}
env
:
d
ict
[
str
,
str
]
=
{}
env
[
'RANK'
]
=
str
(
i
)
env
[
'LOCAL_RANK'
]
=
str
(
i
)
env
[
'WORLD_SIZE'
]
=
str
(
number_of_processes
)
...
...
tests/distributed/test_shm_broadcast.py
View file @
cf069aa8
...
...
@@ -3,7 +3,6 @@
import
multiprocessing
import
random
import
time
from
typing
import
List
import
numpy
as
np
import
torch.distributed
as
dist
...
...
@@ -13,7 +12,7 @@ from vllm.distributed.utils import StatelessProcessGroup
from
vllm.utils
import
get_ip
,
get_open_port
,
update_environment_variables
def
get_arrays
(
n
:
int
,
seed
:
int
=
0
)
->
L
ist
[
np
.
ndarray
]:
def
get_arrays
(
n
:
int
,
seed
:
int
=
0
)
->
l
ist
[
np
.
ndarray
]:
np
.
random
.
seed
(
seed
)
sizes
=
np
.
random
.
randint
(
1
,
10_000
,
n
)
# on average, each array will have 5k elements
...
...
tests/encoder_decoder/test_e2e_correctness.py
View file @
cf069aa8
...
...
@@ -3,7 +3,7 @@
Run `pytest tests/encoder_decoder/test_e2e_correctness.py`.
"""
from
typing
import
List
,
Optional
,
Tuple
from
typing
import
Optional
import
pytest
from
transformers
import
AutoModelForSeq2SeqLM
...
...
@@ -22,7 +22,7 @@ LIST_ENC_DEC_SUPPORTED_BACKENDS = [
def
vllm_to_hf_output
(
vllm_output
:
T
uple
[
L
ist
[
int
],
str
,
Optional
[
SampleLogprobs
]],
vllm_output
:
t
uple
[
l
ist
[
int
],
str
,
Optional
[
SampleLogprobs
]],
decoder_prompt_type
:
DecoderPromptType
,
):
"""Sanitize vllm output to be comparable with hf output."""
...
...
Prev
1
2
3
4
5
6
7
…
15
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment