Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
cf069aa8
Unverified
Commit
cf069aa8
authored
Mar 03, 2025
by
Harry Mellor
Committed by
GitHub
Mar 02, 2025
Browse files
Update deprecated Python 3.8 typing (#13971)
parent
bf33700e
Changes
300
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
158 additions
and
168 deletions
+158
-168
setup.py
setup.py
+3
-4
tests/async_engine/api_server_async_engine.py
tests/async_engine/api_server_async_engine.py
+3
-2
tests/async_engine/test_async_llm_engine.py
tests/async_engine/test_async_llm_engine.py
+2
-2
tests/compile/piecewise/test_toy_llama.py
tests/compile/piecewise/test_toy_llama.py
+3
-3
tests/compile/test_basic_correctness.py
tests/compile/test_basic_correctness.py
+4
-4
tests/conftest.py
tests/conftest.py
+79
-80
tests/core/block/e2e/conftest.py
tests/core/block/e2e/conftest.py
+2
-1
tests/core/block/e2e/test_correctness_sliding_window.py
tests/core/block/e2e/test_correctness_sliding_window.py
+5
-6
tests/core/block/test_block_table.py
tests/core/block/test_block_table.py
+3
-5
tests/core/block/test_naive_block.py
tests/core/block/test_naive_block.py
+2
-2
tests/core/block/test_prefix_caching_block.py
tests/core/block/test_prefix_caching_block.py
+8
-8
tests/core/test_chunked_prefill_scheduler.py
tests/core/test_chunked_prefill_scheduler.py
+12
-13
tests/core/test_scheduler.py
tests/core/test_scheduler.py
+9
-10
tests/core/test_scheduler_encoder_decoder.py
tests/core/test_scheduler_encoder_decoder.py
+1
-3
tests/core/utils.py
tests/core/utils.py
+10
-11
tests/distributed/test_expert_parallel.py
tests/distributed/test_expert_parallel.py
+3
-3
tests/distributed/test_pipeline_parallel.py
tests/distributed/test_pipeline_parallel.py
+4
-4
tests/distributed/test_pynccl.py
tests/distributed/test_pynccl.py
+2
-3
tests/distributed/test_shm_broadcast.py
tests/distributed/test_shm_broadcast.py
+1
-2
tests/encoder_decoder/test_e2e_correctness.py
tests/encoder_decoder/test_e2e_correctness.py
+2
-2
No files found.
setup.py
View file @
cf069aa8
...
@@ -9,7 +9,6 @@ import subprocess
...
@@ -9,7 +9,6 @@ import subprocess
import
sys
import
sys
from
pathlib
import
Path
from
pathlib
import
Path
from
shutil
import
which
from
shutil
import
which
from
typing
import
Dict
,
List
import
torch
import
torch
from
packaging.version
import
Version
,
parse
from
packaging.version
import
Version
,
parse
...
@@ -78,7 +77,7 @@ class CMakeExtension(Extension):
...
@@ -78,7 +77,7 @@ class CMakeExtension(Extension):
class
cmake_build_ext
(
build_ext
):
class
cmake_build_ext
(
build_ext
):
# A dict of extension directories that have been configured.
# A dict of extension directories that have been configured.
did_config
:
D
ict
[
str
,
bool
]
=
{}
did_config
:
d
ict
[
str
,
bool
]
=
{}
#
#
# Determine number of compilation jobs and optionally nvcc compile threads.
# Determine number of compilation jobs and optionally nvcc compile threads.
...
@@ -548,10 +547,10 @@ def get_vllm_version() -> str:
...
@@ -548,10 +547,10 @@ def get_vllm_version() -> str:
return
version
return
version
def
get_requirements
()
->
L
ist
[
str
]:
def
get_requirements
()
->
l
ist
[
str
]:
"""Get Python package dependencies from requirements.txt."""
"""Get Python package dependencies from requirements.txt."""
def
_read_requirements
(
filename
:
str
)
->
L
ist
[
str
]:
def
_read_requirements
(
filename
:
str
)
->
l
ist
[
str
]:
with
open
(
get_path
(
filename
))
as
f
:
with
open
(
get_path
(
filename
))
as
f
:
requirements
=
f
.
read
().
strip
().
split
(
"
\n
"
)
requirements
=
f
.
read
().
strip
().
split
(
"
\n
"
)
resolved_requirements
=
[]
resolved_requirements
=
[]
...
...
tests/async_engine/api_server_async_engine.py
View file @
cf069aa8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
"""vllm.entrypoints.api_server with some extra logging for testing."""
"""vllm.entrypoints.api_server with some extra logging for testing."""
from
typing
import
Any
,
Dict
,
Iterable
from
collections.abc
import
Iterable
from
typing
import
Any
import
uvicorn
import
uvicorn
from
fastapi.responses
import
JSONResponse
,
Response
from
fastapi.responses
import
JSONResponse
,
Response
...
@@ -24,7 +25,7 @@ class AsyncLLMEngineWithStats(AsyncLLMEngine):
...
@@ -24,7 +25,7 @@ class AsyncLLMEngineWithStats(AsyncLLMEngine):
self
.
_num_aborts
+=
len
(
ids
)
self
.
_num_aborts
+=
len
(
ids
)
await
super
().
_engine_abort
(
ids
)
await
super
().
_engine_abort
(
ids
)
def
testing_stats
(
self
)
->
D
ict
[
str
,
Any
]:
def
testing_stats
(
self
)
->
d
ict
[
str
,
Any
]:
return
{
"num_aborted_requests"
:
self
.
_num_aborts
}
return
{
"num_aborted_requests"
:
self
.
_num_aborts
}
...
...
tests/async_engine/test_async_llm_engine.py
View file @
cf069aa8
...
@@ -6,7 +6,7 @@ import uuid
...
@@ -6,7 +6,7 @@ import uuid
from
asyncio
import
CancelledError
from
asyncio
import
CancelledError
from
copy
import
copy
from
copy
import
copy
from
dataclasses
import
dataclass
from
dataclasses
import
dataclass
from
typing
import
List
,
Optional
from
typing
import
Optional
import
pytest
import
pytest
import
pytest_asyncio
import
pytest_asyncio
...
@@ -254,7 +254,7 @@ async def test_output_kinds(async_engine, stop):
...
@@ -254,7 +254,7 @@ async def test_output_kinds(async_engine, stop):
params
.
output_kind
=
RequestOutputKind
.
DELTA
params
.
output_kind
=
RequestOutputKind
.
DELTA
prompt_tokens
=
None
prompt_tokens
=
None
output_tokens
:
L
ist
[
int
]
=
[]
output_tokens
:
l
ist
[
int
]
=
[]
output_text
=
""
output_text
=
""
output_count
=
0
output_count
=
0
final_output
=
None
final_output
=
None
...
...
tests/compile/piecewise/test_toy_llama.py
View file @
cf069aa8
...
@@ -8,7 +8,7 @@ if the config `tractable_init` is set to True. Otherwise, the weights are
...
@@ -8,7 +8,7 @@ if the config `tractable_init` is set to True. Otherwise, the weights are
initialized randomly with a fixed seed.
initialized randomly with a fixed seed.
"""
"""
from
dataclasses
import
dataclass
from
dataclasses
import
dataclass
from
typing
import
Any
,
List
,
Optional
,
Tuple
from
typing
import
Any
,
Optional
import
torch
import
torch
from
torch
import
nn
from
torch
import
nn
...
@@ -56,7 +56,7 @@ class LlamaConfig:
...
@@ -56,7 +56,7 @@ class LlamaConfig:
random_seed
:
int
=
0
random_seed
:
int
=
0
def
compute_hash
(
self
)
->
str
:
def
compute_hash
(
self
)
->
str
:
factors
:
L
ist
[
Any
]
=
[]
factors
:
l
ist
[
Any
]
=
[]
for
k
,
v
in
self
.
__dict__
.
items
():
for
k
,
v
in
self
.
__dict__
.
items
():
if
k
==
"random_seed"
:
if
k
==
"random_seed"
:
continue
continue
...
@@ -174,7 +174,7 @@ class LlamaDecoderLayer(nn.Module):
...
@@ -174,7 +174,7 @@ class LlamaDecoderLayer(nn.Module):
positions
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
residual
:
Optional
[
torch
.
Tensor
],
residual
:
Optional
[
torch
.
Tensor
],
)
->
T
uple
[
torch
.
Tensor
,
torch
.
Tensor
]:
)
->
t
uple
[
torch
.
Tensor
,
torch
.
Tensor
]:
"""
"""
For tractable computation:
For tractable computation:
- if residual is None, the outputs are:
- if residual is None, the outputs are:
...
...
tests/compile/test_basic_correctness.py
View file @
cf069aa8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
import
dataclasses
import
dataclasses
from
typing
import
Dict
,
List
,
Optional
from
typing
import
Optional
import
pytest
import
pytest
...
@@ -14,7 +14,7 @@ from ..utils import compare_all_settings
...
@@ -14,7 +14,7 @@ from ..utils import compare_all_settings
@
dataclasses
.
dataclass
@
dataclasses
.
dataclass
class
TestSetting
:
class
TestSetting
:
model
:
str
model
:
str
model_args
:
L
ist
[
str
]
model_args
:
l
ist
[
str
]
pp_size
:
int
pp_size
:
int
tp_size
:
int
tp_size
:
int
attn_backend
:
str
attn_backend
:
str
...
@@ -108,8 +108,8 @@ def test_compile_correctness(test_setting: TestSetting):
...
@@ -108,8 +108,8 @@ def test_compile_correctness(test_setting: TestSetting):
final_args
=
[
"--enforce-eager"
]
+
model_args
+
[
"-pp"
,
str
(
pp_size
)]
+
\
final_args
=
[
"--enforce-eager"
]
+
model_args
+
[
"-pp"
,
str
(
pp_size
)]
+
\
[
"-tp"
,
str
(
tp_size
)]
[
"-tp"
,
str
(
tp_size
)]
all_args
:
L
ist
[
L
ist
[
str
]]
=
[]
all_args
:
l
ist
[
l
ist
[
str
]]
=
[]
all_envs
:
L
ist
[
Optional
[
D
ict
[
str
,
str
]]]
=
[]
all_envs
:
l
ist
[
Optional
[
d
ict
[
str
,
str
]]]
=
[]
for
level
in
[
for
level
in
[
CompilationLevel
.
NO_COMPILATION
,
CompilationLevel
.
NO_COMPILATION
,
...
...
tests/conftest.py
View file @
cf069aa8
...
@@ -5,8 +5,7 @@ import os
...
@@ -5,8 +5,7 @@ import os
import
tempfile
import
tempfile
from
collections
import
UserList
from
collections
import
UserList
from
enum
import
Enum
from
enum
import
Enum
from
typing
import
(
Any
,
Callable
,
Dict
,
List
,
Optional
,
Tuple
,
Type
,
from
typing
import
Any
,
Callable
,
Optional
,
TypedDict
,
TypeVar
,
Union
TypedDict
,
TypeVar
,
Union
)
import
numpy
as
np
import
numpy
as
np
import
pytest
import
pytest
...
@@ -47,14 +46,14 @@ _SYS_MSG = os.path.join(_TEST_DIR, "system_messages", "sonnet3.5_nov2024.txt")
...
@@ -47,14 +46,14 @@ _SYS_MSG = os.path.join(_TEST_DIR, "system_messages", "sonnet3.5_nov2024.txt")
_M
=
TypeVar
(
"_M"
)
_M
=
TypeVar
(
"_M"
)
_PromptMultiModalInput
=
Union
[
L
ist
[
_M
],
L
ist
[
L
ist
[
_M
]]]
_PromptMultiModalInput
=
Union
[
l
ist
[
_M
],
l
ist
[
l
ist
[
_M
]]]
PromptImageInput
=
_PromptMultiModalInput
[
Image
.
Image
]
PromptImageInput
=
_PromptMultiModalInput
[
Image
.
Image
]
PromptAudioInput
=
_PromptMultiModalInput
[
T
uple
[
np
.
ndarray
,
int
]]
PromptAudioInput
=
_PromptMultiModalInput
[
t
uple
[
np
.
ndarray
,
int
]]
PromptVideoInput
=
_PromptMultiModalInput
[
np
.
ndarray
]
PromptVideoInput
=
_PromptMultiModalInput
[
np
.
ndarray
]
def
_read_prompts
(
filename
:
str
)
->
L
ist
[
str
]:
def
_read_prompts
(
filename
:
str
)
->
l
ist
[
str
]:
with
open
(
filename
)
as
f
:
with
open
(
filename
)
as
f
:
prompts
=
f
.
readlines
()
prompts
=
f
.
readlines
()
return
prompts
return
prompts
...
@@ -77,7 +76,7 @@ class _ImageAssets(_ImageAssetsBase):
...
@@ -77,7 +76,7 @@ class _ImageAssets(_ImageAssetsBase):
ImageAsset
(
"cherry_blossom"
),
ImageAsset
(
"cherry_blossom"
),
])
])
def
prompts
(
self
,
prompts
:
_ImageAssetPrompts
)
->
L
ist
[
str
]:
def
prompts
(
self
,
prompts
:
_ImageAssetPrompts
)
->
l
ist
[
str
]:
"""
"""
Convenience method to define the prompt for each test image.
Convenience method to define the prompt for each test image.
...
@@ -102,7 +101,7 @@ class _VideoAssets(_VideoAssetsBase):
...
@@ -102,7 +101,7 @@ class _VideoAssets(_VideoAssetsBase):
VideoAsset
(
"sample_demo_1.mp4"
),
VideoAsset
(
"sample_demo_1.mp4"
),
])
])
def
prompts
(
self
,
prompts
:
_VideoAssetPrompts
)
->
L
ist
[
str
]:
def
prompts
(
self
,
prompts
:
_VideoAssetPrompts
)
->
l
ist
[
str
]:
return
[
prompts
[
"sample_demo_1"
]]
return
[
prompts
[
"sample_demo_1"
]]
...
@@ -175,7 +174,7 @@ def dynamo_reset():
...
@@ -175,7 +174,7 @@ def dynamo_reset():
@
pytest
.
fixture
@
pytest
.
fixture
def
example_prompts
()
->
L
ist
[
str
]:
def
example_prompts
()
->
l
ist
[
str
]:
prompts
=
[]
prompts
=
[]
for
filename
in
_TEST_PROMPTS
:
for
filename
in
_TEST_PROMPTS
:
prompts
+=
_read_prompts
(
filename
)
prompts
+=
_read_prompts
(
filename
)
...
@@ -197,7 +196,7 @@ class DecoderPromptType(Enum):
...
@@ -197,7 +196,7 @@ class DecoderPromptType(Enum):
@
pytest
.
fixture
@
pytest
.
fixture
def
example_encoder_decoder_prompts
(
def
example_encoder_decoder_prompts
(
)
->
D
ict
[
DecoderPromptType
,
L
ist
[
ExplicitEncoderDecoderPrompt
]]:
)
->
d
ict
[
DecoderPromptType
,
l
ist
[
ExplicitEncoderDecoderPrompt
]]:
'''
'''
Returns an encoder prompt list and a decoder prompt list, wherein each pair
Returns an encoder prompt list and a decoder prompt list, wherein each pair
of same-index entries in both lists corresponds to an (encoder prompt,
of same-index entries in both lists corresponds to an (encoder prompt,
...
@@ -229,7 +228,7 @@ def example_encoder_decoder_prompts(
...
@@ -229,7 +228,7 @@ def example_encoder_decoder_prompts(
@
pytest
.
fixture
@
pytest
.
fixture
def
example_long_prompts
()
->
L
ist
[
str
]:
def
example_long_prompts
()
->
l
ist
[
str
]:
prompts
=
[]
prompts
=
[]
for
filename
in
_LONG_PROMPTS
:
for
filename
in
_LONG_PROMPTS
:
prompts
+=
_read_prompts
(
filename
)
prompts
+=
_read_prompts
(
filename
)
...
@@ -273,11 +272,11 @@ class HfRunner:
...
@@ -273,11 +272,11 @@ class HfRunner:
model_name
:
str
,
model_name
:
str
,
dtype
:
str
=
"half"
,
dtype
:
str
=
"half"
,
*
,
*
,
model_kwargs
:
Optional
[
D
ict
[
str
,
Any
]]
=
None
,
model_kwargs
:
Optional
[
d
ict
[
str
,
Any
]]
=
None
,
is_sentence_transformer
:
bool
=
False
,
is_sentence_transformer
:
bool
=
False
,
is_cross_encoder
:
bool
=
False
,
is_cross_encoder
:
bool
=
False
,
skip_tokenizer_init
:
bool
=
False
,
skip_tokenizer_init
:
bool
=
False
,
auto_cls
:
T
ype
[
_BaseAutoModelClass
]
=
AutoModelForCausalLM
,
auto_cls
:
t
ype
[
_BaseAutoModelClass
]
=
AutoModelForCausalLM
,
postprocess_inputs
:
Callable
[...,
BatchEncoding
]
=
identity
,
postprocess_inputs
:
Callable
[...,
BatchEncoding
]
=
identity
,
)
->
None
:
)
->
None
:
torch_dtype
=
STR_DTYPE_TO_TORCH_DTYPE
[
dtype
]
torch_dtype
=
STR_DTYPE_TO_TORCH_DTYPE
[
dtype
]
...
@@ -334,11 +333,11 @@ class HfRunner:
...
@@ -334,11 +333,11 @@ class HfRunner:
def
get_inputs
(
def
get_inputs
(
self
,
self
,
prompts
:
L
ist
[
str
],
prompts
:
l
ist
[
str
],
images
:
Optional
[
PromptImageInput
]
=
None
,
images
:
Optional
[
PromptImageInput
]
=
None
,
videos
:
Optional
[
PromptVideoInput
]
=
None
,
videos
:
Optional
[
PromptVideoInput
]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
)
->
L
ist
[
BatchEncoding
]:
)
->
l
ist
[
BatchEncoding
]:
if
images
is
not
None
:
if
images
is
not
None
:
assert
len
(
prompts
)
==
len
(
images
)
assert
len
(
prompts
)
==
len
(
images
)
...
@@ -348,9 +347,9 @@ class HfRunner:
...
@@ -348,9 +347,9 @@ class HfRunner:
if
audios
is
not
None
:
if
audios
is
not
None
:
assert
len
(
prompts
)
==
len
(
audios
)
assert
len
(
prompts
)
==
len
(
audios
)
all_inputs
:
L
ist
[
BatchEncoding
]
=
[]
all_inputs
:
l
ist
[
BatchEncoding
]
=
[]
for
i
,
prompt
in
enumerate
(
prompts
):
for
i
,
prompt
in
enumerate
(
prompts
):
processor_kwargs
:
D
ict
[
str
,
Any
]
=
{
processor_kwargs
:
d
ict
[
str
,
Any
]
=
{
"text"
:
prompt
,
"text"
:
prompt
,
"return_tensors"
:
"pt"
,
"return_tensors"
:
"pt"
,
}
}
...
@@ -370,7 +369,7 @@ class HfRunner:
...
@@ -370,7 +369,7 @@ class HfRunner:
return
all_inputs
return
all_inputs
def
classify
(
self
,
prompts
:
L
ist
[
str
])
->
L
ist
[
str
]:
def
classify
(
self
,
prompts
:
l
ist
[
str
])
->
l
ist
[
str
]:
# output is final logits
# output is final logits
all_inputs
=
self
.
get_inputs
(
prompts
)
all_inputs
=
self
.
get_inputs
(
prompts
)
outputs
=
[]
outputs
=
[]
...
@@ -383,18 +382,18 @@ class HfRunner:
...
@@ -383,18 +382,18 @@ class HfRunner:
def
generate
(
def
generate
(
self
,
self
,
prompts
:
L
ist
[
str
],
prompts
:
l
ist
[
str
],
images
:
Optional
[
PromptImageInput
]
=
None
,
images
:
Optional
[
PromptImageInput
]
=
None
,
videos
:
Optional
[
PromptVideoInput
]
=
None
,
videos
:
Optional
[
PromptVideoInput
]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
**
kwargs
:
Any
,
**
kwargs
:
Any
,
)
->
L
ist
[
T
uple
[
L
ist
[
L
ist
[
int
]],
L
ist
[
str
]]]:
)
->
l
ist
[
t
uple
[
l
ist
[
l
ist
[
int
]],
l
ist
[
str
]]]:
all_inputs
=
self
.
get_inputs
(
prompts
,
all_inputs
=
self
.
get_inputs
(
prompts
,
images
=
images
,
images
=
images
,
videos
=
videos
,
videos
=
videos
,
audios
=
audios
)
audios
=
audios
)
outputs
:
L
ist
[
T
uple
[
L
ist
[
L
ist
[
int
]],
L
ist
[
str
]]]
=
[]
outputs
:
l
ist
[
t
uple
[
l
ist
[
l
ist
[
int
]],
l
ist
[
str
]]]
=
[]
for
inputs
in
all_inputs
:
for
inputs
in
all_inputs
:
output_ids
=
self
.
model
.
generate
(
output_ids
=
self
.
model
.
generate
(
**
self
.
wrap_device
(
inputs
,
device
=
self
.
model
.
device
.
type
),
**
self
.
wrap_device
(
inputs
,
device
=
self
.
model
.
device
.
type
),
...
@@ -412,13 +411,13 @@ class HfRunner:
...
@@ -412,13 +411,13 @@ class HfRunner:
def
generate_greedy
(
def
generate_greedy
(
self
,
self
,
prompts
:
L
ist
[
str
],
prompts
:
l
ist
[
str
],
max_tokens
:
int
,
max_tokens
:
int
,
images
:
Optional
[
PromptImageInput
]
=
None
,
images
:
Optional
[
PromptImageInput
]
=
None
,
videos
:
Optional
[
PromptVideoInput
]
=
None
,
videos
:
Optional
[
PromptVideoInput
]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
**
kwargs
:
Any
,
**
kwargs
:
Any
,
)
->
L
ist
[
T
uple
[
L
ist
[
int
],
str
]]:
)
->
l
ist
[
t
uple
[
l
ist
[
int
],
str
]]:
outputs
=
self
.
generate
(
prompts
,
outputs
=
self
.
generate
(
prompts
,
do_sample
=
False
,
do_sample
=
False
,
max_new_tokens
=
max_tokens
,
max_new_tokens
=
max_tokens
,
...
@@ -432,10 +431,10 @@ class HfRunner:
...
@@ -432,10 +431,10 @@ class HfRunner:
def
generate_beam_search
(
def
generate_beam_search
(
self
,
self
,
prompts
:
L
ist
[
str
],
prompts
:
l
ist
[
str
],
beam_width
:
int
,
beam_width
:
int
,
max_tokens
:
int
,
max_tokens
:
int
,
)
->
L
ist
[
T
uple
[
L
ist
[
L
ist
[
int
]],
L
ist
[
str
]]]:
)
->
l
ist
[
t
uple
[
l
ist
[
l
ist
[
int
]],
l
ist
[
str
]]]:
outputs
=
self
.
generate
(
prompts
,
outputs
=
self
.
generate
(
prompts
,
do_sample
=
False
,
do_sample
=
False
,
max_new_tokens
=
max_tokens
,
max_new_tokens
=
max_tokens
,
...
@@ -453,19 +452,19 @@ class HfRunner:
...
@@ -453,19 +452,19 @@ class HfRunner:
def
generate_greedy_logprobs
(
def
generate_greedy_logprobs
(
self
,
self
,
prompts
:
L
ist
[
str
],
prompts
:
l
ist
[
str
],
max_tokens
:
int
,
max_tokens
:
int
,
images
:
Optional
[
PromptImageInput
]
=
None
,
images
:
Optional
[
PromptImageInput
]
=
None
,
videos
:
Optional
[
PromptVideoInput
]
=
None
,
videos
:
Optional
[
PromptVideoInput
]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
**
kwargs
:
Any
,
**
kwargs
:
Any
,
)
->
L
ist
[
L
ist
[
torch
.
Tensor
]]:
)
->
l
ist
[
l
ist
[
torch
.
Tensor
]]:
all_inputs
=
self
.
get_inputs
(
prompts
,
all_inputs
=
self
.
get_inputs
(
prompts
,
images
=
images
,
images
=
images
,
videos
=
videos
,
videos
=
videos
,
audios
=
audios
)
audios
=
audios
)
all_logprobs
:
L
ist
[
L
ist
[
torch
.
Tensor
]]
=
[]
all_logprobs
:
l
ist
[
l
ist
[
torch
.
Tensor
]]
=
[]
for
inputs
in
all_inputs
:
for
inputs
in
all_inputs
:
output
=
self
.
model
.
generate
(
output
=
self
.
model
.
generate
(
**
self
.
wrap_device
(
inputs
,
device
=
self
.
model
.
device
.
type
),
**
self
.
wrap_device
(
inputs
,
device
=
self
.
model
.
device
.
type
),
...
@@ -483,11 +482,11 @@ class HfRunner:
...
@@ -483,11 +482,11 @@ class HfRunner:
def
_hidden_states_to_seq_logprobs
(
def
_hidden_states_to_seq_logprobs
(
self
,
self
,
hidden_states
:
T
uple
[
T
uple
[
torch
.
Tensor
,
...],
...],
hidden_states
:
t
uple
[
t
uple
[
torch
.
Tensor
,
...],
...],
)
->
L
ist
[
torch
.
Tensor
]:
)
->
l
ist
[
torch
.
Tensor
]:
output_embeddings
=
self
.
model
.
get_output_embeddings
()
output_embeddings
=
self
.
model
.
get_output_embeddings
()
seq_logprobs
:
L
ist
[
torch
.
Tensor
]
=
[]
seq_logprobs
:
l
ist
[
torch
.
Tensor
]
=
[]
for
_
,
hidden_state
in
enumerate
(
hidden_states
):
for
_
,
hidden_state
in
enumerate
(
hidden_states
):
last_hidden_states
=
hidden_state
[
-
1
][
0
]
last_hidden_states
=
hidden_state
[
-
1
][
0
]
logits
=
torch
.
matmul
(
logits
=
torch
.
matmul
(
...
@@ -503,14 +502,14 @@ class HfRunner:
...
@@ -503,14 +502,14 @@ class HfRunner:
def
_hidden_states_to_logprobs
(
def
_hidden_states_to_logprobs
(
self
,
self
,
hidden_states
:
T
uple
[
T
uple
[
torch
.
Tensor
,
...],
...],
hidden_states
:
t
uple
[
t
uple
[
torch
.
Tensor
,
...],
...],
num_logprobs
:
int
,
num_logprobs
:
int
,
)
->
T
uple
[
L
ist
[
D
ict
[
int
,
float
]],
int
]:
)
->
t
uple
[
l
ist
[
d
ict
[
int
,
float
]],
int
]:
seq_logprobs
=
self
.
_hidden_states_to_seq_logprobs
(
hidden_states
)
seq_logprobs
=
self
.
_hidden_states_to_seq_logprobs
(
hidden_states
)
output_len
=
len
(
hidden_states
)
output_len
=
len
(
hidden_states
)
# convert to dict
# convert to dict
seq_logprobs_lst
:
L
ist
[
D
ict
[
int
,
float
]]
=
[]
seq_logprobs_lst
:
l
ist
[
d
ict
[
int
,
float
]]
=
[]
for
tok_idx
,
tok_logprobs
in
enumerate
(
seq_logprobs
):
for
tok_idx
,
tok_logprobs
in
enumerate
(
seq_logprobs
):
# drop prompt logprobs
# drop prompt logprobs
if
tok_idx
==
0
:
if
tok_idx
==
0
:
...
@@ -530,22 +529,22 @@ class HfRunner:
...
@@ -530,22 +529,22 @@ class HfRunner:
def
generate_greedy_logprobs_limit
(
def
generate_greedy_logprobs_limit
(
self
,
self
,
prompts
:
L
ist
[
str
],
prompts
:
l
ist
[
str
],
max_tokens
:
int
,
max_tokens
:
int
,
num_logprobs
:
int
,
num_logprobs
:
int
,
images
:
Optional
[
PromptImageInput
]
=
None
,
images
:
Optional
[
PromptImageInput
]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
videos
:
Optional
[
PromptVideoInput
]
=
None
,
videos
:
Optional
[
PromptVideoInput
]
=
None
,
**
kwargs
:
Any
,
**
kwargs
:
Any
,
)
->
L
ist
[
TokensTextLogprobs
]:
)
->
l
ist
[
TokensTextLogprobs
]:
all_inputs
=
self
.
get_inputs
(
prompts
,
all_inputs
=
self
.
get_inputs
(
prompts
,
images
=
images
,
images
=
images
,
videos
=
videos
,
videos
=
videos
,
audios
=
audios
)
audios
=
audios
)
all_logprobs
:
L
ist
[
L
ist
[
D
ict
[
int
,
float
]]]
=
[]
all_logprobs
:
l
ist
[
l
ist
[
d
ict
[
int
,
float
]]]
=
[]
all_output_ids
:
L
ist
[
L
ist
[
int
]]
=
[]
all_output_ids
:
l
ist
[
l
ist
[
int
]]
=
[]
all_output_strs
:
L
ist
[
str
]
=
[]
all_output_strs
:
l
ist
[
str
]
=
[]
for
inputs
in
all_inputs
:
for
inputs
in
all_inputs
:
output
=
self
.
model
.
generate
(
output
=
self
.
model
.
generate
(
...
@@ -577,23 +576,23 @@ class HfRunner:
...
@@ -577,23 +576,23 @@ class HfRunner:
def
generate_encoder_decoder_greedy_logprobs_limit
(
def
generate_encoder_decoder_greedy_logprobs_limit
(
self
,
self
,
encoder_decoder_prompts
:
L
ist
[
ExplicitEncoderDecoderPrompt
[
str
,
str
]],
encoder_decoder_prompts
:
l
ist
[
ExplicitEncoderDecoderPrompt
[
str
,
str
]],
max_tokens
:
int
,
max_tokens
:
int
,
num_logprobs
:
int
,
num_logprobs
:
int
,
images
:
Optional
[
PromptImageInput
]
=
None
,
images
:
Optional
[
PromptImageInput
]
=
None
,
**
kwargs
:
Any
,
**
kwargs
:
Any
,
)
->
L
ist
[
TokensTextLogprobs
]:
)
->
l
ist
[
TokensTextLogprobs
]:
'''
'''
Greedy logprobs generation for vLLM encoder/decoder models
Greedy logprobs generation for vLLM encoder/decoder models
'''
'''
all_logprobs
:
L
ist
[
L
ist
[
D
ict
[
int
,
float
]]]
=
[]
all_logprobs
:
l
ist
[
l
ist
[
d
ict
[
int
,
float
]]]
=
[]
all_output_ids
:
L
ist
[
L
ist
[
int
]]
=
[]
all_output_ids
:
l
ist
[
l
ist
[
int
]]
=
[]
all_output_strs
:
L
ist
[
str
]
=
[]
all_output_strs
:
l
ist
[
str
]
=
[]
for
i
,
(
encoder_prompt
,
decoder_prompt
)
in
enumerate
(
for
i
,
(
encoder_prompt
,
decoder_prompt
)
in
enumerate
(
to_enc_dec_tuple_list
(
encoder_decoder_prompts
)):
to_enc_dec_tuple_list
(
encoder_decoder_prompts
)):
processor_kwargs
:
D
ict
[
str
,
Any
]
=
{
processor_kwargs
:
d
ict
[
str
,
Any
]
=
{
"text"
:
encoder_prompt
,
"text"
:
encoder_prompt
,
"return_tensors"
:
"pt"
,
"return_tensors"
:
"pt"
,
}
}
...
@@ -641,10 +640,10 @@ class HfRunner:
...
@@ -641,10 +640,10 @@ class HfRunner:
return
[(
output_ids
,
output_str
,
output_logprobs
)
return
[(
output_ids
,
output_str
,
output_logprobs
)
for
output_ids
,
output_str
,
output_logprobs
in
outputs
]
for
output_ids
,
output_str
,
output_logprobs
in
outputs
]
def
encode
(
self
,
prompts
:
L
ist
[
str
])
->
L
ist
[
L
ist
[
torch
.
Tensor
]]:
def
encode
(
self
,
prompts
:
l
ist
[
str
])
->
l
ist
[
l
ist
[
torch
.
Tensor
]]:
return
self
.
model
.
encode
(
prompts
)
return
self
.
model
.
encode
(
prompts
)
def
predict
(
self
,
prompts
:
L
ist
[
L
ist
[
str
]])
->
torch
.
Tensor
:
def
predict
(
self
,
prompts
:
l
ist
[
l
ist
[
str
]])
->
torch
.
Tensor
:
return
self
.
model
.
predict
(
prompts
,
convert_to_tensor
=
True
)
return
self
.
model
.
predict
(
prompts
,
convert_to_tensor
=
True
)
def
__enter__
(
self
):
def
__enter__
(
self
):
...
@@ -699,11 +698,11 @@ class VllmRunner:
...
@@ -699,11 +698,11 @@ class VllmRunner:
def
get_inputs
(
def
get_inputs
(
self
,
self
,
prompts
:
L
ist
[
str
],
prompts
:
l
ist
[
str
],
images
:
Optional
[
PromptImageInput
]
=
None
,
images
:
Optional
[
PromptImageInput
]
=
None
,
videos
:
Optional
[
PromptVideoInput
]
=
None
,
videos
:
Optional
[
PromptVideoInput
]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
)
->
L
ist
[
TextPrompt
]:
)
->
l
ist
[
TextPrompt
]:
if
images
is
not
None
:
if
images
is
not
None
:
assert
len
(
prompts
)
==
len
(
images
)
assert
len
(
prompts
)
==
len
(
images
)
...
@@ -733,13 +732,13 @@ class VllmRunner:
...
@@ -733,13 +732,13 @@ class VllmRunner:
def
generate
(
def
generate
(
self
,
self
,
prompts
:
L
ist
[
str
],
prompts
:
l
ist
[
str
],
sampling_params
:
SamplingParams
,
sampling_params
:
SamplingParams
,
images
:
Optional
[
PromptImageInput
]
=
None
,
images
:
Optional
[
PromptImageInput
]
=
None
,
videos
:
Optional
[
PromptVideoInput
]
=
None
,
videos
:
Optional
[
PromptVideoInput
]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
**
kwargs
:
Any
,
**
kwargs
:
Any
,
)
->
L
ist
[
T
uple
[
L
ist
[
L
ist
[
int
]],
L
ist
[
str
]]]:
)
->
l
ist
[
t
uple
[
l
ist
[
l
ist
[
int
]],
l
ist
[
str
]]]:
inputs
=
self
.
get_inputs
(
prompts
,
inputs
=
self
.
get_inputs
(
prompts
,
images
=
images
,
images
=
images
,
videos
=
videos
,
videos
=
videos
,
...
@@ -749,12 +748,12 @@ class VllmRunner:
...
@@ -749,12 +748,12 @@ class VllmRunner:
sampling_params
=
sampling_params
,
sampling_params
=
sampling_params
,
**
kwargs
)
**
kwargs
)
outputs
:
L
ist
[
T
uple
[
L
ist
[
L
ist
[
int
]],
L
ist
[
str
]]]
=
[]
outputs
:
l
ist
[
t
uple
[
l
ist
[
l
ist
[
int
]],
l
ist
[
str
]]]
=
[]
for
req_output
in
req_outputs
:
for
req_output
in
req_outputs
:
prompt_str
=
req_output
.
prompt
prompt_str
=
req_output
.
prompt
prompt_ids
=
req_output
.
prompt_token_ids
prompt_ids
=
req_output
.
prompt_token_ids
req_sample_output_ids
:
L
ist
[
L
ist
[
int
]]
=
[]
req_sample_output_ids
:
l
ist
[
l
ist
[
int
]]
=
[]
req_sample_output_strs
:
L
ist
[
str
]
=
[]
req_sample_output_strs
:
l
ist
[
str
]
=
[]
for
sample
in
req_output
.
outputs
:
for
sample
in
req_output
.
outputs
:
output_str
=
sample
.
text
output_str
=
sample
.
text
output_ids
=
list
(
sample
.
token_ids
)
output_ids
=
list
(
sample
.
token_ids
)
...
@@ -765,9 +764,9 @@ class VllmRunner:
...
@@ -765,9 +764,9 @@ class VllmRunner:
@
staticmethod
@
staticmethod
def
_final_steps_generate_w_logprobs
(
def
_final_steps_generate_w_logprobs
(
req_outputs
:
L
ist
[
RequestOutput
],
req_outputs
:
l
ist
[
RequestOutput
],
)
->
L
ist
[
TokensTextLogprobsPromptLogprobs
]:
)
->
l
ist
[
TokensTextLogprobsPromptLogprobs
]:
outputs
:
L
ist
[
TokensTextLogprobsPromptLogprobs
]
=
[]
outputs
:
l
ist
[
TokensTextLogprobsPromptLogprobs
]
=
[]
for
req_output
in
req_outputs
:
for
req_output
in
req_outputs
:
assert
len
(
req_output
.
outputs
)
>
0
assert
len
(
req_output
.
outputs
)
>
0
for
sample
in
req_output
.
outputs
:
for
sample
in
req_output
.
outputs
:
...
@@ -780,14 +779,14 @@ class VllmRunner:
...
@@ -780,14 +779,14 @@ class VllmRunner:
def
generate_w_logprobs
(
def
generate_w_logprobs
(
self
,
self
,
prompts
:
L
ist
[
str
],
prompts
:
l
ist
[
str
],
sampling_params
:
SamplingParams
,
sampling_params
:
SamplingParams
,
images
:
Optional
[
PromptImageInput
]
=
None
,
images
:
Optional
[
PromptImageInput
]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
videos
:
Optional
[
PromptVideoInput
]
=
None
,
videos
:
Optional
[
PromptVideoInput
]
=
None
,
**
kwargs
:
Any
,
**
kwargs
:
Any
,
)
->
Union
[
L
ist
[
TokensTextLogprobs
],
)
->
Union
[
l
ist
[
TokensTextLogprobs
],
L
ist
[
TokensTextLogprobsPromptLogprobs
]]:
l
ist
[
TokensTextLogprobsPromptLogprobs
]]:
inputs
=
self
.
get_inputs
(
prompts
,
inputs
=
self
.
get_inputs
(
prompts
,
images
=
images
,
images
=
images
,
videos
=
videos
,
videos
=
videos
,
...
@@ -806,10 +805,10 @@ class VllmRunner:
...
@@ -806,10 +805,10 @@ class VllmRunner:
def
generate_encoder_decoder_w_logprobs
(
def
generate_encoder_decoder_w_logprobs
(
self
,
self
,
encoder_decoder_prompts
:
L
ist
[
ExplicitEncoderDecoderPrompt
[
str
,
str
]],
encoder_decoder_prompts
:
l
ist
[
ExplicitEncoderDecoderPrompt
[
str
,
str
]],
sampling_params
:
SamplingParams
,
sampling_params
:
SamplingParams
,
)
->
Union
[
L
ist
[
TokensTextLogprobs
],
)
->
Union
[
l
ist
[
TokensTextLogprobs
],
L
ist
[
TokensTextLogprobsPromptLogprobs
]]:
l
ist
[
TokensTextLogprobsPromptLogprobs
]]:
'''
'''
Logprobs generation for vLLM encoder/decoder models
Logprobs generation for vLLM encoder/decoder models
'''
'''
...
@@ -826,13 +825,13 @@ class VllmRunner:
...
@@ -826,13 +825,13 @@ class VllmRunner:
def
generate_greedy
(
def
generate_greedy
(
self
,
self
,
prompts
:
L
ist
[
str
],
prompts
:
l
ist
[
str
],
max_tokens
:
int
,
max_tokens
:
int
,
images
:
Optional
[
PromptImageInput
]
=
None
,
images
:
Optional
[
PromptImageInput
]
=
None
,
videos
:
Optional
[
PromptVideoInput
]
=
None
,
videos
:
Optional
[
PromptVideoInput
]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
**
kwargs
:
Any
,
**
kwargs
:
Any
,
)
->
L
ist
[
T
uple
[
L
ist
[
int
],
str
]]:
)
->
l
ist
[
t
uple
[
l
ist
[
int
],
str
]]:
greedy_params
=
SamplingParams
(
temperature
=
0.0
,
max_tokens
=
max_tokens
)
greedy_params
=
SamplingParams
(
temperature
=
0.0
,
max_tokens
=
max_tokens
)
outputs
=
self
.
generate
(
prompts
,
outputs
=
self
.
generate
(
prompts
,
greedy_params
,
greedy_params
,
...
@@ -845,18 +844,18 @@ class VllmRunner:
...
@@ -845,18 +844,18 @@ class VllmRunner:
def
generate_greedy_logprobs
(
def
generate_greedy_logprobs
(
self
,
self
,
prompts
:
L
ist
[
str
],
prompts
:
l
ist
[
str
],
max_tokens
:
int
,
max_tokens
:
int
,
num_logprobs
:
int
,
num_logprobs
:
int
,
num_prompt_logprobs
:
Optional
[
int
]
=
None
,
num_prompt_logprobs
:
Optional
[
int
]
=
None
,
images
:
Optional
[
PromptImageInput
]
=
None
,
images
:
Optional
[
PromptImageInput
]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
videos
:
Optional
[
PromptVideoInput
]
=
None
,
videos
:
Optional
[
PromptVideoInput
]
=
None
,
stop_token_ids
:
Optional
[
L
ist
[
int
]]
=
None
,
stop_token_ids
:
Optional
[
l
ist
[
int
]]
=
None
,
stop
:
Optional
[
L
ist
[
str
]]
=
None
,
stop
:
Optional
[
l
ist
[
str
]]
=
None
,
**
kwargs
:
Any
,
**
kwargs
:
Any
,
)
->
Union
[
L
ist
[
TokensTextLogprobs
],
)
->
Union
[
l
ist
[
TokensTextLogprobs
],
L
ist
[
TokensTextLogprobsPromptLogprobs
]]:
l
ist
[
TokensTextLogprobsPromptLogprobs
]]:
greedy_logprobs_params
=
SamplingParams
(
greedy_logprobs_params
=
SamplingParams
(
temperature
=
0.0
,
temperature
=
0.0
,
max_tokens
=
max_tokens
,
max_tokens
=
max_tokens
,
...
@@ -874,12 +873,12 @@ class VllmRunner:
...
@@ -874,12 +873,12 @@ class VllmRunner:
def
generate_encoder_decoder_greedy_logprobs
(
def
generate_encoder_decoder_greedy_logprobs
(
self
,
self
,
encoder_decoder_prompts
:
L
ist
[
ExplicitEncoderDecoderPrompt
[
str
,
str
]],
encoder_decoder_prompts
:
l
ist
[
ExplicitEncoderDecoderPrompt
[
str
,
str
]],
max_tokens
:
int
,
max_tokens
:
int
,
num_logprobs
:
int
,
num_logprobs
:
int
,
num_prompt_logprobs
:
Optional
[
int
]
=
None
,
num_prompt_logprobs
:
Optional
[
int
]
=
None
,
)
->
Union
[
L
ist
[
TokensTextLogprobs
],
)
->
Union
[
l
ist
[
TokensTextLogprobs
],
L
ist
[
TokensTextLogprobsPromptLogprobs
]]:
l
ist
[
TokensTextLogprobsPromptLogprobs
]]:
greedy_logprobs_params
=
SamplingParams
(
greedy_logprobs_params
=
SamplingParams
(
temperature
=
0.0
,
temperature
=
0.0
,
max_tokens
=
max_tokens
,
max_tokens
=
max_tokens
,
...
@@ -895,10 +894,10 @@ class VllmRunner:
...
@@ -895,10 +894,10 @@ class VllmRunner:
def
generate_beam_search
(
def
generate_beam_search
(
self
,
self
,
prompts
:
Union
[
L
ist
[
str
],
L
ist
[
L
ist
[
int
]]],
prompts
:
Union
[
l
ist
[
str
],
l
ist
[
l
ist
[
int
]]],
beam_width
:
int
,
beam_width
:
int
,
max_tokens
:
int
,
max_tokens
:
int
,
)
->
L
ist
[
T
uple
[
L
ist
[
L
ist
[
int
]],
L
ist
[
str
]]]:
)
->
l
ist
[
t
uple
[
l
ist
[
l
ist
[
int
]],
l
ist
[
str
]]]:
if
is_list_of
(
prompts
,
str
,
check
=
"all"
):
if
is_list_of
(
prompts
,
str
,
check
=
"all"
):
prompts
=
[
TextPrompt
(
prompt
=
prompt
)
for
prompt
in
prompts
]
prompts
=
[
TextPrompt
(
prompt
=
prompt
)
for
prompt
in
prompts
]
else
:
else
:
...
@@ -915,17 +914,17 @@ class VllmRunner:
...
@@ -915,17 +914,17 @@ class VllmRunner:
returned_outputs
.
append
((
token_ids
,
texts
))
returned_outputs
.
append
((
token_ids
,
texts
))
return
returned_outputs
return
returned_outputs
def
classify
(
self
,
prompts
:
L
ist
[
str
])
->
L
ist
[
L
ist
[
float
]]:
def
classify
(
self
,
prompts
:
l
ist
[
str
])
->
l
ist
[
l
ist
[
float
]]:
req_outputs
=
self
.
model
.
classify
(
prompts
)
req_outputs
=
self
.
model
.
classify
(
prompts
)
return
[
req_output
.
outputs
.
probs
for
req_output
in
req_outputs
]
return
[
req_output
.
outputs
.
probs
for
req_output
in
req_outputs
]
def
encode
(
def
encode
(
self
,
self
,
prompts
:
L
ist
[
str
],
prompts
:
l
ist
[
str
],
images
:
Optional
[
PromptImageInput
]
=
None
,
images
:
Optional
[
PromptImageInput
]
=
None
,
videos
:
Optional
[
PromptVideoInput
]
=
None
,
videos
:
Optional
[
PromptVideoInput
]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
)
->
L
ist
[
L
ist
[
float
]]:
)
->
l
ist
[
l
ist
[
float
]]:
inputs
=
self
.
get_inputs
(
prompts
,
inputs
=
self
.
get_inputs
(
prompts
,
images
=
images
,
images
=
images
,
videos
=
videos
,
videos
=
videos
,
...
@@ -936,9 +935,9 @@ class VllmRunner:
...
@@ -936,9 +935,9 @@ class VllmRunner:
def
score
(
def
score
(
self
,
self
,
text_1
:
Union
[
str
,
L
ist
[
str
]],
text_1
:
Union
[
str
,
l
ist
[
str
]],
text_2
:
Union
[
str
,
L
ist
[
str
]],
text_2
:
Union
[
str
,
l
ist
[
str
]],
)
->
L
ist
[
float
]:
)
->
l
ist
[
float
]:
req_outputs
=
self
.
model
.
score
(
text_1
,
text_2
)
req_outputs
=
self
.
model
.
score
(
text_1
,
text_2
)
return
[
req_output
.
outputs
.
score
for
req_output
in
req_outputs
]
return
[
req_output
.
outputs
.
score
for
req_output
in
req_outputs
]
...
...
tests/core/block/e2e/conftest.py
View file @
cf069aa8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
from
typing
import
Callable
,
Iterable
,
Optional
from
collections.abc
import
Iterable
from
typing
import
Callable
,
Optional
import
pytest
import
pytest
...
...
tests/core/block/e2e/test_correctness_sliding_window.py
View file @
cf069aa8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
import
random
import
random
from
typing
import
List
import
pytest
import
pytest
...
@@ -137,9 +136,9 @@ def prep_prompts(batch_size: int):
...
@@ -137,9 +136,9 @@ def prep_prompts(batch_size: int):
The prompt is just under 10k tokens; sliding window is 4k
The prompt is just under 10k tokens; sliding window is 4k
so the answer is outside sliding window, but should still be correct.
so the answer is outside sliding window, but should still be correct.
"""
"""
prompts
:
L
ist
[
str
]
=
[]
prompts
:
l
ist
[
str
]
=
[]
answer
:
L
ist
[
int
]
=
[]
answer
:
l
ist
[
int
]
=
[]
indices
:
L
ist
[
int
]
=
[]
indices
:
l
ist
[
int
]
=
[]
random
.
seed
(
1
)
random
.
seed
(
1
)
for
_
in
range
(
batch_size
):
for
_
in
range
(
batch_size
):
idx
=
random
.
randint
(
30
,
90
)
idx
=
random
.
randint
(
30
,
90
)
...
@@ -158,7 +157,7 @@ def prep_prompts(batch_size: int):
...
@@ -158,7 +157,7 @@ def prep_prompts(batch_size: int):
return
prompts
,
answer
,
indices
return
prompts
,
answer
,
indices
def
check_answers
(
indices
:
L
ist
[
int
],
answer
:
L
ist
[
int
],
outputs
:
L
ist
[
str
]):
def
check_answers
(
indices
:
l
ist
[
int
],
answer
:
l
ist
[
int
],
outputs
:
l
ist
[
str
]):
answer2
=
[
int
(
text
[
0
:
2
].
strip
())
for
text
in
outputs
]
answer2
=
[
int
(
text
[
0
:
2
].
strip
())
for
text
in
outputs
]
print
(
list
(
zip
(
indices
,
zip
(
answer
,
answer2
))))
print
(
list
(
zip
(
indices
,
zip
(
answer
,
answer2
))))
numok
=
0
numok
=
0
...
@@ -170,7 +169,7 @@ def check_answers(indices: List[int], answer: List[int], outputs: List[str]):
...
@@ -170,7 +169,7 @@ def check_answers(indices: List[int], answer: List[int], outputs: List[str]):
assert
frac_ok
>
0.7
assert
frac_ok
>
0.7
def
check_window
(
prompts
:
L
ist
[
str
]):
def
check_window
(
prompts
:
l
ist
[
str
]):
def
inner
(
llm
:
LLM
):
def
inner
(
llm
:
LLM
):
sliding_window
=
llm
.
llm_engine
.
model_config
.
get_sliding_window
()
sliding_window
=
llm
.
llm_engine
.
model_config
.
get_sliding_window
()
...
...
tests/core/block/test_block_table.py
View file @
cf069aa8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
from
typing
import
List
import
pytest
import
pytest
from
vllm.core.block.block_table
import
BlockTable
from
vllm.core.block.block_table
import
BlockTable
...
@@ -32,7 +30,7 @@ def test_allocate_naive(block_size: int, sequence_len: int):
...
@@ -32,7 +30,7 @@ def test_allocate_naive(block_size: int, sequence_len: int):
token_ids
=
list
(
range
(
sequence_len
))
token_ids
=
list
(
range
(
sequence_len
))
num_blocks_per_alloc
=
len
(
list
(
chunk_list
(
token_ids
,
block_size
)))
num_blocks_per_alloc
=
len
(
list
(
chunk_list
(
token_ids
,
block_size
)))
block_tables
:
L
ist
[
BlockTable
]
=
[]
block_tables
:
l
ist
[
BlockTable
]
=
[]
for
i
in
range
(
5
):
for
i
in
range
(
5
):
assert
allocator
.
get_num_free_blocks
(
assert
allocator
.
get_num_free_blocks
(
device
=
Device
.
GPU
)
==
num_gpu_blocks
-
i
*
num_blocks_per_alloc
device
=
Device
.
GPU
)
==
num_gpu_blocks
-
i
*
num_blocks_per_alloc
...
@@ -77,7 +75,7 @@ def test_allocate_prefix_caching(block_size: int, sequence_len: int):
...
@@ -77,7 +75,7 @@ def test_allocate_prefix_caching(block_size: int, sequence_len: int):
num_immutable_blocks_per_alloc
=
len
(
num_immutable_blocks_per_alloc
=
len
(
chunked_tokens
)
-
num_mutable_blocks_per_alloc
chunked_tokens
)
-
num_mutable_blocks_per_alloc
block_tables
:
L
ist
[
BlockTable
]
=
[]
block_tables
:
l
ist
[
BlockTable
]
=
[]
for
alloc_i
in
range
(
1
,
6
):
for
alloc_i
in
range
(
1
,
6
):
block_tables
.
append
(
block_tables
.
append
(
...
@@ -272,7 +270,7 @@ def test_append_token_ids_correct_content(block_size: int, sequence_len: int,
...
@@ -272,7 +270,7 @@ def test_append_token_ids_correct_content(block_size: int, sequence_len: int,
)
)
block_table
.
allocate
(
token_ids
=
token_ids
,
device
=
Device
.
GPU
)
block_table
.
allocate
(
token_ids
=
token_ids
,
device
=
Device
.
GPU
)
appended_so_far
:
L
ist
[
int
]
=
[]
appended_so_far
:
l
ist
[
int
]
=
[]
for
append
in
chunk_list
(
token_ids_to_append
,
append_size
):
for
append
in
chunk_list
(
token_ids_to_append
,
append_size
):
block_table
.
append_token_ids
(
append
)
block_table
.
append_token_ids
(
append
)
appended_so_far
.
extend
(
append
)
appended_so_far
.
extend
(
append
)
...
...
tests/core/block/test_naive_block.py
View file @
cf069aa8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
from
typing
import
List
,
Optional
from
typing
import
Optional
import
pytest
import
pytest
...
@@ -14,7 +14,7 @@ class TestNaiveBlockAllocator:
...
@@ -14,7 +14,7 @@ class TestNaiveBlockAllocator:
def
create_allocate_lambda
(
allocate_type
:
str
,
def
create_allocate_lambda
(
allocate_type
:
str
,
allocator
:
NaiveBlockAllocator
,
allocator
:
NaiveBlockAllocator
,
prev_block
:
Optional
[
Block
],
prev_block
:
Optional
[
Block
],
token_ids
:
L
ist
[
int
]):
token_ids
:
l
ist
[
int
]):
if
allocate_type
==
"immutable"
:
if
allocate_type
==
"immutable"
:
allocate_block
=
lambda
:
allocator
.
allocate_immutable_block
(
allocate_block
=
lambda
:
allocator
.
allocate_immutable_block
(
prev_block
=
prev_block
,
token_ids
=
token_ids
)
prev_block
=
prev_block
,
token_ids
=
token_ids
)
...
...
tests/core/block/test_prefix_caching_block.py
View file @
cf069aa8
...
@@ -2,7 +2,7 @@
...
@@ -2,7 +2,7 @@
import
math
import
math
import
random
import
random
from
typing
import
List
,
Optional
from
typing
import
Optional
from
unittest.mock
import
MagicMock
from
unittest.mock
import
MagicMock
import
pytest
import
pytest
...
@@ -123,11 +123,11 @@ class TestPrefixCachingBlock:
...
@@ -123,11 +123,11 @@ class TestPrefixCachingBlock:
@
staticmethod
@
staticmethod
def
create_chain
(
block_size
:
int
,
def
create_chain
(
block_size
:
int
,
token_ids
:
L
ist
[
int
],
token_ids
:
l
ist
[
int
],
num_empty_trailing_blocks
=
0
)
->
L
ist
[
PrefixCachingBlock
]:
num_empty_trailing_blocks
=
0
)
->
l
ist
[
PrefixCachingBlock
]:
"""Helper method which creates a chain of blocks.
"""Helper method which creates a chain of blocks.
"""
"""
blocks
:
L
ist
[
PrefixCachingBlock
]
=
[]
blocks
:
l
ist
[
PrefixCachingBlock
]
=
[]
num_blocks
=
math
.
ceil
(
num_blocks
=
math
.
ceil
(
len
(
token_ids
)
/
block_size
)
+
num_empty_trailing_blocks
len
(
token_ids
)
/
block_size
)
+
num_empty_trailing_blocks
...
@@ -161,7 +161,7 @@ class TestPrefixCachingBlockAllocator:
...
@@ -161,7 +161,7 @@ class TestPrefixCachingBlockAllocator:
@
staticmethod
@
staticmethod
def
create_allocate_lambda
(
allocate_type
:
str
,
allocator
:
BlockAllocator
,
def
create_allocate_lambda
(
allocate_type
:
str
,
allocator
:
BlockAllocator
,
prev_block
:
Optional
[
Block
],
prev_block
:
Optional
[
Block
],
token_ids
:
L
ist
[
int
]):
token_ids
:
l
ist
[
int
]):
if
allocate_type
==
"immutable"
:
if
allocate_type
==
"immutable"
:
allocate_block
=
lambda
:
allocator
.
allocate_immutable_block
(
allocate_block
=
lambda
:
allocator
.
allocate_immutable_block
(
prev_block
=
prev_block
,
token_ids
=
token_ids
)
prev_block
=
prev_block
,
token_ids
=
token_ids
)
...
@@ -839,13 +839,13 @@ class TestPrefixCachingBlockAllocator:
...
@@ -839,13 +839,13 @@ class TestPrefixCachingBlockAllocator:
@
staticmethod
@
staticmethod
def
create_immutable_chain
(
def
create_immutable_chain
(
block_size
:
int
,
block_size
:
int
,
token_ids
:
L
ist
[
int
],
token_ids
:
l
ist
[
int
],
allocator
:
PrefixCachingBlockAllocator
,
allocator
:
PrefixCachingBlockAllocator
,
extra_hash
:
Optional
[
int
]
=
None
,
extra_hash
:
Optional
[
int
]
=
None
,
)
->
L
ist
[
PrefixCachingBlock
]:
)
->
l
ist
[
PrefixCachingBlock
]:
"""Helper method which creates a chain of blocks.
"""Helper method which creates a chain of blocks.
"""
"""
blocks
:
L
ist
[
Block
]
=
[]
blocks
:
l
ist
[
Block
]
=
[]
num_blocks
=
math
.
ceil
(
len
(
token_ids
)
/
block_size
)
num_blocks
=
math
.
ceil
(
len
(
token_ids
)
/
block_size
)
if
num_blocks
==
0
:
if
num_blocks
==
0
:
...
...
tests/core/test_chunked_prefill_scheduler.py
View file @
cf069aa8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
from
typing
import
List
from
unittest.mock
import
MagicMock
from
unittest.mock
import
MagicMock
import
pytest
# noqa
import
pytest
# noqa
...
@@ -46,7 +45,7 @@ def test_simple():
...
@@ -46,7 +45,7 @@ def test_simple():
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
8
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
running
:
L
ist
[
SequenceGroup
]
=
[]
running
:
l
ist
[
SequenceGroup
]
=
[]
# Add seq groups to scheduler.
# Add seq groups to scheduler.
for
i
in
range
(
num_seq_group
):
for
i
in
range
(
num_seq_group
):
...
@@ -93,7 +92,7 @@ def test_chunk():
...
@@ -93,7 +92,7 @@ def test_chunk():
cache_config
.
num_cpu_blocks
=
32
cache_config
.
num_cpu_blocks
=
32
cache_config
.
num_gpu_blocks
=
32
cache_config
.
num_gpu_blocks
=
32
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
running
:
L
ist
[
SequenceGroup
]
=
[]
running
:
l
ist
[
SequenceGroup
]
=
[]
# Add seq groups to scheduler.
# Add seq groups to scheduler.
for
i
in
range
(
2
):
for
i
in
range
(
2
):
...
@@ -145,7 +144,7 @@ def test_concurrent_chunking():
...
@@ -145,7 +144,7 @@ def test_concurrent_chunking():
cache_config
.
num_cpu_blocks
=
32
cache_config
.
num_cpu_blocks
=
32
cache_config
.
num_gpu_blocks
=
32
cache_config
.
num_gpu_blocks
=
32
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
running
:
L
ist
[
SequenceGroup
]
=
[]
running
:
l
ist
[
SequenceGroup
]
=
[]
# Add seq groups to scheduler.
# Add seq groups to scheduler.
for
i
in
range
(
2
):
for
i
in
range
(
2
):
...
@@ -226,8 +225,8 @@ def test_short_prompts_jump_long_prompts_in_queue():
...
@@ -226,8 +225,8 @@ def test_short_prompts_jump_long_prompts_in_queue():
cache_config
.
num_cpu_blocks
=
3200
# large KV cache size for large requests
cache_config
.
num_cpu_blocks
=
3200
# large KV cache size for large requests
cache_config
.
num_gpu_blocks
=
3200
cache_config
.
num_gpu_blocks
=
3200
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
long_seqs
:
L
ist
[
SequenceGroup
]
=
[]
long_seqs
:
l
ist
[
SequenceGroup
]
=
[]
short_seqs
:
L
ist
[
SequenceGroup
]
=
[]
short_seqs
:
l
ist
[
SequenceGroup
]
=
[]
# Add 2 large seq groups to scheduler.
# Add 2 large seq groups to scheduler.
for
i
in
range
(
2
):
for
i
in
range
(
2
):
...
@@ -368,7 +367,7 @@ def test_complex():
...
@@ -368,7 +367,7 @@ def test_complex():
cache_config
.
num_cpu_blocks
=
64
cache_config
.
num_cpu_blocks
=
64
cache_config
.
num_gpu_blocks
=
64
cache_config
.
num_gpu_blocks
=
64
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
running
:
L
ist
[
SequenceGroup
]
=
[]
running
:
l
ist
[
SequenceGroup
]
=
[]
# Add seq groups to scheduler.
# Add seq groups to scheduler.
for
i
in
range
(
2
):
for
i
in
range
(
2
):
...
@@ -439,7 +438,7 @@ def test_maximal_decoding():
...
@@ -439,7 +438,7 @@ def test_maximal_decoding():
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
8
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
running
:
L
ist
[
SequenceGroup
]
=
[]
running
:
l
ist
[
SequenceGroup
]
=
[]
# Add seq groups to scheduler.
# Add seq groups to scheduler.
for
i
in
range
(
2
):
for
i
in
range
(
2
):
...
@@ -533,7 +532,7 @@ def test_prompt_limit():
...
@@ -533,7 +532,7 @@ def test_prompt_limit():
cache_config
.
num_cpu_blocks
=
16
cache_config
.
num_cpu_blocks
=
16
cache_config
.
num_gpu_blocks
=
16
cache_config
.
num_gpu_blocks
=
16
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
running
:
L
ist
[
SequenceGroup
]
=
[]
running
:
l
ist
[
SequenceGroup
]
=
[]
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
48
,
prompt_length
=
48
,
...
@@ -565,7 +564,7 @@ def test_prompt_limit_exceed():
...
@@ -565,7 +564,7 @@ def test_prompt_limit_exceed():
cache_config
.
num_cpu_blocks
=
16
cache_config
.
num_cpu_blocks
=
16
cache_config
.
num_gpu_blocks
=
16
cache_config
.
num_gpu_blocks
=
16
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
running
:
L
ist
[
SequenceGroup
]
=
[]
running
:
l
ist
[
SequenceGroup
]
=
[]
_
,
seq_group
=
create_dummy_prompt
(
"2"
,
_
,
seq_group
=
create_dummy_prompt
(
"2"
,
prompt_length
=
48
,
prompt_length
=
48
,
block_size
=
block_size
)
block_size
=
block_size
)
...
@@ -699,7 +698,7 @@ def test_chunked_prefill_max_seqs():
...
@@ -699,7 +698,7 @@ def test_chunked_prefill_max_seqs():
cache_config
.
num_cpu_blocks
=
128
cache_config
.
num_cpu_blocks
=
128
cache_config
.
num_gpu_blocks
=
128
cache_config
.
num_gpu_blocks
=
128
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
running
:
L
ist
[
SequenceGroup
]
=
[]
running
:
l
ist
[
SequenceGroup
]
=
[]
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
65
,
prompt_length
=
65
,
...
@@ -758,7 +757,7 @@ def test_prefix_caching():
...
@@ -758,7 +757,7 @@ def test_prefix_caching():
cache_config
.
num_cpu_blocks
=
0
cache_config
.
num_cpu_blocks
=
0
cache_config
.
num_gpu_blocks
=
32
cache_config
.
num_gpu_blocks
=
32
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
running
:
L
ist
[
SequenceGroup
]
=
[]
running
:
l
ist
[
SequenceGroup
]
=
[]
# Add seq groups to scheduler.
# Add seq groups to scheduler.
for
i
in
range
(
2
):
for
i
in
range
(
2
):
...
@@ -800,7 +799,7 @@ def test_prefix_caching_with_concurrent_partial_prefills():
...
@@ -800,7 +799,7 @@ def test_prefix_caching_with_concurrent_partial_prefills():
cache_config
.
num_cpu_blocks
=
0
cache_config
.
num_cpu_blocks
=
0
cache_config
.
num_gpu_blocks
=
32
cache_config
.
num_gpu_blocks
=
32
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
running
:
L
ist
[
SequenceGroup
]
=
[]
running
:
l
ist
[
SequenceGroup
]
=
[]
# Add seq groups to scheduler.
# Add seq groups to scheduler.
for
i
in
range
(
2
):
for
i
in
range
(
2
):
...
...
tests/core/test_scheduler.py
View file @
cf069aa8
...
@@ -2,7 +2,6 @@
...
@@ -2,7 +2,6 @@
import
time
import
time
from
collections
import
deque
from
collections
import
deque
from
typing
import
List
,
Set
,
Tuple
from
unittest.mock
import
MagicMock
from
unittest.mock
import
MagicMock
import
pytest
# noqa
import
pytest
# noqa
...
@@ -57,7 +56,7 @@ def test_scheduler_abort_seq_group():
...
@@ -57,7 +56,7 @@ def test_scheduler_abort_seq_group():
# Add multiple seq groups to scheduler.
# Add multiple seq groups to scheduler.
num_seq_group
=
4
num_seq_group
=
4
request_ids
:
S
et
[
str
]
=
set
()
request_ids
:
s
et
[
str
]
=
set
()
for
i
in
range
(
num_seq_group
):
for
i
in
range
(
num_seq_group
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
block_size
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
...
@@ -83,7 +82,7 @@ def test_scheduler_schedule_simple():
...
@@ -83,7 +82,7 @@ def test_scheduler_schedule_simple():
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
8
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
running
:
L
ist
[
SequenceGroup
]
=
[]
running
:
l
ist
[
SequenceGroup
]
=
[]
# Add seq groups to scheduler.
# Add seq groups to scheduler.
for
i
in
range
(
num_seq_group
):
for
i
in
range
(
num_seq_group
):
...
@@ -221,7 +220,7 @@ def test_scheduler_max_seqs():
...
@@ -221,7 +220,7 @@ def test_scheduler_max_seqs():
cache_config
.
num_gpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
8
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
all_seq_groups
:
L
ist
[
SequenceGroup
]
=
[]
all_seq_groups
:
l
ist
[
SequenceGroup
]
=
[]
# Add seq groups to scheduler.
# Add seq groups to scheduler.
for
i
in
range
(
num_seq_group
):
for
i
in
range
(
num_seq_group
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
...
@@ -480,7 +479,7 @@ def test_prefill_schedule_max_lora():
...
@@ -480,7 +479,7 @@ def test_prefill_schedule_max_lora():
num_cpu_blocks
=
64
,
num_cpu_blocks
=
64
,
num_gpu_blocks
=
64
)
num_gpu_blocks
=
64
)
budget
=
create_token_budget
(
token_budget
=
120
)
budget
=
create_token_budget
(
token_budget
=
120
)
curr_loras
:
S
et
[
int
]
=
set
()
curr_loras
:
s
et
[
int
]
=
set
()
for
i
in
range
(
2
):
for
i
in
range
(
2
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
prompt_length
=
60
,
...
@@ -651,8 +650,8 @@ def test_schedule_swapped_max_loras():
...
@@ -651,8 +650,8 @@ def test_schedule_swapped_max_loras():
block_size
=
block_size
,
block_size
=
block_size
,
num_cpu_blocks
=
32
,
num_cpu_blocks
=
32
,
num_gpu_blocks
=
32
)
num_gpu_blocks
=
32
)
curr_loras
:
S
et
[
int
]
=
set
()
curr_loras
:
s
et
[
int
]
=
set
()
blocks_to_swap_out
:
L
ist
[
T
uple
[
int
,
int
]]
=
[]
blocks_to_swap_out
:
l
ist
[
t
uple
[
int
,
int
]]
=
[]
for
i
in
range
(
2
):
for
i
in
range
(
2
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
prompt_length
=
60
,
...
@@ -683,7 +682,7 @@ def test_schedule_swapped_cannot_swap_in():
...
@@ -683,7 +682,7 @@ def test_schedule_swapped_cannot_swap_in():
num_cpu_blocks
=
32
,
num_cpu_blocks
=
32
,
num_gpu_blocks
=
32
)
num_gpu_blocks
=
32
)
curr_loras
=
None
curr_loras
=
None
blocks_to_swap_out
:
L
ist
[
T
uple
[
int
,
int
]]
=
[]
blocks_to_swap_out
:
l
ist
[
t
uple
[
int
,
int
]]
=
[]
for
i
in
range
(
2
):
for
i
in
range
(
2
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
prompt_length
=
60
,
...
@@ -714,7 +713,7 @@ def test_infeasible_swap():
...
@@ -714,7 +713,7 @@ def test_infeasible_swap():
num_cpu_blocks
=
32
,
num_cpu_blocks
=
32
,
num_gpu_blocks
=
32
)
num_gpu_blocks
=
32
)
curr_loras
=
None
curr_loras
=
None
blocks_to_swap_out
:
L
ist
[
T
uple
[
int
,
int
]]
=
[]
blocks_to_swap_out
:
l
ist
[
t
uple
[
int
,
int
]]
=
[]
for
i
in
range
(
2
):
for
i
in
range
(
2
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
prompt_length
=
60
,
...
@@ -752,7 +751,7 @@ def test_schedule_swapped_blocks_to_copy():
...
@@ -752,7 +751,7 @@ def test_schedule_swapped_blocks_to_copy():
block_size
=
block_size
)
block_size
=
block_size
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
append_new_token_seq_group
(
60
,
seq_group
,
1
)
append_new_token_seq_group
(
60
,
seq_group
,
1
)
blocks_to_swap_out
:
L
ist
[
T
uple
[
int
,
int
]]
=
[]
blocks_to_swap_out
:
l
ist
[
t
uple
[
int
,
int
]]
=
[]
scheduler
.
_swap_out
(
seq_group
,
blocks_to_swap_out
)
scheduler
.
_swap_out
(
seq_group
,
blocks_to_swap_out
)
scheduler
.
_add_seq_group_to_swapped
(
seq_group
)
scheduler
.
_add_seq_group_to_swapped
(
seq_group
)
...
...
tests/core/test_scheduler_encoder_decoder.py
View file @
cf069aa8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
from
typing
import
List
import
pytest
# noqa
import
pytest
# noqa
from
vllm.config
import
CacheConfig
,
SchedulerConfig
from
vllm.config
import
CacheConfig
,
SchedulerConfig
...
@@ -48,7 +46,7 @@ def test_scheduler_schedule_simple_encoder_decoder():
...
@@ -48,7 +46,7 @@ def test_scheduler_schedule_simple_encoder_decoder():
cache_config
.
num_cpu_blocks
=
16
# enc and dec prompts per seq_group
cache_config
.
num_cpu_blocks
=
16
# enc and dec prompts per seq_group
cache_config
.
num_gpu_blocks
=
16
# enc and dec prompts per seq_group
cache_config
.
num_gpu_blocks
=
16
# enc and dec prompts per seq_group
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
running
:
L
ist
[
SequenceGroup
]
=
[]
running
:
l
ist
[
SequenceGroup
]
=
[]
# Add seq groups to scheduler.
# Add seq groups to scheduler.
req_id_list
=
[]
req_id_list
=
[]
...
...
tests/core/utils.py
View file @
cf069aa8
...
@@ -2,9 +2,8 @@
...
@@ -2,9 +2,8 @@
import
time
import
time
from
collections
import
defaultdict
from
collections
import
defaultdict
from
typing
import
Any
,
Dict
,
List
,
Optional
from
collections.abc
import
Sequence
as
GenericSequence
from
typing
import
Sequence
as
GenericSequence
from
typing
import
Any
,
Optional
from
typing
import
Tuple
from
vllm
import
SamplingParams
from
vllm
import
SamplingParams
from
vllm.core.scheduler
import
Scheduler
,
SchedulerOutputs
from
vllm.core.scheduler
import
Scheduler
,
SchedulerOutputs
...
@@ -20,10 +19,10 @@ def create_dummy_prompt(
...
@@ -20,10 +19,10 @@ def create_dummy_prompt(
block_size
:
Optional
[
int
]
=
None
,
block_size
:
Optional
[
int
]
=
None
,
lora_request
:
Optional
[
LoRARequest
]
=
None
,
lora_request
:
Optional
[
LoRARequest
]
=
None
,
best_of
:
int
=
1
,
best_of
:
int
=
1
,
prompt_tokens
:
Optional
[
L
ist
[
int
]]
=
None
,
prompt_tokens
:
Optional
[
l
ist
[
int
]]
=
None
,
min_tokens
:
int
=
0
,
min_tokens
:
int
=
0
,
max_tokens
:
int
=
16
,
max_tokens
:
int
=
16
,
)
->
T
uple
[
Sequence
,
SequenceGroup
]:
)
->
t
uple
[
Sequence
,
SequenceGroup
]:
if
not
block_size
:
if
not
block_size
:
block_size
=
prompt_length
block_size
=
prompt_length
...
@@ -48,7 +47,7 @@ def create_dummy_prompt(
...
@@ -48,7 +47,7 @@ def create_dummy_prompt(
return
prompt
,
seq_group
return
prompt
,
seq_group
def
create_dummy_lora_sequence
(
request_id
:
int
,
token_ids
:
L
ist
[
int
],
def
create_dummy_lora_sequence
(
request_id
:
int
,
token_ids
:
l
ist
[
int
],
block_size
:
int
,
lora_int_id
:
int
)
->
Sequence
:
block_size
:
int
,
lora_int_id
:
int
)
->
Sequence
:
return
Sequence
(
seq_id
=
request_id
,
return
Sequence
(
seq_id
=
request_id
,
inputs
=
token_inputs
(
token_ids
),
inputs
=
token_inputs
(
token_ids
),
...
@@ -58,7 +57,7 @@ def create_dummy_lora_sequence(request_id: int, token_ids: List[int],
...
@@ -58,7 +57,7 @@ def create_dummy_lora_sequence(request_id: int, token_ids: List[int],
lora_int_id
=
lora_int_id
))
lora_int_id
=
lora_int_id
))
def
create_dummy_sequence
(
request_id
:
int
,
token_ids
:
L
ist
[
int
],
def
create_dummy_sequence
(
request_id
:
int
,
token_ids
:
l
ist
[
int
],
block_size
:
int
)
->
Sequence
:
block_size
:
int
)
->
Sequence
:
return
Sequence
(
return
Sequence
(
seq_id
=
request_id
,
seq_id
=
request_id
,
...
@@ -74,7 +73,7 @@ def create_dummy_prompt_encoder_decoder(
...
@@ -74,7 +73,7 @@ def create_dummy_prompt_encoder_decoder(
block_size
:
Optional
[
int
]
=
None
,
block_size
:
Optional
[
int
]
=
None
,
lora_request
:
Optional
[
LoRARequest
]
=
None
,
lora_request
:
Optional
[
LoRARequest
]
=
None
,
best_of
:
int
=
1
,
best_of
:
int
=
1
,
)
->
T
uple
[
Sequence
,
Sequence
,
SequenceGroup
]:
)
->
t
uple
[
Sequence
,
Sequence
,
SequenceGroup
]:
if
not
block_size
:
if
not
block_size
:
block_size
=
decoder_prompt_length
block_size
=
decoder_prompt_length
...
@@ -125,7 +124,7 @@ def create_seq_group(
...
@@ -125,7 +124,7 @@ def create_seq_group(
prompt_token_ids
=
[
0
]
*
seq_prompt_len
prompt_token_ids
=
[
0
]
*
seq_prompt_len
seqs
:
L
ist
[
Sequence
]
=
[]
seqs
:
l
ist
[
Sequence
]
=
[]
for
seq_id_offset
,
output_len
in
enumerate
(
seq_output_lens
):
for
seq_id_offset
,
output_len
in
enumerate
(
seq_output_lens
):
seq
=
Sequence
(
seq
=
Sequence
(
seq_id
=
seq_id_start
+
seq_id_offset
,
seq_id
=
seq_id_start
+
seq_id_offset
,
...
@@ -241,7 +240,7 @@ class SchedulerProxy:
...
@@ -241,7 +240,7 @@ class SchedulerProxy:
def
__init__
(
self
,
scheduler
:
Scheduler
):
def
__init__
(
self
,
scheduler
:
Scheduler
):
self
.
scheduler_
=
scheduler
self
.
scheduler_
=
scheduler
self
.
call_history
:
D
ict
[
str
,
L
ist
[
Any
]]
=
defaultdict
(
list
)
self
.
call_history
:
d
ict
[
str
,
l
ist
[
Any
]]
=
defaultdict
(
list
)
def
__getattr__
(
self
,
name
:
str
)
->
Any
:
def
__getattr__
(
self
,
name
:
str
)
->
Any
:
...
@@ -253,6 +252,6 @@ class SchedulerProxy:
...
@@ -253,6 +252,6 @@ class SchedulerProxy:
return
wrapper
return
wrapper
def
last_schedule_ret
(
def
last_schedule_ret
(
self
,
)
->
T
uple
[
L
ist
[
SequenceGroupMetadata
],
SchedulerOutputs
,
Any
]:
self
,
)
->
t
uple
[
l
ist
[
SequenceGroupMetadata
],
SchedulerOutputs
,
Any
]:
_
,
_
,
ret
=
self
.
call_history
[
"schedule"
][
-
1
]
_
,
_
,
ret
=
self
.
call_history
[
"schedule"
][
-
1
]
return
ret
return
ret
tests/distributed/test_expert_parallel.py
View file @
cf069aa8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
from
dataclasses
import
dataclass
from
dataclasses
import
dataclass
from
typing
import
List
,
Literal
,
NamedTuple
,
Optional
from
typing
import
Literal
,
NamedTuple
,
Optional
import
pytest
import
pytest
...
@@ -28,8 +28,8 @@ class EPTestOptions(NamedTuple):
...
@@ -28,8 +28,8 @@ class EPTestOptions(NamedTuple):
@
dataclass
@
dataclass
class
EPTestSettings
:
class
EPTestSettings
:
parallel_setups
:
L
ist
[
ParallelSetup
]
parallel_setups
:
l
ist
[
ParallelSetup
]
distributed_backends
:
L
ist
[
str
]
distributed_backends
:
l
ist
[
str
]
task
:
TaskOption
task
:
TaskOption
test_options
:
EPTestOptions
test_options
:
EPTestOptions
...
...
tests/distributed/test_pipeline_parallel.py
View file @
cf069aa8
...
@@ -9,7 +9,7 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node
...
@@ -9,7 +9,7 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node
import
json
import
json
import
os
import
os
from
dataclasses
import
dataclass
from
dataclasses
import
dataclass
from
typing
import
List
,
Literal
,
NamedTuple
,
Optional
from
typing
import
Literal
,
NamedTuple
,
Optional
import
pytest
import
pytest
...
@@ -38,14 +38,14 @@ class PPTestOptions(NamedTuple):
...
@@ -38,14 +38,14 @@ class PPTestOptions(NamedTuple):
@
dataclass
@
dataclass
class
PPTestSettings
:
class
PPTestSettings
:
parallel_setups
:
L
ist
[
ParallelSetup
]
parallel_setups
:
l
ist
[
ParallelSetup
]
# NOTE: the length of distributed_backends and
# NOTE: the length of distributed_backends and
# vllm_major_versions should be the same, and they
# vllm_major_versions should be the same, and they
# are first zipped together to iterate over all
# are first zipped together to iterate over all
# test settings.
# test settings.
distributed_backends
:
L
ist
[
str
]
distributed_backends
:
l
ist
[
str
]
# vllm major version: "0" for V0, "1" for V1
# vllm major version: "0" for V0, "1" for V1
vllm_major_versions
:
L
ist
[
str
]
vllm_major_versions
:
l
ist
[
str
]
task
:
TaskOption
task
:
TaskOption
test_options
:
PPTestOptions
test_options
:
PPTestOptions
...
...
tests/distributed/test_pynccl.py
View file @
cf069aa8
...
@@ -2,7 +2,6 @@
...
@@ -2,7 +2,6 @@
import
multiprocessing
import
multiprocessing
import
os
import
os
from
typing
import
Dict
,
List
import
pytest
import
pytest
import
torch
import
torch
...
@@ -20,9 +19,9 @@ from vllm.utils import update_environment_variables
...
@@ -20,9 +19,9 @@ from vllm.utils import update_environment_variables
def
distributed_run
(
fn
,
world_size
):
def
distributed_run
(
fn
,
world_size
):
number_of_processes
=
world_size
number_of_processes
=
world_size
processes
:
L
ist
[
multiprocessing
.
Process
]
=
[]
processes
:
l
ist
[
multiprocessing
.
Process
]
=
[]
for
i
in
range
(
number_of_processes
):
for
i
in
range
(
number_of_processes
):
env
:
D
ict
[
str
,
str
]
=
{}
env
:
d
ict
[
str
,
str
]
=
{}
env
[
'RANK'
]
=
str
(
i
)
env
[
'RANK'
]
=
str
(
i
)
env
[
'LOCAL_RANK'
]
=
str
(
i
)
env
[
'LOCAL_RANK'
]
=
str
(
i
)
env
[
'WORLD_SIZE'
]
=
str
(
number_of_processes
)
env
[
'WORLD_SIZE'
]
=
str
(
number_of_processes
)
...
...
tests/distributed/test_shm_broadcast.py
View file @
cf069aa8
...
@@ -3,7 +3,6 @@
...
@@ -3,7 +3,6 @@
import
multiprocessing
import
multiprocessing
import
random
import
random
import
time
import
time
from
typing
import
List
import
numpy
as
np
import
numpy
as
np
import
torch.distributed
as
dist
import
torch.distributed
as
dist
...
@@ -13,7 +12,7 @@ from vllm.distributed.utils import StatelessProcessGroup
...
@@ -13,7 +12,7 @@ from vllm.distributed.utils import StatelessProcessGroup
from
vllm.utils
import
get_ip
,
get_open_port
,
update_environment_variables
from
vllm.utils
import
get_ip
,
get_open_port
,
update_environment_variables
def
get_arrays
(
n
:
int
,
seed
:
int
=
0
)
->
L
ist
[
np
.
ndarray
]:
def
get_arrays
(
n
:
int
,
seed
:
int
=
0
)
->
l
ist
[
np
.
ndarray
]:
np
.
random
.
seed
(
seed
)
np
.
random
.
seed
(
seed
)
sizes
=
np
.
random
.
randint
(
1
,
10_000
,
n
)
sizes
=
np
.
random
.
randint
(
1
,
10_000
,
n
)
# on average, each array will have 5k elements
# on average, each array will have 5k elements
...
...
tests/encoder_decoder/test_e2e_correctness.py
View file @
cf069aa8
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
Run `pytest tests/encoder_decoder/test_e2e_correctness.py`.
Run `pytest tests/encoder_decoder/test_e2e_correctness.py`.
"""
"""
from
typing
import
List
,
Optional
,
Tuple
from
typing
import
Optional
import
pytest
import
pytest
from
transformers
import
AutoModelForSeq2SeqLM
from
transformers
import
AutoModelForSeq2SeqLM
...
@@ -22,7 +22,7 @@ LIST_ENC_DEC_SUPPORTED_BACKENDS = [
...
@@ -22,7 +22,7 @@ LIST_ENC_DEC_SUPPORTED_BACKENDS = [
def
vllm_to_hf_output
(
def
vllm_to_hf_output
(
vllm_output
:
T
uple
[
L
ist
[
int
],
str
,
Optional
[
SampleLogprobs
]],
vllm_output
:
t
uple
[
l
ist
[
int
],
str
,
Optional
[
SampleLogprobs
]],
decoder_prompt_type
:
DecoderPromptType
,
decoder_prompt_type
:
DecoderPromptType
,
):
):
"""Sanitize vllm output to be comparable with hf output."""
"""Sanitize vllm output to be comparable with hf output."""
...
...
Prev
1
2
3
4
5
6
7
…
15
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment