Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
cc7f22a8
Commit
cc7f22a8
authored
Jun 11, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.9.1' into v0.9.1-ori
parents
b9ea0c09
b6553be1
Changes
1000
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
778 additions
and
85 deletions
+778
-85
tests/v1/test_metrics_reader.py
tests/v1/test_metrics_reader.py
+1
-0
tests/v1/test_oracle.py
tests/v1/test_oracle.py
+1
-0
tests/v1/test_serial_utils.py
tests/v1/test_serial_utils.py
+1
-0
tests/v1/test_utils.py
tests/v1/test_utils.py
+1
-0
tests/v1/tpu/test_basic.py
tests/v1/tpu/test_basic.py
+1
-0
tests/v1/tpu/test_mha_attn.py
tests/v1/tpu/test_mha_attn.py
+1
-0
tests/v1/tpu/test_multimodal.py
tests/v1/tpu/test_multimodal.py
+1
-0
tests/v1/tpu/test_pallas.py
tests/v1/tpu/test_pallas.py
+1
-0
tests/v1/tpu/test_perf.py
tests/v1/tpu/test_perf.py
+1
-0
tests/v1/tpu/test_sampler.py
tests/v1/tpu/test_sampler.py
+1
-0
tests/v1/tpu/test_spmd_model_weight_loading.py
tests/v1/tpu/test_spmd_model_weight_loading.py
+70
-0
tests/v1/tpu/test_topk_topp_sampler.py
tests/v1/tpu/test_topk_topp_sampler.py
+1
-0
tests/v1/tpu/test_tpu_qkv_linear.py
tests/v1/tpu/test_tpu_qkv_linear.py
+89
-0
tests/v1/tpu/worker/test_tpu_model_runner.py
tests/v1/tpu/worker/test_tpu_model_runner.py
+278
-34
tests/v1/worker/test_gpu_input_batch.py
tests/v1/worker/test_gpu_input_batch.py
+5
-27
tests/v1/worker/test_gpu_model_runner.py
tests/v1/worker/test_gpu_model_runner.py
+321
-24
tests/vllm_test_utils/setup.py
tests/vllm_test_utils/setup.py
+1
-0
tests/vllm_test_utils/vllm_test_utils/__init__.py
tests/vllm_test_utils/vllm_test_utils/__init__.py
+1
-0
tests/vllm_test_utils/vllm_test_utils/blame.py
tests/vllm_test_utils/vllm_test_utils/blame.py
+1
-0
tests/vllm_test_utils/vllm_test_utils/monitor.py
tests/vllm_test_utils/vllm_test_utils/monitor.py
+1
-0
No files found.
Too many changes to show.
To preserve performance only
1000 of 1000+
files are displayed.
Plain diff
Email patch
tests/v1/test_metrics_reader.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
prometheus_client
import
pytest
...
...
tests/v1/test_oracle.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
os
import
pytest
...
...
tests/v1/test_serial_utils.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
collections
import
UserDict
from
dataclasses
import
dataclass
from
typing
import
Optional
...
...
tests/v1/test_utils.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
torch
...
...
tests/v1/tpu/test_basic.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""A basic correctness check for TPUs
Run `pytest tests/v1/tpu/test_basic.py`.
...
...
tests/v1/tpu/test_mha_attn.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Test:
...
...
tests/v1/tpu/test_multimodal.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
openai
import
pytest
...
...
tests/v1/tpu/test_pallas.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
unittest.mock
import
ANY
,
patch
import
torch
...
...
tests/v1/tpu/test_perf.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""A basic performance regression test for TPUs
Run `pytest tests/v1/tpu/test_perf.py`.
...
...
tests/v1/tpu/test_sampler.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
random
import
pytest
...
...
tests/v1/tpu/test_spmd_model_weight_loading.py
0 → 100644
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
import
gc
import
tempfile
import
numpy
as
np
import
pytest
import
torch_xla.distributed.spmd
as
xs
import
torch_xla.runtime
as
xr
from
vllm.config
import
set_current_vllm_config
from
vllm.distributed.parallel_state
import
(
ensure_model_parallel_initialized
,
init_distributed_environment
)
from
vllm.engine.arg_utils
import
EngineArgs
from
vllm.model_executor.model_loader.tpu
import
TPUModelLoader
def _setup_environment(model):
    """Create the engine config for ``model`` and initialise distributed state.

    With the new config installed as the current vLLM config, a file-based
    ``gloo`` distributed environment is initialised and model parallelism is
    set up with sizes ``(1, 1)``.

    Args:
        model: Model name passed to ``EngineArgs``.

    Returns:
        The config produced by ``EngineArgs.create_engine_config()``.
    """
    engine_args = EngineArgs(model=model, )
    vllm_config = engine_args.create_engine_config()
    with set_current_vllm_config(vllm_config):
        # ``tempfile.mkstemp()[1]`` would drop the open OS-level descriptor
        # on the floor and leak it. NamedTemporaryFile(delete=False) creates
        # the same kind of empty file, and the ``with`` closes the handle
        # while leaving the file on disk for the file:// rendezvous.
        with tempfile.NamedTemporaryFile(delete=False) as rendezvous_file:
            temp_file = rendezvous_file.name
        init_distributed_environment(
            1,
            0,
            local_rank=0,
            distributed_init_method=f"file://{temp_file}",
            backend="gloo")
        # Under single worker mode, full model is init first and then
        # partitioned using GSPMD.
        ensure_model_parallel_initialized(1, 1)
    return vllm_config
# Lazily created mesh, shared by every caller in this module.
MESH = None


def _get_spmd_mesh():
    """Return the module-wide SPMD mesh, creating it on first use.

    The first call switches the XLA runtime to SPMD mode and builds a
    ``(num_devices, 1)`` mesh with axis names ``('x', 'y')``. Later calls
    return the cached object.
    """
    global MESH
    if MESH is not None:
        return MESH

    xr.use_spmd()
    device_count = xr.global_runtime_device_count()
    MESH = xs.Mesh(np.array(range(device_count)), (device_count, 1),
                   ('x', 'y'))
    return MESH
@pytest.mark.parametrize(
    "model",
    [
        "Qwen/Qwen2-1.5B-Instruct",
        # Skip large models due to CI runner disk space limitations
        # "meta-llama/Llama-3.1-8B-Instruct",
        # "meta-llama/Llama-3.1-70B-Instruct",
    ])
def test_tpu_model_loader(model):
    """Load ``model`` through ``TPUModelLoader`` on the SPMD mesh.

    The test passes if loading completes without raising; the loaded model
    is dropped immediately afterwards.
    """
    # Skip the 70B test if there are less than 8 chips
    # TODO: Query using torch xla API, the query API is not working
    # with SPMD now. However, This test is running under SPMD mode.
    if '70B' in model and xr.global_runtime_device_count() < 8:
        # Implicit string concatenation instead of a backslash continuation
        # inside the literal, which embedded the next line's indentation in
        # the skip message.
        pytest.skip("Skipping 70B model if the TPU VM has less than 8 chips "
                    "to avoid OOM.")

    vllm_config = _setup_environment(model)
    loader = TPUModelLoader(load_config=vllm_config.load_config)
    mesh = _get_spmd_mesh()
    model = loader.load_model(vllm_config, vllm_config.model_config, mesh)
    # Drop the only reference to the loaded model and run a collection pass.
    del model
    gc.collect()
tests/v1/tpu/test_topk_topp_sampler.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
math
import
pytest
...
...
tests/v1/tpu/test_tpu_qkv_linear.py
0 → 100644
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
import
tempfile
import
numpy
as
np
import
pytest
import
torch
import
torch_xla.distributed.spmd
as
xs
import
torch_xla.runtime
as
xr
from
vllm.config
import
set_current_vllm_config
from
vllm.distributed.parallel_state
import
(
ensure_model_parallel_initialized
,
init_distributed_environment
)
from
vllm.distributed.tpu_distributed_utils
import
XlaQKVParallelLinear
from
vllm.engine.arg_utils
import
EngineArgs
from
vllm.model_executor.layers.linear
import
QKVParallelLinear
@pytest.fixture(autouse=True)
def setup_environment():
    """Initialise a distributed environment before every test in this module.

    A small engine config is created only so that the distributed
    environment can be initialised, which ``QKVParallelLinear`` requires.
    The fixture yields while that config is still installed as the current
    vLLM config.
    """
    # This is a fake config used for init dist env.
    # QKVParallelLinear needs dist env to be initialized.
    engine_args = EngineArgs(
        model="Qwen/Qwen2-1.5B-Instruct",
        max_model_len=64,
        max_num_batched_tokens=64,
        max_num_seqs=4,
    )
    vllm_config = engine_args.create_engine_config()
    with set_current_vllm_config(vllm_config):
        # ``tempfile.mkstemp()[1]`` would leak the open descriptor on every
        # test (the fixture is autouse). Create the file, close the handle
        # right away, and keep the path for the file:// rendezvous.
        with tempfile.NamedTemporaryFile(delete=False) as rendezvous_file:
            temp_file = rendezvous_file.name
        init_distributed_environment(
            1,
            0,
            local_rank=0,
            distributed_init_method=f"file://{temp_file}",
            backend="gloo")
        ensure_model_parallel_initialized(1, 1)
        yield
# Cached mesh; populated by the first call to _get_spmd_mesh().
MESH = None


def _get_spmd_mesh():
    """Build the SPMD mesh once and hand back the cached instance after that.

    Creating the mesh enables SPMD mode on the XLA runtime and lays all
    runtime devices out as a ``(num_devices, 1)`` mesh with axes
    ``('x', 'y')``.
    """
    global MESH
    if MESH is not None:
        return MESH

    xr.use_spmd()
    device_count = xr.global_runtime_device_count()
    shape = (device_count, 1)
    ids = np.array(range(device_count))
    MESH = xs.Mesh(ids, shape, ('x', 'y'))
    return MESH
@pytest.mark.parametrize("bias", [False, True])
@pytest.mark.parametrize("device", ['cpu', 'xla'])
# `xr.use_spmd()` will set a global state, and this state is not reversible.
# Therefore, non-SPMD tests should be run before SPMD tests.
# To honour that, the mesh is passed as a factory and only built inside the
# test body; calling `_get_spmd_mesh()` here would enable SPMD at import
# time, before any non-SPMD case runs. This decorator is also the innermost
# `parametrize`, so `mesh` varies slowest and every `None` case runs before
# the first SPMD case.
@pytest.mark.parametrize("mesh", [None, _get_spmd_mesh])
@torch.no_grad()
def test_xla_qkv_linear(bias, mesh, device):
    """Check that ``XlaQKVParallelLinear`` matches ``QKVParallelLinear``.

    Args:
        bias: Whether the reference layer is built with a bias term.
        mesh: ``None`` for the non-SPMD case, otherwise a zero-argument
            factory that returns the SPMD mesh.
        device: Device both layers and the input are moved to.
    """
    if mesh is not None:
        # Resolve the factory lazily; this is the point where SPMD mode is
        # switched on.
        mesh = mesh()

    torch.manual_seed(123)

    qkv_linear = QKVParallelLinear(
        hidden_size=4096,
        head_size=128,
        total_num_heads=32,
        total_num_kv_heads=8,
        bias=bias,
        params_dtype=torch.bfloat16,
        return_bias=False,
    )

    # Replace the parameters with small random values.
    qkv_linear.weight.data = torch.rand_like(qkv_linear.weight.data) / 10
    if bias:
        qkv_linear.bias.data = torch.rand_like(qkv_linear.bias.data)

    # The XLA layer is built from the reference layer before either is moved.
    xla_qkv_linear = XlaQKVParallelLinear(qkv_linear, mesh=mesh)

    qkv_linear = qkv_linear.to(device)
    xla_qkv_linear = xla_qkv_linear.to(device)
    input_tensor = torch.rand(10, 4096, dtype=torch.bfloat16) / 10
    input_tensor = input_tensor.to(device)

    output = qkv_linear(input_tensor)
    xla_output = xla_qkv_linear(input_tensor)
    assert torch.allclose(output.cpu(), xla_output.cpu())
tests/v1/tpu/worker/test_tpu_model_runner.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
import
unittest.mock
as
mock
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
from
vllm.config
import
CacheConfig
,
ModelConfig
,
SchedulerConfig
,
VllmConfig
from
vllm.attention.layer
import
Attention
from
vllm.config
import
(
CacheConfig
,
ModelConfig
,
SchedulerConfig
,
VllmConfig
,
set_current_vllm_config
)
from
vllm.sampling_params
import
SamplingParams
from
vllm.utils
import
GiB_bytes
from
vllm.v1.core.kv_cache_utils
import
(
estimate_max_model_len
,
get_kv_cache_config
)
from
vllm.v1.core.sched.output
import
(
CachedRequestData
,
NewRequestData
,
SchedulerOutput
)
from
vllm.v1.worker.tpu_model_runner
import
(
TPUModelRunner
,
_get_padded_num_reqs_with_upper_limit
,
_get_padded_token_len
,
_get_req_paddings
,
_get_token_paddings
)
# Mock torch_xla module since it may not be available in the test environments
torch_xla_patcher
=
mock
.
patch
.
dict
(
"sys.modules"
,
{
"torch_xla"
:
mock
.
MagicMock
(),
"torch_xla.core.xla_model"
:
mock
.
MagicMock
(),
"torch_xla.runtime"
:
mock
.
MagicMock
(),
})
torch_xla_patcher
.
start
()
# Mock the PallasAttentionBackend
pallas_attention_backend_patcher
=
mock
.
patch
(
"vllm.v1.worker.tpu_model_runner.PallasAttentionBackend"
,
)
pallas_attention_backend_patcher
.
start
()
@
pytest
.
fixture
def
model_runner
():
# Patchers have already been started at module level.
def
get_vllm_config
():
scheduler_config
=
SchedulerConfig
(
max_num_seqs
=
10
,
max_num_batched_tokens
=
512
,
...
...
@@ -54,18 +43,19 @@ def model_runner():
cache_config
=
cache_config
,
scheduler_config
=
scheduler_config
,
)
return
vllm_config
def
get_model_runner
(
vllm_config
):
device
=
"xla:0"
# Mocking TPU device
with
mock
.
patch
(
"vllm.v1.worker.tpu_model_runner.torch"
),
\
mock
.
patch
(
"vllm.v1.worker.tpu_model_runner.xm"
),
\
mock
.
patch
(
"vllm.v1.worker.tpu_model_runner.xr"
):
return
TPUModelRunner
(
vllm_config
,
device
)
@
pytest
.
fixture
(
autouse
=
True
,
scope
=
"session"
)
def
cleanup_patches
():
yield
torch_xla_patcher
.
stop
()
pallas_attention_backend_patcher
.
stop
(
)
@
pytest
.
fixture
def
model_runner
():
# Patchers have already been started at module level.
vllm_config
=
get_vllm_config
()
return
get_model_runner
(
vllm_config
)
def
_schedule_new_request
(
*
req_ids
:
str
)
->
SchedulerOutput
:
...
...
@@ -81,7 +71,7 @@ def _schedule_new_request(*req_ids: str) -> SchedulerOutput:
mm_hashes
=
[],
mm_positions
=
[],
sampling_params
=
SamplingParams
(),
block_ids
=
[
0
],
block_ids
=
(
[
0
],
),
# block_ids should be tuple[list[int]]
num_computed_tokens
=
0
,
lora_request
=
None
,
))
...
...
@@ -112,14 +102,35 @@ def _is_req_added(model_runner, req_id: str) -> bool:
def
_is_req_state_block_table_match
(
model_runner
,
req_id
:
str
)
->
bool
:
"""Check if the request state block IDs match the block table.
This function handles both legacy BlockTable and new MultiGroupBlockTable
structures for backward compatibility.
"""
req_index
=
model_runner
.
input_batch
.
req_id_to_index
[
req_id
]
block_table
=
model_runner
.
input_batch
.
block_table
multi_group_
block_table
=
model_runner
.
input_batch
.
block_table
req_state
=
model_runner
.
requests
[
req_id
]
if
block_table
.
num_blocks_per_row
[
req_index
]
!=
len
(
req_state
.
block_ids
):
# Access the first block table from MultiGroupBlockTable
# This is safe since we currently only use single KV cache groups
block_table
=
multi_group_block_table
[
0
]
# req_state.block_ids is now tuple[list[int], ...] for MultiGroupBlockTable
# Extract the first group's block IDs
if
isinstance
(
req_state
.
block_ids
[
0
],
list
):
# New format: tuple[list[int], ...] - extract first group
req_block_ids
=
req_state
.
block_ids
[
0
]
else
:
# Legacy format: list[int] - use directly
req_block_ids
=
req_state
.
block_ids
if
block_table
.
num_blocks_per_row
[
req_index
]
!=
len
(
req_block_ids
):
return
False
num_blocks
=
block_table
.
num_blocks_per_row
[
req_index
]
return
(
block_table
.
block_table_np
[
req_index
,
:
num_blocks
]
==
req_state
.
block_ids
).
all
()
block_table_values
=
block_table
.
block_table_np
[
req_index
,
:
num_blocks
]
return
(
block_table_values
==
req_
block_ids
).
all
()
def
test_update_states_new_request
(
model_runner
):
...
...
@@ -199,7 +210,7 @@ def test_update_states_request_resumed(model_runner):
req_id
=
req_id
,
resumed_from_preemption
=
False
,
new_token_ids
=
[],
new_block_ids
=
[],
new_block_ids
=
(
[],
),
num_computed_tokens
=
0
,
)
...
...
@@ -341,3 +352,236 @@ def test_get_req_paddings():
assert
_get_req_paddings
(
1
,
32
)
==
[
8
,
16
,
32
]
assert
_get_req_paddings
(
8
,
32
)
==
[
8
,
16
,
32
]
assert
_get_req_paddings
(
8
,
36
)
==
[
8
,
16
,
32
,
36
]
def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order(
        model_runner):
    """Constructing a layer whose KV-sharing target comes later must fail.

    Layer 0 names layer 1 as its sharing target; a ``ValueError`` whose
    message contains the expected text is required.
    """
    import re  # local import: only needed to escape the expected message

    layer_0 = "model.layers.0.self_attn.attn"
    layer_1 = "model.layers.1.self_attn.attn"
    error_msg = f"{layer_1} must come before the current layer"
    vllm_config = model_runner.vllm_config
    # ``match`` is a regular expression; escape the message so that the dots
    # in the layer name are matched literally instead of as wildcards.
    with pytest.raises(ValueError, match=re.escape(error_msg)), \
            set_current_vllm_config(vllm_config):
        fwd_context = {
            # initialization below will fail because the target layer is
            # invalid: layer 0 points at layer 1, which is listed after it
            layer_0:
            Attention(
                num_heads=8,
                head_size=128,
                scale=1.0,
                prefix=layer_0,
                kv_sharing_target_layer_name=layer_1,
            ),
            layer_1:
            Attention(
                num_heads=8,
                head_size=128,
                scale=1.0,
                prefix=layer_1,
            )
        }
        # suppress var not used error
        assert fwd_context is not None
def test_init_kv_cache_with_kv_sharing_target_layer_not_exist(model_runner):
    """A KV-sharing target that names no constructed layer must be rejected.

    Layer 1 names ``model.layers.0.cross_attn.attn`` as its sharing target,
    which is not one of the layers built here; a ``ValueError`` whose
    message contains the expected text is required.
    """
    import re  # local import: only needed to escape the expected message

    layer_0 = "model.layers.0.self_attn.attn"
    layer_1 = "model.layers.1.self_attn.attn"
    invalid_layer = "model.layers.0.cross_attn.attn"
    error_msg = f"{invalid_layer} is not a valid Attention layer in the model"
    vllm_config = model_runner.vllm_config
    # ``match`` is a regular expression; escape the message so that the dots
    # in the layer name are matched literally instead of as wildcards.
    with pytest.raises(ValueError, match=re.escape(error_msg)), \
            set_current_vllm_config(vllm_config):
        fwd_context = {
            layer_0:
            Attention(
                num_heads=8,
                head_size=128,
                scale=1.0,
                prefix=layer_0,
            ),
            layer_1:
            Attention(
                num_heads=8,
                head_size=128,
                scale=1.0,
                prefix=layer_1,
                # invalid layer: cross_attn.attn doesn't exist!
                kv_sharing_target_layer_name=invalid_layer,
            )
        }
        # suppress var not used error
        assert fwd_context is not None
def test_init_kv_cache_with_kv_sharing_target_same_as_current(model_runner):
    """A layer may not name itself as its own KV-sharing target.

    Layer 1 is constructed with itself as the sharing target; a
    ``ValueError`` whose message contains the expected text is required.
    """
    import re  # local import: only needed to escape the expected message

    layer_0 = "model.layers.0.self_attn.attn"
    layer_1 = "model.layers.1.self_attn.attn"
    error_msg = f"{layer_1} cannot be the same as the current layer"
    vllm_config = model_runner.vllm_config
    # ``match`` is a regular expression; escape the message so that the dots
    # in the layer name are matched literally instead of as wildcards.
    with pytest.raises(ValueError, match=re.escape(error_msg)), \
            set_current_vllm_config(vllm_config):
        fwd_context = {
            layer_0:
            Attention(
                num_heads=8,
                head_size=128,
                scale=1.0,
                prefix=layer_0,
            ),
            # initialization below will fail because layer 1 names itself
            # as its KV-sharing target
            layer_1:
            Attention(
                num_heads=8,
                head_size=128,
                scale=1.0,
                prefix=layer_1,
                kv_sharing_target_layer_name=layer_1,
            )
        }
        # suppress var not used error
        assert fwd_context is not None
def test_init_kv_cache_without_kv_sharing():
    """Baseline: two attention layers with no KV sharing.

    Checks that each layer gets its own KV cache spec and tensor, that the
    block count and estimated maximum context length match the expected
    values, and that after ``initialize_kv_cache`` the two layers hold
    distinct KV cache objects inside a single KV cache group.
    """
    layer_0 = "model.layers.0.self_attn.attn"
    layer_1 = "model.layers.1.self_attn.attn"
    vllm_config = get_vllm_config()
    with set_current_vllm_config(vllm_config):
        fwd_context = {
            layer_0:
            Attention(
                num_heads=8,
                head_size=128,
                scale=1.0,
                prefix=layer_0,
            ),
            layer_1:
            Attention(
                num_heads=8,
                head_size=128,
                scale=1.0,
                prefix=layer_1,
            )
        }
        # suppress var not used error
        assert fwd_context is not None
    # Set high context length to test max context length estimation
    vllm_config.model_config.max_model_len = 1_000_000
    vllm_ctx = vllm_config.compilation_config.static_forward_context
    model_runner = get_model_runner(vllm_config)
    kv_cache_spec = model_runner.get_kv_cache_spec()
    # One spec per layer, and nothing is registered as shared.
    assert len(kv_cache_spec) == 2
    assert len(model_runner.shared_kv_cache_layers) == 0

    available_memory = 20 * GiB_bytes
    # page size for each layer KV can be calculated as
    # 2 (non-MLA) * 8 (num_heads) * 128 (head_dim)
    # * 2 (bfloat16, kv_cache dtype) * 128 (block_size) = 512KB
    num_expected_blocks = 20480  # 20GB / 512KB / 2 (num layers)
    kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec,
                                          available_memory)
    assert kv_cache_config.num_blocks == num_expected_blocks
    # The available memory is split evenly between the two tensors.
    assert len(kv_cache_config.kv_cache_tensors) == 2
    assert kv_cache_config.kv_cache_tensors[0].size == available_memory // 2
    assert kv_cache_config.kv_cache_tensors[1].size == available_memory // 2

    max_context_len =\
        estimate_max_model_len(vllm_config, kv_cache_spec, 5 * GiB_bytes)
    # This is the baseline value; the KV-sharing test expects 2x this.
    # max_context_len = available_memory / (page_size / block_size) / num_caches
    # max_context_len = 5GB / (512KB / 128) / 2 = 655360
    assert max_context_len == 655360

    # important: override tensor size to prevent large mem alloc during test
    # this will only allocate 2 block worth of memory (2 * 512kb)
    kv_cache_config.num_blocks = 1
    for kv_cache_tensor in kv_cache_config.kv_cache_tensors:
        kv_cache_tensor.size = (
            kv_cache_spec[kv_cache_tensor.shared_by[0]].page_size_bytes)

    model_runner.initialize_kv_cache(kv_cache_config)

    layer_0_kv = vllm_ctx[layer_0].kv_cache[0]
    layer_1_kv = vllm_ctx[layer_1].kv_cache[0]
    # check layer 1 kv cache does NOT share memory with layer 0
    assert id(layer_1_kv) != id(layer_0_kv)

    # check both layers are listed, in order, in the single kv cache group
    assert len(kv_cache_config.kv_cache_groups) == 1
    assert len(kv_cache_config.kv_cache_groups[0].layer_names) == 2
    assert kv_cache_config.kv_cache_groups[0].layer_names[0] == layer_0
    assert kv_cache_config.kv_cache_groups[0].layer_names[1] == layer_1
def test_init_kv_cache_with_kv_sharing_valid():
    """Layer 1 shares layer 0's KV cache.

    Checks that only layer 0 gets a KV cache spec, that layer 1 is recorded
    as sharing with layer 0, that the block count and estimated maximum
    context length are twice the no-sharing values, and that after
    ``initialize_kv_cache`` both layers hold the same KV cache object.
    """
    layer_0 = "model.layers.0.self_attn.attn"
    layer_1 = "model.layers.1.self_attn.attn"
    vllm_config = get_vllm_config()
    with set_current_vllm_config(vllm_config):
        fwd_context = {
            layer_0:
            Attention(
                num_heads=8,
                head_size=128,
                scale=1.0,
                prefix=layer_0,
            ),
            layer_1:
            Attention(
                num_heads=8,
                head_size=128,
                scale=1.0,
                prefix=layer_1,
                kv_sharing_target_layer_name="model.layers.0.self_attn.attn",
            )
        }
        # suppress var not used error
        assert fwd_context is not None
    # Set high context length to test max context length estimation
    vllm_config.model_config.max_model_len = 3_000_000
    vllm_ctx = vllm_config.compilation_config.static_forward_context
    model_runner = get_model_runner(vllm_config)
    kv_cache_spec = model_runner.get_kv_cache_spec()
    # Only the target layer has a spec; layer 1 is mapped to layer 0.
    assert len(kv_cache_spec) == 1
    assert layer_0 in kv_cache_spec
    assert model_runner.shared_kv_cache_layers[layer_1] == layer_0

    available_memory = 20 * GiB_bytes
    # page size for layer 0's kv_cache_spec is 512KB
    # with KV sharing, we can allocate (available_mem//page_size//1) blocks
    # which is twice as many as without KV sharing
    num_expected_blocks = 2 * 20480  # 20GB / 512KB
    kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec,
                                          available_memory)
    assert kv_cache_config.num_blocks == num_expected_blocks
    assert len(kv_cache_config.kv_cache_tensors) == 1
    # Each layer now has twice the available memory for KV cache
    # compared to no KV sharing
    assert kv_cache_config.kv_cache_tensors[0].size == available_memory

    max_context_len =\
        estimate_max_model_len(vllm_config, kv_cache_spec, 5 * GiB_bytes)
    # max context len with KV sharing should be 2x as large as without
    assert max_context_len == (2 * 655360)

    # important: override tensor size to prevent large mem alloc during test
    # this will only allocate 1 block worth of memory (512kb)
    kv_cache_config.num_blocks = 1
    kv_cache_config.kv_cache_tensors[0].size =\
        kv_cache_spec[layer_0].page_size_bytes

    model_runner.initialize_kv_cache(kv_cache_config)

    layer_0_kv = vllm_ctx[layer_0].kv_cache[0]
    layer_1_kv = vllm_ctx[layer_1].kv_cache[0]
    # check layer 1 kv cache shares memory with layer 0
    assert id(layer_1_kv) == id(layer_0_kv)

    # check layer 1 added to kv cache group's layer names
    assert len(kv_cache_config.kv_cache_groups) == 1
    assert len(kv_cache_config.kv_cache_groups[0].layer_names) == 2
    assert kv_cache_config.kv_cache_groups[0].layer_names[0] == layer_0
    assert kv_cache_config.kv_cache_groups[0].layer_names[1] == layer_1
tests/v1/worker/test_gpu_input_batch.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
inspect
from
typing
import
Optional
...
...
@@ -9,8 +10,6 @@ import torch
from
vllm.sampling_params
import
SamplingParams
from
vllm.utils
import
is_pin_memory_available
,
make_tensor_with_pad
from
vllm.v1.kv_cache_interface
import
(
FullAttentionSpec
,
KVCacheConfig
,
KVCacheGroupSpec
,
KVCacheTensor
)
from
vllm.v1.sample.metadata
import
SamplingMetadata
from
vllm.v1.worker.block_table
import
BlockTable
,
MultiGroupBlockTable
from
vllm.v1.worker.gpu_input_batch
import
CachedRequestState
,
InputBatch
...
...
@@ -24,27 +23,6 @@ CUDA_DEVICES = [
MAX_NUM_PROMPT_TOKENS
=
64
def
get_kv_cache_config
()
->
KVCacheConfig
:
return
KVCacheConfig
(
num_blocks
=
10
,
tensors
=
{
"layer.0"
:
KVCacheTensor
(
size
=
1024
),
},
kv_cache_groups
=
[
KVCacheGroupSpec
(
layer_names
=
[
"layer.0"
],
kv_cache_spec
=
FullAttentionSpec
(
block_size
=
1
,
num_kv_heads
=
1
,
head_size
=
16
,
dtype
=
torch
.
float16
,
use_mla
=
False
,
),
),
],
)
def
_compare_objs
(
obj1
,
obj2
):
attrs
=
inspect
.
getmembers
(
obj1
,
lambda
a
:
not
(
inspect
.
isroutine
(
a
)))
attr_names
=
set
([
...
...
@@ -225,7 +203,7 @@ def _construct_cached_request_state(req_id_suffix: int):
sampling_params
=
_create_sampling_params
(),
mm_inputs
=
[],
mm_positions
=
[],
block_ids
=
[
[]
]
,
block_ids
=
(
[]
,
)
,
generator
=
None
,
num_computed_tokens
=
len
(
output_token_ids
),
output_token_ids
=
output_token_ids
,
...
...
@@ -251,7 +229,7 @@ def test_sampling_metadata_in_input_batch(device: str, batch_size: int):
device
=
torch
.
device
(
device
),
pin_memory
=
is_pin_memory_available
(),
vocab_size
=
1024
,
block_size
=
1
,
block_size
s
=
[
1
]
,
)
reqs
:
list
[
CachedRequestState
]
=
[]
req_id_reqs
=
{}
...
...
@@ -341,7 +319,7 @@ def test_swap_states_in_input_batch(device: str, batch_size: int,
device
=
torch
.
device
(
device
),
pin_memory
=
is_pin_memory_available
(),
vocab_size
=
1024
,
block_size
=
1
,
block_size
s
=
[
1
]
,
)
ref_input_batch
:
InputBatch
=
InputBatch
(
max_num_reqs
=
batch_size
,
...
...
@@ -350,7 +328,7 @@ def test_swap_states_in_input_batch(device: str, batch_size: int,
device
=
torch
.
device
(
device
),
pin_memory
=
is_pin_memory_available
(),
vocab_size
=
1024
,
block_size
=
1
,
block_size
s
=
[
1
]
,
)
reqs
:
list
[
CachedRequestState
]
=
[]
...
...
tests/v1/worker/test_gpu_model_runner.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
random
import
pytest
from
vllm.attention
import
Attention
from
vllm.config
import
(
CacheConfig
,
ModelConfig
,
ParallelConfig
,
SchedulerConfig
,
VllmConfig
)
SchedulerConfig
,
VllmConfig
,
set_current_vllm_config
)
from
vllm.sampling_params
import
SamplingParams
from
vllm.utils
import
GiB_bytes
from
vllm.v1.core.kv_cache_utils
import
(
estimate_max_model_len
,
get_kv_cache_config
)
from
vllm.v1.core.sched.output
import
(
CachedRequestData
,
NewRequestData
,
SchedulerOutput
)
from
vllm.v1.kv_cache_interface
import
(
FullAttentionSpec
,
KVCacheConfig
,
...
...
@@ -13,28 +20,33 @@ from vllm.v1.sample.metadata import SamplingMetadata
from
vllm.v1.worker.gpu_input_batch
import
InputBatch
from
vllm.v1.worker.gpu_model_runner
import
GPUModelRunner
BLOCK_SIZE
=
16
NUM_BLOCKS
=
10
DEVICE
=
"cuda"
def
initialize_kv_cache
(
runner
:
GPUModelRunner
):
"""
Only perform necessary steps in GPUModelRunner.initialize_kv_cache()
"""
kv_cache_config
=
KVCacheConfig
(
num_blocks
=
10
,
tensors
=
{
"layer.0"
:
KVCacheTensor
(
size
=
1024
),
},
kv_cache_groups
=
[
KVCacheGroupSpec
(
layer_names
=
[
"layer.0"
],
kv_cache_spec
=
FullAttentionSpec
(
block_size
=
16
,
attn_spec
=
FullAttentionSpec
(
block_size
=
BLOCK_SIZE
,
num_kv_heads
=
runner
.
model_config
.
get_num_kv_heads
(
runner
.
parallel_config
),
head_size
=
runner
.
model_config
.
get_head_size
(),
dtype
=
runner
.
kv_cache_dtype
,
use_mla
=
False
,
))
])
)
tensor_size
=
attn_spec
.
page_size_bytes
*
NUM_BLOCKS
kv_cache_config
=
KVCacheConfig
(
num_blocks
=
NUM_BLOCKS
,
kv_cache_tensors
=
[
KVCacheTensor
(
size
=
tensor_size
,
shared_by
=
[
"layer.0"
]),
],
kv_cache_groups
=
[
KVCacheGroupSpec
(
layer_names
=
[
"layer.0"
],
kv_cache_spec
=
attn_spec
)
],
)
runner
.
kv_cache_config
=
kv_cache_config
runner
.
input_batch
=
InputBatch
(
max_num_reqs
=
runner
.
max_num_reqs
,
...
...
@@ -43,13 +55,14 @@ def initialize_kv_cache(runner: GPUModelRunner):
device
=
runner
.
device
,
pin_memory
=
runner
.
pin_memory
,
vocab_size
=
runner
.
model_config
.
get_vocab_size
(),
block_size
=
kv_cache_config
.
kv_cache_groups
[
0
].
kv_cache_spec
.
block_size
,
block_sizes
=
[
kv_cache_config
.
kv_cache_groups
[
0
].
kv_cache_spec
.
block_size
],
)
runner
.
initialize_attn_backend
(
kv_cache_config
)
@
pytest
.
fixture
def
model_runner
():
def
get_vllm_config
():
scheduler_config
=
SchedulerConfig
(
max_num_seqs
=
10
,
max_num_batched_tokens
=
512
,
...
...
@@ -65,7 +78,7 @@ def model_runner():
seed
=
42
,
)
cache_config
=
CacheConfig
(
block_size
=
16
,
block_size
=
BLOCK_SIZE
,
gpu_memory_utilization
=
0.9
,
swap_space
=
0
,
cache_dtype
=
"auto"
,
...
...
@@ -77,13 +90,25 @@ def model_runner():
scheduler_config
=
scheduler_config
,
parallel_config
=
parallel_config
,
)
return
vllm_config
device
=
"cuda"
runner
=
GPUModelRunner
(
vllm_config
,
device
)
@
pytest
.
fixture
def
model_runner
():
vllm_config
=
get_vllm_config
()
model_config
=
vllm_config
.
model_config
num_heads
=
model_config
.
get_num_kv_heads
(
vllm_config
.
parallel_config
)
head_size
=
model_config
.
get_head_size
()
vllm_config
.
compilation_config
.
static_forward_context
[
"layer.0"
]
=
Attention
(
num_heads
,
head_size
,
0.1
)
runner
=
GPUModelRunner
(
vllm_config
,
DEVICE
)
initialize_kv_cache
(
runner
)
return
runner
model_runner_2
=
model_runner
def
_schedule_new_request
(
*
req_ids
:
str
)
->
SchedulerOutput
:
new_reqs
=
[]
num_scheduled_tokens
=
{}
...
...
@@ -97,7 +122,7 @@ def _schedule_new_request(*req_ids: str) -> SchedulerOutput:
mm_hashes
=
[],
mm_positions
=
[],
sampling_params
=
SamplingParams
(),
block_ids
=
[
[
0
]
]
,
block_ids
=
(
[
0
]
,
)
,
num_computed_tokens
=
0
,
lora_request
=
None
,
))
...
...
@@ -225,7 +250,7 @@ def test_update_states_request_resumed(model_runner):
req_id
=
req_id
,
resumed_from_preemption
=
False
,
new_token_ids
=
[],
new_block_ids
=
[
[]
]
,
new_block_ids
=
(
[]
,
)
,
num_computed_tokens
=
0
,
)
...
...
@@ -321,3 +346,275 @@ def test_update_states_request_unscheduled(model_runner):
assert
_is_req_added
(
model_runner
,
req_ids
[
1
])
assert
not
_is_req_scheduled
(
model_runner
,
req_ids
[
1
])
def test_kv_cache_stride_order(monkeypatch, model_runner):
    """Check KV cache creation under a non-default stride order.

    Every attention backend on the runner is patched to report a random
    stride permutation, then the KV cache is re-initialised. The logical
    shape must be unchanged, and the tensors must be contiguous only when
    the permutation happens to be the identity.
    """
    # This test checks if GPUModelRunner initializes correctly when an attention
    # backend enforces a non-default KV cache stride order.
    n_heads = model_runner.model_config.get_num_kv_heads(
        model_runner.parallel_config)
    expected_kv_cache_shape = [
        2, NUM_BLOCKS, BLOCK_SIZE, n_heads,
        model_runner.model_config.get_head_size()
    ]
    # TODO mla test
    # Keep both orders as tuples. Comparing a list with a tuple is always
    # False, which made the identity permutation take the "not contiguous"
    # branch and fail the test at random.
    default_stride = tuple(range(5))
    # Permutation that gets you back to expected kv shape
    rnd_stride = tuple(random.sample(default_stride, len(default_stride)))

    def rnd_stride_order():
        return rnd_stride

    # Patch the attention backend class and re-trigger the KV cache creation.
    for attn_backend in model_runner.attn_backends:
        monkeypatch.setattr(attn_backend, "get_kv_cache_stride_order",
                            rnd_stride_order)

    model_runner.attn_backends = []
    model_runner.attn_metadata_builders = []
    model_runner.initialize_kv_cache(model_runner.kv_cache_config)

    # Shape is unchanged, but layout may differ
    kv_cache_shape = model_runner.kv_caches[0].shape
    assert list(kv_cache_shape) == expected_kv_cache_shape
    if default_stride == rnd_stride:
        assert all(kv.is_contiguous() for kv in model_runner.kv_caches)
    else:
        assert all(not kv.is_contiguous() for kv in model_runner.kv_caches)
def test_load_model_weights_inplace(dist_init, model_runner, model_runner_2):
    """Compare one-shot loading with dummy-then-real loading.

    ``model_runner`` loads the model and its weights in one go.
    ``model_runner_2`` first loads with the ``"dummy"`` load format and then
    calls ``load_model()`` again with the original format. The stringified
    state dicts must differ after the dummy load and match after the reload.
    """

    def _weights_as_text(runner):
        # Same comparison key as before: the str() of the state dict.
        return str(runner.get_model().state_dict())

    # Reference runner: model and weights loaded together.
    model_runner.load_model()

    # Second runner: start from dummy weights.
    saved_load_format = model_runner_2.load_config.load_format
    model_runner_2.load_config.load_format = "dummy"
    model_runner_2.load_model()  # Initial model loading with dummy weights
    assert _weights_as_text(model_runner) != _weights_as_text(model_runner_2)

    # Restore the original format and load again.
    model_runner_2.load_config.load_format = saved_load_format
    model_runner_2.load_model()  # Load real weights inplace
    assert _weights_as_text(model_runner) == _weights_as_text(model_runner_2)
def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order():
    """Constructing a layer whose KV-sharing target comes later must fail.

    Layer 0 names layer 1 as its sharing target; a ``ValueError`` whose
    message contains the expected text is required.
    """
    import re  # local import: only needed to escape the expected message

    layer_0 = "model.layers.0.self_attn.attn"
    layer_1 = "model.layers.1.self_attn.attn"
    error_msg = f"{layer_1} must come before the current layer"
    # ``match`` is a regular expression; escape the message so that the dots
    # in the layer name are matched literally instead of as wildcards.
    with pytest.raises(ValueError, match=re.escape(error_msg)):
        fwd_context = {
            # initialization below will fail because the target layer is
            # invalid: layer 0 points at layer 1, which is listed after it
            layer_0:
            Attention(
                num_heads=8,
                head_size=64,
                scale=1.0,
                prefix=layer_0,
                kv_sharing_target_layer_name=layer_1,
            ),
            layer_1:
            Attention(
                num_heads=8,
                head_size=64,
                scale=1.0,
                prefix=layer_1,
            )
        }
        # suppress var not used error
        assert fwd_context is not None
def test_init_kv_cache_with_kv_sharing_target_layer_not_exist():
    """A KV-sharing target that names no constructed layer must be rejected.

    Layer 1 names ``model.layers.0.cross_attn.attn`` as its sharing target,
    which is not one of the layers built here; a ``ValueError`` whose
    message contains the expected text is required.
    """
    import re  # local import: only needed to escape the expected message

    layer_0 = "model.layers.0.self_attn.attn"
    layer_1 = "model.layers.1.self_attn.attn"
    invalid_layer = "model.layers.0.cross_attn.attn"
    error_msg = f"{invalid_layer} is not a valid Attention layer in the model"
    # ``match`` is a regular expression; escape the message so that the dots
    # in the layer name are matched literally instead of as wildcards.
    with pytest.raises(ValueError, match=re.escape(error_msg)):
        fwd_context = {
            layer_0:
            Attention(
                num_heads=8,
                head_size=64,
                scale=1.0,
                prefix=layer_0,
            ),
            layer_1:
            Attention(
                num_heads=8,
                head_size=64,
                scale=1.0,
                prefix=layer_1,
                # invalid layer: cross_attn.attn doesn't exist!
                kv_sharing_target_layer_name=invalid_layer,
            )
        }
        # suppress var not used error
        assert fwd_context is not None
def test_init_kv_cache_with_kv_sharing_target_same_as_current():
    """A layer may not name itself as its own KV-sharing target.

    Layer 1 is constructed with itself as the sharing target; a
    ``ValueError`` whose message contains the expected text is required.
    """
    import re  # local import: only needed to escape the expected message

    layer_0 = "model.layers.0.self_attn.attn"
    layer_1 = "model.layers.1.self_attn.attn"
    error_msg = f"{layer_1} cannot be the same as the current layer"
    # ``match`` is a regular expression; escape the message so that the dots
    # in the layer name are matched literally instead of as wildcards.
    with pytest.raises(ValueError, match=re.escape(error_msg)):
        fwd_context = {
            layer_0:
            Attention(
                num_heads=8,
                head_size=64,
                scale=1.0,
                prefix=layer_0,
            ),
            # initialization below will fail because layer 1 names itself
            # as its KV-sharing target
            layer_1:
            Attention(
                num_heads=8,
                head_size=64,
                scale=1.0,
                prefix=layer_1,
                kv_sharing_target_layer_name=layer_1,
            )
        }
        # suppress var not used error
        assert fwd_context is not None
def test_init_kv_cache_without_kv_sharing():
    """Baseline: two attention layers with no KV sharing.

    Checks that each layer gets its own KV cache spec and tensor, that the
    block count and estimated maximum context length match the expected
    values, and that after ``initialize_kv_cache`` the two layers hold
    distinct KV cache objects inside a single KV cache group.
    """
    layer_0 = "model.layers.0.self_attn.attn"
    layer_1 = "model.layers.1.self_attn.attn"
    vllm_config = get_vllm_config()
    with set_current_vllm_config(vllm_config):
        fwd_context = {
            layer_0:
            Attention(
                num_heads=8,
                head_size=64,
                scale=1.0,
                prefix=layer_0,
            ),
            layer_1:
            Attention(
                num_heads=8,
                head_size=64,
                scale=1.0,
                prefix=layer_1,
            )
        }
        # suppress var not used error
        assert fwd_context is not None
    # Set high context length to test max context length estimation
    vllm_config.model_config.max_model_len = 3_000_000
    vllm_ctx = vllm_config.compilation_config.static_forward_context
    runner = GPUModelRunner(vllm_config, DEVICE)
    kv_cache_spec = runner.get_kv_cache_spec()
    # One spec per layer, and nothing is registered as shared.
    assert len(kv_cache_spec) == 2
    assert len(runner.shared_kv_cache_layers) == 0

    available_memory = 20 * GiB_bytes
    # page size for layer 0's kv_cache_spec is 32KB
    num_expected_blocks = 327680  # 20GB / 32KB / 2 (num layers)
    kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec,
                                          available_memory)
    assert kv_cache_config.num_blocks == num_expected_blocks
    # The available memory is split evenly between the two tensors.
    assert len(kv_cache_config.kv_cache_tensors) == 2
    assert kv_cache_config.kv_cache_tensors[0].size == available_memory // 2
    assert kv_cache_config.kv_cache_tensors[1].size == available_memory // 2

    max_context_len =\
        estimate_max_model_len(vllm_config, kv_cache_spec, 5 * GiB_bytes)
    # This is the baseline value; with KV sharing the estimate should be
    # 2x as large.
    assert max_context_len == 1310720

    # important: override tensor size to prevent large mem alloc during test
    # this will only allocate 2 block worth of memory (2 * 32kb)
    kv_cache_config.num_blocks = 1
    for kv_cache_tensor in kv_cache_config.kv_cache_tensors:
        kv_cache_tensor.size = (
            kv_cache_spec[kv_cache_tensor.shared_by[0]].page_size_bytes)

    runner.initialize_kv_cache(kv_cache_config)

    layer_0_kv = vllm_ctx[layer_0].kv_cache[0]
    layer_1_kv = vllm_ctx[layer_1].kv_cache[0]
    # check layer 1 kv cache does NOT share memory with layer 0
    assert id(layer_1_kv) != id(layer_0_kv)

    # check both layers are listed, in order, in the single kv cache group
    assert len(kv_cache_config.kv_cache_groups) == 1
    assert len(kv_cache_config.kv_cache_groups[0].layer_names) == 2
    assert kv_cache_config.kv_cache_groups[0].layer_names[0] == layer_0
    assert kv_cache_config.kv_cache_groups[0].layer_names[1] == layer_1
def test_init_kv_cache_with_kv_sharing_valid():
    """KV cache initialization when layer 1 reuses layer 0's KV cache.

    Layer 1 is built with layer 0 as its KV sharing target. The test checks
    that only layer 0 contributes a KV cache spec, that the whole memory
    budget goes to a single cache tensor (doubling the block count and the
    estimated max context length relative to the no-sharing test), and that
    after initialization both layers reference the same KV cache object.
    """
    layer_0 = "model.layers.0.self_attn.attn"
    layer_1 = "model.layers.1.self_attn.attn"
    vllm_config = get_vllm_config()
    with set_current_vllm_config(vllm_config):
        fwd_context = {
            layer_0:
            Attention(
                num_heads=8,
                head_size=64,
                scale=1.0,
                prefix=layer_0,
            ),
            layer_1:
            Attention(
                num_heads=8,
                head_size=64,
                scale=1.0,
                prefix=layer_1,
                # reuse the layer_0 name rather than repeating the literal
                kv_sharing_target_layer_name=layer_0,
            ),
        }
        # suppress var not used error
        assert fwd_context is not None
    # Set high context length to test max context length estimation
    vllm_config.model_config.max_model_len = 3_000_000
    vllm_ctx = vllm_config.compilation_config.static_forward_context
    runner = GPUModelRunner(vllm_config, DEVICE)
    kv_cache_spec = runner.get_kv_cache_spec()
    # only the target layer owns a spec; layer 1 is recorded as sharing it
    assert len(kv_cache_spec) == 1
    assert layer_0 in kv_cache_spec
    assert runner.shared_kv_cache_layers[layer_1] == layer_0

    available_memory = 20 * GiB_bytes
    # page size for layer 0's kv_cache_spec is 32KB
    # with KV sharing, we can allocate (available_mem//page_size//1) blocks
    # which is twice as many as without KV sharing
    num_expected_blocks = 655360  # 20GB / 32KB
    kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec,
                                          available_memory)
    assert kv_cache_config.num_blocks == num_expected_blocks
    assert len(kv_cache_config.kv_cache_tensors) == 1
    # Each layer now has twice the available memory for KV cache
    # compared to no KV sharing
    assert kv_cache_config.kv_cache_tensors[0].size == available_memory

    max_context_len = estimate_max_model_len(vllm_config, kv_cache_spec,
                                             5 * GiB_bytes)
    # max context len with KV sharing should be 2x as large as without
    assert max_context_len == 2 * 1310720

    # important: override tensor size to prevent large mem alloc during test
    # this will only allocate 1 block worth of memory (32kb)
    kv_cache_config.num_blocks = 1
    kv_cache_config.kv_cache_tensors[0].size = (
        kv_cache_spec[layer_0].page_size_bytes)

    runner.initialize_kv_cache(kv_cache_config)

    layer_0_kv = vllm_ctx[layer_0].kv_cache[0]
    layer_1_kv = vllm_ctx[layer_1].kv_cache[0]
    # check layer 1 kv cache shares memory with layer 0
    # (both layers must reference the very same cache object)
    assert layer_1_kv is layer_0_kv

    # check layer 1 added to kv cache group's layer names
    assert len(kv_cache_config.kv_cache_groups) == 1
    assert len(kv_cache_config.kv_cache_groups[0].layer_names) == 2
    assert kv_cache_config.kv_cache_groups[0].layer_names[0] == layer_0
    assert kv_cache_config.kv_cache_groups[0].layer_names[1] == layer_1
tests/vllm_test_utils/setup.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
setuptools
import
setup
...
...
tests/vllm_test_utils/vllm_test_utils/__init__.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
vllm_test_utils is a package for vLLM testing utilities.
It does not import any vLLM modules.
...
...
tests/vllm_test_utils/vllm_test_utils/blame.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
contextlib
import
dataclasses
...
...
tests/vllm_test_utils/vllm_test_utils/monitor.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
contextlib
import
dataclasses
...
...
Prev
1
…
35
36
37
38
39
40
41
42
43
…
50
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment