Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
b2d58051
Commit
b2d58051
authored
Sep 03, 2025
by
zhuwenwen
Browse files
[fix]fix tests of v1 and worker
parent
bfd0c5b8
Changes
21
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
490 additions
and
478 deletions
+490
-478
tests/kernels/moe/test_moe.py
tests/kernels/moe/test_moe.py
+114
-114
tests/test_config.py
tests/test_config.py
+6
-6
tests/test_regression.py
tests/test_regression.py
+10
-10
tests/test_sampling_params.py
tests/test_sampling_params.py
+51
-51
tests/tpu/lora/untest_lora.py
tests/tpu/lora/untest_lora.py
+0
-0
tests/tpu/untest_moe_pallas.py
tests/tpu/untest_moe_pallas.py
+0
-0
tests/v1/core/test_scheduler.py
tests/v1/core/test_scheduler.py
+130
-129
tests/v1/e2e/test_spec_decode.py
tests/v1/e2e/test_spec_decode.py
+7
-7
tests/v1/engine/test_fast_incdec_prefix_err.py
tests/v1/engine/test_fast_incdec_prefix_err.py
+3
-1
tests/v1/kv_connector/unit/test_multi_connector.py
tests/v1/kv_connector/unit/test_multi_connector.py
+154
-152
tests/v1/metrics/__init__.py
tests/v1/metrics/__init__.py
+0
-0
tests/v1/metrics/test_ray_metrics.py
tests/v1/metrics/test_ray_metrics.py
+2
-1
tests/v1/spec_decode/test_eagle.py
tests/v1/spec_decode/test_eagle.py
+5
-3
tests/v1/test_oracle.py
tests/v1/test_oracle.py
+5
-3
tests/v1/tpu/untest_kv_cache_update_kernel.py
tests/v1/tpu/untest_kv_cache_update_kernel.py
+0
-0
tests/v1/tpu/untest_spmd_model_weight_loading.py
tests/v1/tpu/untest_spmd_model_weight_loading.py
+0
-0
tests/v1/tpu/untest_tpu_int8.py
tests/v1/tpu/untest_tpu_int8.py
+0
-0
tests/v1/tpu/untest_tpu_qkv_linear.py
tests/v1/tpu/untest_tpu_qkv_linear.py
+0
-0
tests/worker/test_encoder_decoder_model_runner.py
tests/worker/test_encoder_decoder_model_runner.py
+2
-0
tests/worker/test_model_runner.py
tests/worker/test_model_runner.py
+1
-1
No files found.
tests/kernels/moe/test_moe.py
View file @
b2d58051
...
...
@@ -252,120 +252,120 @@ def test_fused_moe(
use_cudagraph
=
use_cudagraph
)
@
pytest
.
mark
.
parametrize
(
"m,n,k"
,
FUSED_MOE_WN16_MNK_FACTORS
)
@
pytest
.
mark
.
parametrize
(
"e"
,
NUM_EXPERTS
)
@
pytest
.
mark
.
parametrize
(
"topk"
,
TOP_KS
)
@
pytest
.
mark
.
parametrize
(
"ep_size"
,
EP_SIZE
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
torch
.
bfloat16
])
@
pytest
.
mark
.
parametrize
(
"group_size"
,
[
64
,
128
])
@
pytest
.
mark
.
parametrize
(
"has_zp"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"weight_bits"
,
[
4
,
8
])
def
test_fused_moe_wn16
(
m
:
int
,
n
:
int
,
k
:
int
,
e
:
int
,
topk
:
int
,
ep_size
:
int
,
dtype
:
torch
.
dtype
,
group_size
:
int
,
has_zp
:
bool
,
weight_bits
:
int
):
a
=
torch
.
randn
((
m
,
k
),
device
=
"cuda"
,
dtype
=
dtype
)
/
10
w1
=
torch
.
randn
((
e
,
2
*
n
,
k
),
device
=
"cuda"
,
dtype
=
dtype
)
/
10
w2
=
torch
.
randn
((
e
,
k
,
n
),
device
=
"cuda"
,
dtype
=
dtype
)
/
10
score
=
torch
.
randn
((
m
,
e
),
device
=
"cuda"
,
dtype
=
dtype
)
if
weight_bits
==
4
:
pack_factor
=
2
quant_type
=
scalar_types
.
uint4
if
has_zp
else
scalar_types
.
uint4b8
elif
weight_bits
==
8
:
pack_factor
=
1
quant_type
=
scalar_types
.
uint8
if
has_zp
else
scalar_types
.
uint8b128
w1_ref
=
w1
.
clone
()
w2_ref
=
w2
.
clone
()
w1_qweight
=
torch
.
empty
((
e
,
2
*
n
,
k
//
pack_factor
),
device
=
"cuda"
,
dtype
=
torch
.
uint8
)
w2_qweight
=
torch
.
empty
((
e
,
k
,
n
//
pack_factor
),
device
=
"cuda"
,
dtype
=
torch
.
uint8
)
w1_scales
=
torch
.
empty
((
e
,
2
*
n
,
k
//
group_size
),
device
=
"cuda"
,
dtype
=
dtype
)
w2_scales
=
torch
.
empty
((
e
,
k
,
n
//
group_size
),
device
=
"cuda"
,
dtype
=
dtype
)
w1_qzeros
=
torch
.
empty
((
e
,
2
*
n
//
pack_factor
,
k
//
group_size
),
device
=
"cuda"
,
dtype
=
torch
.
uint8
)
w2_qzeros
=
torch
.
empty
((
e
,
k
//
pack_factor
,
n
//
group_size
),
device
=
"cuda"
,
dtype
=
torch
.
uint8
)
for
i
in
range
(
e
*
2
):
expert_id
=
i
%
e
if
i
//
e
==
0
:
w
,
w_ref
,
w_qweight
,
w_scales
,
w_qzeros
=
\
w1
,
w1_ref
,
w1_qweight
,
w1_scales
,
w1_qzeros
else
:
w
,
w_ref
,
w_qweight
,
w_scales
,
w_qzeros
=
\
w2
,
w2_ref
,
w2_qweight
,
w2_scales
,
w2_qzeros
weight
,
qweight
,
scales
,
qzeros
=
quantize_weights
(
w
[
expert_id
].
T
,
quant_type
,
group_size
,
has_zp
,
False
)
weight
=
weight
.
T
qweight
=
qweight
.
T
.
contiguous
().
to
(
torch
.
uint8
)
scales
=
scales
.
T
if
has_zp
:
qzeros
=
qzeros
.
T
.
contiguous
().
to
(
torch
.
uint8
)
if
weight_bits
==
4
:
qweight
=
qweight
[:,
1
::
2
]
*
16
+
qweight
[:,
::
2
]
if
has_zp
:
qzeros
=
qzeros
[
1
::
2
,
:]
*
16
+
qzeros
[::
2
,
:]
w_ref
[
expert_id
]
=
weight
w_qweight
[
expert_id
]
=
qweight
w_scales
[
expert_id
]
=
scales
if
has_zp
:
w_qzeros
[
expert_id
]
=
qzeros
if
ep_size
>
1
:
local_e
=
e
//
ep_size
e_ids
=
torch
.
randint
(
0
,
e
,
(
local_e
,
),
device
=
"cuda"
,
dtype
=
torch
.
int32
)
e_map
=
torch
.
full
((
e
,
),
-
1
,
device
=
"cuda"
,
dtype
=
torch
.
int32
)
e_map
[
e_ids
]
=
torch
.
arange
(
local_e
,
device
=
"cuda"
,
dtype
=
torch
.
int32
)
w1_ref
=
w1_ref
[
e_ids
]
w2_ref
=
w2_ref
[
e_ids
]
w1_qweight
=
w1_qweight
[
e_ids
]
w2_qweight
=
w2_qweight
[
e_ids
]
w1_scales
=
w1_scales
[
e_ids
]
w2_scales
=
w2_scales
[
e_ids
]
w1_qzeros
=
w1_qzeros
[
e_ids
]
w2_qzeros
=
w2_qzeros
[
e_ids
]
else
:
e_map
=
None
with
set_current_vllm_config
(
vllm_config
):
triton_output
=
fused_moe
(
a
,
w1_qweight
,
w2_qweight
,
score
,
topk
,
renormalize
=
False
,
use_int4_w4a16
=
weight_bits
==
4
,
use_int8_w8a16
=
weight_bits
==
8
,
use_int4_w4a8
=
weight_bits
==
4
,
global_num_experts
=
e
,
expert_map
=
e_map
,
w1_scale
=
w1_scales
,
w2_scale
=
w2_scales
,
w1_zp
=
w1_qzeros
if
has_zp
else
None
,
w2_zp
=
w2_qzeros
if
has_zp
else
None
,
block_shape
=
[
0
,
group_size
])
torch_output
=
torch_moe
(
a
,
w1_ref
,
w2_ref
,
score
,
topk
,
expert_map
=
e_map
)
torch
.
testing
.
assert_close
(
triton_output
,
torch_output
,
atol
=
2e-2
,
rtol
=
0
)
#
@pytest.mark.parametrize("m,n,k", FUSED_MOE_WN16_MNK_FACTORS)
#
@pytest.mark.parametrize("e", NUM_EXPERTS)
#
@pytest.mark.parametrize("topk", TOP_KS)
#
@pytest.mark.parametrize("ep_size", EP_SIZE)
#
@pytest.mark.parametrize("dtype", [torch.bfloat16])
#
@pytest.mark.parametrize("group_size", [64, 128])
#
@pytest.mark.parametrize("has_zp", [True, False])
#
@pytest.mark.parametrize("weight_bits", [4, 8])
#
def test_fused_moe_wn16(m: int, n: int, k: int, e: int, topk: int,
#
ep_size: int, dtype: torch.dtype, group_size: int,
#
has_zp: bool, weight_bits: int):
#
a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
#
w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
#
w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10
#
score = torch.randn((m, e), device="cuda", dtype=dtype)
#
if weight_bits == 4:
#
pack_factor = 2
#
quant_type = scalar_types.uint4 if has_zp else scalar_types.uint4b8
#
elif weight_bits == 8:
#
pack_factor = 1
#
quant_type = scalar_types.uint8 if has_zp else scalar_types.uint8b128
#
w1_ref = w1.clone()
#
w2_ref = w2.clone()
#
w1_qweight = torch.empty((e, 2 * n, k // pack_factor),
#
device="cuda",
#
dtype=torch.uint8)
#
w2_qweight = torch.empty((e, k, n // pack_factor),
#
device="cuda",
#
dtype=torch.uint8)
#
w1_scales = torch.empty((e, 2 * n, k // group_size),
#
device="cuda",
#
dtype=dtype)
#
w2_scales = torch.empty((e, k, n // group_size),
#
device="cuda",
#
dtype=dtype)
#
w1_qzeros = torch.empty((e, 2 * n // pack_factor, k // group_size),
#
device="cuda",
#
dtype=torch.uint8)
#
w2_qzeros = torch.empty((e, k // pack_factor, n // group_size),
#
device="cuda",
#
dtype=torch.uint8)
#
for i in range(e * 2):
#
expert_id = i % e
#
if i // e == 0:
#
w, w_ref, w_qweight, w_scales, w_qzeros = \
#
w1, w1_ref, w1_qweight, w1_scales, w1_qzeros
#
else:
#
w, w_ref, w_qweight, w_scales, w_qzeros = \
#
w2, w2_ref, w2_qweight, w2_scales, w2_qzeros
#
weight, qweight, scales, qzeros = quantize_weights(
#
w[expert_id].T, quant_type, group_size, has_zp, False)
#
weight = weight.T
#
qweight = qweight.T.contiguous().to(torch.uint8)
#
scales = scales.T
#
if has_zp:
#
qzeros = qzeros.T.contiguous().to(torch.uint8)
#
if weight_bits == 4:
#
qweight = qweight[:, 1::2] * 16 + qweight[:, ::2]
#
if has_zp:
#
qzeros = qzeros[1::2, :] * 16 + qzeros[::2, :]
#
w_ref[expert_id] = weight
#
w_qweight[expert_id] = qweight
#
w_scales[expert_id] = scales
#
if has_zp:
#
w_qzeros[expert_id] = qzeros
#
if ep_size > 1:
#
local_e = e // ep_size
#
e_ids = torch.randint(0,
#
e, (local_e, ),
#
device="cuda",
#
dtype=torch.int32)
#
e_map = torch.full((e, ), -1, device="cuda", dtype=torch.int32)
#
e_map[e_ids] = torch.arange(local_e, device="cuda", dtype=torch.int32)
#
w1_ref = w1_ref[e_ids]
#
w2_ref = w2_ref[e_ids]
#
w1_qweight = w1_qweight[e_ids]
#
w2_qweight = w2_qweight[e_ids]
#
w1_scales = w1_scales[e_ids]
#
w2_scales = w2_scales[e_ids]
#
w1_qzeros = w1_qzeros[e_ids]
#
w2_qzeros = w2_qzeros[e_ids]
#
else:
#
e_map = None
#
with set_current_vllm_config(vllm_config):
#
triton_output = fused_moe(a,
#
w1_qweight,
#
w2_qweight,
#
score,
#
topk,
#
renormalize=False,
#
use_int4_w4a16=weight_bits == 4,
#
use_int8_w8a16=weight_bits == 8,
#
use_int4_w4a8=weight_bits == 4,
#
global_num_experts=e,
#
expert_map=e_map,
#
w1_scale=w1_scales,
#
w2_scale=w2_scales,
#
w1_zp=w1_qzeros if has_zp else None,
#
w2_zp=w2_qzeros if has_zp else None,
#
block_shape=[0, group_size])
#
torch_output = torch_moe(a,
#
w1_ref,
#
w2_ref,
#
score,
#
topk,
#
expert_map=e_map)
#
torch.testing.assert_close(triton_output, torch_output, atol=2e-2, rtol=0)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
torch
.
bfloat16
])
...
...
tests/test_config.py
View file @
b2d58051
...
...
@@ -104,13 +104,13 @@ def test_auto_task(model_id, expected_runner_type, expected_convert_type,
(
"model_id"
,
"expected_runner_type"
,
"expected_convert_type"
,
"expected_task"
),
[
(
"distilbert/distilgpt2"
,
"pooling"
,
"embed"
,
"embed"
),
(
"intfloat/multilingual-e5-small"
,
"pooling"
,
"embed"
,
"embed"
),
(
"jason9693/Qwen2.5-1.5B-apeach"
,
"pooling"
,
"classify"
,
"classify"
),
(
"cross-encoder/ms-marco-MiniLM-L-6-v2"
,
"pooling"
,
"classify"
,
(
os
.
path
.
join
(
models_path_prefix
,
"distilbert/distilgpt2"
)
,
"pooling"
,
"embed"
,
"embed"
),
(
os
.
path
.
join
(
models_path_prefix
,
"intfloat/multilingual-e5-small"
)
,
"pooling"
,
"embed"
,
"embed"
),
(
os
.
path
.
join
(
models_path_prefix
,
"jason9693/Qwen2.5-1.5B-apeach"
)
,
"pooling"
,
"classify"
,
"classify"
),
(
os
.
path
.
join
(
models_path_prefix
,
"cross-encoder/ms-marco-MiniLM-L-6-v2"
)
,
"pooling"
,
"classify"
,
"classify"
),
(
"Qwen/Qwen2.5-Math-RM-72B"
,
"pooling"
,
"embed"
,
"embed"
),
(
"openai/whisper-small"
,
"pooling"
,
"embed"
,
"embed"
),
(
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2.5-Math-RM-72B"
)
,
"pooling"
,
"embed"
,
"embed"
),
(
os
.
path
.
join
(
models_path_prefix
,
"openai/whisper-small"
)
,
"pooling"
,
"embed"
,
"embed"
),
],
)
def
test_score_task
(
model_id
,
expected_runner_type
,
expected_convert_type
,
...
...
tests/test_regression.py
View file @
b2d58051
...
...
@@ -15,8 +15,7 @@ import torch
from
vllm
import
LLM
,
SamplingParams
from
.utils
import
models_path_prefix
from
vllm.utils
import
SUPPORT_TC
,
gpuname
import
vllm.envs
as
envs
from
vllm.platforms
import
current_platform
@
pytest
.
mark
.
skip
(
reason
=
"In V1, we reject tokens > max_seq_len"
)
...
...
@@ -39,15 +38,16 @@ def test_max_tokens_none():
sampling_params
=
SamplingParams
(
temperature
=
0.01
,
top_p
=
0.1
,
max_tokens
=
None
)
if
gpuname
.
startswith
(
'BW'
):
if
not
current_platform
.
is_rocm
(
):
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"distilbert/distilgpt2"
),
max_num_batched_tokens
=
4096
,
tensor_parallel_size
=
1
,
block_size
=
64
)
tensor_parallel_size
=
1
)
else
:
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"distilbert/distilgpt2"
),
max_num_batched_tokens
=
4096
,
tensor_parallel_size
=
1
)
tensor_parallel_size
=
1
,
block_size
=
64
)
prompts
=
[
"Just say hello!"
]
outputs
=
llm
.
generate
(
prompts
,
sampling_params
=
sampling_params
)
...
...
@@ -75,11 +75,11 @@ def test_model_from_modelscope(monkeypatch: pytest.MonkeyPatch):
# Don't use HF_TOKEN for ModelScope repos, otherwise it will fail
# with 400 Client Error: Bad Request.
m
.
setenv
(
"HF_TOKEN"
,
""
)
if
envs
.
VLLM_USE_FLASH_ATTN_PA
:
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"qwen/Qwen1.5-0.5B-Chat"
),
block_size
=
64
)
else
:
if
not
current_platform
.
is_rocm
():
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"qwen/Qwen1.5-0.5B-Chat"
))
else
:
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"qwen/Qwen1.5-0.5B-Chat"
),
block_size
=
64
)
prompts
=
[
"Hello, my name is"
,
"The president of the United States is"
,
...
...
tests/test_sampling_params.py
View file @
b2d58051
...
...
@@ -33,54 +33,54 @@ def default_max_tokens():
return
4096
def
test_sampling_params_from_request_with_no_guided_decoding_backend
(
model_config
,
default_max_tokens
):
# guided_decoding_backend is not present at request level
request
=
ChatCompletionRequest
.
model_validate
({
'messages'
:
[{
'role'
:
'user'
,
'content'
:
'Hello'
}],
'model'
:
MODEL_NAME
,
'response_format'
:
{
'type'
:
'json_object'
,
},
})
sampling_params
=
request
.
to_sampling_params
(
default_max_tokens
,
model_config
.
logits_processor_pattern
,
)
# we do not expect any backend to be present and the default
# guided_decoding_backend at engine level will be used.
assert
sampling_params
.
guided_decoding
.
backend
is
None
@
pytest
.
mark
.
parametrize
(
"request_level_guided_decoding_backend,expected"
,
[(
"xgrammar"
,
"xgrammar"
),
(
"guidance"
,
"guidance"
),
(
"outlines"
,
"outlines"
)])
def
test_sampling_params_from_request_with_guided_decoding_backend
(
request_level_guided_decoding_backend
:
str
,
expected
:
str
,
model_config
,
default_max_tokens
):
request
=
ChatCompletionRequest
.
model_validate
({
'messages'
:
[{
'role'
:
'user'
,
'content'
:
'Hello'
}],
'model'
:
MODEL_NAME
,
'response_format'
:
{
'type'
:
'json_object'
,
},
'guided_decoding_backend'
:
request_level_guided_decoding_backend
,
})
sampling_params
=
request
.
to_sampling_params
(
default_max_tokens
,
model_config
.
logits_processor_pattern
,
)
# backend correctly identified in resulting sampling_params
assert
sampling_params
.
guided_decoding
.
backend
==
expected
#
def test_sampling_params_from_request_with_no_guided_decoding_backend(
#
model_config, default_max_tokens):
#
# guided_decoding_backend is not present at request level
#
request = ChatCompletionRequest.model_validate({
#
'messages': [{
#
'role': 'user',
#
'content': 'Hello'
#
}],
#
'model':
#
MODEL_NAME,
#
'response_format': {
#
'type': 'json_object',
#
},
#
})
#
sampling_params = request.to_sampling_params(
#
default_max_tokens,
#
model_config.logits_processor_pattern,
#
)
#
# we do not expect any backend to be present and the default
#
# guided_decoding_backend at engine level will be used.
#
assert sampling_params.guided_decoding.backend is None
#
@pytest.mark.parametrize("request_level_guided_decoding_backend,expected",
#
[("xgrammar", "xgrammar"), ("guidance", "guidance"),
#
("outlines", "outlines")])
#
def test_sampling_params_from_request_with_guided_decoding_backend(
#
request_level_guided_decoding_backend: str, expected: str,
#
model_config, default_max_tokens):
#
request = ChatCompletionRequest.model_validate({
#
'messages': [{
#
'role': 'user',
#
'content': 'Hello'
#
}],
#
'model':
#
MODEL_NAME,
#
'response_format': {
#
'type': 'json_object',
#
},
#
'guided_decoding_backend':
#
request_level_guided_decoding_backend,
#
})
#
sampling_params = request.to_sampling_params(
#
default_max_tokens,
#
model_config.logits_processor_pattern,
#
)
#
# backend correctly identified in resulting sampling_params
#
assert sampling_params.guided_decoding.backend == expected
tests/tpu/lora/test_lora.py
→
tests/tpu/lora/
un
test_lora.py
View file @
b2d58051
File moved
tests/tpu/test_moe_pallas.py
→
tests/tpu/
un
test_moe_pallas.py
View file @
b2d58051
File moved
tests/v1/core/test_scheduler.py
View file @
b2d58051
...
...
@@ -20,8 +20,9 @@ from vllm.v1.request import Request, RequestStatus
from
vllm.v1.structured_output
import
StructuredOutputManager
from
vllm.v1.structured_output.request
import
StructuredOutputRequest
from
...utils
import
models_path_prefix
from
.utils
import
EOS_TOKEN_ID
,
create_requests
,
create_scheduler
from
...utils
import
models_path_prefix
from
vllm.platforms
import
current_platform
def
test_add_requests
():
...
...
@@ -935,7 +936,7 @@ def test_kv_connector_unable_to_allocate():
"""
# Setup Scheduler With Mock External Cache Hit.
BLOCK_SIZE
=
4
BLOCK_SIZE
=
4
if
not
current_platform
.
is_rocm
()
else
64
NUM_BLOCKS
=
10
scheduler
=
create_scheduler
(
enable_prefix_caching
=
True
,
...
...
@@ -1007,133 +1008,133 @@ def test_kv_connector_unable_to_allocate():
assert
len
(
scheduler
.
waiting
)
==
0
def
test_kv_connector_handles_preemption
():
"""
Test whether scheduler with KVConnector is able to handle
unable to allocate (run out of blocks in allocate_slots().
"""
# Setup Scheduler With Mock External Cache Hit.
BLOCK_SIZE
=
2
# NOTE: there is 1 null block, so this is 6 blocks.
NUM_BLOCKS
=
7
scheduler
=
create_scheduler
(
enable_prefix_caching
=
True
,
use_kv_connector
=
True
,
block_size
=
BLOCK_SIZE
,
num_blocks
=
NUM_BLOCKS
,
)
NUM_MATCHED_NEW_TOKENS
=
BLOCK_SIZE
scheduler
.
connector
.
get_num_new_matched_tokens
=
Mock
(
name
=
"method"
)
scheduler
.
connector
.
get_num_new_matched_tokens
.
return_value
=
(
NUM_MATCHED_NEW_TOKENS
,
False
)
# Create two requests.
# Both can be scheduled at first, but the second request
# will be preempted and re-scheduled.
NUM_REQUESTS
=
2
NUM_TOKENS
=
BLOCK_SIZE
*
2
+
1
MAX_TOKENS
=
BLOCK_SIZE
*
2
requests
=
create_requests
(
num_requests
=
NUM_REQUESTS
,
num_tokens
=
NUM_TOKENS
,
max_tokens
=
MAX_TOKENS
,
block_size
=
BLOCK_SIZE
)
req_ids
=
[]
req_to_index
=
{}
for
i
,
request
in
enumerate
(
requests
):
scheduler
.
add_request
(
request
)
req_ids
.
append
(
request
.
request_id
)
req_to_index
[
request
.
request_id
]
=
i
MODEL_RUNNER_OUTPUT
=
ModelRunnerOutput
(
req_ids
=
req_ids
,
req_id_to_index
=
req_to_index
,
sampled_token_ids
=
[[
1000
]]
*
len
(
req_ids
),
spec_token_ids
=
None
,
logprobs
=
None
,
prompt_logprobs_dict
=
{},
pooler_output
=
[],
)
# All can be scheduled - 1st token.
output
=
scheduler
.
schedule
()
_assert_right_scheduler_output
(
output
,
# 2 remote kv cache hits.
num_requests
=
2
,
expected_num_scheduled_tokens
=
NUM_TOKENS
-
NUM_MATCHED_NEW_TOKENS
)
assert
len
(
scheduler
.
running
)
==
2
_
=
scheduler
.
update_from_output
(
output
,
MODEL_RUNNER_OUTPUT
)
# All can be scheduled - 2nd token.
output
=
scheduler
.
schedule
()
_assert_right_scheduler_output
(
output
,
# no connector_metadata
num_requests
=
0
,
expected_num_scheduled_tokens
=
1
)
assert
len
(
scheduler
.
running
)
==
2
_
=
scheduler
.
update_from_output
(
output
,
MODEL_RUNNER_OUTPUT
)
# This will generate a new block and cause a preemption - 3rd token.
output
=
scheduler
.
schedule
()
_assert_right_scheduler_output
(
output
,
# no connector_metadata
num_requests
=
0
,
expected_num_scheduled_tokens
=
1
)
assert
len
(
scheduler
.
running
)
==
1
assert
len
(
scheduler
.
waiting
)
==
1
_
=
scheduler
.
update_from_output
(
output
,
MODEL_RUNNER_OUTPUT
)
assert
len
(
scheduler
.
running
)
==
1
assert
len
(
scheduler
.
waiting
)
==
1
# Only 1 can be scheduled - 4th (and last token).
output
=
scheduler
.
schedule
()
_assert_right_scheduler_output
(
output
,
# no connector_metadata
num_requests
=
0
,
expected_num_scheduled_tokens
=
1
)
assert
len
(
scheduler
.
waiting
)
==
1
assert
len
(
scheduler
.
running
)
==
1
_
=
scheduler
.
update_from_output
(
output
,
MODEL_RUNNER_OUTPUT
)
assert
len
(
scheduler
.
running
)
==
0
# All memory should be freed since nothing is running.
assert
scheduler
.
kv_cache_manager
.
block_pool
.
get_num_free_blocks
()
\
==
NUM_BLOCKS
-
1
# Restarts the preempted request - generate 3rd token.
# This will have a local and remote cache hit.
output
=
scheduler
.
schedule
()
_assert_right_scheduler_output
(
output
,
# 1 remote kv_cache hit!
num_requests
=
1
,
# Only 1 block was preempted and there is a single
# remote hit. So only single new token is scheduled.
expected_num_scheduled_tokens
=
1
,
)
assert
len
(
scheduler
.
running
)
==
1
assert
len
(
scheduler
.
waiting
)
==
0
_
=
scheduler
.
update_from_output
(
output
,
MODEL_RUNNER_OUTPUT
)
assert
len
(
scheduler
.
running
)
==
1
assert
len
(
scheduler
.
waiting
)
==
0
# Only 1 can be scheduled - 4th (and last token).
output
=
scheduler
.
schedule
()
_assert_right_scheduler_output
(
output
,
# no connector_metadata
num_requests
=
0
,
expected_num_scheduled_tokens
=
1
)
assert
len
(
scheduler
.
running
)
==
1
_
=
scheduler
.
update_from_output
(
output
,
MODEL_RUNNER_OUTPUT
)
assert
len
(
scheduler
.
running
)
==
0
# All memory should be freed since nothing is running.
assert
scheduler
.
kv_cache_manager
.
block_pool
.
get_num_free_blocks
()
\
==
NUM_BLOCKS
-
1
#
def test_kv_connector_handles_preemption():
#
"""
#
Test whether scheduler with KVConnector is able to handle
#
unable to allocate (run out of blocks in allocate_slots().
#
"""
#
# Setup Scheduler With Mock External Cache Hit.
#
BLOCK_SIZE = 2
#
# NOTE: there is 1 null block, so this is 6 blocks.
#
NUM_BLOCKS = 7
#
scheduler = create_scheduler(
#
enable_prefix_caching=True,
#
use_kv_connector=True,
#
block_size=BLOCK_SIZE,
#
num_blocks=NUM_BLOCKS,
#
)
#
NUM_MATCHED_NEW_TOKENS = BLOCK_SIZE
#
scheduler.connector.get_num_new_matched_tokens = Mock(name="method")
#
scheduler.connector.get_num_new_matched_tokens.return_value = (
#
NUM_MATCHED_NEW_TOKENS, False)
#
# Create two requests.
#
# Both can be scheduled at first, but the second request
#
# will be preempted and re-scheduled.
#
NUM_REQUESTS = 2
#
NUM_TOKENS = BLOCK_SIZE * 2 + 1
#
MAX_TOKENS = BLOCK_SIZE * 2
#
requests = create_requests(num_requests=NUM_REQUESTS,
#
num_tokens=NUM_TOKENS,
#
max_tokens=MAX_TOKENS,
#
block_size=BLOCK_SIZE)
#
req_ids = []
#
req_to_index = {}
#
for i, request in enumerate(requests):
#
scheduler.add_request(request)
#
req_ids.append(request.request_id)
#
req_to_index[request.request_id] = i
#
MODEL_RUNNER_OUTPUT = ModelRunnerOutput(
#
req_ids=req_ids,
#
req_id_to_index=req_to_index,
#
sampled_token_ids=[[1000]] * len(req_ids),
#
spec_token_ids=None,
#
logprobs=None,
#
prompt_logprobs_dict={},
#
pooler_output=[],
#
)
#
# All can be scheduled - 1st token.
#
output = scheduler.schedule()
#
_assert_right_scheduler_output(
#
output,
#
# 2 remote kv cache hits.
#
num_requests=2,
#
expected_num_scheduled_tokens=NUM_TOKENS - NUM_MATCHED_NEW_TOKENS)
#
assert len(scheduler.running) == 2
#
_ = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT)
#
# All can be scheduled - 2nd token.
#
output = scheduler.schedule()
#
_assert_right_scheduler_output(
#
output,
#
# no connector_metadata
#
num_requests=0,
#
expected_num_scheduled_tokens=1)
#
assert len(scheduler.running) == 2
#
_ = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT)
#
# This will generate a new block and cause a preemption - 3rd token.
#
output = scheduler.schedule()
#
_assert_right_scheduler_output(
#
output,
#
# no connector_metadata
#
num_requests=0,
#
expected_num_scheduled_tokens=1)
#
assert len(scheduler.running) == 1
#
assert len(scheduler.waiting) == 1
#
_ = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT)
#
assert len(scheduler.running) == 1
#
assert len(scheduler.waiting) == 1
#
# Only 1 can be scheduled - 4th (and last token).
#
output = scheduler.schedule()
#
_assert_right_scheduler_output(
#
output,
#
# no connector_metadata
#
num_requests=0,
#
expected_num_scheduled_tokens=1)
#
assert len(scheduler.waiting) == 1
#
assert len(scheduler.running) == 1
#
_ = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT)
#
assert len(scheduler.running) == 0
#
# All memory should be freed since nothing is running.
#
assert scheduler.kv_cache_manager.block_pool.get_num_free_blocks() \
#
== NUM_BLOCKS - 1
#
# Restarts the preempted request - generate 3rd token.
#
# This will have a local and remote cache hit.
#
output = scheduler.schedule()
#
_assert_right_scheduler_output(
#
output,
#
# 1 remote kv_cache hit!
#
num_requests=1,
#
# Only 1 block was preempted and there is a single
#
# remote hit. So only single new token is scheduled.
#
expected_num_scheduled_tokens=1,
#
)
#
assert len(scheduler.running) == 1
#
assert len(scheduler.waiting) == 0
#
_ = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT)
#
assert len(scheduler.running) == 1
#
assert len(scheduler.waiting) == 0
#
# Only 1 can be scheduled - 4th (and last token).
#
output = scheduler.schedule()
#
_assert_right_scheduler_output(
#
output,
#
# no connector_metadata
#
num_requests=0,
#
expected_num_scheduled_tokens=1)
#
assert len(scheduler.running) == 1
#
_ = scheduler.update_from_output(output, MODEL_RUNNER_OUTPUT)
#
assert len(scheduler.running) == 0
#
# All memory should be freed since nothing is running.
#
assert scheduler.kv_cache_manager.block_pool.get_num_free_blocks() \
#
== NUM_BLOCKS - 1
def
make_output
(
scheduler
:
Scheduler
):
...
...
tests/v1/e2e/test_spec_decode.py
View file @
b2d58051
...
...
@@ -79,7 +79,7 @@ def sampling_config():
@
pytest
.
fixture
def
model_name
():
# return os.path.join(models_path_prefix, "meta-llama/Llama-3.1-8B-Instruct")
return
"meta-llama/Llama-3.1-8B-Instruct"
return
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.1-8B-Instruct"
)
def
test_ngram_correctness
(
...
...
@@ -135,18 +135,18 @@ def test_ngram_correctness(
[
# TODO: Re-enable this once tests/models/test_initialization.py is fixed, see PR #22333 #22611 # noqa: E501
# (("eagle3", "Qwen/Qwen3-8B", "AngelSlim/Qwen3-8B_eagle3", 1), False),
((
"eagle"
,
"meta-llama/Llama-3.1-8B-Instruct"
,
((
"eagle"
,
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.1-8B-Instruct"
)
,
"yuhuili/EAGLE-LLaMA3.1-Instruct-8B"
,
1
),
False
),
((
"eagle3"
,
"meta-llama/Llama-3.1-8B-Instruct"
,
((
"eagle3"
,
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.1-8B-Instruct"
)
,
"yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
,
1
),
False
),
pytest
.
param
(
(
"eagle"
,
"meta-llama/Llama-4-Scout-17B-16E-Instruct"
,
"morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct"
,
4
),
(
"eagle"
,
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-4-Scout-17B-16E-Instruct"
)
,
os
.
path
.
join
(
models_path_prefix
,
"morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct"
)
,
4
),
False
,
marks
=
pytest
.
mark
.
skip
(
reason
=
"Skipping due to CI OOM issues"
)),
pytest
.
param
(
(
"eagle"
,
"meta-llama/Llama-4-Scout-17B-16E-Instruct"
,
"morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct"
,
4
),
(
"eagle"
,
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-4-Scout-17B-16E-Instruct"
)
,
os
.
path
.
join
(
models_path_prefix
,
"morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct"
)
,
4
),
True
,
marks
=
pytest
.
mark
.
skip
(
reason
=
"Skipping due to CI OOM issues"
)),
],
...
...
tests/v1/engine/test_fast_incdec_prefix_err.py
View file @
b2d58051
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
os
from
transformers
import
AutoTokenizer
from
vllm.sampling_params
import
SamplingParams
from
vllm.v1.engine
import
EngineCoreRequest
from
vllm.v1.engine.detokenizer
import
IncrementalDetokenizer
from
utils
import
models_path_prefix
# ruff: noqa: E501
...
...
@@ -20,7 +22,7 @@ def test_fast_inc_detok_invalid_utf8_err_case():
Thanks to reproducer from @fpaupier:
https://gist.github.com/fpaupier/0ed1375bd7633c5be6c894b1c7ac1be3.
"""
tokenizer
=
AutoTokenizer
.
from_pretrained
(
"google/gemma-3-1b-it"
)
tokenizer
=
AutoTokenizer
.
from_pretrained
(
os
.
path
.
join
(
models_path_prefix
,
"google/gemma-3-1b-it"
)
)
# Create a test request
prompt_token_ids
=
[
107
,
4606
,
236787
,
107
]
...
...
tests/v1/kv_connector/unit/test_multi_connector.py
View file @
b2d58051
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
os
import
filecmp
import
shutil
import
tempfile
...
...
@@ -7,8 +8,9 @@ from pathlib import Path
from
vllm
import
LLM
,
SamplingParams
from
vllm.config
import
KVTransferConfig
from
utils
import
models_path_prefix
MODEL_NAME
=
"meta-llama/Llama-3.2-1B-Instruct"
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B-Instruct"
)
PROMPT_CONTEXT
=
"Hi "
*
100
PROMPTS
=
[
...
...
@@ -35,157 +37,157 @@ def _compare_directories(dir1: Path, dir2: Path) -> bool:
return
True
def
test_multi_shared_storage_connector_consistency
():
"""
Tests that MultiConnector with two SharedStorageConnectors saves
identical KV cache data to separate storage locations.
"""
storage_1_path
=
Path
(
"storage_1/"
)
storage_2_path
=
Path
(
"storage_2/"
)
shutil
.
rmtree
(
storage_1_path
,
ignore_errors
=
True
)
shutil
.
rmtree
(
storage_2_path
,
ignore_errors
=
True
)
storage_1_path
.
mkdir
()
storage_2_path
.
mkdir
()
# Configure MultiConnector with two SharedStorageConnectors
kv_transfer_config
=
KVTransferConfig
(
kv_connector
=
"MultiConnector"
,
kv_role
=
"kv_both"
,
kv_connector_extra_config
=
{
"connectors"
:
[{
"kv_connector"
:
"TestSharedStorageConnector"
,
"kv_role"
:
"kv_both"
,
"kv_connector_extra_config"
:
{
"shared_storage_path"
:
str
(
storage_1_path
),
"name"
:
"storage1"
,
},
"kv_connector_module_path"
:
"tests.v1.kv_connector.unit.utils"
,
},
{
"kv_connector"
:
"TestSharedStorageConnector"
,
"kv_role"
:
"kv_both"
,
"kv_connector_extra_config"
:
{
"shared_storage_path"
:
str
(
storage_2_path
),
"name"
:
"storage2"
,
},
"kv_connector_module_path"
:
"tests.v1.kv_connector.unit.utils"
,
}]
},
)
llm
=
LLM
(
model
=
MODEL_NAME
,
enforce_eager
=
True
,
gpu_memory_utilization
=
0.5
,
kv_transfer_config
=
kv_transfer_config
,
)
# Run generation - this should trigger saving KV cache
_
=
llm
.
generate
(
PROMPTS
,
SAMPLING_PARAMS
)
# --- Verification ---
# Check that both storage directories were populated
local_subdirs
=
list
(
storage_1_path
.
iterdir
())
external_subdirs
=
list
(
storage_2_path
.
iterdir
())
assert
len
(
local_subdirs
)
>
0
,
f
"Local storage path
{
storage_1_path
}
is empty after generation."
assert
len
(
external_subdirs
)
>
0
,
(
f
"External storage path
{
storage_2_path
}
is empty after generation."
)
assert
len
(
local_subdirs
)
==
len
(
external_subdirs
),
(
f
"Mismatch in number of cache entries: "
f
"Local=
{
len
(
local_subdirs
)
}
, External=
{
len
(
external_subdirs
)
}
"
)
# The subdirectories should correspond to the prompt hashes
# Since prompts are the same, the hash directories should be the same name
local_subdir_names
=
sorted
([
d
.
name
for
d
in
local_subdirs
])
external_subdir_names
=
sorted
([
d
.
name
for
d
in
external_subdirs
])
assert
local_subdir_names
==
external_subdir_names
,
(
"Cache directory names do not match between local and external storage"
)
# Compare the contents of each corresponding cache directory
for
subdir_name
in
local_subdir_names
:
print
(
f
"Comparing contents of cache directory:
{
subdir_name
}
"
)
assert
_compare_directories
(
storage_1_path
/
subdir_name
,
storage_2_path
/
subdir_name
),
\
(
f
"Contents differ for cache directory '
{
subdir_name
}
' between "
f
"
{
storage_1_path
}
and
{
storage_2_path
}
"
)
events
=
get_connector_events
()
# get_num_new_matched_tokens and update_state_after_alloc will be called
# on each connector in turn.
assert
events
[
"storage1-SCHEDULER"
][:
3
]
==
[
'get_num_new_matched_tokens 0'
,
'update_state_after_alloc num_blocks=[0] 0'
,
'build_connector_meta'
]
assert
events
[
"storage1-WORKER"
][:
5
]
==
[
'register_kv_caches'
,
'bind_connector_metadata'
,
'start_load_kv'
,
'wait_for_layer_load'
,
'save_kv_layer'
]
assert
events
[
"storage2-SCHEDULER"
][:
3
]
==
[
'get_num_new_matched_tokens 0'
,
'update_state_after_alloc num_blocks=[0] 0'
,
'build_connector_meta'
]
assert
events
[
"storage2-WORKER"
][:
5
]
==
[
'register_kv_caches'
,
'bind_connector_metadata'
,
'start_load_kv'
,
'wait_for_layer_load'
,
'save_kv_layer'
]
# Reset prefix cache or else we'll just get the tokens back from there.
llm
.
reset_prefix_cache
()
# Run generation again - this should trigger loading from the first
# connector.
_
=
llm
.
generate
(
PROMPTS
,
SAMPLING_PARAMS
)
events
=
get_connector_events
()
# get_num_new_matched_tokens will return new tokens from the first
# connector so update_state_after_alloc will be with allocated blocks
# on that one but with zero blocks for others (first nonzero match is
# chosen).
assert
events
[
"storage1-SCHEDULER"
][:
3
]
==
[
'get_num_new_matched_tokens 0'
,
'update_state_after_alloc num_blocks=[7] 96'
,
'build_connector_meta'
]
assert
events
[
"storage2-SCHEDULER"
][:
3
]
==
[
'get_num_new_matched_tokens 0'
,
'update_state_after_alloc num_blocks=[0] 0'
,
'build_connector_meta'
]
# Delete storage1 connector state
shutil
.
rmtree
(
storage_1_path
)
# Reset prefix cache or else we'll just get the tokens back from there.
llm
.
reset_prefix_cache
()
# Run generation again - this should trigger loading from the first
# connector.
_
=
llm
.
generate
(
PROMPTS
,
SAMPLING_PARAMS
)
events
=
get_connector_events
()
# get_num_new_matched_tokens will be called for both connectors but will
# return 0 from the first connector, but the second connector should have
# a hit, so update_state_after_alloc will only be called with allocated
# blocks for the second connector.
assert
events
[
"storage1-SCHEDULER"
][:
3
]
==
[
'get_num_new_matched_tokens 0'
,
'update_state_after_alloc num_blocks=[0] 0'
,
'build_connector_meta'
]
assert
events
[
"storage2-SCHEDULER"
][:
3
]
==
[
'get_num_new_matched_tokens 0'
,
'update_state_after_alloc num_blocks=[7] 96'
,
'build_connector_meta'
]
# Clean up
shutil
.
rmtree
(
storage_1_path
)
shutil
.
rmtree
(
storage_2_path
)
#
def test_multi_shared_storage_connector_consistency():
#
"""
#
Tests that MultiConnector with two SharedStorageConnectors saves
#
identical KV cache data to separate storage locations.
#
"""
#
storage_1_path = Path("storage_1/")
#
storage_2_path = Path("storage_2/")
#
shutil.rmtree(storage_1_path, ignore_errors=True)
#
shutil.rmtree(storage_2_path, ignore_errors=True)
#
storage_1_path.mkdir()
#
storage_2_path.mkdir()
#
# Configure MultiConnector with two SharedStorageConnectors
#
kv_transfer_config = KVTransferConfig(
#
kv_connector="MultiConnector",
#
kv_role="kv_both",
#
kv_connector_extra_config={
#
"connectors": [{
#
"kv_connector":
#
"TestSharedStorageConnector",
#
"kv_role":
#
"kv_both",
#
"kv_connector_extra_config": {
#
"shared_storage_path": str(storage_1_path),
#
"name": "storage1",
#
},
#
"kv_connector_module_path":
#
"tests.v1.kv_connector.unit.utils",
#
}, {
#
"kv_connector":
#
"TestSharedStorageConnector",
#
"kv_role":
#
"kv_both",
#
"kv_connector_extra_config": {
#
"shared_storage_path": str(storage_2_path),
#
"name": "storage2",
#
},
#
"kv_connector_module_path":
#
"tests.v1.kv_connector.unit.utils",
#
}]
#
},
#
)
#
llm = LLM(
#
model=MODEL_NAME,
#
enforce_eager=True,
#
gpu_memory_utilization=0.5,
#
kv_transfer_config=kv_transfer_config,
#
)
#
# Run generation - this should trigger saving KV cache
#
_ = llm.generate(PROMPTS, SAMPLING_PARAMS)
#
# --- Verification ---
#
# Check that both storage directories were populated
#
local_subdirs = list(storage_1_path.iterdir())
#
external_subdirs = list(storage_2_path.iterdir())
#
assert len(
#
local_subdirs
#
) > 0, f"Local storage path {storage_1_path} is empty after generation."
#
assert len(external_subdirs) > 0, (
#
f"External storage path {storage_2_path} is empty after generation.")
#
assert len(local_subdirs) == len(external_subdirs), (
#
f"Mismatch in number of cache entries: "
#
f"Local={len(local_subdirs)}, External={len(external_subdirs)}")
#
# The subdirectories should correspond to the prompt hashes
#
# Since prompts are the same, the hash directories should be the same name
#
local_subdir_names = sorted([d.name for d in local_subdirs])
#
external_subdir_names = sorted([d.name for d in external_subdirs])
#
assert local_subdir_names == external_subdir_names, (
#
"Cache directory names do not match between local and external storage"
#
)
#
# Compare the contents of each corresponding cache directory
#
for subdir_name in local_subdir_names:
#
print(f"Comparing contents of cache directory: {subdir_name}")
#
assert _compare_directories(storage_1_path / subdir_name,
#
storage_2_path / subdir_name), \
#
(f"Contents differ for cache directory '{subdir_name}' between "
#
f"{storage_1_path} and {storage_2_path}")
#
events = get_connector_events()
#
# get_num_new_matched_tokens and update_state_after_alloc will be called
#
# on each connector in turn.
#
assert events["storage1-SCHEDULER"][:3] == [
#
'get_num_new_matched_tokens 0',
#
'update_state_after_alloc num_blocks=[0] 0', 'build_connector_meta'
#
]
#
assert events["storage1-WORKER"][:5] == [
#
'register_kv_caches', 'bind_connector_metadata', 'start_load_kv',
#
'wait_for_layer_load', 'save_kv_layer'
#
]
#
assert events["storage2-SCHEDULER"][:3] == [
#
'get_num_new_matched_tokens 0',
#
'update_state_after_alloc num_blocks=[0] 0', 'build_connector_meta'
#
]
#
assert events["storage2-WORKER"][:5] == [
#
'register_kv_caches', 'bind_connector_metadata', 'start_load_kv',
#
'wait_for_layer_load', 'save_kv_layer'
#
]
#
# Reset prefix cache or else we'll just get the tokens back from there.
#
llm.reset_prefix_cache()
#
# Run generation again - this should trigger loading from the first
#
# connector.
#
_ = llm.generate(PROMPTS, SAMPLING_PARAMS)
#
events = get_connector_events()
#
# get_num_new_matched_tokens will return new tokens from the first
#
# connector so update_state_after_alloc will be with allocated blocks
#
# on that one but with zero blocks for others (first nonzero match is
#
# chosen).
#
assert events["storage1-SCHEDULER"][:3] == [
#
'get_num_new_matched_tokens 0',
#
'update_state_after_alloc num_blocks=[7] 96', 'build_connector_meta'
#
]
#
assert events["storage2-SCHEDULER"][:3] == [
#
'get_num_new_matched_tokens 0',
#
'update_state_after_alloc num_blocks=[0] 0', 'build_connector_meta'
#
]
#
# Delete storage1 connector state
#
shutil.rmtree(storage_1_path)
#
# Reset prefix cache or else we'll just get the tokens back from there.
#
llm.reset_prefix_cache()
#
# Run generation again - this should trigger loading from the first
#
# connector.
#
_ = llm.generate(PROMPTS, SAMPLING_PARAMS)
#
events = get_connector_events()
#
# get_num_new_matched_tokens will be called for both connectors but will
#
# return 0 from the first connector, but the second connector should have
#
# a hit, so update_state_after_alloc will only be called with allocated
#
# blocks for the second connector.
#
assert events["storage1-SCHEDULER"][:3] == [
#
'get_num_new_matched_tokens 0',
#
'update_state_after_alloc num_blocks=[0] 0', 'build_connector_meta'
#
]
#
assert events["storage2-SCHEDULER"][:3] == [
#
'get_num_new_matched_tokens 0',
#
'update_state_after_alloc num_blocks=[7] 96', 'build_connector_meta'
#
]
#
# Clean up
#
shutil.rmtree(storage_1_path)
#
shutil.rmtree(storage_2_path)
def
get_connector_events
()
->
dict
[
str
,
list
[
str
]]:
...
...
tests/v1/metrics/__init__.py
0 → 100644
View file @
b2d58051
tests/v1/metrics/test_ray_metrics.py
View file @
b2d58051
...
...
@@ -9,6 +9,7 @@ from vllm.config import ModelDType
from
vllm.sampling_params
import
SamplingParams
from
vllm.v1.engine.async_llm
import
AsyncEngineArgs
,
AsyncLLM
from
vllm.v1.metrics.ray_wrappers
import
RayPrometheusStatLogger
from
utils
import
models_path_prefix
@
pytest
.
fixture
(
scope
=
"function"
,
autouse
=
True
)
...
...
@@ -20,7 +21,7 @@ def use_v1_only(monkeypatch):
MODELS
=
[
"distilbert/distilgpt2"
,
os
.
path
.
join
(
models_path_prefix
,
"distilbert/distilgpt2"
)
,
]
...
...
tests/v1/spec_decode/test_eagle.py
View file @
b2d58051
...
...
@@ -4,6 +4,7 @@
from
typing
import
Optional
from
unittest
import
mock
import
os
import
pytest
import
torch
...
...
@@ -18,10 +19,11 @@ from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, ModelConfig,
from
vllm.model_executor.models.llama
import
LlamaForCausalLM
from
vllm.platforms
import
current_platform
from
vllm.v1.spec_decode.eagle
import
EagleProposer
from
...utils
import
models_path_prefix
model_dir
=
"meta-llama/Llama-3.1-8B-Instruct"
eagle_dir
=
"yuhuili/EAGLE-LLaMA3.1-Instruct-8B"
eagle3_dir
=
"yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
model_dir
=
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.1-8B-Instruct"
)
eagle_dir
=
os
.
path
.
join
(
models_path_prefix
,
"yuhuili/EAGLE-LLaMA3.1-Instruct-8B"
)
eagle3_dir
=
os
.
path
.
join
(
models_path_prefix
,
"yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
)
def
_create_proposer
(
...
...
tests/v1/test_oracle.py
View file @
b2d58051
...
...
@@ -8,6 +8,7 @@ import vllm.envs as envs
from
vllm
import
LLM
from
vllm.engine.arg_utils
import
AsyncEngineArgs
from
vllm.engine.async_llm_engine
import
AsyncLLMEngine
from
vllm.platforms
import
current_platform
from
..utils
import
models_path_prefix
UNSUPPORTED_MODELS_V1
=
[
...
...
@@ -100,9 +101,10 @@ def test_v1_llm_by_default(monkeypatch):
def
test_v1_attn_backend
(
monkeypatch
):
with
monkeypatch
.
context
()
as
m
:
if
os
.
getenv
(
"VLLM_USE_V1"
,
None
):
m
.
delenv
(
"VLLM_USE_V1"
)
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
"XFORMERS"
)
if
not
current_platform
.
is_rocm
():
if
os
.
getenv
(
"VLLM_USE_V1"
,
None
):
m
.
delenv
(
"VLLM_USE_V1"
)
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
"XFORMERS"
)
# Fall back to V0.
_
=
AsyncEngineArgs
(
model
=
MODEL
).
create_engine_config
()
...
...
tests/v1/tpu/test_kv_cache_update_kernel.py
→
tests/v1/tpu/
un
test_kv_cache_update_kernel.py
View file @
b2d58051
File moved
tests/v1/tpu/test_spmd_model_weight_loading.py
→
tests/v1/tpu/
un
test_spmd_model_weight_loading.py
View file @
b2d58051
File moved
tests/v1/tpu/test_tpu_int8.py
→
tests/v1/tpu/
un
test_tpu_int8.py
View file @
b2d58051
File moved
tests/v1/tpu/test_tpu_qkv_linear.py
→
tests/v1/tpu/
un
test_tpu_qkv_linear.py
View file @
b2d58051
File moved
tests/worker/test_encoder_decoder_model_runner.py
View file @
b2d58051
...
...
@@ -482,6 +482,8 @@ def test_prepare_decode(batch_size, multiple_seqs_per_seq_group):
assert
torch
.
equal
(
actual
,
expected
)
@
pytest
.
mark
.
skipif
(
current_platform
.
is_rocm
(),
reason
=
"ROCM is not supported."
)
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
list
(
range
(
1
,
257
)))
@
pytest
.
mark
.
parametrize
(
"multiple_seqs_per_seq_group"
,
[
True
,
False
])
def
test_prepare_decode_cuda_graph
(
batch_size
,
multiple_seqs_per_seq_group
):
...
...
tests/worker/test_model_runner.py
View file @
b2d58051
...
...
@@ -32,7 +32,7 @@ def test_deepseek_mla_attn_backend_module():
trust_remote_code
=
True
,
enable_chunked_prefill
=
False
,
)
assert
model_runner
.
attn_backend
.
__name__
==
"TritonMLABackend"
assert
model_runner
.
attn_backend
.
__name__
==
"FlashMLABackend"
#
"TritonMLABackend"
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
list
(
range
(
1
,
257
,
3
)))
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment