Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
d2b52805
Commit
d2b52805
authored
Sep 07, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.10.2rc1' into v0.10.2rc1-ori
parents
9a521c23
5438967f
Changes
511
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1181 additions
and
515 deletions
+1181
-515
requirements/test.txt
requirements/test.txt
+24
-21
requirements/tpu.txt
requirements/tpu.txt
+1
-0
setup.py
setup.py
+15
-4
tests/basic_correctness/test_basic_correctness.py
tests/basic_correctness/test_basic_correctness.py
+1
-8
tests/basic_correctness/test_chunked_prefill.py
tests/basic_correctness/test_chunked_prefill.py
+0
-296
tests/basic_correctness/test_cumem.py
tests/basic_correctness/test_cumem.py
+31
-0
tests/benchmarks/test_random_dataset.py
tests/benchmarks/test_random_dataset.py
+344
-0
tests/compile/piecewise/test_multiple_graphs.py
tests/compile/piecewise/test_multiple_graphs.py
+36
-101
tests/compile/test_basic_correctness.py
tests/compile/test_basic_correctness.py
+1
-1
tests/compile/test_decorator.py
tests/compile/test_decorator.py
+251
-0
tests/compile/test_full_graph.py
tests/compile/test_full_graph.py
+0
-6
tests/compile/test_functionalization.py
tests/compile/test_functionalization.py
+3
-2
tests/compile/test_fusion.py
tests/compile/test_fusion.py
+12
-13
tests/compile/test_fusion_all_reduce.py
tests/compile/test_fusion_all_reduce.py
+1
-1
tests/compile/test_fusion_attn.py
tests/compile/test_fusion_attn.py
+326
-5
tests/compile/test_sequence_parallelism.py
tests/compile/test_sequence_parallelism.py
+1
-2
tests/compile/test_silu_mul_quant_fusion.py
tests/compile/test_silu_mul_quant_fusion.py
+74
-32
tests/conftest.py
tests/conftest.py
+53
-10
tests/core/block/e2e/test_correctness_sliding_window.py
tests/core/block/e2e/test_correctness_sliding_window.py
+2
-6
tests/distributed/test_comm_ops.py
tests/distributed/test_comm_ops.py
+5
-7
No files found.
Too many changes to show.
To preserve performance only
511 of 511+
files are displayed.
Plain diff
Email patch
requirements/test.txt
View file @
d2b52805
...
...
@@ -156,6 +156,8 @@ datasets==3.0.2
# mteb
decorator==5.1.1
# via librosa
decord==0.6.0
# via -r requirements/test.in
dill==0.3.8
# via
# datasets
...
...
@@ -408,7 +410,7 @@ lightning-utilities==0.14.3
# torchmetrics
llvmlite==0.44.0
# via numba
lm-eval
==0.4.8
lm-eval
@ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d
# via -r requirements/test.in
lxml==5.3.0
# via
...
...
@@ -493,6 +495,7 @@ numpy==1.26.4
# contourpy
# cupy-cuda12x
# datasets
# decord
# einx
# encodec
# evaluate
...
...
@@ -538,42 +541,42 @@ numpy==1.26.4
# tritonclient
# vocos
# xarray
nvidia-cublas-cu12==12.8.
3
.1
4
nvidia-cublas-cu12==12.8.
4
.1
# via
# nvidia-cudnn-cu12
# nvidia-cusolver-cu12
# torch
nvidia-cuda-cupti-cu12==12.8.
57
nvidia-cuda-cupti-cu12==12.8.
90
# via torch
nvidia-cuda-nvrtc-cu12==12.8.
61
nvidia-cuda-nvrtc-cu12==12.8.
93
# via torch
nvidia-cuda-runtime-cu12==12.8.
57
nvidia-cuda-runtime-cu12==12.8.
90
# via torch
nvidia-cudnn-cu12==9.
7.1
.2
6
nvidia-cudnn-cu12==9.
10.2
.2
1
# via torch
nvidia-cufft-cu12==11.3.3.
41
nvidia-cufft-cu12==11.3.3.
83
# via torch
nvidia-cufile-cu12==1.13.
0.11
nvidia-cufile-cu12==1.13.
1.3
# via torch
nvidia-curand-cu12==10.3.9.
55
nvidia-curand-cu12==10.3.9.
90
# via torch
nvidia-cusolver-cu12==11.7.
2.55
nvidia-cusolver-cu12==11.7.
3.90
# via torch
nvidia-cusparse-cu12==12.5.
7.5
3
nvidia-cusparse-cu12==12.5.
8.9
3
# via
# nvidia-cusolver-cu12
# torch
nvidia-cusparselt-cu12==0.
6.3
nvidia-cusparselt-cu12==0.
7.1
# via torch
nvidia-nccl-cu12==2.2
6.2
nvidia-nccl-cu12==2.2
7.3
# via torch
nvidia-nvjitlink-cu12==12.8.
61
nvidia-nvjitlink-cu12==12.8.
93
# via
# nvidia-cufft-cu12
# nvidia-cusolver-cu12
# nvidia-cusparse-cu12
# torch
nvidia-nvtx-cu12==12.8.
55
nvidia-nvtx-cu12==12.8.
90
# via torch
omegaconf==2.3.0
# via
...
...
@@ -742,7 +745,7 @@ pycparser==2.22
# via cffi
pycryptodomex==3.22.0
# via blobfile
pydantic==2.11.
5
pydantic==2.11.
7
# via
# -r requirements/test.in
# albumentations
...
...
@@ -1066,7 +1069,7 @@ tomli==2.2.1
# via schemathesis
tomli-w==1.2.0
# via schemathesis
torch==2.
7.1
+cu128
torch==2.
8.0
+cu128
# via
# -r requirements/test.in
# accelerate
...
...
@@ -1095,7 +1098,7 @@ torch==2.7.1+cu128
# torchvision
# vector-quantize-pytorch
# vocos
torchaudio==2.
7.1
+cu128
torchaudio==2.
8.0
+cu128
# via
# -r requirements/test.in
# encodec
...
...
@@ -1108,7 +1111,7 @@ torchmetrics==1.7.4
# pytorch-lightning
# terratorch
# torchgeo
torchvision==0.2
2.1
+cu128
torchvision==0.2
3.0
+cu128
# via
# -r requirements/test.in
# lightly
...
...
@@ -1139,7 +1142,7 @@ tqdm==4.66.6
# transformers
tqdm-multiprocess==0.0.11
# via lm-eval
transformers==4.55.
0
transformers==4.55.
2
# via
# -r requirements/test.in
# genai-perf
...
...
@@ -1149,7 +1152,7 @@ transformers==4.55.0
# transformers-stream-generator
transformers-stream-generator==0.0.5
# via -r requirements/test.in
triton==3.
3.1
triton==3.
4.0
# via torch
tritonclient==2.51.0
# via
...
...
requirements/tpu.txt
View file @
d2b52805
...
...
@@ -11,6 +11,7 @@ ray[default]
ray[data]
setuptools==78.1.0
nixl==0.3.0
tpu_info==0.4.0
# Install torch_xla
--pre
...
...
setup.py
View file @
d2b52805
...
...
@@ -643,16 +643,25 @@ if envs.VLLM_USE_PRECOMPILED:
if
wheel_location
is
not
None
:
wheel_url
=
wheel_location
else
:
import
platform
arch
=
platform
.
machine
()
if
arch
==
"x86_64"
:
wheel_tag
=
"manylinux1_x86_64"
elif
arch
==
"aarch64"
:
wheel_tag
=
"manylinux2014_aarch64"
else
:
raise
ValueError
(
f
"Unsupported architecture:
{
arch
}
"
)
base_commit
=
precompiled_wheel_utils
.
get_base_commit_in_main_branch
()
wheel_url
=
f
"https://wheels.vllm.ai/
{
base_commit
}
/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
wheel_url
=
f
"https://wheels.vllm.ai/
{
base_commit
}
/vllm-1.0.0.dev-cp38-abi3-
{
wheel_tag
}
.whl"
nightly_wheel_url
=
f
"https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-
{
wheel_tag
}
.whl"
from
urllib.request
import
urlopen
try
:
with
urlopen
(
wheel_url
)
as
resp
:
if
resp
.
status
!=
200
:
wheel_url
=
"https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
wheel_url
=
nightly_wheel_url
except
Exception
as
e
:
print
(
f
"[warn] Falling back to nightly wheel:
{
e
}
"
)
wheel_url
=
"https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
wheel_url
=
nightly_wheel_url
patch
=
precompiled_wheel_utils
.
extract_precompiled_and_patch_package
(
wheel_url
)
...
...
@@ -685,7 +694,9 @@ setup(
"mistral_common[audio]"
],
# Required for audio processing
"video"
:
[],
# Kept for backwards compatibility
# FlashInfer should be updated together with the Dockerfile
"flashinfer"
:
[
"flashinfer-python==0.2.11"
],
"flashinfer"
:
[
"flashinfer-python==0.2.14.post1"
],
# Optional deps for AMD FP4 quantization support
"petit-kernel"
:
[
"petit-kernel"
],
},
cmdclass
=
cmdclass
,
package_data
=
package_data
,
...
...
tests/basic_correctness/test_basic_correctness.py
View file @
d2b52805
...
...
@@ -12,7 +12,6 @@ import pytest
import
torch
from
vllm
import
LLM
,
envs
from
vllm.platforms
import
current_platform
from
vllm.v1.engine.llm_engine
import
LLMEngine
as
LLMEngineV1
from
..conftest
import
HfRunner
,
VllmRunner
...
...
@@ -78,11 +77,7 @@ def test_models(
"VLLM_USE_V1"
)
and
envs
.
VLLM_USE_V1
:
pytest
.
skip
(
"enable_prompt_embeds is not supported in v1."
)
if
backend
==
"FLASHINFER"
and
current_platform
.
is_rocm
():
pytest
.
skip
(
"Flashinfer does not support ROCm/HIP."
)
if
backend
in
(
"XFORMERS"
,
"FLASHINFER"
)
and
model
==
"google/gemma-2-2b-it"
:
if
backend
==
"XFORMERS"
and
model
==
"google/gemma-2-2b-it"
:
pytest
.
skip
(
f
"
{
backend
}
does not support gemma2 with full context length."
)
...
...
@@ -141,8 +136,6 @@ def test_models(
(
"meta-llama/Llama-3.2-1B-Instruct"
,
"mp"
,
""
,
"L4"
,
{}),
(
"distilbert/distilgpt2"
,
"ray"
,
""
,
"A100"
,
{}),
(
"distilbert/distilgpt2"
,
"mp"
,
""
,
"A100"
,
{}),
(
"distilbert/distilgpt2"
,
"mp"
,
"FLASHINFER"
,
"A100"
,
{}),
(
"meta-llama/Meta-Llama-3-8B"
,
"ray"
,
"FLASHINFER"
,
"A100"
,
{}),
])
@
pytest
.
mark
.
parametrize
(
"enable_prompt_embeds"
,
[
True
,
False
])
def
test_models_distributed
(
...
...
tests/basic_correctness/test_chunked_prefill.py
deleted
100644 → 0
View file @
9a521c23
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Compare the outputs of HF and vLLM when using greedy sampling.
It tests chunked prefill. Chunked prefill can be enabled by
enable_chunked_prefill=True. If prefill size exceeds max_num_batched_tokens,
prefill requests are chunked.
Run `pytest tests/basic_correctness/test_chunked_prefill.py`.
"""
from
__future__
import
annotations
from
typing
import
TYPE_CHECKING
import
pytest
from
vllm.platforms
import
current_platform
from
vllm.utils
import
STR_BACKEND_ENV_VAR
from
..models.utils
import
check_logprobs_close
,
check_outputs_equal
from
..utils
import
multi_gpu_test
if
TYPE_CHECKING
:
from
.conftest
import
HfRunner
,
VllmRunner
# Model identifiers that the parametrized tests in this module iterate over.
MODELS = ["facebook/opt-125m", "meta-llama/Llama-3.2-1B-Instruct"]
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch: pytest.MonkeyPatch):
    """Pin ``VLLM_USE_V1`` to ``'0'`` for every test in this module.

    The fixture is function-scoped and autouse, so each test runs with the
    environment variable set inside a monkeypatch context.
    """
    with monkeypatch.context() as env_patch:
        env_patch.setenv('VLLM_USE_V1', '0')
        yield
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16])
@pytest.mark.parametrize("enforce_eager", [False, True])
# NOTE: Increasing this in this suite will fail CI because we currently cannot
# reset distributed env properly. Use a value > 1 just when you test.
@pytest.mark.parametrize("tensor_parallel_size", [1])
@pytest.mark.parametrize("attention_backend", [
    pytest.param("FLASHINFER",
                 marks=pytest.mark.skipif(
                     current_platform.is_rocm(),
                     reason="FLASHINFER isn't supported on ROCm")),
    "FLASH_ATTN"
])
def test_models(
    hf_runner: HfRunner,
    vllm_runner: VllmRunner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
    chunked_prefill_token_size: int,
    enforce_eager: bool,
    tensor_parallel_size: int,
    attention_backend: str,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Compare greedy decoding of HF and vLLM with chunked prefill enabled.

    ``chunked_prefill_token_size`` is used both as ``max_num_seqs`` and as
    ``max_num_batched_tokens`` for the vLLM runner. The outputs of the two
    runners are compared with ``check_outputs_equal``.
    """
    with monkeypatch.context() as env_patch:
        # Select the attention backend through the environment.
        env_patch.setenv(STR_BACKEND_ENV_VAR, attention_backend)

        # Reference generation (HuggingFace runner).
        with hf_runner(model, dtype=dtype) as reference_model:
            reference_outputs = reference_model.generate_greedy(
                example_prompts, max_tokens)

        # The same size limits both the batch token budget and the
        # number of sequences.
        engine_kwargs = dict(
            dtype=dtype,
            max_num_batched_tokens=chunked_prefill_token_size,
            enable_chunked_prefill=True,
            tensor_parallel_size=tensor_parallel_size,
            enforce_eager=enforce_eager,
            max_num_seqs=chunked_prefill_token_size,
        )
        with vllm_runner(model, **engine_kwargs) as engine:
            engine_outputs = engine.generate_greedy(example_prompts,
                                                    max_tokens)

        check_outputs_equal(
            outputs_0_lst=reference_outputs,
            outputs_1_lst=engine_outputs,
            name_0="hf",
            name_1="vllm",
        )
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("attention_backend", [
    pytest.param("FLASHINFER",
                 marks=pytest.mark.skipif(
                     current_platform.is_rocm(),
                     reason="FLASHINFER isn't supported on ROCm")),
    "FLASH_ATTN"
])
def test_models_distributed(
    hf_runner: HfRunner,
    vllm_runner: VllmRunner,
    example_prompts,
    model: str,
    distributed_executor_backend: str,
    attention_backend: str,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Compare HF and vLLM greedy outputs with tensor_parallel_size=2.

    vLLM runs with chunked prefill enabled and the requested distributed
    executor backend. For the Llama-3.2-1B-Instruct + "ray" combination two
    additional Ray-related environment variables are set.
    """
    with monkeypatch.context() as env_patch:
        env_patch.setenv(STR_BACKEND_ENV_VAR, attention_backend)

        wants_ray_compiled_graph = (
            model == "meta-llama/Llama-3.2-1B-Instruct"
            and distributed_executor_backend == "ray")
        if wants_ray_compiled_graph:
            # test Ray Compiled Graph
            for flag in ("VLLM_USE_RAY_SPMD_WORKER",
                         "VLLM_USE_RAY_COMPILED_DAG"):
                env_patch.setenv(flag, "1")

        dtype = "half"
        max_tokens = 5
        chunked_prefill_token_size = 16

        # Chunked prefill configuration.
        max_num_seqs = min(chunked_prefill_token_size, 256)
        assert chunked_prefill_token_size != -1
        enable_chunked_prefill = True
        max_num_batched_tokens = chunked_prefill_token_size

        # NOTE: take care of the order. run vLLM first, and then run HF.
        # vLLM needs a fresh new process without cuda initialization.
        # if we run HF first, the cuda initialization will be done and it
        # will hurt multiprocessing backend with
        # fork method (the default method).
        engine_kwargs = dict(
            dtype=dtype,
            tensor_parallel_size=2,
            max_num_seqs=max_num_seqs,
            enable_chunked_prefill=enable_chunked_prefill,
            max_num_batched_tokens=max_num_batched_tokens,
            distributed_executor_backend=distributed_executor_backend,
        )
        with vllm_runner(model, **engine_kwargs) as engine:
            engine_outputs = engine.generate_greedy(
                example_prompts,
                max_tokens,
            )

        with hf_runner(model, dtype=dtype) as reference_model:
            reference_outputs = reference_model.generate_greedy(
                example_prompts, max_tokens)

        check_outputs_equal(
            outputs_0_lst=reference_outputs,
            outputs_1_lst=engine_outputs,
            name_0="hf",
            name_1="vllm",
        )
@pytest.mark.parametrize(
    "kv_cache_dtype,model",
    [("fp8_e4m3",
      "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme")])
# Due to low-precision numerical divergence, we only test logprob of 4 tokens
@pytest.mark.parametrize("max_tokens", [4])
@pytest.mark.parametrize("chunked_prefill_token_size", [4, 16])
@pytest.mark.parametrize("enforce_eager", [False, True])
# NOTE: Increasing this in this suite will fail CI because we currently cannot
# reset distributed env properly. Use a value > 1 just when you test.
@pytest.mark.parametrize("tensor_parallel_size", [1])
# Due to low-precision numerical divergence, this test is too sensitive to
# the async postprocessor
@pytest.mark.parametrize("disable_async_output_proc", [True])
@pytest.mark.skipif(current_platform.is_rocm(),
                    reason="machete_prepack_B isn't supported on ROCm")
def test_models_with_fp8_kv_cache(
    vllm_runner: VllmRunner,
    example_prompts,
    kv_cache_dtype: str,
    model: str,
    max_tokens: int,
    chunked_prefill_token_size: int,
    enforce_eager: bool,
    tensor_parallel_size: int,
    disable_async_output_proc: bool,
) -> None:
    """Compare logprobs with and without chunked prefill under fp8 KV cache.

    Two vLLM runs are made with the same settings; only the second one
    enables chunked prefill and sets ``max_num_batched_tokens``. The
    original docstring notes that general fp8 kv-cache tests live in
    test_fp8.py, so only chunked prefill is checked here.
    """
    NUM_LOG_PROBS = 8

    # Arguments common to both runs.
    shared_kwargs = dict(
        tensor_parallel_size=tensor_parallel_size,
        enforce_eager=enforce_eager,
        max_num_seqs=chunked_prefill_token_size,
        kv_cache_dtype=kv_cache_dtype,
        disable_async_output_proc=disable_async_output_proc,
    )

    # Baseline: chunked prefill not requested.
    with vllm_runner(model, **shared_kwargs) as baseline_engine:
        no_chunked_prefill_outputs = baseline_engine.generate_greedy_logprobs(
            example_prompts, max_tokens, NUM_LOG_PROBS)

    # Same configuration with chunked prefill switched on.
    with vllm_runner(
            model,
            max_num_batched_tokens=chunked_prefill_token_size,
            enable_chunked_prefill=True,
            **shared_kwargs,
    ) as chunked_engine:
        chunked_prefill_outputs = chunked_engine.generate_greedy_logprobs(
            example_prompts, max_tokens, NUM_LOG_PROBS)

    check_logprobs_close(
        outputs_0_lst=no_chunked_prefill_outputs,
        outputs_1_lst=chunked_prefill_outputs,
        name_0="no_chunked_prefill",
        name_1="chunked_prefill",
    )
@pytest.mark.parametrize("max_tokens", [16])
@pytest.mark.parametrize("enforce_eager", [False])
@pytest.mark.parametrize("chunk_size", [30, 32])
# NOTE: Increasing this in this suite will fail CI because we currently cannot
# reset distributed env properly. Use a value > 1 just when you test.
@pytest.mark.parametrize("tensor_parallel_size", [1])
@pytest.mark.parametrize("dtype", ["half"])
def test_with_prefix_caching(
    vllm_runner: VllmRunner,
    max_tokens: int,
    enforce_eager: bool,
    chunk_size: int,
    tensor_parallel_size: int,
    dtype: str,
) -> None:
    """Check that prefix caching does not change greedy decode results.

    The same prompts are generated one by one with chunked prefill enabled,
    once with prefix caching on and once with it off, and the two result
    lists are compared with ``check_outputs_equal``.
    """
    model = "meta-llama/Llama-3.2-1B-Instruct"

    # Original note: the common prompt has 142 tokens with Llama-2 tokenizer
    # (not re-verified here against the model used above).
    common_prompt = "You are a helpful AI assistant " * 20
    unique_prompts = [
        "Question",  # Warmup
        "Question",  # Fully cached
        "Another question",  # Partial cached
    ]
    full_prompts = [f"{common_prompt}\n{p}" for p in unique_prompts]

    # chunk_size bounds both the batch token budget and the sequence count.
    max_num_batched_tokens = chunk_size
    max_num_seqs = chunk_size

    outputs = {}  # type: ignore
    for enable in (True, False):
        with vllm_runner(
                model,
                dtype=dtype,
                max_num_batched_tokens=max_num_batched_tokens,
                enable_chunked_prefill=True,
                enable_prefix_caching=enable,
                tensor_parallel_size=tensor_parallel_size,
                enforce_eager=enforce_eager,
                max_num_seqs=max_num_seqs,
        ) as engine:
            collected = []
            outputs[enable] = collected
            # One request at a time, in prompt order.
            for prompt in full_prompts:
                collected.extend(
                    engine.generate_greedy(
                        [prompt],
                        max_tokens,
                    ))

    check_outputs_equal(
        outputs_0_lst=outputs[False],
        outputs_1_lst=outputs[True],
        name_0="w/o prefix caching",
        name_1="with prefix caching",
    )
tests/basic_correctness/test_cumem.py
View file @
d2b52805
...
...
@@ -177,3 +177,34 @@ def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool):
# cmp output
assert
output
[
0
].
outputs
[
0
].
text
==
output3
[
0
].
outputs
[
0
].
text
@create_new_process_for_each_test()
def test_deep_sleep():
    """Exercise sleep level 2 followed by a staged wake-up.

    After ``sleep(level=2)`` the GPU memory used beyond the baseline must be
    below 3 GiB; after waking the "weights" tag and calling
    ``reload_weights`` it must be below 4 GiB. Once the "kv_cache" tag is
    woken up, generating again must reproduce the first output text.
    """
    model = "Qwen/Qwen3-0.6B"

    free, total = torch.cuda.mem_get_info()
    used_bytes_baseline = total - free  # in case other process is running

    def _used_beyond_baseline():
        # Memory currently in use on the device, minus the baseline
        # measured before the engine was created.
        free_now, total_now = torch.cuda.mem_get_info()
        return total_now - free_now - used_bytes_baseline

    llm = LLM(model, enable_sleep_mode=True)
    prompt = "How are you?"
    sampling_params = SamplingParams(temperature=0, max_tokens=10)
    first_output = llm.generate(prompt, sampling_params)

    # Put the engine to deep sleep
    llm.sleep(level=2)
    used_bytes = _used_beyond_baseline()
    assert used_bytes < 3 * GiB_bytes

    # Wake up only the weights and reload them.
    llm.wake_up(tags=["weights"])
    llm.collective_rpc("reload_weights")
    used_bytes = _used_beyond_baseline()
    assert used_bytes < 4 * GiB_bytes

    # now allocate kv cache and cuda graph memory
    llm.wake_up(tags=["kv_cache"])
    second_output = llm.generate(prompt, sampling_params)

    # cmp output
    assert first_output[0].outputs[0].text == second_output[0].outputs[0].text
tests/benchmarks/test_random_dataset.py
0 → 100644
View file @
d2b52805
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
random
from
typing
import
Any
,
NamedTuple
,
Optional
,
cast
import
numpy
as
np
import
pytest
from
transformers
import
AutoTokenizer
,
PreTrainedTokenizerBase
from
vllm.benchmarks.datasets
import
(
RandomDataset
,
RandomMultiModalDataset
,
SampleRequest
)
@pytest.fixture(scope="session")
def hf_tokenizer() -> PreTrainedTokenizerBase:
    """Session-scoped fixture returning the pretrained "gpt2" tokenizer."""
    # Use a small, commonly available tokenizer
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    return tokenizer
class Params(NamedTuple):
    """Sampling arguments shared by the RandomDataset tests.

    The field names match the keyword arguments that the tests in this
    module pass to ``_collect_samples``.
    """

    num_requests: int
    prefix_len: int
    range_ratio: float
    input_len: int
    output_len: int
@pytest.fixture(scope="session")
def random_dataset_params() -> Params:
    """Session-scoped fixture with the default RandomDataset sampling args."""
    defaults = Params(
        num_requests=16,
        prefix_len=7,
        range_ratio=0.3,
        input_len=50,
        output_len=20,
    )
    return defaults
def _fingerprint_sample(req: SampleRequest) -> tuple[str, int, int]:
    """Reduce a SampleRequest to a tuple that can be compared with ``==``.

    Returns ``(prompt, prompt_len, expected_output_len)`` read from ``req``.
    """
    prompt = req.prompt
    prompt_len = req.prompt_len
    expected_output_len = req.expected_output_len
    return (prompt, prompt_len, expected_output_len)
def _collect_samples(dataset: RandomDataset,
                     tokenizer: PreTrainedTokenizerBase,
                     num_requests: int = 16,
                     prefix_len: int = 7,
                     range_ratio: float = 0.3,
                     input_len: int = 50,
                     output_len: int = 20) -> list[tuple[str, int, int]]:
    """Sample from ``dataset`` and return one fingerprint per request.

    All sampling arguments are forwarded by keyword to ``dataset.sample``;
    each returned request is projected with ``_fingerprint_sample``.
    """
    sample_kwargs = dict(
        tokenizer=tokenizer,
        num_requests=num_requests,
        prefix_len=prefix_len,
        range_ratio=range_ratio,
        input_len=input_len,
        output_len=output_len,
    )
    drawn = dataset.sample(**sample_kwargs)
    return list(map(_fingerprint_sample, drawn))
@pytest.mark.benchmark
def test_random_dataset_same_seed(hf_tokenizer: PreTrainedTokenizerBase,
                                  random_dataset_params: Params) -> None:
    """Two datasets built with the same seed must produce equal samples.

    The global ``random`` and ``np.random`` generators are re-seeded and
    advanced between the two sampling calls, so the check also guards
    against RandomDataset depending on global RNG state (the original
    docstring mentions the move to numpy.default_rng).
    """
    # Params fields are exactly the keyword arguments of _collect_samples.
    sample_kwargs = random_dataset_params._asdict()
    common_seed = 123
    dataset_a = RandomDataset(random_seed=common_seed)
    dataset_b = RandomDataset(random_seed=common_seed)

    a = _collect_samples(dataset_a, hf_tokenizer, **sample_kwargs)

    # Perturb global RNG state to ensure isolation
    random.seed(999)
    for _ in range(100):
        random.random()
    np.random.seed(888)
    for _ in range(100):
        np.random.random()

    b = _collect_samples(dataset_b, hf_tokenizer, **sample_kwargs)
    assert a == b
@pytest.mark.benchmark
def test_random_dataset_different_seeds(hf_tokenizer: PreTrainedTokenizerBase,
                                        random_dataset_params: Params) -> None:
    """Datasets built with different seeds are expected to sample differently.

    Before the second sampling call the global ``random`` and ``np.random``
    generators are seeded with the *first* dataset's seed.
    """
    # Params fields are exactly the keyword arguments of _collect_samples.
    sample_kwargs = random_dataset_params._asdict()

    seed_a = 0
    dataset_a = RandomDataset(random_seed=seed_a)
    a = _collect_samples(dataset_a, hf_tokenizer, **sample_kwargs)

    seed_b = 999
    dataset_b = RandomDataset(random_seed=seed_b)
    # Perturb global RNG with same seed as dataset_a to ensure isolation
    random.seed(seed_a)
    np.random.seed(seed_a)
    b = _collect_samples(dataset_b, hf_tokenizer, **sample_kwargs)

    assert a != b
# -----------------------------
# RandomMultiModalDataset tests
# -----------------------------
def _mm_fingerprint_sample(
    req: SampleRequest,
) -> tuple[str, int, int, int, list[str]]:
    """Build a compact, comparable fingerprint for a multimodal sample.

    The returned tuple holds, in order:
    - prompt string
    - prompt_len
    - expected_output_len
    - number of multimodal items
    - one label per item: ``image:``/``video:`` followed by the first 22
      characters of the item's URL (e.g. 'data:image/jpeg;base64,'), or
      ``unknown:`` for anything else
    """

    def _label(entry) -> str:
        # Non-dict entries and unrecognised types share one label.
        if not isinstance(entry, dict):
            return "unknown:"
        kind = entry.get("type")
        for type_key, tag in (("image_url", "image"), ("video_url", "video")):
            if kind == type_key:
                url = entry.get(type_key, {}).get("url", "")
                # Only keep a short identifying prefix to avoid huge strings
                return f"{tag}:{url[:22]}"
        return "unknown:"

    entries = req.multi_modal_data or []
    labels: list[str] = [_label(entry) for entry in entries]
    return (req.prompt, req.prompt_len, req.expected_output_len, len(entries),
            labels)
def _collect_mm_samples(
    dataset: RandomMultiModalDataset,
    tokenizer: PreTrainedTokenizerBase,
    *,
    num_requests: int = 8,
    prefix_len: int = 3,
    range_ratio: float = 0.0,
    input_len: int = 20,
    output_len: int = 5,
    base_items_per_request: int = 2,
    num_mm_items_range_ratio: float = 0.0,
    limit_mm_per_prompt: Optional[dict[str, int]] = None,
    bucket_config: Optional[dict[tuple[int, int, int], float]] = None,
    enable_multimodal_chat: bool = False,
) -> list[SampleRequest]:
    """Call ``dataset.sample`` with test-friendly defaults.

    When ``limit_mm_per_prompt`` or ``bucket_config`` is ``None`` a default
    mapping is substituted; every argument is then forwarded by keyword and
    the result of ``dataset.sample`` is returned unchanged.
    """
    # Fresh dicts are created per call, so defaults are never shared.
    effective_limits = ({
        "image": 5,
        "video": 0
    } if limit_mm_per_prompt is None else limit_mm_per_prompt)
    effective_buckets = ({
        (32, 32, 1): 0.5,
        (52, 64, 1): 0.5
    } if bucket_config is None else bucket_config)

    sample_kwargs = dict(
        tokenizer=tokenizer,
        num_requests=num_requests,
        prefix_len=prefix_len,
        range_ratio=range_ratio,
        input_len=input_len,
        output_len=output_len,
        base_items_per_request=base_items_per_request,
        num_mm_items_range_ratio=num_mm_items_range_ratio,
        limit_mm_per_prompt=effective_limits,
        bucket_config=effective_buckets,
        enable_multimodal_chat=enable_multimodal_chat,
    )
    return dataset.sample(**sample_kwargs)
@pytest.mark.benchmark
def test_random_mm_same_seed(hf_tokenizer: PreTrainedTokenizerBase) -> None:
    """Two multimodal datasets with the same seed yield equal fingerprints."""
    seed = 42
    ds_a = RandomMultiModalDataset(random_seed=seed)
    ds_b = RandomMultiModalDataset(random_seed=seed)

    samples_a = _collect_mm_samples(ds_a, hf_tokenizer)
    samples_b = _collect_mm_samples(ds_b, hf_tokenizer)

    fa = list(map(_mm_fingerprint_sample, samples_a))
    fb = list(map(_mm_fingerprint_sample, samples_b))
    assert fa == fb
@pytest.mark.benchmark
def test_random_mm_different_seeds(
    hf_tokenizer: PreTrainedTokenizerBase,
) -> None:
    """Multimodal datasets seeded with 0 and 999 must differ in fingerprints."""
    ds_a = RandomMultiModalDataset(random_seed=0)
    ds_b = RandomMultiModalDataset(random_seed=999)

    samples_a = _collect_mm_samples(ds_a, hf_tokenizer)
    samples_b = _collect_mm_samples(ds_b, hf_tokenizer)

    fa = list(map(_mm_fingerprint_sample, samples_a))
    fb = list(map(_mm_fingerprint_sample, samples_b))
    assert fa != fb
@pytest.mark.benchmark
def test_random_mm_respects_limits(
    hf_tokenizer: PreTrainedTokenizerBase,
) -> None:
    """Sampling must raise ValueError when the base item count exceeds limits."""
    ds = RandomMultiModalDataset(random_seed=0)

    # Requesting 3 items with a per-prompt limit of 1 should error per current
    # design (dataset refuses to silently clamp below the requested baseline).
    conflicting_kwargs = dict(
        num_requests=12,
        base_items_per_request=3,
        num_mm_items_range_ratio=0.0,
        limit_mm_per_prompt={
            "image": 1,
            "video": 0
        },
        bucket_config={(32, 32, 1): 1.0},
    )
    with pytest.raises(ValueError):
        _collect_mm_samples(ds, hf_tokenizer, **conflicting_kwargs)
@pytest.mark.benchmark
def test_random_mm_zero_prob_entries_are_removed(
    hf_tokenizer: PreTrainedTokenizerBase,
) -> None:
    """Sample with one zero-probability bucket and check the item types.

    Every sample must carry a list of multimodal items and every item must
    have type "image_url".
    """
    ds = RandomMultiModalDataset(random_seed=0)

    # Second bucket has zero probability and should be ignored after
    # normalization
    buckets = {(32, 32, 1): 1.0, (52, 64, 1): 0.0}
    samples = _collect_mm_samples(
        ds,
        hf_tokenizer,
        num_requests=6,
        base_items_per_request=2,
        num_mm_items_range_ratio=0.0,
        limit_mm_per_prompt={
            "image": 10,
            "video": 0
        },
        bucket_config=buckets,
    )

    for sample in samples:
        assert isinstance(sample.multi_modal_data, list)
        mm_items = cast(list[dict[str, Any]], sample.multi_modal_data)
        for item in mm_items:
            assert item.get("type") == "image_url"
@pytest.mark.benchmark
def test_random_mm_zero_items(hf_tokenizer: PreTrainedTokenizerBase) -> None:
    """With ``base_items_per_request=0`` every sample has no multimodal data."""
    ds = RandomMultiModalDataset(random_seed=0)
    request_kwargs = dict(
        num_requests=5,
        base_items_per_request=0,
        num_mm_items_range_ratio=0.0,
        limit_mm_per_prompt={
            "image": 5,
            "video": 0
        },
        bucket_config={(32, 32, 1): 1.0},
    )
    samples = _collect_mm_samples(ds, hf_tokenizer, **request_kwargs)

    for sample in samples:
        assert sample.multi_modal_data == []
@pytest.mark.benchmark
def test_random_mm_num_items_per_prompt(
        hf_tokenizer: PreTrainedTokenizerBase) -> None:
    """A fixed item count per prompt yields exactly that many image items."""
    ds = RandomMultiModalDataset(random_seed=0)

    # Fixed number of images per prompt
    # set num_mm_items_range_ratio to 0.0
    # TODO: modify video values when video sampling is implemented
    fixed_kwargs = dict(
        num_requests=5,
        base_items_per_request=3,
        num_mm_items_range_ratio=0.0,
        limit_mm_per_prompt={
            "image": 3,
            "video": 0
        },
        bucket_config={(32, 32, 1): 1.0},
    )
    samples_fixed_items = _collect_mm_samples(ds, hf_tokenizer, **fixed_kwargs)

    # Must have 5 requests each with 3 mm items per prompt
    assert len(samples_fixed_items) == 5
    for sample in samples_fixed_items:
        mm_items = cast(list[dict[str, Any]], sample.multi_modal_data)
        assert len(mm_items) == 3
        for item in mm_items:
            assert item.get("type") == "image_url"
@pytest.mark.benchmark
def test_random_mm_bucket_config_not_mutated(
    hf_tokenizer: PreTrainedTokenizerBase,
) -> None:
    """Sampling must leave the caller's ``bucket_config`` dict unchanged.

    The second half of the test additionally samples with a varying number
    of items per prompt and checks the per-sample item counts and types.
    """
    ds = RandomMultiModalDataset(random_seed=0)

    # This bucket config is not normalized to sum to 1
    # and has more buckets than requested images
    original = {(32, 32, 1): 0.2, (52, 64, 1): 6, (25, 64, 1): 3}
    # Keep a snapshot to compare after sampling
    snapshot = dict(original)

    _collect_mm_samples(
        ds,
        hf_tokenizer,
        num_requests=4,
        base_items_per_request=1,
        num_mm_items_range_ratio=0.0,
        limit_mm_per_prompt={
            "image": 1,
            "video": 0
        },
        bucket_config=original,
    )

    # Ensure the original dict content is unchanged
    assert original == snapshot

    # NOTE(review): the checks below concern item counts rather than
    # bucket_config mutation; they may belong in
    # test_random_mm_num_items_per_prompt - confirm before moving.
    # Vary number of mm items per prompt
    # set num_mm_items_range_ratio to 0.5
    varying_kwargs = dict(
        num_requests=5,
        base_items_per_request=2,
        num_mm_items_range_ratio=0.5,
        limit_mm_per_prompt={
            "image": 4,
            "video": 0
        },
        bucket_config={(32, 32, 1): 1.0},
    )
    samples_varying_items = _collect_mm_samples(ds, hf_tokenizer,
                                                **varying_kwargs)

    # Must have 5 requests, each with at least 1 and at most 4 mm items
    # per prompt
    assert len(samples_varying_items) == 5
    for sample in samples_varying_items:
        mm_items = cast(list[dict[str, Any]], sample.multi_modal_data)
        assert len(mm_items) <= 4
        assert len(mm_items) >= 1
        for item in mm_items:
            assert item.get("type") == "image_url"
tests/compile/piecewise/test_multiple_graphs.py
View file @
d2b52805
...
...
@@ -12,10 +12,9 @@ from vllm.compilation.backends import set_model_tag
from
vllm.compilation.counter
import
compilation_counter
from
vllm.compilation.decorators
import
(
ignore_torch_compile
,
support_torch_compile
)
from
vllm.config
import
(
CompilationConfig
,
CompilationLevel
,
VllmConfig
,
set_current_vllm_config
)
from
vllm.envs
import
VLLM_USE_V1
from
vllm.forward_context
import
set_forward_context
from
vllm.config
import
(
CompilationConfig
,
CompilationLevel
,
CUDAGraphMode
,
VllmConfig
,
set_current_vllm_config
)
from
vllm.forward_context
import
BatchDescriptor
,
set_forward_context
from
vllm.utils
import
direct_register_custom_op
# create a library to hold the custom op
...
...
@@ -164,103 +163,33 @@ class SimpleModelWithTwoGraphs(ParentModel):
return
x
def test_ignore_torch_compile_decorator():
    """Check how the compile decorators interact across a class hierarchy.

    ``A`` is decorated with ``support_torch_compile``, ``B(A)`` with
    ``ignore_torch_compile`` and ``C(B)`` with ``support_torch_compile``
    again. Each module is built and run on three batch sizes while
    ``compilation_counter.expect`` verifies the counters: compilation
    activity for ``A`` and ``C``, none for ``B``.
    """
    assert VLLM_USE_V1

    # piecewise
    vllm_config = VllmConfig(compilation_config=CompilationConfig(
        level=CompilationLevel.PIECEWISE,
        use_cudagraph=True,
        splitting_ops=["silly.attention"],
        cudagraph_capture_sizes=[1, 2],
    ))

    @support_torch_compile
    class A(nn.Module):

        def __init__(self,
                     *,
                     vllm_config: VllmConfig,
                     prefix: str = '',
                     **kwargs) -> None:
            super().__init__()

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            doubled = x + x
            attn_output = torch.empty_like(doubled)
            # The custom op writes its result into attn_output.
            torch.ops.silly.attention(doubled, doubled, doubled, attn_output)
            return attn_output * 3

    @ignore_torch_compile
    class B(A):
        ...

    @support_torch_compile
    class C(B):
        ...

    # Counter deltas expected when the module is compiled.
    compiled_counts = dict(
        num_graphs_seen=1,
        num_piecewise_graphs_seen=3,
        num_piecewise_capturable_graphs_seen=2,
        num_backend_compilations=2,
        num_cudagraph_captured=4,
        # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
    )
    # Counter deltas expected when compilation is ignored.
    ignored_counts = dict(
        num_graphs_seen=0,
        num_piecewise_graphs_seen=0,
        num_piecewise_capturable_graphs_seen=0,
        num_backend_compilations=0,
        num_cudagraph_captured=0,
    )

    cases = (
        # A has support_torch_compile
        (A, compiled_counts),
        # B's ignore_torch_compile should override A's support_torch_compile
        (B, ignored_counts),
        # C's support_torch_compile should override B's ignore_torch_compile
        (C, compiled_counts),
    )

    # Keep every module referenced until the end of the test.
    live_modules = []
    for module_cls, expected in cases:
        with set_current_vllm_config(vllm_config):
            module = module_cls(vllm_config=vllm_config,
                                prefix='').eval().cuda()
        live_modules.append(module)

        with compilation_counter.expect(**expected), set_forward_context(
            {}, vllm_config=vllm_config):
            # first run is for compile, the other two run the cudagraph
            # captured sizes
            for batch in (BATCH_SIZE, 2, 1):
                module(torch.randn(batch, MLP_SIZE).cuda())
@
torch
.
inference_mode
def
run_model
(
vllm_config
,
model
:
nn
.
Module
,
inputs
:
torch
.
Tensor
):
def
run_model
(
vllm_config
:
VllmConfig
,
model
:
nn
.
Module
,
inputs
:
torch
.
Tensor
,
cudagraph_runtime_mode
:
CUDAGraphMode
):
with
set_forward_context
({},
vllm_config
=
vllm_config
):
#
First run is for compile
#
warmup for the model with cudagraph_mode NONE
model
(
inputs
)
# Run CUDAGraph captured sizes
# simulate cudagraphs capturing
with
set_forward_context
({},
vllm_config
=
vllm_config
,
cudagraph_runtime_mode
=
cudagraph_runtime_mode
,
batch_descriptor
=
BatchDescriptor
(
num_tokens
=
2
,
)):
model
(
inputs
[:
2
])
with
set_forward_context
({},
vllm_config
=
vllm_config
,
cudagraph_runtime_mode
=
cudagraph_runtime_mode
,
batch_descriptor
=
BatchDescriptor
(
num_tokens
=
1
,
)):
model
(
inputs
[:
1
])
# simulate cudagraphs replay
with
set_forward_context
({},
vllm_config
=
vllm_config
,
cudagraph_runtime_mode
=
cudagraph_runtime_mode
,
batch_descriptor
=
BatchDescriptor
(
num_tokens
=
2
,
)):
output
=
model
(
inputs
[:
2
])
output
=
output
.
cpu
()
...
...
@@ -277,6 +206,7 @@ def test_multi_graph_piecewise_compile_outputs_equal():
splitting_ops
=
[
"silly.attention"
],
cudagraph_capture_sizes
=
[
1
,
2
],
))
cudagraph_runtime_mode
=
CUDAGraphMode
.
PIECEWISE
with
set_current_vllm_config
(
vllm_config
):
model
=
SimpleModelWithTwoGraphs
(
mlp_size
=
MLP_SIZE
,
...
...
@@ -299,11 +229,13 @@ def test_multi_graph_piecewise_compile_outputs_equal():
num_cudagraph_captured
=
8
,
# num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
):
outputs
.
append
(
run_model
(
vllm_config
,
model
,
inputs
))
outputs
.
append
(
run_model
(
vllm_config
,
model
,
inputs
,
cudagraph_runtime_mode
))
# no compile or cudagraph
vllm_config
=
VllmConfig
(
compilation_config
=
CompilationConfig
(
level
=
CompilationLevel
.
NO_COMPILATION
,
))
cudagraph_runtime_mode
=
CUDAGraphMode
.
NONE
with
set_current_vllm_config
(
vllm_config
):
model
=
SimpleModelWithTwoGraphs
(
mlp_size
=
MLP_SIZE
,
...
...
@@ -318,7 +250,8 @@ def test_multi_graph_piecewise_compile_outputs_equal():
num_backend_compilations
=
0
,
num_cudagraph_captured
=
0
,
):
outputs
.
append
(
run_model
(
vllm_config
,
model
,
inputs
))
outputs
.
append
(
run_model
(
vllm_config
,
model
,
inputs
,
cudagraph_runtime_mode
))
# piecewise compile without CUDA graph
vllm_config
=
VllmConfig
(
compilation_config
=
CompilationConfig
(
...
...
@@ -326,6 +259,7 @@ def test_multi_graph_piecewise_compile_outputs_equal():
use_cudagraph
=
False
,
splitting_ops
=
[
"silly.attention"
],
))
cudagraph_runtime_mode
=
CUDAGraphMode
.
PIECEWISE
with
set_current_vllm_config
(
vllm_config
):
model
=
SimpleModelWithTwoGraphs
(
mlp_size
=
MLP_SIZE
,
...
...
@@ -340,7 +274,8 @@ def test_multi_graph_piecewise_compile_outputs_equal():
num_backend_compilations
=
4
,
num_cudagraph_captured
=
0
,
# no cudagraph captured
):
outputs
.
append
(
run_model
(
vllm_config
,
model
,
inputs
))
outputs
.
append
(
run_model
(
vllm_config
,
model
,
inputs
,
cudagraph_runtime_mode
))
# Generally don't expect outputs with and without inductor
# to be bitwise equivalent
...
...
tests/compile/test_basic_correctness.py
View file @
d2b52805
...
...
@@ -34,7 +34,7 @@ class TestSetting:
model_args
=
[
"--max-model-len"
,
"2048"
],
pp_size
=
2
,
tp_size
=
2
,
attn_backend
=
"FLASH
INFER
"
,
attn_backend
=
"FLASH
_ATTN
"
,
method
=
"generate"
,
fullgraph
=
True
,
),
...
...
tests/compile/test_decorator.py
0 → 100644
View file @
d2b52805
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
torch
from
torch
import
nn
from
torch.library
import
Library
from
vllm.compilation.counter
import
compilation_counter
from
vllm.compilation.decorators
import
(
ignore_torch_compile
,
support_torch_compile
)
from
vllm.config
import
(
CacheConfig
,
CompilationConfig
,
CompilationLevel
,
CUDAGraphMode
,
VllmConfig
,
set_current_vllm_config
)
from
vllm.forward_context
import
BatchDescriptor
,
set_forward_context
from
vllm.utils
import
direct_register_custom_op
# Create a library to hold the custom op. It is opened as a "FRAGMENT" of the
# "silly" namespace; the op registered into it below is what the test models
# call as torch.ops.silly.attention.
silly_lib = Library("silly", "FRAGMENT")  # noqa

# First dimension of the random warmup input that run_model feeds the model.
BATCH_SIZE = 32
# Second dimension of every random input tensor that run_model creates.
MLP_SIZE = 128
def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
                    out: torch.Tensor) -> None:
    """Store the element-wise sum ``q + k + v`` in ``out``, in place.

    Args:
        q: First addend; its values are copied into ``out`` first.
        k: Second addend, accumulated into ``out``.
        v: Third addend, accumulated into ``out``.
        out: Destination tensor; overwritten with the result.

    Returns:
        None. The result is only observable through ``out``.
    """
    # Same three in-place steps as copy + two `+=`, expressed as one chain.
    out.copy_(q).add_(k).add_(v)
def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
                         out: torch.Tensor) -> None:
    """Fake counterpart of ``silly_attention``: touch nothing, return None.

    It accepts the same four tensors as the real op but performs no
    computation and leaves ``out`` unmodified.
    """
    return None
# Register silly_attention under the op name "attention" in silly_lib,
# declaring "out" as the argument it mutates and silly_attention_fake as its
# fake implementation. The test models invoke it as
# torch.ops.silly.attention, and the configs split on "silly.attention".
direct_register_custom_op(
    op_name="attention",
    op_func=silly_attention,
    mutates_args=["out"],
    fake_impl=silly_attention_fake,
    target_lib=silly_lib,
)
@torch.inference_mode
def run_model(vllm_config: VllmConfig, model: nn.Module,
              cudagraph_runtime_mode: CUDAGraphMode):
    """Drive ``model`` through warmup, simulated capture and simulated replay.

    Args:
        vllm_config: Config handed to every forward context.
        model: Module under test; called with random CUDA inputs.
        cudagraph_runtime_mode: Runtime mode passed to the capture/replay
            forward contexts.

    Returns:
        The output of the final (replay) call, moved to the CPU.
    """

    def _sized_context(num_tokens: int):
        # Forward context carrying the runtime mode and a batch descriptor
        # for a batch of `num_tokens` tokens.
        return set_forward_context(
            {},
            vllm_config=vllm_config,
            cudagraph_runtime_mode=cudagraph_runtime_mode,
            batch_descriptor=BatchDescriptor(num_tokens=num_tokens, ))

    with set_forward_context({}, vllm_config=vllm_config):
        # warmup for the model with cudagraph_mode NONE
        model(torch.randn(BATCH_SIZE, MLP_SIZE).cuda())

        # simulate cudagraphs capturing (size 2 first, then size 1)
        for capture_size in (2, 1):
            with _sized_context(capture_size):
                model(torch.randn(capture_size, MLP_SIZE).cuda())

        # simulate cudagraphs replay
        with _sized_context(2):
            replayed = model(torch.randn(2, MLP_SIZE).cuda())

        replayed = replayed.cpu()
    # Second .cpu() call retained from the original implementation.
    return replayed.cpu()
def test_ignore_torch_compile_decorator():
    """Check how support/ignore_torch_compile interact across inheritance.

    Three locally defined classes are exercised with the same piecewise
    config: A (support_torch_compile), B(A) (ignore_torch_compile) and
    C(B) (support_torch_compile again). For each one the compilation
    counters observed while running run_model are compared against the
    expected values.
    """
    # piecewise
    vllm_config = VllmConfig(compilation_config=CompilationConfig(
        level=CompilationLevel.PIECEWISE,
        use_cudagraph=True,
        splitting_ops=["silly.attention"],
        cudagraph_capture_sizes=[1, 2],
    ))
    cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE

    @support_torch_compile
    class A(nn.Module):

        def __init__(self,
                     *,
                     vllm_config: VllmConfig,
                     prefix: str = '',
                     **kwargs) -> None:
            super().__init__()

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            # One call to the splitting op "silly.attention" sits between
            # two plain arithmetic steps.
            x = x + x
            attn_output = torch.empty_like(x)
            torch.ops.silly.attention(x, x, x, attn_output)
            x = attn_output
            x = x * 3
            return x

    @ignore_torch_compile
    class B(A):
        ...

    @support_torch_compile
    class C(B):
        ...

    with set_current_vllm_config(vllm_config):
        mod_A = A(vllm_config=vllm_config, prefix='').eval().cuda()

    # A has support_torch_compile
    with compilation_counter.expect(
            num_graphs_seen=1,
            num_piecewise_graphs_seen=3,
            num_piecewise_capturable_graphs_seen=2,
            num_backend_compilations=2,
            num_cudagraph_captured=4,
            # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
    ):
        run_model(vllm_config, mod_A, cudagraph_runtime_mode)

    with set_current_vllm_config(vllm_config):
        mod_B = B(vllm_config=vllm_config, prefix='').eval().cuda()

    # B's ignore_torch_compile should override A's support_torch_compile
    with compilation_counter.expect(
            num_graphs_seen=0,
            num_piecewise_graphs_seen=0,
            num_piecewise_capturable_graphs_seen=0,
            num_backend_compilations=0,
            num_cudagraph_captured=0,
    ):
        run_model(vllm_config, mod_B, cudagraph_runtime_mode)

    with set_current_vllm_config(vllm_config):
        mod_C = C(vllm_config=vllm_config, prefix='').eval().cuda()

    # C's support_torch_compile should override B's ignore_torch_compile
    with compilation_counter.expect(
            num_graphs_seen=1,
            num_piecewise_graphs_seen=3,
            num_piecewise_capturable_graphs_seen=2,
            num_backend_compilations=2,
            num_cudagraph_captured=4,
            # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
    ):
        run_model(vllm_config, mod_C, cudagraph_runtime_mode)
# Only enable torch.compile if
# vllm_config.cache_config.kv_sharing_fast_prefill=True
@support_torch_compile(enable_if=lambda vllm_config: vllm_config.cache_config.
                       kv_sharing_fast_prefill)
class B(nn.Module):
    """Leaf test module: double, silly attention, double again."""

    def __init__(self,
                 *,
                 vllm_config: VllmConfig,
                 prefix: str = '',
                 **kwargs) -> None:
        super().__init__()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``2 * attention(2x, 2x, 2x)`` using the silly custom op."""
        doubled = x + x
        attn_out = torch.empty_like(doubled)
        # The custom op writes its result into attn_out in place.
        torch.ops.silly.attention(doubled, doubled, doubled, attn_out)
        return attn_out + attn_out
# Only enable torch.compile if
# vllm_config.cache_config.kv_sharing_fast_prefill=False
@support_torch_compile(enable_if=lambda vllm_config: not vllm_config.
                       cache_config.kv_sharing_fast_prefill)
class A(nn.Module):
    """Parent test module: two ``B`` children around one silly attention."""

    def __init__(self,
                 *,
                 vllm_config: VllmConfig,
                 prefix: str = '',
                 **kwargs) -> None:
        super().__init__()
        # Both children receive exactly the arguments this module was given.
        child_args = dict(vllm_config=vllm_config, prefix=prefix, **kwargs)
        self.mod1 = B(**child_args)
        self.mod2 = B(**child_args)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply ``mod1``, the silly attention op, then ``mod2``."""
        hidden = self.mod1(x)
        attn_out = torch.empty_like(hidden)
        # The custom op writes its result into attn_out in place.
        torch.ops.silly.attention(hidden, hidden, hidden, attn_out)
        return self.mod2(attn_out)
def test_conditional_compile_enable_if():
    """Check that ``enable_if`` decides which of A / B gets compiled.

    The module-level A and B carry opposite ``enable_if`` predicates on
    ``cache_config.kv_sharing_fast_prefill``. The same model (A holding two
    B children) is built and run twice, once with the flag True and once
    with it False, and the compilation counters are compared against the
    expected values for each case.
    """
    vllm_config = VllmConfig(cache_config=CacheConfig(
        kv_sharing_fast_prefill=True, ),
                             compilation_config=CompilationConfig(
                                 level=CompilationLevel.PIECEWISE,
                                 use_cudagraph=True,
                                 splitting_ops=["silly.attention"],
                                 cudagraph_capture_sizes=[1, 2],
                             ))
    cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE

    with set_current_vllm_config(vllm_config):
        mod_A = A(vllm_config=vllm_config, prefix='').eval().cuda()

    # A has support_torch_compile but enable_if fn returns False
    # enable_if will be True for B, so we expect mod1 and mod2
    # to be compiled
    with compilation_counter.expect(
            num_graphs_seen=2,
            num_piecewise_graphs_seen=6,
            # 3 piecewise graphs per instance of B()
            num_piecewise_capturable_graphs_seen=4,
            num_backend_compilations=4,
            num_cudagraph_captured=8,
            # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
    ):
        run_model(vllm_config, mod_A, cudagraph_runtime_mode)

    # Set kv_sharing_fast_prefill=False
    # which will cause A to be compiled and B to not be compiled
    vllm_config = VllmConfig(cache_config=CacheConfig(
        kv_sharing_fast_prefill=False, ),
                             compilation_config=CompilationConfig(
                                 level=CompilationLevel.PIECEWISE,
                                 use_cudagraph=True,
                                 splitting_ops=["silly.attention"],
                                 cudagraph_capture_sizes=[1, 2],
                             ))

    # Rebuild the model under the new config; the runtime mode is reused.
    with set_current_vllm_config(vllm_config):
        mod_A = A(vllm_config=vllm_config, prefix='').eval().cuda()

    with compilation_counter.expect(
            num_graphs_seen=1,
            num_piecewise_graphs_seen=7,
            # 3 attn ops and 4 non-attn ops
            num_piecewise_capturable_graphs_seen=4,
            num_backend_compilations=4,
            num_cudagraph_captured=8,
            # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
    ):
        run_model(vllm_config, mod_A, cudagraph_runtime_mode)
tests/compile/test_full_graph.py
View file @
d2b52805
...
...
@@ -53,12 +53,6 @@ def models_list(*, all: bool = True, keywords: Optional[list[str]] = None):
"quantization"
:
"gptq_marlin_24"
}))
if
is_quant_method_supported
(
"marlin"
):
TEST_MODELS
.
append
(
(
"robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin"
,
{
"quantization"
:
"marlin"
}))
if
not
current_platform
.
is_rocm
()
and
is_quant_method_supported
(
"awq"
):
TEST_MODELS
.
append
((
"TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ"
,
{
"quantization"
:
"AWQ"
...
...
tests/compile/test_functionalization.py
View file @
d2b52805
...
...
@@ -8,11 +8,12 @@ import vllm.envs as envs
from
vllm
import
LLM
,
SamplingParams
from
vllm.compilation.activation_quant_fusion
import
ActivationQuantFusionPass
from
vllm.compilation.fix_functionalization
import
FixFunctionalizationPass
from
vllm.compilation.fusion
import
(
FUSED_OPS
,
FusionPass
,
QuantKey
,
kFp8DynamicTokenSym
,
kFp8StaticTensorSym
)
from
vllm.compilation.fusion
import
FUSED_OPS
,
FusionPass
from
vllm.compilation.fx_utils
import
find_auto_fn
,
find_auto_fn_maybe
,
is_func
from
vllm.compilation.noop_elimination
import
NoOpEliminationPass
from
vllm.config
import
CompilationConfig
,
PassConfig
,
VllmConfig
from
vllm.model_executor.layers.quantization.utils.quant_utils
import
(
QuantKey
,
kFp8DynamicTokenSym
,
kFp8StaticTensorSym
)
from
.backend
import
TestBackend
...
...
tests/compile/test_fusion.py
View file @
d2b52805
...
...
@@ -7,13 +7,15 @@ import torch
import
vllm.envs
as
envs
import
vllm.plugins
from
vllm.compilation.fusion
import
(
FUSED_OPS
,
QUANT_OPS
,
FusedRMSQuantKey
,
FusionPass
,
GroupShape
,
QuantKey
)
FusionPass
)
from
vllm.compilation.noop_elimination
import
NoOpEliminationPass
from
vllm.config
import
(
CompilationConfig
,
CompilationLevel
,
PassConfig
,
VllmConfig
)
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.quantization.utils.quant_utils
import
(
GroupShape
,
QuantKey
,
ScaleDesc
)
from
vllm.model_executor.layers.quantization.utils.w8a8_utils
import
(
CUTLASS_FP8_SUPPORTED
,
Fp8LinearOp
,
maybe_create_device_identity
)
Fp8LinearOp
,
maybe_create_device_identity
)
from
vllm.platforms
import
current_platform
from
.backend
import
TestBackend
...
...
@@ -24,16 +26,14 @@ FP8_DTYPE = current_platform.fp8_dtype()
class
TestModel
(
torch
.
nn
.
Module
):
def
__init__
(
self
,
hidden_size
:
int
,
eps
:
float
,
static
:
bool
,
cutlass_fp8_enabled
:
bool
,
*
args
,
**
kwargs
):
force_fp8_e4m3fnuz
:
bool
,
*
args
,
**
kwargs
):
super
().
__init__
(
*
args
,
**
kwargs
)
self
.
cutlass_fp8_enabled
=
cutlass_fp8_enabled
self
.
force_fp8_e4m3fnuz
=
force_fp8_e4m3fnuz
self
.
norm
=
[
RMSNorm
(
hidden_size
,
eps
)
for
_
in
range
(
3
)]
self
.
wscale
=
[
torch
.
rand
(
1
,
dtype
=
torch
.
float32
)
for
_
in
range
(
2
)]
group_shape
=
GroupShape
.
PER_TENSOR
if
static
else
GroupShape
.
PER_TOKEN
self
.
key
=
QuantKey
(
dtype
=
FP8_DTYPE
,
static
=
static
,
group_shape
=
group_shape
,
symmetric
=
True
)
quant_scale
=
ScaleDesc
(
torch
.
float32
,
static
,
group_shape
)
self
.
key
=
QuantKey
(
dtype
=
FP8_DTYPE
,
scale
=
quant_scale
,
symmetric
=
True
)
if
static
:
self
.
scale
=
[
torch
.
rand
(
1
,
dtype
=
torch
.
float32
)
for
_
in
range
(
2
)]
else
:
...
...
@@ -43,7 +43,7 @@ class TestModel(torch.nn.Module):
for
_
in
range
(
2
)
]
self
.
fp8_linear
=
Fp8LinearOp
(
cutlass_fp8_supported
=
cutlass_fp8_enabled
,
force_fp8_e4m3fnuz
=
force_fp8_e4m3fnuz
,
act_quant_static
=
static
,
act_quant_group_shape
=
group_shape
,
)
...
...
@@ -81,12 +81,11 @@ class TestModel(torch.nn.Module):
@
pytest
.
mark
.
parametrize
(
"num_tokens"
,
[
7
,
256
,
533
,
2048
,
2049
])
@
pytest
.
mark
.
parametrize
(
"eps"
,
[
1e-5
,
1e-6
])
@
pytest
.
mark
.
parametrize
(
"static"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"cutlass_fp8_enabled"
,
[
True
,
False
]
if
CUTLASS_FP8_SUPPORTED
else
[
False
])
@
pytest
.
mark
.
parametrize
(
"force_fp8_e4m3fnuz"
,
[
True
,
False
])
@
pytest
.
mark
.
skipif
(
envs
.
VLLM_TARGET_DEVICE
not
in
[
"cuda"
,
"rocm"
],
reason
=
"Only test on CUDA and ROCm"
)
def
test_fusion_rmsnorm_quant
(
dtype
,
hidden_size
,
num_tokens
,
eps
,
static
,
cutlass_fp8_enabled
):
force_fp8_e4m3fnuz
):
torch
.
set_default_device
(
"cuda"
)
torch
.
set_default_dtype
(
dtype
)
torch
.
manual_seed
(
1
)
...
...
@@ -103,7 +102,7 @@ def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static,
fusion_pass
=
FusionPass
.
instance
(
vllm_config
)
backend
=
TestBackend
(
noop_pass
,
fusion_pass
)
model
=
TestModel
(
hidden_size
,
eps
,
static
,
cutlass_fp8_enabled
)
model
=
TestModel
(
hidden_size
,
eps
,
static
,
force_fp8_e4m3fnuz
)
# First dimension dynamic
x
=
torch
.
rand
(
num_tokens
,
hidden_size
)
...
...
tests/compile/test_fusion_all_reduce.py
View file @
d2b52805
...
...
@@ -148,7 +148,7 @@ class TestAllReduceFusedAddRMSNormStaticQuantFP4Model(torch.nn.Module):
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
8
])
@
pytest
.
mark
.
parametrize
(
"seq_len"
,
[
8
])
@
pytest
.
mark
.
parametrize
(
"hidden_size"
,
[
16
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
torch
.
float16
,
torch
.
bfloat16
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
torch
.
bfloat16
])
@
pytest
.
mark
.
skipif
(
envs
.
VLLM_TARGET_DEVICE
not
in
[
"cuda"
],
reason
=
"Only test on CUDA"
)
@
pytest
.
mark
.
skipif
(
...
...
tests/compile/test_fusion_attn.py
View file @
d2b52805
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
copy
from
typing
import
Optional
import
pytest
...
...
@@ -7,13 +8,29 @@ import torch._dynamo
from
tests.compile.backend
import
TestBackend
from
tests.models.utils
import
check_outputs_equal
from
tests.v1.attention.utils
import
(
BatchSpec
,
_Backend
,
create_common_attn_metadata
)
from
vllm
import
LLM
,
SamplingParams
from
vllm.compilation.fusion
import
QUANT_OPS
,
QuantKey
,
kFp8StaticTensorSym
from
vllm._custom_ops
import
cutlass_scaled_fp4_mm
,
scaled_fp4_quant
from
vllm.attention
import
Attention
from
vllm.attention.selector
import
global_force_attn_backend_context_manager
from
vllm.compilation.fusion
import
QUANT_OPS
from
vllm.compilation.fusion_attn
import
ATTN_OP
,
AttnFusionPass
from
vllm.compilation.fx_utils
import
find_op_nodes
from
vllm.compilation.noop_elimination
import
NoOpEliminationPass
from
vllm.config
import
CompilationConfig
,
CompilationLevel
,
VllmConfig
from
vllm.config
import
(
CacheConfig
,
CompilationConfig
,
CompilationLevel
,
ModelConfig
,
PassConfig
,
SchedulerConfig
,
VllmConfig
,
set_current_vllm_config
)
from
vllm.forward_context
import
get_forward_context
,
set_forward_context
from
vllm.model_executor.layers.quantization.utils.quant_utils
import
(
QuantKey
,
kFp8StaticTensorSym
,
kNvfp4Quant
)
from
vllm.model_executor.layers.quantization.utils.w8a8_utils
import
(
Fp8LinearOp
)
from
vllm.platforms
import
current_platform
from
vllm.v1.kv_cache_interface
import
AttentionSpec
FP8_DTYPE
=
current_platform
.
fp8_dtype
()
FP4_DTYPE
=
torch
.
uint8
# globals needed for string-import custom Dynamo backend field
backend
:
Optional
[
TestBackend
]
=
None
...
...
@@ -90,9 +107,7 @@ def test_attention_fusion(example_prompts, monkeypatch, model: str,
# check support
attn_fusion_supported
=
[
layer
.
impl
.
fused_output_quant_supported
(
quant_key
.
dtype
,
quant_key
.
static
,
quant_key
.
group_shape
)
layer
.
impl
.
fused_output_quant_supported
(
quant_key
)
for
key
,
layer
in
compile_config
.
static_forward_context
.
items
()
]
...
...
@@ -132,3 +147,309 @@ def test_attention_fusion(example_prompts, monkeypatch, model: str,
# Reset backend to make sure llm2 gets released
backend
=
None
class AttentionQuantPatternModel(torch.nn.Module):
    """Base model for AttentionQuantPattern fusion.

    Holds one ``Attention`` layer plus the metadata builder obtained from
    that layer's backend. Subclasses supply the quantized op that follows
    the attention call.
    """

    def __init__(self, num_qo_heads: int, num_kv_heads: int, head_size: int,
                 kv_cache_dtype: torch.dtype, device: torch.device,
                 vllm_config: VllmConfig, **kwargs):
        super().__init__()
        self.num_qo_heads = num_qo_heads
        self.num_kv_heads = num_kv_heads
        self.head_size = head_size
        self.kv_cache_dtype = kv_cache_dtype
        self.device = device
        self.vllm_config = vllm_config

        self.attn = Attention(
            num_heads=num_qo_heads,
            head_size=head_size,
            scale=1.0 / (head_size**0.5),
            num_kv_heads=num_kv_heads,
            cache_config=vllm_config.cache_config,
            prefix="model.layers.0.self_attn.attn",
        )

        self.block_size = 16

        # Initialize attn MetadataBuilder
        builder_cls = self.attn.attn_backend.get_builder_cls()
        cache_spec = AttentionSpec(
            block_size=self.block_size,
            num_kv_heads=num_kv_heads,
            head_size=head_size,
            dtype=kv_cache_dtype,
            use_mla=False,
        )
        self.builder = builder_cls(
            kv_cache_spec=cache_spec,
            layer_names=[self.attn.layer_name],
            vllm_config=vllm_config,
            device=device,
        )

    def build_attn_metadata(self, batch_size: int):
        """Initialize attention metadata.

        Creates a zero-filled KV cache for ``batch_size`` sequences of
        length 1, attaches it to the attention layer, and builds the
        metadata, which is both stored on ``self.attn_metadata`` and
        returned.
        """

        # Create common attn metadata
        batch_spec = BatchSpec(seq_lens=[1] * batch_size,
                               query_lens=[1] * batch_size)
        common_attn_metadata = create_common_attn_metadata(
            batch_spec,
            self.block_size,
            self.device,
            arange_block_indices=True)

        # Ceiling division: blocks needed for the longest sequence.
        longest_seq = max(batch_spec.seq_lens)
        max_blocks = (longest_seq + self.block_size - 1) // self.block_size
        num_blocks = batch_size * max_blocks

        # Create dummy KV cache for FlashInfer TRTLLM
        #   - NHD: [num_blocks, 2, block_size, num_kv_heads, head_size]
        #   - HND: [num_blocks, 2, num_kv_heads, block_size, head_size]
        # Create kv_cache in HND layout and permute to NHD layout
        # (later will be permuted back to HND layout in forward pass)
        hnd_shape = (num_blocks, 2, self.num_kv_heads, self.block_size,
                     self.head_size)
        kv_cache = torch.zeros(*hnd_shape,
                               dtype=self.kv_cache_dtype,
                               device=self.device).permute(0, 1, 3, 2, 4)
        self.attn.kv_cache = [kv_cache]

        # Build attn metadata
        metadata = self.builder.build(
            common_prefix_len=0, common_attn_metadata=common_attn_metadata)
        self.attn_metadata = metadata
        return metadata
class TestAttentionFp8StaticQuantPatternModel(AttentionQuantPatternModel):
    """Test model for AttentionFp8StaticQuantPattern fusion.

    Runs the attention layer and feeds its output to an FP8 linear op, so
    that the attention + quant pattern appears in the traced graph.

    Keyword Args:
        w: Optional dict with keys ``"weight"``, ``"wscale"`` and
            ``"scale"``. When given it is used as-is, which lets two model
            instances share identical weights. When absent, default
            weights are created.
    """

    quant_key = kFp8StaticTensorSym

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.fp8_linear = Fp8LinearOp(
            act_quant_static=self.quant_key.scale.static,
            act_quant_group_shape=self.quant_key.scale.group_shape)

        if "w" in kwargs:
            # Caller supplied the weights: do not build (and throw away)
            # a hidden_size x hidden_size random default.
            self.w = kwargs["w"]
        else:
            hidden_size = self.num_qo_heads * self.head_size
            self.w = {
                "weight":
                torch.randn(hidden_size, hidden_size).to(
                    dtype=FP8_DTYPE, device=self.device).t(),
                "wscale":
                torch.tensor([1.0], dtype=torch.float32, device=self.device),
                "scale":
                torch.tensor([1.0], dtype=torch.float32, device=self.device),
            }

    def forward(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor):
        """Forward pass that creates the pattern to be fused."""
        attn_output = self.attn(q, k, v)
        return self.fp8_linear.apply(input=attn_output,
                                     weight=self.w["weight"],
                                     weight_scale=self.w["wscale"],
                                     input_scale=self.w["scale"])
class TestAttentionNvfp4QuantPatternModel(AttentionQuantPatternModel):
    """Test model for AttentionNvfp4QuantPattern fusion.

    Runs the attention layer, quantizes its output with
    ``scaled_fp4_quant`` and multiplies it with ``cutlass_scaled_fp4_mm``,
    so that the attention + quant pattern appears in the traced graph.

    Keyword Args:
        w: Optional dict with keys ``"weight"``, ``"wscale_swizzled"``,
            ``"wscale"`` and ``"scale"``. When given it is used as-is,
            which lets two model instances share identical weights. When
            absent, default weights are created.
    """

    quant_key = kNvfp4Quant

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        if "w" in kwargs:
            # Caller supplied the weights: skip building random defaults
            # that would be discarded immediately.
            self.w = kwargs["w"]
        else:
            hidden_size = self.num_qo_heads * self.head_size
            self.w = {
                "weight":
                torch.randint(256, (hidden_size, hidden_size // 2),
                              dtype=FP4_DTYPE,
                              device=self.device),
                "wscale_swizzled":
                torch.randn(hidden_size, hidden_size // 16).to(
                    dtype=FP8_DTYPE, device=self.device),
                "wscale":
                torch.tensor([500], dtype=torch.float32, device=self.device),
                "scale":
                torch.tensor([0.002],
                             dtype=torch.float32,
                             device=self.device),
            }

    def forward(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor):
        """Forward pass that creates the pattern to be fused."""
        attn_output = self.attn(q, k, v)
        # The quant op receives the reciprocal of the stored "scale".
        quant_output, output_block_scale = scaled_fp4_quant(
            attn_output, 1 / self.w["scale"])
        return cutlass_scaled_fp4_mm(a=quant_output,
                                     b=self.w["weight"],
                                     block_scale_a=output_block_scale,
                                     block_scale_b=self.w["wscale_swizzled"],
                                     alpha=self.w["scale"] * self.w["wscale"],
                                     out_dtype=attn_output.dtype)
@pytest.mark.parametrize("num_qo_heads, num_kv_heads", [(64, 8), (40, 8)])
@pytest.mark.parametrize("head_size", [128])
@pytest.mark.parametrize("batch_size", [7, 256, 533])
@pytest.mark.parametrize("dtype", [torch.bfloat16])
@pytest.mark.parametrize("model_name, model_class",
                         [("nvidia/Llama-4-Scout-17B-16E-Instruct-FP8",
                           TestAttentionFp8StaticQuantPatternModel),
                          ("nvidia/Llama-4-Scout-17B-16E-Instruct-FP4",
                           TestAttentionNvfp4QuantPatternModel)])
@pytest.mark.parametrize("backend", [_Backend.FLASHINFER])
@pytest.mark.skipif(not current_platform.is_cuda(), reason="Only test CUDA")
@pytest.mark.skipif(not current_platform.supports_fp8(), reason="Need FP8")
@pytest.mark.skipif(not current_platform.is_device_capability((10, 0)),
                    reason="Only test on SM100(Blackwell)")
def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int,
                                 head_size: int, batch_size: int,
                                 dtype: torch.dtype, model_name: str,
                                 model_class: type[AttentionQuantPatternModel],
                                 backend: _Backend, monkeypatch, dist_init):
    """Test AttentionStaticQuantPattern fusion pass

    Builds the same model twice with shared weights: once run eagerly
    (reference) and once compiled through a TestBackend with the no-op
    elimination and attention fusion passes. It then checks the graphs
    before/after the passes and compares the outputs.
    """

    monkeypatch.setenv("VLLM_USE_V1", "1")

    device = torch.device("cuda:0")
    torch.manual_seed(42)

    vllm_config = VllmConfig(
        model_config=ModelConfig(
            model=model_name,
            max_model_len=2048,
        ),
        scheduler_config=SchedulerConfig(max_num_seqs=1024),
        compilation_config=CompilationConfig(
            level=CompilationLevel.PIECEWISE,
            custom_ops=["+quant_fp8"],
        ),
        cache_config=CacheConfig(cache_dtype="fp8"))

    # Create test inputs
    q = torch.randn(batch_size,
                    num_qo_heads * head_size,
                    dtype=dtype,
                    device=device)
    k = torch.randn(batch_size,
                    num_kv_heads * head_size,
                    dtype=dtype,
                    device=device)
    v = torch.randn(batch_size,
                    num_kv_heads * head_size,
                    dtype=dtype,
                    device=device)

    # Mark first dimension as dynamic for realistic testing
    torch._dynamo.mark_dynamic(q, 0)
    torch._dynamo.mark_dynamic(k, 0)
    torch._dynamo.mark_dynamic(v, 0)

    # Run model directly without compilation and fusion
    # (on a deep copy, so the pass_config change below does not affect it)
    vllm_config_unfused = copy.deepcopy(vllm_config)
    with set_current_vllm_config(vllm_config_unfused), set_forward_context(
            attn_metadata=None, vllm_config=vllm_config_unfused
    ), global_force_attn_backend_context_manager(backend):
        model_unfused = model_class(num_qo_heads=num_qo_heads,
                                    num_kv_heads=num_kv_heads,
                                    head_size=head_size,
                                    kv_cache_dtype=FP8_DTYPE,
                                    device=device,
                                    vllm_config=vllm_config_unfused)
        model_unfused = model_unfused.to(device)

        # The context was opened with attn_metadata=None; fill it in now
        # that the model (and its metadata builder) exists.
        forward_ctx = get_forward_context()
        forward_ctx.attn_metadata = model_unfused.build_attn_metadata(
            batch_size)

        # Run model directly without compilation and fusion
        result_unfused = model_unfused(q, k, v)

    # Run model with attn fusion enabled
    vllm_config.compilation_config.pass_config = PassConfig(
        enable_attn_fusion=True, enable_noop=True)
    with set_current_vllm_config(vllm_config), set_forward_context(
            attn_metadata=None, vllm_config=vllm_config
    ), global_force_attn_backend_context_manager(backend):
        # Reuse the reference model's weights so outputs are comparable.
        model_fused = model_class(num_qo_heads=num_qo_heads,
                                  num_kv_heads=num_kv_heads,
                                  head_size=head_size,
                                  kv_cache_dtype=FP8_DTYPE,
                                  device=device,
                                  vllm_config=vllm_config,
                                  w=model_unfused.w)
        model_fused = model_fused.to(device)

        forward_ctx = get_forward_context()
        forward_ctx.attn_metadata = model_fused.build_attn_metadata(
            batch_size)

        # Create test backend with fusion passes enabled
        noop_pass = NoOpEliminationPass(vllm_config)
        # The AttnFusionPass is constructed only when the pass is invoked.
        attn_pass = lambda *args, **kw: AttnFusionPass(vllm_config)(*args, **kw
                                                                    )
        test_backend = TestBackend(noop_pass, attn_pass)

        # Compile model with fusion enabled
        model_compiled = torch.compile(model_fused,
                                       backend=test_backend,
                                       fullgraph=True)
        assert model_compiled.attn._o_scale_float is None
        result_fused_1 = model_compiled(q, k, v)

        # After the 1st round of the forward pass, output quant scale should be
        # loaded into the attn layer's _o_scale_float, the 2nd round should
        # reuse the loaded _o_scale_float
        assert model_compiled.attn._o_scale_float is not None
        result_fused_2 = model_compiled(q, k, v)
        assert model_compiled.attn._o_scale_float is not None

    # Check attn fusion support
    quant_key = model_class.quant_key
    attn_fusion_supported = [
        layer.impl.fused_output_quant_supported(quant_key) for key, layer in
        vllm_config.compilation_config.static_forward_context.items()
    ]
    if any(attn_fusion_supported):
        # Check quantization ops in the graph before and after fusion
        test_backend.check_before_ops([QUANT_OPS[quant_key]],
                                      fully_replaced=True)

    # Check attention ops in the graph before and after fusion
    attn_nodes_pre = list(find_op_nodes(ATTN_OP, test_backend.graph_pre_pass))
    attn_nodes_post = list(find_op_nodes(ATTN_OP,
                                         test_backend.graph_post_pass))

    assert len(attn_nodes_pre) > 0, "Should have attention nodes before fusion"
    assert len(attn_nodes_pre) == len(attn_nodes_post), \
        "Should have same number of attention nodes before and after fusion"
    assert attn_nodes_pre[0].kwargs.get("output_scale") is None, \
        "Attention should not have output_scale before fusion"
    assert attn_nodes_post[0].kwargs.get("output_scale") is not None, \
        "Attention should have output_scale after fusion"

    assert attn_nodes_pre[0].kwargs.get("output_block_scale") is None, \
        "Attention should not have output_block_scale before fusion"
    if quant_key.dtype == FP8_DTYPE:
        assert attn_nodes_post[0].kwargs.get("output_block_scale") is None, \
            "Attention should not have output_block_scale after FP8 fusion"
    elif quant_key.dtype == FP4_DTYPE:
        assert attn_nodes_post[0].kwargs.get("output_block_scale") is not None, \
            "Attention should have output_block_scale after FP4 fusion"  # noqa: E501

    # Check that results are close
    torch.testing.assert_close(result_unfused,
                               result_fused_1,
                               atol=1e-2,
                               rtol=1e-2)
    torch.testing.assert_close(result_unfused,
                               result_fused_2,
                               atol=1e-2,
                               rtol=1e-2)
tests/compile/test_sequence_parallelism.py
View file @
d2b52805
...
...
@@ -104,8 +104,7 @@ class TestQuantModel(torch.nn.Module):
# Initialize weights
torch
.
nn
.
init
.
normal_
(
self
.
gate_proj
,
std
=
0.02
)
self
.
fp8_linear
=
Fp8LinearOp
(
cutlass_fp8_supported
=
True
,
use_per_token_if_dynamic
=
False
)
self
.
fp8_linear
=
Fp8LinearOp
(
use_per_token_if_dynamic
=
False
)
self
.
scale
=
torch
.
rand
(
1
,
dtype
=
torch
.
float32
)
# Create a weight that is compatible with torch._scaled_mm,
...
...
tests/compile/test_silu_mul_quant_fusion.py
View file @
d2b52805
...
...
@@ -4,35 +4,44 @@ import pytest
import
torch
import
vllm.envs
as
envs
from
vllm.compilation.activation_quant_fusion
import
ActivationQuantFusionPass
from
vllm.compilation.fx_utils
import
find_auto_fn
,
find_auto_fn_maybe
from
vllm._custom_ops
import
cutlass_scaled_fp4_mm
,
scaled_fp4_quant
# yapf conflicts with isort for this block
# yapf: disable
from
vllm.compilation.activation_quant_fusion
import
(
FUSED_OPS
,
SILU_MUL_OP
,
ActivationQuantFusionPass
)
# yapf: enable
from
vllm.compilation.fusion
import
QUANT_OPS
from
vllm.compilation.noop_elimination
import
NoOpEliminationPass
from
vllm.config
import
CompilationConfig
,
PassConfig
,
VllmConfig
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.model_executor.layers.quantization.utils.quant_utils
import
(
GroupShape
)
GroupShape
,
kFp8StaticTensorSym
,
kNvfp4Quant
)
from
vllm.model_executor.layers.quantization.utils.w8a8_utils
import
(
CUTLASS_FP8_SUPPORTED
,
Fp8LinearOp
)
Fp8LinearOp
)
from
vllm.platforms
import
current_platform
from
.backend
import
TestBackend
FP8_DTYPE
=
current_platform
.
fp8_dtype
()
FP4_DTYPE
=
torch
.
uint8
class
TestModel
(
torch
.
nn
.
Module
):
def
__init__
(
self
,
hidden_size
:
int
,
cutlass_fp8_enabled
:
bool
,
*
args
,
**
kwargs
):
super
().
__init__
(
*
args
,
**
kwargs
)
def
is_nvfp4_supported
():
return
current_platform
.
has_device_capability
(
100
)
class
TestSiluMulFp8QuantModel
(
torch
.
nn
.
Module
):
def
__init__
(
self
,
hidden_size
:
int
,
force_fp8_e4m3fnuz
:
bool
,
**
kwargs
):
super
().
__init__
()
self
.
silu_and_mul
=
SiluAndMul
()
self
.
wscale
=
torch
.
rand
(
1
,
dtype
=
torch
.
float32
)
self
.
scale
=
torch
.
rand
(
1
,
dtype
=
torch
.
float32
)
self
.
w
=
(
torch
.
rand
(
hidden_size
,
hidden_size
).
to
(
dtype
=
current_platform
.
fp8_dtype
()).
t
())
self
.
w
=
torch
.
rand
(
hidden_size
,
hidden_size
).
to
(
dtype
=
FP8_DTYPE
).
t
()
self
.
fp8_linear
=
Fp8LinearOp
(
cutlass_fp8_supported
=
cutlass_fp8_enabled
,
force_fp8_e4m3fnuz
=
force_fp8_e4m3fnuz
,
act_quant_static
=
True
,
act_quant_group_shape
=
GroupShape
.
PER_TENSOR
,
)
...
...
@@ -45,15 +54,56 @@ class TestModel(torch.nn.Module):
input_scale
=
self
.
wscale
)
return
x2
def
ops_in_model_before
(
self
):
return
[
SILU_MUL_OP
,
QUANT_OPS
[
kFp8StaticTensorSym
]]
def
ops_in_model_after
(
self
):
return
[
FUSED_OPS
[
kFp8StaticTensorSym
]]
class
TestSiluMulNvfp4QuantModel
(
torch
.
nn
.
Module
):
def
__init__
(
self
,
hidden_size
:
int
,
**
kwargs
):
super
().
__init__
()
self
.
silu_and_mul
=
SiluAndMul
()
self
.
w
=
torch
.
randint
(
256
,
(
hidden_size
,
hidden_size
//
2
),
dtype
=
FP4_DTYPE
)
self
.
wscale
=
torch
.
randn
(
hidden_size
,
hidden_size
//
16
).
to
(
dtype
=
FP8_DTYPE
)
self
.
wscale2
=
torch
.
rand
(
1
,
dtype
=
torch
.
float32
)
self
.
scale
=
torch
.
rand
(
1
,
dtype
=
torch
.
float32
)
@
pytest
.
mark
.
parametrize
(
"num_tokens"
,
[
256
])
@
pytest
.
mark
.
parametrize
(
"hidden_size"
,
[
64
])
@
pytest
.
mark
.
parametrize
(
"cutlass_fp8_enabled"
,
[
True
,
False
]
if
CUTLASS_FP8_SUPPORTED
else
[
False
])
def
forward
(
self
,
x
):
y
=
self
.
silu_and_mul
(
x
)
y_quant
,
y_block_scale
=
scaled_fp4_quant
(
y
,
1
/
self
.
scale
)
out
=
cutlass_scaled_fp4_mm
(
a
=
y_quant
,
b
=
self
.
w
,
block_scale_a
=
y_block_scale
,
block_scale_b
=
self
.
wscale
,
alpha
=
self
.
scale
*
self
.
wscale2
,
out_dtype
=
y
.
dtype
)
return
out
def
ops_in_model_before
(
self
):
return
[
SILU_MUL_OP
,
QUANT_OPS
[
kNvfp4Quant
]]
def
ops_in_model_after
(
self
):
return
[
FUSED_OPS
[
kNvfp4Quant
]]
@
pytest
.
mark
.
parametrize
(
"num_tokens"
,
[
64
])
@
pytest
.
mark
.
parametrize
(
"hidden_size"
,
[
128
])
@
pytest
.
mark
.
parametrize
(
"model_class"
,
[
TestSiluMulFp8QuantModel
,
TestSiluMulNvfp4QuantModel
]
if
is_nvfp4_supported
()
else
[
TestSiluMulFp8QuantModel
])
@
pytest
.
mark
.
parametrize
(
"force_fp8_e4m3fnuz"
,
[
True
,
False
])
@
pytest
.
mark
.
skipif
(
envs
.
VLLM_TARGET_DEVICE
not
in
[
"cuda"
,
"rocm"
],
reason
=
"Only test on CUDA and ROCm"
)
def
test_fusion_silu_and_mul_quant
(
num_tokens
,
hidden_size
,
cutlass_fp8_enabled
):
def
test_fusion_silu_and_mul_quant
(
num_tokens
,
hidden_size
,
model_class
,
force_fp8_e4m3fnuz
):
if
model_class
==
TestSiluMulNvfp4QuantModel
and
force_fp8_e4m3fnuz
:
pytest
.
skip
(
"Duplicate tests for NVFP4"
)
torch
.
set_default_device
(
"cuda"
)
torch
.
set_default_dtype
(
torch
.
float16
)
...
...
@@ -64,7 +114,8 @@ def test_fusion_silu_and_mul_quant(num_tokens, hidden_size,
fusion_pass
=
ActivationQuantFusionPass
(
config
)
backend
=
TestBackend
(
NoOpEliminationPass
(
config
),
fusion_pass
)
model
=
TestModel
(
hidden_size
,
cutlass_fp8_enabled
)
model
=
model_class
(
hidden_size
=
hidden_size
,
force_fp8_e4m3fnuz
=
force_fp8_e4m3fnuz
)
# First dimension dynamic
x
=
torch
.
rand
(
num_tokens
,
hidden_size
*
2
)
...
...
@@ -81,17 +132,8 @@ def test_fusion_silu_and_mul_quant(num_tokens, hidden_size,
atol
=
1e-3
,
rtol
=
1e-3
)
# Check substitution worked
pre_nodes
=
backend
.
graph_pre_pass
.
nodes
post_nodes
=
backend
.
graph_post_pass
.
nodes
silu_and_mul_quant
=
torch
.
ops
.
_C
.
silu_and_mul_quant
.
default
fp8_quant
=
torch
.
ops
.
_C
.
static_scaled_fp8_quant
.
default
# In pre-nodes, fp8 quant should be present and fused kernels should not
assert
find_auto_fn_maybe
(
pre_nodes
,
silu_and_mul_quant
)
is
None
find_auto_fn
(
pre_nodes
,
fp8_quant
)
# In pre-nodes, quant op should be present and fused kernels should not
backend
.
check_before_ops
(
model
.
ops_in_model_before
())
# In post-nodes, fused kernels should be present and fp8 quant should not
find_auto_fn
(
post_nodes
,
silu_and_mul_quant
)
assert
find_auto_fn_maybe
(
post_nodes
,
fp8_quant
)
is
None
# In post-nodes, fused kernels should be present and quant op should not
backend
.
check_after_ops
(
model
.
ops_in_model_after
())
tests/conftest.py
View file @
d2b52805
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
json
import
math
import
os
import
tempfile
from
enum
import
Enum
from
typing
import
Any
,
Callable
,
Optional
,
TypedDict
,
TypeVar
,
Union
from
typing
import
Any
,
Callable
,
Optional
,
TypedDict
,
TypeVar
,
Union
,
cast
import
numpy
as
np
import
pytest
...
...
@@ -33,6 +34,7 @@ from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
from
vllm.logger
import
init_logger
from
vllm.outputs
import
RequestOutput
from
vllm.sampling_params
import
BeamSearchParams
from
vllm.sequence
import
Logprob
from
vllm.transformers_utils.utils
import
maybe_model_redirect
logger
=
init_logger
(
__name__
)
...
...
@@ -454,8 +456,15 @@ class HfRunner:
# output is final logits
all_inputs
=
self
.
get_inputs
(
prompts
)
outputs
=
[]
problem_type
=
getattr
(
self
.
config
,
"problem_type"
,
""
)
for
inputs
in
all_inputs
:
output
=
self
.
model
(
**
self
.
wrap_device
(
inputs
))
if
problem_type
==
"regression"
:
logits
=
output
.
logits
[
0
].
tolist
()
elif
problem_type
==
"multi_label_classification"
:
logits
=
output
.
logits
.
sigmoid
()[
0
].
tolist
()
else
:
logits
=
output
.
logits
.
softmax
(
dim
=-
1
)[
0
].
tolist
()
outputs
.
append
(
logits
)
...
...
@@ -594,7 +603,7 @@ class HfRunner:
def
_hidden_states_to_logprobs
(
self
,
hidden_states
:
tuple
[
tuple
[
torch
.
Tensor
,
...],
...],
num_logprobs
:
int
,
num_logprobs
:
Optional
[
int
]
,
)
->
tuple
[
list
[
dict
[
int
,
float
]],
int
]:
seq_logprobs
=
self
.
_hidden_states_to_seq_logprobs
(
hidden_states
)
output_len
=
len
(
hidden_states
)
...
...
@@ -622,7 +631,7 @@ class HfRunner:
self
,
prompts
:
list
[
str
],
max_tokens
:
int
,
num_logprobs
:
int
,
num_logprobs
:
Optional
[
int
]
,
images
:
Optional
[
PromptImageInput
]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
videos
:
Optional
[
PromptVideoInput
]
=
None
,
...
...
@@ -669,7 +678,7 @@ class HfRunner:
self
,
encoder_decoder_prompts
:
list
[
ExplicitEncoderDecoderPrompt
[
str
,
str
]],
max_tokens
:
int
,
num_logprobs
:
int
,
num_logprobs
:
Optional
[
int
]
,
images
:
Optional
[
PromptImageInput
]
=
None
,
**
kwargs
:
Any
,
)
->
list
[
TokensTextLogprobs
]:
...
...
@@ -958,7 +967,7 @@ class VllmRunner:
self
,
prompts
:
list
[
str
],
max_tokens
:
int
,
num_logprobs
:
int
,
num_logprobs
:
Optional
[
int
]
,
num_prompt_logprobs
:
Optional
[
int
]
=
None
,
images
:
Optional
[
PromptImageInput
]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
...
...
@@ -983,11 +992,40 @@ class VllmRunner:
videos
=
videos
,
**
kwargs
)
def
generate_prompt_perplexity
(
self
,
prompts
:
list
[
str
])
->
list
[
float
]:
"""
Return the perplexity score associated with generating the prompts
:param prompts: list of prompts to score
:return: perplexity score of each prompt
"""
outputs
=
self
.
generate_greedy_logprobs
(
prompts
,
max_tokens
=
1
,
num_logprobs
=
None
,
num_prompt_logprobs
=
0
)
perplexities
=
[]
for
output
in
outputs
:
output
=
cast
(
TokensTextLogprobsPromptLogprobs
,
output
)
token_datas
=
cast
(
list
[
Optional
[
dict
[
int
,
Logprob
]]],
output
[
3
])
assert
token_datas
[
0
]
is
None
token_log_probs
=
[]
for
token_data
in
token_datas
[
1
:]:
assert
token_data
is
not
None
assert
len
(
token_data
)
==
1
token_log_prob
=
list
(
token_data
.
values
())[
0
].
logprob
token_log_probs
.
append
(
token_log_prob
)
perplexity
=
math
.
exp
(
-
sum
(
token_log_probs
)
/
len
(
token_log_probs
))
perplexities
.
append
(
perplexity
)
return
perplexities
def
generate_encoder_decoder_greedy_logprobs
(
self
,
encoder_decoder_prompts
:
list
[
ExplicitEncoderDecoderPrompt
[
str
,
str
]],
max_tokens
:
int
,
num_logprobs
:
int
,
num_logprobs
:
Optional
[
int
]
,
num_prompt_logprobs
:
Optional
[
int
]
=
None
,
skip_special_tokens
:
bool
=
True
,
)
->
Union
[
list
[
TokensTextLogprobs
],
...
...
@@ -1014,15 +1052,17 @@ class VllmRunner:
images
:
Optional
[
PromptImageInput
]
=
None
,
videos
:
Optional
[
PromptVideoInput
]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
concurrency_limit
:
Optional
[
int
]
=
None
,
)
->
list
[
tuple
[
list
[
list
[
int
]],
list
[
str
]]]:
inputs
=
self
.
get_inputs
(
prompts
,
images
=
images
,
videos
=
videos
,
audios
=
audios
)
outputs
=
self
.
llm
.
beam_search
(
inputs
,
BeamSearchParams
(
beam_width
=
beam_width
,
max_tokens
=
max_tokens
))
outputs
=
self
.
llm
.
beam_search
(
inputs
,
BeamSearchParams
(
beam_width
=
beam_width
,
max_tokens
=
max_tokens
),
concurrency_limit
=
concurrency_limit
)
returned_outputs
=
[]
for
output
in
outputs
:
token_ids
=
[
x
.
tokens
for
x
in
output
.
sequences
]
...
...
@@ -1080,6 +1120,9 @@ class VllmRunner:
return
self
.
llm
.
llm_engine
.
collective_rpc
(
_apply_model
)
def
get_llm
(
self
)
->
LLM
:
return
self
.
llm
def
__enter__
(
self
):
return
self
...
...
tests/core/block/e2e/test_correctness_sliding_window.py
View file @
d2b52805
...
...
@@ -32,7 +32,7 @@ BLOCK_SIZE = 16
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
5
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
@
pytest
.
mark
.
parametrize
(
"backend"
,
[
"FLASH_ATTN"
,
"FLASHINFER"
,
"XFORMERS"
])
@
pytest
.
mark
.
parametrize
(
"backend"
,
[
"FLASH_ATTN"
,
"XFORMERS"
])
def
test_sliding_window_retrieval
(
baseline_llm_generator
,
test_llm_generator
,
batch_size
,
seed
,
backend
,
monkeypatch
):
"""
...
...
@@ -43,8 +43,6 @@ def test_sliding_window_retrieval(baseline_llm_generator, test_llm_generator,
Additionally, we compare the results of the v1 and v2 managers.
"""
if
backend
==
"FLASHINFER"
and
current_platform
.
is_rocm
():
pytest
.
skip
(
"Flashinfer does not support ROCm/HIP."
)
if
backend
==
"XFORMERS"
and
current_platform
.
is_rocm
():
pytest
.
skip
(
"Xformers does not support ROCm/HIP."
)
...
...
@@ -96,7 +94,7 @@ def test_sliding_window_retrieval(baseline_llm_generator, test_llm_generator,
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[{
"enable_chunked_prefill"
:
True
}])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
5
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
@
pytest
.
mark
.
parametrize
(
"backend"
,
[
"FLASH_ATTN"
,
"FLASHINFER"
,
"XFORMERS"
])
@
pytest
.
mark
.
parametrize
(
"backend"
,
[
"FLASH_ATTN"
,
"XFORMERS"
])
def
test_sliding_window_chunked_prefill
(
test_llm_generator
,
batch_size
,
seed
,
backend
,
monkeypatch
):
"""
...
...
@@ -107,8 +105,6 @@ def test_sliding_window_chunked_prefill(test_llm_generator, batch_size, seed,
The results with and without chunked prefill are not the same due to
numerical instabilities.
"""
if
backend
==
"FLASHINFER"
and
current_platform
.
is_rocm
():
pytest
.
skip
(
"Flashinfer does not support ROCm/HIP."
)
if
backend
==
"XFORMERS"
and
current_platform
.
is_rocm
():
pytest
.
skip
(
"Xformers does not support ROCm/HIP."
)
override_backend_env_variable
(
monkeypatch
,
backend
)
...
...
tests/distributed/test_comm_ops.py
View file @
d2b52805
...
...
@@ -18,7 +18,8 @@ from vllm.distributed import (broadcast_tensor_dict, get_pp_group,
tensor_model_parallel_all_reduce
,
tensor_model_parallel_reduce_scatter
)
from
..utils
import
init_test_distributed_environment
,
multi_process_parallel
from
..utils
import
(
init_test_distributed_environment
,
multi_gpu_test
,
multi_process_parallel
)
@
ray
.
remote
(
num_gpus
=
1
,
max_calls
=
1
)
...
...
@@ -226,8 +227,7 @@ def send_recv_test_worker(
torch
.
testing
.
assert_close
(
test_tensor
,
recv_tensor
)
@
pytest
.
mark
.
skipif
(
torch
.
cuda
.
device_count
()
<
2
,
reason
=
"Need at least 2 GPUs to run the test."
)
@
multi_gpu_test
(
num_gpus
=
2
)
@
pytest
.
mark
.
parametrize
(
"tp_size"
,
[
2
])
@
pytest
.
mark
.
parametrize
(
"test_target"
,
[
all_reduce_test_worker
,
all_gather_test_worker
,
...
...
@@ -241,8 +241,7 @@ def test_multi_process_tensor_parallel(
multi_process_parallel
(
monkeypatch
,
tp_size
,
1
,
test_target
)
@
pytest
.
mark
.
skipif
(
torch
.
cuda
.
device_count
()
<
2
,
reason
=
"Need at least 2 GPUs to run the test."
)
@
multi_gpu_test
(
num_gpus
=
2
)
@
pytest
.
mark
.
parametrize
(
"pp_size"
,
[
2
])
@
pytest
.
mark
.
parametrize
(
"test_target"
,
[
send_recv_test_worker
,
send_recv_tensor_dict_test_worker
])
...
...
@@ -254,8 +253,7 @@ def test_multi_process_pipeline_parallel(
multi_process_parallel
(
monkeypatch
,
1
,
pp_size
,
test_target
)
@
pytest
.
mark
.
skipif
(
torch
.
cuda
.
device_count
()
<
4
,
reason
=
"Need at least 4 GPUs to run the test."
)
@
multi_gpu_test
(
num_gpus
=
4
)
@
pytest
.
mark
.
parametrize
(
"tp_size"
,
[
2
])
@
pytest
.
mark
.
parametrize
(
"pp_size"
,
[
2
])
@
pytest
.
mark
.
parametrize
(
"test_target"
,
[
...
...
Prev
1
…
5
6
7
8
9
10
11
12
13
…
26
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment