Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
711aa9d5
Commit
711aa9d5
authored
Jul 30, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.10.0' into v0.10.0-dev
parents
751c492c
6d8d0a24
Changes
519
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
803 additions
and
196 deletions
+803
-196
requirements/tpu.txt
requirements/tpu.txt
+4
-5
setup.py
setup.py
+10
-40
tests/async_engine/test_api_server.py
tests/async_engine/test_api_server.py
+2
-6
tests/basic_correctness/test_basic_correctness.py
tests/basic_correctness/test_basic_correctness.py
+3
-3
tests/basic_correctness/test_chunked_prefill.py
tests/basic_correctness/test_chunked_prefill.py
+0
-58
tests/basic_correctness/test_preemption.py
tests/basic_correctness/test_preemption.py
+5
-5
tests/compile/piecewise/test_multiple_graphs.py
tests/compile/piecewise/test_multiple_graphs.py
+350
-0
tests/compile/test_config.py
tests/compile/test_config.py
+24
-0
tests/compile/test_full_graph.py
tests/compile/test_full_graph.py
+6
-0
tests/compile/test_fusion_all_reduce.py
tests/compile/test_fusion_all_reduce.py
+150
-0
tests/compile/test_fusion_attn.py
tests/compile/test_fusion_attn.py
+2
-0
tests/compile/test_silu_mul_quant_fusion.py
tests/compile/test_silu_mul_quant_fusion.py
+30
-7
tests/compile/untest_fusion.py
tests/compile/untest_fusion.py
+7
-4
tests/conftest.py
tests/conftest.py
+28
-22
tests/core/test_num_computed_tokens_update.py
tests/core/test_num_computed_tokens_update.py
+2
-3
tests/core/test_serialization.py
tests/core/test_serialization.py
+1
-1
tests/core/utils.py
tests/core/utils.py
+131
-3
tests/detokenizer/test_stop_reason.py
tests/detokenizer/test_stop_reason.py
+1
-1
tests/detokenizer/test_stop_strings.py
tests/detokenizer/test_stop_strings.py
+21
-22
tests/distributed/test_pipeline_parallel.py
tests/distributed/test_pipeline_parallel.py
+26
-16
No files found.
Too many changes to show.
To preserve performance only
519 of 519+
files are displayed.
Plain diff
Email patch
requirements/tpu.txt
View file @
711aa9d5
...
...
@@ -18,9 +18,8 @@ setuptools==78.1.0
--find-links https://storage.googleapis.com/libtpu-releases/index.html
--find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
--find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
torch==2.8.0.dev20250618
torchvision==0.23.0.dev20250618
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250618-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250618-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250618-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
torch==2.9.0.dev20250716
torchvision==0.24.0.dev20250716
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev20250716-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.9.0.dev20250716-cp312-cp312-linux_x86_64.whl ; python_version == "3.12"
setup.py
View file @
711aa9d5
...
...
@@ -421,29 +421,6 @@ class repackage_wheel(build_ext):
package_data
[
package_name
].
append
(
file_name
)
def
_is_hpu
()
->
bool
:
# if VLLM_TARGET_DEVICE env var was set explicitly, skip HPU autodetection
if
os
.
getenv
(
"VLLM_TARGET_DEVICE"
,
None
)
==
VLLM_TARGET_DEVICE
:
return
VLLM_TARGET_DEVICE
==
"hpu"
# if VLLM_TARGET_DEVICE was not set explicitly, check if hl-smi succeeds,
# and if it doesn't, check if habanalabs driver is loaded
is_hpu_available
=
False
try
:
out
=
subprocess
.
run
([
"hl-smi"
],
capture_output
=
True
,
check
=
True
)
is_hpu_available
=
out
.
returncode
==
0
except
(
FileNotFoundError
,
PermissionError
,
subprocess
.
CalledProcessError
):
if
sys
.
platform
.
startswith
(
"linux"
):
try
:
output
=
subprocess
.
check_output
(
'lsmod | grep habanalabs | wc -l'
,
shell
=
True
)
is_hpu_available
=
int
(
output
)
>
0
except
(
ValueError
,
FileNotFoundError
,
PermissionError
,
subprocess
.
CalledProcessError
):
pass
return
is_hpu_available
def
_no_device
()
->
bool
:
return
VLLM_TARGET_DEVICE
==
"empty"
...
...
@@ -451,7 +428,7 @@ def _no_device() -> bool:
def
_is_cuda
()
->
bool
:
has_cuda
=
torch
.
version
.
cuda
is
not
None
return
(
VLLM_TARGET_DEVICE
==
"cuda"
and
has_cuda
and
not
(
_is_neuron
()
or
_is_tpu
()
or
_is_hpu
()
))
and
not
(
_is_neuron
()
or
_is_tpu
()))
def
_is_hip
()
->
bool
:
...
...
@@ -576,9 +553,9 @@ def get_version_add(sha: Optional[str] = None) -> str:
new_version_content
=
f
"""
try:
__version__ = "0.
9.2
"
__version_tuple__ = (0,
9
,
2
)
__hcu_version__ = f'0.
9.2
+
{
version
}
'
__version__ = "0.
10.0
"
__version_tuple__ = (0,
10
,
0
)
__hcu_version__ = f'0.
10.0
+
{
version
}
'
from vllm.version import __version__, __version_tuple__, __hcu_version__
except Exception as e:
...
...
@@ -677,12 +654,6 @@ def get_vllm_version() -> str:
if
neuron_version
!=
MAIN_CUDA_VERSION
:
neuron_version_str
=
neuron_version
.
replace
(
"."
,
""
)[:
3
]
version
+=
f
"
{
sep
}
neuron
{
neuron_version_str
}
"
elif
_is_hpu
():
# Get the Intel Gaudi Software Suite version
gaudi_sw_version
=
str
(
get_gaudi_sw_version
())
if
gaudi_sw_version
!=
MAIN_CUDA_VERSION
:
gaudi_sw_version
=
gaudi_sw_version
.
replace
(
"."
,
""
)[:
3
]
version
+=
f
"
{
sep
}
gaudi
{
gaudi_sw_version
}
"
elif
_is_tpu
():
version
+=
f
"
{
sep
}
tpu"
elif
_is_cpu
():
...
...
@@ -729,8 +700,6 @@ def get_requirements() -> list[str]:
requirements
=
_read_requirements
(
"rocm.txt"
)
elif
_is_neuron
():
requirements
=
_read_requirements
(
"neuron.txt"
)
elif
_is_hpu
():
requirements
=
_read_requirements
(
"hpu.txt"
)
elif
_is_tpu
():
requirements
=
_read_requirements
(
"tpu.txt"
)
elif
_is_cpu
():
...
...
@@ -739,8 +708,7 @@ def get_requirements() -> list[str]:
requirements
=
_read_requirements
(
"xpu.txt"
)
else
:
raise
ValueError
(
"Unsupported platform, please use CUDA, ROCm, Neuron, HPU, "
"or CPU."
)
"Unsupported platform, please use CUDA, ROCm, Neuron, or CPU."
)
return
requirements
...
...
@@ -811,10 +779,12 @@ setup(
install_requires
=
get_requirements
(),
extras_require
=
{
"bench"
:
[
"pandas"
,
"datasets"
],
"tensorizer"
:
[
"tensorizer
>
=2.
9.0
"
],
"tensorizer"
:
[
"tensorizer
=
=2.
10.1
"
],
"fastsafetensors"
:
[
"fastsafetensors >= 0.1.10"
],
"runai"
:
[
"runai-model-streamer"
,
"runai-model-streamer-s3"
,
"boto3"
],
"audio"
:
[
"librosa"
,
"soundfile"
],
# Required for audio processing
"runai"
:
[
"runai-model-streamer >= 0.13.3"
,
"runai-model-streamer-s3"
,
"boto3"
],
"audio"
:
[
"librosa"
,
"soundfile"
,
"mistral_common[audio]"
],
# Required for audio processing
"video"
:
[]
# Kept for backwards compatibility
},
cmdclass
=
cmdclass
,
...
...
tests/async_engine/test_api_server.py
View file @
711aa9d5
...
...
@@ -31,7 +31,7 @@ def _query_server_long(prompt: str) -> dict:
@
pytest
.
fixture
def
api_server
(
tokenizer_pool_size
:
int
,
distributed_executor_backend
:
str
):
def
api_server
(
distributed_executor_backend
:
str
):
script_path
=
Path
(
__file__
).
parent
.
joinpath
(
"api_server_async_engine.py"
).
absolute
()
commands
=
[
...
...
@@ -42,8 +42,6 @@ def api_server(tokenizer_pool_size: int, distributed_executor_backend: str):
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
),
"--host"
,
"127.0.0.1"
,
"--tokenizer-pool-size"
,
str
(
tokenizer_pool_size
),
"--distributed-executor-backend"
,
distributed_executor_backend
,
]
...
...
@@ -56,10 +54,8 @@ def api_server(tokenizer_pool_size: int, distributed_executor_backend: str):
uvicorn_process
.
terminate
()
@
pytest
.
mark
.
parametrize
(
"tokenizer_pool_size"
,
[
0
,
2
])
@
pytest
.
mark
.
parametrize
(
"distributed_executor_backend"
,
[
"mp"
,
"ray"
])
def
test_api_server
(
api_server
,
tokenizer_pool_size
:
int
,
distributed_executor_backend
:
str
):
def
test_api_server
(
api_server
,
distributed_executor_backend
:
str
):
"""
Run the API server and test it.
...
...
tests/basic_correctness/test_basic_correctness.py
View file @
711aa9d5
...
...
@@ -259,14 +259,14 @@ def test_failed_model_execution(vllm_runner, monkeypatch) -> None:
# Needed to mock an error in the same process
monkeypatch
.
setenv
(
'VLLM_ENABLE_V1_MULTIPROCESSING'
,
'0'
)
with
vllm_runner
(
os
.
path
.
join
(
models_path_prefix
,
'facebook/opt-125m'
),
enforce_eager
=
True
)
as
vllm_model
:
if
isinstance
(
vllm_model
.
model
.
llm_engine
,
LLMEngineV1
):
with
vllm_runner
(
os
.
path
.
join
(
models_path_prefix
,
'facebook/opt-125m'
),
enforce_eager
=
True
)
as
vllm_model
:
if
isinstance
(
vllm_model
.
llm
.
llm_engine
,
LLMEngineV1
):
v1_test_failed_model_execution
(
vllm_model
)
def
v1_test_failed_model_execution
(
vllm_model
):
engine
=
vllm_model
.
model
.
llm_engine
engine
=
vllm_model
.
llm
.
llm_engine
mocked_execute_model
=
Mock
(
side_effect
=
RuntimeError
(
"Mocked Critical Error"
))
engine
.
engine_core
.
engine_core
.
model_executor
.
execute_model
=
\
...
...
tests/basic_correctness/test_chunked_prefill.py
View file @
711aa9d5
...
...
@@ -301,61 +301,3 @@ def test_with_prefix_caching(
name_0
=
"w/o prefix caching"
,
name_1
=
"with prefix caching"
,
)
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"facebook/opt-125m"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"bfloat16"
,
"half"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
32
])
@
pytest
.
mark
.
parametrize
(
"chunked_prefill_token_size"
,
[
1
,
4
,
16
])
@
pytest
.
mark
.
parametrize
(
"enforce_eager"
,
[
False
])
@
pytest
.
mark
.
parametrize
(
"attention_backend"
,
[
"TORCH_SDPA"
])
@
pytest
.
mark
.
cpu_model
@
pytest
.
mark
.
skipif
(
not
current_platform
.
is_cpu
(),
reason
=
"CPU only"
)
def
test_models_cpu
(
hf_runner
:
HfRunner
,
vllm_runner
:
VllmRunner
,
example_prompts
,
model
:
str
,
dtype
:
str
,
max_tokens
:
int
,
chunked_prefill_token_size
:
int
,
enforce_eager
:
bool
,
attention_backend
:
str
,
monkeypatch
:
pytest
.
MonkeyPatch
,
)
->
None
:
test_models
(
hf_runner
,
vllm_runner
,
example_prompts
,
model
,
dtype
,
max_tokens
,
chunked_prefill_token_size
,
enforce_eager
,
1
,
attention_backend
,
monkeypatch
,
)
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
16
])
@
pytest
.
mark
.
parametrize
(
"enforce_eager"
,
[
False
])
@
pytest
.
mark
.
parametrize
(
"chunk_size"
,
[
30
,
32
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"bfloat16"
,
"half"
])
@
pytest
.
mark
.
cpu_model
@
pytest
.
mark
.
skipif
(
not
current_platform
.
is_cpu
(),
reason
=
"CPU only"
)
def
test_with_prefix_caching_cpu
(
vllm_runner
:
VllmRunner
,
max_tokens
:
int
,
enforce_eager
:
bool
,
chunk_size
:
int
,
dtype
:
str
,
)
->
None
:
test_with_prefix_caching
(
vllm_runner
,
max_tokens
,
enforce_eager
,
chunk_size
,
1
,
dtype
,
)
\ No newline at end of file
tests/basic_correctness/test_preemption.py
View file @
711aa9d5
...
...
@@ -84,7 +84,7 @@ def test_chunked_prefill_recompute(
disable_log_stats
=
False
,
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
assert
(
vllm_model
.
model
.
llm_engine
.
scheduler
[
0
].
artificial_preempt_cnt
assert
(
vllm_model
.
llm
.
llm_engine
.
scheduler
[
0
].
artificial_preempt_cnt
<
ARTIFICIAL_PREEMPTION_MAX_CNT
)
for
i
in
range
(
len
(
example_prompts
)):
...
...
@@ -122,10 +122,10 @@ def test_preemption(
distributed_executor_backend
=
distributed_executor_backend
,
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
assert
(
vllm_model
.
model
.
llm_engine
.
scheduler
[
0
].
artificial_preempt_cnt
assert
(
vllm_model
.
llm
.
llm_engine
.
scheduler
[
0
].
artificial_preempt_cnt
<
ARTIFICIAL_PREEMPTION_MAX_CNT
)
total_preemption
=
(
vllm_model
.
model
.
llm_engine
.
scheduler
[
0
].
num_cumulative_preemption
)
vllm_model
.
llm
.
llm_engine
.
scheduler
[
0
].
num_cumulative_preemption
)
check_outputs_equal
(
outputs_0_lst
=
hf_outputs
,
...
...
@@ -179,12 +179,12 @@ def test_preemption_infeasible(
)
as
vllm_model
:
sampling_params
=
SamplingParams
(
max_tokens
=
max_tokens
,
ignore_eos
=
True
)
req_outputs
=
vllm_model
.
model
.
generate
(
req_outputs
=
vllm_model
.
llm
.
generate
(
example_prompts
,
sampling_params
=
sampling_params
,
)
assert
(
vllm_model
.
model
.
llm_engine
.
scheduler
[
0
].
artificial_preempt_cnt
assert
(
vllm_model
.
llm
.
llm_engine
.
scheduler
[
0
].
artificial_preempt_cnt
<
ARTIFICIAL_PREEMPTION_MAX_CNT
)
# Verify the request is ignored and not hang.
...
...
tests/compile/piecewise/test_multiple_graphs.py
0 → 100644
View file @
711aa9d5
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Test (piecewise) compilation with a simple model where multiple submodules
are compiled and graph captured separately.
"""
import
torch
from
torch
import
nn
from
torch.library
import
Library
from
vllm.compilation.backends
import
set_model_tag
from
vllm.compilation.counter
import
compilation_counter
from
vllm.compilation.decorators
import
(
ignore_torch_compile
,
support_torch_compile
)
from
vllm.config
import
(
CompilationConfig
,
CompilationLevel
,
VllmConfig
,
set_current_vllm_config
)
from
vllm.envs
import
VLLM_USE_V1
from
vllm.forward_context
import
set_forward_context
from
vllm.utils
import
direct_register_custom_op
# create a library to hold the custom op
silly_lib
=
Library
(
"silly"
,
"FRAGMENT"
)
# noqa
BATCH_SIZE
=
32
MLP_SIZE
=
128
HIDDEN_SIZE
=
1024
RANDOM_SEED
=
0
def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
                    out: torch.Tensor) -> None:
    """Toy stand-in for attention: store ``q + k + v`` in ``out`` in place.

    Args:
        q: First operand; copied into ``out``.
        k: Second operand; added to ``out`` in place.
        v: Third operand; added to ``out`` in place.
        out: Destination tensor, overwritten with the element-wise sum.

    Returns:
        None. The result is delivered by mutating ``out``.
    """
    out.copy_(q)
    out += k
    out += v
def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
                         out: torch.Tensor) -> None:
    """No-op counterpart of ``silly_attention`` with the same signature.

    It is handed to ``direct_register_custom_op`` as ``fake_impl``; it does
    not read or write any of its arguments.
    """
    return
# Register ``silly_attention`` as the op "attention" in ``silly_lib`` (it is
# invoked below as ``torch.ops.silly.attention``). ``out`` is declared as the
# only mutated argument and ``silly_attention_fake`` is the fake impl.
direct_register_custom_op(
    op_name="attention",
    op_func=silly_attention,
    mutates_args=["out"],
    fake_impl=silly_attention_fake,
    target_lib=silly_lib,
)
@support_torch_compile
class ParentModel(nn.Module):
    """Minimal base model decorated with ``support_torch_compile``.

    Its ``forward`` is the identity; subclasses in this module override it.
    """

    def __init__(self,
                 *,
                 vllm_config: VllmConfig,
                 prefix: str = '',
                 **kwargs) -> None:
        # The keyword arguments are accepted but not used by this class.
        super().__init__()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``x`` unchanged."""
        return x
class Attention(nn.Module):
    """Toy attention block: linear -> RMS norm -> silly op -> RMS norm -> linear.

    Both linear layers are bias-free and are initialised from a generator
    seeded with ``RANDOM_SEED`` so that every instance gets the same weights.
    """

    def __init__(self, mlp_size: int, hidden_size: int) -> None:
        """Build the projections and the RMS-norm weight.

        Args:
            mlp_size: Input and output feature size of the block.
            hidden_size: Feature size between the two projections.
        """
        super().__init__()
        self.pre_attn = nn.Linear(mlp_size, hidden_size, bias=False)
        self.post_attn = nn.Linear(hidden_size, mlp_size, bias=False)
        self.rms_norm_weight = nn.Parameter(torch.ones(hidden_size))

        # Initialize to same weights for testing
        nn.init.xavier_normal_(
            self.pre_attn.weight.data,
            generator=torch.Generator().manual_seed(RANDOM_SEED),
            gain=0.001)
        nn.init.xavier_normal_(
            self.post_attn.weight.data,
            generator=torch.Generator().manual_seed(RANDOM_SEED),
            gain=0.001)

    def rms_norm_ref(self, x: torch.Tensor) -> torch.Tensor:
        """Reference RMS norm over the last dim, computed in float32.

        The result is scaled by ``rms_norm_weight`` and cast back to the
        dtype of ``x``.
        """
        x_f32 = x.float()
        return (x_f32 * torch.rsqrt(
            torch.mean(x_f32.square(), dim=-1, keepdim=True) + 1e-6) *
                self.rms_norm_weight).to(x.dtype)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the block on ``x`` and return the projected result."""
        x = self.pre_attn(x)
        x = self.rms_norm_ref(x)
        # The custom op writes its result into a pre-allocated output tensor.
        attn_output = torch.empty_like(x)
        torch.ops.silly.attention(x, x, x, attn_output)
        x = attn_output
        x = self.rms_norm_ref(x)
        x = self.post_attn(x)
        return x
@support_torch_compile
class CompiledAttention(nn.Module):
    """``Attention`` wrapped in a module decorated with ``support_torch_compile``."""

    def __init__(self,
                 *,
                 mlp_size: int,
                 hidden_size: int,
                 vllm_config: VllmConfig,
                 prefix: str = '',
                 **kwargs) -> None:
        # ``vllm_config``, ``prefix`` and ``kwargs`` are not used in this body.
        super().__init__()
        self.attn = Attention(mlp_size, hidden_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``self.attn(x)``."""
        return self.attn(x)
@support_torch_compile
class CompiledAttentionTwo(CompiledAttention):
    """Variant of ``CompiledAttention`` that adds the input back to the output."""

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``self.attn(x) + x``."""
        return self.attn(x) + x
@ignore_torch_compile
class SimpleModelWithTwoGraphs(ParentModel):
    """Model holding two separately tagged compiled attention submodules.

    The class itself is decorated with ``ignore_torch_compile``; only
    ``attn_one`` and ``attn_two`` carry ``support_torch_compile``.
    """

    def __init__(self,
                 *,
                 mlp_size: int,
                 hidden_size: int,
                 vllm_config: VllmConfig,
                 prefix: str = '',
                 **kwargs) -> None:
        super().__init__(vllm_config=vllm_config, prefix=prefix)
        # Test will fail without set_model_tag here with error:
        # "ValueError: too many values to unpack (expected 3)"
        # This is because CompiledAttention and CompiledAttentionTwo
        # have different implementations but the same torch.compile
        # cache dir will be used as default prefix is 'model_tag'
        with set_model_tag("attn_one"):
            self.attn_one = CompiledAttention(
                mlp_size=mlp_size,
                hidden_size=hidden_size,
                vllm_config=vllm_config,
                prefix=f"{prefix}.attn_one",
            )
        with set_model_tag("attn_two"):
            self.attn_two = CompiledAttentionTwo(
                mlp_size=mlp_size,
                hidden_size=hidden_size,
                vllm_config=vllm_config,
                prefix=f"{prefix}.attn_two",
            )

        # Persistent buffer that both submodules read their input from.
        self.hidden_states = torch.zeros((BATCH_SIZE, MLP_SIZE)).cuda()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run ``attn_one`` then ``attn_two``, staging inputs in the buffer."""
        bsz = x.shape[0]
        # CUDAGraph expects same tensor addresses for each run
        self.hidden_states[:bsz].copy_(x)
        x = self.attn_one(self.hidden_states[:bsz])
        self.hidden_states[:bsz].copy_(x)
        x = self.attn_two(self.hidden_states[:bsz])
        return x
def test_ignore_torch_compile_decorator():
    """Check how the compile decorators override each other under inheritance.

    ``A`` is decorated with ``support_torch_compile``, ``B(A)`` with
    ``ignore_torch_compile`` and ``C(B)`` with ``support_torch_compile``
    again. Each is run under ``compilation_counter.expect`` with the counts
    the innermost decorator should produce.
    """
    assert VLLM_USE_V1

    # piecewise
    vllm_config = VllmConfig(compilation_config=CompilationConfig(
        level=CompilationLevel.PIECEWISE,
        use_cudagraph=True,
        splitting_ops=["silly.attention"],
        cudagraph_capture_sizes=[1, 2],
    ))

    @support_torch_compile
    class A(nn.Module):

        def __init__(self,
                     *,
                     vllm_config: VllmConfig,
                     prefix: str = '',
                     **kwargs) -> None:
            super().__init__()

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            x = x + x
            attn_output = torch.empty_like(x)
            torch.ops.silly.attention(x, x, x, attn_output)
            x = attn_output
            x = x * 3
            return x

    @ignore_torch_compile
    class B(A):
        ...

    @support_torch_compile
    class C(B):
        ...

    with set_current_vllm_config(vllm_config):
        mod_A = A(vllm_config=vllm_config, prefix='').eval().cuda()

    # A has support_torch_compile
    with compilation_counter.expect(
            num_graphs_seen=1,
            num_piecewise_graphs_seen=3,
            num_piecewise_capturable_graphs_seen=2,
            num_backend_compilations=2,
            num_cudagraph_captured=4,
            # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
    ), set_forward_context({}, vllm_config=vllm_config):
        # first run is for compile
        mod_A(torch.randn(BATCH_SIZE, MLP_SIZE).cuda())
        # run cudagraph captured sizes
        mod_A(torch.randn(2, MLP_SIZE).cuda())
        mod_A(torch.randn(1, MLP_SIZE).cuda())

    with set_current_vllm_config(vllm_config):
        mod_B = B(vllm_config=vllm_config, prefix='').eval().cuda()

    # B's ignore_torch_compile should override A's support_torch_compile
    with compilation_counter.expect(
            num_graphs_seen=0,
            num_piecewise_graphs_seen=0,
            num_piecewise_capturable_graphs_seen=0,
            num_backend_compilations=0,
            num_cudagraph_captured=0,
    ), set_forward_context({}, vllm_config=vllm_config):
        mod_B(torch.randn(BATCH_SIZE, MLP_SIZE).cuda())
        mod_B(torch.randn(2, MLP_SIZE).cuda())
        mod_B(torch.randn(1, MLP_SIZE).cuda())

    with set_current_vllm_config(vllm_config):
        mod_C = C(vllm_config=vllm_config, prefix='').eval().cuda()

    # C's support_torch_compile should override B's ignore_torch_compile
    with compilation_counter.expect(
            num_graphs_seen=1,
            num_piecewise_graphs_seen=3,
            num_piecewise_capturable_graphs_seen=2,
            num_backend_compilations=2,
            num_cudagraph_captured=4,
            # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
    ), set_forward_context({}, vllm_config=vllm_config):
        mod_C(torch.randn(BATCH_SIZE, MLP_SIZE).cuda())
        mod_C(torch.randn(2, MLP_SIZE).cuda())
        mod_C(torch.randn(1, MLP_SIZE).cuda())
@torch.inference_mode
def run_model(vllm_config, model: nn.Module, inputs: torch.Tensor):
    """Run ``model`` on several slices of ``inputs`` and return the last output.

    Args:
        vllm_config: Config passed to ``set_forward_context``.
        model: Module under test.
        inputs: Input batch; the full batch and its first 2 and 1 rows are
            fed to the model.

    Returns:
        The output for ``inputs[:2]`` from the final call, moved to the CPU.
    """
    with set_forward_context({}, vllm_config=vllm_config):
        # First run is for compile
        model(inputs)

        # Run CUDAGraph captured sizes
        model(inputs[:2])
        model(inputs[:1])

        output = model(inputs[:2])

    # One transfer is enough; the original called ``.cpu()`` twice in a row.
    return output.cpu()
def test_multi_graph_piecewise_compile_outputs_equal():
    """Compare ``SimpleModelWithTwoGraphs`` outputs across three configs.

    The model is built and run with (0) piecewise compilation plus CUDA
    graphs, (1) no compilation, and (2) piecewise compilation without CUDA
    graphs. Compilation counters are checked for each run, then outputs 0/1
    must be close and outputs 0/2 must be exactly equal.
    """
    outputs = []

    # piecewise compile
    vllm_config = VllmConfig(compilation_config=CompilationConfig(
        level=CompilationLevel.PIECEWISE,
        use_cudagraph=True,
        splitting_ops=["silly.attention"],
        cudagraph_capture_sizes=[1, 2],
    ))

    with set_current_vllm_config(vllm_config):
        model = SimpleModelWithTwoGraphs(mlp_size=MLP_SIZE,
                                         hidden_size=HIDDEN_SIZE,
                                         vllm_config=vllm_config,
                                         prefix='').eval().cuda()

    # Pre-allocate memory for CUDAGraph which expects
    # static tensor addresses
    inputs = torch.randn(BATCH_SIZE, MLP_SIZE).cuda()

    with compilation_counter.expect(
            num_graphs_seen=2,  # two graphs for the model
            num_piecewise_graphs_seen=6,
            # attn_one, attn_two each has 3 piecewise graphs
            # (pre attn, post attn, silly_attention) each
            num_piecewise_capturable_graphs_seen=4,
            # attn_one, attn_two has pre attn and post attn each, total=4
            num_backend_compilations=4,  # num_piecewise_capturable_graphs_seen
            num_cudagraph_captured=8,
            # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
    ):
        outputs.append(run_model(vllm_config, model, inputs))

    # no compile or cudagraph
    vllm_config = VllmConfig(compilation_config=CompilationConfig(
        level=CompilationLevel.NO_COMPILATION, ))

    with set_current_vllm_config(vllm_config):
        model = SimpleModelWithTwoGraphs(mlp_size=MLP_SIZE,
                                         hidden_size=HIDDEN_SIZE,
                                         vllm_config=vllm_config,
                                         prefix='').eval().cuda()

    with compilation_counter.expect(
            num_graphs_seen=0,
            num_piecewise_graphs_seen=0,
            num_piecewise_capturable_graphs_seen=0,
            num_backend_compilations=0,
            num_cudagraph_captured=0,
    ):
        outputs.append(run_model(vllm_config, model, inputs))

    # piecewise compile without CUDA graph
    vllm_config = VllmConfig(compilation_config=CompilationConfig(
        level=CompilationLevel.PIECEWISE,
        use_cudagraph=False,
        splitting_ops=["silly.attention"],
    ))

    with set_current_vllm_config(vllm_config):
        model = SimpleModelWithTwoGraphs(mlp_size=MLP_SIZE,
                                         hidden_size=HIDDEN_SIZE,
                                         vllm_config=vllm_config,
                                         prefix='').eval().cuda()

    with compilation_counter.expect(
            num_graphs_seen=2,
            num_piecewise_graphs_seen=6,
            num_piecewise_capturable_graphs_seen=4,
            num_backend_compilations=4,
            num_cudagraph_captured=0,  # no cudagraph captured
    ):
        outputs.append(run_model(vllm_config, model, inputs))

    # Generally don't expect outputs with and without inductor
    # to be bitwise equivalent
    assert torch.allclose(outputs[0], outputs[1])

    # Expect bitwise equivalence using inductor w/ and w/o cudagraph
    assert torch.equal(outputs[0], outputs[2])
tests/compile/test_config.py
View file @
711aa9d5
...
...
@@ -26,6 +26,30 @@ def test_use_cudagraphs_dynamic(monkeypatch):
assert
not
vllm_config
.
compilation_config
.
use_cudagraph
# NB: We don't test VLLM_DISABLE_COMPILE_CACHE=0 because that depends
# on the state of the cache directory on the current machine, which
# may be influenced by other tests.
@pytest.mark.parametrize("val", ["1"])
def test_VLLM_DISABLE_COMPILE_CACHE(vllm_runner, monkeypatch, val):
    """With VLLM_DISABLE_COMPILE_CACHE set, expect no cache writes.

    Loads ``facebook/opt-125m`` under ``compilation_counter.expect`` with
    zero cache entries updated and zero compiled artifacts saved.
    """
    assert vllm.envs.VLLM_USE_V1

    # spawn means that the counters are in the same process.
    monkeypatch.setenv('VLLM_WORKER_MULTIPROC_METHOD', "spawn")
    monkeypatch.setenv('VLLM_DISABLE_COMPILE_CACHE', val)

    compilation_config = {
        "use_cudagraph": False,  # speed things up a bit
    }
    with (
            compilation_counter.expect(num_cache_entries_updated=0,
                                       num_compiled_artifacts_saved=0),
            # loading the model causes compilation (if enabled) to happen
            vllm_runner('facebook/opt-125m',
                        compilation_config=compilation_config,
                        gpu_memory_utilization=0.4) as _):
        pass
@
pytest
.
mark
.
parametrize
(
"enabled"
,
[
True
,
False
])
def
test_use_cudagraphs
(
vllm_runner
,
monkeypatch
,
enabled
):
assert
vllm
.
envs
.
VLLM_USE_V1
...
...
tests/compile/test_full_graph.py
View file @
711aa9d5
...
...
@@ -3,6 +3,7 @@
from
__future__
import
annotations
import
tempfile
from
typing
import
Any
,
Optional
,
Union
import
pytest
...
...
@@ -111,6 +112,11 @@ def test_full_graph(
pass_config
=
PassConfig
(
enable_fusion
=
True
,
enable_noop
=
True
)),
model
)
for
model
in
models_list
(
keywords
=
[
"FP8-dynamic"
,
"quantized.w8a8"
])
]
+
[
# Test depyf integration works
(
CompilationConfig
(
level
=
CompilationLevel
.
PIECEWISE
,
debug_dump_path
=
tempfile
.
gettempdir
()),
(
"facebook/opt-125m"
,
{})),
])
# only test some of the models
@
create_new_process_for_each_test
()
...
...
tests/compile/test_fusion_all_reduce.py
0 → 100644
View file @
711aa9d5
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
importlib.util
import
find_spec
import
pytest
import
torch
import
vllm.envs
as
envs
from
vllm.compilation.collective_fusion
import
AllReduceFusionPass
from
vllm.config
import
(
CompilationConfig
,
CompilationLevel
,
DeviceConfig
,
ModelConfig
,
PassConfig
,
VllmConfig
)
from
vllm.distributed
import
tensor_model_parallel_all_reduce
from
vllm.distributed.parallel_state
import
(
init_distributed_environment
,
initialize_model_parallel
)
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.platforms
import
current_platform
from
vllm.utils
import
update_environment_variables
from
..utils
import
multi_gpu_test
from
.backend
import
TestBackend
class TestAllReduceRMSNormModel(torch.nn.Module):
    """Toy model: all-reduce followed by ``RMSNorm`` without a residual."""

    def __init__(self, hidden_size=16, eps=1e-6):
        super().__init__()
        self.hidden_size = hidden_size
        self.eps = eps
        self.norm = RMSNorm(hidden_size, eps)

    def forward(self, hidden_states, residual):
        """Return the norm of the all-reduced, reshaped ``hidden_states``.

        ``residual`` is accepted but not used by this variant.
        """
        view = hidden_states.reshape(-1, self.hidden_size)
        all_reduce = tensor_model_parallel_all_reduce(view)
        norm = self.norm(all_reduce)
        return norm

    def ops_in_model_before(self):
        """Ops passed to ``backend.check_before_ops`` by the test."""
        return [torch.ops.vllm.all_reduce.default]

    def ops_in_model_after(self):
        """Ops passed to ``backend.check_after_ops`` by the test."""
        return [torch.ops.vllm.flashinfer_trtllm_fused_allreduce_norm.default]
class TestAllReduceFusedAddRMSNormModel(torch.nn.Module):
    """Toy model: all-reduce followed by ``RMSNorm`` called with a residual."""

    def __init__(self, hidden_size=16, eps=1e-6):
        super().__init__()
        self.hidden_size = hidden_size
        self.eps = eps
        self.norm = RMSNorm(hidden_size, eps)

    def forward(self, hidden_states, residual):
        """Return the first element of ``self.norm(all_reduce, residual)``.

        The norm call is unpacked as a pair; the second element is dropped.
        """
        view = hidden_states.reshape(-1, self.hidden_size)
        all_reduce = tensor_model_parallel_all_reduce(view)
        norm, _ = self.norm(all_reduce, residual)
        return norm

    def ops_in_model_before(self):
        """Ops passed to ``backend.check_before_ops`` by the test."""
        return [torch.ops.vllm.all_reduce.default]

    def ops_in_model_after(self):
        """Ops passed to ``backend.check_after_ops`` by the test."""
        return [torch.ops.vllm.flashinfer_trtllm_fused_allreduce_norm.default]
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize(
    "test_model",
    [TestAllReduceRMSNormModel, TestAllReduceFusedAddRMSNormModel])
@pytest.mark.parametrize("batch_size", [8])
@pytest.mark.parametrize("seq_len", [8])
@pytest.mark.parametrize("hidden_size", [4096])
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"],
                    reason="Only test on CUDA")
@pytest.mark.skipif(not find_spec("flashinfer"),
                    reason="flashinfer is not installed")
@pytest.mark.skipif(not current_platform.is_device_capability(100),
                    reason="Only test on SM100")
def test_all_reduce_fusion_pass_replace(test_model: type[torch.nn.Module],
                                        batch_size: int, seq_len: int,
                                        hidden_size: int, dtype: torch.dtype):
    """Spawn two worker processes that run the all-reduce fusion check.

    ``test_model`` is the model class (not an instance); each worker
    instantiates it in ``all_reduce_fusion_pass_on_test_model``.
    """
    num_processes = 2

    # ``spawn`` passes the process index as the first positional argument,
    # so ``args`` starts at the worker's second parameter (world size).
    torch.multiprocessing.spawn(all_reduce_fusion_pass_on_test_model,
                                args=(num_processes, test_model, batch_size,
                                      seq_len, hidden_size, dtype),
                                nprocs=num_processes)
def all_reduce_fusion_pass_on_test_model(local_rank: int, world_size: int,
                                         test_model_cls: type[torch.nn.Module],
                                         batch_size: int, seq_len: int,
                                         hidden_size: int, dtype: torch.dtype):
    """Per-rank worker: compile a toy model with the fusion pass and check ops.

    Args:
        local_rank: Index of this worker; also selects ``cuda:{local_rank}``.
        world_size: Number of workers; used as the tensor-parallel size.
        test_model_cls: Model class, instantiated as
            ``test_model_cls(hidden_size)``.
        batch_size: Multiplied by ``seq_len`` to get the input row count.
        seq_len: See ``batch_size``.
        hidden_size: Feature size of the random inputs and of the model.
        dtype: Default dtype set for this process and for the model config.
    """
    current_platform.seed_everything(0)

    device = torch.device(f"cuda:{local_rank}")
    torch.cuda.set_device(device)
    torch.set_default_device(device)
    torch.set_default_dtype(dtype)

    update_environment_variables({
        'RANK': str(local_rank),
        'LOCAL_RANK': str(local_rank),
        'WORLD_SIZE': str(world_size),
        'MASTER_ADDR': 'localhost',
        'MASTER_PORT': '12345',
    })

    init_distributed_environment()
    initialize_model_parallel(tensor_model_parallel_size=world_size)

    vllm_config = VllmConfig(
        compilation_config=CompilationConfig(level=CompilationLevel.PIECEWISE,
                                             custom_ops=["+rms_norm"],
                                             compile_sizes=[2, 4, 8]))
    vllm_config.compilation_config.pass_config = PassConfig(
        enable_fi_allreduce_fusion=True)
    vllm_config.device_config = DeviceConfig(device=torch.device("cuda"))

    # this is a fake model name to construct the model config
    # in the vllm_config, it's not really used.
    model_name = "nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-e2e"
    vllm_config.model_config = ModelConfig(model=model_name,
                                           task="auto",
                                           tokenizer=model_name,
                                           tokenizer_mode="auto",
                                           trust_remote_code=True,
                                           dtype=dtype,
                                           seed=42)

    all_reduce_fusion_pass = AllReduceFusionPass(vllm_config)
    backend = TestBackend(all_reduce_fusion_pass)

    model = test_model_cls(hidden_size)

    hidden_states = torch.randn((batch_size * seq_len, hidden_size),
                                requires_grad=False)
    residual = torch.randn((batch_size * seq_len, hidden_size),
                           requires_grad=False)

    compiled_model = torch.compile(model, backend=backend)
    compiled_model(hidden_states, residual)

    backend.check_before_ops(model.ops_in_model_before(),
                             fully_replaced=False)
    backend.check_after_ops(model.ops_in_model_after())
    # NOTE(review): presumably dropped explicitly so the pass can release
    # its resources before the worker exits - confirm against the pass.
    del all_reduce_fusion_pass
tests/compile/test_fusion_attn.py
View file @
711aa9d5
...
...
@@ -50,6 +50,7 @@ def test_attention_fusion(example_prompts, monkeypatch, model: str,
# DYNAMO_ONCE does not properly propagate shapes.
level
=
CompilationLevel
.
DYNAMO_AS_IS
,
backend
=
"tests.compile.test_fusion_attn.backend_unfused"
,
custom_ops
=
[
"+quant_fp8"
],
)
vllm_config
=
VllmConfig
(
compilation_config
=
compile_config
)
backend_unfused
=
TestBackend
(
NoOpEliminationPass
(
vllm_config
))
...
...
@@ -73,6 +74,7 @@ def test_attention_fusion(example_prompts, monkeypatch, model: str,
# DYNAMO_ONCE does not properly propagate shapes.
level
=
CompilationLevel
.
DYNAMO_AS_IS
,
backend
=
"tests.compile.test_fusion_attn.backend"
,
custom_ops
=
[
"+quant_fp8"
],
)
vllm_config
=
VllmConfig
(
compilation_config
=
compile_config
)
...
...
tests/compile/test_silu_mul_quant_fusion.py
View file @
711aa9d5
...
...
@@ -4,33 +4,56 @@ import pytest
import
torch
import
vllm.envs
as
envs
from
vllm._custom_ops
import
scaled_fp8_quant
from
vllm.compilation.activation_quant_fusion
import
ActivationQuantFusionPass
from
vllm.compilation.fx_utils
import
find_auto_fn
,
find_auto_fn_maybe
from
vllm.compilation.noop_elimination
import
NoOpEliminationPass
from
vllm.config
import
CompilationConfig
,
PassConfig
,
VllmConfig
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.model_executor.layers.quantization.utils.quant_utils
import
(
GroupShape
)
from
vllm.model_executor.layers.quantization.utils.w8a8_utils
import
(
CUTLASS_FP8_SUPPORTED
,
Fp8LinearOp
)
from
vllm.platforms
import
current_platform
from
.backend
import
TestBackend
class
TestModel
(
torch
.
nn
.
Module
):
def
__init__
(
self
,
*
args
,
**
kwargs
):
def
__init__
(
self
,
hidden_size
:
int
,
cutlass_fp8_enabled
:
bool
,
*
args
,
**
kwargs
):
super
().
__init__
(
*
args
,
**
kwargs
)
self
.
silu_and_mul
=
SiluAndMul
()
self
.
wscale
=
torch
.
rand
(
1
,
dtype
=
torch
.
float32
)
self
.
scale
=
torch
.
rand
(
1
,
dtype
=
torch
.
float32
)
self
.
w
=
(
torch
.
rand
(
hidden_size
,
hidden_size
).
to
(
dtype
=
current_platform
.
fp8_dtype
()).
t
())
self
.
fp8_linear
=
Fp8LinearOp
(
cutlass_fp8_supported
=
cutlass_fp8_enabled
,
act_quant_static
=
True
,
act_quant_group_shape
=
GroupShape
.
PER_TENSOR
,
)
def
forward
(
self
,
x
):
y
=
self
.
silu_and_mul
(
x
)
x2
=
scaled_fp8_quant
(
y
,
self
.
scale
)
x2
=
self
.
fp8_linear
.
apply
(
y
,
self
.
w
,
self
.
wscale
,
input_scale
=
self
.
wscale
)
return
x2
@
pytest
.
mark
.
parametrize
(
"num_tokens"
,
[
256
])
@
pytest
.
mark
.
parametrize
(
"hidden_size"
,
[
64
])
@
pytest
.
mark
.
parametrize
(
"cutlass_fp8_enabled"
,
[
True
,
False
]
if
CUTLASS_FP8_SUPPORTED
else
[
False
])
@
pytest
.
mark
.
skipif
(
envs
.
VLLM_TARGET_DEVICE
not
in
[
"cuda"
,
"rocm"
],
reason
=
"Only test on CUDA and ROCm"
)
def
test_fusion_silu_and_mul_quant
(
num_tokens
,
hidden_size
):
def
test_fusion_silu_and_mul_quant
(
num_tokens
,
hidden_size
,
cutlass_fp8_enabled
):
torch
.
set_default_device
(
"cuda"
)
torch
.
set_default_dtype
(
torch
.
float16
)
...
...
@@ -40,11 +63,11 @@ def test_fusion_silu_and_mul_quant(num_tokens, hidden_size):
pass_config
=
PassConfig
(
enable_fusion
=
True
,
enable_noop
=
True
))
fusion_pass
=
ActivationQuantFusionPass
(
config
)
backend
=
TestBackend
(
fusion_pass
)
model
=
TestModel
()
backend
=
TestBackend
(
NoOpEliminationPass
(
config
),
fusion_pass
)
model
=
TestModel
(
hidden_size
,
cutlass_fp8_enabled
)
# First dimension dynamic
x
=
torch
.
rand
(
num_tokens
,
hidden_size
)
x
=
torch
.
rand
(
num_tokens
,
hidden_size
*
2
)
torch
.
_dynamo
.
mark_dynamic
(
x
,
0
)
result
=
model
(
x
)
...
...
tests/compile/untest_fusion.py
View file @
711aa9d5
...
...
@@ -44,7 +44,9 @@ class TestModel(torch.nn.Module):
]
self
.
fp8_linear
=
Fp8LinearOp
(
cutlass_fp8_supported
=
cutlass_fp8_enabled
,
use_per_token_if_dynamic
=
True
)
act_quant_static
=
static
,
act_quant_group_shape
=
group_shape
,
)
def
forward
(
self
,
x
):
resid
=
torch
.
sqrt
(
x
)
...
...
@@ -91,9 +93,10 @@ def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static,
maybe_create_device_identity
()
# needed for certain non-cutlass fp8 paths
vllm_config
=
VllmConfig
(
compilation_config
=
CompilationConfig
(
level
=
CompilationLevel
.
PIECEWISE
,
custom_ops
=
[
"+rms_norm"
]))
vllm_config
.
compilation_config
.
pass_config
=
\
PassConfig
(
enable_fusion
=
True
,
enable_noop
=
True
)
level
=
CompilationLevel
.
PIECEWISE
,
custom_ops
=
[
"+rms_norm"
,
"+quant_fp8"
],
pass_config
=
PassConfig
(
enable_fusion
=
True
,
enable_noop
=
True
),
))
with
vllm
.
config
.
set_current_vllm_config
(
vllm_config
):
# Reshape pass is needed for the fusion pass to work
noop_pass
=
NoOpEliminationPass
(
vllm_config
)
...
...
tests/conftest.py
View file @
711aa9d5
...
...
@@ -765,7 +765,8 @@ class VllmRunner:
- `trust_remote_code`: Set to `True` instead of `False` for convenience.
- `seed`: Set to `0` instead of `None` for test reproducibility.
- `max_model_len`: Set to `1024` instead of `None` to reduce memory usage.
- `block_size`: Set to `16` instead of `None` to reduce memory usage.
- `block_size`: To reduce memory usage, set default to `64` if on XPU
devices, otherwise default to `16`.
- `enable_chunked_prefill`: Set to `False` instead of `None` for
test reproducibility.
- `enforce_eager`: Set to `False` to test CUDA graph.
...
...
@@ -783,13 +784,13 @@ class VllmRunner:
dtype
:
str
=
"auto"
,
disable_log_stats
:
bool
=
True
,
tensor_parallel_size
:
int
=
1
,
block_size
:
int
=
16
,
block_size
:
int
=
16
if
not
torch
.
xpu
.
is_available
()
else
64
,
enable_chunked_prefill
:
Optional
[
bool
]
=
False
,
swap_space
:
int
=
4
,
enforce_eager
:
Optional
[
bool
]
=
False
,
**
kwargs
,
)
->
None
:
self
.
model
=
LLM
(
self
.
llm
=
LLM
(
model
=
model_name
,
task
=
task
,
tokenizer
=
tokenizer_name
,
...
...
@@ -809,7 +810,7 @@ class VllmRunner:
def
get_inputs
(
self
,
prompts
:
Union
[
list
[
str
],
list
[
torch
.
Tensor
]],
prompts
:
Union
[
list
[
str
],
list
[
torch
.
Tensor
]
,
list
[
int
]
],
images
:
Optional
[
PromptImageInput
]
=
None
,
videos
:
Optional
[
PromptVideoInput
]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
...
...
@@ -831,11 +832,16 @@ class VllmRunner:
if
audios
is
not
None
and
(
audio
:
=
audios
[
i
])
is
not
None
:
multi_modal_data
[
"audio"
]
=
audio
text_prompt_kwargs
=
{
(
"prompt"
if
isinstance
(
prompt
,
str
)
else
"prompt_embeds"
):
prompt
,
text_prompt_kwargs
:
dict
[
str
,
Any
]
=
{
"multi_modal_data"
:
multi_modal_data
or
None
}
if
isinstance
(
prompt
,
str
):
text_prompt_kwargs
[
"prompt"
]
=
prompt
elif
isinstance
(
prompt
,
list
):
text_prompt_kwargs
[
"prompt_token_ids"
]
=
prompt
else
:
text_prompt_kwargs
[
"prompt_embeds"
]
=
prompt
inputs
.
append
(
TextPrompt
(
**
text_prompt_kwargs
))
return
inputs
...
...
@@ -854,9 +860,9 @@ class VllmRunner:
videos
=
videos
,
audios
=
audios
)
req_outputs
=
self
.
model
.
generate
(
inputs
,
sampling_params
=
sampling_params
,
**
kwargs
)
req_outputs
=
self
.
llm
.
generate
(
inputs
,
sampling_params
=
sampling_params
,
**
kwargs
)
outputs
:
list
[
tuple
[
list
[
list
[
int
]],
list
[
str
]]]
=
[]
for
req_output
in
req_outputs
:
...
...
@@ -902,9 +908,9 @@ class VllmRunner:
videos
=
videos
,
audios
=
audios
)
req_outputs
=
self
.
model
.
generate
(
inputs
,
sampling_params
=
sampling_params
,
**
kwargs
)
req_outputs
=
self
.
llm
.
generate
(
inputs
,
sampling_params
=
sampling_params
,
**
kwargs
)
toks_str_logsprobs_prompt_logprobs
=
(
self
.
_final_steps_generate_w_logprobs
(
req_outputs
))
...
...
@@ -924,8 +930,8 @@ class VllmRunner:
'''
assert
sampling_params
.
logprobs
is
not
None
req_outputs
=
self
.
model
.
generate
(
encoder_decoder_prompts
,
sampling_params
=
sampling_params
)
req_outputs
=
self
.
llm
.
generate
(
encoder_decoder_prompts
,
sampling_params
=
sampling_params
)
toks_str_logsprobs_prompt_logprobs
=
(
self
.
_final_steps_generate_w_logprobs
(
req_outputs
))
# Omit prompt logprobs if not required by sampling params
...
...
@@ -1018,7 +1024,7 @@ class VllmRunner:
videos
=
videos
,
audios
=
audios
)
outputs
=
self
.
model
.
beam_search
(
outputs
=
self
.
llm
.
beam_search
(
inputs
,
BeamSearchParams
(
beam_width
=
beam_width
,
max_tokens
=
max_tokens
))
returned_outputs
=
[]
...
...
@@ -1029,7 +1035,7 @@ class VllmRunner:
return
returned_outputs
def
classify
(
self
,
prompts
:
list
[
str
])
->
list
[
list
[
float
]]:
req_outputs
=
self
.
model
.
classify
(
prompts
)
req_outputs
=
self
.
llm
.
classify
(
prompts
)
return
[
req_output
.
outputs
.
probs
for
req_output
in
req_outputs
]
def
embed
(
self
,
...
...
@@ -1044,11 +1050,11 @@ class VllmRunner:
videos
=
videos
,
audios
=
audios
)
req_outputs
=
self
.
model
.
embed
(
inputs
,
*
args
,
**
kwargs
)
req_outputs
=
self
.
llm
.
embed
(
inputs
,
*
args
,
**
kwargs
)
return
[
req_output
.
outputs
.
embedding
for
req_output
in
req_outputs
]
def
encode
(
self
,
prompts
:
list
[
str
])
->
list
[
list
[
float
]]:
req_outputs
=
self
.
model
.
encode
(
prompts
)
req_outputs
=
self
.
llm
.
encode
(
prompts
)
return
[
req_output
.
outputs
.
data
for
req_output
in
req_outputs
]
def
score
(
...
...
@@ -1058,18 +1064,18 @@ class VllmRunner:
*
args
,
**
kwargs
,
)
->
list
[
float
]:
req_outputs
=
self
.
model
.
score
(
text_1
,
text_2
,
*
args
,
**
kwargs
)
req_outputs
=
self
.
llm
.
score
(
text_1
,
text_2
,
*
args
,
**
kwargs
)
return
[
req_output
.
outputs
.
score
for
req_output
in
req_outputs
]
def
apply_model
(
self
,
func
:
Callable
[[
nn
.
Module
],
_R
])
->
list
[
_R
]:
executor
=
self
.
model
.
llm_engine
.
model_executor
executor
=
self
.
llm
.
llm_engine
.
model_executor
return
executor
.
apply_model
(
func
)
def
__enter__
(
self
):
return
self
def
__exit__
(
self
,
exc_type
,
exc_value
,
traceback
):
del
self
.
model
del
self
.
llm
cleanup_dist_env_and_memory
()
...
...
tests/core/test_num_computed_tokens_update.py
View file @
711aa9d5
...
...
@@ -40,9 +40,8 @@ def test_num_computed_tokens_update(num_scheduler_steps: int,
gpu_memory_utilization
=
0.7
,
num_scheduler_steps
=
num_scheduler_steps
,
enable_chunked_prefill
=
enable_chunked_prefill
,
enforce_eager
=
enforce_eager
,
block_size
=
64
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_BACKEND
else
16
)
engine
:
LLMEngine
=
runner
.
model
.
llm_engine
enforce_eager
=
enforce_eager
)
engine
:
LLMEngine
=
runner
.
llm
.
llm_engine
# In multi-step + chunked-prefill there is no separate single prompt step.
# What is scheduled will run for num_scheduler_steps always.
...
...
tests/core/test_serialization.py
View file @
711aa9d5
...
...
@@ -6,7 +6,7 @@ import msgspec
from
vllm.executor.msgspec_utils
import
decode_hook
,
encode_hook
from
vllm.sequence
import
ExecuteModelRequest
from
..spec_decode
.utils
import
create_batch
from
.utils
import
create_batch
def
test_msgspec_serialization
():
...
...
tests/core/utils.py
View file @
711aa9d5
...
...
@@ -4,15 +4,16 @@
import
time
from
collections
import
defaultdict
from
collections.abc
import
Sequence
as
GenericSequence
from
typing
import
Any
,
Optional
from
itertools
import
count
from
typing
import
Any
,
Optional
,
Union
import
torch
from
vllm
import
SamplingParams
from
vllm.core.scheduler
import
Scheduler
,
SchedulerOutputs
from
vllm.inputs
import
EncoderDecoderInputs
,
embeds_inputs
,
token_inputs
from
vllm.lora.request
import
LoRARequest
from
vllm.sequence
import
(
Logprob
,
Sequence
,
SequenceGroup
,
from
vllm.sampling_params
import
SamplingParams
from
vllm.sequence
import
(
Logprob
,
Sequence
,
SequenceData
,
SequenceGroup
,
SequenceGroupMetadata
)
...
...
@@ -262,3 +263,130 @@ class SchedulerProxy:
self
,
)
->
tuple
[
list
[
SequenceGroupMetadata
],
SchedulerOutputs
,
Any
]:
_
,
_
,
ret
=
self
.
call_history
[
"schedule"
][
-
1
]
return
ret
def create_seq_group_metadata_from_prompts(
    prompts: list[list[int]],
    num_gpu_blocks: int,
    block_size: int,
    final_prompt_lens: list[int],
    continuations: Optional[list[list[int]]] = None,
    seq_ids: Optional[list[int]] = None,
) -> list[SequenceGroupMetadata]:
    """Build one ``SequenceGroupMetadata`` per (prompt, continuation) pair.

    Args:
        prompts: Prompt token ids, one list per sequence.
        num_gpu_blocks: Size of the block-id pool (``range(num_gpu_blocks)``).
        block_size: Passed to ``round_up_to_next_block`` together with each
            final length to decide how many block ids a sequence receives.
        final_prompt_lens: Final length per sequence, indexed like
            ``prompts``; used only for sizing the block allocation.
        continuations: Already-generated token ids per sequence. Defaults to
            an empty continuation for every prompt.
        seq_ids: Accepted and defaulted, but not otherwise consulted here.

    Returns:
        Metadata entries in prompt order. The request id and the
        ``seq_data`` / ``block_tables`` key of each entry are the prompt's
        position in ``prompts``.
    """
    if continuations is None:
        continuations = [[] for _ in prompts]

    # NOTE(review): seq_ids is only defaulted below and never read again;
    # ids are derived from the enumeration index instead. Confirm whether
    # callers expect their seq_ids to be honoured.
    if seq_ids is None:
        seq_ids = list(range(len(prompts)))

    # Block ids are taken from the tail of the pool, sequence by sequence.
    block_pool = list(range(num_gpu_blocks))
    blocks_by_index: dict[int, list[int]] = {}
    for index, final_len in enumerate(final_prompt_lens):
        needed = round_up_to_next_block(final_len, block_size)
        blocks_by_index[index] = [block_pool.pop() for _ in range(needed)]

    metadata_list = []
    for index, pair in enumerate(zip(prompts, continuations)):
        prompt_ids, cont_ids = pair
        sequence = SequenceData.from_seqs(prompt_ids, cont_ids)
        # Everything except the last token is marked as already computed.
        sequence.update_num_computed_tokens(
            len(prompt_ids) + len(cont_ids) - 1)

        metadata = SequenceGroupMetadata(
            request_id=str(index),
            is_prompt=len(cont_ids) == 0,
            seq_data={index: sequence},
            sampling_params=SamplingParams(temperature=0.0),
            # Copy so each entry owns its own block list.
            block_tables={index: list(blocks_by_index[index])},
        )
        metadata_list.append(metadata)

    return metadata_list
def create_chunked_seq_group_metadata_from_prompt(
        prompt: list[int],
        num_gpu_blocks: int,
        chunk_size: int,
        block_size: int,
        seq_id: Optional[int] = None) -> list[SequenceGroupMetadata]:
    """Split one prompt into prefill chunks, one metadata entry per chunk.

    Args:
        prompt: Prompt token ids.
        num_gpu_blocks: Size of the block-id pool (``range(num_gpu_blocks)``).
        chunk_size: Maximum number of prompt tokens per chunk.
        block_size: Passed to ``round_up_to_next_block`` with the prompt
            length to decide how many block ids are allocated.
        seq_id: Used as the request id of every chunk; defaults to ``0``.

    Returns:
        Metadata entries in chunk order. Only the chunk that reaches the end
        of the prompt has ``do_sample`` set.
    """
    request_id = str(0 if seq_id is None else seq_id)

    block_pool = list(range(num_gpu_blocks))
    num_blocks = round_up_to_next_block(len(prompt), block_size)
    # A single list, shared (not copied) by every chunk's block table.
    shared_blocks = [block_pool.pop() for _ in range(num_blocks)]

    prompt_len = len(prompt)
    chunks = []
    for chunk_index, start in enumerate(range(0, prompt_len, chunk_size)):
        stop = start + chunk_size
        sequence = SequenceData.from_seqs(prompt)
        # Tokens before this chunk's start count as already computed.
        sequence.update_num_computed_tokens(start)

        chunks.append(
            SequenceGroupMetadata(
                request_id=request_id,
                is_prompt=True,
                do_sample=stop >= prompt_len,  # terminal chunk
                seq_data={chunk_index: sequence},
                sampling_params=SamplingParams(temperature=0.0),
                block_tables={chunk_index: shared_blocks},
                token_chunk_size=len(prompt[start:stop])))

    return chunks
def create_batch(batch_size,
                 k,
                 prompt_len: Union[int, list[int]] = 10,
                 prev_output_token_len: int = 10,
                 seq_ids: Optional[list[int]] = None,
                 num_gpu_blocks: Optional[int] = None,
                 block_size: Optional[int] = None,
                 prefill_chunk_size: Optional[int] = None):
    """Create a batch of sequence-group metadata with synthetic token ids.

    Token ids are drawn from a single increasing counter: first all prompt
    tokens, then (in the non-chunked case) all previous-output tokens.

    Args:
        batch_size: Number of sequences when ``prompt_len`` is an int; also
            the cap on returned entries in the chunked case.
        k: Extra length added, with one more token, to each sequence's final
            length when sizing block allocations (non-chunked case only).
        prompt_len: One length for every prompt, or a list of per-prompt
            lengths.
        prev_output_token_len: Number of previous-output tokens per sequence
            (non-chunked case only).
        seq_ids: Optional ids forwarded to the metadata helpers.
        num_gpu_blocks: Block pool size; defaults to ``2048 // block_size``.
        block_size: Defaults to ``8``.
        prefill_chunk_size: When truthy, prompts are emitted as prefill
            chunks of this size and no previous-output tokens are created.

    Returns:
        ``(seq_group_metadata_list, prompts, prev_output_tokens)``.
    """
    block_size = 8 if block_size is None else block_size
    if num_gpu_blocks is None:
        num_gpu_blocks = 2048 // block_size

    token_ids = count()

    if isinstance(prompt_len, int):
        prompt_lens = [prompt_len] * batch_size
    else:
        prompt_lens = prompt_len

    prompts = []
    for length in prompt_lens:
        prompts.append([next(token_ids) for _ in range(length)])

    if prefill_chunk_size:
        # Create a batch of chunked prompts.
        if not seq_ids:
            seq_ids = list(range(len(prompts)))
        chunked = []
        for prompt_ids, sid in zip(prompts, seq_ids):
            chunked.extend(
                create_chunked_seq_group_metadata_from_prompt(
                    prompt_ids, num_gpu_blocks, prefill_chunk_size,
                    block_size, sid))
        return chunked[:batch_size], prompts, []

    prev_output_tokens = []
    for _ in range(batch_size):
        prev_output_tokens.append(
            [next(token_ids) for _ in range(prev_output_token_len)])

    # Room for the prompt, prior outputs, k more tokens and one extra.
    final_prompt_lens = []
    for prompt_ids, prev_ids in zip(prompts, prev_output_tokens):
        final_prompt_lens.append(len(prompt_ids) + len(prev_ids) + k + 1)

    seq_group_metadata_list = create_seq_group_metadata_from_prompts(
        prompts, num_gpu_blocks, block_size, final_prompt_lens,
        prev_output_tokens, seq_ids)
    return seq_group_metadata_list, prompts, prev_output_tokens
tests/detokenizer/test_stop_reason.py
View file @
711aa9d5
...
...
@@ -30,7 +30,7 @@ def vllm_model(vllm_runner):
def
test_stop_reason
(
vllm_model
,
example_prompts
):
tokenizer
=
transformers
.
AutoTokenizer
.
from_pretrained
(
MODEL
)
stop_token_id
=
tokenizer
.
convert_tokens_to_ids
(
STOP_STR
)
llm
=
vllm_model
.
model
llm
=
vllm_model
.
llm
# test stop token
outputs
=
llm
.
generate
(
example_prompts
,
...
...
tests/detokenizer/test_stop_strings.py
View file @
711aa9d5
...
...
@@ -103,42 +103,41 @@ def _stop_token_id(llm):
def
test_stop_strings
():
# If V0, must set enforce_eager=False since we use
# async output processing below.
v
llm
_model
=
LLM
(
MODEL
,
enforce_eager
=
envs
.
VLLM_USE_V1
)
llm
=
LLM
(
MODEL
,
enforce_eager
=
envs
.
VLLM_USE_V1
)
if
envs
.
VLLM_USE_V1
:
_stop_basic
(
v
llm
_model
)
_stop_basic
(
llm
)
else
:
_set_async_mode
(
v
llm
_model
,
True
)
_stop_basic
(
v
llm
_model
)
_set_async_mode
(
llm
,
True
)
_stop_basic
(
llm
)
_set_async_mode
(
v
llm
_model
,
False
)
_stop_basic
(
v
llm
_model
)
_set_async_mode
(
llm
,
False
)
_stop_basic
(
llm
)
if
envs
.
VLLM_USE_V1
:
_stop_multi_tokens
(
v
llm
_model
)
_stop_multi_tokens
(
llm
)
else
:
_set_async_mode
(
v
llm
_model
,
True
)
_stop_multi_tokens
(
v
llm
_model
)
_set_async_mode
(
llm
,
True
)
_stop_multi_tokens
(
llm
)
_set_async_mode
(
v
llm
_model
,
False
)
_stop_multi_tokens
(
v
llm
_model
)
_set_async_mode
(
llm
,
False
)
_stop_multi_tokens
(
llm
)
if
envs
.
VLLM_USE_V1
:
_stop_partial_token
(
v
llm
_model
)
_stop_partial_token
(
llm
)
else
:
_set_async_mode
(
v
llm
_model
,
True
)
_stop_partial_token
(
v
llm
_model
)
_set_async_mode
(
llm
,
True
)
_stop_partial_token
(
llm
)
_set_async_mode
(
v
llm
_model
,
False
)
_stop_partial_token
(
v
llm
_model
)
_set_async_mode
(
llm
,
False
)
_stop_partial_token
(
llm
)
if
envs
.
VLLM_USE_V1
:
# FIXME: this does not respect include_in_output=False
# _stop_token_id(
v
llm
_model
)
# _stop_token_id(llm)
pass
else
:
_set_async_mode
(
vllm_model
,
True
)
_stop_token_id
(
vllm_model
)
_set_async_mode
(
vllm_model
,
False
)
_stop_token_id
(
vllm_model
)
\ No newline at end of file
_set_async_mode
(
llm
,
True
)
_stop_token_id
(
llm
)
_set_async_mode
(
llm
,
False
)
_stop_token_id
(
llm
)
tests/distributed/test_pipeline_parallel.py
View file @
711aa9d5
...
...
@@ -14,8 +14,9 @@ from typing import Literal, NamedTuple, Optional
import
pytest
from
vllm.config
import
TaskOption
from
vllm.config
import
_FLOAT16_NOT_SUPPORTED_MODELS
,
TaskOption
from
vllm.logger
import
init_logger
from
vllm.transformers_utils.config
import
get_config
from
..models.registry
import
HF_EXAMPLE_MODELS
from
..utils
import
compare_two_settings
,
create_new_process_for_each_test
,
models_path_prefix
...
...
@@ -158,7 +159,7 @@ TEXT_GENERATION_MODELS = {
os
.
path
.
join
(
models_path_prefix
,
"databricks/dbrx-instruct"
):
PPTestSettings
.
fast
(
load_format
=
"dummy"
),
os
.
path
.
join
(
models_path_prefix
,
"Deci/DeciLM-7B-instruct"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"deepseek-ai/deepseek-llm-7b-chat"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"deepseek-ai/DeepSeek-V2-Lite-Chat"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"deepseek-ai/DeepSeek-V2-Lite-Chat"
):
PPTestSettings
.
fast
(
tp_base
=
2
),
os
.
path
.
join
(
models_path_prefix
,
"LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"tiiuae/falcon-7b"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"google/gemma-1.1-2b-it"
):
PPTestSettings
.
fast
(),
...
...
@@ -171,14 +172,14 @@ TEXT_GENERATION_MODELS = {
os
.
path
.
join
(
models_path_prefix
,
"ibm/PowerMoE-3b"
):
PPTestSettings
.
fast
(),
# Uses Llama
# "internlm/internlm-chat-7b": PPTestSettings.fast(),
os
.
path
.
join
(
models_path_prefix
,
"internlm/internlm2-chat-7b"
)
:
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"inceptionai/jais-13b-chat"
)
:
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"ai21labs/Jamba-tiny-dev"
)
:
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B-Instruct"
)
:
PPTestSettings
.
detailed
(),
# Tests Transformers
Model
os
.
path
.
join
(
models_path_prefix
,
"ArthurZ
/Ilama-3.2-1B"
)
:
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"openbmb/MiniCPM-2B-sft-bf16"
)
:
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"openbmb/MiniCPM3-4B"
)
:
PPTestSettings
.
fast
(),
"internlm/internlm2-chat-7b"
:
PPTestSettings
.
fast
(),
"inceptionai/jais-13b-chat"
:
PPTestSettings
.
fast
(),
"ai21labs/Jamba-tiny-dev"
:
PPTestSettings
.
fast
(),
"meta-llama/Llama-3.2-1B-Instruct"
:
PPTestSettings
.
detailed
(),
# Tests Transformers
ForCausalLM
"hmellor
/Ilama-3.2-1B"
:
PPTestSettings
.
fast
(),
"openbmb/MiniCPM-2B-sft-bf16"
:
PPTestSettings
.
fast
(),
"openbmb/MiniCPM3-4B"
:
PPTestSettings
.
fast
(),
# Uses Llama
# "mistralai/Mistral-7B-Instruct-v0.1": PPTestSettings.fast(),
os
.
path
.
join
(
models_path_prefix
,
"state-spaces/mamba-130m-hf"
):
PPTestSettings
.
fast
(),
...
...
@@ -210,9 +211,11 @@ TEXT_GENERATION_MODELS = {
EMBEDDING_MODELS
=
{
# type: ignore[var-annotated]
# [Text-only]
os
.
path
.
join
(
models_path_prefix
,
"intfloat/e5-mistral-7b-instruct"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"BAAI/bge-multilingual-gemma2"
):
PPTestSettings
.
fast
(),
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2.5-Math-RM-72B"
):
PPTestSettings
.
fast
(
load_format
=
"dummy"
),
os
.
path
.
join
(
models_path_prefix
,
"intfloat/e5-mistral-7b-instruct"
):
PPTestSettings
.
fast
(
task
=
"embed"
),
os
.
path
.
join
(
models_path_prefix
,
"BAAI/bge-multilingual-gemma2"
):
PPTestSettings
.
fast
(
task
=
"embed"
),
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2.5-Math-RM-72B"
):
PPTestSettings
.
fast
(
load_format
=
"dummy"
,
task
=
"embed"
),
}
MULTIMODAL_MODELS
=
{
...
...
@@ -246,8 +249,9 @@ TEST_MODELS = [
# [LANGUAGE GENERATION]
os
.
path
.
join
(
models_path_prefix
,
"microsoft/Phi-3.5-MoE-instruct"
),
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B-Instruct"
),
os
.
path
.
join
(
models_path_prefix
,
"
ArthurZ
/Ilama-3.2-1B"
),
os
.
path
.
join
(
models_path_prefix
,
"
hmellor
/Ilama-3.2-1B"
),
os
.
path
.
join
(
models_path_prefix
,
"ibm/PowerLM-3b"
),
os
.
path
.
join
(
models_path_prefix
,
"deepseek-ai/DeepSeek-V2-Lite-Chat"
),
# [LANGUAGE EMBEDDING]
os
.
path
.
join
(
models_path_prefix
,
"intfloat/e5-mistral-7b-instruct"
),
os
.
path
.
join
(
models_path_prefix
,
"BAAI/bge-multilingual-gemma2"
),
...
...
@@ -287,6 +291,11 @@ def _compare_tp(
trust_remote_code
=
model_info
.
trust_remote_code
tokenizer_mode
=
model_info
.
tokenizer_mode
hf_overrides
=
model_info
.
hf_overrides
hf_config
=
get_config
(
model_id
,
trust_remote_code
)
dtype
=
"float16"
if
hf_config
.
model_type
in
_FLOAT16_NOT_SUPPORTED_MODELS
:
dtype
=
"bfloat16"
if
load_format
==
"dummy"
:
# Avoid OOM
...
...
@@ -316,7 +325,7 @@ def _compare_tp(
common_args
=
[
# use half precision for speed and memory savings in CI environment
"--dtype"
,
"float16"
,
dtype
,
"--max-model-len"
,
"2048"
,
"--max-num-seqs"
,
...
...
@@ -338,6 +347,7 @@ def _compare_tp(
common_args
.
extend
([
"--hf-overrides"
,
json
.
dumps
(
hf_overrides
)])
specific_case
=
tp_size
==
2
and
pp_size
==
2
and
chunked_prefill
testing_ray_compiled_graph
=
False
if
distributed_backend
==
"ray"
and
(
vllm_major_version
==
"1"
or
specific_case
):
# For V1, test Ray Compiled Graph for all the tests
...
...
@@ -351,6 +361,7 @@ def _compare_tp(
# Temporary. Currently when zeromq + SPMD is used, it does not properly
# terminate because of a Ray Compiled Graph issue.
common_args
.
append
(
"--disable-frontend-multiprocessing"
)
testing_ray_compiled_graph
=
True
elif
distributed_backend
==
"mp"
:
# Both V0/V1 of multiprocessing executor support PP
pp_env
=
{
...
...
@@ -394,7 +405,6 @@ def _compare_tp(
tp_env
,
method
=
method
)
except
Exception
:
testing_ray_compiled_graph
=
pp_env
is
not
None
if
testing_ray_compiled_graph
and
vllm_major_version
==
"0"
:
# Ray Compiled Graph tests are flaky for V0,
# so we don't want to fail the test
...
...
Prev
1
…
9
10
11
12
13
14
15
16
17
…
26
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment