Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
99324e25
Commit
99324e25
authored
Jul 12, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.9.2' into v0.9.2-ori
parents
cc7f22a8
a5dd03c1
Changes
475
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1955 additions
and
170 deletions
+1955
-170
tests/compile/piecewise/test_full_cudagraph.py
tests/compile/piecewise/test_full_cudagraph.py
+108
-51
tests/compile/piecewise/test_simple.py
tests/compile/piecewise/test_simple.py
+5
-11
tests/compile/piecewise/test_toy_llama.py
tests/compile/piecewise/test_toy_llama.py
+21
-24
tests/compile/test_async_tp.py
tests/compile/test_async_tp.py
+3
-4
tests/compile/test_basic_correctness.py
tests/compile/test_basic_correctness.py
+18
-14
tests/compile/test_config.py
tests/compile/test_config.py
+39
-23
tests/compile/test_fusion.py
tests/compile/test_fusion.py
+5
-7
tests/compile/test_fusion_attn.py
tests/compile/test_fusion_attn.py
+132
-0
tests/compile/test_sequence_parallelism.py
tests/compile/test_sequence_parallelism.py
+144
-17
tests/config/test_config_generation.py
tests/config/test_config_generation.py
+38
-0
tests/config/test_mp_reducer.py
tests/config/test_mp_reducer.py
+57
-0
tests/conftest.py
tests/conftest.py
+27
-12
tests/core/block/e2e/test_correctness.py
tests/core/block/e2e/test_correctness.py
+2
-2
tests/core/block/e2e/test_correctness_sliding_window.py
tests/core/block/e2e/test_correctness_sliding_window.py
+3
-3
tests/core/test_scheduler.py
tests/core/test_scheduler.py
+296
-2
tests/cuda/test_cuda_context.py
tests/cuda/test_cuda_context.py
+80
-0
tests/distributed/test_eplb_algo.py
tests/distributed/test_eplb_algo.py
+292
-0
tests/distributed/test_eplb_execute.py
tests/distributed/test_eplb_execute.py
+504
-0
tests/distributed/test_node_count.py
tests/distributed/test_node_count.py
+43
-0
tests/distributed/test_quick_all_reduce.py
tests/distributed/test_quick_all_reduce.py
+138
-0
No files found.
Too many changes to show.
To preserve performance only
475 of 475+
files are displayed.
Plain diff
Email patch
tests/compile/piecewise/test_full_cudagraph.py
View file @
99324e25
...
...
@@ -2,15 +2,16 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
contextlib
import
os
import
weakref
from
contextlib
import
ExitStack
import
pytest
from
tests.utils
import
wait_for_gpu_memory_to_clear
from
vllm
import
LLM
,
SamplingParams
from
vllm.config
import
CompilationConfig
from
vllm.platforms
import
current_platform
MODEL
=
"Qwen/Qwen2-1.5B-Instruct"
@
contextlib
.
contextmanager
def
temporary_environ
(
env_vars
):
...
...
@@ -31,71 +32,127 @@ def temporary_environ(env_vars):
os
.
environ
[
k
]
=
v
@
pytest
.
fixture
(
scope
=
"module"
)
def
full_cudagraph_llm
():
with
temporary_environ
({
"VLLM_USE_V1"
:
"1"
,
"VLLM_FLASH_ATTN_VERSION"
:
"3"
}):
return
LLM
(
model
=
MODEL
,
gpu_memory_utilization
=
0.3
,
compilation_config
=
CompilationConfig
(
full_cuda_graph
=
True
))
@
pytest
.
fixture
(
scope
=
"class"
)
def
llm_pair
(
request
):
model
=
request
.
param
@
pytest
.
fixture
(
scope
=
"module"
)
def
piecewise_llm
():
with
temporary_environ
({
"VLLM_USE_V1"
:
"1"
,
"VLLM_FLASH_ATTN_VERSION"
:
"3"
}):
return
LLM
(
model
=
MODEL
,
gpu_memory_utilization
=
0.6
,
compilation_config
=
CompilationConfig
())
def
generate_text
(
llm
:
LLM
,
batch_size
:
int
,
max_tokens
:
int
):
prompts
=
[
"Hi my name is"
]
*
batch_size
sampling_params
=
SamplingParams
(
temperature
=
0.0
,
max_tokens
=
max_tokens
,
top_p
=
0.95
)
return
llm
.
generate
(
prompts
,
sampling_params
)
full
=
LLM
(
model
=
model
,
gpu_memory_utilization
=
0.45
,
trust_remote_code
=
True
,
max_model_len
=
1024
,
compilation_config
=
CompilationConfig
(
full_cuda_graph
=
True
),
)
piecewise
=
LLM
(
model
=
model
,
gpu_memory_utilization
=
0.45
,
trust_remote_code
=
True
,
max_model_len
=
1024
,
compilation_config
=
CompilationConfig
(),
)
# PyTest caches the fixture values so we use weakref.proxy to enable GC
yield
weakref
.
proxy
(
full
),
weakref
.
proxy
(
piecewise
)
del
full
del
piecewise
wait_for_gpu_memory_to_clear
(
devices
=
[
0
],
threshold_ratio
=
0.1
,
)
@
pytest
.
mark
.
parametrize
(
"llm_pair"
,
[
# Model names for the llm_pair fixture
"deepseek-ai/DeepSeek-V2-Lite"
,
"Qwen/Qwen2-1.5B-Instruct"
],
indirect
=
True
)
@
pytest
.
mark
.
skipif
(
current_platform
.
get_device_capability
()
!=
(
9
,
0
),
reason
=
"Only Hopper GPUs support FlashAttention 3"
)
@
pytest
.
mark
.
parametrize
((
"batch_size"
,
"max_tokens"
),
[(
1
,
10
),
(
7
,
10
),
(
16
,
10
),
(
25
,
10
),
(
32
,
10
),
(
45
,
10
),
(
64
,
10
),
(
8
,
5
),
(
8
,
20
),
(
8
,
200
)])
def
test_full_cudagraph
(
batch_size
,
max_tokens
,
full_cudagraph_llm
,
piecewise_llm
):
reason
=
"Only Hopper GPUs support FA3 and FlashMLA"
)
class
TestFullCUDAGraph
:
"""
Load full cudagraph model and piecewise model once, and at the same time to
reuse them across various test cases.
Use a class such that an llm pair is constructed once for all
batch_size/max_tokens combinations and released immediately after.
Test various batch sizes and max_tokens to ensure that the full cudagraph
compilation works for padded cases too
.
Module-scope fixtures would stick around the whole time,
meaning there would be multiple LLM instances hogging memory simultaneously
.
"""
piecewise_responses
=
generate_text
(
piecewise_llm
,
batch_size
=
batch_size
,
max_tokens
=
max_tokens
)
full_cudagraph_responses
=
generate_text
(
full_cudagraph_llm
,
batch_size
=
batch_size
,
max_tokens
=
max_tokens
)
# Check that all responses are the same
for
i
in
range
(
len
(
piecewise_responses
)):
assert
piecewise_responses
[
i
].
outputs
[
0
].
text
==
full_cudagraph_responses
[
i
].
outputs
[
0
].
text
@
pytest
.
mark
.
parametrize
((
"batch_size"
,
"max_tokens"
),
[
(
1
,
10
),
(
7
,
10
),
(
16
,
10
),
(
25
,
10
),
(
32
,
10
),
(
45
,
10
),
(
64
,
10
),
(
123
,
10
),
(
8
,
5
),
(
8
,
30
),
])
def
test_full_cudagraph
(
self
,
batch_size
,
max_tokens
,
llm_pair
:
tuple
[
LLM
,
LLM
]):
"""
Test various batch sizes and max_tokens to ensure that the
full cudagraph compilation works for padded cases too.
"""
piecewise_llm
,
full_cudagraph_llm
=
llm_pair
prompts
=
[
"Hello, my name is"
]
*
batch_size
sampling_params
=
SamplingParams
(
temperature
=
0.0
,
max_tokens
=
max_tokens
,
top_p
=
0.95
)
piecewise_responses
=
piecewise_llm
.
generate
(
prompts
,
sampling_params
)
full_responses
=
full_cudagraph_llm
.
generate
(
prompts
,
sampling_params
)
# Check that all responses are the same
for
piecewise_res
,
full_res
in
zip
(
piecewise_responses
,
full_responses
):
assert
piecewise_res
.
outputs
[
0
].
text
==
full_res
.
outputs
[
0
].
text
@
pytest
.
mark
.
parametrize
(
"model, supported"
,
[
(
"Qwen/Qwen2-1.5B-Instruct"
,
True
),
# MLA does not support capturing CUDA Graphs with size > max_num_seqs
(
"deepseek-ai/DeepSeek-V2-Lite"
,
False
),
])
@
pytest
.
mark
.
skipif
(
current_platform
.
get_device_capability
()
!=
(
9
,
0
),
reason
=
"Only Hopper GPUs support FA3 and FlashMLA"
)
def
test_lower_max_num_seqs
(
model
,
supported
):
with
temporary_environ
({
"VLLM_USE_V1"
:
"1"
,
"VLLM_FLASH_ATTN_VERSION"
:
"3"
}),
ExitStack
()
as
stack
:
if
not
supported
:
stack
.
enter_context
(
pytest
.
raises
(
RuntimeError
))
llm
=
LLM
(
model
=
model
,
max_num_seqs
=
256
,
trust_remote_code
=
True
,
max_model_len
=
1024
,
compilation_config
=
CompilationConfig
(
full_cuda_graph
=
True
,
cudagraph_capture_sizes
=
[
64
,
256
,
512
]))
llm
.
generate
([
"Hello, my name is"
]
*
10
)
@
pytest
.
mark
.
skipif
(
not
current_platform
.
is_cuda
(),
reason
=
"Skip if not cuda"
)
def
test_full_cudagraph_with_invalid_backend
():
with
temporary_environ
({
"VLLM_USE_V1"
:
"1"
,
"VLLM_FLASH_ATTN_VERSION"
:
"2"
#FA2 not supported with full_cuda_graph
}),
pytest
.
raises
(
RuntimeError
):
LLM
(
model
=
MODEL
,
LLM
(
model
=
"Qwen/Qwen2-1.5B-Instruct"
,
compilation_config
=
CompilationConfig
(
full_cuda_graph
=
True
))
tests/compile/piecewise/test_simple.py
View file @
99324e25
...
...
@@ -4,7 +4,7 @@
Test the piecewise compilation with a simple model so that we
can exactly calculate the expected output and side effects.
"""
import
pytest
import
torch
from
torch
import
nn
from
torch.library
import
Library
...
...
@@ -14,6 +14,7 @@ from vllm.compilation.decorators import support_torch_compile
from
vllm.config
import
(
CompilationConfig
,
CompilationLevel
,
VllmConfig
,
set_current_vllm_config
)
from
vllm.envs
import
VLLM_USE_V1
from
vllm.forward_context
import
set_forward_context
from
vllm.utils
import
direct_register_custom_op
global_counter
=
0
...
...
@@ -76,7 +77,8 @@ class SillyModel(nn.Module):
return
x
def
_test_simple_piecewise_compile
(
*
,
use_inductor
):
@
pytest
.
mark
.
parametrize
(
"use_inductor"
,
[
True
,
False
])
def
test_simple_piecewise_compile
(
use_inductor
):
assert
VLLM_USE_V1
vllm_config
=
VllmConfig
(
compilation_config
=
CompilationConfig
(
...
...
@@ -99,7 +101,7 @@ def _test_simple_piecewise_compile(*, use_inductor):
num_backend_compilations
=
3
,
# num_piecewise_capturable_graphs_seen
num_cudagraph_captured
=
6
,
# num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
):
),
set_forward_context
({},
vllm_config
=
vllm_config
):
model
(
inputs
)
...
...
@@ -112,11 +114,3 @@ def _test_simple_piecewise_compile(*, use_inductor):
output
=
model
(
input
)
assert
global_counter
==
2
assert
torch
.
allclose
(
output
.
cpu
(),
torch
.
tensor
([
3.
,
1.
]))
def
test_simple_piecewise_compile_inductor
():
_test_simple_piecewise_compile
(
use_inductor
=
True
)
def
test_simple_piecewise_compile_no_inductor
():
_test_simple_piecewise_compile
(
use_inductor
=
False
)
tests/compile/piecewise/test_toy_llama.py
View file @
99324e25
...
...
@@ -11,6 +11,7 @@ initialized randomly with a fixed seed.
from
dataclasses
import
dataclass
from
typing
import
Any
,
Optional
import
pytest
import
torch
from
torch
import
nn
from
torch.library
import
Library
...
...
@@ -19,6 +20,7 @@ from vllm.compilation.counter import compilation_counter
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
(
CompilationConfig
,
CompilationLevel
,
VllmConfig
,
set_current_vllm_config
)
from
vllm.forward_context
import
set_forward_context
from
vllm.utils
import
direct_register_custom_op
# create a library to hold the custom op
...
...
@@ -285,29 +287,32 @@ def run_model(llama_config,
vllm_config
=
vllm_config
,
prefix
=
""
).
eval
().
cuda
()
B
=
16
# max batch size
input_ids
=
torch
.
randint
(
0
,
llama_config
.
vocab_size
,
(
B
,
)).
cuda
()
positions
=
torch
.
arange
(
B
).
cuda
()
with
set_forward_context
({},
vllm_config
=
vllm_config
):
B
=
16
# max batch size
input_ids
=
torch
.
randint
(
0
,
llama_config
.
vocab_size
,
(
B
,
)).
cuda
()
positions
=
torch
.
arange
(
B
).
cuda
()
model
(
input_ids
,
positions
)
model
(
input_ids
[:
2
],
positions
[:
2
])
model
(
input_ids
[:
1
],
positions
[:
1
])
model
(
input_ids
,
positions
)
model
(
input_ids
[:
2
],
positions
[:
2
])
model
(
input_ids
[:
1
],
positions
[:
1
])
input_ids
[:
2
].
zero_
()
output
=
model
(
input_ids
[:
2
],
positions
[:
2
])
input_ids
[:
2
].
zero_
()
output
=
model
(
input_ids
[:
2
],
positions
[:
2
])
output
=
output
.
cpu
()
output
=
output
.
cpu
()
if
llama_config
.
tractable_init
:
expected_output
=
tractable_computation
(
input_ids
[:
2
],
positions
[:
2
],
llama_config
).
cpu
()
if
llama_config
.
tractable_init
:
expected_output
=
tractable_computation
(
input_ids
[:
2
],
positions
[:
2
],
llama_config
).
cpu
()
assert
torch
.
allclose
(
output
,
expected_output
)
else
:
return
output
.
cpu
()
assert
torch
.
allclose
(
output
,
expected_output
)
else
:
return
output
.
cpu
()
def
_test_toy_llama
(
*
,
use_inductor
):
@
pytest
.
mark
.
parametrize
(
"use_inductor"
,
[
True
,
False
])
def
test_toy_llama
(
use_inductor
:
bool
):
# compare output with and without piecewise compilation
llama_config
=
LlamaConfig
(
hidden_size
=
128
,
...
...
@@ -379,14 +384,6 @@ def _test_toy_llama(*, use_inductor):
assert
torch
.
allclose
(
outputs
[
0
],
outputs
[
i
])
def
test_toy_llama_inductor
():
_test_toy_llama
(
use_inductor
=
True
)
def
test_toy_no_inductor
():
_test_toy_llama
(
use_inductor
=
False
)
@
torch
.
inference_mode
def
benchmark
():
from
triton.testing
import
do_bench
...
...
tests/compile/test_async_tp.py
View file @
99324e25
...
...
@@ -169,8 +169,7 @@ def async_tp_pass_on_test_model(local_rank: int, world_size: int,
# In pre-nodes, all gather or reduce scatter should exist,
# fused_matmul_reduce_scatter or fused_all_gather_matmul should not
backend
.
check_before_ops
(
model
.
ops_in_model_before
(),
ops_fully_replaced
=
False
)
backend
.
check_before_ops
(
model
.
ops_in_model_before
(),
fully_replaced
=
False
)
# In post-nodes, fused_matmul_reduce_scatter or \
# fused_all_gather_matmul should exist
...
...
@@ -223,7 +222,7 @@ def test_async_tp_pass_correctness(
"VLLM_USE_V1"
:
"1"
,
}
aysnc_tp_args
=
[
async_tp_args
=
[
*
common_args
,
"--tensor-parallel-size"
,
str
(
tp_size
),
...
...
@@ -242,7 +241,7 @@ def test_async_tp_pass_correctness(
]
compare_two_settings
(
model_id
,
aysnc_tp_args
,
async_tp_args
,
tp_args
,
async_tp_env
,
tp_env
,
...
...
tests/compile/test_basic_correctness.py
View file @
99324e25
...
...
@@ -31,7 +31,7 @@ class TestSetting:
# basic llama model
TestSetting
(
model
=
"meta-llama/Llama-3.2-1B-Instruct"
,
model_args
=
[],
model_args
=
[
"--max-model-len"
,
"2048"
],
pp_size
=
2
,
tp_size
=
2
,
attn_backend
=
"FLASHINFER"
,
...
...
@@ -41,7 +41,7 @@ class TestSetting:
# llama model with quantization
TestSetting
(
model
=
"TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ"
,
model_args
=
[
"--quantization"
,
"gptq"
],
model_args
=
[
"--quantization"
,
"gptq"
,
"--max-model-len"
,
"2048"
],
pp_size
=
1
,
tp_size
=
1
,
attn_backend
=
"FLASH_ATTN"
,
...
...
@@ -51,7 +51,7 @@ class TestSetting:
# MoE model
TestSetting
(
model
=
"ibm/PowerMoE-3b"
,
model_args
=
[],
model_args
=
[
"--max-model-len"
,
"2048"
],
pp_size
=
1
,
tp_size
=
2
,
attn_backend
=
"FLASH_ATTN"
,
...
...
@@ -61,23 +61,27 @@ class TestSetting:
# embedding model
TestSetting
(
model
=
"BAAI/bge-multilingual-gemma2"
,
model_args
=
[
"--task"
,
"embed"
,
"--dtype"
,
"bfloat16"
],
model_args
=
[
"--task"
,
"embed"
,
"--dtype"
,
"bfloat16"
,
"--max-model-len"
,
"2048"
],
pp_size
=
1
,
tp_size
=
1
,
attn_backend
=
"FLASH_ATTN"
,
method
=
"encode"
,
fullgraph
=
True
,
),
# encoder-based embedding model (BERT)
TestSetting
(
model
=
"BAAI/bge-base-en-v1.5"
,
model_args
=
[
"--task"
,
"embed"
],
pp_size
=
1
,
tp_size
=
1
,
attn_backend
=
"XFORMERS"
,
method
=
"encode"
,
fullgraph
=
True
,
),
# TODO: bert models are not supported in V1 yet
# # encoder-based embedding model (BERT)
# TestSetting(
# model="BAAI/bge-base-en-v1.5",
# model_args=["--task", "embed"],
# pp_size=1,
# tp_size=1,
# attn_backend="XFORMERS",
# method="encode",
# fullgraph=True,
# ),
# vision language model
TestSetting
(
model
=
"microsoft/Phi-3.5-vision-instruct"
,
...
...
tests/compile/test_config.py
View file @
99324e25
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
import
torch
import
vllm
from
vllm.compilation.counter
import
compilation_counter
from
vllm.config
import
(
CompilationConfig
,
CompilationLevel
,
VllmConfig
,
set_current_vllm_config
)
from
vllm.config
import
VllmConfig
from
vllm.utils
import
_is_torch_equal_or_newer
from
.piecewise.test_simple
import
SillyModel
def
test_version
():
assert
_is_torch_equal_or_newer
(
'2.8.0.dev20250624+cu128'
,
'2.8.0.dev'
)
assert
_is_torch_equal_or_newer
(
'2.8.0a0+gitc82a174'
,
'2.8.0.dev'
)
assert
_is_torch_equal_or_newer
(
'2.8.0'
,
'2.8.0.dev'
)
assert
_is_torch_equal_or_newer
(
'2.8.1'
,
'2.8.0.dev'
)
assert
not
_is_torch_equal_or_newer
(
'2.7.1'
,
'2.8.0.dev'
)
def
test_use_cudagraphs_dynamic
(
monkeypatch
):
assert
vllm
.
envs
.
VLLM_USE_V1
vllm_config
=
VllmConfig
()
assert
vllm_config
.
compilation_config
.
use_cudagraph
monkeypatch
.
setenv
(
'VLLM_USE_V1'
,
'0'
)
vllm_config
=
VllmConfig
()
assert
not
vllm_config
.
compilation_config
.
use_cudagraph
@
pytest
.
mark
.
parametrize
(
"enabled"
,
[
True
,
False
])
def
test_use_cudagraphs
(
enabled
):
def
test_use_cudagraphs
(
vllm_runner
,
monkeypatch
,
enabled
):
assert
vllm
.
envs
.
VLLM_USE_V1
vllm_config
=
VllmConfig
(
compilation_config
=
CompilationConfig
(
level
=
CompilationLevel
.
PIECEWISE
,
use_cudagraph
=
enabled
,
cudagraph_capture_sizes
=
[
100
],
))
with
set_current_vllm_config
(
vllm_config
):
model
=
SillyModel
(
vllm_config
=
vllm_config
,
prefix
=
''
)
inputs
=
torch
.
randn
(
100
,
device
=
"cuda"
)
with
compilation_counter
.
expect
(
num_graphs_seen
=
1
,
# one graph for the model
num_cudagraph_captured
=
1
if
enabled
else
0
,
):
# first run is warmup
model
(
inputs
)
# second run does CUDAGraphs recording (if enabled)
model
(
inputs
)
# Disable multiprocessing so that the counter is in the same process
monkeypatch
.
setenv
(
'VLLM_ENABLE_V1_MULTIPROCESSING'
,
'0'
)
compilation_config
=
{
"cudagraph_capture_sizes"
:
[
100
],
"use_cudagraph"
:
enabled
,
}
with
(
compilation_counter
.
expect
(
num_graphs_seen
=
1
,
num_gpu_runner_capture_triggers
=
1
if
enabled
else
0
,
num_cudagraph_captured
=
13
if
enabled
else
0
,
),
# loading the model causes compilation (if enabled) to happen
vllm_runner
(
'facebook/opt-125m'
,
compilation_config
=
compilation_config
,
gpu_memory_utilization
=
0.4
)
as
_
):
pass
tests/compile/test_fusion.py
View file @
99324e25
...
...
@@ -7,8 +7,7 @@ import torch
import
vllm.envs
as
envs
import
vllm.plugins
from
vllm.compilation.fusion
import
(
FUSED_OPS
,
QUANT_OPS
,
FusedRMSQuantKey
,
FusionPass
,
QuantKey
)
from
vllm.compilation.fx_utils
import
find_auto_fn
,
find_auto_fn_maybe
FusionPass
,
GroupShape
,
QuantKey
)
from
vllm.compilation.noop_elimination
import
NoOpEliminationPass
from
vllm.config
import
(
CompilationConfig
,
CompilationLevel
,
PassConfig
,
VllmConfig
)
...
...
@@ -30,9 +29,10 @@ class TestModel(torch.nn.Module):
self
.
cutlass_fp8_enabled
=
cutlass_fp8_enabled
self
.
norm
=
[
RMSNorm
(
hidden_size
,
eps
)
for
_
in
range
(
3
)]
self
.
wscale
=
[
torch
.
rand
(
1
,
dtype
=
torch
.
float32
)
for
_
in
range
(
2
)]
group_shape
=
GroupShape
.
PER_TENSOR
if
static
else
GroupShape
.
PER_TOKEN
self
.
key
=
QuantKey
(
dtype
=
FP8_DTYPE
,
static
=
static
,
per_tensor
=
static
,
group_shape
=
group_shape
,
symmetric
=
True
)
if
static
:
self
.
scale
=
[
torch
.
rand
(
1
,
dtype
=
torch
.
float32
)
for
_
in
range
(
2
)]
...
...
@@ -122,9 +122,7 @@ def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static,
torch
.
testing
.
assert_close
(
result
,
result2
,
atol
=
ATOL
,
rtol
=
RTOL
)
# In pre-nodes, fp8 quant should be there and fused kernels should not
backend
.
check_before_ops
(
model
.
ops_in_model_before
(),
find_auto_fn
,
find_auto_fn_maybe
)
backend
.
check_before_ops
(
model
.
ops_in_model_before
())
# In post-nodes, fused kernels should be there and fp8 quant should not
backend
.
check_after_ops
(
model
.
ops_in_model_after
(),
find_auto_fn
,
find_auto_fn_maybe
)
backend
.
check_after_ops
(
model
.
ops_in_model_after
())
tests/compile/test_fusion_attn.py
0 → 100644
View file @
99324e25
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
typing
import
Optional
import
pytest
import
torch._dynamo
from
tests.compile.backend
import
TestBackend
from
tests.models.utils
import
check_outputs_equal
from
vllm
import
LLM
,
SamplingParams
from
vllm.compilation.fusion
import
QUANT_OPS
,
QuantKey
,
kFp8StaticTensorSym
from
vllm.compilation.fusion_attn
import
ATTN_OP
,
AttnFusionPass
from
vllm.compilation.fx_utils
import
find_op_nodes
from
vllm.compilation.noop_elimination
import
NoOpEliminationPass
from
vllm.config
import
CompilationConfig
,
CompilationLevel
,
VllmConfig
from
vllm.platforms
import
current_platform
# globals needed for string-import custom Dynamo backend field
backend
:
Optional
[
TestBackend
]
=
None
backend_unfused
:
Optional
[
TestBackend
]
=
None
@
pytest
.
mark
.
parametrize
(
"model, quant_key"
,
[(
"amd/Llama-3.1-8B-Instruct-FP8-KV"
,
kFp8StaticTensorSym
)])
@
pytest
.
mark
.
parametrize
(
"use_triton_fa"
,
[
True
,
False
]
if
current_platform
.
is_rocm
()
else
[
False
])
@
pytest
.
mark
.
skipif
(
not
current_platform
.
supports_fp8
(),
reason
=
"Need FP8"
)
@
pytest
.
mark
.
skipif
(
not
current_platform
.
is_cuda_alike
(),
reason
=
"Only test CUDA and ROCm"
)
def
test_attention_fusion
(
example_prompts
,
monkeypatch
,
model
:
str
,
quant_key
:
QuantKey
,
use_triton_fa
:
bool
):
# Clean Dynamo cache to avoid reusing other test cases
# (for some reason the reset at the end is not enough)
torch
.
_dynamo
.
reset
()
# Use global backends
global
backend
,
backend_unfused
use_v1
=
False
# can be made a param once V1 support added
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
str
(
int
(
use_v1
)))
monkeypatch
.
setenv
(
"VLLM_USE_TRITON_FLASH_ATTN"
,
str
(
int
(
use_triton_fa
)))
# Prompt 4 seems too open-ended, differs between fused and unfused
# (both outputs look reasonable though)
prompts
=
example_prompts
[:
4
]
+
example_prompts
[
5
:]
compile_config
=
CompilationConfig
(
# DYNAMO_AS_IS triggers custom backend & does full Dynamo compilation
# DYNAMO_ONCE does not properly propagate shapes.
level
=
CompilationLevel
.
DYNAMO_AS_IS
,
backend
=
"tests.compile.test_fusion_attn.backend_unfused"
,
)
vllm_config
=
VllmConfig
(
compilation_config
=
compile_config
)
backend_unfused
=
TestBackend
(
NoOpEliminationPass
(
vllm_config
))
llm
=
LLM
(
model
,
enforce_eager
=
True
,
compilation_config
=
compile_config
,
gpu_memory_utilization
=
0.9
,
max_model_len
=
2048
)
sampling_params
=
SamplingParams
(
temperature
=
0.0
,
max_tokens
=
10
,
top_p
=
0.95
)
unfused_output
=
llm
.
generate
(
prompts
,
sampling_params
)
backend_unfused
=
None
# Reset backend to make sure llm gets released
del
llm
compile_config
=
CompilationConfig
(
# DYNAMO_AS_IS triggers custom backend & does full Dynamo compilation
# DYNAMO_ONCE does not properly propagate shapes.
level
=
CompilationLevel
.
DYNAMO_AS_IS
,
backend
=
"tests.compile.test_fusion_attn.backend"
,
)
vllm_config
=
VllmConfig
(
compilation_config
=
compile_config
)
# AttnFusionPass needs attention layers to be registered in config upon init
# so we initialize it during compilation.
attn_pass
=
lambda
*
args
,
**
kw
:
AttnFusionPass
(
vllm_config
)(
*
args
,
**
kw
)
backend
=
TestBackend
(
NoOpEliminationPass
(
vllm_config
),
attn_pass
)
llm2
=
LLM
(
model
,
enforce_eager
=
True
,
compilation_config
=
compile_config
,
gpu_memory_utilization
=
0.9
,
max_model_len
=
2048
)
# check support
attn_fusion_supported
=
[
layer
.
impl
.
fused_output_quant_supported
(
quant_key
.
dtype
,
quant_key
.
static
,
quant_key
.
group_shape
)
for
key
,
layer
in
compile_config
.
static_forward_context
.
items
()
]
print
(
f
"
{
attn_fusion_supported
=
}
"
)
if
any
(
attn_fusion_supported
):
# Check quant ops
backend
.
check_before_ops
([
QUANT_OPS
[
quant_key
]],
fully_replaced
=
False
)
# attention ops present in both, just output_scale param changes
attn_nodes_pre
=
list
(
find_op_nodes
(
ATTN_OP
,
backend
.
graph_pre_pass
))
attn_nodes_post
=
list
(
find_op_nodes
(
ATTN_OP
,
backend
.
graph_post_pass
))
assert
len
(
attn_nodes_pre
)
==
len
(
attn_nodes_post
)
for
i
in
range
(
len
(
attn_nodes_pre
)):
assert
attn_nodes_pre
[
i
].
kwargs
[
"output_scale"
]
is
None
fused
=
attn_nodes_post
[
i
].
kwargs
[
"output_scale"
]
is
not
None
assert
fused
==
attn_fusion_supported
[
i
],
\
f
"Node
{
i
}
{
''
if
fused
else
'not '
}
expected "
\
f
"to have fused output quant"
# check outputs
fused_output
=
llm2
.
generate
(
prompts
,
sampling_params
)
# transform outputs to format expected by check_outputs_equal
sample_outs
=
lambda
s
:
(
list
(
s
.
token_ids
),
s
.
text
)
outs_lst
=
lambda
ros
:
[
sample_outs
(
ro
.
outputs
[
0
])
for
ro
in
ros
]
check_outputs_equal
(
outputs_0_lst
=
outs_lst
(
unfused_output
),
outputs_1_lst
=
outs_lst
(
fused_output
),
name_0
=
"unfused"
,
name_1
=
"fused"
,
)
# Clean Dynamo cache to avoid polluting other case(s)
torch
.
_dynamo
.
reset
()
# Reset backend to make sure llm2 gets released
backend
=
None
tests/compile/test_sequence_parallelism.py
View file @
99324e25
...
...
@@ -6,7 +6,9 @@ import torch
import
vllm.envs
as
envs
from
vllm.compilation.fix_functionalization
import
FixFunctionalizationPass
from
vllm.compilation.fusion
import
FusionPass
from
vllm.compilation.fx_utils
import
find_auto_fn
,
find_auto_fn_maybe
,
is_func
from
vllm.compilation.noop_elimination
import
NoOpEliminationPass
from
vllm.compilation.sequence_parallelism
import
SequenceParallelismPass
from
vllm.config
import
(
CompilationConfig
,
DeviceConfig
,
ModelConfig
,
PassConfig
,
VllmConfig
)
...
...
@@ -14,12 +16,15 @@ from vllm.distributed import tensor_model_parallel_all_reduce
from
vllm.distributed.parallel_state
import
(
init_distributed_environment
,
initialize_model_parallel
)
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.quantization.utils.w8a8_utils
import
(
Fp8LinearOp
)
from
vllm.platforms
import
current_platform
from
vllm.utils
import
update_environment_variables
from
..utils
import
multi_gpu_test
from
.backend
import
TestBackend
FP8_DTYPE
=
current_platform
.
fp8_dtype
()
prompts
=
[
"Hello, my name is"
,
"The president of the United States is"
,
...
...
@@ -30,13 +35,16 @@ prompts = [
class
TestModel
(
torch
.
nn
.
Module
):
def
__init__
(
self
,
hidden_size
=
16
,
intermediate_size
=
32
):
def
__init__
(
self
,
hidden_size
=
16
,
intermediate_size
=
32
,
vllm_config
:
VllmConfig
=
None
):
super
().
__init__
()
self
.
hidden_size
=
hidden_size
self
.
intermediate_size
=
intermediate_size
self
.
gate_proj
=
torch
.
nn
.
Parameter
(
torch
.
empty
((
intermediate_size
,
hidden_size
)))
self
.
norm
=
RMSNorm
(
hidden_size
,
1e-05
)
self
.
norm
=
RMSNorm
(
intermediate_size
,
1e-05
)
# Initialize weights
torch
.
nn
.
init
.
normal_
(
self
.
gate_proj
,
std
=
0.02
)
...
...
@@ -79,32 +87,138 @@ class TestModel(torch.nn.Module):
return
[
torch
.
ops
.
_C
.
fused_add_rms_norm
.
default
]
class
TestQuantModel
(
torch
.
nn
.
Module
):
def
__init__
(
self
,
hidden_size
=
16
,
intermediate_size
=
32
,
vllm_config
:
VllmConfig
=
None
):
super
().
__init__
()
self
.
hidden_size
=
hidden_size
self
.
intermediate_size
=
intermediate_size
self
.
vllm_config
=
vllm_config
self
.
gate_proj
=
torch
.
nn
.
Parameter
(
torch
.
empty
(
(
intermediate_size
,
hidden_size
)),
requires_grad
=
False
)
self
.
norm
=
RMSNorm
(
intermediate_size
,
1e-05
)
# Initialize weights
torch
.
nn
.
init
.
normal_
(
self
.
gate_proj
,
std
=
0.02
)
self
.
fp8_linear
=
Fp8LinearOp
(
cutlass_fp8_supported
=
True
,
use_per_token_if_dynamic
=
False
)
self
.
scale
=
torch
.
rand
(
1
,
dtype
=
torch
.
float32
)
# Create a weight that is compatible with torch._scaled_mm,
# which expects a column-major layout.
self
.
w
=
torch
.
rand
(
hidden_size
,
intermediate_size
).
to
(
dtype
=
FP8_DTYPE
).
t
()
self
.
wscale
=
torch
.
rand
(
1
,
dtype
=
torch
.
float32
)
def
forward
(
self
,
hidden_states
,
residual
):
"""
Forward pass implementing the operations in the FX graph
Args:
hidden_states: Input tensor
residual: Residual tensor from previous layer
Returns:
Tuple containing the output tensor
"""
# Reshape input
view
=
hidden_states
.
reshape
(
-
1
,
self
.
hidden_size
)
#matrix multiplication
permute
=
self
.
gate_proj
.
permute
(
1
,
0
)
mm
=
torch
.
mm
(
view
,
permute
)
# Tensor parallel all-reduce
all_reduce
=
tensor_model_parallel_all_reduce
(
mm
)
# layer normalization
norm_output
,
residual_output
=
self
.
norm
(
all_reduce
,
residual
)
# for static input quantization
# self.fp8_linear is initialized with use_per_token_if_dynamic=False
fp8_linear_result
=
self
.
fp8_linear
.
apply
(
norm_output
,
self
.
w
,
self
.
wscale
,
input_scale
=
self
.
scale
.
to
(
norm_output
.
device
))
return
fp8_linear_result
,
residual_output
def
ops_in_model_before
(
self
):
ops_to_remove
=
[
torch
.
ops
.
vllm
.
all_reduce
.
default
]
# Always removed by SP
# The following are only removed if fusion happens
if
self
.
vllm_config
and
self
.
vllm_config
.
compilation_config
\
.
pass_config
.
enable_fusion
:
ops_to_remove
.
extend
([
torch
.
ops
.
_C
.
fused_add_rms_norm
.
default
,
torch
.
ops
.
_C
.
static_scaled_fp8_quant
.
default
,
])
return
ops_to_remove
def
ops_in_model_after
(
self
):
ops_to_add
=
[
torch
.
ops
.
vllm
.
reduce_scatter
.
default
,
torch
.
ops
.
vllm
.
all_gather
.
default
]
# The following is only added if fusion happens
if
self
.
vllm_config
and
self
.
vllm_config
.
compilation_config
\
.
pass_config
.
enable_fusion
:
ops_to_add
.
append
(
torch
.
ops
.
_C
.
fused_add_rms_norm_static_fp8_quant
.
default
)
return
ops_to_add
def
ops_in_model
(
self
):
if
self
.
vllm_config
and
self
.
vllm_config
.
compilation_config
\
.
pass_config
.
enable_fusion
:
# If fusion happens, the fused op is the one
# we check for (de)functionalization
return
[
torch
.
ops
.
_C
.
fused_add_rms_norm_static_fp8_quant
.
default
]
# noqa: E501
else
:
# If no fusion, the original ops are checked
return
[
torch
.
ops
.
_C
.
fused_add_rms_norm
.
default
,
# TODO functionalization pass does not handle this yet
# torch.ops._C.static_scaled_fp8_quant.default,
]
@
multi_gpu_test
(
num_gpus
=
2
)
@
pytest
.
mark
.
parametrize
(
"test_model_cls"
,
[
TestModel
,
TestQuantModel
])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
8
])
@
pytest
.
mark
.
parametrize
(
"seq_len"
,
[
16
])
@
pytest
.
mark
.
parametrize
(
"hidden_size"
,
[
16
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
torch
.
float16
,
torch
.
bfloat16
])
@
pytest
.
mark
.
parametrize
(
"enable_fusion"
,
[
True
,
False
])
@
pytest
.
mark
.
skipif
(
envs
.
VLLM_TARGET_DEVICE
not
in
[
"cuda"
],
reason
=
"Only test on CUDA"
)
def
test_sequence_parallelism_pass
(
batch_size
:
int
,
seq_len
:
int
,
hidden_size
:
int
,
dtype
:
torch
.
dtype
):
def
test_sequence_parallelism_pass
(
test_model_cls
:
type
[
torch
.
nn
.
Module
],
batch_size
:
int
,
seq_len
:
int
,
hidden_size
:
int
,
dtype
:
torch
.
dtype
,
enable_fusion
:
bool
):
num_processes
=
2
def
run_torch_spawn
(
fn
,
nprocs
):
# need to use torch.mp.spawn otherwise will have problems with
# torch.distributed and cuda
torch
.
multiprocessing
.
spawn
(
fn
,
args
=
(
num_processes
,
batch_size
,
seq_len
,
hidden_size
,
dtype
),
args
=
(
num_processes
,
test_model_cls
,
batch_size
,
seq_len
,
hidden_size
,
dtype
,
enable_fusion
),
nprocs
=
nprocs
)
run_torch_spawn
(
sequence_parallelism_pass_on_test_model
,
num_processes
)
def
sequence_parallelism_pass_on_test_model
(
local_rank
:
int
,
world_size
:
int
,
batch_size
:
int
,
seq_len
:
int
,
hidden_size
:
int
,
dtype
:
torch
.
dtype
):
def
sequence_parallelism_pass_on_test_model
(
local_rank
:
int
,
world_size
:
int
,
test_model_cls
:
type
[
torch
.
nn
.
Module
],
batch_size
:
int
,
seq_len
:
int
,
hidden_size
:
int
,
dtype
:
torch
.
dtype
,
enable_fusion
:
bool
):
current_platform
.
seed_everything
(
0
)
device
=
torch
.
device
(
f
"cuda:
{
local_rank
}
"
)
...
...
@@ -127,26 +241,39 @@ def sequence_parallelism_pass_on_test_model(local_rank: int, world_size: int,
# configure vllm config for SequenceParallelismPass
vllm_config
=
VllmConfig
()
vllm_config
.
compilation_config
=
CompilationConfig
(
pass_config
=
PassConfig
(
enable_sequence_parallelism
=
True
))
enable_sequence_parallelism
=
True
,
enable_fusion
=
enable_fusion
,
enable_noop
=
True
))
# NoOp needed for fusion
vllm_config
.
device_config
=
DeviceConfig
(
device
=
torch
.
device
(
"cuda"
))
# this is a fake model name to construct the model config
# in the vllm_config, it's not really used.
model
=
"nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-e2e"
vllm_config
.
model_config
=
ModelConfig
(
model
=
model
,
model
_name
=
"nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-e2e"
vllm_config
.
model_config
=
ModelConfig
(
model
=
model
_name
,
task
=
"auto"
,
tokenizer
=
model
,
tokenizer
=
model
_name
,
tokenizer_mode
=
"auto"
,
trust_remote_code
=
True
,
dtype
=
dtype
,
seed
=
42
)
sequence_parallelism_pass
=
SequenceParallelismPass
(
vllm_config
)
backend_no_func
=
TestBackend
(
sequence_parallelism_pass
)
noop_pass
=
NoOpEliminationPass
(
vllm_config
)
func_pass
=
FixFunctionalizationPass
(
vllm_config
)
backend_func
=
TestBackend
(
sequence_parallelism_pass
,
func_pass
)
model
=
TestModel
(
hidden_size
,
hidden_size
*
2
)
passes_for_backend
=
[
noop_pass
,
sequence_parallelism_pass
]
if
enable_fusion
:
fusion_pass
=
FusionPass
.
instance
(
vllm_config
)
passes_for_backend
.
append
(
fusion_pass
)
backend_no_func
=
TestBackend
(
*
passes_for_backend
)
backend_func
=
TestBackend
(
*
passes_for_backend
,
func_pass
)
model
=
test_model_cls
(
hidden_size
,
hidden_size
*
2
,
vllm_config
=
vllm_config
)
hidden_states
=
torch
.
randn
((
batch_size
*
seq_len
,
hidden_size
),
dtype
=
dtype
)
residual
=
torch
.
randn
((
batch_size
*
seq_len
,
hidden_size
),
dtype
=
dtype
)
...
...
tests/config/test_config_generation.py
0 → 100644
View file @
99324e25
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
from
vllm.engine.arg_utils
import
EngineArgs
from
vllm.model_executor.layers.quantization.quark.utils
import
deep_compare
def test_cuda_empty_vs_unset_configs(monkeypatch: pytest.MonkeyPatch):
    """Engine configs must not depend on CUDA_VISIBLE_DEVICES being empty.

    Builds one engine config with the environment untouched and one with
    CUDA_VISIBLE_DEVICES set to the empty string, then checks that the two
    are equivalent once the per-instance ``instance_id`` is dropped.
    """

    def build_engine_config():
        # Same model and arguments for both runs; only the environment
        # differs between the two calls.
        args = EngineArgs(model="deepseek-ai/DeepSeek-V2-Lite",
                          trust_remote_code=True)
        return args.create_engine_config()

    # Baseline: environment left as-is.
    baseline_config = build_engine_config()

    # Variant: GPU visibility disabled via an empty CUDA_VISIBLE_DEVICES.
    with monkeypatch.context() as patched_env:
        patched_env.setenv("CUDA_VISIBLE_DEVICES", "")
        hidden_gpu_config = build_engine_config()

    baseline_attrs = vars(baseline_config)
    hidden_gpu_attrs = vars(hidden_gpu_config)

    # instance_id is expected to differ between the two configs, so it is
    # removed from both attribute dicts before comparing.
    for attrs in (baseline_attrs, hidden_gpu_attrs):
        attrs.pop("instance_id", None)

    configs_match = deep_compare(baseline_attrs, hidden_gpu_attrs)
    assert configs_match, (
        "Configs with normal CUDA_VISIBLE_DEVICES and CUDA_VISIBLE_DEVICES=\"\""
        " should be equivalent")
tests/config/test_mp_reducer.py
0 → 100644
View file @
99324e25
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
sys
from
unittest.mock
import
patch
from
vllm.config
import
VllmConfig
from
vllm.engine.arg_utils
import
AsyncEngineArgs
from
vllm.v1.engine.async_llm
import
AsyncLLM
def test_mp_reducer(monkeypatch):
    """
    Test that _reduce_config reducer is registered when AsyncLLM is instantiated
    without transformers_modules. This is a regression test for
    https://github.com/vllm-project/vllm/pull/18640.

    The engine is always shut down, even when an assertion fails, and the
    removal of ``transformers_modules`` from ``sys.modules`` is undone by
    ``monkeypatch`` at teardown so it cannot leak into other tests.
    """
    # Use V1 AsyncLLM which calls maybe_register_config_serialize_by_value
    monkeypatch.setenv('VLLM_USE_V1', '1')

    # Ensure transformers_modules is not in sys.modules for the duration of
    # this test; raising=False makes this a no-op when it is already absent.
    monkeypatch.delitem(sys.modules, 'transformers_modules', raising=False)

    with patch('multiprocessing.reducer.register') as mock_register:
        engine_args = AsyncEngineArgs(
            model="facebook/opt-125m",
            max_model_len=32,
            gpu_memory_utilization=0.1,
            disable_log_stats=True,
            disable_log_requests=True,
        )

        async_llm = AsyncLLM.from_engine_args(
            engine_args,
            start_engine_loop=False,
        )

        try:
            assert mock_register.called, (
                "multiprocessing.reducer.register should have been called")

            vllm_config_registered = False
            for call_args in mock_register.call_args_list:
                # Verify that a reducer for VllmConfig was registered
                if len(call_args[0]) >= 2 and call_args[0][0] == VllmConfig:
                    vllm_config_registered = True

                    reducer_func = call_args[0][1]
                    assert callable(
                        reducer_func), "Reducer function should be callable"
                    break

            assert vllm_config_registered, (
                "VllmConfig should have been registered to "
                "multiprocessing.reducer")
        finally:
            # Release the engine even if one of the assertions above failed.
            async_llm.shutdown()
tests/conftest.py
View file @
99324e25
...
...
@@ -33,7 +33,7 @@ from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
from
vllm.logger
import
init_logger
from
vllm.outputs
import
RequestOutput
from
vllm.sampling_params
import
BeamSearchParams
from
vllm.
utils
import
cuda_device_count_stateless
from
vllm.
transformers_utils.utils
import
maybe_model_redirect
logger
=
init_logger
(
__name__
)
...
...
@@ -145,6 +145,7 @@ def run_with_both_engines(request, monkeypatch):
# Automatically runs tests twice, once with V1 and once without
use_v1
=
request
.
param
# Tests decorated with `@skip_v1` are only run without v1
skip_v0
=
request
.
node
.
get_closest_marker
(
"skip_v0"
)
skip_v1
=
request
.
node
.
get_closest_marker
(
"skip_v1"
)
if
use_v1
:
...
...
@@ -152,6 +153,8 @@ def run_with_both_engines(request, monkeypatch):
pytest
.
skip
(
"Skipping test on vllm V1"
)
monkeypatch
.
setenv
(
'VLLM_USE_V1'
,
'1'
)
else
:
if
skip_v0
:
pytest
.
skip
(
"Skipping test on vllm V0"
)
monkeypatch
.
setenv
(
'VLLM_USE_V1'
,
'0'
)
yield
...
...
@@ -318,6 +321,7 @@ class HfRunner:
skip_tokenizer_init
:
bool
=
False
,
auto_cls
:
type
[
_BaseAutoModelClass
]
=
AutoModelForCausalLM
,
)
->
None
:
model_name
=
maybe_model_redirect
(
model_name
)
self
.
model_name
=
model_name
self
.
config
=
AutoConfig
.
from_pretrained
(
...
...
@@ -727,8 +731,12 @@ class HfRunner:
**
kwargs
)
->
list
[
list
[
torch
.
Tensor
]]:
return
self
.
model
.
encode
(
prompts
,
*
args
,
**
kwargs
)
def
predict
(
self
,
prompts
:
list
[
list
[
str
]])
->
torch
.
Tensor
:
return
self
.
model
.
predict
(
prompts
,
convert_to_tensor
=
True
)
def predict(self, prompts: list[list[str]], *args,
            **kwargs) -> torch.Tensor:
    """Forward ``prompts`` to the wrapped model's ``predict``.

    Extra positional and keyword arguments are passed through unchanged;
    ``convert_to_tensor=True`` is always supplied by this wrapper.
    """
    wrapped_model = self.model
    prediction = wrapped_model.predict(prompts,
                                       *args,
                                       convert_to_tensor=True,
                                       **kwargs)
    return prediction
def
__enter__
(
self
):
return
self
...
...
@@ -1018,13 +1026,13 @@ class VllmRunner:
req_outputs
=
self
.
model
.
classify
(
prompts
)
return
[
req_output
.
outputs
.
probs
for
req_output
in
req_outputs
]
def
e
ncode
(
self
,
prompts
:
list
[
str
],
images
:
Optional
[
PromptImageInput
]
=
None
,
videos
:
Optional
[
PromptVideoInput
]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
*
args
,
**
kwargs
)
->
list
[
list
[
float
]]:
def
e
mbed
(
self
,
prompts
:
list
[
str
],
images
:
Optional
[
PromptImageInput
]
=
None
,
videos
:
Optional
[
PromptVideoInput
]
=
None
,
audios
:
Optional
[
PromptAudioInput
]
=
None
,
*
args
,
**
kwargs
)
->
list
[
list
[
float
]]:
inputs
=
self
.
get_inputs
(
prompts
,
images
=
images
,
videos
=
videos
,
...
...
@@ -1033,12 +1041,18 @@ class VllmRunner:
req_outputs
=
self
.
model
.
embed
(
inputs
,
*
args
,
**
kwargs
)
return
[
req_output
.
outputs
.
embedding
for
req_output
in
req_outputs
]
def encode(self, prompts: list[str]) -> list[list[float]]:
    """Run the wrapped model's ``encode`` and collect each output's data.

    Returns one entry per request output, taken from ``outputs.data``.
    """
    collected = []
    for request_output in self.model.encode(prompts):
        collected.append(request_output.outputs.data)
    return collected
def
score
(
self
,
text_1
:
Union
[
str
,
list
[
str
]],
text_2
:
Union
[
str
,
list
[
str
]],
*
args
,
**
kwargs
,
)
->
list
[
float
]:
req_outputs
=
self
.
model
.
score
(
text_1
,
text_2
)
req_outputs
=
self
.
model
.
score
(
text_1
,
text_2
,
*
args
,
**
kwargs
)
return
[
req_output
.
outputs
.
score
for
req_output
in
req_outputs
]
def
apply_model
(
self
,
func
:
Callable
[[
nn
.
Module
],
_R
])
->
list
[
_R
]:
...
...
@@ -1079,7 +1093,8 @@ def num_gpus_available():
"""Get number of GPUs without initializing the CUDA context
in current process."""
return
cuda_device_count_stateless
()
from
vllm.platforms
import
current_platform
return
current_platform
.
device_count
()
temp_dir
=
tempfile
.
gettempdir
()
...
...
tests/core/block/e2e/test_correctness.py
View file @
99324e25
...
...
@@ -437,8 +437,8 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator,
"enable_prefix_caching"
:
True
,
}])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
def
test_auto_prefix_caching_after_evition_start
(
baseline_llm_generator
,
test_llm_generator
):
def
test_auto_prefix_caching_after_evi
c
tion_start
(
baseline_llm_generator
,
test_llm_generator
):
"""Verify block manager v2 with auto prefix caching could works normal
even when eviction started.
With APC enabled, all blocks are held by native block at the beginning.
...
...
tests/core/block/e2e/test_correctness_sliding_window.py
View file @
99324e25
...
...
@@ -33,8 +33,8 @@ BLOCK_SIZE = 16
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
5
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
@
pytest
.
mark
.
parametrize
(
"backend"
,
[
"FLASH_ATTN"
,
"FLASHINFER"
,
"XFORMERS"
])
def
test_sliding_window_retrival
(
baseline_llm_generator
,
test_llm_generator
,
batch_size
,
seed
,
backend
,
monkeypatch
):
def
test_sliding_window_retri
e
val
(
baseline_llm_generator
,
test_llm_generator
,
batch_size
,
seed
,
backend
,
monkeypatch
):
"""
The test does a bunch of assignments "x1 = 10
\n
x2 = 33
\n
..." and then
asks for value of one of them (which is outside the sliding window).
...
...
@@ -100,7 +100,7 @@ def test_sliding_window_retrival(baseline_llm_generator, test_llm_generator,
def
test_sliding_window_chunked_prefill
(
test_llm_generator
,
batch_size
,
seed
,
backend
,
monkeypatch
):
"""
This is similar to test_sliding_window_retrival, however, it doesn't
This is similar to test_sliding_window_retri
e
val, however, it doesn't
compare against the v1 block manager since v1 doesn't support
chunked prefill with sliding window.
...
...
tests/core/test_scheduler.py
View file @
99324e25
...
...
@@ -594,8 +594,8 @@ def test_decode_schedule_preempted():
# should be preempted. 1 will also be preempted.
budget
=
create_token_budget
()
output
=
scheduler
.
_schedule_running
(
budget
,
curr_loras
)
remainig_running
=
scheduler
.
running
assert
len
(
remainig_running
)
==
0
remaini
n
g_running
=
scheduler
.
running
assert
len
(
remaini
n
g_running
)
==
0
assert
len
(
output
.
decode_seq_groups
)
==
1
assert
len
(
output
.
prefill_seq_groups
)
==
0
assert
output
.
decode_seq_groups
[
0
].
seq_group
.
request_id
==
"0"
...
...
@@ -1041,3 +1041,297 @@ def test_no_batches_mixed_with_prompt_tokens_and_prompt_embeds():
for
seq
in
scheduled_seq_group
.
seq_group
.
seqs
:
seq
.
status
=
SequenceStatus
.
FINISHED_STOPPED
scheduler
.
free_finished_seq_groups
()
def test_remove_seq_from_computed_blocks_tracker():
    """
    Test that computed_blocks_tracker correctly removes stale sequences
    during scheduling.

    The test covers 9 scheduling branches where stale seqs are removed:
    - 1 in _schedule_swapped
    - 1 in _schedule_priority_preemption
    - 7 in _schedule_prefills

    Each branch is tested to ensure proper cleanup of
    _seq_id_to_num_tokens_computed.

    Every scenario below follows the same shape: build a fresh scheduler
    with prefix caching enabled, queue a few dummy prompts, run one
    scheduling entry point, and then assert that the tracker's
    _seq_id_to_num_tokens_computed mapping has no entry for the key that
    is looked up.
    """
    # Scenario 1: budget can not schedule in swapped.
    block_size = 2
    max_seq_group = 3
    seq_tokens_with_swapped: list[list[int]] = []
    blocks_to_swap_out: list[tuple[int, int]] = []
    # Empty LoRA set; the same object is reused by the later scenarios that
    # call _schedule_prefills.
    curr_loras: set[int] = set()

    scheduler = initialize_scheduler(
        block_size=block_size,
        num_cpu_blocks=64,
        num_gpu_blocks=16,
        max_num_seqs=max_seq_group,
        enable_prefix_caching=True,
    )
    # Token budget (15) is smaller than a single prompt (16 tokens).
    budget = create_token_budget(token_budget=15)

    seq_length = 16
    num_seqs = 3
    for i in range(num_seqs):
        seq_tokens_with_swapped.append([i] * seq_length)

    seq_and_seq_groups = [
        create_dummy_prompt(f"{i}",
                            prompt_tokens=seq_tokens_with_swapped[i],
                            block_size=block_size)
        for i in range(len(seq_tokens_with_swapped))
    ]

    # Put every group into the swapped queue: allocate, swap out, enqueue.
    for _, seq_group in seq_and_seq_groups:
        scheduler._allocate_and_set_running(seq_group)
        scheduler._swap_out(seq_group, blocks_to_swap_out)
        scheduler._add_seq_group_to_swapped(seq_group)

    scheduler._schedule_swapped(budget, curr_loras)
    # No tracker entry may remain under key 1.
    seq_id_to_num_tokens_computed = (
        scheduler.block_manager._computed_blocks_tracker.
        _seq_id_to_num_tokens_computed.get(1))
    assert seq_id_to_num_tokens_computed is None

    # Scenario 2: the prefill schedule doesn't have space for another LoRA,
    # so we ignore this request for now.
    block_size = 4
    # Only one LoRA slot, while two requests with distinct LoRA ids are added.
    lora_config = LoRAConfig(max_lora_rank=8, max_loras=1)
    scheduler = initialize_scheduler(lora_config=lora_config,
                                     block_size=block_size,
                                     num_cpu_blocks=64,
                                     num_gpu_blocks=64,
                                     enable_prefix_caching=True)
    budget = create_token_budget(token_budget=120)
    num_seqs = 2
    for i in range(num_seqs):
        # NOTE: seq_length still holds 16 from the previous scenario.
        _, seq_group = create_dummy_prompt(str(i),
                                           prompt_length=seq_length,
                                           block_size=block_size,
                                           lora_request=LoRARequest(
                                               lora_name=str(i),
                                               lora_int_id=i + 1,
                                               lora_path="abc"))
        scheduler.add_seq_group(seq_group)

    scheduler._schedule_prefills(budget, curr_loras)
    seq_id_to_num_tokens_computed = (
        scheduler.block_manager._computed_blocks_tracker.
        _seq_id_to_num_tokens_computed.get(1))
    assert seq_id_to_num_tokens_computed is None

    # Scenario 3: priority preemption schedule, run on the same scheduler
    # and budget as scenario 2.
    scheduler._schedule_priority_preemption(budget)
    seq_id_to_num_tokens_computed = (
        scheduler.block_manager._computed_blocks_tracker.
        _seq_id_to_num_tokens_computed.get(1))
    assert seq_id_to_num_tokens_computed is None

    # Scenario 4: prefill scheduler does not schedule batches with prompt
    # tokens and prompt embeddings co-mingled.
    block_size = 2
    max_seq_group = 3
    scheduler = initialize_scheduler(
        block_size=block_size,
        num_cpu_blocks=16,
        num_gpu_blocks=16,
        max_num_seqs=max_seq_group,
        max_model_len=100,
        enable_prefix_caching=True,
    )
    seq_length = 7
    embedding_size = 5
    seq_tokens_with_embedding: list[list[int]] = []
    seq_embeds: list[Optional[torch.Tensor]] = []

    # First prompt: token ids only (no embeddings).
    seq_tokens_with_embedding.append(list(range(seq_length)))
    seq_embeds.append(None)
    # Second prompt: placeholder token ids plus a random embedding tensor.
    seq_tokens_with_embedding.append([0] * seq_length)
    seq_embeds.append(torch.rand(embedding_size))

    seq_and_seq_groups = [
        create_dummy_prompt(f"{i}",
                            prompt_tokens=seq_tokens_with_embedding[i],
                            prompt_embeds=seq_embeds[i],
                            block_size=block_size)
        for i in range(len(seq_tokens_with_embedding))
    ]

    for _, seq_group in seq_and_seq_groups:
        scheduler.add_seq_group(seq_group)

    scheduler._schedule_default()
    seq_id_to_num_tokens_computed = (
        scheduler.block_manager._computed_blocks_tracker.
        _seq_id_to_num_tokens_computed.get(1))
    assert seq_id_to_num_tokens_computed is None

    # Scenario 5: prefill scheduler budget num_batched_tokens
    # >= scheduler_config max_num_batched_tokens
    block_size = 2
    max_seq_group = 3
    seq_tokens_prefill_budget: list[list[int]] = []

    scheduler = initialize_scheduler(
        block_size=block_size,
        max_token_budget=8,
        num_cpu_blocks=16,
        num_gpu_blocks=16,
        max_num_seqs=max_seq_group,
        max_model_len=5,
        enable_prefix_caching=True,
    )
    # Three 4-token prompts against a token budget of 8.
    seq_length = 4
    num_seqs = 3
    for i in range(num_seqs):
        seq_tokens_prefill_budget.append([i] * seq_length)

    seq_and_seq_groups = [
        create_dummy_prompt(f"{i}",
                            prompt_tokens=seq_tokens_prefill_budget[i],
                            block_size=block_size)
        for i in range(len(seq_tokens_prefill_budget))
    ]

    for _, seq_group in seq_and_seq_groups:
        scheduler.add_seq_group(seq_group)

    scheduler._schedule_default()
    # This scenario checks key 2 (the third prompt added).
    seq_id_to_num_tokens_computed = (
        scheduler.block_manager._computed_blocks_tracker.
        _seq_id_to_num_tokens_computed.get(2))
    assert seq_id_to_num_tokens_computed is None

    # Scenario 6: budget can not schedule in waiting.
    block_size = 2
    max_seq_group = 3

    scheduler = initialize_scheduler(
        block_size=block_size,
        max_token_budget=30,
        num_cpu_blocks=16,
        num_gpu_blocks=16,
        max_num_seqs=max_seq_group,
        max_model_len=30,
        enable_prefix_caching=True,
    )
    # Three identical 16-token prompts against a token budget of 30.
    seq_length = 16
    num_seqs = 3
    seq_tokens_prefill_budget_waiting: list[list[int]] = []

    for i in range(num_seqs):
        seq_tokens_prefill_budget_waiting.append(list(range(seq_length)))

    seq_and_seq_groups = [
        create_dummy_prompt(f"{i}",
                            prompt_tokens=seq_tokens_prefill_budget_waiting[i],
                            block_size=block_size)
        for i in range(len(seq_tokens_prefill_budget_waiting))
    ]

    for _, seq_group in seq_and_seq_groups:
        scheduler.add_seq_group(seq_group)

    scheduler._schedule_default()
    seq_id_to_num_tokens_computed = (
        scheduler.block_manager._computed_blocks_tracker.
        _seq_id_to_num_tokens_computed.get(1))
    assert seq_id_to_num_tokens_computed is None

    # Scenario 7: sequence num_new_tokens > prompt_limit, marked
    # FINISHED_IGNORED.
    block_size = 2
    max_seq_group = 3
    scheduler = initialize_scheduler(
        block_size=block_size,
        num_cpu_blocks=16,
        num_gpu_blocks=16,
        max_num_seqs=max_seq_group,
        max_model_len=30,
        enable_prefix_caching=True,
    )

    # One prompt of 31 tokens, one more than max_model_len=30.
    seq_length = 31
    seq_tokens_prompt_limit: list[list[int]] = []
    seq_tokens_prompt_limit.append(list(range(seq_length)))
    seq_and_seq_groups = [
        create_dummy_prompt("0",
                            prompt_tokens=seq_tokens_prompt_limit[0],
                            block_size=block_size)
    ]
    for _, seq_group in seq_and_seq_groups:
        scheduler.add_seq_group(seq_group)
    scheduler._schedule_default()
    # Only one prompt exists here, so key 0 is checked.
    seq_id_to_num_tokens_computed = (
        scheduler.block_manager._computed_blocks_tracker.
        _seq_id_to_num_tokens_computed.get(0))
    assert seq_id_to_num_tokens_computed is None

    # Scenario 8: budget can not allocate, AllocStatus is NEVER, marked
    # FINISHED_IGNORED.
    block_size = 2
    max_seq_group = 3
    scheduler = initialize_scheduler(
        block_size=block_size,
        num_cpu_blocks=160,
        num_gpu_blocks=160,
        max_num_seqs=max_seq_group,
        max_model_len=320,
        enable_prefix_caching=True,
    )

    # One 320-token prompt: 160 blocks of size 2, equal to num_gpu_blocks.
    seq_length = 320
    num_seqs = 1
    seq_tokens_never: list[list[int]] = []
    for i in range(num_seqs):
        seq_tokens_never.append(list(range(seq_length)))

    seq_and_seq_groups = [
        create_dummy_prompt(f"{i}",
                            prompt_tokens=seq_tokens_never[i],
                            block_size=block_size)
        for i in range(len(seq_tokens_never))
    ]
    for _, seq_group in seq_and_seq_groups:
        scheduler.add_seq_group(seq_group)

    scheduler._schedule_default()
    seq_id_to_num_tokens_computed = (
        scheduler.block_manager._computed_blocks_tracker.
        _seq_id_to_num_tokens_computed.get(0))
    assert seq_id_to_num_tokens_computed is None

    # Scenario 9: budget can not allocate, AllocStatus is LATER.
    block_size = 2
    max_seq_group = 3
    scheduler = initialize_scheduler(
        block_size=block_size,
        num_cpu_blocks=160,
        num_gpu_blocks=160,
        max_num_seqs=max_seq_group,
        max_model_len=320,
        enable_prefix_caching=True,
    )

    # Two 160-token prompts (80 blocks of size 2 each).
    seq_length = 160
    num_seqs = 2
    seq_tokens_later: list[list[int]] = []
    for i in range(num_seqs):
        seq_tokens_later.append(list(range(seq_length)))

    seq_and_seq_groups = [
        create_dummy_prompt(f"{i}",
                            prompt_tokens=seq_tokens_later[i],
                            block_size=block_size)
        for i in range(len(seq_tokens_later))
    ]
    for _, seq_group in seq_and_seq_groups:
        scheduler.add_seq_group(seq_group)

    scheduler._schedule_default()
    seq_id_to_num_tokens_computed = (
        scheduler.block_manager._computed_blocks_tracker.
        _seq_id_to_num_tokens_computed.get(1))
    assert seq_id_to_num_tokens_computed is None
tests/cuda/test_cuda_context.py
0 → 100644
View file @
99324e25
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
ctypes
from
concurrent.futures
import
ThreadPoolExecutor
import
pytest
import
torch
from
vllm.platforms
import
current_platform
def check_cuda_context():
    """Report whether the calling thread has a current CUDA driver context.

    Loads ``libcuda.so`` through ctypes and calls ``cuCtxGetDevice``.

    Returns:
        ``(True, device_ordinal)`` when the call returns 0, otherwise
        ``(False, None)``. Any exception (for example, the library cannot
        be loaded) also yields ``(False, None)``.
    """
    try:
        driver = ctypes.CDLL('libcuda.so')
        ordinal = ctypes.c_int()
        status = driver.cuCtxGetDevice(ctypes.byref(ordinal))
        if status == 0:
            return (True, ordinal.value)
        return (False, None)
    except Exception:
        return False, None
def run_cuda_test_in_thread(device_input, expected_device_id):
    """Exercise ``current_platform.set_device`` from the calling thread.

    Intended to be submitted to a worker thread. Checks that no CUDA
    context exists beforehand, sets the device, and checks that the
    resulting context is valid and bound to ``expected_device_id``.

    Returns:
        A ``(success, message)`` tuple. Exceptions are not propagated;
        they are reported as a failure message instead.
    """
    try:
        # New thread should have no CUDA context initially
        had_context, initial_device = check_cuda_context()
        if had_context:
            failure = ("CUDA context should not exist in new thread, "
                       f"got device {initial_device}")
            return False, failure

        # Test setting CUDA context
        current_platform.set_device(device_input)

        # Verify context is created correctly
        has_context, device_id = check_cuda_context()
        if not has_context:
            return False, "CUDA context should be valid after set_cuda_context"
        if device_id != expected_device_id:
            mismatch = f"Expected device {expected_device_id}, got {device_id}"
            return False, mismatch

        return True, "Success"
    except Exception as e:
        return False, f"Exception in thread: {str(e)}"
class TestSetCudaContext:
    """Tests for setting the CUDA context via ``current_platform.set_device``."""

    @pytest.mark.skipif(not current_platform.is_cuda(),
                        reason="CUDA not available")
    @pytest.mark.parametrize(
        argnames="device_input,expected_device_id",
        argvalues=[
            (0, 0),
            (torch.device('cuda:0'), 0),
            ('cuda:0', 0),
        ],
        ids=["int", "torch_device", "string"],
    )
    def test_set_cuda_context_parametrized(self, device_input,
                                           expected_device_id):
        """Set the device from a fresh worker thread for each input form."""
        # A single-worker pool gives the check its own thread, separate
        # from the thread running pytest.
        with ThreadPoolExecutor(max_workers=1) as pool:
            pending = pool.submit(run_cuda_test_in_thread, device_input,
                                  expected_device_id)
            outcome = pending.result(timeout=30)
        success, message = outcome
        assert success, message

    @pytest.mark.skipif(not current_platform.is_cuda(),
                        reason="CUDA not available")
    def test_set_cuda_context_invalid_device_type(self):
        """A CPU device must be rejected with a ValueError."""
        cpu_device = torch.device('cpu')
        with pytest.raises(ValueError, match="Expected a cuda device"):
            current_platform.set_device(cpu_device)
# Allow running this file directly as a script: hand this file to pytest
# with verbose output.
if __name__ == "__main__":
    pytest.main([__file__, "-v"])
tests/distributed/test_eplb_algo.py
0 → 100644
View file @
99324e25
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
import
torch
from
vllm.distributed.eplb.rebalance_algo
import
rebalance_experts
def test_basic_rebalance():
    """Check shapes, value ranges and exact output on the reference example."""
    # Example from https://github.com/deepseek-ai/eplb
    layer0_load = [90, 132, 40, 61, 104, 165, 39, 4, 73, 56, 183, 86]
    layer1_load = [20, 107, 104, 64, 19, 197, 187, 157, 172, 86, 16, 27]
    weight = torch.tensor([layer0_load, layer1_load])

    num_layers = weight.shape[0]
    num_replicas, num_groups, num_nodes, num_gpus = 16, 4, 2, 8

    result = rebalance_experts(weight, num_replicas, num_groups, num_nodes,
                               num_gpus)
    phy2log, log2phy, logcnt = result

    # Output shapes
    assert phy2log.shape == (
        2,
        16,
    ), f"Expected `phy2log` shape (2, 16), got {phy2log.shape}"
    assert (log2phy.shape[0] == 2
            ), f"Expected `log2phy` first dimension 2, got {log2phy.shape[0]}"
    assert (
        log2phy.shape[1] == 12
    ), f"Expected `log2phy` second dimension 12, got {log2phy.shape[1]}"
    assert logcnt.shape == (
        2,
        12,
    ), f"Expected `logcnt` shape (2, 12), got {logcnt.shape}"

    # Every physical slot must point at a valid logical expert index.
    assert (phy2log >= 0).all() and (phy2log < 12).all(), \
        "Physical to logical mapping should be in range [0, 12)"

    # Replica counts: at least one per expert, and the right grand total.
    assert (logcnt >= 1).all(), \
        "Each logical expert should have at least 1 replica"
    assert (
        logcnt.sum(dim=1).sum() == num_replicas * num_layers
    ), f"Total replicas should be {num_replicas * num_layers}"

    # Exact expected output for the reference example.
    expected_phy2log = torch.tensor([
        [5, 6, 5, 7, 8, 4, 3, 4, 10, 9, 10, 2, 0, 1, 11, 1],
        [7, 10, 6, 8, 6, 11, 8, 9, 2, 4, 5, 1, 5, 0, 3, 1],
    ])
    assert (phy2log == expected_phy2log).all()

    expected_logcnt = torch.tensor([
        [1, 2, 1, 1, 2, 2, 1, 1, 1, 1, 2, 1],
        [1, 2, 1, 1, 1, 2, 2, 1, 2, 1, 1, 1],
    ])
    assert (logcnt == expected_logcnt).all()
def test_single_gpu_case():
    """One layer, one group, one node, one GPU, and no spare replicas."""
    weight = torch.tensor([[10, 20, 30, 40]])
    num_replicas = 4
    num_groups = num_nodes = num_gpus = 1

    phy2log, log2phy, logcnt = rebalance_experts(weight, num_replicas,
                                                 num_groups, num_nodes,
                                                 num_gpus)

    # Shapes
    assert phy2log.shape == (1, 4)
    assert log2phy.shape[0] == 1
    assert log2phy.shape[1] == 4
    assert logcnt.shape == (1, 4)

    # Each of the four logical experts must appear in the physical mapping.
    mapped_experts = set(phy2log[0].tolist())
    assert mapped_experts == {0, 1, 2, 3}
def test_equal_weights():
    """Uniform load with as many replicas as experts: one replica each."""
    weight = torch.tensor([[50] * 8])
    num_replicas, num_groups, num_nodes, num_gpus = 8, 2, 2, 4

    phy2log, _, logcnt = rebalance_experts(weight, num_replicas, num_groups,
                                           num_nodes, num_gpus)

    # Shapes
    assert phy2log.shape == (1, 8)
    assert logcnt.shape == (1, 8)

    # Eight experts share eight replicas, so nothing can be duplicated.
    every_expert_once = (logcnt == 1).all()
    assert every_expert_once, "With equal weights and no replication, " \
        "each expert should have exactly 1 replica"
def test_extreme_weight_imbalance():
    """A single dominant expert must receive more replicas than a light one."""
    weight = torch.tensor([[1000] + [1] * 7])
    num_replicas, num_groups, num_nodes, num_gpus = 12, 2, 2, 4

    phy2log, _, logcnt = rebalance_experts(weight, num_replicas, num_groups,
                                           num_nodes, num_gpus)

    # Shapes
    assert phy2log.shape == (1, 12)
    assert logcnt.shape == (1, 8)

    # Expert 0 carries the load of 1000; expert 1 carries a load of 1.
    heavy_replicas = logcnt[0, 0]
    light_replicas = logcnt[0, 1]
    assert (heavy_replicas > light_replicas
            ), "Expert with highest weight should have more replicas"
def test_multiple_layers():
    """Test multiple layers case.

    Three layers with different load patterns are rebalanced in one call;
    each layer must independently map to valid logical experts and use
    exactly ``num_replicas`` physical slots.
    """
    weight = torch.tensor([
        [10, 20, 30, 40, 50, 60],  # First layer
        [60, 50, 40, 30, 20, 10],  # Second layer (opposite weight pattern)
        [25, 25, 25, 25, 25, 25],  # Third layer (equal weights)
    ])
    # Derived from the input so the checks below stay in sync with it.
    num_layers, num_logical_experts = weight.shape
    num_replicas = 8
    num_groups = 2
    num_nodes = 2
    num_gpus = 4

    phy2log, _, logcnt = rebalance_experts(weight, num_replicas, num_groups,
                                           num_nodes, num_gpus)

    # Verify shapes
    assert phy2log.shape == (3, 8)
    assert logcnt.shape == (3, 6)

    # Verify expert allocation is reasonable for each layer
    for layer in range(num_layers):
        in_range = (torch.all(phy2log[layer] >= 0)
                    and torch.all(phy2log[layer] < num_logical_experts))
        # The original message was missing the space between "mapping" and
        # "should".
        assert in_range, (f"Layer {layer} physical to logical mapping "
                          "should be in range [0, 6)")

        assert (torch.sum(logcnt[layer]) == num_replicas
                ), f"Layer {layer} total replicas should be {num_replicas}"
def test_parameter_validation():
    """Check which parameter combinations are tolerated and which assert."""
    weight = torch.tensor([[10, 20, 30, 40]])

    # Non-divisible group count: expected to be handled without raising,
    # because the function falls back to the global load balancing strategy.
    outputs = rebalance_experts(weight, 8, 3, 2, 4)
    phy2log, _, logcnt = outputs
    assert phy2log.shape == (1, 8)
    assert logcnt.shape == (1, 4)

    # A combination that really is rejected: the number of physical experts
    # (7) is not divisible by the number of GPUs (4).
    with pytest.raises(AssertionError):
        rebalance_experts(weight, 7, 2, 2, 4)
def test_small_scale_hierarchical():
    """Hierarchical balancing on a small 8-expert, 4-group configuration."""
    # 8 experts
    expert_load = [100, 50, 200, 75, 150, 25, 300, 80]
    weight = torch.tensor([expert_load])
    num_replicas = 12
    # 4 groups of 2 experts each, spread over 2 nodes and 4 GPUs
    num_groups, num_nodes, num_gpus = 4, 2, 4

    phy2log, _, logcnt = rebalance_experts(weight, num_replicas, num_groups,
                                           num_nodes, num_gpus)

    # Basic constraints
    assert phy2log.shape == (1, 12)
    assert logcnt.shape == (1, 8)
    assert logcnt.sum() == num_replicas
    assert (logcnt >= 1).all()

    # The heaviest expert should be replicated at least once.
    max_weight_expert = weight[0].argmax()
    assert (logcnt[0, max_weight_expert]
            >= 2), "Highest weight expert should have multiple replicas"
def test_global_load_balance_fallback():
    """Group count not divisible by node count: global balancing fallback."""
    # When num_groups % num_nodes != 0, should fall back to global load
    # balancing
    weight = torch.tensor([[10, 20, 30, 40, 50, 60]])
    num_replicas = 8
    num_nodes = 2
    num_groups = 3  # Cannot be divided evenly by num_nodes=2
    num_gpus = 4

    phy2log, _, logcnt = rebalance_experts(weight, num_replicas, num_groups,
                                           num_nodes, num_gpus)

    # The call is expected to succeed and still place every replica.
    assert phy2log.shape == (1, 8)
    assert logcnt.shape == (1, 6)
    assert logcnt.sum() == num_replicas
@pytest.mark.parametrize("device", ["cpu", "cuda"])
def test_device_compatibility(device):
    """Check that ``rebalance_experts`` accepts weights on ``device``.

    The CUDA variant is skipped when no CUDA device is available. Only the
    output shapes are verified.
    """
    cuda_requested = device == "cuda"
    if cuda_requested and not torch.cuda.is_available():
        pytest.skip("CUDA not available")

    expert_load = torch.tensor([[10, 20, 30, 40]], device=device)
    num_replicas, num_groups, num_nodes, num_gpus = 6, 2, 1, 2

    phy2log, _, logcnt = rebalance_experts(expert_load, num_replicas,
                                           num_groups, num_nodes, num_gpus)

    # NOTE(review): the original author states the function converts its
    # input to CPU internally; that is not visible from this test, which
    # only requires that inputs on either device are handled.
    assert phy2log.shape == (1, 6)
    assert logcnt.shape == (1, 4)
def test_additional_cases():
    """Exercise two further parameter combinations of ``rebalance_experts``.

    Case 1 uses a single layer of 16 experts with 24 replicas, 8 groups,
    4 nodes and 8 GPUs. Case 2 uses two layers with mirrored (decreasing /
    increasing) loads and checks that the heaviest expert of each layer
    gets at least two replicas.
    """
    # Case 1: larger distributed setup.
    load_large = torch.tensor([[
        50, 100, 75, 120, 90, 60, 80, 110, 40, 70, 95, 85, 65, 55, 45, 35
    ]])
    phy2log_large, _, logcnt_large = rebalance_experts(
        load_large, 24, 8, 4, 8)

    assert phy2log_large.shape == (1, 24)
    assert logcnt_large.shape == (1, 16)
    assert logcnt_large.sum() == 24

    # Case 2: two layers with opposite weight distributions.
    load_mirrored = torch.tensor([
        [200, 150, 100, 50, 25, 12],  # Decreasing weights
        [12, 25, 50, 100, 150, 200],  # Increasing weights
    ])
    phy2log_mirrored, _, logcnt_mirrored = rebalance_experts(
        load_mirrored, 10, 3, 1, 2)

    assert phy2log_mirrored.shape == (2, 10)
    assert logcnt_mirrored.shape == (2, 6)

    # The heaviest expert in each layer must have been replicated.
    for layer, layer_load in enumerate(load_mirrored):
        heaviest = layer_load.argmax()
        assert logcnt_mirrored[layer, heaviest] >= 2
if __name__ == "__main__":
    # Manual smoke run: rebalance a fixed two-layer, 12-expert load table,
    # print the resulting physical-to-logical mapping, then run the basic
    # rebalance test directly.
    weight = torch.tensor([
        [90, 132, 40, 61, 104, 165, 39, 4, 73, 56, 183, 86],
        [20, 107, 104, 64, 19, 197, 187, 157, 172, 86, 16, 27],
    ])
    num_replicas, num_groups, num_nodes, num_gpus = 16, 4, 2, 8

    phy2log, log2phy, logcnt = rebalance_experts(
        weight,
        num_replicas,
        num_groups,
        num_nodes,
        num_gpus,
    )
    print(phy2log)

    test_basic_rebalance()
tests/distributed/test_eplb_execute.py
0 → 100644
View file @
99324e25
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
multiprocessing
import
os
import
random
import
pytest
import
torch
import
torch.distributed
from
vllm.distributed.eplb.rebalance_execute
import
(
rearrange_expert_weights_inplace
)
from
vllm.distributed.parallel_state
import
(
ensure_model_parallel_initialized
,
get_tp_group
,
init_distributed_environment
)
from
vllm.utils
import
update_environment_variables
def distributed_run(fn, world_size):
    """Run ``fn`` in ``world_size`` child processes and require all succeed.

    Each child is a ``multiprocessing.Process`` whose target is ``fn`` and
    whose only argument is a dict of environment-variable strings describing
    that child's place in the job (``RANK``, ``LOCAL_RANK``, ``WORLD_SIZE``,
    ``LOCAL_WORLD_SIZE``, ``MASTER_ADDR``, ``MASTER_PORT``). The dict is
    passed as an argument because ``multiprocessing.Process`` cannot be given
    environment variables directly; ``fn`` is responsible for applying it.

    Args:
        fn: callable taking the environment dict as its single argument.
        world_size: number of child processes to start.

    Raises:
        AssertionError: if any child finishes with a non-zero exit code. The
            message lists every failing rank together with its exit code.
    """
    processes: list[multiprocessing.Process] = []
    for rank in range(world_size):
        # NOTE(review): the rendezvous port is fixed, so two runs on the same
        # host at the same time would presumably collide - confirm whether
        # that matters for CI.
        env: dict[str, str] = {
            'RANK': str(rank),
            'LOCAL_RANK': str(rank),
            'WORLD_SIZE': str(world_size),
            'LOCAL_WORLD_SIZE': str(world_size),
            'MASTER_ADDR': 'localhost',
            'MASTER_PORT': '12345',
        }
        p = multiprocessing.Process(target=fn, args=(env, ))
        processes.append(p)
        p.start()

    # Wait for every child before inspecting any exit code.
    for p in processes:
        p.join()

    # Report all failing ranks at once instead of stopping at the first one
    # with an uninformative bare assertion.
    failed = {
        rank: p.exitcode
        for rank, p in enumerate(processes) if p.exitcode != 0
    }
    assert not failed, (
        f"Worker processes failed (rank -> exit code): {failed}")
def worker_fn_wrapper(fn):
    """Wrap ``fn`` so it can serve as a ``multiprocessing.Process`` target.

    The returned callable takes the environment dict as its only argument,
    applies it to the process environment, selects the CUDA device named by
    ``LOCAL_RANK``, initializes the distributed environment, seeds the
    Python and torch RNGs with 42, and finally calls ``fn()`` with no
    arguments.
    """

    # `multiprocessing.Process` has no way to receive environment variables,
    # so they arrive as an argument and are installed here, inside the child.
    def wrapped_fn(env):
        update_environment_variables(env)

        device = torch.device(f"cuda:{os.environ['LOCAL_RANK']}")
        torch.cuda.set_device(device)
        init_distributed_environment()

        # Every worker uses the same seeds so that random test data generated
        # independently in each process is identical.
        random.seed(42)
        torch.manual_seed(42)

        fn()

    return wrapped_fn
def create_expert_indices_with_redundancy(
    num_layers: int,
    num_logical_experts: int,
    total_physical_experts: int,
    redundancy_config: list[int],  # redundancy for each logical expert
) -> torch.Tensor:
    """
    Build a randomly shuffled physical-to-logical expert index table.

    Logical expert ``i`` occupies ``redundancy_config[i]`` physical slots in
    every layer; the slot order is then permuted independently per layer
    with ``torch.randperm``.

    Args:
        num_layers: number of layers
        num_logical_experts: number of logical experts
        total_physical_experts: total number of physical experts
        redundancy_config: number of replicas for each logical expert; must
            have ``num_logical_experts`` entries summing to
            ``total_physical_experts``

    Returns:
        indices: ``torch.long`` tensor of shape
            (num_layers, total_physical_experts)
    """
    assert sum(redundancy_config) == total_physical_experts
    assert len(redundancy_config) == num_logical_experts

    # Unshuffled layout shared by all layers: each logical id repeated as
    # many times as it has replicas.
    ordered_slots = torch.tensor(
        [
            logical_expert_id
            for logical_expert_id, copies in enumerate(redundancy_config)
            for _ in range(copies)
        ],
        dtype=torch.long,
    )

    indices = torch.zeros(num_layers,
                          total_physical_experts,
                          dtype=torch.long)
    # One fresh permutation per layer, drawn in layer order.
    for layer in range(num_layers):
        shuffle = torch.randperm(total_physical_experts)
        indices[layer] = ordered_slots[shuffle]

    return indices
def create_expert_weights(
    num_layers: int,
    num_local_experts: int,
    hidden_sizes: list[int],
    rank: int,
    device: torch.device,
    physical_to_logical_mapping: torch.Tensor,
) -> list[list[torch.Tensor]]:
    """
    Build deterministic fake expert weights for one rank.

    For every layer and every entry of ``hidden_sizes`` a float32 tensor of
    shape (num_local_experts, hidden_size) is created on ``device``. Row
    ``local_expert`` is an ``arange`` starting at
    ``logical_id * 1000 + layer * 100 + weight_idx * 10``, so the values
    depend only on the logical expert held in that slot and all replicas of
    one logical expert carry identical weights.

    Args:
        physical_to_logical_mapping: read at
            ``[layer, rank * num_local_experts + local_expert]``, i.e. it
            is indexed by global physical position and must cover the
            slots of this rank.
            mapping[layer, physical_pos] = logical_expert_id

    Returns:
        ``expert_weights[layer][weight_idx]`` as described above.
    """
    first_global_pos = rank * num_local_experts

    def build_matrix(layer: int, weight_idx: int,
                     hidden_size: int) -> torch.Tensor:
        matrix = torch.zeros(num_local_experts,
                             hidden_size,
                             device=device,
                             dtype=torch.float32)
        for local_expert in range(num_local_experts):
            # Logical expert stored in this rank's local slot.
            logical_expert_id = physical_to_logical_mapping[
                layer, first_global_pos + local_expert].item()

            # The start value encodes (logical expert, layer, weight matrix).
            start = (logical_expert_id * 1000 + layer * 100 +
                     weight_idx * 10)
            matrix[local_expert] = torch.arange(start,
                                                start + hidden_size,
                                                device=device,
                                                dtype=torch.float32)
        return matrix

    return [[
        build_matrix(layer, weight_idx, hidden_size)
        for weight_idx, hidden_size in enumerate(hidden_sizes)
    ] for layer in range(num_layers)]
def create_redundancy_config(
    num_logical_experts: int,
    num_physical_experts: int,
) -> list[int]:
    """Return a random replica count for each logical expert.

    Every logical expert starts with one replica; each of the
    ``num_physical_experts - num_logical_experts`` surplus slots is then
    handed to a logical expert picked with ``random.choice``.
    """
    replicas_per_expert = [1] * num_logical_experts
    candidates = range(num_logical_experts)
    surplus = num_physical_experts - num_logical_experts

    # Distribute the surplus physical experts one at a time.
    for _ in range(surplus):
        chosen = random.choice(candidates)
        replicas_per_expert[chosen] += 1

    return replicas_per_expert
def verify_expert_weights_after_shuffle(
    expert_weights: list[list[torch.Tensor]],
    new_indices: torch.Tensor,
    hidden_sizes: list[int],
    ep_rank: int,
    num_local_experts: int,
):
    """Check that this rank's weights match the layout in ``new_indices``.

    For every layer, weight matrix and local slot, the logical expert that
    ``new_indices`` assigns to the slot's global position
    (``ep_rank * num_local_experts + local_expert``) determines the expected
    row: an ``arange`` starting at
    ``logical_id * 1000 + layer * 100 + weight_idx * 10``. A mismatch is
    reported through ``torch.testing.assert_close``.
    """
    rank_offset = ep_rank * num_local_experts

    for layer, layer_weights in enumerate(expert_weights):
        for weight_idx, hidden_size in enumerate(hidden_sizes):
            weight_tensor = layer_weights[weight_idx]

            for local_expert in range(num_local_experts):
                # Logical expert that should now live in this slot.
                expected_logical_expert = new_indices[
                    layer, rank_offset + local_expert].item()

                actual_weights = weight_tensor[local_expert]
                expected_base = (expected_logical_expert * 1000 +
                                 layer * 100 + weight_idx * 10)
                expected_weights = torch.arange(
                    expected_base,
                    expected_base + hidden_size,
                    device=actual_weights.device,
                    dtype=actual_weights.dtype,
                )

                torch.testing.assert_close(
                    actual_weights,
                    expected_weights,
                    msg=f"Layer {layer}, weight {weight_idx},"
                    f"local expert {local_expert}: "
                    f"weights do not match. "
                    f"Expected logical expert {expected_logical_expert}")
def verify_redundant_experts_have_same_weights(
    expert_weights: list[list[torch.Tensor]],
    indices: torch.Tensor,
    hidden_sizes: list[int],
    world_size: int,
    num_local_experts: int,
):
    """
    Check that every replica of a logical expert holds identical weights.

    For each layer, every weight matrix is collected from all ranks with
    ``torch.distributed.all_gather``. Walking the physical slots in order,
    the first slot seen for a logical expert becomes the reference and each
    later slot of the same logical expert is compared against it with
    ``torch.testing.assert_close``.
    """
    total_physical_experts = world_size * num_local_experts

    for layer, layer_weights in enumerate(expert_weights):
        # all_weights[weight_idx]: [total_physical_experts, hidden_size]
        all_weights: list[torch.Tensor] = []

        for weight_idx, hidden_size in enumerate(hidden_sizes):
            # This rank's contribution: [num_local_experts, hidden_size]
            local_weights = layer_weights[weight_idx]

            gathered_weights = torch.zeros(total_physical_experts,
                                           hidden_size,
                                           device=local_weights.device,
                                           dtype=local_weights.dtype)

            # One chunk along dim 0 per rank serves as the all_gather output
            # list. The code relies on the gathered data being visible
            # through ``gathered_weights`` afterwards.
            per_rank_chunks = list(
                torch.chunk(gathered_weights, world_size, dim=0))
            torch.distributed.all_gather(per_rank_chunks, local_weights)

            all_weights.append(gathered_weights)

        # logical expert id -> {weight_idx: reference row}
        reference_rows: dict[int, dict[int, torch.Tensor]] = {}

        for physical_pos in range(total_physical_experts):
            logical_expert_id = int(indices[layer, physical_pos].item())
            reference = reference_rows.get(logical_expert_id)

            if reference is None:
                # First replica of this logical expert: remember its rows.
                reference_rows[logical_expert_id] = {
                    weight_idx: gathered[physical_pos]
                    for weight_idx, gathered in enumerate(all_weights)
                }
                continue

            # Later replica: it must equal the remembered reference.
            for weight_idx, gathered in enumerate(all_weights):
                torch.testing.assert_close(
                    gathered[physical_pos],
                    reference[weight_idx],
                    msg=f"Layer {layer}, weight {weight_idx},"
                    f"logical expert {logical_expert_id}: "
                    f"Physical expert {physical_pos} has different weights"
                    f"than expected")
@pytest.mark.parametrize(
    "world_size,num_layers,num_local_experts,num_logical_experts",
    [
        # 2 GPUs x 2 experts: 3 logical, 4 physical, 1 redundant
        (2, 1, 2, 3),
        # 2 GPUs x 3 experts: 4 logical, 6 physical, 2 redundant
        (2, 2, 3, 4),
        # 2 GPUs x 8 experts: 16 logical, 16 physical, 0 redundant
        (2, 4, 8, 16),
        # 4 GPUs x 2 experts: 6 logical, 8 physical, 2 redundant
        (4, 1, 2, 6),
        # 4 GPUs x 2 experts: 5 logical, 8 physical, 3 redundant
        (4, 2, 2, 5),
        # 4 GPUs x 8 experts: 16 logical, 32 physical, 16 redundant
        (4, 8, 8, 16),
    ])
def test_rearrange_expert_weights_with_redundancy(world_size, num_layers,
                                                  num_local_experts,
                                                  num_logical_experts):
    """Rearrange weights between two random redundant layouts and verify.

    One worker process per GPU builds an old and a new random layout,
    creates weights matching the old one, calls
    ``rearrange_expert_weights_inplace`` and then checks that the local
    weights follow the new layout and that replicas agree across ranks.
    Skipped when fewer than ``world_size`` GPUs are present.
    """
    if world_size > torch.cuda.device_count():
        pytest.skip(f"Need at least {world_size} GPUs to run the test")

    @worker_fn_wrapper
    def worker_fn():
        # Tensor parallelism is used as the entry point to expert
        # parallelism: the TP group doubles as the EP group.
        ensure_model_parallel_initialized(
            tensor_model_parallel_size=world_size,
            pipeline_model_parallel_size=1)

        ep_group = get_tp_group().cpu_group
        ep_rank = torch.distributed.get_rank()
        device = torch.device(f"cuda:{ep_rank}")

        total_physical_experts = world_size * num_local_experts
        hidden_sizes = [32, 64]  # Two different weight matrices

        def random_layout():
            # Draw a redundancy config first, then a shuffled index table
            # for it.
            config = create_redundancy_config(num_logical_experts,
                                              total_physical_experts)
            return create_expert_indices_with_redundancy(
                num_layers,
                num_logical_experts,
                total_physical_experts,
                config,
            )

        # Old layout is drawn before the new one.
        old_indices = random_layout()
        new_indices = random_layout()

        # Weights initially follow the old layout.
        expert_weights = create_expert_weights(num_layers,
                                               num_local_experts,
                                               hidden_sizes, ep_rank, device,
                                               old_indices)

        rearrange_expert_weights_inplace(
            old_indices,
            new_indices,
            expert_weights,
            ep_group,
            is_profile=False,
        )

        # After the call the weights must follow the new layout ...
        verify_expert_weights_after_shuffle(
            expert_weights,
            new_indices,
            hidden_sizes,
            ep_rank,
            num_local_experts,
        )
        # ... and all replicas of a logical expert must be identical.
        verify_redundant_experts_have_same_weights(
            expert_weights,
            new_indices,
            hidden_sizes,
            world_size,
            num_local_experts,
        )

    distributed_run(worker_fn, world_size)
@pytest.mark.parametrize("world_size", [2, 4])
def test_rearrange_expert_weights_no_change(world_size):
    """
    Rearranging with identical old and new indices must leave every weight
    tensor as it was.
    """
    if world_size > torch.cuda.device_count():
        pytest.skip(f"Need at least {world_size} GPUs to run the test")

    @worker_fn_wrapper
    def worker_fn():
        ensure_model_parallel_initialized(
            tensor_model_parallel_size=world_size,
            pipeline_model_parallel_size=1)

        ep_group = get_tp_group().cpu_group
        ep_rank = torch.distributed.get_rank()
        device = torch.device(f"cuda:{ep_rank}")

        num_layers = 2
        num_local_experts = 2
        hidden_sizes = [32, 64]
        total_physical_experts = world_size * num_local_experts
        # Half as many logical experts as slots: two replicas of each.
        num_logical_experts = total_physical_experts // 2
        redundancy_config = [2] * num_logical_experts

        indices = create_expert_indices_with_redundancy(
            num_layers, num_logical_experts, total_physical_experts,
            redundancy_config)

        expert_weights = create_expert_weights(num_layers,
                                               num_local_experts,
                                               hidden_sizes, ep_rank, device,
                                               indices)

        # Snapshot taken before the call, for comparison afterwards.
        original_weights = [[weight.clone() for weight in layer_weights]
                            for layer_weights in expert_weights]

        # Old and new layout are the same tensor, so nothing should move.
        rearrange_expert_weights_inplace(indices,
                                         indices,
                                         expert_weights,
                                         ep_group,
                                         is_profile=False)

        for layer in range(num_layers):
            for weight_idx in range(len(hidden_sizes)):
                current = expert_weights[layer][weight_idx]
                snapshot = original_weights[layer][weight_idx]
                torch.testing.assert_close(
                    current,
                    snapshot,
                    msg=f"Layer {layer}, weight {weight_idx} should remain "
                    f"unchanged")

    distributed_run(worker_fn, world_size)
@pytest.mark.parametrize("world_size", [2, 4])
def test_rearrange_expert_weights_profile_mode(world_size):
    """With ``is_profile=True`` the weights must be left untouched."""
    if world_size > torch.cuda.device_count():
        pytest.skip(f"Need at least {world_size} GPUs to run the test")

    @worker_fn_wrapper
    def worker_fn():
        ensure_model_parallel_initialized(
            tensor_model_parallel_size=world_size,
            pipeline_model_parallel_size=1)

        ep_group = get_tp_group().cpu_group
        ep_rank = torch.distributed.get_rank()
        device = torch.device(f"cuda:{ep_rank}")

        num_layers = 1
        num_local_experts = 2
        hidden_sizes = [32]
        total_physical_experts = world_size * num_local_experts
        num_logical_experts = total_physical_experts // 2

        # Two independently drawn layouts. Both redundancy configs are drawn
        # before either index table, matching the order of RNG use the test
        # has always had.
        old_redundancy = create_redundancy_config(num_logical_experts,
                                                  total_physical_experts)
        new_redundancy = create_redundancy_config(num_logical_experts,
                                                  total_physical_experts)

        old_indices = create_expert_indices_with_redundancy(
            num_layers, num_logical_experts, total_physical_experts,
            old_redundancy)
        new_indices = create_expert_indices_with_redundancy(
            num_layers, num_logical_experts, total_physical_experts,
            new_redundancy)

        expert_weights = create_expert_weights(num_layers,
                                               num_local_experts,
                                               hidden_sizes, ep_rank, device,
                                               old_indices)

        # Snapshot taken before the call, for comparison afterwards.
        original_weights = [[weight.clone() for weight in layer_weights]
                            for layer_weights in expert_weights]

        rearrange_expert_weights_inplace(
            old_indices,
            new_indices,
            expert_weights,
            ep_group,
            is_profile=True,  # Profile mode
        )

        # Profile mode must not have modified any weight tensor.
        for layer in range(num_layers):
            for weight_idx in range(len(hidden_sizes)):
                torch.testing.assert_close(
                    expert_weights[layer][weight_idx],
                    original_weights[layer][weight_idx],
                    msg="In profile mode, the weights should remain unchanged"
                )

    distributed_run(worker_fn, world_size)
tests/distributed/test_node_count.py
0 → 100644
View file @
99324e25
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
os
import
torch.distributed
as
dist
from
vllm.distributed.parallel_state
import
_node_count
from
vllm.distributed.utils
import
StatelessProcessGroup
from
vllm.utils
import
get_ip
,
get_open_port
if __name__ == "__main__":
    # Script entry point: every rank joins a gloo process group, rank 0
    # shares an (ip, port) pair used to build a StatelessProcessGroup, and
    # `_node_count` is then checked against NUM_NODES for both group kinds.
    dist.init_process_group(backend="gloo")

    rank = dist.get_rank()
    world_size = dist.get_world_size()

    # Rank 0 picks the rendezvous address; the others provide placeholders
    # that the broadcast fills in.
    if rank == 0:
        port = get_open_port()
        ip = get_ip()
        endpoint = [ip, port]
    else:
        endpoint = [None, None]
    dist.broadcast_object_list(endpoint, src=0)
    ip, port = endpoint

    stateless_pg = StatelessProcessGroup.create(ip, port, rank, world_size)

    # Expected node count comes from the environment (default: one node).
    expected = int(os.environ.get("NUM_NODES", "1"))

    groups_under_test = [
        ("torch distributed", dist.group.WORLD),
        ("StatelessProcessGroup", stateless_pg),
    ]
    for backend_name, pg in groups_under_test:
        test_result = _node_count(pg)

        assert test_result == expected, \
            f"Expected {expected} nodes, got {test_result}"

        print(f"Node count test passed! Got {test_result} nodes "
              f"when using {backend_name}!")
tests/distributed/test_quick_all_reduce.py
0 → 100644
View file @
99324e25
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
random
import
pytest
import
ray
import
torch
import
torch.distributed
as
dist
from
vllm.distributed.communication_op
import
(
# noqa
tensor_model_parallel_all_reduce
)
from
vllm.distributed.parallel_state
import
(
get_tensor_model_parallel_group
,
get_tp_group
,
graph_capture
)
from
vllm.platforms
import
current_platform
from
..utils
import
(
ensure_model_parallel_initialized
,
init_test_distributed_environment
,
multi_process_parallel
)
# Fixed seeds so the generated sizes are the same on every run.
torch.manual_seed(42)
random.seed(44)

# Eight random sizes between 8Mi and 10Mi, each rounded down to a multiple
# of 8. Size over 8MB is sufficient for custom quick allreduce.
test_sizes = [
    random.randint(8 * 1024 * 1024, 10 * 1024 * 1024) for _ in range(8)
]
test_sizes = [size - size % 8 for size in test_sizes]
@ray.remote(num_gpus=1, max_calls=1)
def graph_quickreduce(
    monkeypatch: pytest.MonkeyPatch,
    tp_size,
    pp_size,
    rank,
    distributed_init_port,
):
    """Ray task: exercise tensor-parallel all-reduce inside a CUDA graph.

    For every size in ``test_sizes`` and for float16 and bfloat16, two random
    integer-valued inputs are reduced with
    ``tensor_model_parallel_all_reduce`` while a CUDA graph is being
    captured. In the same capture the inputs themselves are reduced in place
    with ``dist.all_reduce`` on the tensor-parallel device group. After
    replaying the graph, each output is compared with its in-place reduced
    input using ``atol=2.5, rtol=0.1``.

    Args:
        monkeypatch: used to drop ``CUDA_VISIBLE_DEVICES`` for the duration
            of the task.
        tp_size: tensor parallel size.
        pp_size: pipeline parallel size.
        rank: rank of this worker; also selects ``cuda:{rank}``.
        distributed_init_port: forwarded to
            ``init_test_distributed_environment``.
    """
    with monkeypatch.context() as m:
        # With CUDA_VISIBLE_DEVICES removed, the device can be picked by
        # rank below.
        # NOTE(review): presumably Ray restricts the variable to the one
        # assigned GPU - confirm.
        m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
        device = torch.device(f"cuda:{rank}")
        torch.cuda.set_device(device)
        init_test_distributed_environment(tp_size, pp_size, rank,
                                          distributed_init_port)
        ensure_model_parallel_initialized(tp_size, pp_size)
        group = get_tensor_model_parallel_group().device_group

        # A small all_reduce for warmup.
        # this is needed because device communicators might be created lazily
        # (e.g. NCCL). This will ensure that the communicator is initialized
        # before any communication happens, so that this group can be used for
        # graph capture immediately.
        data = torch.zeros(1)
        data = data.to(device=device)
        torch.distributed.all_reduce(data, group=group)
        torch.cuda.synchronize()
        del data

        # we use the first group to communicate once
        # and the second group to communicate twice
        # and so on
        # this is used to demonstrate that each group can
        # communicate independently
        num_communication = rank // tp_size + 1

        for sz in test_sizes:
            for dtype in [torch.float16, torch.bfloat16]:
                with graph_capture(device=device) as graph_capture_context:
                    # Integer-valued inputs: positive for inp1, non-positive
                    # for inp2.
                    inp1 = torch.randint(1,
                                         23, (sz, ),
                                         dtype=dtype,
                                         device=torch.cuda.current_device())
                    inp2 = torch.randint(-23,
                                         1, (sz, ),
                                         dtype=dtype,
                                         device=torch.cuda.current_device())
                    torch.cuda.synchronize()
                    graph = torch.cuda.CUDAGraph()
                    with torch.cuda.graph(graph,
                                          stream=graph_capture_context.stream):
                        for _ in range(num_communication):
                            # Each out-of-place reduce is followed at once by
                            # an in-place reduce of the same input buffer.
                            # NOTE(review): the in-place result presumably
                            # serves as the reference value - confirm.
                            out1 = tensor_model_parallel_all_reduce(inp1)
                            dist.all_reduce(inp1, group=group)
                            out2 = tensor_model_parallel_all_reduce(inp2)
                            dist.all_reduce(inp2, group=group)
                # The comparison below is made after replaying the captured
                # graph.
                graph.replay()
                torch.testing.assert_close(out1, inp1, atol=2.5, rtol=0.1)
                torch.testing.assert_close(out2, inp2, atol=2.5, rtol=0.1)
@ray.remote(num_gpus=1, max_calls=1)
def eager_quickreduce(
    monkeypatch: pytest.MonkeyPatch,
    tp_size,
    pp_size,
    rank,
    distributed_init_port,
):
    """Ray task: call ``quick_all_reduce`` eagerly and check the result.

    Every rank feeds the same repeating pattern ``0, 1, ..., 22`` as
    float16 and then as bfloat16; the reduced output is compared against
    ``inp * tp_size`` with ``atol=2.5, rtol=0.1``.
    """
    with monkeypatch.context() as m:
        m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
        device = torch.device(f"cuda:{rank}")
        torch.cuda.set_device(device)

        init_test_distributed_environment(tp_size, pp_size, rank,
                                          distributed_init_port)

        # Size over 8MB is sufficient for custom quick allreduce.
        sz = 16 * 1024 * 1024
        fa = get_tp_group().device_communicator.qr_comm

        # Same check for both dtypes, float16 first.
        for dtype in (torch.float16, torch.bfloat16):
            pattern = [1.0 * ((i) % 23) for i in range(sz)]
            inp = torch.tensor(pattern, dtype=dtype, device=device)
            out = fa.quick_all_reduce(inp)
            torch.testing.assert_close(out,
                                       inp * tp_size,
                                       atol=2.5,
                                       rtol=0.1)
@pytest.mark.skipif(not current_platform.is_rocm(),
                    reason="only test quick allreduce for rocm")
@pytest.mark.parametrize("quant_mode", ["FP", "INT8", "INT6", "INT4"])
@pytest.mark.parametrize("tp_size", [2])
@pytest.mark.parametrize("pipeline_parallel_size", [1, 2])
@pytest.mark.parametrize("test_target",
                         [graph_quickreduce, eager_quickreduce])
def test_custom_quick_allreduce(monkeypatch: pytest.MonkeyPatch, tp_size,
                                pipeline_parallel_size, test_target,
                                quant_mode):
    """Run one quick all-reduce worker target per rank.

    The test is skipped on non-ROCm platforms and when fewer GPUs than
    ``tp_size * pipeline_parallel_size`` are available. The quantization
    mode is passed to the workers through the
    ``VLLM_ROCM_QUICK_REDUCE_QUANTIZATION`` environment variable.
    """
    world_size = tp_size * pipeline_parallel_size
    if torch.cuda.device_count() < world_size:
        pytest.skip("Not enough GPUs to run the test.")

    monkeypatch.setenv("VLLM_ROCM_QUICK_REDUCE_QUANTIZATION", quant_mode)

    multi_process_parallel(
        monkeypatch,
        tp_size,
        pipeline_parallel_size,
        test_target,
    )
Prev
1
…
10
11
12
13
14
15
16
17
18
…
24
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment