Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
469e903b
Commit
469e903b
authored
Mar 28, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.8.2' into v0.8.2-dev
parents
389ebcf7
25f560a6
Changes
535
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1019 additions
and
341 deletions
+1019
-341
tests/neuron/1_core/test_block_table.py
tests/neuron/1_core/test_block_table.py
+153
-0
tests/neuron/1_core/test_cache.py
tests/neuron/1_core/test_cache.py
+83
-0
tests/neuron/1_core/test_layernorm.py
tests/neuron/1_core/test_layernorm.py
+56
-0
tests/neuron/1_core/test_logits_processor.py
tests/neuron/1_core/test_logits_processor.py
+94
-0
tests/neuron/1_core/test_prefix_prefill.py
tests/neuron/1_core/test_prefix_prefill.py
+264
-221
tests/neuron/1_core/test_rotary_embedding.py
tests/neuron/1_core/test_rotary_embedding.py
+58
-0
tests/neuron/2_core/test_comm_ops.py
tests/neuron/2_core/test_comm_ops.py
+100
-0
tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
...dd_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
+3
-7
tests/plugins_tests/conftest.py
tests/plugins_tests/conftest.py
+11
-0
tests/plugins_tests/test_platform_plugins.py
tests/plugins_tests/test_platform_plugins.py
+7
-6
tests/plugins_tests/test_scheduler_plugins.py
tests/plugins_tests/test_scheduler_plugins.py
+53
-18
tests/prefix_caching/test_disable_sliding_window.py
tests/prefix_caching/test_disable_sliding_window.py
+4
-1
tests/prefix_caching/test_prefix_caching.py
tests/prefix_caching/test_prefix_caching.py
+85
-57
tests/quantization/test_bitsandbytes.py
tests/quantization/test_bitsandbytes.py
+8
-6
tests/quantization/test_compressed_tensors.py
tests/quantization/test_compressed_tensors.py
+8
-2
tests/quantization/test_configs.py
tests/quantization/test_configs.py
+1
-2
tests/quantization/test_cpu_offload.py
tests/quantization/test_cpu_offload.py
+12
-5
tests/quantization/test_experts_int8.py
tests/quantization/test_experts_int8.py
+2
-2
tests/quantization/test_fp8.py
tests/quantization/test_fp8.py
+13
-12
tests/quantization/test_gptq_dynamic.py
tests/quantization/test_gptq_dynamic.py
+4
-2
No files found.
Too many changes to show.
To preserve performance only
535 of 535+
files are displayed.
Plain diff
Email patch
tests/neuron/1_core/test_block_table.py
0 → 100644
View file @
469e903b
# SPDX-License-Identifier: Apache-2.0
import
neuronxcc.nki.language
as
nl
import
pytest
import
torch
import
torch.nn.functional
as
F
from
neuronxcc
import
nki
from
vllm.attention.ops.nki_flash_attn
import
(
load_block_tables
,
transform_block_tables_for_indirect_load
)
def is_power_of_2(n):
    """Return True when ``n`` is a positive power of two (1, 2, 4, ...).

    Zero and negative values yield False.
    """
    # A power of two has exactly one bit set, so clearing its lowest set bit
    # with ``n & (n - 1)`` leaves zero. The parentheses make the grouping
    # explicit instead of relying on operator precedence.
    return n > 0 and (n & (n - 1)) == 0
def nki_load_and_transform_block_tables(
    block_tables,
    num_tiles,
    num_blocks_per_tile,
    num_head,
    head_id,
    block_size_tiling_factor,
):
    """Kernel wrapper: load block tables, transform them, and store the result.

    Calls ``load_block_tables`` followed by
    ``transform_block_tables_for_indirect_load`` and copies the transformed
    table, one leading-dimension slice at a time, into a freshly allocated
    ``nl.shared_hbm`` int32 array which is returned.

    ``num_blocks_per_tile`` must be a power of two.
    """
    assert is_power_of_2(
        num_blocks_per_tile), f"{num_blocks_per_tile=} must be power of 2"
    block_tables_sbuf = load_block_tables(block_tables, num_tiles,
                                          num_blocks_per_tile)

    # we need to pass an Index as head_id
    head_id = nl.arange(1)[None, :] + head_id

    block_tables_transposed = transform_block_tables_for_indirect_load(
        block_tables_sbuf, block_size_tiling_factor, num_head, head_id)
    B_P_SIZE = 128
    # The transformed table is expected to have B_P_SIZE entries along dim 1.
    assert block_tables_transposed.shape[1] == B_P_SIZE

    out = nl.ndarray(
        block_tables_transposed.shape,
        dtype=nl.int32,
        buffer=nl.shared_hbm,
    )
    for i in nl.affine_range(block_tables_transposed.shape[0]):
        nl.store(dst=out[i], value=block_tables_transposed[i])
    return out
def ref_block_tables_transform(
    block_tables,
    num_tiles,
    num_blocks_per_tile,
    num_head,
    head_id,
    block_size_tiling_factor,
):
    """Reference (pure PyTorch) version of the block-table transform.

    Args:
        block_tables: tensor with ``num_tiles * num_blocks_per_tile`` elements.
        num_tiles: number of rows after reshaping ``block_tables``.
        num_blocks_per_tile: number of columns after reshaping.
        num_head: multiplier applied to every block id.
        head_id: offset added after the ``num_head`` multiplication.
        block_size_tiling_factor: each block id is expanded into this many
            consecutive ids.

    Returns:
        A tensor of shape
        ``(num_blocks_per_tile * block_size_tiling_factor // 128, 128,
        num_tiles_padded)`` where ``num_tiles_padded`` is ``num_tiles``
        rounded up to a multiple of 128.
    """
    assert block_tables.numel() == num_tiles * num_blocks_per_tile
    block_tables = block_tables.view(num_tiles, num_blocks_per_tile)

    B_F_SIZE = 128
    # Round the tile count up to a multiple of B_F_SIZE and zero-pad the
    # extra rows (padding happens before the head scaling below, so padded
    # entries end up equal to ``head_id`` scaled by the tiling factor).
    num_tiles_padded = (num_tiles + B_F_SIZE - 1) // B_F_SIZE * B_F_SIZE
    block_tables = F.pad(
        block_tables,
        (0, 0, 0, num_tiles_padded - num_tiles),
        "constant",
        0,
    )

    block_tables = block_tables * num_head + head_id
    block_tables = block_tables.view(num_tiles_padded, num_blocks_per_tile, 1)
    # Expand every id into ``block_size_tiling_factor`` consecutive ids.
    offset = torch.arange(0, block_size_tiling_factor).view(1, 1, -1)
    block_tables = block_tables * block_size_tiling_factor + offset
    block_tables_transposed = block_tables.view(num_tiles_padded, -1).t()

    # Row count after expansion; kept in its own name rather than rebinding
    # the ``num_blocks_per_tile`` parameter.
    num_expanded_blocks = block_tables_transposed.shape[0]
    assert num_expanded_blocks % B_F_SIZE == 0
    return block_tables_transposed.view(num_expanded_blocks // B_F_SIZE,
                                        B_F_SIZE, num_tiles_padded)
@pytest.mark.parametrize(
    "q_head_per_kv_head,head_id",
    [
        (1, 0),
        (3, 1),
    ],
)
@pytest.mark.parametrize(
    "num_tiles,num_blocks_per_tile",
    [
        (1, 1),
        (13, 16),
        (17, 128),
        (35, 512),
        (128, 128),
        (130, 64),
        (280, 256),
        (315, 1),
    ],
)
@torch.inference_mode()
def test_load_and_transform_block_tables(
    monkeypatch: pytest.MonkeyPatch,
    num_tiles,
    num_blocks_per_tile,
    q_head_per_kv_head,
    head_id,
) -> None:
    """Compare the NKI block-table transform against the PyTorch reference."""
    import torch_xla.core.xla_model as xm

    device = xm.xla_device()

    compiler_flags_str = " ".join([
        "-O1",
        "--retry_failed_compilation",
    ])
    # The environment variable is only set inside this context, so the
    # kernel invocation has to stay within the ``with`` body.
    with monkeypatch.context() as m:
        m.setenv("NEURON_CC_FLAGS", compiler_flags_str)

        torch.manual_seed(10000)
        torch.set_printoptions(sci_mode=False)

        # On Neuron, we need B_P_SIZE = 128 blocks to make DMA efficient
        B_P_SIZE = 128
        if num_blocks_per_tile < B_P_SIZE:
            assert B_P_SIZE % num_blocks_per_tile == 0
            block_size_tiling_factor = B_P_SIZE // num_blocks_per_tile
        else:
            block_size_tiling_factor = 1

        max_num_blocks = 100000
        block_tables = torch.randint(
            0,
            max_num_blocks,
            (num_tiles * num_blocks_per_tile, ),
            dtype=torch.int32,
        )

        nki_out = nki.jit(nki_load_and_transform_block_tables)[1, 1](
            block_tables.to(device=device),
            num_tiles,
            num_blocks_per_tile,
            q_head_per_kv_head,
            head_id,
            block_size_tiling_factor,
        ).cpu()
        ref_out = ref_block_tables_transform(
            block_tables,
            num_tiles,
            num_blocks_per_tile,
            q_head_per_kv_head,
            head_id,
            block_size_tiling_factor,
        )
        assert (nki_out.shape == ref_out.shape
                ), f"{nki_out.shape=} != {ref_out.shape=}"
        assert torch.all(nki_out == ref_out)
tests/neuron/1_core/test_cache.py
0 → 100644
View file @
469e903b
# SPDX-License-Identifier: Apache-2.0
import
pytest
import
torch
from
vllm.attention.ops.nki_flash_attn
import
reshape_and_cache
@pytest.mark.parametrize(
    "num_tokens, n_kv_head, d_head, num_blocks, block_size",
    [
        # Small model configuration (e.g., GPT-2 small)
        (32, 12, 64, 4, 128),  # Typical sequence processing
        (1, 12, 64, 4, 128),  # Single token update
        (128, 12, 64, 4, 128),  # Longer sequence

        # Medium model configuration (e.g., GPT-2 medium)
        (64, 16, 96, 8, 256),  # Standard batch
        (256, 16, 96, 8, 256),  # Large batch

        # Large model configuration (e.g., GPT-3 style)
        (48, 32, 128, 16, 512),  # Typical processing window
        (512, 32, 128, 16, 512),  # Full context window

        # Edge cases and stress tests
        (1024, 8, 32, 32, 32),  # Many tokens, small heads
        (16, 64, 256, 4, 64),  # Few tokens, many heads
        (2048, 24, 128, 64, 128),  # Large scale test

        # Minimal configurations for debugging
        (4, 2, 16, 2, 16),  # Tiny test case
        (1, 1, 8, 1, 8),  # Minimal possible
    ])
def test_reshape_and_cache(num_tokens, n_kv_head, d_head, num_blocks,
                           block_size):
    """Check ``reshape_and_cache`` on XLA against a per-token CPU reference.

    Caches have shape ``(num_blocks, n_kv_head, block_size, d_head)``; each
    token is written to the (block, offset) slot derived from its entry in
    ``slot_mapping``.
    """
    # Set random seed for reproducibility
    torch.manual_seed(42)

    # Create CPU tensors for reference implementation
    norm = torch.sqrt(torch.tensor(d_head))
    key_cpu = torch.randn(num_tokens, n_kv_head, d_head) / norm
    value_cpu = torch.randn(num_tokens, n_kv_head, d_head) / norm

    key_cache_cpu = torch.zeros(num_blocks, n_kv_head, block_size, d_head)
    value_cache_cpu = torch.zeros(num_blocks, n_kv_head, block_size, d_head)
    # randperm yields distinct slots, so no two tokens share a cache entry.
    slot_mapping_cpu = torch.randperm(num_blocks * block_size)[:num_tokens]

    # Run reference implementation on CPU
    block_indices = torch.div(slot_mapping_cpu,
                              block_size,
                              rounding_mode="floor")
    block_offsets = slot_mapping_cpu % block_size

    for i in range(num_tokens):
        block_idx = block_indices[i]
        block_offset = block_offsets[i]
        key_cache_cpu[block_idx, :, block_offset, :] = key_cpu[i]
        value_cache_cpu[block_idx, :, block_offset, :] = value_cpu[i]

    # Create XLA device tensors
    device = torch.device('xla')
    key = key_cpu.to(device)
    value = value_cpu.to(device)
    key_cache = torch.zeros_like(key_cache_cpu, device=device)
    value_cache = torch.zeros_like(value_cache_cpu, device=device)
    slot_mapping = slot_mapping_cpu.to(device)

    # Run vectorized implementation on XLA device
    reshape_and_cache(key, value, key_cache, value_cache, slot_mapping)

    # Move results back to CPU for comparison
    key_cache_result = key_cache.cpu()
    value_cache_result = value_cache.cpu()

    # Assert results match
    torch.testing.assert_close(key_cache_result,
                               key_cache_cpu,
                               rtol=1e-5,
                               atol=1e-5)
    torch.testing.assert_close(value_cache_result,
                               value_cache_cpu,
                               rtol=1e-5,
                               atol=1e-5)
tests/neuron/1_core/test_layernorm.py
0 → 100644
View file @
469e903b
# SPDX-License-Identifier: Apache-2.0
import
pytest
import
torch
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.platforms
import
current_platform
@pytest.mark.parametrize("num_tokens,hidden_size,add_residual,dtype", [
    (7, 8, False, torch.half),
    (83, 768, False, torch.half),
    (83, 768, True, torch.half),
    (83, 768, True, torch.bfloat16),
    (83, 768, True, torch.float32),
])
@torch.inference_mode()
def test_rms_norm(
    num_tokens: int,
    hidden_size: int,
    add_residual: bool,
    dtype: torch.dtype,
) -> None:
    """Compare ``RMSNorm`` on the XLA device with ``forward_native`` on CPU."""
    import torch_xla.core.xla_model as xm

    device = xm.xla_device()
    current_platform.seed_everything(0)
    torch.set_default_device("cpu")
    layer = RMSNorm(hidden_size).to(dtype=dtype)
    layer.weight.data.normal_(mean=1.0, std=0.1)
    scale = 1 / (2 * hidden_size)
    x = torch.randn(num_tokens, hidden_size, dtype=dtype).to(device=device)
    x *= scale
    residual = torch.randn_like(x) * scale if add_residual else None

    # The reference path runs entirely on CPU copies of the inputs.
    residual_cpu = residual.cpu() if add_residual else None
    ref_out = layer.to(device="cpu").forward_native(x.cpu(), residual_cpu)
    assert x.is_xla, "input tensor under testing is expected to be XLA tensor."
    out = layer.to(device=device)(x, residual)

    # NOTE(woosuk): LayerNorm operators (including RMS) typically have larger
    # numerical errors than other operators because they involve reductions.
    # Therefore, we use a larger tolerance.
    if add_residual:
        # With a residual both outputs are indexable pairs; compare each item.
        assert out[0].is_xla, "output tensor is expected to be XLA tensor"
        torch.testing.assert_close(out[0].cpu(),
                                   ref_out[0],
                                   atol=1e-2,
                                   rtol=1e-2)
        torch.testing.assert_close(out[1].cpu(),
                                   ref_out[1],
                                   atol=1e-2,
                                   rtol=1e-2)
    else:
        assert out.is_xla, "output tensor is expected to be XLA tensor"
        torch.testing.assert_close(out.cpu(), ref_out, atol=1e-2, rtol=1e-2)
tests/neuron/1_core/test_logits_processor.py
0 → 100644
View file @
469e903b
# SPDX-License-Identifier: Apache-2.0
import
random
from
unittest.mock
import
patch
import
pytest
import
torch
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.model_executor.utils
import
set_random_seed
from
vllm.sequence
import
SamplingParams
,
SequenceData
,
SequenceGroupMetadata
from
vllm.utils
import
is_pin_memory_available
class MockLogitsProcessor(LogitsProcessor):
    """``LogitsProcessor`` that returns pre-set logits instead of computing them.

    During ``forward`` the module-level ``_prune_hidden_states`` is replaced
    by a pass-through and ``LogitsProcessor._get_logits`` is replaced by a
    function returning ``self.fake_logits``.
    """

    def __init__(self, vocab_size: int, scale: float,
                 fake_logits: torch.Tensor):
        super().__init__(vocab_size=vocab_size, scale=scale)
        # Keep a private copy so later in-place edits by the caller do not
        # change what this processor returns.
        self.fake_logits = fake_logits.clone()

    def forward(self, *args, **kwargs):
        """Run the parent ``forward`` with pruning and logit computation patched."""
        with patch(
                "vllm.model_executor.layers.logits_processor._prune_hidden_states",
                lambda x, y: x
        ), patch(
                "vllm.model_executor.layers.logits_processor.LogitsProcessor._get_logits",
                lambda *args, **kwargs: self.fake_logits):
            return super().forward(*args, **kwargs)
def _prepare_test(
        batch_size: int
) -> tuple[torch.Tensor, torch.Tensor, MockLogitsProcessor]:
    """Build the inputs shared by the logits-processor tests.

    Returns:
        A ``(input_tensor, fake_logits, logits_processor)`` tuple:
        ``input_tensor`` is a random float16 tensor of shape
        ``(batch_size, 1024)``, ``fake_logits`` is a
        ``(batch_size, vocab_size)`` tensor filled with ``1e-2``, and
        ``logits_processor`` is a ``MockLogitsProcessor`` with scale 0.5
        that holds a copy of ``fake_logits``.
    """
    vocab_size = 32000
    input_tensor = torch.rand((batch_size, 1024), dtype=torch.float16)
    fake_logits = torch.full((batch_size, vocab_size),
                             1e-2,
                             dtype=input_tensor.dtype)
    # Use ``vocab_size`` rather than repeating the literal so the processor
    # and the fake logits cannot drift apart.
    logits_processor = MockLogitsProcessor(vocab_size, 0.5, fake_logits)
    return input_tensor, fake_logits, logits_processor
# Seeds used to parametrize the logits-processor test.
RANDOM_SEEDS = list(range(8))


@pytest.mark.parametrize("seed", RANDOM_SEEDS)
def test_logits_processors(seed: int):
    """Check that an untouched logits column is only scaled by the processor.

    A per-request ``pick_ith`` processor overwrites one logit with ``inf``;
    column 1 of the output is then compared with the fake logits multiplied
    by the processor's ``scale``.
    """
    import torch_xla.core.xla_model as xm

    device = xm.xla_device()
    set_random_seed(seed)
    torch.set_default_device("cpu")
    batch_size = random.randint(1, 256)
    input_tensor, fake_logits, logits_processor = _prepare_test(batch_size)

    # This sample logits processor gives infinite score to the i-th token,
    # where i is the length of the input sequence.
    # We therefore expect the output token sequence to be [0, 1, 2, ...]
    def pick_ith(token_ids, logits):
        logits[len(token_ids)] = float("inf")
        return logits

    seq_group_metadata_list = []
    seq_lens = []
    for i in range(batch_size):
        seq_group_metadata_list.append(
            SequenceGroupMetadata(
                request_id=f"test_{i}",
                is_prompt=True,
                seq_data={0: SequenceData.from_seqs([1, 2, 3])},
                sampling_params=SamplingParams(temperature=0,
                                               logits_processors=[pick_ith]),
                block_tables={0: [1]},
            ))
        seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len())

    sampling_metadata = SamplingMetadata.prepare(
        seq_group_metadata_list,
        seq_lens,
        query_lens=seq_lens,
        device=device,
        pin_memory=is_pin_memory_available())
    logits_processor_output = logits_processor(
        lm_head=None,
        hidden_states=input_tensor,
        sampling_metadata=sampling_metadata)

    # Apply the same scale to the local copy to obtain the expected values.
    fake_logits *= logits_processor.scale
    torch.testing.assert_close(logits_processor_output[:, 1],
                               fake_logits[:, 1],
                               rtol=1e-4,
                               atol=0.0)
tests/neuron/test_prefix_prefill.py
→
tests/neuron/
1_core/
test_prefix_prefill.py
View file @
469e903b
...
...
@@ -107,7 +107,7 @@ def ref_masked_attention(
masked_score
,
dim
=-
1
,
return_max_reduce
=
True
)
else
:
norm_score
=
ref_softmax
(
masked_score
,
dim
=-
1
)
out
=
torch
.
einsum
(
"hqk,khd->qhd"
,
norm_score
,
value
)
out
=
torch
.
einsum
(
"hqk,khd->qhd"
,
norm_score
.
to
(
value
.
dtype
)
,
value
)
if
return_max_reduce
:
return
(
out
,
...
...
@@ -118,7 +118,7 @@ def ref_masked_attention(
scaled_qk
,
)
else
:
return
out
return
(
out
,
)
def
ref_context_attention
(
...
...
@@ -128,8 +128,6 @@ def ref_context_attention(
query_lens
,
seq_lens
,
head_size
,
num_kv_heads
,
num_heads
,
num_queries_per_kv
,
return_max_reduce
=
False
,
):
...
...
@@ -146,18 +144,19 @@ def ref_context_attention(
attn_mask
=
torch
.
logical_not
(
attn_mask
)
attn_mask
=
attn_mask
.
float
()
*
-
30000
output
,
cached_max
,
cached_sum_reciprocal
,
lse
,
masked_score
,
scaled_qk
=
(
ref_masked_attention
(
query
,
key
,
value
,
scale
,
attn_mask
,
return_max_reduce
=
return_max_reduce
,
))
output
,
*
debug_tensors
=
ref_masked_attention
(
query
,
key
,
value
,
scale
,
attn_mask
,
return_max_reduce
=
return_max_reduce
,
)
output
=
output
.
unsqueeze
(
1
)
if
return_max_reduce
:
cached_max
,
cached_sum_reciprocal
,
lse
,
masked_score
,
scaled_qk
=
(
debug_tensors
)
return
(
output
,
cached_max
,
...
...
@@ -170,65 +169,22 @@ def ref_context_attention(
return
output
@
pytest
.
mark
.
parametrize
(
"block_size, large_tile_size"
,
[
(
32
,
2048
),
# 64 blocks
(
32
,
4096
),
# 128 blocks
(
32
,
8192
),
# 256 blocks
(
64
,
8192
),
# 128 blocks
],
)
@
pytest
.
mark
.
parametrize
(
"num_heads,num_queries_per_kv,head_size,mixed_precision"
,
[
(
4
,
2
,
8
,
False
),
(
4
,
2
,
8
,
True
),
(
32
,
8
,
64
,
True
),
(
16
,
2
,
128
,
True
),
],
)
@
torch
.
inference_mode
()
def
test_contexted_kv_attention
(
num_heads
:
int
,
num_queries_per_kv
:
int
,
head_size
:
int
,
block_size
:
int
,
large_tile_size
,
mixed_precision
:
bool
,
)
->
None
:
import
os
import
torch_xla.core.xla_model
as
xm
from
vllm.attention.ops.nki_flash_attn
import
flash_attn_varlen_nkifunc
assert
large_tile_size
%
block_size
==
0
device
=
xm
.
xla_device
()
compiler_flags
=
[
"--model-type=transformer -O1"
,
"--internal-hlo2tensorizer-options='--verify-hlo'"
,
"--retry_failed_compilation"
,
]
compiler_flags_str
=
" "
.
join
(
compiler_flags
)
os
.
environ
[
"NEURON_CC_FLAGS"
]
=
compiler_flags_str
torch
.
manual_seed
(
0
)
torch
.
set_printoptions
(
sci_mode
=
False
)
min_ctx_len
=
32
max_ctx_len
=
1024
min_query_len
=
16
max_query_len
=
512
prefill_batch_size
=
4
decode_batch_size
=
12
def
sample_inputs
(
prefill_batch_size
,
decode_batch_size
,
min_query_len
,
max_query_len
,
min_ctx_len
,
max_ctx_len
,
block_size
,
num_heads
,
num_kv_heads
,
head_size
,
dtype
,
):
batch_size
=
prefill_batch_size
+
decode_batch_size
max_model_len
=
(
max_query_len
+
max_ctx_len
)
*
4
max_block_per_request
=
max_model_len
//
block_size
dtype
=
torch
.
float32
cache_size
=
(
batch_size
*
max_block_per_request
)
+
2
prefill_ctx_lens
=
torch
.
randint
(
min_ctx_len
,
max_ctx_len
+
1
,
(
prefill_batch_size
,
),
...
...
@@ -244,7 +200,6 @@ def test_contexted_kv_attention(
dtype
=
torch
.
long
,
).
tolist
()
+
[
1
for
_
in
range
(
decode_batch_size
)]
seq_lens
=
[
a
+
b
for
a
,
b
in
zip
(
query_lens
,
ctx_lens
)]
num_kv_heads
=
num_heads
//
num_queries_per_kv
num_tokens
=
sum
(
query_lens
)
query
=
torch
.
empty
(
num_tokens
,
num_heads
,
head_size
,
dtype
=
dtype
)
...
...
@@ -304,171 +259,259 @@ def test_contexted_kv_attention(
cur_ctx
+=
block_size
block_id
+=
1
(
output_ref
,
cached_max
,
cached_sum_reciprocal
,
lse
,
masked_score
,
scaled_qk
,
)
=
ref_context_attention
(
return
(
query
,
k
,
v
,
k_cache
,
v_cache
,
block_table
,
key
,
value
,
query_lens
,
seq_lens
,
head_size
,
num_kv_heads
,
num_heads
,
num_queries_per_kv
,
return_max_reduce
=
True
,
)
# build neuron program
return_debug_tensors
=
False
B_P_SIZE
=
128
LARGE_TILE_SZ
=
large_tile_size
def
get_active_block_tables
(
block_tables
,
query_lens
,
seq_lens
,
block_size
,
num_blocks
):
context_lens
=
seq_lens
-
query_lens
blocks_per_seq
=
(
context_lens
+
block_size
-
1
)
//
block_size
num_seqs
=
len
(
seq_lens
)
active_blocks
:
list
[
int
]
=
[]
for
seq_id
in
range
(
num_seqs
):
active_blocks
=
(
active_blocks
+
block_tables
[
seq_id
,
:
blocks_per_seq
[
seq_id
]].
tolist
())
return
F
.
pad
(
torch
.
tensor
(
active_blocks
),
(
0
,
num_blocks
-
len
(
active_blocks
)),
"constant"
,
0
,
)
def
ceil_div
(
a
,
b
):
return
(
a
+
b
-
1
)
//
b
def
pad_to_multiple
(
a
,
b
):
return
ceil_div
(
a
,
b
)
*
b
def
pad_to_next_power_of_2
(
a
):
assert
a
>
0
return
2
**
int
(
a
-
1
).
bit_length
()
# calculate input shapes
max_num_queries
=
pad_to_multiple
(
sum
(
query_lens
),
block_size
)
max_num_queries
=
pad_to_next_power_of_2
(
max_num_queries
)
head_size_padded
=
B_P_SIZE
assert
head_size_padded
>=
head_size
context_lens
=
torch
.
tensor
(
seq_lens
)
-
torch
.
tensor
(
query_lens
)
num_active_blocks
=
ceil_div
(
context_lens
,
block_size
).
sum
().
item
()
num_active_blocks
=
pad_to_multiple
(
num_active_blocks
,
LARGE_TILE_SZ
//
block_size
)
context_kv_len
=
num_active_blocks
*
block_size
assert
(
context_kv_len
%
LARGE_TILE_SZ
==
0
),
f
"invalid context_kv_len=
{
context_kv_len
}
"
# pad QKV tensors
pad_dims
=
(
0
,
head_size_padded
-
query
.
shape
[
2
],
0
,
0
,
def
get_active_block_tables
(
block_tables
,
query_lens
,
seq_lens
,
block_size
,
num_blocks
):
context_lens
=
seq_lens
-
query_lens
blocks_per_seq
=
(
context_lens
+
block_size
-
1
)
//
block_size
num_seqs
=
len
(
seq_lens
)
active_blocks
:
list
[
int
]
=
[]
for
seq_id
in
range
(
num_seqs
):
active_blocks
=
(
active_blocks
+
block_tables
[
seq_id
,
:
blocks_per_seq
[
seq_id
]].
tolist
())
return
F
.
pad
(
torch
.
tensor
(
active_blocks
,
dtype
=
torch
.
int32
),
(
0
,
num_blocks
-
len
(
active_blocks
)),
"constant"
,
0
,
max_num_queries
-
query
.
shape
[
0
],
)
query
=
F
.
pad
(
query
,
pad_dims
,
"constant"
,
0
)
k
=
F
.
pad
(
k
,
pad_dims
,
"constant"
,
0
)
v
=
F
.
pad
(
v
,
pad_dims
,
"constant"
,
0
)
k_cache
=
F
.
pad
(
k_cache
,
(
0
,
head_size_padded
-
head_size
),
"constant"
,
0
)
v_cache
=
F
.
pad
(
v_cache
,
(
0
,
head_size_padded
-
head_size
),
"constant"
,
0
)
# permute QKV tensors
# query: (1, n_heads, d, seq_q)
# key: (1, n_kv_heads, d, seq_k)
# value: (1, n_kv_heads, seq_v, d)
query
=
query
.
unsqueeze
(
0
).
permute
(
0
,
2
,
3
,
1
).
contiguous
()
k
=
k
.
unsqueeze
(
0
).
permute
(
0
,
2
,
3
,
1
).
contiguous
()
v
=
v
.
unsqueeze
(
0
).
permute
(
0
,
2
,
1
,
3
).
contiguous
()
# transform block table
active_block_table
=
get_active_block_tables
(
block_table
,
torch
.
tensor
(
query_lens
),
torch
.
tensor
(
seq_lens
),
block_size
,
num_active_blocks
,
)
# Build attention masks
prior_mask
,
active_mask
=
(
BlockDiagonalCausalFromBottomRightMask
.
from_seqlens
(
query_lens
,
seq_lens
,
block_size
=
block_size
))
attn_mask
=
torch
.
concat
(
[
F
.
pad
(
prior_mask
,
(
0
,
context_kv_len
-
prior_mask
.
shape
[
1
],
0
,
max_num_queries
-
prior_mask
.
shape
[
0
],
),
"constant"
,
@
pytest
.
mark
.
parametrize
(
"prefill_batch_size,decode_batch_size,block_size,large_tile_size,num_heads,num_queries_per_kv,head_size,mixed_precision"
,
[
# Test minimal configurations (small block size)
(
1
,
199
,
1
,
512
,
4
,
2
,
8
,
False
),
# minimal block size, small dimensions
(
1
,
199
,
1
,
512
,
4
,
2
,
8
,
True
),
# same with mixed precision
# Test common/medium configurations
(
4
,
12
,
32
,
2048
,
32
,
8
,
64
,
False
),
# common case, larger heads
(
4
,
12
,
32
,
2048
,
16
,
4
,
32
,
True
),
# medium size, mixed precision, grouped-query attention (GQA)
# Test large configurations
(
4
,
12
,
256
,
8192
,
8
,
1
,
128
,
False
),
# large blocks, large head size
(
4
,
12
,
256
,
8192
,
64
,
8
,
64
,
True
),
# large blocks, many heads
# Test asymmetric configurations
(
2
,
24
,
64
,
4096
,
12
,
4
,
96
,
False
),
# varied batch sizes
(
8
,
8
,
128
,
2048
,
24
,
2
,
48
,
True
),
# balanced batches
# Test edge cases
(
1
,
128
,
16
,
1024
,
4
,
2
,
16
,
False
),
# large decode batch
(
16
,
4
,
8
,
1024
,
4
,
2
,
128
,
True
),
# large prefill batch
(
4
,
12
,
32
,
2048
,
16
,
1
,
32
,
True
),
# multi-head attention (MHA)
(
4
,
12
,
32
,
2048
,
16
,
16
,
32
,
True
),
# multi-query attention (MQA)
])
@
torch
.
inference_mode
()
def
test_contexted_kv_attention
(
monkeypatch
:
pytest
.
MonkeyPatch
,
prefill_batch_size
:
int
,
decode_batch_size
:
int
,
num_heads
:
int
,
num_queries_per_kv
:
int
,
head_size
:
int
,
block_size
:
int
,
large_tile_size
,
mixed_precision
:
bool
,
)
->
None
:
import
torch_xla.core.xla_model
as
xm
from
vllm.attention.ops.nki_flash_attn
import
(
flash_attn_varlen_nkifunc
,
reorder_context_mask
)
assert
large_tile_size
%
block_size
==
0
device
=
xm
.
xla_device
()
compiler_flags_str
=
" "
.
join
([
"-O1"
,
"--retry_failed_compilation"
,
])
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"NEURON_CC_FLAGS"
,
compiler_flags_str
)
torch
.
manual_seed
(
0
)
torch
.
set_printoptions
(
sci_mode
=
False
)
torch
.
set_default_device
(
"cpu"
)
dtype
=
torch
.
float32
min_ctx_len
=
32
max_ctx_len
=
1024
min_query_len
=
16
max_query_len
=
512
num_kv_heads
=
num_heads
//
num_queries_per_kv
(
query
,
k_active
,
v_active
,
k_cache
,
v_cache
,
block_table
,
key
,
value
,
query_lens
,
seq_lens
,
)
=
sample_inputs
(
prefill_batch_size
=
prefill_batch_size
,
decode_batch_size
=
decode_batch_size
,
min_query_len
=
min_query_len
,
max_query_len
=
max_query_len
,
min_ctx_len
=
min_ctx_len
,
max_ctx_len
=
max_ctx_len
,
block_size
=
block_size
,
num_heads
=
num_heads
,
num_kv_heads
=
num_kv_heads
,
head_size
=
head_size
,
dtype
=
dtype
,
)
output_ref
=
ref_context_attention
(
query
,
key
,
value
,
query_lens
,
seq_lens
,
head_size
,
num_queries_per_kv
,
return_max_reduce
=
False
,
)
# build neuron program
B_P_SIZE
=
128
assert
(
large_tile_size
>=
B_P_SIZE
),
f
"Expect
{
large_tile_size
=
}
to be larger than
{
B_P_SIZE
=
}
"
def
ceil_div
(
a
,
b
):
return
(
a
+
b
-
1
)
//
b
def
pad_to_multiple
(
a
,
b
):
return
ceil_div
(
a
,
b
)
*
b
def
pad_to_next_power_of_2
(
a
):
assert
a
>
0
return
2
**
int
(
a
-
1
).
bit_length
()
# calculate input shapes
max_num_queries
=
pad_to_next_power_of_2
(
sum
(
query_lens
))
context_lens
=
torch
.
tensor
(
seq_lens
)
-
torch
.
tensor
(
query_lens
)
num_active_blocks
=
ceil_div
(
context_lens
,
block_size
).
sum
().
item
()
num_active_blocks
=
pad_to_multiple
(
num_active_blocks
,
large_tile_size
//
block_size
)
context_kv_len
=
num_active_blocks
*
block_size
assert
(
context_kv_len
%
large_tile_size
==
0
),
f
"invalid context_kv_len=
{
context_kv_len
}
"
# pad QKV tensors
pad_dims
=
(
0
,
0
,
0
,
0
,
0
,
max_num_queries
-
query
.
shape
[
0
],
)
query
=
F
.
pad
(
query
,
pad_dims
,
"constant"
,
0
)
k
=
F
.
pad
(
k_active
,
pad_dims
,
"constant"
,
0
)
v
=
F
.
pad
(
v_active
,
pad_dims
,
"constant"
,
0
)
# permute QKV tensors
# query: (1, n_heads, d, seq_q)
# key: (1, n_kv_heads, d, seq_k)
# value: (1, n_kv_heads, seq_v, d)
query
=
query
.
unsqueeze
(
0
).
permute
(
0
,
2
,
3
,
1
).
contiguous
()
k
=
k
.
unsqueeze
(
0
).
permute
(
0
,
2
,
3
,
1
).
contiguous
()
v
=
v
.
unsqueeze
(
0
).
permute
(
0
,
2
,
1
,
3
).
contiguous
()
k_cache
=
k_cache
.
permute
(
0
,
2
,
1
,
3
).
contiguous
()
v_cache
=
v_cache
.
permute
(
0
,
2
,
1
,
3
).
contiguous
()
# transform block table
active_block_table
=
get_active_block_tables
(
block_table
.
cpu
(),
torch
.
tensor
(
query_lens
).
cpu
(),
torch
.
tensor
(
seq_lens
).
cpu
(),
block_size
,
num_active_blocks
,
)
# Build attention masks
prior_mask
,
active_mask
=
(
BlockDiagonalCausalFromBottomRightMask
.
from_seqlens
(
query_lens
,
seq_lens
,
block_size
=
block_size
))
prior_mask_padded
=
F
.
pad
(
prior_mask
,
(
0
,
).
bool
(),
F
.
pad
(
active_mask
,
(
0
,
max_num_queries
-
active_mask
.
shape
[
1
],
0
,
max_num_queries
-
active_mask
.
shape
[
0
],
),
"constant"
,
context_kv_len
-
prior_mask
.
shape
[
1
],
0
,
).
bool
(),
],
dim
=
1
,
)
input_args
=
(
query
.
to
(
device
=
device
),
k
.
to
(
device
=
device
),
v
.
to
(
device
=
device
),
k_cache
.
to
(
device
=
device
),
v_cache
.
to
(
device
=
device
),
active_block_table
.
to
(
torch
.
int32
).
to
(
device
=
device
),
attn_mask
.
to
(
device
=
device
),
)
input_kwargs
=
dict
(
n_kv_head
=
num_kv_heads
,
head_size
=
head_size
,
mixed_precision
=
mixed_precision
,
LARGE_TILE_SZ
=
LARGE_TILE_SZ
,
return_debug_tensors
=
return_debug_tensors
,
)
max_num_queries
-
prior_mask
.
shape
[
0
],
),
"constant"
,
0
,
).
bool
()
active_mask_padded
=
F
.
pad
(
active_mask
,
(
0
,
max_num_queries
-
active_mask
.
shape
[
1
],
0
,
max_num_queries
-
active_mask
.
shape
[
0
],
),
"constant"
,
0
,
).
bool
()
attn_mask
=
torch
.
concat
([
prior_mask_padded
,
active_mask_padded
],
dim
=
1
)
attn_mask
=
reorder_context_mask
(
attn_mask
,
large_tile_size
,
block_size
)
input_args
=
(
query
.
to
(
device
=
device
),
k
.
to
(
device
=
device
),
v
.
to
(
device
=
device
),
k_cache
.
to
(
device
=
device
),
v_cache
.
to
(
device
=
device
),
active_block_table
.
to
(
device
=
device
),
attn_mask
.
to
(
device
=
device
),
)
input_kwargs
=
dict
(
n_kv_head
=
num_kv_heads
,
head_size
=
head_size
,
mixed_precision
=
mixed_precision
,
LARGE_TILE_SZ
=
large_tile_size
,
)
if
return_debug_tensors
:
output_nki
,
*
debug_tensors
=
flash_attn_varlen_nkifunc
(
*
input_args
,
**
input_kwargs
)
else
:
output_nki
=
flash_attn_varlen_nkifunc
(
*
input_args
,
**
input_kwargs
)
debug_tensors
=
[]
debug_tensors
=
[
torch
.
tensor
(
dt
).
cpu
()
for
dt
in
debug_tensors
]
num_actual_tokens
=
sum
(
query_lens
)
# - o: shape (bs, n_heads, seq_q, d) -> (bs, seq_q, n_heads, d)
output_nki
=
output_nki
.
cpu
().
permute
(
0
,
2
,
1
,
3
)[:,
:,
:,
:
head_size
]
output_nki
=
output_nki
[
0
,
:
num_actual_tokens
,
:,
:]
output_ref_padded
=
F
.
pad
(
output_ref
,
(
0
,
0
,
0
,
0
,
0
,
0
,
0
,
max_num_queries
-
output_ref
.
shape
[
0
]),
"constant"
,
0
,
)
output_ref
=
output_ref_padded
.
transpose
(
0
,
1
)[
0
,
:
num_actual_tokens
,
:,
:]
num_actual_tokens
=
sum
(
query_lens
)
# - o: shape (bs, n_heads, seq_q, d) -> (bs, seq_q, n_heads, d)
output_nki
=
output_nki
.
cpu
().
permute
(
0
,
2
,
1
,
3
)
output_nki
=
output_nki
[
0
,
:
num_actual_tokens
,
:,
:]
output_ref_padded
=
F
.
pad
(
output_ref
,
(
0
,
0
,
0
,
0
,
0
,
0
,
0
,
max_num_queries
-
output_ref
.
shape
[
0
]),
"constant"
,
0
,
)
output_ref
=
output_ref_padded
.
transpose
(
0
,
1
)[
0
,
:
num_actual_tokens
,
:,
:]
torch
.
testing
.
assert_close
(
output_nki
,
output_ref
,
atol
=
1e-2
,
rtol
=
0
)
torch
.
testing
.
assert_close
(
output_nki
,
output_ref
,
atol
=
1e-2
,
rtol
=
0
)
tests/neuron/1_core/test_rotary_embedding.py
0 → 100644
View file @
469e903b
# SPDX-License-Identifier: Apache-2.0
"""
Tests for miscellaneous utilities
"""
import
pytest
import
torch
from
vllm.model_executor.layers.rotary_embedding
import
RotaryEmbedding
from
vllm.platforms
import
current_platform
@pytest.mark.parametrize(
    "max_position,is_neox_style,rotary_dim,head_size,seq_len", [
        (16, False, 32, 32, 1024),
        (16, False, 32, 128, 1024),
        (16, True, 32, 32, 1024),
        (16, True, 32, 128, 1024),
    ])
def test_rotary_embedding_opcheck(max_position, is_neox_style, rotary_dim,
                                  head_size, seq_len):
    """Compare ``RotaryEmbedding.forward_neuron`` (XLA) with ``forward_native`` (CPU)."""
    import torch_xla.core.xla_model as xm

    device = xm.xla_device()
    current_platform.seed_everything(0)
    torch.set_default_device("cpu")

    batch_size = 1
    base = 10000
    num_heads = 8

    rot = RotaryEmbedding(head_size, rotary_dim, max_position, base,
                          is_neox_style, torch.float32)

    positions = torch.randint(0,
                              max_position, (batch_size, seq_len),
                              device="cpu")
    query = torch.randn(batch_size,
                        seq_len,
                        num_heads * head_size,
                        dtype=torch.float32,
                        device="cpu")
    key = torch.randn_like(query)

    assert positions.is_cpu, \
        "reference input tensor is expected to be CPU tensor."
    ref_query, ref_key = rot.to(device="cpu").forward_native(
        positions, query, key)
    # The device path receives copies of the same inputs moved to XLA.
    out_query, out_key = rot.to(device=device).forward_neuron(
        positions.to(device=device), query.to(device=device),
        key.to(device=device))
    assert out_query.is_xla and out_key.is_xla, \
        "output tensor is expected to be XLA tensor"
    torch.testing.assert_close(out_query.cpu(),
                               ref_query,
                               atol=1e-2,
                               rtol=1e-2)
    torch.testing.assert_close(out_key.cpu(), ref_key, atol=1e-2, rtol=1e-2)
tests/neuron/2_core/test_comm_ops.py
0 → 100644
View file @
469e903b
# SPDX-License-Identifier: Apache-2.0
import
functools
from
typing
import
Callable
from
unittest.mock
import
patch
import
pytest
import
torch
import
torch_xla.distributed.xla_multiprocessing
as
xmp
from
typing_extensions
import
ParamSpec
from
vllm.distributed.communication_op
import
(
tensor_model_parallel_all_gather
,
tensor_model_parallel_all_reduce
)
from
vllm.distributed.parallel_state
import
(
ensure_model_parallel_initialized
,
init_distributed_environment
)
from
vllm.utils
import
get_distributed_init_method
,
get_open_port
_P
=
ParamSpec
(
"_P"
)
def reinitialize_neuron_runtime(f: Callable[_P, None]) -> Callable[_P, None]:
    """Decorator to reinitialize the Neuron Runtime before executing a test.
    This is necessary for distributed tests which need to reallocate Neuron
    Cores to separate subprocesses.

    The runtime is initialized again after the wrapped test finishes, also
    when the test raises.
    """

    @functools.wraps(f)
    def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> None:
        runtime = torch.classes.neuron.Runtime()
        runtime.initialize()
        runtime.unsafe_close()

        try:
            f(*args, **kwargs)
        finally:
            # Re-initialize even on failure so a failing test does not leave
            # the runtime closed for whatever runs next in this process.
            runtime.initialize()

    return wrapper
def all_gather_test_worker(index, tp_degree, distributed_init_method):
    """Check ``tensor_model_parallel_all_gather`` from one spawned worker.

    Every worker builds the same list of ``tp_degree`` float32 tensors of
    shape ``[2, 3, 4]`` on the "xla" device (tensor ``r`` is an ``arange``
    scaled by ``r + 1``), contributes the one selected by its ``index``, and
    asserts that the gathered result equals the concatenation of all of them
    along the last dimension.

    Args:
        index: Worker index; passed twice to ``init_distributed_environment``
            (presumably as rank and local rank -- confirm against its
            signature).
        tp_degree: Number of tensor-parallel workers.
        distributed_init_method: Init-method string forwarded to
            ``init_distributed_environment``.
    """
    init_distributed_environment(tp_degree,
                                 index,
                                 distributed_init_method,
                                 index,
                                 backend="xla")
    ensure_model_parallel_initialized(tp_degree, 1)

    gather_dim = -1
    rank_count = 3
    shape = list(range(2, rank_count + 2))

    # Number of elements needed to fill a tensor of ``shape``.
    numel = 1
    for extent in shape:
        numel *= extent

    per_worker = []
    for worker in range(tp_degree):
        flat = torch.arange(numel, dtype=torch.float32, device="xla")
        per_worker.append(flat.reshape(shape) * (worker + 1))

    expected = torch.cat(per_worker, dim=gather_dim)
    gathered = tensor_model_parallel_all_gather(per_worker[index % tp_degree],
                                                gather_dim)
    torch.testing.assert_close(gathered, expected)
def all_reduce_test_worker(index, tp_degree, distributed_init_method):
    """Check ``tensor_model_parallel_all_reduce`` from one spawned worker.

    Every worker builds the same list of ``tp_degree`` 8-element float32
    tensors on the "xla" device (tensor ``r`` is an ``arange`` scaled by
    ``r + 1``), contributes the one selected by its ``index``, and asserts
    that the reduced result equals the element-wise sum of all of them.

    Args:
        index: Worker index; passed twice to ``init_distributed_environment``
            (presumably as rank and local rank -- confirm against its
            signature).
        tp_degree: Number of tensor-parallel workers.
        distributed_init_method: Init-method string forwarded to
            ``init_distributed_environment``.
    """
    init_distributed_environment(tp_degree,
                                 index,
                                 distributed_init_method,
                                 index,
                                 backend="xla")
    ensure_model_parallel_initialized(tp_degree, 1)

    length = 8
    per_worker = []
    for worker in range(tp_degree):
        ramp = torch.arange(length, dtype=torch.float32, device="xla")
        per_worker.append(ramp * (worker + 1))

    stacked = torch.stack(per_worker, dim=0)
    expected = torch.sum(stacked, dim=0)

    reduced = tensor_model_parallel_all_reduce(per_worker[index % tp_degree])
    torch.testing.assert_close(reduced, expected)
@pytest.mark.parametrize("tp_size", [2])
@pytest.mark.parametrize("test_target",
                         [all_reduce_test_worker, all_gather_test_worker])
@reinitialize_neuron_runtime
def test_neuron_multi_process_tensor_parallel(monkeypatch, tp_size,
                                              test_target):
    """Spawn the given collective-op worker across ``tp_size`` processes.

    While ``torch_xla._XLAC._xla_runtime_is_initialized`` is patched to
    return ``False``, the test sets the V1 and Neuron environment variables
    and hands ``test_target`` to ``xmp.spawn`` together with the
    tensor-parallel size and a local init method.
    """
    runtime_check = 'torch_xla._XLAC._xla_runtime_is_initialized'
    with patch(runtime_check, return_value=False):
        port = get_open_port()
        distributed_init_method = get_distributed_init_method(
            "127.0.0.1", port)

        # One device per spawned process, e.g. "1,1" for tp_size == 2.
        devices_per_process = ','.join(['1'] * tp_size)

        monkeypatch.setenv("VLLM_USE_V1", "1")
        monkeypatch.setenv("NEURONCORE_NUM_DEVICES", str(tp_size))
        monkeypatch.setenv("NEURON_PJRT_PROCESSES_NUM_DEVICES",
                           devices_per_process)

        worker_args = (tp_size, distributed_init_method)
        xmp.spawn(test_target, args=worker_args)
tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
View file @
469e903b
# SPDX-License-Identifier: Apache-2.0
from
typing
import
Iterable
,
List
,
Optional
,
Tuple
,
Union
from
collections.abc
import
Iterable
from
typing
import
Optional
,
Union
import
torch
import
torch.nn
as
nn
from
vllm.attention
import
AttentionMetadata
from
vllm.config
import
VllmConfig
from
vllm.model_executor.layers.pooler
import
Pooler
,
PoolingType
from
vllm.model_executor.models.gemma2
import
Gemma2Model
...
...
@@ -37,16 +37,12 @@ class MyGemma2Embedding(nn.Module):
self
,
input_ids
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
kv_caches
:
List
[
torch
.
Tensor
],
attn_metadata
:
AttentionMetadata
,
intermediate_tensors
:
Optional
[
IntermediateTensors
]
=
None
,
inputs_embeds
:
Optional
[
torch
.
Tensor
]
=
None
,
)
->
Union
[
torch
.
Tensor
,
IntermediateTensors
]:
hidden_states
=
self
.
model
(
input_ids
,
positions
,
kv_caches
,
attn_metadata
,
intermediate_tensors
=
intermediate_tensors
,
inputs_embeds
=
inputs_embeds
,
)
...
...
@@ -64,7 +60,7 @@ class MyGemma2Embedding(nn.Module):
)
->
Optional
[
PoolerOutput
]:
return
self
.
_pooler
(
hidden_states
,
pooling_metadata
)
def
load_weights
(
self
,
weights
:
Iterable
[
T
uple
[
str
,
torch
.
Tensor
]]):
def
load_weights
(
self
,
weights
:
Iterable
[
t
uple
[
str
,
torch
.
Tensor
]]):
weights
=
self
.
hf_to_vllm_mapper
.
apply
(
weights
)
weights
=
((
name
,
data
)
for
name
,
data
in
weights
...
...
tests/plugins_tests/conftest.py
0 → 100644
View file @
469e903b
# SPDX-License-Identifier: Apache-2.0
import
pytest
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
    """Force ``VLLM_USE_V1=0`` for every test in this module.

    The module is V0 only, so this autouse fixture sets the variable
    through ``monkeypatch`` for each test function.
    """
    variable, disabled = 'VLLM_USE_V1', '0'
    monkeypatch.setenv(variable, disabled)
\ No newline at end of file
tests/plugins_tests/test_platform_plugins.py
View file @
469e903b
# SPDX-License-Identifier: Apache-2.0
import
pytest
import
torch
from
tests.kernels.utils
import
override_backend_env_variable
from
vllm.attention.selector
import
get_attn_backend
from
vllm.utils
import
STR_INVALID_VAL
from
vllm.utils
import
STR_BACKEND_ENV_VAR
,
STR_INVALID_VAL
def
test_platform_plugins
():
...
...
@@ -25,8 +25,9 @@ def test_platform_plugins():
f
" is loaded. The first import:
\n
{
_init_trace
}
"
)
def
test_oot_attention_backend
(
monkeypatch
):
def
test_oot_attention_backend
(
monkeypatch
:
pytest
.
MonkeyPatch
):
# ignore the backend env variable if it is set
override_backend_env_variable
(
monkeypatch
,
STR_INVALID_VAL
)
backend
=
get_attn_backend
(
16
,
torch
.
float16
,
torch
.
float16
,
16
,
False
)
assert
backend
.
get_name
()
==
"Dummy_Backend"
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
STR_BACKEND_ENV_VAR
,
STR_INVALID_VAL
)
backend
=
get_attn_backend
(
16
,
torch
.
float16
,
torch
.
float16
,
16
,
False
)
assert
backend
.
get_name
()
==
"Dummy_Backend"
tests/plugins_tests/test_scheduler_plugins.py
View file @
469e903b
# SPDX-License-Identifier: Apache-2.0
import
pytest
from
vllm.core.scheduler
import
Scheduler
from
vllm.engine.arg_utils
import
EngineArgs
from
vllm.engine.llm_engine
import
LLMEngine
from
vllm.sampling_params
import
SamplingParams
from
vllm.v1.core.sched.scheduler
import
Scheduler
as
V1Scheduler
from
vllm.v1.engine.llm_engine
import
LLMEngine
as
V1LLMEngine
class DummyV0Scheduler(Scheduler):
    """Scheduler subclass whose ``schedule`` always raises.

    The fixed message lets a test recognise that this class, rather than
    the default scheduler, was the one invoked.
    """

    def schedule(self):
        """Raise unconditionally with a recognisable message."""
        message = "Exception raised by DummyV0Scheduler"
        raise Exception(message)
class
DummyScheduler
(
Scheduler
):
class
Dummy
V1
Scheduler
(
V1
Scheduler
):
def
schedule
(
self
):
raise
Exception
(
"Exception raised by DummyScheduler"
)
raise
Exception
(
"Exception raised by DummyV1Scheduler"
)
def test_scheduler_plugins_v0(monkeypatch: pytest.MonkeyPatch):
    """Check that a custom ``scheduler_cls`` is used by the V0 engine.

    With ``VLLM_USE_V1=0``, an engine is built with ``DummyV0Scheduler``,
    one request is added and a single step is run; the exception raised
    inside that block must carry the dummy scheduler's message.
    """
    expected_message = "Exception raised by DummyV0Scheduler"

    with monkeypatch.context() as env:
        env.setenv("VLLM_USE_V1", "0")

        with pytest.raises(Exception) as raised:
            engine = LLMEngine.from_engine_args(engine_args=EngineArgs(
                model="facebook/opt-125m",
                enforce_eager=True,  # reduce test time
                scheduler_cls=DummyV0Scheduler,
            ))
            engine.add_request("0", "foo", SamplingParams(max_tokens=1))
            engine.step()

        assert str(raised.value) == expected_message
def
test_scheduler_plugins
():
import
pytest
from
vllm.engine.arg_utils
import
EngineArgs
from
vllm.engine.llm_engine
import
LLMEngine
from
vllm.sampling_params
import
SamplingParams
def
test_scheduler_plugins_v1
(
monkeypatch
:
pytest
.
MonkeyPatch
):
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
# Explicitly turn off engine multiprocessing so
# that the scheduler runs in this process
m
.
setenv
(
"VLLM_ENABLE_V1_MULTIPROCESSING"
,
"0"
)
with
pytest
.
raises
(
Exception
)
as
exception_info
:
with
pytest
.
raises
(
Exception
)
as
exception_info
:
engine_args
=
EngineArgs
(
model
=
"facebook/opt-125m"
,
enforce_eager
=
True
,
# reduce test time
scheduler_cls
=
DummyScheduler
,
)
engine_args
=
EngineArgs
(
model
=
"facebook/opt-125m"
,
enforce_eager
=
True
,
# reduce test time
scheduler_cls
=
Dummy
V1
Scheduler
,
)
engine
=
LLMEngine
.
from_engine_args
(
engine_args
=
engine_args
)
engine
=
V1
LLMEngine
.
from_engine_args
(
engine_args
=
engine_args
)
sampling_params
=
SamplingParams
(
max_tokens
=
1
)
engine
.
add_request
(
"0"
,
"foo"
,
sampling_params
)
engine
.
step
()
sampling_params
=
SamplingParams
(
max_tokens
=
1
)
engine
.
add_request
(
"0"
,
"foo"
,
sampling_params
)
engine
.
step
()
assert
str
(
exception_info
.
value
)
==
"Exception raised by DummyScheduler"
assert
str
(
exception_info
.
value
)
==
"Exception raised by DummyV1Scheduler"
tests/prefix_caching/test_disable_sliding_window.py
View file @
469e903b
...
...
@@ -36,7 +36,10 @@ def test_disable_sliding_window(model_len_len, ):
del
vllm_disabled_model
cleanup_dist_env_and_memory
()
vllm_enabled_model
=
LLM
(
model
,
disable_sliding_window
=
False
)
vllm_enabled_model
=
LLM
(
model
,
enforce_eager
=
True
,
disable_sliding_window
=
False
,
enable_prefix_caching
=
False
)
vllm_enabled_model
.
generate
(
"Hi my name is"
)
model_config
=
vllm_enabled_model
.
llm_engine
.
model_config
assert
model_config
.
max_model_len
==
full_len
,
(
...
...
tests/prefix_caching/test_prefix_caching.py
View file @
469e903b
...
...
@@ -4,21 +4,35 @@
Run `pytest tests/prefix_caching/test_prefix_caching.py`.
"""
from
__future__
import
annotations
import
pytest
import
os
from
tests.conftest
import
VllmRunner
from
tests.core.utils
import
SchedulerProxy
,
create_dummy_prompt
from
tests.kernels.utils
import
override_backend_env_variable
from
vllm
import
SamplingParams
,
TokensPrompt
from
vllm.core.scheduler
import
Scheduler
from
vllm.engine.llm_engine
import
LLMEngine
from
vllm.platforms
import
current_platform
from
vllm.utils
import
STR_BACKEND_ENV_VAR
from
..models.utils
import
check_outputs_equal
from
..utils
import
models_path_prefix
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch: pytest.MonkeyPatch):
    """Run every test in this module with ``VLLM_USE_V1=0``.

    The module relies on V0 internals. The variable is set inside a
    ``monkeypatch.context()`` block that stays open while the test runs
    (the fixture yields from inside it).
    """
    with monkeypatch.context() as env:
        variable, disabled = 'VLLM_USE_V1', '0'
        env.setenv(variable, disabled)
        yield
MODELS
=
[
os
.
path
.
join
(
models_path_prefix
,
"
facebook/opt-125m
"
),
os
.
path
.
join
(
models_path_prefix
,
"
distilbert/distilgpt2
"
),
]
UNSTABLE_PROMPT_SEQUENCE
=
[
...
...
@@ -49,74 +63,88 @@ def test_mixed_requests(
cached_position
:
int
,
enable_chunked_prefill
:
bool
,
block_size
:
int
,
monkeypatch
,
monkeypatch
:
pytest
.
MonkeyPatch
,
)
->
None
:
"""
Test the case when some sequences have the prefix cache hit
and the others don't. The cached position determines where
the sequence is at among the batch of prefills.
"""
override_backend_env_variable
(
monkeypatch
,
backend
)
with
hf_runner
(
model
,
dtype
=
dtype
)
as
hf_model
:
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
cached_prompt
=
example_prompts
[
cached_position
]
with
vllm_runner
(
model
,
dtype
=
dtype
,
enable_prefix_caching
=
True
,
enable_chunked_prefill
=
enable_chunked_prefill
,
block_size
=
block_size
,
)
as
vllm_model
:
# Run the first prompt so the cache is populated
vllm_outputs
=
vllm_model
.
generate_greedy
([
cached_prompt
],
max_tokens
)
# Run all the promopts
greedy_params
=
SamplingParams
(
temperature
=
0.0
,
max_tokens
=
max_tokens
)
req_outputs
=
vllm_model
.
model
.
generate
(
example_prompts
,
greedy_params
)
# Verify number of cached tokens
for
i
in
range
(
len
(
req_outputs
)):
if
i
==
cached_position
:
expected_num_cached_tokens
=
(
len
(
req_outputs
[
i
].
prompt_token_ids
)
//
block_size
)
*
block_size
else
:
expected_num_cached_tokens
=
0
assert
(
req_outputs
[
i
].
num_cached_tokens
==
expected_num_cached_tokens
)
vllm_outputs
=
[(
output
.
prompt_token_ids
+
list
(
output
.
outputs
[
0
].
token_ids
),
output
.
prompt
+
output
.
outputs
[
0
].
text
,
)
for
output
in
req_outputs
]
check_outputs_equal
(
outputs_0_lst
=
hf_outputs
,
outputs_1_lst
=
vllm_outputs
,
name_0
=
"hf"
,
name_1
=
"vllm"
,
)
if
backend
==
"FLASHINFER"
and
current_platform
.
is_rocm
():
pytest
.
skip
(
"Flashinfer does not support ROCm/HIP."
)
if
backend
==
"XFORMERS"
and
current_platform
.
is_rocm
():
pytest
.
skip
(
"Xformers does not support ROCm/HIP."
)
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
STR_BACKEND_ENV_VAR
,
backend
)
with
hf_runner
(
model
,
dtype
=
dtype
)
as
hf_model
:
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
cached_prompt
=
example_prompts
[
cached_position
]
with
vllm_runner
(
model
,
dtype
=
dtype
,
enable_prefix_caching
=
True
,
enable_chunked_prefill
=
enable_chunked_prefill
,
block_size
=
block_size
,
)
as
vllm_model
:
# Run the first prompt so the cache is populated
vllm_outputs
=
vllm_model
.
generate_greedy
([
cached_prompt
],
max_tokens
)
# Run all the prompts
greedy_params
=
SamplingParams
(
temperature
=
0.0
,
max_tokens
=
max_tokens
)
req_outputs
=
vllm_model
.
model
.
generate
(
example_prompts
,
greedy_params
)
# Verify number of cached tokens
for
i
in
range
(
len
(
req_outputs
)):
if
i
==
cached_position
:
expected_num_cached_tokens
=
(
len
(
req_outputs
[
i
].
prompt_token_ids
)
//
block_size
)
*
block_size
else
:
expected_num_cached_tokens
=
0
assert
(
req_outputs
[
i
].
num_cached_tokens
==
expected_num_cached_tokens
)
vllm_outputs
=
[(
output
.
prompt_token_ids
+
list
(
output
.
outputs
[
0
].
token_ids
),
output
.
prompt
+
output
.
outputs
[
0
].
text
,
)
for
output
in
req_outputs
]
check_outputs_equal
(
outputs_0_lst
=
hf_outputs
,
outputs_1_lst
=
vllm_outputs
,
name_0
=
"hf"
,
name_1
=
"vllm"
,
)
@
pytest
.
mark
.
parametrize
(
"backend"
,
[
"FLASH_ATTN"
,
"FLASHINFER"
,
"XFORMERS"
])
def
test_unstable_prompt_sequence
(
vllm_runner
,
backend
:
str
,
monkeypatch
,
monkeypatch
:
pytest
.
MonkeyPatch
,
)
->
None
:
override_backend_env_variable
(
monkeypatch
,
backend
)
with
vllm_runner
(
"Qwen/Qwen2.5-0.5B-Instruct"
,
enable_chunked_prefill
=
True
,
enable_prefix_caching
=
True
,
max_model_len
=
4096
,
)
as
vllm_model
:
for
prompt
in
UNSTABLE_PROMPT_SEQUENCE
:
vllm_model
.
generate
(
TokensPrompt
(
prompt_token_ids
=
prompt
),
SamplingParams
(
max_tokens
=
1
))
if
backend
==
"FLASHINFER"
and
current_platform
.
is_rocm
():
pytest
.
skip
(
"Flashinfer does not support ROCm/HIP."
)
if
backend
==
"XFORMERS"
and
current_platform
.
is_rocm
():
pytest
.
skip
(
"Xformers does not support ROCm/HIP."
)
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
STR_BACKEND_ENV_VAR
,
backend
)
with
vllm_runner
(
"Qwen/Qwen2.5-0.5B-Instruct"
,
enable_chunked_prefill
=
True
,
enable_prefix_caching
=
True
,
max_model_len
=
4096
,
)
as
vllm_model
:
for
prompt
in
UNSTABLE_PROMPT_SEQUENCE
:
vllm_model
.
generate
(
TokensPrompt
(
prompt_token_ids
=
prompt
),
SamplingParams
(
max_tokens
=
1
))
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
...
...
tests/quantization/test_bitsandbytes.py
View file @
469e903b
...
...
@@ -15,10 +15,12 @@ from ..utils import models_path_prefix
from
vllm.platforms
import
current_platform
from
tests
.utils
import
compare_two_settings
,
fork
_new_process_for_each_test
from
.
.utils
import
compare_two_settings
,
create
_new_process_for_each_test
models_4bit_to_test
=
[
(
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
),
"quantize opt model inflight"
),
(
os
.
path
.
join
(
models_path_prefix
,
"mistralai/Mistral-7B-Instruct-v0.3"
),
"quantize inflight model with both HF and Mistral format weights"
)
]
models_pre_qaunt_4bit_to_test
=
[
...
...
@@ -37,7 +39,7 @@ models_pre_quant_8bit_to_test = [
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"bitsandbytes"
)
or
current_platform
(),
reason
=
'bitsandbytes is not supported on this GPU type.'
)
@
pytest
.
mark
.
parametrize
(
"model_name, description"
,
models_4bit_to_test
)
@
fork
_new_process_for_each_test
@
create
_new_process_for_each_test
()
def
test_load_4bit_bnb_model
(
hf_runner
,
vllm_runner
,
example_prompts
,
model_name
,
description
)
->
None
:
...
...
@@ -50,7 +52,7 @@ def test_load_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
reason
=
'bitsandbytes is not supported on this GPU type.'
)
@
pytest
.
mark
.
parametrize
(
"model_name, description"
,
models_pre_qaunt_4bit_to_test
)
@
fork
_new_process_for_each_test
@
create
_new_process_for_each_test
()
def
test_load_pre_quant_4bit_bnb_model
(
hf_runner
,
vllm_runner
,
example_prompts
,
model_name
,
description
)
->
None
:
...
...
@@ -62,7 +64,7 @@ def test_load_pre_quant_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
reason
=
'bitsandbytes is not supported on this GPU type.'
)
@
pytest
.
mark
.
parametrize
(
"model_name, description"
,
models_pre_quant_8bit_to_test
)
@
fork
_new_process_for_each_test
@
create
_new_process_for_each_test
()
def
test_load_8bit_bnb_model
(
hf_runner
,
vllm_runner
,
example_prompts
,
model_name
,
description
)
->
None
:
...
...
@@ -75,7 +77,7 @@ def test_load_8bit_bnb_model(hf_runner, vllm_runner, example_prompts,
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"bitsandbytes"
)
or
current_platform
(),
reason
=
'bitsandbytes is not supported on this GPU type.'
)
@
pytest
.
mark
.
parametrize
(
"model_name, description"
,
models_4bit_to_test
)
@
fork
_new_process_for_each_test
@
create
_new_process_for_each_test
()
def
test_load_tp_4bit_bnb_model
(
hf_runner
,
vllm_runner
,
example_prompts
,
model_name
,
description
)
->
None
:
...
...
@@ -93,7 +95,7 @@ def test_load_tp_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"bitsandbytes"
),
reason
=
'bitsandbytes is not supported on this GPU type.'
)
@
pytest
.
mark
.
parametrize
(
"model_name, description"
,
models_4bit_to_test
)
@
fork
_new_process_for_each_test
@
create
_new_process_for_each_test
()
def
test_load_pp_4bit_bnb_model
(
model_name
,
description
)
->
None
:
common_args
=
[
"--disable-log-stats"
,
...
...
tests/quantization/test_compressed_tensors.py
View file @
469e903b
...
...
@@ -24,6 +24,14 @@ from vllm.platforms import current_platform
from
..utils
import
models_path_prefix
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
    """Run every test in this module with ``VLLM_USE_V1=0``.

    The module relies on V0 internals, so this autouse fixture sets the
    variable through ``monkeypatch`` for each test function.
    """
    variable, disabled = 'VLLM_USE_V1', '0'
    monkeypatch.setenv(variable, disabled)
@
pytest
.
mark
.
parametrize
(
"model_args"
,
[
...
...
@@ -220,8 +228,6 @@ def test_compressed_tensors_wNa16(vllm_runner, wNa16_args):
assert
qkv_proj
.
scheme
.
group_size
==
(
-
1
if
group
is
None
else
group
)
assert
qkv_proj
.
weight_packed
.
dtype
is
torch
.
int32
assert
qkv_proj
.
weight_scale
.
dtype
is
torch
.
float16
assert
qkv_proj
.
scheme
.
pack_factor
==
pack_factor
llm
.
apply_model
(
check_model
)
...
...
tests/quantization/test_configs.py
View file @
469e903b
...
...
@@ -5,7 +5,6 @@ Run `pytest tests/quantization/test_configs.py --forked`.
"""
from
dataclasses
import
dataclass
from
typing
import
Tuple
import
pytest
import
os
...
...
@@ -55,7 +54,7 @@ MODEL_ARG_EXPTYPES = [
@
pytest
.
mark
.
parametrize
(
"model_arg_exptype"
,
MODEL_ARG_EXPTYPES
)
def
test_auto_gptq
(
model_arg_exptype
:
T
uple
[
str
,
None
,
str
])
->
None
:
def
test_auto_gptq
(
model_arg_exptype
:
t
uple
[
str
,
None
,
str
])
->
None
:
model_path
,
quantization_arg
,
expected_type
=
model_arg_exptype
try
:
...
...
tests/quantization/test_cpu_offload.py
View file @
469e903b
...
...
@@ -9,10 +9,17 @@ import os
from
tests.quantization.utils
import
is_quant_method_supported
from
..utils
import
compare_two_settings
,
models_path_prefix
from
vllm.
util
s
import
is_hip
from
vllm.
platform
s
import
current_platform
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"fp8"
)
or
is_hip
(),
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
    """Set ``VLLM_USE_V1=0`` for every test in this module."""
    # Fall back to V0 if cpu offloading is enabled.
    # The fixture is required so that the baseline uses V0 as well.
    variable, disabled = 'VLLM_USE_V1', '0'
    monkeypatch.setenv(variable, disabled)
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"fp8"
)
or
current_platform
.
is_rocm
(),
reason
=
"fp8 is not supported on this GPU type."
)
def
test_cpu_offload_fp8
():
# Test quantization of an unquantized checkpoint
...
...
@@ -26,7 +33,7 @@ def test_cpu_offload_fp8():
# max_wait_seconds=480)
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"gptq_marlin"
)
or
is_hip
(),
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"gptq_marlin"
)
or
current_platform
.
is_rocm
(),
reason
=
"gptq_marlin is not supported on this GPU type."
)
def
test_cpu_offload_gptq
():
# Test GPTQ Marlin
...
...
@@ -40,7 +47,7 @@ def test_cpu_offload_gptq():
max_wait_seconds
=
480
)
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"awq_marlin"
)
or
is_hip
(),
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"awq_marlin"
)
or
current_platform
.
is_rocm
(),
reason
=
"awq_marlin is not supported on this GPU type."
)
def
test_cpu_offload_awq
():
# Test AWQ Marlin
...
...
@@ -54,7 +61,7 @@ def test_cpu_offload_awq():
max_wait_seconds
=
480
)
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"gptq_marlin"
)
or
is_hip
(),
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"gptq_marlin"
)
or
current_platform
.
is_rocm
(),
reason
=
"gptq_marlin is not supported on this GPU type."
)
def
test_cpu_offload_compressed_tensors
():
# Test wNa16
...
...
tests/quantization/test_experts_int8.py
View file @
469e903b
...
...
@@ -9,12 +9,12 @@ import os
from
tests.quantization.utils
import
is_quant_method_supported
from
..utils
import
models_path_prefix
from
vllm.
util
s
import
is_hip
from
vllm.
platform
s
import
current_platform
MODELS
=
[
os
.
path
.
join
(
models_path_prefix
,
"ai21labs/Jamba-tiny-random"
)]
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"experts_int8"
)
or
is_hip
(),
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"experts_int8"
)
or
current_platform
.
is_rocm
(),
reason
=
"ExpertsInt8 is not supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"bfloat16"
])
...
...
tests/quantization/test_fp8.py
View file @
469e903b
...
...
@@ -13,7 +13,6 @@ from vllm.model_executor.layers.quantization.fp8 import (Fp8KVCacheMethod,
Fp8LinearMethod
)
from
vllm.platforms
import
current_platform
from
..utils
import
models_path_prefix
from
vllm.utils
import
is_hip
MODELS
=
[
os
.
path
.
join
(
models_path_prefix
,
"neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV"
),
...
...
@@ -22,7 +21,7 @@ MODELS = [
]
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"fp8"
)
or
is_hip
(),
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"fp8"
)
or
current_platform
.
is_rocm
(),
reason
=
"FP8 is not supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"model_id"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"force_marlin"
,
[
False
,
True
])
...
...
@@ -47,10 +46,12 @@ KV_CACHE_MODELS = [
]
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"fp8"
)
or
is_hip
(),
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"fp8"
)
or
current_platform
.
is_rocm
(),
reason
=
"FP8 is not supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"model_id"
,
KV_CACHE_MODELS
)
def
test_kv_cache_model_load_and_run
(
vllm_runner
,
model_id
:
str
):
def
test_kv_cache_model_load_and_run
(
vllm_runner
,
model_id
:
str
,
monkeypatch
):
# vllm_runner.apply_model() relies on V0 internals.
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"0"
)
with
vllm_runner
(
model_id
,
kv_cache_dtype
=
"fp8"
)
as
llm
:
def
check_model
(
model
):
...
...
@@ -83,12 +84,15 @@ def test_kv_cache_model_load_and_run(vllm_runner, model_id: str):
print
(
outputs
[
0
][
1
])
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"fp8"
)
or
is_hip
(),
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"fp8"
)
or
current_platform
.
is_rocm
(),
reason
=
"FP8 is not supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"kv_cache_dtype"
,
[
"auto"
,
"fp8"
])
@
pytest
.
mark
.
parametrize
(
"force_marlin"
,
[
False
,
True
])
def
test_load_fp16_model
(
vllm_runner
,
kv_cache_dtype
:
str
,
force_marlin
:
bool
,
monkeypatch
)
->
None
:
# vllm_runner.apply_model() relies on V0 internals.
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"0"
)
if
force_marlin
:
monkeypatch
.
setenv
(
"VLLM_TEST_FORCE_FP8_MARLIN"
,
"1"
)
...
...
@@ -106,8 +110,7 @@ def test_load_fp16_model(vllm_runner, kv_cache_dtype: str, force_marlin: bool,
assert
attn
.
_v_scale
==
1.0
if
current_platform
.
is_cuda
():
if
current_platform
.
has_device_capability
(
89
)
and
not
force_marlin
:
if
current_platform
.
supports_fp8
()
and
not
force_marlin
:
# For GPUs with hardware support, we keep weights in fp8
assert
fc1
.
weight
.
dtype
==
torch
.
float8_e4m3fn
else
:
...
...
@@ -115,11 +118,9 @@ def test_load_fp16_model(vllm_runner, kv_cache_dtype: str, force_marlin: bool,
# for weight-only quantization using Marlin kernels
assert
fc1
.
weight
.
dtype
==
torch
.
int32
elif
current_platform
.
is_rocm
():
# Only MI300 and above support quantization='fp8'
if
current_platform
.
has_device_capability
(
94
)
and
not
force_marlin
:
if
current_platform
.
supports_fp8
()
and
not
force_marlin
:
# For GPUs with hardware support, we keep weights in fp8
assert
fc1
.
weight
.
dtype
==
torch
.
float8_e4m3fnuz
assert
fc1
.
weight
.
dtype
==
current_platform
.
fp8_dtype
()
else
:
# unsupported ROCm platform
pytest
.
skip
(
"Skip `test_load_fp16_model`. "
...
...
@@ -132,7 +133,7 @@ def test_load_fp16_model(vllm_runner, kv_cache_dtype: str, force_marlin: bool,
llm
.
apply_model
(
check_model
)
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"fp8"
)
or
is_hip
(),
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"fp8"
)
or
current_platform
.
is_rocm
(),
reason
=
"FP8 is not supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
torch
.
float16
,
torch
.
bfloat16
])
def
test_scaled_fp8_quant
(
dtype
)
->
None
:
...
...
tests/quantization/test_gptq_dynamic.py
View file @
469e903b
...
...
@@ -28,8 +28,10 @@ MODEL_QUANT = [
@
pytest
.
mark
.
parametrize
(
"model_id, use_marlin_kernel"
,
MODEL_QUANT
)
def
test_gptq_with_dynamic
(
vllm_runner
,
model_id
:
str
,
use_marlin_kernel
:
bool
):
def
test_gptq_with_dynamic
(
vllm_runner
,
model_id
:
str
,
use_marlin_kernel
:
bool
,
monkeypatch
):
# vllm_runner.apply_model() relies on V0 internals.
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"0"
)
vllm_model
=
vllm_runner
(
model_id
,
dtype
=
torch
.
float16
,
max_model_len
=
2048
)
...
...
Prev
1
…
22
23
24
25
26
27
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment