Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
38d80967
Commit
38d80967
authored
Sep 12, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.10.2rc2' into v0.10.2rc2-ori
parents
33650733
880c741b
Changes
544
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
269 additions
and
1048 deletions
+269
-1048
tests/neuron/1_core/test_neuron_quant.py
tests/neuron/1_core/test_neuron_quant.py
+0
-12
tests/neuron/1_core/test_prefix_prefill.py
tests/neuron/1_core/test_prefix_prefill.py
+0
-514
tests/neuron/1_core/test_rotary_embedding.py
tests/neuron/1_core/test_rotary_embedding.py
+0
-68
tests/neuron/2_core/test_comm_ops.py
tests/neuron/2_core/test_comm_ops.py
+0
-101
tests/neuron/2_core/test_eagle.py
tests/neuron/2_core/test_eagle.py
+0
-83
tests/neuron/2_core/test_mistral.py
tests/neuron/2_core/test_mistral.py
+0
-64
tests/neuron/2_core/test_multi_lora.py
tests/neuron/2_core/test_multi_lora.py
+0
-97
tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/__init__.py
...thvi_io_processor_plugin/prithvi_io_processor/__init__.py
+2
-4
tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/prithvi_processor.py
...rocessor_plugin/prithvi_io_processor/prithvi_processor.py
+3
-36
tests/plugins/prithvi_io_processor_plugin/setup.py
tests/plugins/prithvi_io_processor_plugin/setup.py
+1
-2
tests/plugins_tests/test_io_processor_plugins.py
tests/plugins_tests/test_io_processor_plugins.py
+46
-58
tests/quantization/test_modelopt.py
tests/quantization/test_modelopt.py
+1
-1
tests/quantization/test_torchao.py
tests/quantization/test_torchao.py
+20
-0
tests/runai_model_streamer_test/test_runai_model_streamer_loader.py
...i_model_streamer_test/test_runai_model_streamer_loader.py
+1
-1
tests/runai_model_streamer_test/test_runai_utils.py
tests/runai_model_streamer_test/test_runai_utils.py
+39
-0
tests/samplers/test_beam_search.py
tests/samplers/test_beam_search.py
+1
-1
tests/tensorizer_loader/test_tensorizer.py
tests/tensorizer_loader/test_tensorizer.py
+4
-3
tests/test_config.py
tests/test_config.py
+3
-2
tests/tool_use/test_openai_tool_parser.py
tests/tool_use/test_openai_tool_parser.py
+147
-0
tests/tpu/test_quantization_accuracy.py
tests/tpu/test_quantization_accuracy.py
+1
-1
No files found.
Too many changes to show.
To preserve performance only
544 of 544+
files are displayed.
Plain diff
Email patch
tests/neuron/1_core/test_neuron_quant.py
deleted
100644 → 0
View file @
33650733
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
vllm.model_executor.layers.quantization.neuron_quant
import
(
NeuronQuantConfig
)
def test_get_supported_act_dtypes():
    """Check that the Neuron quant config reports the placeholder dtypes."""
    supported = NeuronQuantConfig().get_supported_act_dtypes()
    for expected_dtype in ("any_dtype1", "any_dtype2"):
        assert expected_dtype in supported
tests/neuron/1_core/test_prefix_prefill.py
deleted
100644 → 0
View file @
33650733
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
typing
import
Optional
import
pytest
import
torch
import
torch.nn.functional
as
F
from
vllm.utils
import
cdiv
class BlockDiagonalCausalFromBottomRightMask:
    """Boolean attention masks for several sequences packed along one axis.

    Rows index query tokens (all sequences concatenated) and columns index
    key tokens. Entry ``[i, j]`` is True when query ``i`` may attend to
    key ``j``; queries never see keys that belong to another sequence.
    """

    @staticmethod
    def _from_seqlens(query_lens, seq_lens, block_size=None):
        """Build one packed mask.

        Args:
            query_lens: Number of query tokens per sequence.
            seq_lens: Total (context + query) length per sequence.
            block_size: When None, the key axis holds every token of every
                sequence and the mask is causal, aligned to the bottom-right
                corner of each sequence's block. When given, the key axis
                holds only the context tokens of each sequence, rounded up to
                a multiple of ``block_size``; every query of a sequence then
                sees all real context keys and none of the padding slots.

        Returns:
            A bool tensor of shape ``(sum(query_lens), n_keys)``.
        """
        full_sequence_keys = block_size is None
        num_seqs = len(query_lens)
        context_lens = torch.tensor(seq_lens) - torch.tensor(query_lens)

        if full_sequence_keys:
            key_lens = list(seq_lens)
        else:
            # Round each context length up to a whole number of blocks.
            n_blocks_per_seq = (context_lens + block_size - 1) // block_size
            key_lens = (n_blocks_per_seq * block_size)[:num_seqs].tolist()

        n_queries = sum(query_lens)
        n_keys = sum(key_lens)

        # q_idx[i, j] == i and k_idx[i, j] == j over the whole mask.
        q_idx = torch.arange(n_queries).reshape(n_queries, 1).expand(
            n_queries, n_keys)
        k_idx = torch.arange(n_keys).reshape(1, n_keys).expand(
            n_queries, n_keys)

        # First row / first column owned by each sequence.
        q_cumsum = torch.tensor([0, *query_lens]).cumsum(dim=0)
        k_cumsum = torch.tensor([0, *key_lens]).cumsum(dim=0)

        # Start from a bool tensor so the result dtype is the same whether or
        # not the loop below runs.
        mask = torch.zeros(n_queries, n_keys, dtype=torch.bool)
        for seq_id in range(num_seqs):
            row_start = q_cumsum[seq_id]
            col_start = k_cumsum[seq_id]
            n_rows = query_lens[seq_id]
            if full_sequence_keys:
                # Causal bound: the last query row of the sequence reaches
                # the last key column of the sequence.
                n_cols = seq_lens[seq_id]
                shift = col_start + n_cols - row_start - n_rows
                seq_mask = (q_idx + shift) >= k_idx
            else:
                # All real context keys are visible; block padding is not.
                last_key = col_start + context_lens[seq_id] - 1
                seq_mask = last_key >= k_idx
            seq_mask = (seq_mask
                        & (k_idx >= col_start)
                        & (q_idx >= row_start)
                        & (q_idx < (row_start + n_rows)))
            mask = mask | seq_mask
        return mask

    @staticmethod
    def from_seqlens(query_lens, seq_lens, block_size=None):
        """Return ``(prior_mask, active_mask)`` for the packed batch.

        Without ``block_size`` the first element is the causal mask over full
        sequences and the second is None. With ``block_size`` the first
        element masks the block-aligned context keys and the second is the
        causal mask of the query tokens against themselves.
        """
        build = BlockDiagonalCausalFromBottomRightMask._from_seqlens
        if block_size is None:
            return build(query_lens, seq_lens), None
        prior_mask = build(query_lens, seq_lens, block_size)
        active_mask = build(query_lens, query_lens)
        return prior_mask, active_mask
def ref_softmax(x: torch.Tensor,
                dim: int,
                mixed_precision=False,
                return_max_reduce=False):
    """Reference softmax over ``dim``, stabilised by subtracting the max.

    Args:
        x: Input scores.
        dim: Dimension to normalise over.
        mixed_precision: When True, the denominator is accumulated in
            float32 and then cast back to ``x.dtype``.
        return_max_reduce: When True, also return the per-row maximum and
            the reciprocal of the per-row sum.

    Returns:
        The softmax of ``x``, or the tuple
        ``(softmax, max_value, 1 / sum_value)`` when ``return_max_reduce``
        is set. The two extra tensors keep ``dim`` with size 1.
    """
    max_value = torch.amax(x, dim=dim, keepdim=True)
    exp = torch.exp(x - max_value)
    if mixed_precision:
        # torch tensors have no ``astype``; ``to`` is the dtype cast.
        sum_value = torch.sum(exp.to(torch.float32), dim=dim,
                              keepdim=True).to(x.dtype)
    else:
        sum_value = torch.sum(exp, dim=dim, keepdim=True)
    normalized = exp / sum_value
    if return_max_reduce:
        return normalized, max_value, torch.reciprocal(sum_value)
    return normalized
def ref_masked_attention(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    scale: float,
    attn_mask: Optional[torch.Tensor] = None,
    return_max_reduce: Optional[bool] = False,
) -> tuple[torch.Tensor, ...]:
    """Reference scaled dot-product attention on packed tokens.

    The einsum patterns fix the layouts: ``query`` is indexed ``(q, h, d)``,
    ``key`` and ``value`` are indexed ``(k, h, d)``, scores are
    ``(h, q, k)`` and the output is ``(q, h, d)``.

    Args:
        query: Query tensor.
        key: Key tensor.
        value: Value tensor.
        scale: Multiplier applied to the raw QK scores.
        attn_mask: Optional additive mask, added to the float32 scores.
            When None the scores are used unmasked.
        return_max_reduce: When truthy, intermediate tensors are returned
            along with the output.

    Returns:
        Always a tuple. ``(out,)`` by default; with ``return_max_reduce``:
        ``(out, cached_max, cached_sum_reciprocal, norm_score,
        masked_score, scaled_qk)``.
    """
    scaled_qk = scale * torch.einsum("qhd,khd->hqk", query, key).float()
    # Without a mask the scores pass through unchanged; the original left
    # ``masked_score`` unbound on this path.
    if attn_mask is None:
        masked_score = scaled_qk
    else:
        masked_score = scaled_qk + attn_mask.float()

    if return_max_reduce:
        norm_score, cached_max, cached_sum_reciprocal = ref_softmax(
            masked_score, dim=-1, return_max_reduce=True)
    else:
        norm_score = ref_softmax(masked_score, dim=-1)

    out = torch.einsum("hqk,khd->qhd", norm_score.to(value.dtype), value)
    if return_max_reduce:
        return (
            out,
            cached_max,
            cached_sum_reciprocal,
            norm_score,
            masked_score,
            scaled_qk,
        )
    return (out, )
def ref_context_attention(
    query,
    key,
    value,
    query_lens,
    seq_lens,
    head_size,
    num_queries_per_kv,
    return_max_reduce=False,
):
    """CPU reference for packed, context-aware causal attention.

    KV heads are repeated ``num_queries_per_kv`` times when that is greater
    than one, a block-diagonal causal mask is built from the sequence
    lengths, and the masked attention reference is evaluated.

    Returns:
        The attention output with an extra size-1 dimension inserted at
        index 1. With ``return_max_reduce`` the five debug tensors produced
        by ``ref_masked_attention`` follow it in a tuple.
    """
    softmax_scale = float(1.0 / (head_size**0.5))

    # Handle MQA and GQA
    if num_queries_per_kv > 1:
        key = torch.repeat_interleave(key, num_queries_per_kv, dim=1)
        value = torch.repeat_interleave(value, num_queries_per_kv, dim=1)

    allowed, _ = BlockDiagonalCausalFromBottomRightMask.from_seqlens(
        query_lens, seq_lens)

    # convert binary mask to a large negative additive bias
    additive_mask = torch.logical_not(allowed).float() * -30000

    result = ref_masked_attention(
        query,
        key,
        value,
        softmax_scale,
        additive_mask,
        return_max_reduce=return_max_reduce,
    )
    output = result[0].unsqueeze(1)
    if not return_max_reduce:
        return output

    # Unpack explicitly so that an unexpected number of debug tensors fails.
    (cached_max, cached_sum_reciprocal, norm_score, masked_score,
     scaled_qk) = result[1:]
    return (
        output,
        cached_max,
        cached_sum_reciprocal,
        norm_score,
        masked_score,
        scaled_qk,
    )
def sample_inputs(
    prefill_batch_size,
    decode_batch_size,
    min_query_len,
    max_query_len,
    min_ctx_len,
    max_ctx_len,
    block_size,
    num_heads,
    num_kv_heads,
    head_size,
    dtype,
):
    """Generate random packed attention inputs plus a paged KV cache.

    Prefill requests get a random query length in
    ``[min_query_len, max_query_len]``; decode requests always have query
    length 1. Every request gets a random context length in
    ``[min_ctx_len, max_ctx_len]``. Context tokens are written into a block
    cache through a randomly permuted block table; the trailing query-length
    tokens of each sequence are copied into the "active" ``k`` / ``v``.

    Returns:
        ``(query, k, v, kv_cache, block_table, key, value, query_lens,
        seq_lens)`` where ``key`` / ``value`` hold all tokens of all
        sequences, ``kv_cache`` is ``stack([k_cache, v_cache])`` and the two
        length lists are plain Python lists.
    """
    batch_size = prefill_batch_size + decode_batch_size
    max_model_len = (max_query_len + max_ctx_len) * 4
    max_block_per_request = max_model_len // block_size
    cache_size = (batch_size * max_block_per_request) + 2

    # The random calls below keep their original order so that seeded runs
    # produce the same data.
    prefill_ctx_lens = torch.randint(min_ctx_len,
                                     max_ctx_len + 1, (prefill_batch_size, ),
                                     dtype=torch.long).tolist()
    decode_ctx_lens = torch.randint(min_ctx_len,
                                    max_ctx_len + 1, (decode_batch_size, ),
                                    dtype=torch.long).tolist()
    ctx_lens = prefill_ctx_lens + decode_ctx_lens
    query_lens = torch.randint(
        min_query_len,
        max_query_len + 1,
        (prefill_batch_size, ),
        dtype=torch.long,
    ).tolist() + [1] * decode_batch_size
    seq_lens = [q_len + c_len for q_len, c_len in zip(query_lens, ctx_lens)]

    num_tokens = sum(query_lens)
    query = torch.empty(num_tokens, num_heads, head_size, dtype=dtype)
    query.uniform_(-1, 1)

    kv = torch.empty(sum(seq_lens), 2, num_kv_heads, head_size, dtype=dtype)
    kv.uniform_(-1, 1)
    key, value = kv.unbind(dim=1)

    k_cache = torch.zeros(cache_size,
                          block_size,
                          num_kv_heads,
                          head_size,
                          dtype=dtype)
    v_cache = torch.zeros(cache_size,
                          block_size,
                          num_kv_heads,
                          head_size,
                          dtype=dtype)
    k = torch.zeros(num_tokens, num_kv_heads, head_size, dtype=dtype)
    v = torch.zeros(num_tokens, num_kv_heads, head_size, dtype=dtype)

    # Hand each request a disjoint, randomly chosen set of cache blocks.
    values = torch.arange(0, cache_size, dtype=torch.long)
    values = values[torch.randperm(cache_size)]
    block_table = values[:batch_size * max_block_per_request].view(
        batch_size, max_block_per_request)

    # Flat (slot, kv_head, head_size) views that share storage with the caches.
    k_cache_flat = k_cache.view(-1, num_kv_heads, head_size)
    v_cache_flat = v_cache.view(-1, num_kv_heads, head_size)

    q_start = 0  # first active-token row of the current request
    seq_start = 0  # first row of the current request in key / value
    for i in range(batch_size):
        q_len = query_lens[i]
        ctx_len = ctx_lens[i]

        # Active tokens are the last q_len tokens of the sequence.
        active = slice(seq_start + ctx_len, seq_start + ctx_len + q_len)
        k[q_start:q_start + q_len].copy_(key[active])
        v[q_start:q_start + q_len].copy_(value[active])

        # copy context kv to cache, one block at a time
        for block_id, ctx_pos in enumerate(range(0, ctx_len, block_size)):
            n_tokens = min(block_size, ctx_len - ctx_pos)
            src = slice(seq_start + ctx_pos, seq_start + ctx_pos + n_tokens)
            start_slot = int(block_table[i, block_id]) * block_size
            dst = slice(start_slot, start_slot + n_tokens)
            k_cache_flat[dst].copy_(key[src])
            v_cache_flat[dst].copy_(value[src])

        q_start += q_len
        seq_start += seq_lens[i]

    kv_cache = torch.stack([k_cache, v_cache])

    return (
        query,
        k,
        v,
        kv_cache,
        block_table,
        key,
        value,
        query_lens,
        seq_lens,
    )
def get_active_block_tables(block_tables, query_lens, seq_lens, block_size,
                            num_blocks):
    """Flatten the block ids that hold context tokens into one padded vector.

    For each sequence the first ``ceil(context_len / block_size)`` entries of
    its block-table row are collected, where ``context_len`` is
    ``seq_len - query_len``. The concatenation is returned as an int32
    tensor, right-padded with zeros to ``num_blocks`` entries.
    """
    context_lens = seq_lens - query_lens
    blocks_per_seq = (context_lens + block_size - 1) // block_size

    active_blocks: list[int] = []
    for seq_id in range(len(seq_lens)):
        row_prefix = block_tables[seq_id, :blocks_per_seq[seq_id]]
        active_blocks.extend(row_prefix.tolist())

    flat_ids = torch.tensor(active_blocks, dtype=torch.int32)
    right_pad = num_blocks - len(active_blocks)
    return F.pad(flat_ids, (0, right_pad), "constant", 0)
@pytest.mark.parametrize(
    "prefill_batch_size,decode_batch_size,block_size,large_tile_size,num_heads,num_queries_per_kv,head_size,mixed_precision",
    [
        # Test minimal configurations (small block size)
        (1, 199, 1, 512, 4, 2, 8, False
         ),  # minimal block size, small dimensions
        (1, 199, 1, 512, 4, 2, 8, True),  # same with mixed precision

        # Test common/medium configurations
        (4, 12, 32, 2048, 32, 8, 64, False),  # common case, larger heads
        (4, 12, 32, 2048, 16, 4, 32, True
         ),  # medium size, mixed precision, grouped-query attention (GQA)

        # Test large configurations
        (4, 12, 256, 8192, 8, 1, 128, False
         ),  # large blocks, large head size
        (4, 12, 256, 8192, 64, 8, 64, True),  # large blocks, many heads

        # Test asymmetric configurations
        (2, 24, 64, 4096, 12, 4, 96, False),  # varied batch sizes
        (8, 8, 128, 2048, 24, 2, 48, True),  # balanced batches

        # Test edge cases
        (1, 128, 16, 1024, 4, 2, 16, False),  # large decode batch
        (16, 4, 8, 1024, 4, 2, 128, True),  # large prefill batch
        (4, 12, 32, 2048, 16, 1, 32, True),  # multi-head attention (MHA)
        (4, 12, 32, 2048, 16, 16, 32, True),  # multi-query attention (MQA)
    ])
@torch.inference_mode()
def test_contexted_kv_attention(
    monkeypatch: pytest.MonkeyPatch,
    prefill_batch_size: int,
    decode_batch_size: int,
    num_heads: int,
    num_queries_per_kv: int,
    head_size: int,
    block_size: int,
    large_tile_size,
    mixed_precision: bool,
) -> None:
    """Compare the NKI flash-attention kernel against the CPU reference.

    Random inputs are generated on CPU, the reference output is computed
    with ``ref_context_attention``, the same inputs are padded, permuted and
    moved to the XLA device for ``flash_attn_varlen_nkifunc``, and the two
    outputs are compared on the real (unpadded) tokens with an absolute
    tolerance of 1e-2.

    ``mixed_precision`` is only forwarded to the kernel; the reference is
    always evaluated on the float32 inputs.
    """
    # Imported lazily: these modules are only needed when the test runs.
    import torch_xla.core.xla_model as xm

    from vllm.attention.ops.nki_flash_attn import (flash_attn_varlen_nkifunc,
                                                   reorder_context_mask)

    assert large_tile_size % block_size == 0

    device = xm.xla_device()

    compiler_flags_str = " ".join([
        "-O1",
        "--retry_failed_compilation",
    ])
    with monkeypatch.context() as m:
        # The env var is set only inside this monkeypatch context.
        m.setenv("NEURON_CC_FLAGS", compiler_flags_str)

        torch.manual_seed(0)
        torch.set_printoptions(sci_mode=False)
        torch.set_default_device("cpu")
        dtype = torch.float32

        min_ctx_len = 32
        max_ctx_len = 1024
        min_query_len = 16
        max_query_len = 512
        num_kv_heads = num_heads // num_queries_per_kv
        (
            query,
            k_active,
            v_active,
            kv_cache,
            block_table,
            key,
            value,
            query_lens,
            seq_lens,
        ) = sample_inputs(
            prefill_batch_size=prefill_batch_size,
            decode_batch_size=decode_batch_size,
            min_query_len=min_query_len,
            max_query_len=max_query_len,
            min_ctx_len=min_ctx_len,
            max_ctx_len=max_ctx_len,
            block_size=block_size,
            num_heads=num_heads,
            num_kv_heads=num_kv_heads,
            head_size=head_size,
            dtype=dtype,
        )

        # Reference result, computed from the full (unpaged) key/value.
        output_ref = ref_context_attention(
            query,
            key,
            value,
            query_lens,
            seq_lens,
            head_size,
            num_queries_per_kv,
            return_max_reduce=False,
        )

        # build neuron program
        B_P_SIZE = 128
        assert (large_tile_size >= B_P_SIZE
                ), f"Expect {large_tile_size=} to be larger than {B_P_SIZE=}"

        def pad_to_multiple(a, b):
            # Round ``a`` up to the next multiple of ``b``.
            return cdiv(a, b) * b

        def pad_to_next_power_of_2(a):
            # Smallest power of two that is >= ``a``.
            assert a > 0
            return 2**int(a - 1).bit_length()

        # calculate input shapes
        max_num_queries = pad_to_next_power_of_2(sum(query_lens))
        context_lens = torch.tensor(seq_lens) - torch.tensor(query_lens)
        num_active_blocks = cdiv(context_lens, block_size).sum().item()
        # Pad the block count so the context length is a whole number of
        # large tiles (checked by the assert below).
        num_active_blocks = pad_to_multiple(num_active_blocks,
                                            large_tile_size // block_size)
        context_kv_len = num_active_blocks * block_size
        assert (
            context_kv_len %
            large_tile_size == 0), f"invalid context_kv_len={context_kv_len}"

        # pad QKV tensors
        # F.pad takes pairs starting from the last dimension, so only the
        # leading (token) dimension is padded here.
        pad_dims = (
            0,
            0,
            0,
            0,
            0,
            max_num_queries - query.shape[0],
        )
        query = F.pad(query, pad_dims, "constant", 0)
        k = F.pad(k_active, pad_dims, "constant", 0)
        v = F.pad(v_active, pad_dims, "constant", 0)

        # permute QKV tensors
        # query: (1, n_heads, d, seq_q)
        # key:   (1, n_kv_heads, d, seq_k)
        # value: (1, n_kv_heads, seq_v, d)
        query = query.unsqueeze(0).permute(0, 2, 3, 1).contiguous()
        k = k.unsqueeze(0).permute(0, 2, 3, 1).contiguous()
        v = v.unsqueeze(0).permute(0, 2, 1, 3).contiguous()
        # Swaps the block_size and kv-head axes of the stacked cache.
        kv_cache = kv_cache.permute(0, 1, 3, 2, 4).contiguous()

        # transform block table
        active_block_table = get_active_block_tables(
            block_table.cpu(),
            torch.tensor(query_lens).cpu(),
            torch.tensor(seq_lens).cpu(),
            block_size,
            num_active_blocks,
        )

        # Build attention masks
        prior_mask, active_mask = (
            BlockDiagonalCausalFromBottomRightMask.from_seqlens(
                query_lens, seq_lens, block_size=block_size))
        # Pad columns up to context_kv_len and rows up to max_num_queries.
        prior_mask_padded = F.pad(
            prior_mask,
            (
                0,
                context_kv_len - prior_mask.shape[1],
                0,
                max_num_queries - prior_mask.shape[0],
            ),
            "constant",
            0,
        ).bool()
        active_mask_padded = F.pad(
            active_mask,
            (
                0,
                max_num_queries - active_mask.shape[1],
                0,
                max_num_queries - active_mask.shape[0],
            ),
            "constant",
            0,
        ).bool()
        # Context columns first, then the active (query) columns.
        attn_mask = torch.concat([prior_mask_padded, active_mask_padded],
                                 dim=1)

        attn_mask = reorder_context_mask(attn_mask, large_tile_size,
                                         block_size)

        input_args = (
            query.to(device=device),
            k.to(device=device),
            v.to(device=device),
            kv_cache.to(device=device),
            active_block_table.to(device=device),
            attn_mask.to(device=device),
        )
        input_kwargs = dict(
            n_kv_head=num_kv_heads,
            head_size=head_size,
            mixed_precision=mixed_precision,
            LARGE_TILE_SZ=large_tile_size,
        )

        output_nki = flash_attn_varlen_nkifunc(*input_args, **input_kwargs)

        num_actual_tokens = sum(query_lens)
        # - o: shape (bs, n_heads, seq_q, d) -> (bs, seq_q, n_heads, d)
        output_nki = output_nki.cpu().permute(0, 2, 1, 3)
        # Drop the padded query rows before comparing.
        output_nki = output_nki[0, :num_actual_tokens, :, :]
        output_ref_padded = F.pad(
            output_ref,
            (0, 0, 0, 0, 0, 0, 0, max_num_queries - output_ref.shape[0]),
            "constant",
            0,
        )
        output_ref = output_ref_padded.transpose(
            0, 1)[0, :num_actual_tokens, :, :]

        torch.testing.assert_close(output_nki, output_ref, atol=1e-2, rtol=0)
tests/neuron/1_core/test_rotary_embedding.py
deleted
100644 → 0
View file @
33650733
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Tests for miscellaneous utilities
"""
import
pytest
import
torch
from
vllm.model_executor.layers.rotary_embedding
import
RotaryEmbedding
from
vllm.platforms
import
current_platform
@pytest.mark.parametrize(
    "max_position,is_neox_style,rotary_dim,head_size,seq_len,use_key", [
        (16, False, 32, 32, 1024, True),
        (16, False, 32, 128, 1024, True),
        (16, True, 32, 32, 1024, True),
        (16, True, 32, 128, 1024, True),
        (16, False, 32, 128, 1024, False),
        (16, True, 32, 128, 1024, False),
    ])
def test_rotary_embedding_opcheck(max_position, is_neox_style, rotary_dim,
                                  head_size, seq_len, use_key):
    """Check ``forward_neuron`` against ``forward_native`` on the same input.

    The reference runs on CPU tensors; the Neuron path runs on the XLA
    device. Outputs must be XLA tensors and match within 1e-2, and the key
    output must be None when no key is passed.
    """
    import torch_xla.core.xla_model as xm

    xla_device = xm.xla_device()
    current_platform.seed_everything(0)
    torch.set_default_device("cpu")

    batch_size, base, num_heads = 1, 10000, 8
    xla_msg = "output tensor is expected to be XLA tensor"

    rot = RotaryEmbedding(head_size, rotary_dim, max_position, base,
                          is_neox_style, torch.float32)

    positions = torch.randint(0,
                              max_position, (batch_size, seq_len),
                              device="cpu")
    query = torch.randn(batch_size,
                        seq_len,
                        num_heads * head_size,
                        dtype=torch.float32,
                        device="cpu")
    if use_key:
        key = torch.randn_like(query)
    else:
        key = None

    assert positions.is_cpu, \
        "reference input tensor is expected to be CPU tensor."

    cpu_rot = rot.to(device="cpu")
    ref_query, ref_key = cpu_rot.forward_native(positions, query, key)

    neuron_rot = rot.to(device=xla_device)
    out_query, out_key = neuron_rot.forward_neuron(
        positions.to(device=xla_device), query.to(device=xla_device),
        None if key is None else key.to(device=xla_device))

    if not use_key:
        assert out_key is None, "expected returned key to be None"
        assert out_query.is_xla, xla_msg
    else:
        assert out_query.is_xla and out_key.is_xla, xla_msg
        torch.testing.assert_close(out_key.cpu(),
                                   ref_key,
                                   atol=1e-2,
                                   rtol=1e-2)
    torch.testing.assert_close(out_query.cpu(),
                               ref_query,
                               atol=1e-2,
                               rtol=1e-2)
tests/neuron/2_core/test_comm_ops.py
deleted
100644 → 0
View file @
33650733
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
functools
from
typing
import
Callable
from
unittest.mock
import
patch
import
pytest
import
torch
import
torch_xla.distributed.xla_multiprocessing
as
xmp
from
typing_extensions
import
ParamSpec
from
vllm.distributed.communication_op
import
(
tensor_model_parallel_all_gather
,
tensor_model_parallel_all_reduce
)
from
vllm.distributed.parallel_state
import
(
ensure_model_parallel_initialized
,
init_distributed_environment
)
from
vllm.utils
import
get_distributed_init_method
,
get_open_port
_P
=
ParamSpec
(
"_P"
)
def reinitialize_neuron_runtime(f: Callable[_P, None]) -> Callable[_P, None]:
    """Decorator to reinitialize the Neuron Runtime before executing a test.
    This is necessary for distributed tests which need to reallocate Neuron
    Cores to separate subprocesses.

    The wrapper initializes the runtime, closes it with ``unsafe_close()``,
    runs the wrapped callable, and then initializes the runtime again. The
    final initialization runs even when the wrapped callable raises, and the
    exception still propagates to the caller.
    """

    @functools.wraps(f)
    def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> None:
        runtime = torch.classes.neuron.Runtime()
        runtime.initialize()
        runtime.unsafe_close()

        try:
            f(*args, **kwargs)
        finally:
            # Restore the runtime regardless of the test outcome so that a
            # failing test does not leave it closed for whatever runs next.
            runtime.initialize()

    return wrapper
def all_gather_test_worker(index, tp_degree, distributed_init_method):
    """Per-process body of the all-gather check.

    Rank ``r`` contributes a (2, 3, 4) float32 ramp scaled by ``r + 1``; the
    gathered result must equal the concatenation of all ranks' tensors along
    the last dimension.
    """
    init_distributed_environment(tp_degree,
                                 index,
                                 distributed_init_method,
                                 index,
                                 backend="xla")
    ensure_model_parallel_initialized(tp_degree, 1)

    num_dimensions = 3
    gather_dim = -1
    shape = list(range(2, num_dimensions + 2))
    numel = 1
    for extent in shape:
        numel *= extent

    per_rank = []
    for rank in range(tp_degree):
        ramp = torch.arange(numel, dtype=torch.float32, device="xla")
        per_rank.append(ramp.reshape(shape) * (rank + 1))

    expected = torch.cat(per_rank, dim=gather_dim)
    local = per_rank[index % tp_degree]
    gathered = tensor_model_parallel_all_gather(local, gather_dim)
    torch.testing.assert_close(gathered, expected)
def all_reduce_test_worker(index, tp_degree, distributed_init_method):
    """Per-process body of the all-reduce check.

    Rank ``r`` contributes an 8-element float32 ramp scaled by ``r + 1``; the
    reduced result must equal the element-wise sum over all ranks.
    """
    init_distributed_environment(tp_degree,
                                 index,
                                 distributed_init_method,
                                 index,
                                 backend="xla")
    ensure_model_parallel_initialized(tp_degree, 1)

    num_elements = 8
    per_rank = []
    for rank in range(tp_degree):
        ramp = torch.arange(num_elements, dtype=torch.float32, device="xla")
        per_rank.append(ramp * (rank + 1))

    stacked = torch.stack(per_rank, dim=0)
    expected = torch.sum(stacked, dim=0)
    local = per_rank[index % tp_degree]
    reduced = tensor_model_parallel_all_reduce(local)
    torch.testing.assert_close(reduced, expected)
@pytest.mark.parametrize("tp_size", [2])
@pytest.mark.parametrize("test_target",
                         [all_reduce_test_worker, all_gather_test_worker])
@reinitialize_neuron_runtime
def test_neuron_multi_process_tensor_parallel(monkeypatch, tp_size,
                                              test_target):
    """Spawn ``test_target`` through ``xmp.spawn`` with ``tp_size`` devices.

    The XLA "runtime is initialized" probe is patched to return False while
    the environment is configured and the workers are spawned.
    """
    runtime_probe = 'torch_xla._XLAC._xla_runtime_is_initialized'
    with patch(runtime_probe, return_value=False):
        distributed_init_method = get_distributed_init_method(
            "127.0.0.1", get_open_port())

        # One device per spawned process, e.g. "1,1" for tp_size == 2.
        devices_per_process = ','.join(['1'] * tp_size)
        env_overrides = (
            ("VLLM_USE_V1", "1"),
            ("NEURONCORE_NUM_DEVICES", str(tp_size)),
            ("NEURON_PJRT_PROCESSES_NUM_DEVICES", devices_per_process),
        )
        for env_name, env_value in env_overrides:
            monkeypatch.setenv(env_name, env_value)

        xmp.spawn(test_target, args=(tp_size, distributed_init_method))
tests/neuron/2_core/test_eagle.py
deleted
100644 → 0
View file @
33650733
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
json
import
os
import
shutil
import
tempfile
import
torch
from
huggingface_hub
import
snapshot_download
from
safetensors
import
safe_open
from
vllm
import
LLM
,
SamplingParams
def patch_eagle_draft_with_lm_head(target_model_id: str,
                                   draft_model_id: str) -> str:
    """Copy the target model's ``lm_head.weight`` into the draft checkpoint.

    Both repositories are downloaded into a temporary directory. The LM head
    tensor is located through the target's safetensors index, cast to
    float16, and stored in the draft's ``pytorch_model.bin``. The patched
    draft directory is then copied to ``/tmp/patched_eagle_draft``.

    Returns:
        The path of the patched draft directory.
    """
    # In NxDI, draft model checkpoint must include lm_head weights from target
    # model. For more details see https://awsdocs-neuron.readthedocs-hosted.com
    # /en/latest/libraries/nxd-inference/developer_guides/feature-guide.html
    # #eagle-checkpoint-compatibility
    final_draft_dir = "/tmp/patched_eagle_draft"
    lm_head_key = "lm_head.weight"

    with tempfile.TemporaryDirectory() as workdir:
        target_dir = snapshot_download(repo_id=target_model_id,
                                       local_dir=os.path.join(
                                           workdir, "target"))
        draft_dir = snapshot_download(repo_id=draft_model_id,
                                      local_dir=os.path.join(
                                          workdir, "draft"))

        # The index maps each weight name to the shard file that stores it.
        index_path = os.path.join(target_dir, "model.safetensors.index.json")
        with open(index_path) as index_file:
            weight_index = json.load(index_file)
        shard_path = os.path.join(target_dir,
                                  weight_index["weight_map"][lm_head_key])

        with safe_open(shard_path, framework="pt") as shard:
            target_lm_head = shard.get_tensor(lm_head_key)

        draft_path = os.path.join(draft_dir, "pytorch_model.bin")
        draft_state_dict = torch.load(draft_path, map_location="cpu")
        draft_state_dict[lm_head_key] = target_lm_head.to(torch.float16)
        torch.save(draft_state_dict, draft_path)

        # Must happen before the temporary directory is cleaned up.
        shutil.copytree(draft_dir, final_draft_dir, dirs_exist_ok=True)

    return final_draft_dir
def test_eagle():
    """End-to-end EAGLE speculation check with a fixed expected completion."""
    patched_draft_path = patch_eagle_draft_with_lm_head(
        target_model_id="meta-llama/Llama-2-7b-hf",
        draft_model_id="yuhuili/EAGLE-llama2-chat-7B")

    speculative_config = {
        "model": patched_draft_path,
        "num_speculative_tokens": 5,
        "max_model_len": 128
    }
    neuron_overrides = {
        "enable_eagle_speculation": True,
        "enable_fused_speculation": True,
        "fused_qkv": True
    }
    llm = LLM(
        model="meta-llama/Llama-2-7b-hf",
        speculative_config=speculative_config,
        max_num_seqs=1,
        max_model_len=128,
        tensor_parallel_size=2,
        override_neuron_config=neuron_overrides,
    )

    prompts = [
        "The president of the United States is",
    ]
    expected_output = (" the head of state and head of government of "
                       "the United States. The president direct")

    for output in llm.generate(prompts, SamplingParams(top_k=1)):
        generated_text = output.outputs[0].text
        print(f"Prompt: {output.prompt!r}, Generated text: {generated_text!r}")
        assert (expected_output == generated_text)

    print("Neuron Eagle speculation test passed.")
tests/neuron/2_core/test_mistral.py
deleted
100644 → 0
View file @
33650733
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
vllm
import
LLM
,
SamplingParams
def test_mistral():
    """Generate with Mistral-7B and compare against fixed completions.

    Six prompts are sent with per-prompt ``max_tokens`` from 10 to 60 and
    ``top_k=1``; every generated text must equal its expected string exactly.
    """
    llm = LLM(model="mistralai/Mistral-7B-v0.1",
              tensor_parallel_size=2,
              max_num_seqs=4,
              max_model_len=128,
              override_neuron_config={
                  "sequence_parallel_enabled": False,
                  "skip_warmup": True
              })

    # Send more prompts than the compiled batch size (4) and request
    # varying generation lengths to test accuracy related to Neuron
    # specific sequence id sorting.
    prompts = [
        "The president of the United States is",
        "The capital of France is",
        "What is Annapurna labs?",
        "I believe the meaning of life is",
        "Tell me a story about a brave knight",
        "Hello, my name is Llama",
    ]

    # One SamplingParams per prompt, in the same order as ``prompts``.
    sampling_params = [
        SamplingParams(top_k=1, max_tokens=10),
        SamplingParams(top_k=1, max_tokens=20),
        SamplingParams(top_k=1, max_tokens=30),
        SamplingParams(top_k=1, max_tokens=40),
        SamplingParams(top_k=1, max_tokens=50),
        SamplingParams(top_k=1, max_tokens=60)
    ]

    outputs = llm.generate(prompts, sampling_params)

    # Expected completions, in prompt order; compared by exact equality.
    expected_outputs = [
        " the most powerful person in the world. He is",
        " a city of many faces. It is a city of history, culture, art, "
        "fashion, and",
        "\n\nAnnapurna Labs is a semiconductor company that was founded "
        "in 2013 by Amazon. The company is",
        " to be happy.\n\nI believe that happiness is a choice.\n\nI "
        "believe that happiness is a state of mind.\n\nI believe that "
        "happiness is a journey.\n\nI believe",
        " who rescued a princess from a dragon.\n\nTell me a story about"
        " a princess who rescued herself from a dragon.\n\nTell me a "
        "story about a princess who rescued herself from a dragon and "
        "then rescued a knight from",
        " and I am a 10 year old male. I am a very friendly and "
        "affectionate boy who loves to be around people. I am a very "
        "active boy who loves to play and run around. I am a very smart "
        "boy who loves to learn new things. I am a very loyal boy"
    ]

    for expected_output, output in zip(expected_outputs, outputs):
        generated_text = output.outputs[0].text
        print(f"Prompt: {output.prompt!r}, Generated text: {generated_text!r}")
        assert (expected_output == generated_text)

    print("Neuron Mistral test passed.")
tests/neuron/2_core/test_multi_lora.py
deleted
100644 → 0
View file @
33650733
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
huggingface_hub
import
snapshot_download
from
vllm
import
LLM
,
SamplingParams
from
vllm.lora.request
import
LoRARequest
def test_llama_single_lora():
    """Run two prompts through one registered LoRA adapter and compare the
    generated text with fixed expected completions."""
    sql_lora_files = snapshot_download(
        repo_id="yard1/llama-2-7b-sql-lora-test")

    neuron_overrides = {
        "sequence_parallel_enabled": False,
        "skip_warmup": True,
        "lora_modules": [{
            "name": "lora_id_1",
            "path": sql_lora_files
        }]
    }
    llm = LLM(model="meta-llama/Llama-2-7b-hf",
              tensor_parallel_size=2,
              max_num_seqs=4,
              max_model_len=512,
              override_neuron_config=neuron_overrides,
              enable_lora=True,
              max_loras=1,
              max_lora_rank=256,
              device="neuron")

    # For multi-lora requests using NxDI as the backend, only the lora_name
    # needs to be specified. The lora_id and lora_path are supplied at the LLM
    # class/server initialization, after which the paths are handled by NxDI
    lora_req_1 = LoRARequest("lora_id_1", 0, " ")

    prompts = [
        "The president of the United States is",
        "The capital of France is",
    ]
    lora_requests = [lora_req_1, lora_req_1]
    outputs = llm.generate(prompts,
                           SamplingParams(top_k=1),
                           lora_request=lora_requests)

    expected_outputs = [
        " the head of state and head of government of the United States. "
        "The president direct",
        " a city of contrasts. The city is home to the Eiffel Tower"
    ]

    for expected_output, output in zip(expected_outputs, outputs):
        generated_text = output.outputs[0].text
        assert (expected_output == generated_text)
def test_llama_multiple_lora():
    """Run two prompts, each through its own registered LoRA adapter, and
    compare the generated text with fixed expected completions."""
    sql_lora_files = snapshot_download(
        repo_id="yard1/llama-2-7b-sql-lora-test")

    # Both adapter names point at the same downloaded LoRA files.
    lora_modules = [{
        "name": "lora_id_1",
        "path": sql_lora_files
    }, {
        "name": "lora_id_2",
        "path": sql_lora_files
    }]
    neuron_overrides = {
        "sequence_parallel_enabled": False,
        "skip_warmup": True,
        "lora_modules": lora_modules
    }
    llm = LLM(model="meta-llama/Llama-2-7b-hf",
              tensor_parallel_size=2,
              max_num_seqs=4,
              max_model_len=512,
              override_neuron_config=neuron_overrides,
              enable_lora=True,
              max_loras=2,
              max_lora_rank=256,
              device="neuron")

    # For multi-lora requests using NxDI as the backend, only the lora_name
    # needs to be specified. The lora_id and lora_path are supplied at the LLM
    # class/server initialization, after which the paths are handled by NxDI
    lora_req_1 = LoRARequest("lora_id_1", 0, " ")
    lora_req_2 = LoRARequest("lora_id_2", 1, " ")

    prompts = [
        "The president of the United States is",
        "The capital of France is",
    ]
    lora_requests = [lora_req_1, lora_req_2]
    outputs = llm.generate(prompts,
                           SamplingParams(top_k=1),
                           lora_request=lora_requests)

    expected_outputs = [
        " the head of state and head of government of the United States. "
        "The president direct",
        " a city of contrasts. The city is home to the Eiffel Tower"
    ]

    for expected_output, output in zip(expected_outputs, outputs):
        generated_text = output.outputs[0].text
        assert (expected_output == generated_text)
tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/__init__.py
View file @
38d80967
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
def
register_prithvi_india
():
return
"prithvi_io_processor.prithvi_processor.PrithviMultimodalDataProcessorIndia"
# noqa: E501
def
register_prithvi
_valencia
():
return
"prithvi_io_processor.prithvi_processor.PrithviMultimodalDataProcessor
Valencia
"
# noqa: E501
def
register_prithvi
():
return
"prithvi_io_processor.prithvi_processor.PrithviMultimodalDataProcessor"
# noqa: E501
tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/prithvi_processor.py
View file @
38d80967
...
...
@@ -8,7 +8,7 @@ import datetime
import
os
import
tempfile
import
urllib.request
from
collections.abc
import
AsyncGenerator
,
Sequence
from
collections.abc
import
Sequence
from
typing
import
Any
,
Optional
,
Union
import
albumentations
...
...
@@ -234,6 +234,8 @@ def load_image(
class
PrithviMultimodalDataProcessor
(
IOProcessor
):
indices
=
[
0
,
1
,
2
,
3
,
4
,
5
]
def
__init__
(
self
,
vllm_config
:
VllmConfig
):
super
().
__init__
(
vllm_config
)
...
...
@@ -359,14 +361,6 @@ class PrithviMultimodalDataProcessor(IOProcessor):
return
prompts
async
def
pre_process_async
(
self
,
prompt
:
IOProcessorInput
,
request_id
:
Optional
[
str
]
=
None
,
**
kwargs
,
)
->
Union
[
PromptType
,
Sequence
[
PromptType
]]:
return
self
.
pre_process
(
prompt
,
request_id
,
**
kwargs
)
def
post_process
(
self
,
model_output
:
Sequence
[
PoolingRequestOutput
],
...
...
@@ -420,30 +414,3 @@ class PrithviMultimodalDataProcessor(IOProcessor):
format
=
"tiff"
,
data
=
out_data
,
request_id
=
request_id
)
async
def
post_process_async
(
self
,
model_output
:
AsyncGenerator
[
tuple
[
int
,
PoolingRequestOutput
]],
request_id
:
Optional
[
str
]
=
None
,
**
kwargs
,
)
->
IOProcessorOutput
:
collected_output
=
[
item
async
for
i
,
item
in
model_output
]
return
self
.
post_process
(
collected_output
,
request_id
,
**
kwargs
)
class
PrithviMultimodalDataProcessorIndia
(
PrithviMultimodalDataProcessor
):
def
__init__
(
self
,
vllm_config
:
VllmConfig
):
super
().
__init__
(
vllm_config
)
self
.
indices
=
[
1
,
2
,
3
,
8
,
11
,
12
]
class
PrithviMultimodalDataProcessorValencia
(
PrithviMultimodalDataProcessor
):
def
__init__
(
self
,
vllm_config
:
VllmConfig
):
super
().
__init__
(
vllm_config
)
self
.
indices
=
[
0
,
1
,
2
,
3
,
4
,
5
]
tests/plugins/prithvi_io_processor_plugin/setup.py
View file @
38d80967
...
...
@@ -9,8 +9,7 @@ setup(
packages
=
[
"prithvi_io_processor"
],
entry_points
=
{
"vllm.io_processor_plugins"
:
[
"prithvi_to_tiff_india = prithvi_io_processor:register_prithvi_india"
,
# noqa: E501
"prithvi_to_tiff_valencia = prithvi_io_processor:register_prithvi_valencia"
,
# noqa: E501
"prithvi_to_tiff = prithvi_io_processor:register_prithvi"
,
# noqa: E501
]
},
)
tests/plugins_tests/test_io_processor_plugins.py
View file @
38d80967
...
...
@@ -7,12 +7,11 @@ import requests
from
tests.utils
import
RemoteOpenAIServer
from
vllm.config
import
VllmConfig
from
vllm.entrypoints.llm
import
LLM
from
vllm.entrypoints.openai.protocol
import
IOProcessorResponse
from
vllm.plugins.io_processors
import
get_io_processor
from
vllm.pooling_params
import
PoolingParams
MODEL_NAME
=
"
christian-pinto
/Prithvi-EO-2.0-300M-TL-
VLLM
"
MODEL_NAME
=
"
ibm-nasa-geospatial
/Prithvi-EO-2.0-300M-TL-
Sen1Floods11
"
image_url
=
"https://huggingface.co/christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM/resolve/main/valencia_example_2024-10-26.tiff"
# noqa: E501
...
...
@@ -23,61 +22,7 @@ def test_loading_missing_plugin():
get_io_processor
(
vllm_config
,
"wrong_plugin"
)
def
test_loading_engine_with_wrong_plugin
():
with
pytest
.
raises
(
ValueError
):
LLM
(
model
=
MODEL_NAME
,
skip_tokenizer_init
=
True
,
trust_remote_code
=
True
,
enforce_eager
=
True
,
# Limit the maximum number of parallel requests
# to avoid the model going OOM in CI.
max_num_seqs
=
32
,
io_processor_plugin
=
"wrong_plugin"
,
)
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
def
test_prithvi_mae_plugin_offline
(
vllm_runner
,
model_name
:
str
):
img_prompt
=
dict
(
data
=
image_url
,
data_format
=
"url"
,
image_format
=
"tiff"
,
out_data_format
=
"b64_json"
,
)
pooling_params
=
PoolingParams
(
task
=
"encode"
,
softmax
=
False
)
with
vllm_runner
(
model_name
,
runner
=
"pooling"
,
skip_tokenizer_init
=
True
,
trust_remote_code
=
True
,
enforce_eager
=
True
,
# Limit the maximum number of parallel requests
# to avoid the model going OOM in CI.
max_num_seqs
=
1
,
io_processor_plugin
=
"prithvi_to_tiff_valencia"
,
)
as
llm_runner
:
pooler_output
=
llm_runner
.
get_llm
().
encode
(
img_prompt
,
pooling_params
=
pooling_params
,
)
output
=
pooler_output
[
0
].
outputs
# verify the output is formatted as expected for this plugin
assert
all
(
hasattr
(
output
,
attr
)
for
attr
in
[
"type"
,
"format"
,
"data"
,
"request_id"
])
# We just check that the output is a valid base64 string.
# Raises an exception and fails the test if the string is corrupted.
base64
.
b64decode
(
output
.
data
)
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest
.
fixture
(
scope
=
"function"
)
def
server
():
args
=
[
"--runner"
,
...
...
@@ -90,7 +35,9 @@ def server():
"--max-num-seqs"
,
"32"
,
"--io-processor-plugin"
,
"prithvi_to_tiff_valencia"
"prithvi_to_tiff"
,
"--model-impl"
,
"terratorch"
,
]
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
remote_server
:
...
...
@@ -113,6 +60,7 @@ async def test_prithvi_mae_plugin_online(
},
"priority"
:
0
,
"model"
:
model_name
,
"softmax"
:
False
}
ret
=
requests
.
post
(
...
...
@@ -135,3 +83,43 @@ async def test_prithvi_mae_plugin_online(
# We just check that the output is a valid base64 string.
# Raises an exception and fails the test if the string is corrupted.
base64
.
b64decode
(
plugin_data
[
"data"
])
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_prithvi_mae_plugin_offline(vllm_runner, model_name: str):
    """Run the model offline through the ``prithvi_to_tiff`` IO plugin.

    A TIFF image referenced by URL is encoded with the pooling runner, then
    the plugin output is checked for the expected attributes and for a
    ``data`` payload that decodes as base64.
    """
    img_prompt = {
        "data": image_url,
        "data_format": "url",
        "image_format": "tiff",
        "out_data_format": "b64_json",
    }

    pooling_params = PoolingParams(task="encode", softmax=False)

    with vllm_runner(
            model_name,
            runner="pooling",
            skip_tokenizer_init=True,
            trust_remote_code=True,
            enforce_eager=True,
            # Limit the maximum number of parallel requests
            # to avoid the model going OOM in CI.
            max_num_seqs=1,
            model_impl="terratorch",
            io_processor_plugin="prithvi_to_tiff",
    ) as llm_runner:
        engine = llm_runner.get_llm()
        pooler_output = engine.encode(
            img_prompt,
            pooling_params=pooling_params,
        )
    output = pooler_output[0].outputs

    # verify the output is formatted as expected for this plugin
    for attr in ("type", "format", "data", "request_id"):
        assert hasattr(output, attr)

    # We just check that the output is a valid base64 string.
    # Raises an exception and fails the test if the string is corrupted.
    base64.b64decode(output.data)
tests/quantization/test_modelopt.py
View file @
38d80967
...
...
@@ -27,7 +27,7 @@ def use_v0_only(monkeypatch):
reason
=
"ModelOpt FP8 is not supported on this GPU type."
)
def
test_modelopt_fp8_checkpoint_setup
(
vllm_runner
):
"""Test ModelOpt FP8 checkpoint loading and structure validation."""
# TODO: provide a small public
al
ly available test checkpoint
# TODO: provide a small publicly available test checkpoint
model_path
=
(
"/home/scratch.omniml_data_1/zhiyu/ckpts/test_ckpts/"
"TinyLlama-1.1B-Chat-v1.0-fp8-0710"
)
...
...
tests/quantization/test_torchao.py
View file @
38d80967
...
...
@@ -75,5 +75,25 @@ def test_qwenvl_int8wo_model_loading_with_params(vllm_runner):
print
(
output
)
@pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available")
@pytest.mark.skip(
    # NOTE: each fragment ends with a space so the implicitly concatenated
    # literals form a readable sentence in the pytest skip report.
    reason="since torchao nightly is only compatible with torch nightly "
    "currently https://github.com/pytorch/ao/issues/2919, we'll have to skip "
    "torchao tests that requires newer versions (0.14.0.dev+) for now")
def test_opt_125m_awq_int4wo_model_loading_with_params(vllm_runner):
    """Load an AWQ + int4 weight-only torchao checkpoint and generate text.

    The test is unconditionally skipped for now (see the ``skip`` reason);
    when enabled it loads the checkpoint with ``quantization="torchao"`` and
    asserts that greedy generation returns a non-empty result.
    """
    # Clear any torch.compile state left over from previously run tests.
    torch._dynamo.reset()
    model_name = ("torchao-testing/opt-125m-AWQConfig-Int4WeightOnlyConfig-v2"
                  "-0.14.0.dev")
    with vllm_runner(model_name=model_name,
                     quantization="torchao",
                     dtype="bfloat16",
                     pt_load_map_location="cuda:0") as llm:
        output = llm.generate_greedy(["The capital of France is"],
                                     max_tokens=32)

    assert output
    print(output)
if __name__ == "__main__":
    # Allow running this test module directly as a script.
    pytest.main(args=[__file__])
tests/runai_model_streamer_test/test_runai_model_streamer_loader.py
View file @
38d80967
...
...
@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
vllm
import
SamplingParams
from
vllm.config
import
LoadConfig
from
vllm.config
.load
import
LoadConfig
from
vllm.model_executor.model_loader
import
get_model_loader
load_format
=
"runai_streamer"
...
...
tests/runai_model_streamer_test/test_runai_utils.py
0 → 100644
View file @
38d80967
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
glob
import
os
import
tempfile
import
huggingface_hub.constants
from
vllm.model_executor.model_loader.weight_utils
import
(
download_weights_from_hf
)
from
vllm.transformers_utils.runai_utils
import
(
is_runai_obj_uri
,
list_safetensors
)
def test_is_runai_obj_uri():
    """Check which URI schemes ``is_runai_obj_uri`` accepts.

    ``gs://`` and ``s3://`` URIs must be accepted; an ``nfs://`` URI must be
    rejected.
    """
    for object_store_uri in ("gs://some-gcs-bucket/path",
                             "s3://some-s3-bucket/path"):
        assert is_runai_obj_uri(object_store_uri)

    assert not is_runai_obj_uri("nfs://some-nfs-path")
def test_runai_list_safetensors_local():
    """Check ``list_safetensors`` against a locally downloaded checkpoint.

    The ``*.safetensors`` and ``*.json`` files of ``openai-community/gpt2``
    are downloaded into a temporary cache directory. ``list_safetensors`` is
    then called on the directory of the first safetensors file found, and
    must return as many entries as a recursive glob of the cache finds.
    """
    # The download needs hub access, so offline mode is switched off for the
    # duration of the test. The flag is process-wide, so the previous value is
    # restored afterwards to avoid leaking into other tests.
    previous_offline = huggingface_hub.constants.HF_HUB_OFFLINE
    huggingface_hub.constants.HF_HUB_OFFLINE = False
    try:
        with tempfile.TemporaryDirectory() as tmpdir:
            download_weights_from_hf("openai-community/gpt2",
                                     allow_patterns=["*.safetensors", "*.json"],
                                     cache_dir=tmpdir)
            safetensors = glob.glob(f"{tmpdir}/**/*.safetensors",
                                    recursive=True)
            assert len(safetensors) > 0
            # Directory holding the first discovered safetensors file.
            parentdir = os.path.dirname(safetensors[0])
            files = list_safetensors(parentdir)
            assert len(safetensors) == len(files)
    finally:
        huggingface_hub.constants.HF_HUB_OFFLINE = previous_offline
if __name__ == "__main__":
    # Allow running the tests of this module directly, without pytest.
    for _test_case in (test_is_runai_obj_uri,
                       test_runai_list_safetensors_local):
        _test_case()
tests/samplers/test_beam_search.py
View file @
38d80967
...
...
@@ -82,7 +82,7 @@ def test_beam_search_with_concurrency_limit(
beam_width
:
int
,
)
->
None
:
# example_prompts[1]&[3]&[7] fails due to unknown reason even without
# concurency limit. skip them for now.
# concur
r
ency limit. skip them for now.
example_prompts
=
(
example_prompts
[:
8
])
concurrency_limit
=
2
assert
len
(
example_prompts
)
>
concurrency_limit
...
...
tests/tensorizer_loader/test_tensorizer.py
View file @
38d80967
...
...
@@ -161,11 +161,11 @@ def test_load_without_tensorizer_load_format(vllm_runner, capfd, model_ref):
model
=
vllm_runner
(
model_ref
,
model_loader_extra_config
=
TensorizerConfig
(
tensorizer_uri
=
"test"
))
pytest
.
fail
(
"Expected RuntimeError for extra config keys"
)
except
RuntimeError
:
out
,
err
=
capfd
.
readouterr
()
combined_output
=
out
+
err
assert
(
"ValueError: Model loader extra config "
"is not supported for load "
assert
(
"ValueError: Unexpected extra config keys for load "
"format auto"
)
in
combined_output
finally
:
del
model
...
...
@@ -181,11 +181,12 @@ def test_raise_value_error_on_invalid_load_format(vllm_runner, capfd,
model_ref
,
load_format
=
"safetensors"
,
model_loader_extra_config
=
TensorizerConfig
(
tensorizer_uri
=
"test"
))
pytest
.
fail
(
"Expected RuntimeError for extra config keys"
)
except
RuntimeError
:
out
,
err
=
capfd
.
readouterr
()
combined_output
=
out
+
err
assert
(
"ValueError:
Model loader
extra config
is not supported
"
assert
(
"ValueError:
Unexpected
extra config
keys
"
"for load format safetensors"
)
in
combined_output
finally
:
del
model
...
...
tests/test_config.py
View file @
38d80967
...
...
@@ -6,8 +6,9 @@ from dataclasses import MISSING, Field, asdict, dataclass, field
import
pytest
from
vllm.compilation.backends
import
VllmBackend
from
vllm.config
import
(
LoadConfig
,
ModelConfig
,
PoolerConfig
,
VllmConfig
,
get_field
,
update_config
)
from
vllm.config
import
(
ModelConfig
,
PoolerConfig
,
VllmConfig
,
get_field
,
update_config
)
from
vllm.config.load
import
LoadConfig
from
vllm.model_executor.layers.pooler
import
PoolingType
from
vllm.platforms
import
current_platform
...
...
tests/tool_use/test_openai_tool_parser.py
0 → 100644
View file @
38d80967
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
json
import
pytest
from
openai_harmony
import
(
Conversation
,
DeveloperContent
,
HarmonyEncodingName
,
Message
,
Role
,
SystemContent
,
load_harmony_encoding
)
from
vllm.entrypoints.openai.protocol
import
FunctionCall
,
ToolCall
from
vllm.entrypoints.openai.tool_parsers
import
OpenAIToolParser
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
MODEL
=
"gpt2"
@pytest.fixture(scope="module")
def openai_tokenizer():
    """Module-scoped tokenizer loaded for ``MODEL``."""
    # The parser does not use the tokenizer, but the constructor requires it.
    tokenizer = get_tokenizer(MODEL)
    return tokenizer
@pytest.fixture
def openai_tool_parser(openai_tokenizer):
    """Fresh ``OpenAIToolParser`` built on the shared tokenizer fixture."""
    parser = OpenAIToolParser(openai_tokenizer)
    return parser
@pytest.fixture(scope="module")
def harmony_encoding():
    """Module-scoped Harmony encoding (``HARMONY_GPT_OSS``)."""
    encoding_name = HarmonyEncodingName.HARMONY_GPT_OSS
    return load_harmony_encoding(encoding_name)
def assert_tool_calls(
    actual_tool_calls: list[ToolCall],
    expected_tool_calls: list[ToolCall],
):
    """Assert that the extracted tool calls match the expected ones.

    Both lists must have the same length. Each actual call must carry a
    string ``id`` longer than 16 characters, have type ``"function"`` and a
    ``function`` equal to that of the expected call at the same position.
    Ids themselves are not compared.
    """
    assert len(actual_tool_calls) == len(expected_tool_calls)

    for position, produced in enumerate(actual_tool_calls):
        wanted = expected_tool_calls[position]
        assert isinstance(produced.id, str)
        assert len(produced.id) > 16  # Default from protocol.py
        assert produced.type == "function"
        assert produced.function == wanted.function
def test_extract_tool_calls_no_tools(openai_tool_parser, harmony_encoding):
    """A conversation without tool calls yields plain content only.

    The assistant reply is on the ``final`` channel, so the parser must
    report no tools called, an empty tool-call list, and the reply text as
    content.
    """
    system_msg = Message.from_role_and_content(
        Role.SYSTEM,
        SystemContent.new(),
    )
    developer_msg = Message.from_role_and_content(
        Role.DEVELOPER,
        DeveloperContent.new().with_instructions("Talk like a pirate!"))
    user_msg = Message.from_role_and_content(Role.USER, "Arrr, how be you?")
    final_msg = Message.from_role_and_content(
        Role.ASSISTANT, "This is a test").with_channel("final")

    convo = Conversation.from_messages(
        [system_msg, developer_msg, user_msg, final_msg])
    token_ids = harmony_encoding.render_conversation_for_completion(
        convo, Role.ASSISTANT)

    extracted_info = openai_tool_parser.extract_tool_calls(
        "",
        request=None,
        token_ids=token_ids,
    )

    assert not extracted_info.tools_called
    assert extracted_info.tool_calls == []
    assert extracted_info.content == "This is a test"
def test_extract_tool_calls_single_tool(openai_tool_parser, harmony_encoding):
    """A single function-call message is extracted as one tool call.

    The conversation holds a user question, an ``analysis`` channel message
    and one ``commentary`` message addressed to
    ``functions.get_current_weather``. The parser must report one matching
    tool call and no content.
    """
    user_msg = Message.from_role_and_content(Role.USER,
                                             "What is the weather in Tokyo?")
    analysis_msg = Message.from_role_and_content(
        Role.ASSISTANT,
        'User asks: "What is the weather in Tokyo?" We need to use get_current_weather tool.',  # noqa: E501
    ).with_channel("analysis")
    weather_call_msg = Message.from_role_and_content(
        Role.ASSISTANT, '{"location": "Tokyo"}')
    weather_call_msg = weather_call_msg.with_channel("commentary")
    weather_call_msg = weather_call_msg.with_recipient(
        "functions.get_current_weather")
    weather_call_msg = weather_call_msg.with_content_type("json")

    convo = Conversation.from_messages(
        [user_msg, analysis_msg, weather_call_msg])
    token_ids = harmony_encoding.render_conversation_for_completion(
        convo, Role.ASSISTANT)

    extracted_info = openai_tool_parser.extract_tool_calls(
        "",
        request=None,
        token_ids=token_ids,
    )

    assert extracted_info.tools_called

    weather_call = ToolCall(function=FunctionCall(
        name="get_current_weather",
        arguments=json.dumps({"location": "Tokyo"}),
    ))
    expected_tool_calls = [weather_call]
    assert_tool_calls(extracted_info.tool_calls, expected_tool_calls)
    assert extracted_info.content is None
def test_extract_tool_calls_multiple_tools(
    openai_tool_parser,
    harmony_encoding,
):
    """Two function-call messages are extracted as two tool calls, in order.

    The conversation holds a user question, an ``analysis`` channel message
    and two ``commentary`` messages addressed to
    ``functions.get_current_weather`` and ``functions.get_user_location``.
    The parser must report both calls and no content.
    """
    user_msg = Message.from_role_and_content(
        Role.USER, "What is the weather in Tokyo based on where I'm at?")
    analysis_msg = Message.from_role_and_content(
        Role.ASSISTANT,
        'User asks: "What is the weather in Tokyo?" based on their location. We need to use get_current_weather tool and get_user_location tool.',  # noqa: E501
    ).with_channel("analysis")

    weather_call_msg = Message.from_role_and_content(
        Role.ASSISTANT, '{"location": "Tokyo"}')
    weather_call_msg = weather_call_msg.with_channel("commentary")
    weather_call_msg = weather_call_msg.with_recipient(
        "functions.get_current_weather")
    weather_call_msg = weather_call_msg.with_content_type("json")

    location_call_msg = Message.from_role_and_content(
        Role.ASSISTANT, '{"location": "Tokyo"}')
    location_call_msg = location_call_msg.with_channel("commentary")
    location_call_msg = location_call_msg.with_recipient(
        "functions.get_user_location")
    location_call_msg = location_call_msg.with_content_type("json")

    convo = Conversation.from_messages(
        [user_msg, analysis_msg, weather_call_msg, location_call_msg])
    token_ids = harmony_encoding.render_conversation_for_completion(
        convo,
        Role.ASSISTANT,
    )

    extracted_info = openai_tool_parser.extract_tool_calls(
        "",
        request=None,
        token_ids=token_ids,
    )

    assert extracted_info.tools_called

    weather_call = ToolCall(function=FunctionCall(
        name="get_current_weather",
        arguments=json.dumps({"location": "Tokyo"}),
    ))
    location_call = ToolCall(function=FunctionCall(
        name="get_user_location",
        arguments=json.dumps({"location": "Tokyo"}),
    ))
    expected_tool_calls = [weather_call, location_call]
    assert_tool_calls(extracted_info.tool_calls, expected_tool_calls)
    assert extracted_info.content is None
tests/tpu/test_quantization_accuracy.py
View file @
38d80967
...
...
@@ -28,7 +28,7 @@ ACCURACY_CONFIGS = [
expected_value
=
0.76
),
# no bias
# NOTE(rob): We cannot re-initialize vLLM in the same process for TPU,
# so only one of these tests can run in a single call to pytest. As
# a follow
up, move this into the LM-EVAL section of the CI.
# a follow
-
up, move this into the LM-EVAL section of the CI.
# GSM8KAccuracyTestConfig(
# model_name="neuralmagic/Qwen2-7B-Instruct-quantized.w8a8",
# expected_value=0.66), # bias in QKV layers
...
...
Prev
1
…
12
13
14
15
16
17
18
19
20
…
28
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment