Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
fcfc474d
Commit
fcfc474d
authored
Apr 09, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.8.3' into v0.8.3-dev
parents
bb94d2e5
296c6572
Changes
503
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
882 additions
and
192 deletions
+882
-192
tests/models/test_initialization.py
tests/models/test_initialization.py
+4
-2
tests/models/test_registry.py
tests/models/test_registry.py
+10
-2
tests/models/test_transformers.py
tests/models/test_transformers.py
+5
-18
tests/models/test_utils.py
tests/models/test_utils.py
+79
-0
tests/multimodal/test_processing.py
tests/multimodal/test_processing.py
+4
-14
tests/multimodal/test_utils.py
tests/multimodal/test_utils.py
+65
-84
tests/neuron/1_core/test_cache.py
tests/neuron/1_core/test_cache.py
+3
-1
tests/neuron/1_core/test_prefix_prefill.py
tests/neuron/1_core/test_prefix_prefill.py
+5
-8
tests/quantization/test_bitsandbytes.py
tests/quantization/test_bitsandbytes.py
+0
-3
tests/quantization/test_compressed_tensors.py
tests/quantization/test_compressed_tensors.py
+72
-8
tests/quantization/test_cpu_offload.py
tests/quantization/test_cpu_offload.py
+9
-10
tests/quantization/test_fp8.py
tests/quantization/test_fp8.py
+20
-3
tests/reasoning/__init__.py
tests/reasoning/__init__.py
+0
-0
tests/reasoning/test_deepseekr1_reasoning_parser.py
tests/reasoning/test_deepseekr1_reasoning_parser.py
+105
-11
tests/reasoning/test_granite_reasoning_parser.py
tests/reasoning/test_granite_reasoning_parser.py
+347
-0
tests/reasoning/utils.py
tests/reasoning/utils.py
+1
-1
tests/spec_decode/e2e/test_integration_dist_tp2.py
tests/spec_decode/e2e/test_integration_dist_tp2.py
+70
-10
tests/spec_decode/e2e/test_integration_dist_tp4.py
tests/spec_decode/e2e/test_integration_dist_tp4.py
+4
-2
tests/test_utils.py
tests/test_utils.py
+75
-12
tests/tool_use/test_chat_completion_request_validations.py
tests/tool_use/test_chat_completion_request_validations.py
+4
-3
No files found.
tests/models/test_initialization.py
View file @
fcfc474d
...
...
@@ -54,8 +54,10 @@ def test_can_initialize(model_arch):
model_info
.
default
,
tokenizer
=
model_info
.
tokenizer
,
tokenizer_mode
=
model_info
.
tokenizer_mode
,
speculative_model
=
model_info
.
speculative_model
,
num_speculative_tokens
=
1
if
model_info
.
speculative_model
else
None
,
speculative_config
=
{
"model"
:
model_info
.
speculative_model
,
"num_speculative_tokens"
:
1
,
}
if
model_info
.
speculative_model
else
None
,
trust_remote_code
=
model_info
.
trust_remote_code
,
load_format
=
"dummy"
,
hf_overrides
=
hf_overrides
,
...
...
tests/models/test_registry.py
View file @
fcfc474d
...
...
@@ -23,6 +23,11 @@ from .registry import HF_EXAMPLE_MODELS
@
pytest
.
mark
.
parametrize
(
"model_arch"
,
ModelRegistry
.
get_supported_archs
())
def
test_registry_imports
(
model_arch
):
# Llama4ForCausalLM does not have a standalone model
if
model_arch
==
"Llama4ForCausalLM"
:
return
model_info
=
HF_EXAMPLE_MODELS
.
get_hf_info
(
model_arch
)
model_info
.
check_transformers_version
(
on_fail
=
"skip"
)
...
...
@@ -91,8 +96,11 @@ def test_registry_is_pp(model_arch, is_pp, init_cuda):
def
test_hf_registry_coverage
():
untested_archs
=
(
ModelRegistry
.
get_supported_archs
()
-
HF_EXAMPLE_MODELS
.
get_supported_archs
())
untested_archs
=
set
(
ModelRegistry
.
get_supported_archs
()
-
HF_EXAMPLE_MODELS
.
get_supported_archs
())
# Llama4ForCausalLM does not have a standalone model
untested_archs
.
discard
(
"Llama4ForCausalLM"
)
assert
not
untested_archs
,
(
"Please add the following architectures to "
...
...
tests/models/test_transformers.py
View file @
fcfc474d
...
...
@@ -3,8 +3,6 @@
Run `pytest tests/models/test_transformers.py`.
"""
from
contextlib
import
nullcontext
import
pytest
from
..conftest
import
HfRunner
,
VllmRunner
...
...
@@ -42,7 +40,6 @@ def check_implementation(
"model,model_impl"
,
[
(
"meta-llama/Llama-3.2-1B-Instruct"
,
"transformers"
),
(
"openai-community/gpt2"
,
"transformers"
),
(
"ArthurZ/Ilama-3.2-1B"
,
"auto"
),
# CUSTOM CODE
])
# trust_remote_code=True by default
def
test_models
(
...
...
@@ -52,20 +49,11 @@ def test_models(
model
:
str
,
model_impl
:
str
,
)
->
None
:
maybe_raises
=
nullcontext
()
if
model
==
"openai-community/gpt2"
and
model_impl
==
"transformers"
:
# Model is not backend compatible
maybe_raises
=
pytest
.
raises
(
ValueError
,
match
=
"The Transformers implementation.*not compatible with vLLM"
)
with
maybe_raises
:
check_implementation
(
hf_runner
,
vllm_runner
,
example_prompts
,
model
,
model_impl
=
model_impl
)
check_implementation
(
hf_runner
,
vllm_runner
,
example_prompts
,
model
,
model_impl
=
model_impl
)
@
multi_gpu_test
(
num_gpus
=
2
)
...
...
@@ -84,7 +72,6 @@ def test_distributed(
"meta-llama/Llama-3.2-1B-Instruct"
,
{
"quantization"
:
"bitsandbytes"
,
"load_format"
:
"bitsandbytes"
,
},
),
])
...
...
tests/models/test_utils.py
0 → 100644
View file @
fcfc474d
# SPDX-License-Identifier: Apache-2.0
import
torch
from
vllm.model_executor.models.utils
import
AutoWeightsLoader
class ModuleWithBatchNorm(torch.nn.Module):
    """Minimal module that owns a single ``BatchNorm1d`` layer.

    The layer is built for 2 features and is exposed as ``self.bn`` so that
    tests can inspect its running statistics directly.
    """

    def __init__(self):
        super().__init__()
        self.bn = torch.nn.BatchNorm1d(2)

    def forward(self, x):
        """Pass ``x`` through the batchnorm layer and return the result."""
        normalized = self.bn(x)
        return normalized
class ModuleWithNestedBatchNorm(torch.nn.Module):
    """Module whose only child is a ``ModuleWithBatchNorm``.

    The child is stored as ``self.nested_mod``, so the batchnorm layer sits
    one level deeper in the module tree (``nested_mod.bn``).
    """

    def __init__(self):
        super().__init__()
        self.nested_mod = ModuleWithBatchNorm()

    def forward(self, x):
        """Delegate the forward pass to the nested module."""
        result = self.nested_mod(x)
        return result
def test_module_with_batchnorm_can_load():
    """Check that AutoWeightsLoader copies batchnorm stats to a new module."""
    source = ModuleWithBatchNorm()
    # Push one batch through the module; the assertions below rely on this
    # having changed the running statistics and the batch counter.
    source(torch.Tensor([[1, 2], [3, 4]]))

    def weight_generator():
        # Stream the (name, tensor) pairs of the source module's state dict.
        for entry in source.state_dict().items():
            yield entry

    target = ModuleWithBatchNorm()
    src_bn = source.bn
    dst_bn = target.bn

    # A freshly built module must not already match the source statistics.
    assert not torch.equal(dst_bn.running_mean, src_bn.running_mean)
    assert not torch.equal(dst_bn.running_var, src_bn.running_var)
    assert dst_bn.num_batches_tracked.item() == 0

    AutoWeightsLoader(target).load_weights(weight_generator())

    # After loading, the statistics and batch counter mirror the source.
    assert torch.equal(dst_bn.running_mean, src_bn.running_mean)
    assert torch.equal(dst_bn.running_var, src_bn.running_var)
    assert dst_bn.num_batches_tracked.item() == 1
def test_module_with_child_containing_batchnorm_can_autoload():
    """Check that AutoWeightsLoader loads batchnorm stats of a nested child."""
    source = ModuleWithNestedBatchNorm()
    # Push one batch through the module; the assertions below rely on this
    # having changed the running statistics and the batch counter.
    source(torch.Tensor([[1, 2], [3, 4]]))

    def weight_generator():
        # Stream the (name, tensor) pairs of the source module's state dict.
        for entry in source.state_dict().items():
            yield entry

    target = ModuleWithNestedBatchNorm()
    src_bn = source.nested_mod.bn
    dst_bn = target.nested_mod.bn

    # A freshly built module must not already match the source statistics.
    assert not torch.equal(dst_bn.running_mean, src_bn.running_mean)
    assert not torch.equal(dst_bn.running_var, src_bn.running_var)
    assert dst_bn.num_batches_tracked.item() == 0

    AutoWeightsLoader(target).load_weights(weight_generator())

    # After loading, the nested statistics and counter mirror the source.
    assert torch.equal(dst_bn.running_mean, src_bn.running_mean)
    assert torch.equal(dst_bn.running_var, src_bn.running_var)
    assert dst_bn.num_batches_tracked.item() == 1
tests/multimodal/test_processing.py
View file @
fcfc474d
...
...
@@ -28,8 +28,7 @@ from vllm.multimodal.processing import (PlaceholderFeaturesInfo,
replace_token_matches
)
# yapf: enable
from
vllm.multimodal.profiling
import
MultiModalProfiler
from
vllm.transformers_utils.tokenizer
import
(
AnyTokenizer
,
cached_tokenizer_from_config
)
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
from
vllm.utils
import
full_groupby
from
.utils
import
random_image
...
...
@@ -955,10 +954,7 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):
limit_mm_per_prompt
=
limit_mm_per_prompt
,
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
model_config
,
tokenizer
=
cached_tokenizer_from_config
(
model_config
),
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
model_config
)
profiler
=
MultiModalProfiler
(
processor
)
mock_supported_mm_limits
=
MagicMock
(
return_value
=
{
"image"
:
num_supported
})
...
...
@@ -994,10 +990,7 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid):
limit_mm_per_prompt
=
limit_mm_per_prompt
,
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
model_config
,
tokenizer
=
cached_tokenizer_from_config
(
model_config
),
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
model_config
)
rng
=
np
.
random
.
RandomState
(
0
)
image
=
random_image
(
rng
,
min_wh
=
128
,
max_wh
=
256
)
...
...
@@ -1066,10 +1059,7 @@ def test_hf_processor_kwargs(model_id, call_kwargs, expected_kwargs):
revision
=
None
,
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
model_config
,
tokenizer
=
cached_tokenizer_from_config
(
model_config
),
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
model_config
)
orig_get_hf_processor
=
processor
.
info
.
get_hf_processor
def
get_hf_processor
(
self
,
**
kwargs
):
...
...
tests/multimodal/test_utils.py
View file @
fcfc474d
...
...
@@ -11,12 +11,10 @@ import pytest
import
os
from
PIL
import
Image
,
ImageChops
from
transformers
import
AutoConfig
,
AutoTokenizer
from
vllm.multimodal.inputs
import
PlaceholderRange
from
vllm.multimodal.utils
import
(
MediaConnector
,
merge_and_sort_multimodal_metadata
,
repeat_and_pad_placeholder_tokens
)
merge_and_sort_multimodal_metadata
)
from
..utils
import
models_path_prefix
,
urls_port
if
TYPE_CHECKING
:
...
...
@@ -139,71 +137,6 @@ async def test_fetch_image_local_files(image_url: str):
f
"file://
{
temp_dir
}
/../
{
os
.
path
.
basename
(
image_url
)
}
"
)
@
pytest
.
mark
.
parametrize
(
"model"
,
[
os
.
path
.
join
(
models_path_prefix
,
"llava-hf/llava-v1.6-mistral-7b-hf"
)])
def
test_repeat_and_pad_placeholder_tokens
(
model
):
config
=
AutoConfig
.
from_pretrained
(
model
)
image_token_id
=
config
.
image_token_index
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model
)
test_cases
=
[
(
"<image>"
,
2
,
"<image><image>"
,
[
32000
,
32000
],
[{
"offset"
:
0
,
"length"
:
2
}],
),
(
"<image><image>"
,
2
,
"<image><image><image>"
,
[
32000
,
32000
,
32000
],
[{
"offset"
:
0
,
"length"
:
2
}],
),
(
"<image><image>"
,
[
3
,
2
],
"<image><image><image><image><image>"
,
[
32000
,
32000
,
32000
,
32000
,
32000
],
[{
"offset"
:
0
,
"length"
:
3
},
{
"offset"
:
3
,
"length"
:
2
}],
),
(
"Image:<image>Image:<image>!"
,
[
3
,
2
],
"Image:<image><image><image>Image:<image><image>!"
,
[
9833
,
28747
,
32000
,
32000
,
32000
,
9833
,
28747
,
32000
,
32000
,
918
],
[{
"offset"
:
2
,
"length"
:
3
},
{
"offset"
:
7
,
"length"
:
2
}],
),
(
"<image>"
,
[
3
,
2
],
"<image><image><image>"
,
[
32000
,
32000
,
32000
],
[{
"offset"
:
0
,
"length"
:
3
}],
),
]
# yapf: disable
for
(
prompt
,
repeat_count
,
expected_prompt
,
expected_token_ids
,
expected_ranges
,
)
in
test_cases
:
new_prompt
,
new_token_ids
,
ranges
=
repeat_and_pad_placeholder_tokens
(
tokenizer
=
tokenizer
,
prompt
=
prompt
,
prompt_token_ids
=
tokenizer
.
encode
(
prompt
,
add_special_tokens
=
False
),
placeholder_token_id
=
image_token_id
,
repeat_count
=
repeat_count
,
)
assert
new_prompt
==
expected_prompt
assert
new_token_ids
==
expected_token_ids
assert
ranges
==
expected_ranges
# Used for the next two tests related to `merge_and_sort_multimodal_metadata`.
class
TestCase
(
NamedTuple
):
mm_positions
:
"MultiModalPlaceholderDict"
...
...
@@ -225,7 +158,7 @@ def test_merge_and_sort_multimodal_metadata():
]
},
mm_hashes
=
{
"image"
:
[
"hash1"
,
"hash2"
]},
expected_modalities
=
[
"image"
],
expected_modalities
=
[
"image"
,
"image"
],
expected_ranges
=
[
PlaceholderRange
(
offset
=
0
,
length
=
2
),
PlaceholderRange
(
offset
=
3
,
length
=
2
),
...
...
@@ -242,7 +175,7 @@ def test_merge_and_sort_multimodal_metadata():
]
},
mm_hashes
=
None
,
expected_modalities
=
[
"image"
],
expected_modalities
=
[
"image"
,
"image"
],
expected_ranges
=
[
PlaceholderRange
(
offset
=
0
,
length
=
2
),
PlaceholderRange
(
offset
=
2
,
length
=
2
),
...
...
@@ -267,7 +200,7 @@ def test_merge_and_sort_multimodal_metadata():
"image"
:
[
"image_hash1"
,
"image_hash2"
],
"audio"
:
[
"audio_hash1"
,
"audio_hash2"
],
},
expected_modalities
=
[
"audio"
,
"image"
],
expected_modalities
=
[
"audio"
,
"audio"
,
"image"
,
"image"
],
expected_ranges
=
[
PlaceholderRange
(
offset
=
0
,
length
=
2
),
PlaceholderRange
(
offset
=
2
,
length
=
3
),
...
...
@@ -293,7 +226,7 @@ def test_merge_and_sort_multimodal_metadata():
]
},
mm_hashes
=
None
,
expected_modalities
=
[
"audio"
,
"image"
],
expected_modalities
=
[
"audio"
,
"audio"
,
"image"
,
"image"
],
expected_ranges
=
[
PlaceholderRange
(
offset
=
0
,
length
=
2
),
PlaceholderRange
(
offset
=
2
,
length
=
3
),
...
...
@@ -324,7 +257,9 @@ def test_merge_and_sort_multimodal_metadata():
"audio"
:
[
"audio_hash1"
],
"video"
:
[
"video_hash1"
,
"video_hash2"
,
"video_hash3"
]
},
expected_modalities
=
[
"audio"
,
"video"
,
"image"
],
expected_modalities
=
[
"audio"
,
"video"
,
"video"
,
"video"
,
"image"
,
"image"
],
expected_ranges
=
[
PlaceholderRange
(
offset
=
0
,
length
=
2
),
PlaceholderRange
(
offset
=
3
,
length
=
4
),
...
...
@@ -370,12 +305,19 @@ def test_merge_and_sort_multimodal_metadata_with_interleaving():
"image"
:
[
"image_hash1"
,
"image_hash2"
],
"audio"
:
[
"audio_hash1"
,
"audio_hash2"
],
},
expected_modalities
=
[],
expected_ranges
=
[],
expected_hashes
=
None
,
expected_modalities
=
[
"image"
,
"audio"
,
"image"
,
"audio"
],
expected_ranges
=
[
PlaceholderRange
(
offset
=
0
,
length
=
4
),
PlaceholderRange
(
offset
=
5
,
length
=
2
),
PlaceholderRange
(
offset
=
8
,
length
=
2
),
PlaceholderRange
(
offset
=
11
,
length
=
4
),
],
expected_hashes
=
[
"image_hash1"
,
"audio_hash1"
,
"image_hash2"
,
"audio_hash2"
],
),
# <image> <image> <
video> <audi
o> <image>
# <image> <image> <
audio> <vide
o> <image>
TestCase
(
mm_positions
=
{
"image"
:
[
...
...
@@ -391,15 +333,54 @@ def test_merge_and_sort_multimodal_metadata_with_interleaving():
]
},
mm_hashes
=
None
,
expected_modalities
=
[],
expected_ranges
=
[],
expected_modalities
=
[
"image"
,
"image"
,
"audio"
,
"video"
,
"image"
],
expected_ranges
=
[
PlaceholderRange
(
offset
=
0
,
length
=
2
),
PlaceholderRange
(
offset
=
2
,
length
=
3
),
PlaceholderRange
(
offset
=
5
,
length
=
2
),
PlaceholderRange
(
offset
=
8
,
length
=
5
),
PlaceholderRange
(
offset
=
20
,
length
=
4
),
],
expected_hashes
=
None
,
),
# <image> <audio> <video> <image> with hashes
TestCase
(
mm_positions
=
{
"image"
:
[
PlaceholderRange
(
offset
=
0
,
length
=
2
),
PlaceholderRange
(
offset
=
18
,
length
=
4
),
],
"audio"
:
[
PlaceholderRange
(
offset
=
6
,
length
=
2
),
],
"video"
:
[
PlaceholderRange
(
offset
=
10
,
length
=
5
),
]
},
mm_hashes
=
{
"image"
:
[
"image_hash1"
,
"image_hash2"
],
"audio"
:
[
"audio_hash1"
],
"video"
:
[
"video_hash1"
],
},
expected_modalities
=
[
"image"
,
"audio"
,
"video"
,
"image"
],
expected_ranges
=
[
PlaceholderRange
(
offset
=
0
,
length
=
2
),
PlaceholderRange
(
offset
=
6
,
length
=
2
),
PlaceholderRange
(
offset
=
10
,
length
=
5
),
PlaceholderRange
(
offset
=
18
,
length
=
4
),
],
expected_hashes
=
[
"image_hash1"
,
"audio_hash1"
,
"video_hash1"
,
"image_hash2"
],
),
]
for
case
in
test_cas
es
:
with
pytest
.
raises
(
ValueError
)
as
ex_info
:
merge_and_sort_multimodal_metadata
(
case
.
mm_positions
,
case
.
mm_hashes
)
for
(
mm_positions
,
mm_hashes
,
expected_modalities
,
expected_rang
es
,
expected_hashes
)
in
test_cases
:
modalities
,
ranges
,
hashes
=
merge_and_sort_multimodal_metadata
(
mm_positions
,
mm_hashes
)
assert
"Interleaved mixed-modality"
in
str
(
ex_info
.
value
)
assert
modalities
==
expected_modalities
assert
ranges
==
expected_ranges
assert
hashes
==
expected_hashes
tests/neuron/1_core/test_cache.py
View file @
fcfc474d
...
...
@@ -64,9 +64,11 @@ def test_reshape_and_cache(num_tokens, n_kv_head, d_head, num_blocks,
key_cache
=
torch
.
zeros_like
(
key_cache_cpu
,
device
=
device
)
value_cache
=
torch
.
zeros_like
(
value_cache_cpu
,
device
=
device
)
slot_mapping
=
slot_mapping_cpu
.
to
(
device
)
kv_cache
=
torch
.
stack
([
key_cache
,
value_cache
])
# Run vectorized implementation on XLA device
reshape_and_cache
(
key
,
value
,
key_cache
,
value_cache
,
slot_mapping
)
reshape_and_cache
(
key
,
value
,
kv_cache
,
slot_mapping
)
key_cache
,
value_cache
=
torch
.
unbind
(
kv_cache
,
dim
=
0
)
# Move results back to CPU for comparison
key_cache_result
=
key_cache
.
cpu
()
...
...
tests/neuron/1_core/test_prefix_prefill.py
View file @
fcfc474d
...
...
@@ -258,13 +258,13 @@ def sample_inputs(
value
[
start_loc
:
end_loc
])
cur_ctx
+=
block_size
block_id
+=
1
kv_cache
=
torch
.
stack
([
k_cache
,
v_cache
])
return
(
query
,
k
,
v
,
k_cache
,
v_cache
,
kv_cache
,
block_table
,
key
,
value
,
...
...
@@ -361,8 +361,7 @@ def test_contexted_kv_attention(
query
,
k_active
,
v_active
,
k_cache
,
v_cache
,
kv_cache
,
block_table
,
key
,
value
,
...
...
@@ -439,8 +438,7 @@ def test_contexted_kv_attention(
query
=
query
.
unsqueeze
(
0
).
permute
(
0
,
2
,
3
,
1
).
contiguous
()
k
=
k
.
unsqueeze
(
0
).
permute
(
0
,
2
,
3
,
1
).
contiguous
()
v
=
v
.
unsqueeze
(
0
).
permute
(
0
,
2
,
1
,
3
).
contiguous
()
k_cache
=
k_cache
.
permute
(
0
,
2
,
1
,
3
).
contiguous
()
v_cache
=
v_cache
.
permute
(
0
,
2
,
1
,
3
).
contiguous
()
kv_cache
=
kv_cache
.
permute
(
0
,
1
,
3
,
2
,
4
).
contiguous
()
# transform block table
active_block_table
=
get_active_block_tables
(
...
...
@@ -487,8 +485,7 @@ def test_contexted_kv_attention(
query
.
to
(
device
=
device
),
k
.
to
(
device
=
device
),
v
.
to
(
device
=
device
),
k_cache
.
to
(
device
=
device
),
v_cache
.
to
(
device
=
device
),
kv_cache
.
to
(
device
=
device
),
active_block_table
.
to
(
device
=
device
),
attn_mask
.
to
(
device
=
device
),
)
...
...
tests/quantization/test_bitsandbytes.py
View file @
fcfc474d
...
...
@@ -105,8 +105,6 @@ def test_load_pp_4bit_bnb_model(model_name, description) -> None:
"--enable-prefix-caching"
,
"--quantization"
,
"bitsandbytes"
,
"--load-format"
,
"bitsandbytes"
,
"--gpu-memory-utilization"
,
"0.7"
,
]
...
...
@@ -141,7 +139,6 @@ def validate_generated_texts(hf_runner,
# when using distributed inference
with
vllm_runner
(
model_name
,
quantization
=
'bitsandbytes'
,
load_format
=
'bitsandbytes'
,
tensor_parallel_size
=
vllm_tp_size
,
enforce_eager
=
False
)
as
llm
:
vllm_outputs
=
llm
.
generate_greedy
(
prompts
,
8
)
...
...
tests/quantization/test_compressed_tensors.py
View file @
fcfc474d
...
...
@@ -23,6 +23,23 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
from
vllm.platforms
import
current_platform
from
..utils
import
models_path_prefix
# AITER only supports per-channel-per-channel INT8 gemm
# and per-tensor-per-tensor INT8 GEMM.
# It does not support mix precision MM and mix quantization scheme.
ROCM_AITER_SUPPORTED_INT8_MODEL
=
[
"neuralmagic/Llama-3.2-1B-quantized.w8a8"
,
"nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2"
]
# TritonScaledMMLinearKernel only supports symmetric quantization.
ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL
=
[
"nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"
,
"nm-testing/tinyllama-oneshot-w8-channel-a8-tensor"
,
"neuralmagic/Llama-3.2-1B-quantized.w8a8"
,
"nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2"
,
"nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2"
,
]
@
pytest
.
fixture
(
scope
=
"function"
,
autouse
=
True
)
def
use_v0_only
(
monkeypatch
):
...
...
@@ -60,6 +77,11 @@ def use_v0_only(monkeypatch):
)
def
test_compressed_tensors_w8a8_static_setup
(
vllm_runner
,
model_args
):
model_path
,
strategy
,
quant_type
,
shape_0
,
is_symmetric
=
model_args
if
current_platform
.
is_rocm
(
)
and
model_path
not
in
ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL
:
pytest
.
skip
(
f
"Skip model
{
model_path
}
as it is not support on ROCm."
)
with
vllm_runner
(
model_path
,
enforce_eager
=
True
)
as
llm
:
def
check_model
(
model
):
...
...
@@ -126,6 +148,8 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
)
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
32
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
10
])
@
pytest
.
mark
.
parametrize
(
"use_aiter"
,
[
True
,
False
]
if
current_platform
.
is_rocm
()
else
[
False
])
def
test_compressed_tensors_w8a8_logprobs
(
hf_runner
,
vllm_runner
,
...
...
@@ -133,7 +157,21 @@ def test_compressed_tensors_w8a8_logprobs(
model_path
,
max_tokens
,
num_logprobs
,
use_aiter
,
monkeypatch
,
):
if
current_platform
.
is_rocm
(
)
and
model_path
not
in
ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL
:
pytest
.
skip
(
f
"Skip model
{
model_path
}
as it is not support on ROCm."
)
if
use_aiter
:
if
model_path
not
in
ROCM_AITER_SUPPORTED_INT8_MODEL
:
pytest
.
skip
(
f
"Skip model
{
model_path
}
as it is not support by aiter."
)
# this will enable VLLM_ROCM_USE_AITER_LINEAR
monkeypatch
.
setenv
(
"VLLM_ROCM_USE_AITER"
,
"1"
)
dtype
=
"bfloat16"
# skip language translation prompt for the static per tensor asym model
...
...
@@ -157,6 +195,9 @@ def test_compressed_tensors_w8a8_logprobs(
name_1
=
"vllm"
,
)
if
current_platform
.
is_rocm
():
torch
.
cuda
.
synchronize
()
def
test_compressed_tensors_no_enforce_eager
(
vllm_runner
):
model_path
=
os
.
path
.
join
(
models_path_prefix
,
"nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"
)
...
...
@@ -180,8 +221,27 @@ def test_compressed_tensors_no_enforce_eager(vllm_runner):
),
],
)
def
test_compressed_tensors_w8a8_dynamic_per_token
(
vllm_runner
,
model_args
):
@
pytest
.
mark
.
parametrize
(
"use_aiter"
,
[
True
,
False
]
if
current_platform
.
is_rocm
()
else
[
False
])
def
test_compressed_tensors_w8a8_dynamic_per_token
(
vllm_runner
,
model_args
,
use_aiter
,
monkeypatch
,
):
model_path
,
strategy
=
model_args
if
current_platform
.
is_rocm
(
)
and
model_path
not
in
ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL
:
pytest
.
skip
(
f
"Skip model
{
model_path
}
as it is not support on ROCm."
)
if
use_aiter
:
if
model_path
not
in
ROCM_AITER_SUPPORTED_INT8_MODEL
:
pytest
.
skip
(
f
"Skip model
{
model_path
}
as it is not support by aiter."
)
# this will enable VLLM_ROCM_USE_AITER_LINEAR
monkeypatch
.
setenv
(
"VLLM_ROCM_USE_AITER"
,
"1"
)
with
vllm_runner
(
model_path
,
dtype
=
torch
.
float16
)
as
llm
:
def
check_model
(
model
):
...
...
@@ -212,6 +272,8 @@ def test_compressed_tensors_w8a8_dynamic_per_token(vllm_runner, model_args):
(
os
.
path
.
join
(
models_path_prefix
,
"nm-testing/tinyllama-oneshot-w8a16-per-channel"
),
"channel"
,
None
,
4
),
],
)
@
pytest
.
mark
.
skipif
(
not
current_platform
.
is_cuda
(),
reason
=
"The tests are skipped on non-CUDA platform."
)
def
test_compressed_tensors_wNa16
(
vllm_runner
,
wNa16_args
):
model
,
strategy
,
group
,
pack_factor
=
wNa16_args
with
vllm_runner
(
model
)
as
llm
:
...
...
@@ -236,8 +298,8 @@ def test_compressed_tensors_wNa16(vllm_runner, wNa16_args):
assert
output
@
pytest
.
mark
.
skipif
(
current_platform
(),
reason
=
"
W4A16 MARLIN is not supported on ROC
m."
)
@
pytest
.
mark
.
skipif
(
not
current_platform
.
is_cuda
(),
reason
=
"
This test is skipped on non-CUDA platfor
m."
)
def
test_compressed_tensors_w4a16_marlin24
(
vllm_runner
):
model_path
=
os
.
path
.
join
(
models_path_prefix
,
"nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t"
)
with
vllm_runner
(
model_path
)
as
llm
:
...
...
@@ -280,7 +342,7 @@ def test_compressed_tensors_fp8(vllm_runner):
if
isinstance
(
qkv_proj
.
scheme
,
CompressedTensorsW8A8Fp8
):
assert
len
(
qkv_proj
.
input_scale
.
shape
)
==
0
assert
qkv_proj
.
weight
.
dtype
is
torch
.
float8_e4m3fn
assert
qkv_proj
.
weight
.
dtype
is
current_platform
.
fp8_dtype
()
assert
qkv_proj
.
weight_scale
.
dtype
is
torch
.
float32
assert
len
(
qkv_proj
.
weight_scale
.
shape
)
==
0
...
...
@@ -290,8 +352,8 @@ def test_compressed_tensors_fp8(vllm_runner):
assert
output
@
pytest
.
mark
.
skipif
(
current_platform
(),
reason
=
"
FP8 KV cache is not supported on ROC
m."
)
@
pytest
.
mark
.
skipif
(
not
current_platform
.
is_cuda
(),
reason
=
"
This test is skipped on non-CUDA platfor
m."
)
def
test_compressed_tensors_kv_cache
(
vllm_runner
):
model_path
=
os
.
path
.
join
(
models_path_prefix
,
"nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme"
)
with
vllm_runner
(
model_path
,
kv_cache_dtype
=
"fp8"
)
as
llm
:
...
...
@@ -320,7 +382,8 @@ def _test_2of4_quant_models(qkv_proj,
@
pytest
.
mark
.
skipif
(
not
current_platform
.
has_device_capability
(
90
),
not
current_platform
.
is_cuda
()
or
not
current_platform
.
has_device_capability
(
90
),
reason
=
"Sparse FP8 is not yet supported on this GPU type."
,
)
@
pytest
.
mark
.
parametrize
(
...
...
@@ -367,7 +430,8 @@ def test_compressed_tensors_2of4_quant_fp8(vllm_runner, args_2of4):
@
pytest
.
mark
.
skipif
(
not
current_platform
.
has_device_capability
(
90
),
not
current_platform
.
is_cuda
()
or
not
current_platform
.
has_device_capability
(
90
),
reason
=
"Sparse FP8 is not yet supported on this GPU type."
,
)
@
pytest
.
mark
.
parametrize
(
...
...
tests/quantization/test_cpu_offload.py
View file @
fcfc474d
...
...
@@ -12,13 +12,6 @@ from ..utils import compare_two_settings, models_path_prefix
from
vllm.platforms
import
current_platform
@
pytest
.
fixture
(
scope
=
"function"
,
autouse
=
True
)
def
use_v0_only
(
monkeypatch
):
# Fall back to V0 if cpu offloading is enabled.
# Fixture is required to that baseline uses V0.
monkeypatch
.
setenv
(
'VLLM_USE_V1'
,
'0'
)
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"fp8"
)
or
current_platform
.
is_rocm
(),
reason
=
"fp8 is not supported on this GPU type."
)
def
test_cpu_offload_fp8
():
...
...
@@ -35,7 +28,9 @@ def test_cpu_offload_fp8():
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"gptq_marlin"
)
or
current_platform
.
is_rocm
(),
reason
=
"gptq_marlin is not supported on this GPU type."
)
def
test_cpu_offload_gptq
():
def
test_cpu_offload_gptq
(
monkeypatch
):
# This quant method is sensitive to dummy weights, so we force real weights
monkeypatch
.
setenv
(
'VLLM_TEST_FORCE_LOAD_FORMAT'
,
'auto'
)
# Test GPTQ Marlin
compare_two_settings
(
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4"
),
[],
[
"--cpu-offload-gb"
,
"1"
],
...
...
@@ -49,7 +44,9 @@ def test_cpu_offload_gptq():
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"awq_marlin"
)
or
current_platform
.
is_rocm
(),
reason
=
"awq_marlin is not supported on this GPU type."
)
def
test_cpu_offload_awq
():
def
test_cpu_offload_awq
(
monkeypatch
):
# This quant method is sensitive to dummy weights, so we force real weights
monkeypatch
.
setenv
(
'VLLM_TEST_FORCE_LOAD_FORMAT'
,
'auto'
)
# Test AWQ Marlin
compare_two_settings
(
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2-1.5B-Instruct-AWQ"
),
[],
[
"--cpu-offload-gb"
,
"1"
],
...
...
@@ -63,7 +60,9 @@ def test_cpu_offload_awq():
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"gptq_marlin"
)
or
current_platform
.
is_rocm
(),
reason
=
"gptq_marlin is not supported on this GPU type."
)
def
test_cpu_offload_compressed_tensors
():
def
test_cpu_offload_compressed_tensors
(
monkeypatch
):
# This quant method is sensitive to dummy weights, so we force real weights
monkeypatch
.
setenv
(
'VLLM_TEST_FORCE_LOAD_FORMAT'
,
'auto'
)
# Test wNa16
compare_two_settings
(
os
.
path
.
join
(
models_path_prefix
,
"nm-testing/tinyllama-oneshot-w4a16-channel-v2"
),
[],
[
"--cpu-offload-gb"
,
"1"
],
...
...
tests/quantization/test_fp8.py
View file @
fcfc474d
...
...
@@ -25,8 +25,14 @@ MODELS = [
reason
=
"FP8 is not supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"model_id"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"force_marlin"
,
[
False
,
True
])
@
pytest
.
mark
.
parametrize
(
"use_rocm_aiter"
,
[
True
,
False
]
if
current_platform
.
is_rocm
()
else
[
False
])
def
test_model_load_and_run
(
vllm_runner
,
model_id
:
str
,
force_marlin
:
bool
,
monkeypatch
)
->
None
:
use_rocm_aiter
:
bool
,
monkeypatch
)
->
None
:
if
use_rocm_aiter
:
monkeypatch
.
setenv
(
"VLLM_ROCM_USE_AITER"
,
"1"
)
if
force_marlin
:
monkeypatch
.
setenv
(
"VLLM_TEST_FORCE_FP8_MARLIN"
,
"1"
)
...
...
@@ -49,7 +55,13 @@ KV_CACHE_MODELS = [
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"fp8"
)
or
current_platform
.
is_rocm
(),
reason
=
"FP8 is not supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"model_id"
,
KV_CACHE_MODELS
)
def
test_kv_cache_model_load_and_run
(
vllm_runner
,
model_id
:
str
,
monkeypatch
):
@
pytest
.
mark
.
parametrize
(
"use_rocm_aiter"
,
[
True
,
False
]
if
current_platform
.
is_rocm
()
else
[
False
])
def
test_kv_cache_model_load_and_run
(
vllm_runner
,
model_id
:
str
,
use_rocm_aiter
:
bool
,
monkeypatch
):
if
use_rocm_aiter
:
monkeypatch
.
setenv
(
"VLLM_ROCM_USE_AITER"
,
"1"
)
# vllm_runner.apply_model() relies on V0 internals.
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"0"
)
with
vllm_runner
(
model_id
,
kv_cache_dtype
=
"fp8"
)
as
llm
:
...
...
@@ -88,8 +100,13 @@ def test_kv_cache_model_load_and_run(vllm_runner, model_id: str, monkeypatch):
reason
=
"FP8 is not supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"kv_cache_dtype"
,
[
"auto"
,
"fp8"
])
@
pytest
.
mark
.
parametrize
(
"force_marlin"
,
[
False
,
True
])
@
pytest
.
mark
.
parametrize
(
"use_rocm_aiter"
,
[
True
,
False
]
if
current_platform
.
is_rocm
()
else
[
False
])
def
test_load_fp16_model
(
vllm_runner
,
kv_cache_dtype
:
str
,
force_marlin
:
bool
,
monkeypatch
)
->
None
:
use_rocm_aiter
:
bool
,
monkeypatch
)
->
None
:
if
use_rocm_aiter
:
monkeypatch
.
setenv
(
"VLLM_ROCM_USE_AITER"
,
"1"
)
# vllm_runner.apply_model() relies on V0 internals.
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"0"
)
...
...
tests/
entrypoints/openai/
reasoning
_parsers
/__init__.py
→
tests/reasoning/__init__.py
View file @
fcfc474d
File moved
tests/
entrypoints/openai/
reasoning
_parsers
/test_deepseekr1_reasoning_parser.py
→
tests/reasoning/test_deepseekr1_reasoning_parser.py
View file @
fcfc474d
...
...
@@ -3,74 +3,126 @@
import
pytest
from
transformers
import
AutoTokenizer
from
tests.entrypoints.openai.reasoning_parsers.utils
import
(
run_reasoning_extraction
)
from
vllm.entrypoints.openai.reasoning_parsers
import
(
ReasoningParser
,
ReasoningParserManager
)
from
tests.reasoning.utils
import
run_reasoning_extraction
from
vllm.reasoning
import
ReasoningParser
,
ReasoningParserManager
parser_name
=
"deepseek_r1"
start_token
=
"<think>"
end_token
=
"</think>"
REASONING_MODEL_NAME
=
"deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
@
pytest
.
fixture
(
scope
=
"module"
)
def
deepseek_r1_qwen_tokenizer
():
return
AutoTokenizer
.
from_pretrained
(
REASONING_MODEL_NAME
)
SIMPLE_REASONING
=
{
"output"
:
"This is a reasoning section</think>This is the rest"
,
"reasoning_content"
:
"This is a reasoning section"
,
"content"
:
"This is the rest"
,
"is_reasoning_end"
:
True
,
}
COMPLETE_REASONING
=
{
"output"
:
"This is a reasoning section</think>"
,
"reasoning_content"
:
"This is a reasoning section"
,
"content"
:
None
,
"is_reasoning_end"
:
True
,
}
NO_CONTENT
=
{
"output"
:
"This is content"
,
"reasoning_content"
:
"This is content"
,
"content"
:
None
,
"is_reasoning_end"
:
False
,
}
NO_REASONING_STREAMING
=
{
"output"
:
"This is a reasoning section"
,
"reasoning_content"
:
"This is a reasoning section"
,
"content"
:
None
,
"is_reasoning_end"
:
False
,
}
MULTIPLE_LINES
=
{
"output"
:
"This
\n
That</think>This is the rest
\n
That"
,
"reasoning_content"
:
"This
\n
That"
,
"content"
:
"This is the rest
\n
That"
,
"is_reasoning_end"
:
True
,
}
SHORTEST_REASONING_NO_STREAMING
=
{
"output"
:
"</think>This is the rest"
,
"reasoning_content"
:
""
,
"content"
:
"This is the rest"
,
"is_reasoning_end"
:
True
,
}
SHORTEST_REASONING
=
{
"output"
:
"</think>This is the rest"
,
"reasoning_content"
:
None
,
"content"
:
"This is the rest"
,
"is_reasoning_end"
:
True
,
}
REASONING_WITH_THINK
=
{
"output"
:
"<think>This is a reasoning section</think>This is the rest"
,
"reasoning_content"
:
"This is a reasoning section"
,
"content"
:
"This is the rest"
,
"is_reasoning_end"
:
True
,
}
COMPLETE_REASONING_WITH_THINK
=
{
"output"
:
"<think>This is a reasoning section</think>"
,
"reasoning_content"
:
"This is a reasoning section"
,
"content"
:
None
,
"is_reasoning_end"
:
True
,
}
MULTIPLE_LINES_WITH_THINK
=
{
"output"
:
"<think>This
\n
That</think>This is the rest
\n
That"
,
"reasoning_content"
:
"This
\n
That"
,
"content"
:
"This is the rest
\n
That"
,
"is_reasoning_end"
:
True
,
}
SHORTEST_REASONING_NO_STREAMING_WITH_THINK
=
{
"output"
:
"</think>This is the rest"
,
"reasoning_content"
:
""
,
"content"
:
"This is the rest"
,
"is_reasoning_end"
:
True
,
}
SHORTEST_REASONING_WITH_THINK
=
{
"output"
:
"</think>This is the rest"
,
"reasoning_content"
:
None
,
"content"
:
"This is the rest"
,
"is_reasoning_end"
:
True
,
}
THINK_NO_END
=
{
"output"
:
"<think>This is a reasoning section"
,
"reasoning_content"
:
"This is a reasoning section"
,
"content"
:
None
,
"is_reasoning_end"
:
False
,
}
EMPTY
=
{
"output"
:
""
,
"reasoning_content"
:
""
,
"content"
:
None
,
"is_reasoning_end"
:
False
,
}
EMPTY_STREAMING
=
{
"output"
:
""
,
"reasoning_content"
:
None
,
"content"
:
None
,
"is_reasoning_end"
:
False
,
}
NEW_LINE
=
{
"output"
:
"
\n
<think>This is a reasoning section</think>
\n
This is the rest"
,
"reasoning_content"
:
"This is a reasoning section"
,
"content"
:
"
\n
This is the rest"
,
"is_reasoning_end"
:
True
,
}
# Streaming cannot handle new lines at the beginning of the output
# because we need to support <think>...</think> and </think>...
# We cannot know if the text before <think> is reasoning content
# or not.
NEW_LINE_STREAMING
=
{
"output"
:
"
\n
<think>This is a reasoning section</think>
\n
This is the rest"
,
"reasoning_content"
:
"
\n
This is a reasoning section"
,
"content"
:
"
\n
This is the rest"
,
"is_reasoning_end"
:
True
,
}
TEST_CASES
=
[
...
...
@@ -164,25 +216,53 @@ TEST_CASES = [
SHORTEST_REASONING_WITH_THINK
,
id
=
"shortest_with_think_streaming"
,
),
pytest
.
param
(
False
,
THINK_NO_END
,
id
=
"think_no_end"
,
),
pytest
.
param
(
True
,
THINK_NO_END
,
id
=
"think_no_end_streaming"
,
),
pytest
.
param
(
False
,
EMPTY
,
id
=
"empty"
,
),
pytest
.
param
(
True
,
EMPTY_STREAMING
,
id
=
"empty_streaming"
,
),
pytest
.
param
(
False
,
NEW_LINE
,
id
=
"new_line"
,
),
pytest
.
param
(
True
,
NEW_LINE_STREAMING
,
id
=
"new_line_streaming"
,
),
]
# Global tokenizer initialization to avoid repeated loading
tokenizer
=
AutoTokenizer
.
from_pretrained
(
"facebook/opt-125m"
)
tokenizer
.
add_tokens
([
start_token
,
end_token
])
@
pytest
.
mark
.
parametrize
(
"streaming, param_dict"
,
TEST_CASES
)
def
test_reasoning
(
streaming
:
bool
,
param_dict
:
dict
,
deepseek_r1_qwen_tokenizer
,
):
output
=
tokenizer
.
tokenize
(
param_dict
[
"output"
])
output
=
deepseek_r1_qwen_
tokenizer
.
tokenize
(
param_dict
[
"output"
])
# decode everything to tokens
output_tokens
:
list
[
str
]
=
[
tokenizer
.
convert_tokens_to_string
([
token
])
for
token
in
output
deepseek_r1_qwen_tokenizer
.
convert_tokens_to_string
([
token
])
for
token
in
output
]
parser
:
ReasoningParser
=
ReasoningParserManager
.
get_reasoning_parser
(
parser_name
)(
tokenizer
)
parser_name
)(
deepseek_r1_qwen_
tokenizer
)
reasoning
,
content
=
run_reasoning_extraction
(
parser
,
output_tokens
,
...
...
@@ -190,3 +270,17 @@ def test_reasoning(
assert
reasoning
==
param_dict
[
"reasoning_content"
]
assert
content
==
param_dict
[
"content"
]
# Test is_reasoning_end
output_ids
=
deepseek_r1_qwen_tokenizer
.
convert_tokens_to_ids
(
output
)
is_reasoning_end
=
parser
.
is_reasoning_end
(
output_ids
)
assert
is_reasoning_end
==
param_dict
[
"is_reasoning_end"
]
# Test extract_content
if
param_dict
[
"content"
]
is
not
None
:
content
=
parser
.
extract_content_ids
(
output_ids
)
assert
content
==
deepseek_r1_qwen_tokenizer
.
convert_tokens_to_ids
(
deepseek_r1_qwen_tokenizer
.
tokenize
(
param_dict
[
"content"
]))
else
:
content
=
parser
.
extract_content_ids
(
output
)
assert
content
==
[]
tests/reasoning/test_granite_reasoning_parser.py
0 → 100644
View file @
fcfc474d
# SPDX-License-Identifier: Apache-2.0
import
pytest
from
transformers
import
AutoTokenizer
from
tests.reasoning.utils
import
DeltaMessage
,
run_reasoning_extraction
from
vllm.reasoning
import
ReasoningParser
,
ReasoningParserManager
parser_name
=
"granite"
START_REASONING
=
"Here is my thought process:"
START_RESPONSE
=
"Here is my response:"
SIMPLE_REASONING
=
{
"output"
:
f
"
{
START_REASONING
}
This is a reasoning section
{
START_RESPONSE
}
This is the rest"
,
#noqa: E501
"reasoning_content"
:
"This is a reasoning section"
,
"content"
:
"This is the rest"
,
}
COMPLETE_REASONING
=
{
"output"
:
f
"
{
START_REASONING
}
This is a reasoning section
{
START_RESPONSE
}
"
,
"reasoning_content"
:
"This is a reasoning section"
,
"content"
:
None
,
}
NO_REASONING
=
{
"output"
:
"This is content"
,
"reasoning_content"
:
None
,
"content"
:
"This is content"
,
}
MULTIPLE_LINES
=
{
"output"
:
f
"
{
START_REASONING
}
This
\n
That
{
START_RESPONSE
}
This is the rest
\n
That"
,
"reasoning_content"
:
"This
\n
That"
,
"content"
:
"This is the rest
\n
That"
,
}
REASONING_WITH_THINK
=
{
"output"
:
f
"
{
START_REASONING
}
This is a reasoning section
{
START_RESPONSE
}
This is the rest"
,
#noqa: E501
"reasoning_content"
:
"This is a reasoning section"
,
"content"
:
"This is the rest"
,
}
COMPLETE_REASONING_WITH_THINK
=
{
"output"
:
f
"
{
START_REASONING
}
This is a reasoning section
{
START_RESPONSE
}
"
,
"reasoning_content"
:
"This is a reasoning section"
,
"content"
:
None
,
}
MULTIPLE_LINES_WITH_THINK
=
{
"output"
:
f
"
{
START_REASONING
}
This
\n
That
{
START_RESPONSE
}
This is the rest
\n
That"
,
"reasoning_content"
:
"This
\n
That"
,
"content"
:
"This is the rest
\n
That"
,
}
# Every scenario is exercised twice: first through the non-streaming path,
# then through the streaming path (ids gain a "_streaming" suffix). The
# generated list keeps the original ordering: all non-streaming cases first.
TEST_CASES = [
    pytest.param(streaming, case, id=label + suffix)
    for streaming, suffix in ((False, ""), (True, "_streaming"))
    for label, case in (
        ("simple_reasoning", SIMPLE_REASONING),
        ("complete_reasoning", COMPLETE_REASONING),
        ("no_reasoning", NO_REASONING),
        ("multiple_lines", MULTIPLE_LINES),
        ("reasoning_with_think", REASONING_WITH_THINK),
        ("complete_reasoning_with_think", COMPLETE_REASONING_WITH_THINK),
        ("multiple_lines_with_think", MULTIPLE_LINES_WITH_THINK),
    )
]
# Global tokenizer initialization to avoid repeated loading
tokenizer
=
AutoTokenizer
.
from_pretrained
(
"facebook/opt-125m"
)
@pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
def test_reasoning(
    streaming: bool,
    param_dict: dict,
):
    """Run the granite reasoning parser over one scenario and check the split.

    Args:
        streaming: Forwarded to ``run_reasoning_extraction`` to choose the
            streaming or non-streaming extraction path.
        param_dict: Scenario with the raw ``output`` text plus the expected
            ``reasoning_content`` and ``content`` values.
    """
    raw_tokens = tokenizer.tokenize(param_dict["output"])

    # Convert each token back to text on its own, so the parser is fed the
    # output one token-sized string at a time.
    output_tokens: list[str] = []
    for raw_token in raw_tokens:
        output_tokens.append(tokenizer.convert_tokens_to_string([raw_token]))

    parser_cls = ReasoningParserManager.get_reasoning_parser(parser_name)
    parser: ReasoningParser = parser_cls(tokenizer)

    reasoning, content = run_reasoning_extraction(
        parser,
        output_tokens,
        streaming=streaming,
    )

    assert reasoning == param_dict["reasoning_content"]
    assert content == param_dict["content"]
# Additional tests for verifying the correctness of granite streaming; this
# is complicated because granite uses multiple tokens to indicate when thinking
# is starting / when it's starting its response, so skipping special tokens
# is awkward.
### Handling the start of reasoning
STREAMING_1
=
{
"previous_text"
:
None
,
"current_text"
:
"Here"
,
"delta_text"
:
"Here"
,
"reasoning_content"
:
None
,
"content"
:
None
,
}
# When we fail, we should give what was previously being silenced first
STREAMING_2
=
{
"previous_text"
:
"Here is my thought"
,
"current_text"
:
"Here is my thought failure"
,
"delta_text"
:
" failure"
,
"reasoning_content"
:
None
,
"content"
:
"Here is my thought failure"
,
}
# But then after the first one, we should only add the delta text to content
STREAMING_3
=
{
"previous_text"
:
"Here wrong"
,
"current_text"
:
" words"
,
"delta_text"
:
" Here wrong words"
,
"reasoning_content"
:
None
,
"content"
:
" words"
,
}
# The start-of-reasoning sequence has just completed; nothing to emit yet
STREAMING_4
=
{
"previous_text"
:
"Here is my thought"
,
"current_text"
:
"Here is my thought process:"
,
"delta_text"
:
" process:"
,
"reasoning_content"
:
None
,
"content"
:
None
,
}
# Reasoning started successfully; parse reasoning content
STREAMING_5
=
{
"previous_text"
:
"Here is my thought process:"
,
"current_text"
:
"Here is my thought process: foo"
,
"delta_text"
:
" foo"
,
"reasoning_content"
:
" foo"
,
"content"
:
None
,
}
# Response special sequence has started, but not finished.
STREAMING_6
=
{
"previous_text"
:
"Here is my thought process: foo"
,
"current_text"
:
"Here is my thought process: foo Here is"
,
"delta_text"
:
" Here is"
,
"reasoning_content"
:
" "
,
"content"
:
None
,
}
# Response special sequence started, but was broken; the reasoning
# content should be the content that was previously unused.
STREAMING_7
=
{
"previous_text"
:
"Here is my thought process: foo Here is"
,
"current_text"
:
"Here is my thought process: foo Here is Here"
,
"delta_text"
:
" Here"
,
"reasoning_content"
:
"Here is "
,
"content"
:
None
,
}
# Response special sequence is ongoing
STREAMING_8
=
{
"previous_text"
:
"Here is my thought process: foo Here is my response:"
,
"current_text"
:
"Here is my thought process: foo Here is my response: bar"
,
"delta_text"
:
" bar"
,
"reasoning_content"
:
None
,
"content"
:
" bar"
,
}
# The delta text has everything; we should be able to correctly parse both
STREAMING_9
=
{
"previous_text"
:
None
,
"current_text"
:
"Here is my thought process: foo Here is my response: bar"
,
"delta_text"
:
"Here is my thought process: foo Here is my response: bar"
,
"reasoning_content"
:
" foo "
,
"content"
:
" bar"
,
}
## The Response is ongoing, and the delta mixes reasoning content / content
STREAMING_10
=
{
"previous_text"
:
"Here is my thought process: foo"
,
"current_text"
:
"Here is my thought process: foo bar Here is my response: baz"
,
"delta_text"
:
" bar Here is my response: baz"
,
"reasoning_content"
:
" bar "
,
"content"
:
" baz"
,
}
# The delta text starts a new substring that might be a response special seq
STREAMING_11
=
{
"previous_text"
:
"Here is my thought process: This is a reasoning section "
,
"current_text"
:
"Here is my thought process: This is a reasoning section Here"
,
"delta_text"
:
"Here"
,
"reasoning_content"
:
None
,
"content"
:
None
,
}
# The delta text is finishing the response special seq
STREAMING_12
=
{
"previous_text"
:
"Here is my thought process: foo Here is my response"
,
"current_text"
:
"Here is my thought process: foo Here is my response:"
,
"delta_text"
:
":"
,
"reasoning_content"
:
None
,
"content"
:
None
,
}
STREAMING_13
=
{
"previous_text"
:
"Here is my thought process: foo Here"
,
"current_text"
:
"Here is my thought process: foo Here was"
,
"delta_text"
:
" was"
,
"reasoning_content"
:
"Here was"
,
"content"
:
None
,
}
# Single-step streaming scenarios paired with the human-readable id pytest
# reports for each one. Order matches the STREAMING_<n> numbering.
STREAMING_SUBCASES = [
    pytest.param(case, id=description)
    for case, description in (
        (STREAMING_1, "Starting reasoning special sequence"),
        (STREAMING_2, "Unexpected start reasoning sequence"),
        (STREAMING_3, "Continuing unexpected start reasoning sequence"),
        (STREAMING_4, "Only start reasoning sequence and nothing else"),
        (STREAMING_5, "Reasoning content has started"),
        (STREAMING_6, "Response special sequence has started"),
        (STREAMING_7, "Response special sequence reset"),
        (STREAMING_8, "Response text has started"),
        (STREAMING_9, "Delta contains everything"),
        (STREAMING_10, "Delta contains some reasoning and response"),
        (STREAMING_11, "Delta starts response sequence"),
        (STREAMING_12, "Delta finishes response sequence"),
        # Fixed typo in this id ("responise" -> "response").
        (STREAMING_13, "Delta breaks potential response sequence"),
    )
]
@pytest.mark.parametrize("param_dict", STREAMING_SUBCASES)
def test_streaming_subcases(param_dict):
    """Feed one streaming step to the granite parser and check its delta.

    Args:
        param_dict: Scenario holding ``previous_text``, ``current_text`` and
            ``delta_text`` for the step, plus the expected
            ``reasoning_content`` and ``content`` of the returned message.
    """
    # Get all of the token IDs; a missing previous text means no prior tokens.
    previous_text = param_dict["previous_text"]
    if previous_text is None:
        previous_token_ids = []
    else:
        previous_token_ids = tokenizer.encode(previous_text)
    current_token_ids = tokenizer.encode(param_dict["current_text"])
    delta_token_ids = tokenizer.encode(param_dict["delta_text"])

    parser_cls = ReasoningParserManager.get_reasoning_parser(parser_name)
    parser: ReasoningParser = parser_cls(tokenizer)

    response = parser.extract_reasoning_content_streaming(
        previous_text=param_dict["previous_text"],
        current_text=param_dict["current_text"],
        delta_text=param_dict["delta_text"],
        previous_token_ids=previous_token_ids,
        current_token_ids=current_token_ids,
        delta_token_ids=delta_token_ids,
    )

    # Streaming currently expects at least one of reasoning content / content,
    # so the response should return None in that case.
    if (param_dict["reasoning_content"] is None
            and param_dict["content"] is None):
        assert response is None
        return

    assert isinstance(response, DeltaMessage)
    assert param_dict["reasoning_content"] == response.reasoning_content
    assert param_dict["content"] == response.content
tests/
entrypoints/openai/
reasoning
_parsers
/utils.py
→
tests/reasoning/utils.py
View file @
fcfc474d
...
...
@@ -4,7 +4,7 @@ from typing import Optional, Union
from
vllm.entrypoints.openai.protocol
import
(
ChatCompletionRequest
,
DeltaMessage
)
from
vllm.
entrypoints.openai.
reasoning
_parsers
import
ReasoningParser
from
vllm.reasoning
import
ReasoningParser
class
StreamingReasoningReconstructor
:
...
...
tests/spec_decode/e2e/test_integration_dist_tp2.py
View file @
fcfc474d
...
...
@@ -3,6 +3,7 @@
tensor parallelism.
"""
import
json
from
typing
import
Optional
import
pytest
...
...
@@ -30,14 +31,14 @@ from ...utils import models_path_prefix
@
pytest
.
mark
.
parametrize
(
"test_llm_kwargs"
,
[
[
"--speculative_config"
,
str
({
json
.
dumps
({
"model"
:
os
.
path
.
join
(
models_path_prefix
,
"JackFram/llama-68m"
),
"num_speculative_tokens"
:
3
,
}),
],
[
"--speculative_config"
,
str
({
json
.
dumps
({
"model"
:
"ngram"
,
"num_speculative_tokens"
:
5
,
"prompt_lookup_max"
:
3
,
...
...
@@ -90,7 +91,7 @@ def test_target_model_tp_gt_1(common_llm_kwargs, per_test_common_llm_kwargs,
"model, test_llm_kwargs"
,
[(
os
.
path
.
join
(
models_path_prefix
,
"JackFram/llama-68m"
),
[
"--speculative_config"
,
str
({
json
.
dumps
({
"model"
:
os
.
path
.
join
(
models_path_prefix
,
"JackFram/llama-68m"
),
"num_speculative_tokens"
:
5
,
"draft_tensor_parallel_size"
:
1
,
...
...
@@ -98,7 +99,7 @@ def test_target_model_tp_gt_1(common_llm_kwargs, per_test_common_llm_kwargs,
]),
(
os
.
path
.
join
(
models_path_prefix
,
"ibm-granite/granite-3b-code-instruct"
),
[
"--speculative_config"
,
str
({
json
.
dumps
({
"model"
:
os
.
path
.
join
(
models_path_prefix
,
"ibm-granite/granite-3b-code-instruct"
),
"num_speculative_tokens"
:
5
,
"draft_tensor_parallel_size"
:
1
,
...
...
@@ -149,20 +150,20 @@ def test_draft_model_tp_lt_target_model_tp2(model, common_llm_kwargs,
@
pytest
.
mark
.
parametrize
(
"model, test_llm_kwargs"
,
[(
"JackFram/llama-68m"
,
[
"--speculative_config"
,
str
({
json
.
dumps
({
"model"
:
"JackFram/llama-68m"
,
"num_speculative_tokens"
:
3
,
}),
]),
(
"JackFram/llama-68m"
,
[
"--speculative_config"
,
str
({
json
.
dumps
({
"model"
:
"JackFram/llama-68m"
,
"num_speculative_tokens"
:
3
,
"draft_tensor_parallel_size"
:
1
,
}),
])])
@
pytest
.
mark
.
parametrize
(
"logprobs"
,
[
None
,
2
])
@
pytest
.
mark
.
parametrize
(
"logprobs"
,
[
None
])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
2
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
def
test_spec_decode_chunked_prefill_tp2
(
model
,
common_llm_kwargs
,
...
...
@@ -173,9 +174,68 @@ def test_spec_decode_chunked_prefill_tp2(model, common_llm_kwargs,
"""Verify spec decode works well with same and different TP size for
the draft model with chunked prefill.
"""
if
logprobs
:
test_llm_kwargs
.
extend
(
[
"--disable_logprobs_during_spec_decoding"
,
"False"
])
run_equality_correctness_test_tp
(
model
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
batch_size
,
max_output_len
=
32
,
seed
=
seed
,
temperature
=
0.0
,
logprobs
=
logprobs
)
@
pytest
.
mark
.
skipif
(
torch
.
cuda
.
device_count
()
<
2
,
reason
=
"Need at least 2 GPUs to run the test."
)
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
[[
# Skip cuda graph recording for fast test.
"--enforce-eager"
,
"--tensor_parallel_size"
,
"2"
,
# precision
"--dtype"
,
"bfloat16"
,
]])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[[
"--enable-chunked-prefill"
,
"False"
],
[
"--enable-chunked-prefill"
,
"True"
,
"--max-num-batched-tokens"
,
"4"
,
"--max-num-seqs"
,
"4"
]])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[[]])
@
pytest
.
mark
.
parametrize
(
"model, test_llm_kwargs"
,
[(
"JackFram/llama-68m"
,
[
"--speculative_config"
,
json
.
dumps
({
"model"
:
"JackFram/llama-68m"
,
"num_speculative_tokens"
:
3
,
"disable_logprobs"
:
False
,
}),
]),
(
"JackFram/llama-68m"
,
[
"--speculative_config"
,
json
.
dumps
({
"model"
:
"JackFram/llama-68m"
,
"num_speculative_tokens"
:
3
,
"draft_tensor_parallel_size"
:
1
,
"disable_logprobs"
:
False
,
}),
])])
@
pytest
.
mark
.
parametrize
(
"logprobs"
,
[
2
])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
2
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
def
test_spec_decode_chunked_prefill_tp2_with_logprobs
(
model
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
baseline_llm_kwargs
,
test_llm_kwargs
,
logprobs
:
Optional
[
int
],
batch_size
:
int
,
seed
:
int
):
"""Verify spec decode works well with same and different TP size for
the draft model with chunked prefill.
"""
run_equality_correctness_test_tp
(
model
,
common_llm_kwargs
,
per_test_common_llm_kwargs
,
...
...
tests/spec_decode/e2e/test_integration_dist_tp4.py
View file @
fcfc474d
...
...
@@ -3,6 +3,8 @@
tensor parallelism.
"""
import
json
import
openai
import
pytest
import
torch
...
...
@@ -35,7 +37,7 @@ SPEC_MODEL = os.path.join(models_path_prefix, "JackFram/llama-68m")
#TODO(wooyeon): add spec_draft_dp=2 case
[
"--speculative_config"
,
str
({
json
.
dumps
({
"model"
:
f
"
{
SPEC_MODEL
}
"
,
"num_speculative_tokens"
:
5
,
"draft_tensor_parallel_size"
:
1
,
...
...
@@ -82,7 +84,7 @@ def test_draft_model_tp_lt_target_model_tp4(common_llm_kwargs,
# Artificially limit the draft model max model len; this forces vLLM
# to skip speculation once the sequences grow beyond 32-k tokens.
"--speculative_config"
,
str
({
json
.
dumps
({
"model"
:
f
"
{
SPEC_MODEL
}
"
,
"num_speculative_tokens"
:
5
,
"max_model_len"
:
32
,
...
...
tests/test_utils.py
View file @
fcfc474d
...
...
@@ -2,19 +2,22 @@
# ruff: noqa
import
asyncio
import
hashlib
import
pickle
import
socket
from
collections.abc
import
AsyncIterator
from
unittest.mock
import
patch
import
pytest
import
torch
from
vllm_test_utils
import
monitor
from
vllm_test_utils
.monitor
import
monitor
from
vllm.config
import
ParallelConfig
,
VllmConfig
,
set_current_vllm_config
from
vllm.utils
import
(
FlexibleArgumentParser
,
MemorySnapshot
,
PlaceholderModule
,
StoreBoolean
,
bind_kv_cache
,
deprecate_kwargs
,
get_open_port
,
memory_profiling
,
merge_async_iterators
,
supports_kw
,
swap_dict_values
)
merge_async_iterators
,
sha256
,
supports_kw
,
swap_dict_values
)
from
.utils
import
create_new_process_for_each_test
,
error_on_warning
from
.utils
import
models_path_prefix
...
...
@@ -141,7 +144,8 @@ def parser():
def
parser_with_config
():
parser
=
FlexibleArgumentParser
()
parser
.
add_argument
(
'serve'
)
parser
.
add_argument
(
'model_tag'
)
parser
.
add_argument
(
'model_tag'
,
nargs
=
'?'
)
parser
.
add_argument
(
'--model'
,
type
=
str
)
parser
.
add_argument
(
'--served-model-name'
,
type
=
str
)
parser
.
add_argument
(
'--config'
,
type
=
str
)
parser
.
add_argument
(
'--port'
,
type
=
int
)
...
...
@@ -198,29 +202,29 @@ def test_missing_required_argument(parser):
parser
.
parse_args
([])
def
test_cli_override_to_config
(
parser_with_config
):
def
test_cli_override_to_config
(
parser_with_config
,
cli_config_file
):
args
=
parser_with_config
.
parse_args
([
'serve'
,
'mymodel'
,
'--config'
,
'./data/test
_config
.yaml'
,
'serve'
,
'mymodel'
,
'--config'
,
cli
_config
_file
,
'--tensor-parallel-size'
,
'3'
])
assert
args
.
tensor_parallel_size
==
3
args
=
parser_with_config
.
parse_args
([
'serve'
,
'mymodel'
,
'--tensor-parallel-size'
,
'3'
,
'--config'
,
'./data/test
_config
.yaml'
cli
_config
_file
])
assert
args
.
tensor_parallel_size
==
3
assert
args
.
port
==
12312
args
=
parser_with_config
.
parse_args
([
'serve'
,
'mymodel'
,
'--tensor-parallel-size'
,
'3'
,
'--config'
,
'./data/test
_config
.yaml'
,
'--port'
,
'666'
cli
_config
_file
,
'--port'
,
'666'
])
assert
args
.
tensor_parallel_size
==
3
assert
args
.
port
==
666
def
test_config_args
(
parser_with_config
):
def
test_config_args
(
parser_with_config
,
cli_config_file
):
args
=
parser_with_config
.
parse_args
(
[
'serve'
,
'mymodel'
,
'--config'
,
'./data/test
_config
.yaml'
])
[
'serve'
,
'mymodel'
,
'--config'
,
cli
_config
_file
])
assert
args
.
tensor_parallel_size
==
2
assert
args
.
trust_remote_code
assert
not
args
.
multi_step_stream_outputs
...
...
@@ -242,10 +246,9 @@ def test_config_file(parser_with_config):
])
def
test_no_model_tag
(
parser_with_config
):
def
test_no_model_tag
(
parser_with_config
,
cli_config_file
):
with
pytest
.
raises
(
ValueError
):
parser_with_config
.
parse_args
(
[
'serve'
,
'--config'
,
'./data/test_config.yaml'
])
parser_with_config
.
parse_args
([
'serve'
,
'--config'
,
cli_config_file
])
# yapf: enable
...
...
@@ -478,3 +481,63 @@ def test_swap_dict_values(obj, key1, key2):
assert
obj
[
key1
]
==
original_obj
[
key2
]
else
:
assert
key1
not
in
obj
def test_model_specification(parser_with_config, cli_config_file,
                             cli_config_file_with_model):
    """Check how the model is resolved from the CLI and from a config file.

    Covers, in order: a positional model alongside a config that also names
    one, a model supplied only by the config file, no model anywhere, the
    rejected ``--model`` option, and preservation of the other config values.
    """
    parse = parser_with_config.parse_args

    # Test model in CLI takes precedence over config
    cli_and_config = parse(
        ['serve', 'cli-model', '--config', cli_config_file_with_model])
    assert cli_and_config.model_tag == 'cli-model'
    assert cli_and_config.served_model_name == 'mymodel'

    # Test model from config file works
    config_only = parse([
        'serve',
        '--config',
        cli_config_file_with_model,
    ])
    assert config_only.model == 'config-model'
    assert config_only.served_model_name == 'mymodel'

    # Test no model specified anywhere raises error
    with pytest.raises(ValueError, match="No model specified!"):
        parse(['serve', '--config', cli_config_file])

    # Test using --model option raises error
    model_option_message = (
        "With `vllm serve`, you should provide the model as a positional "
        "argument or in a config file instead of via the `--model` option.")
    with pytest.raises(ValueError, match=model_option_message):
        parse(['serve', '--model', 'my-model'])

    # Test other config values are preserved
    preserved = parse([
        'serve',
        'cli-model',
        '--config',
        cli_config_file_with_model,
    ])
    assert preserved.tensor_parallel_size == 2
    assert preserved.trust_remote_code is True
    assert preserved.multi_step_stream_outputs is False
    assert preserved.port == 12312
@pytest.mark.parametrize("input", [(), ("abc", ), (None, ),
                                   (None, bool, [1, 2, 3])])
@pytest.mark.parametrize("output", [0, 1, 2])
def test_sha256(input: tuple, output: int):
    """Check ``sha256`` against a reference built from pickle and hashlib.

    Args:
        input: Tuple to hash.
        output: Not read by the body; its parametrization repeats every
            ``input`` case three times.
    """
    digest_value = sha256(input)
    assert digest_value is not None
    assert isinstance(digest_value, int)
    assert digest_value != 0

    # Reference value: SHA-256 of the pickled tuple, read as a big-endian int.
    pickled = pickle.dumps(input, protocol=pickle.HIGHEST_PROTOCOL)
    reference = int.from_bytes(hashlib.sha256(pickled).digest(),
                               byteorder="big")
    assert digest_value == reference

    # hashing again, returns the same value
    assert digest_value == sha256(input)

    # hashing different input, returns different value
    extended_input = input + (1, )
    assert digest_value != sha256(extended_input)
tests/tool_use/test_chat_completion_request_validations.py
View file @
fcfc474d
...
...
@@ -45,7 +45,8 @@ def test_chat_completion_request_with_no_tools():
assert
request
.
tool_choice
==
'none'
def
test_chat_completion_request_with_tool_choice_but_no_tools
():
@
pytest
.
mark
.
parametrize
(
'tool_choice'
,
[
'auto'
,
'required'
])
def
test_chat_completion_request_with_tool_choice_but_no_tools
(
tool_choice
):
with
pytest
.
raises
(
ValueError
,
match
=
"When using `tool_choice`, `tools` must be set."
):
ChatCompletionRequest
.
model_validate
({
...
...
@@ -56,7 +57,7 @@ def test_chat_completion_request_with_tool_choice_but_no_tools():
'model'
:
os
.
path
.
join
(
models_path_prefix
,
'facebook/opt-125m'
),
'tool_choice'
:
'auto'
tool_choice
})
with
pytest
.
raises
(
ValueError
,
...
...
@@ -69,7 +70,7 @@ def test_chat_completion_request_with_tool_choice_but_no_tools():
'model'
:
os
.
path
.
join
(
models_path_prefix
,
'facebook/opt-125m'
),
'tool_choice'
:
'auto'
,
tool_choice
,
'tools'
:
None
})
Prev
1
…
8
9
10
11
12
13
14
15
16
…
26
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment