Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
ec5e299c
Commit
ec5e299c
authored
Feb 21, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.7.3' into v0.7.3-dev
parents
47bd229c
ed6e9075
Changes
521
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
446 additions
and
142 deletions
+446
-142
tests/models/multimodal/processing/test_qwen2_vl.py
tests/models/multimodal/processing/test_qwen2_vl.py
+8
-8
tests/models/registry.py
tests/models/registry.py
+37
-18
tests/models/test_initialization.py
tests/models/test_initialization.py
+6
-3
tests/models/test_transformers.py
tests/models/test_transformers.py
+49
-5
tests/models/test_vision.py
tests/models/test_vision.py
+34
-0
tests/models/utils.py
tests/models/utils.py
+11
-7
tests/mq_llm_engine/test_abort.py
tests/mq_llm_engine/test_abort.py
+2
-2
tests/mq_llm_engine/test_error_handling.py
tests/mq_llm_engine/test_error_handling.py
+4
-2
tests/mq_llm_engine/test_load.py
tests/mq_llm_engine/test_load.py
+4
-2
tests/multimodal/test_processing.py
tests/multimodal/test_processing.py
+10
-8
tests/multimodal/utils.py
tests/multimodal/utils.py
+0
-3
tests/neuron/test_prefix_prefill.py
tests/neuron/test_prefix_prefill.py
+67
-51
tests/plugins_tests/test_platform_plugins.py
tests/plugins_tests/test_platform_plugins.py
+1
-1
tests/plugins_tests/test_scheduler_plugins.py
tests/plugins_tests/test_scheduler_plugins.py
+33
-0
tests/quantization/test_compressed_tensors.py
tests/quantization/test_compressed_tensors.py
+0
-1
tests/quantization/test_cpu_offload.py
tests/quantization/test_cpu_offload.py
+7
-7
tests/quantization/test_fp8.py
tests/quantization/test_fp8.py
+38
-11
tests/quantization/test_gptq_dynamic.py
tests/quantization/test_gptq_dynamic.py
+68
-0
tests/quantization/test_lm_head.py
tests/quantization/test_lm_head.py
+12
-13
tests/quantization/test_ptpc_fp8.py
tests/quantization/test_ptpc_fp8.py
+55
-0
No files found.
Too many changes to show.
To preserve performance only
521 of 521+
files are displayed.
Plain diff
Email patch
tests/models/multimodal/processing/test_qwen2_vl.py
View file @
ec5e299c
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
import
pytest
import
pytest
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.
multimodal.utils
import
cached_
get_
tokenizer
from
vllm.
transformers_utils.tokenizer
import
cached_tokenizer
_from_config
from
....conftest
import
_ImageAssets
from
....conftest
import
_ImageAssets
from
...utils
import
build_model_context
from
...utils
import
build_model_context
...
@@ -18,6 +18,7 @@ from ...utils import build_model_context
...
@@ -18,6 +18,7 @@ from ...utils import build_model_context
])
])
# yapf: enable
# yapf: enable
@
pytest
.
mark
.
parametrize
(
"num_imgs"
,
[
1
,
2
])
@
pytest
.
mark
.
parametrize
(
"num_imgs"
,
[
1
,
2
])
@
pytest
.
mark
.
parametrize
(
"kwargs_on_init"
,
[
True
,
False
])
def
test_processor_override
(
def
test_processor_override
(
image_assets
:
_ImageAssets
,
image_assets
:
_ImageAssets
,
model_id
:
str
,
model_id
:
str
,
...
@@ -25,31 +26,30 @@ def test_processor_override(
...
@@ -25,31 +26,30 @@ def test_processor_override(
expected_toks_per_img
:
int
,
expected_toks_per_img
:
int
,
expected_pixels_shape
:
tuple
[
int
,
int
],
expected_pixels_shape
:
tuple
[
int
,
int
],
num_imgs
:
int
,
num_imgs
:
int
,
kwargs_on_init
:
bool
,
):
):
"""Ensure Qwen2VLMultiModalProcessor handles min/max pixels properly."""
"""Ensure Qwen2VLMultiModalProcessor handles min/max pixels properly."""
ctx
=
build_model_context
(
ctx
=
build_model_context
(
model_name
=
model_id
,
model_name
=
model_id
,
tokenizer_name
=
model_id
,
tokenizer_name
=
model_id
,
mm_processor_kwargs
=
None
,
mm_processor_kwargs
=
mm_processor_kwargs
if
kwargs_on_init
else
None
,
limit_mm_per_prompt
=
{
"image"
:
num_imgs
},
limit_mm_per_prompt
=
{
"image"
:
num_imgs
},
)
)
tokenizer
=
cached_get_tokenizer
(
tokenizer
=
cached_tokenizer_from_config
(
ctx
.
model_config
)
ctx
.
model_config
.
tokenizer
,
trust_remote_code
=
ctx
.
model_config
.
trust_remote_code
,
)
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
ctx
.
model_config
,
ctx
.
model_config
,
tokenizer
=
tokenizer
,
tokenizer
=
tokenizer
,
)
)
hf_processor_mm_kwargs
=
{}
if
kwargs_on_init
else
mm_processor_kwargs
# Build the image str / prompt based on the number of images we pass
# Build the image str / prompt based on the number of images we pass
prompt
=
"<|vision_start|><|image_pad|><|vision_end|>"
*
num_imgs
prompt
=
"<|vision_start|><|image_pad|><|vision_end|>"
*
num_imgs
mm_data
=
{
"image"
:
[
image_assets
[
0
].
pil_image
]
*
num_imgs
}
mm_data
=
{
"image"
:
[
image_assets
[
0
].
pil_image
]
*
num_imgs
}
processed_inputs
=
processor
.
apply
(
prompt
,
mm_data
,
mm
_processor_kwargs
)
processed_inputs
=
processor
.
apply
(
prompt
,
mm_data
,
hf
_processor_
mm_
kwargs
)
# Ensure we have the right number of placeholders per num_crops size
# Ensure we have the right number of placeholders per num_crops size
hf_processor
=
processor
.
info
.
get_hf_processor
(
**
mm
_processor_kwargs
)
hf_processor
=
processor
.
info
.
get_hf_processor
(
**
hf
_processor_
mm_
kwargs
)
image_token_id
=
tokenizer
.
convert_tokens_to_ids
(
hf_processor
.
image_token
)
image_token_id
=
tokenizer
.
convert_tokens_to_ids
(
hf_processor
.
image_token
)
img_tok_count
=
processed_inputs
[
"prompt_token_ids"
].
count
(
image_token_id
)
img_tok_count
=
processed_inputs
[
"prompt_token_ids"
].
count
(
image_token_id
)
pixel_shape
=
processed_inputs
[
"mm_kwargs"
][
"pixel_values"
].
shape
pixel_shape
=
processed_inputs
[
"mm_kwargs"
][
"pixel_values"
].
shape
...
...
tests/models/registry.py
View file @
ec5e299c
...
@@ -102,8 +102,10 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
...
@@ -102,8 +102,10 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
trust_remote_code
=
True
),
trust_remote_code
=
True
),
"BaichuanForCausalLM"
:
_HfExamplesInfo
(
"baichuan-inc/Baichuan2-7B-chat"
,
"BaichuanForCausalLM"
:
_HfExamplesInfo
(
"baichuan-inc/Baichuan2-7B-chat"
,
trust_remote_code
=
True
),
trust_remote_code
=
True
),
"BambaForCausalLM"
:
_HfExamplesInfo
(
"ibm-ai-platform/Bamba-9B"
),
"BloomForCausalLM"
:
_HfExamplesInfo
(
"bigscience/bloomz-1b1"
),
"BloomForCausalLM"
:
_HfExamplesInfo
(
"bigscience/bloomz-1b1"
),
# ChatGLMModel supports multimodal
"ChatGLMModel"
:
_HfExamplesInfo
(
"THUDM/chatglm3-6b"
,
trust_remote_code
=
True
),
"CohereForCausalLM"
:
_HfExamplesInfo
(
"CohereForAI/c4ai-command-r-v01"
,
"CohereForCausalLM"
:
_HfExamplesInfo
(
"CohereForAI/c4ai-command-r-v01"
,
trust_remote_code
=
True
),
trust_remote_code
=
True
),
"Cohere2ForCausalLM"
:
_HfExamplesInfo
(
"CohereForAI/c4ai-command-r7b-12-2024"
,
# noqa: E501
"Cohere2ForCausalLM"
:
_HfExamplesInfo
(
"CohereForAI/c4ai-command-r7b-12-2024"
,
# noqa: E501
...
@@ -137,11 +139,14 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
...
@@ -137,11 +139,14 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"InternLM3ForCausalLM"
:
_HfExamplesInfo
(
"internlm/internlm3-8b-instruct"
,
"InternLM3ForCausalLM"
:
_HfExamplesInfo
(
"internlm/internlm3-8b-instruct"
,
trust_remote_code
=
True
),
trust_remote_code
=
True
),
"JAISLMHeadModel"
:
_HfExamplesInfo
(
"inceptionai/jais-13b-chat"
),
"JAISLMHeadModel"
:
_HfExamplesInfo
(
"inceptionai/jais-13b-chat"
),
"JambaForCausalLM"
:
_HfExamplesInfo
(
"ai21labs/AI21-Jamba-1.5-Mini"
),
"JambaForCausalLM"
:
_HfExamplesInfo
(
"ai21labs/AI21-Jamba-1.5-Mini"
,
"LlamaForCausalLM"
:
_HfExamplesInfo
(
"meta-llama/Meta-Llama-3-8B"
),
extras
=
{
"tiny"
:
"ai21labs/Jamba-tiny-dev"
}),
# noqa: E501
"LlamaForCausalLM"
:
_HfExamplesInfo
(
"meta-llama/Llama-3.2-1B-Instruct"
),
"LLaMAForCausalLM"
:
_HfExamplesInfo
(
"decapoda-research/llama-7b-hf"
,
"LLaMAForCausalLM"
:
_HfExamplesInfo
(
"decapoda-research/llama-7b-hf"
,
is_available_online
=
False
),
is_available_online
=
False
),
"MambaForCausalLM"
:
_HfExamplesInfo
(
"state-spaces/mamba-130m-hf"
),
"MambaForCausalLM"
:
_HfExamplesInfo
(
"state-spaces/mamba-130m-hf"
),
"Mamba2ForCausalLM"
:
_HfExamplesInfo
(
"mistralai/Mamba-Codestral-7B-v0.1"
,
is_available_online
=
False
),
"FalconMambaForCausalLM"
:
_HfExamplesInfo
(
"tiiuae/falcon-mamba-7b-instruct"
),
# noqa: E501
"FalconMambaForCausalLM"
:
_HfExamplesInfo
(
"tiiuae/falcon-mamba-7b-instruct"
),
# noqa: E501
"MiniCPMForCausalLM"
:
_HfExamplesInfo
(
"openbmb/MiniCPM-2B-sft-bf16"
,
"MiniCPMForCausalLM"
:
_HfExamplesInfo
(
"openbmb/MiniCPM-2B-sft-bf16"
,
trust_remote_code
=
True
),
trust_remote_code
=
True
),
...
@@ -166,8 +171,10 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
...
@@ -166,8 +171,10 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
trust_remote_code
=
True
),
trust_remote_code
=
True
),
"PhiMoEForCausalLM"
:
_HfExamplesInfo
(
"microsoft/Phi-3.5-MoE-instruct"
,
"PhiMoEForCausalLM"
:
_HfExamplesInfo
(
"microsoft/Phi-3.5-MoE-instruct"
,
trust_remote_code
=
True
),
trust_remote_code
=
True
),
# QWenLMHeadModel supports multimodal
"QWenLMHeadModel"
:
_HfExamplesInfo
(
"Qwen/Qwen-7B-Chat"
,
"Qwen2ForCausalLM"
:
_HfExamplesInfo
(
"Qwen/Qwen2-7B-Instruct"
),
trust_remote_code
=
True
),
"Qwen2ForCausalLM"
:
_HfExamplesInfo
(
"Qwen/Qwen2-7B-Instruct"
,
extras
=
{
"2.5"
:
"Qwen/Qwen2.5-7B-Instruct"
}),
# noqa: E501
"Qwen2MoeForCausalLM"
:
_HfExamplesInfo
(
"Qwen/Qwen1.5-MoE-A2.7B-Chat"
),
"Qwen2MoeForCausalLM"
:
_HfExamplesInfo
(
"Qwen/Qwen1.5-MoE-A2.7B-Chat"
),
"RWForCausalLM"
:
_HfExamplesInfo
(
"tiiuae/falcon-40b"
,
"RWForCausalLM"
:
_HfExamplesInfo
(
"tiiuae/falcon-40b"
,
is_available_online
=
False
),
is_available_online
=
False
),
...
@@ -213,6 +220,10 @@ _EMBEDDING_EXAMPLE_MODELS = {
...
@@ -213,6 +220,10 @@ _EMBEDDING_EXAMPLE_MODELS = {
"Phi3VForCausalLM"
:
_HfExamplesInfo
(
"TIGER-Lab/VLM2Vec-Full"
,
"Phi3VForCausalLM"
:
_HfExamplesInfo
(
"TIGER-Lab/VLM2Vec-Full"
,
trust_remote_code
=
True
),
trust_remote_code
=
True
),
"Qwen2VLForConditionalGeneration"
:
_HfExamplesInfo
(
"MrLight/dse-qwen2-2b-mrl-v1"
),
# noqa: E501
"Qwen2VLForConditionalGeneration"
:
_HfExamplesInfo
(
"MrLight/dse-qwen2-2b-mrl-v1"
),
# noqa: E501
# The model on Huggingface is currently being updated,
# hence I temporarily mark it as not available online
"PrithviGeoSpatialMAE"
:
_HfExamplesInfo
(
"ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"
,
# noqa: E501
is_available_online
=
False
),
}
}
_CROSS_ENCODER_EXAMPLE_MODELS
=
{
_CROSS_ENCODER_EXAMPLE_MODELS
=
{
...
@@ -227,18 +238,19 @@ _MULTIMODAL_EXAMPLE_MODELS = {
...
@@ -227,18 +238,19 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"AriaForConditionalGeneration"
:
_HfExamplesInfo
(
"rhymes-ai/Aria"
),
"AriaForConditionalGeneration"
:
_HfExamplesInfo
(
"rhymes-ai/Aria"
),
"Blip2ForConditionalGeneration"
:
_HfExamplesInfo
(
"Salesforce/blip2-opt-2.7b"
),
# noqa: E501
"Blip2ForConditionalGeneration"
:
_HfExamplesInfo
(
"Salesforce/blip2-opt-2.7b"
),
# noqa: E501
"ChameleonForConditionalGeneration"
:
_HfExamplesInfo
(
"facebook/chameleon-7b"
),
# noqa: E501
"ChameleonForConditionalGeneration"
:
_HfExamplesInfo
(
"facebook/chameleon-7b"
),
# noqa: E501
"ChatGLMModel"
:
_HfExamplesInfo
(
"THUDM/glm-4v-9b"
,
extras
=
{
"text_only"
:
"THUDM/chatglm3-6b"
},
trust_remote_code
=
True
),
"ChatGLMForConditionalGeneration"
:
_HfExamplesInfo
(
"chatglm2-6b"
,
is_available_online
=
False
),
"DeepseekVLV2ForCausalLM"
:
_HfExamplesInfo
(
"deepseek-ai/deepseek-vl2-tiny"
,
# noqa: E501
"DeepseekVLV2ForCausalLM"
:
_HfExamplesInfo
(
"deepseek-ai/deepseek-vl2-tiny"
,
# noqa: E501
hf_overrides
=
{
"architectures"
:
[
"DeepseekVLV2ForCausalLM"
]}),
# noqa: E501
hf_overrides
=
{
"architectures"
:
[
"DeepseekVLV2ForCausalLM"
]}),
# noqa: E501
"FuyuForCausalLM"
:
_HfExamplesInfo
(
"adept/fuyu-8b"
),
"FuyuForCausalLM"
:
_HfExamplesInfo
(
"adept/fuyu-8b"
),
"H2OVLChatModel"
:
_HfExamplesInfo
(
"h2oai/h2ovl-mississippi-800m"
),
"GLM4VForCausalLM"
:
_HfExamplesInfo
(
"THUDM/glm-4v-9b"
,
trust_remote_code
=
True
,
hf_overrides
=
{
"architectures"
:
[
"GLM4VForCausalLM"
]}),
# noqa: E501
"H2OVLChatModel"
:
_HfExamplesInfo
(
"h2oai/h2ovl-mississippi-800m"
,
extras
=
{
"2b"
:
"h2oai/h2ovl-mississippi-2b"
}),
# noqa: E501
"InternVLChatModel"
:
_HfExamplesInfo
(
"OpenGVLab/InternVL2-1B"
,
"InternVLChatModel"
:
_HfExamplesInfo
(
"OpenGVLab/InternVL2-1B"
,
extras
=
{
"2B"
:
"OpenGVLab/InternVL2-2B"
},
# noqa: E501
trust_remote_code
=
True
),
trust_remote_code
=
True
),
"Idefics3ForConditionalGeneration"
:
_HfExamplesInfo
(
"HuggingFaceM4/Idefics3-8B-Llama3"
),
# noqa: E501
"Idefics3ForConditionalGeneration"
:
_HfExamplesInfo
(
"HuggingFaceM4/Idefics3-8B-Llama3"
,
# noqa: E501
{
"tiny"
:
"HuggingFaceTB/SmolVLM-256M-Instruct"
}),
# noqa: E501
"LlavaForConditionalGeneration"
:
_HfExamplesInfo
(
"llava-hf/llava-1.5-7b-hf"
,
"LlavaForConditionalGeneration"
:
_HfExamplesInfo
(
"llava-hf/llava-1.5-7b-hf"
,
extras
=
{
"mistral"
:
"mistral-community/pixtral-12b"
}),
# noqa: E501
extras
=
{
"mistral"
:
"mistral-community/pixtral-12b"
}),
# noqa: E501
"LlavaNextForConditionalGeneration"
:
_HfExamplesInfo
(
"llava-hf/llava-v1.6-mistral-7b-hf"
),
# noqa: E501
"LlavaNextForConditionalGeneration"
:
_HfExamplesInfo
(
"llava-hf/llava-v1.6-mistral-7b-hf"
),
# noqa: E501
...
@@ -248,25 +260,29 @@ _MULTIMODAL_EXAMPLE_MODELS = {
...
@@ -248,25 +260,29 @@ _MULTIMODAL_EXAMPLE_MODELS = {
hf_overrides
=
{
"architectures"
:
[
"MantisForConditionalGeneration"
]}),
# noqa: E501
hf_overrides
=
{
"architectures"
:
[
"MantisForConditionalGeneration"
]}),
# noqa: E501
"MiniCPMO"
:
_HfExamplesInfo
(
"openbmb/MiniCPM-o-2_6"
,
"MiniCPMO"
:
_HfExamplesInfo
(
"openbmb/MiniCPM-o-2_6"
,
trust_remote_code
=
True
),
trust_remote_code
=
True
),
"MiniCPMV"
:
_HfExamplesInfo
(
"openbmb/MiniCPM-V-2_6"
,
"MiniCPMV"
:
_HfExamplesInfo
(
"openbmb/MiniCPM-Llama3-V-2_5"
,
extras
=
{
"2.6"
:
"openbmb/MiniCPM-V-2_6"
},
# noqa: E501
trust_remote_code
=
True
),
trust_remote_code
=
True
),
"MolmoForCausalLM"
:
_HfExamplesInfo
(
"allenai/Molmo-7B-D-0924"
,
"MolmoForCausalLM"
:
_HfExamplesInfo
(
"allenai/Molmo-7B-D-0924"
,
extras
=
{
"olmo"
:
"allenai/Molmo-7B-O-0924"
},
# noqa: E501
trust_remote_code
=
True
),
trust_remote_code
=
True
),
"NVLM_D"
:
_HfExamplesInfo
(
"nvidia/NVLM-D-72B"
,
"NVLM_D"
:
_HfExamplesInfo
(
"nvidia/NVLM-D-72B"
,
trust_remote_code
=
True
),
trust_remote_code
=
True
),
"PaliGemmaForConditionalGeneration"
:
_HfExamplesInfo
(
"google/paligemma-3b-pt-224"
),
# noqa: E501
"PaliGemmaForConditionalGeneration"
:
_HfExamplesInfo
(
"google/paligemma-3b-mix-224"
,
# noqa: E501
extras
=
{
"v2"
:
"google/paligemma2-3b-ft-docci-448"
}),
# noqa: E501
"Phi3VForCausalLM"
:
_HfExamplesInfo
(
"microsoft/Phi-3-vision-128k-instruct"
,
"Phi3VForCausalLM"
:
_HfExamplesInfo
(
"microsoft/Phi-3-vision-128k-instruct"
,
trust_remote_code
=
True
),
trust_remote_code
=
True
),
"PixtralForConditionalGeneration"
:
_HfExamplesInfo
(
"mistralai/Pixtral-12B-2409"
,
# noqa: E501
"PixtralForConditionalGeneration"
:
_HfExamplesInfo
(
"mistralai/Pixtral-12B-2409"
,
# noqa: E501
tokenizer_mode
=
"mistral"
),
tokenizer_mode
=
"mistral"
),
"QWenLMHeadModel"
:
_HfExamplesInfo
(
"Qwen/Qwen-VL-Chat"
,
"QwenVLForConditionalGeneration"
:
_HfExamplesInfo
(
"Qwen/Qwen-VL"
,
extras
=
{
"text_only"
:
"Qwen/Qwen-7B-Chat"
},
# noqa: E501
extras
=
{
"chat"
:
"Qwen/Qwen-VL-Chat"
},
# noqa: E501
trust_remote_code
=
True
),
trust_remote_code
=
True
,
hf_overrides
=
{
"architectures"
:
[
"QwenVLForConditionalGeneration"
]}),
# noqa: E501
"Qwen2AudioForConditionalGeneration"
:
_HfExamplesInfo
(
"Qwen/Qwen2-Audio-7B-Instruct"
),
# noqa: E501
"Qwen2AudioForConditionalGeneration"
:
_HfExamplesInfo
(
"Qwen/Qwen2-Audio-7B-Instruct"
),
# noqa: E501
"Qwen2VLForConditionalGeneration"
:
_HfExamplesInfo
(
"Qwen/Qwen2-VL-2B-Instruct"
),
# noqa: E501
"Qwen2VLForConditionalGeneration"
:
_HfExamplesInfo
(
"Qwen/Qwen2-VL-2B-Instruct"
),
# noqa: E501
"Qwen2_5_VLForConditionalGeneration"
:
_HfExamplesInfo
(
"Qwen/Qwen2.5-VL-3B-Instruct"
,
# noqa: E501
"Qwen2_5_VLForConditionalGeneration"
:
_HfExamplesInfo
(
"Qwen/Qwen2.5-VL-3B-Instruct"
,
# noqa: E501
min_transformers_version
=
"4.49"
),
# noqa: E501
min_transformers_version
=
"4.49"
),
# noqa: E501
"UltravoxModel"
:
_HfExamplesInfo
(
"fixie-ai/ultravox-v0_
3
"
,
"UltravoxModel"
:
_HfExamplesInfo
(
"fixie-ai/ultravox-v0_
5-llama-3_2-1b
"
,
trust_remote_code
=
True
),
trust_remote_code
=
True
),
# [Encoder-decoder]
# [Encoder-decoder]
"MllamaForConditionalGeneration"
:
_HfExamplesInfo
(
"meta-llama/Llama-3.2-11B-Vision-Instruct"
),
# noqa: E501
"MllamaForConditionalGeneration"
:
_HfExamplesInfo
(
"meta-llama/Llama-3.2-11B-Vision-Instruct"
),
# noqa: E501
...
@@ -280,6 +296,9 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
...
@@ -280,6 +296,9 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
speculative_model
=
"abhigoyal/vllm-medusa-llama-68m-random"
),
# noqa: E501
speculative_model
=
"abhigoyal/vllm-medusa-llama-68m-random"
),
# noqa: E501
"MLPSpeculatorPreTrainedModel"
:
_HfExamplesInfo
(
"JackFram/llama-160m"
,
"MLPSpeculatorPreTrainedModel"
:
_HfExamplesInfo
(
"JackFram/llama-160m"
,
speculative_model
=
"ibm-ai-platform/llama-160m-accelerator"
),
# noqa: E501
speculative_model
=
"ibm-ai-platform/llama-160m-accelerator"
),
# noqa: E501
"DeepSeekMTPModel"
:
_HfExamplesInfo
(
"luccafong/deepseek_mtp_main_random"
,
speculative_model
=
"luccafong/deepseek_mtp_draft_random"
,
# noqa: E501
trust_remote_code
=
True
),
}
}
_FALLBACK_MODEL
=
{
_FALLBACK_MODEL
=
{
...
...
tests/models/test_initialization.py
View file @
ec5e299c
...
@@ -7,6 +7,7 @@ from transformers import PretrainedConfig
...
@@ -7,6 +7,7 @@ from transformers import PretrainedConfig
from
vllm
import
LLM
from
vllm
import
LLM
from
..conftest
import
MODELS_ON_S3
from
.registry
import
HF_EXAMPLE_MODELS
from
.registry
import
HF_EXAMPLE_MODELS
...
@@ -18,8 +19,7 @@ def test_can_initialize(model_arch):
...
@@ -18,8 +19,7 @@ def test_can_initialize(model_arch):
# Avoid OOM
# Avoid OOM
def
hf_overrides
(
hf_config
:
PretrainedConfig
)
->
PretrainedConfig
:
def
hf_overrides
(
hf_config
:
PretrainedConfig
)
->
PretrainedConfig
:
if
hf_config
.
model_type
==
"deepseek_vl_v2"
:
hf_config
.
update
(
model_info
.
hf_overrides
)
hf_config
.
update
({
"architectures"
:
[
"DeepseekVLV2ForCausalLM"
]})
if
hasattr
(
hf_config
,
"text_config"
):
if
hasattr
(
hf_config
,
"text_config"
):
text_config
:
PretrainedConfig
=
hf_config
.
text_config
text_config
:
PretrainedConfig
=
hf_config
.
text_config
...
@@ -43,8 +43,11 @@ def test_can_initialize(model_arch):
...
@@ -43,8 +43,11 @@ def test_can_initialize(model_arch):
with
patch
.
object
(
LLM
.
get_engine_class
(),
"_initialize_kv_caches"
,
with
patch
.
object
(
LLM
.
get_engine_class
(),
"_initialize_kv_caches"
,
_initialize_kv_caches
):
_initialize_kv_caches
):
model_name
=
model_info
.
default
if
model_name
in
MODELS_ON_S3
:
model_name
=
f
"s3://vllm-ci-model-weights/
{
model_name
.
split
(
'/'
)[
-
1
]
}
"
LLM
(
LLM
(
model_
info
.
default
,
model_
name
,
tokenizer
=
model_info
.
tokenizer
,
tokenizer
=
model_info
.
tokenizer
,
tokenizer_mode
=
model_info
.
tokenizer_mode
,
tokenizer_mode
=
model_info
.
tokenizer_mode
,
speculative_model
=
model_info
.
speculative_model
,
speculative_model
=
model_info
.
speculative_model
,
...
...
tests/models/test_transformers.py
View file @
ec5e299c
...
@@ -45,10 +45,14 @@ def check_implementation(
...
@@ -45,10 +45,14 @@ def check_implementation(
(
"meta-llama/Llama-3.2-1B-Instruct"
,
"transformers"
),
(
"meta-llama/Llama-3.2-1B-Instruct"
,
"transformers"
),
(
"openai-community/gpt2"
,
"transformers"
),
(
"openai-community/gpt2"
,
"transformers"
),
(
"ArthurZ/Ilama-3.2-1B"
,
"auto"
),
# CUSTOM CODE
(
"ArthurZ/Ilama-3.2-1B"
,
"auto"
),
# CUSTOM CODE
(
"meta-llama/Llama-3.2-1B-Instruct"
,
"auto"
),
])
# trust_remote_code=True by default
])
# trust_remote_code=True by default
def
test_models
(
hf_runner
,
vllm_runner
,
example_prompts
,
model
,
def
test_models
(
model_impl
)
->
None
:
hf_runner
:
Type
[
HfRunner
],
vllm_runner
:
Type
[
VllmRunner
],
example_prompts
:
list
[
str
],
model
:
str
,
model_impl
:
str
,
)
->
None
:
maybe_raises
=
nullcontext
()
maybe_raises
=
nullcontext
()
if
model
==
"openai-community/gpt2"
and
model_impl
==
"transformers"
:
if
model
==
"openai-community/gpt2"
and
model_impl
==
"transformers"
:
...
@@ -67,10 +71,50 @@ def test_models(hf_runner, vllm_runner, example_prompts, model,
...
@@ -67,10 +71,50 @@ def test_models(hf_runner, vllm_runner, example_prompts, model,
@
multi_gpu_test
(
num_gpus
=
2
)
@
multi_gpu_test
(
num_gpus
=
2
)
def
test_distributed
(
def
test_distributed
(
hf_runner
,
hf_runner
:
Type
[
HfRunner
]
,
vllm_runner
,
vllm_runner
:
Type
[
VllmRunner
]
,
example_prompts
,
example_prompts
,
):
):
kwargs
=
{
"model_impl"
:
"transformers"
,
"tensor_parallel_size"
:
2
}
kwargs
=
{
"model_impl"
:
"transformers"
,
"tensor_parallel_size"
:
2
}
check_implementation
(
hf_runner
,
vllm_runner
,
example_prompts
,
check_implementation
(
hf_runner
,
vllm_runner
,
example_prompts
,
"meta-llama/Llama-3.2-1B-Instruct"
,
**
kwargs
)
"meta-llama/Llama-3.2-1B-Instruct"
,
**
kwargs
)
@
pytest
.
mark
.
parametrize
(
"model, quantization_kwargs"
,
[
(
"meta-llama/Llama-3.2-1B-Instruct"
,
{
"quantization"
:
"bitsandbytes"
,
"load_format"
:
"bitsandbytes"
,
},
),
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
32
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
5
])
def
test_quantization
(
vllm_runner
:
Type
[
VllmRunner
],
example_prompts
:
list
[
str
],
model
:
str
,
quantization_kwargs
:
dict
[
str
,
str
],
max_tokens
:
int
,
num_logprobs
:
int
,
)
->
None
:
with
vllm_runner
(
model
,
model_impl
=
"auto"
,
enforce_eager
=
True
,
**
quantization_kwargs
)
as
vllm_model
:
# type: ignore[arg-type]
vllm_outputs
=
vllm_model
.
generate_greedy_logprobs
(
example_prompts
,
max_tokens
=
max_tokens
,
num_logprobs
=
num_logprobs
)
with
vllm_runner
(
model
,
model_impl
=
"transformers"
,
enforce_eager
=
True
,
**
quantization_kwargs
)
as
vllm_model
:
# type: ignore[arg-type]
transformers_outputs
=
vllm_model
.
generate_greedy_logprobs
(
example_prompts
,
max_tokens
=
max_tokens
,
num_logprobs
=
num_logprobs
)
check_logprobs_close
(
outputs_0_lst
=
transformers_outputs
,
outputs_1_lst
=
vllm_outputs
,
name_0
=
"transformers"
,
name_1
=
"vllm"
,
)
tests/models/test_vision.py
0 → 100644
View file @
ec5e299c
# SPDX-License-Identifier: Apache-2.0
import
pytest
import
torch
from
vllm.model_executor.models.vision
import
resolve_visual_encoder_outputs
@
pytest
.
mark
.
parametrize
(
(
"feature_sample_layers"
,
"num_layers_loaded"
,
"max_possible_layers"
,
"expected_features"
),
[
# All layers loaded
([
1
,
10
],
10
,
10
,
[
1
,
10
]),
([
-
10
,
-
1
],
10
,
10
,
[
1
,
10
]),
# Some layers not loaded
([
1
,
10
],
10
,
20
,
[
1
,
10
]),
([
-
20
,
-
11
],
10
,
20
,
[
1
,
10
]),
])
def
test_resolve_visual_encoder_outputs
(
feature_sample_layers
,
num_layers_loaded
,
max_possible_layers
,
expected_features
):
"""
Test that offsets are correctly handled for vision feature layers.
"""
encoder_outputs
=
[
torch
.
tensor
([
idx
])
for
idx
in
range
(
num_layers_loaded
+
1
)
]
output_tensor
=
resolve_visual_encoder_outputs
(
encoder_outputs
=
encoder_outputs
,
feature_sample_layers
=
feature_sample_layers
,
post_layer_norm
=
None
,
max_possible_layers
=
max_possible_layers
)
assert
torch
.
equal
(
torch
.
tensor
(
expected_features
),
output_tensor
)
tests/models/utils.py
View file @
ec5e299c
...
@@ -248,13 +248,16 @@ def check_logprobs_close(
...
@@ -248,13 +248,16 @@ def check_logprobs_close(
warnings
.
warn
(
fail_msg
,
stacklevel
=
2
)
warnings
.
warn
(
fail_msg
,
stacklevel
=
2
)
def
build_model_context
(
model_name
:
str
,
def
build_model_context
(
task
:
TaskOption
=
"auto"
,
model_name
:
str
,
tokenizer_name
:
Optional
[
str
]
=
None
,
task
:
TaskOption
=
"auto"
,
trust_remote_code
:
bool
=
False
,
tokenizer_name
:
Optional
[
str
]
=
None
,
dtype
:
Optional
[
Union
[
str
,
torch
.
dtype
]]
=
None
,
trust_remote_code
:
bool
=
False
,
mm_processor_kwargs
:
Optional
[
Dict
]
=
None
,
dtype
:
Optional
[
Union
[
str
,
torch
.
dtype
]]
=
None
,
limit_mm_per_prompt
:
Optional
[
Dict
]
=
None
):
mm_processor_kwargs
:
Optional
[
Dict
]
=
None
,
limit_mm_per_prompt
:
Optional
[
Dict
]
=
None
,
disable_mm_preprocessor_cache
:
bool
=
True
,
):
"""Creates an InputContext for a given model.
"""Creates an InputContext for a given model.
Args:
Args:
...
@@ -283,5 +286,6 @@ def build_model_context(model_name: str,
...
@@ -283,5 +286,6 @@ def build_model_context(model_name: str,
seed
=
0
,
seed
=
0
,
mm_processor_kwargs
=
mm_processor_kwargs
,
mm_processor_kwargs
=
mm_processor_kwargs
,
limit_mm_per_prompt
=
limit_mm_per_prompt
,
limit_mm_per_prompt
=
limit_mm_per_prompt
,
disable_mm_preprocessor_cache
=
disable_mm_preprocessor_cache
,
)
)
return
InputContext
(
model_config
)
return
InputContext
(
model_config
)
tests/mq_llm_engine/test_abort.py
View file @
ec5e299c
...
@@ -12,8 +12,8 @@ from tests.mq_llm_engine.utils import RemoteMQLLMEngine, generate
...
@@ -12,8 +12,8 @@ from tests.mq_llm_engine.utils import RemoteMQLLMEngine, generate
from
vllm.engine.arg_utils
import
AsyncEngineArgs
from
vllm.engine.arg_utils
import
AsyncEngineArgs
from
..utils
import
models_path_prefix
from
..utils
import
models_path_prefix
MODEL
=
os
.
path
.
join
(
models_path_prefix
,
"
google/
gemma-1.1-2b-it"
)
MODEL
=
os
.
path
.
join
(
models_path_prefix
,
"gemma-1.1-2b-it"
)
ENGINE_ARGS
=
AsyncEngineArgs
(
model
=
MODEL
)
ENGINE_ARGS
=
AsyncEngineArgs
(
model
=
MODEL
,
load_format
=
"runai_streamer"
)
RAISED_ERROR
=
KeyError
RAISED_ERROR
=
KeyError
RAISED_VALUE
=
"foo"
RAISED_VALUE
=
"foo"
EXPECTED_TOKENS
=
250
EXPECTED_TOKENS
=
250
...
...
tests/mq_llm_engine/test_error_handling.py
View file @
ec5e299c
...
@@ -23,8 +23,10 @@ from vllm.usage.usage_lib import UsageContext
...
@@ -23,8 +23,10 @@ from vllm.usage.usage_lib import UsageContext
from
vllm.utils
import
FlexibleArgumentParser
from
vllm.utils
import
FlexibleArgumentParser
from
..utils
import
models_path_prefix
from
..utils
import
models_path_prefix
MODEL
=
os
.
path
.
join
(
models_path_prefix
,
"google/gemma-1.1-2b-it"
)
MODEL
=
os
.
path
.
join
(
models_path_prefix
,
"gemma-1.1-2b-it"
)
ENGINE_ARGS
=
AsyncEngineArgs
(
model
=
MODEL
,
enforce_eager
=
True
)
ENGINE_ARGS
=
AsyncEngineArgs
(
model
=
MODEL
,
load_format
=
"runai_streamer"
,
enforce_eager
=
True
)
RAISED_ERROR
=
KeyError
RAISED_ERROR
=
KeyError
RAISED_VALUE
=
"foo"
RAISED_VALUE
=
"foo"
...
...
tests/mq_llm_engine/test_load.py
View file @
ec5e299c
...
@@ -12,12 +12,14 @@ from tests.mq_llm_engine.utils import RemoteMQLLMEngine, generate
...
@@ -12,12 +12,14 @@ from tests.mq_llm_engine.utils import RemoteMQLLMEngine, generate
from
vllm.engine.arg_utils
import
AsyncEngineArgs
from
vllm.engine.arg_utils
import
AsyncEngineArgs
from
..utils
import
models_path_prefix
from
..utils
import
models_path_prefix
MODEL
=
os
.
path
.
join
(
models_path_prefix
,
"
google/
gemma-1.1-2b-it"
)
MODEL
=
os
.
path
.
join
(
models_path_prefix
,
"gemma-1.1-2b-it"
)
NUM_EXPECTED_TOKENS
=
10
NUM_EXPECTED_TOKENS
=
10
NUM_REQUESTS
=
10000
NUM_REQUESTS
=
10000
# Scenarios to test for num generated token.
# Scenarios to test for num generated token.
ENGINE_ARGS
=
AsyncEngineArgs
(
model
=
MODEL
,
disable_log_requests
=
True
)
ENGINE_ARGS
=
AsyncEngineArgs
(
model
=
MODEL
,
load_format
=
"runai_streamer"
,
disable_log_requests
=
True
)
@
pytest
.
fixture
(
scope
=
"function"
)
@
pytest
.
fixture
(
scope
=
"function"
)
...
...
tests/multimodal/test_processing.py
View file @
ec5e299c
...
@@ -22,8 +22,8 @@ from vllm.multimodal.processing import (PlaceholderFeaturesInfo,
...
@@ -22,8 +22,8 @@ from vllm.multimodal.processing import (PlaceholderFeaturesInfo,
replace_token_matches
)
replace_token_matches
)
# yapf: enable
# yapf: enable
from
vllm.multimodal.profiling
import
MultiModalProfiler
from
vllm.multimodal.profiling
import
MultiModalProfiler
from
vllm.
multimodal.utils
import
cached_get_t
okenizer
from
vllm.
transformers_utils.tokenizer
import
(
AnyT
okenizer
,
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
cached_tokenizer_from_config
)
from
vllm.utils
import
full_groupby
from
vllm.utils
import
full_groupby
from
.utils
import
random_image
from
.utils
import
random_image
...
@@ -553,7 +553,8 @@ def test_find_mm_placeholders(
...
@@ -553,7 +553,8 @@ def test_find_mm_placeholders(
assert
result
==
expected
assert
result
==
expected
@
pytest
.
mark
.
parametrize
(
"model_id"
,
[
"llava-hf/llava-v1.6-mistral-7b-hf"
])
@
pytest
.
mark
.
parametrize
(
"model_id"
,
[
"s3://vllm-ci-model-weights/llava-v1.6-mistral-7b-hf"
])
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
(
"limit"
,
"num_supported"
,
"is_valid"
),
(
"limit"
,
"num_supported"
,
"is_valid"
),
[(
0
,
0
,
True
),
(
0
,
1
,
True
),
(
1
,
0
,
False
),
(
1
,
1
,
True
),
(
1
,
2
,
True
),
[(
0
,
0
,
True
),
(
0
,
1
,
True
),
(
1
,
0
,
False
),
(
1
,
1
,
True
),
(
1
,
2
,
True
),
...
@@ -576,7 +577,7 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):
...
@@ -576,7 +577,7 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
model_config
,
model_config
,
tokenizer
=
cached_
get_
tokenizer
(
model_config
.
tokenizer
),
tokenizer
=
cached_tokenizer
_from_config
(
model_config
),
)
)
profiler
=
MultiModalProfiler
(
processor
)
profiler
=
MultiModalProfiler
(
processor
)
...
@@ -592,7 +593,8 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):
...
@@ -592,7 +593,8 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):
profiler
.
get_dummy_data
(
model_config
.
max_model_len
)
profiler
.
get_dummy_data
(
model_config
.
max_model_len
)
@
pytest
.
mark
.
parametrize
(
"model_id"
,
[
"llava-hf/llava-v1.6-mistral-7b-hf"
])
@
pytest
.
mark
.
parametrize
(
"model_id"
,
[
"s3://vllm-ci-model-weights/llava-v1.6-mistral-7b-hf"
])
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
(
"num_images"
,
"limit"
,
"is_valid"
),
(
"num_images"
,
"limit"
,
"is_valid"
),
[(
0
,
0
,
True
),
(
0
,
1
,
True
),
(
1
,
0
,
False
),
(
1
,
1
,
True
),
(
1
,
2
,
True
),
[(
0
,
0
,
True
),
(
0
,
1
,
True
),
(
1
,
0
,
False
),
(
1
,
1
,
True
),
(
1
,
2
,
True
),
...
@@ -615,7 +617,7 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid):
...
@@ -615,7 +617,7 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid):
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
model_config
,
model_config
,
tokenizer
=
cached_
get_
tokenizer
(
model_config
.
tokenizer
),
tokenizer
=
cached_tokenizer
_from_config
(
model_config
),
)
)
rng
=
np
.
random
.
RandomState
(
0
)
rng
=
np
.
random
.
RandomState
(
0
)
...
@@ -661,7 +663,7 @@ class _ProcessorProxy:
...
@@ -661,7 +663,7 @@ class _ProcessorProxy:
return
dict
(
exists
=
exists
)
return
dict
(
exists
=
exists
)
@
pytest
.
mark
.
parametrize
(
"model_id"
,
[
"Qwen/Qwen2-VL-
7
B-Instruct"
])
# Dummy
@
pytest
.
mark
.
parametrize
(
"model_id"
,
[
"Qwen/Qwen2-VL-
2
B-Instruct"
])
# Dummy
# yapf: disable
# yapf: disable
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
(
"call_kwargs"
,
"expected_kwargs"
),
(
"call_kwargs"
,
"expected_kwargs"
),
...
@@ -687,7 +689,7 @@ def test_hf_processor_kwargs(model_id, call_kwargs, expected_kwargs):
...
@@ -687,7 +689,7 @@ def test_hf_processor_kwargs(model_id, call_kwargs, expected_kwargs):
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
processor
=
MULTIMODAL_REGISTRY
.
create_processor
(
model_config
,
model_config
,
tokenizer
=
cached_
get_
tokenizer
(
model_config
.
tokenizer
),
tokenizer
=
cached_tokenizer
_from_config
(
model_config
),
)
)
orig_get_hf_processor
=
processor
.
info
.
get_hf_processor
orig_get_hf_processor
=
processor
.
info
.
get_hf_processor
...
...
tests/multimodal/utils.py
View file @
ec5e299c
...
@@ -17,10 +17,7 @@ def random_video(
...
@@ -17,10 +17,7 @@ def random_video(
min_wh
:
int
,
min_wh
:
int
,
max_wh
:
int
,
max_wh
:
int
,
):
):
# Temporary workaround for https://github.com/huggingface/transformers/issues/35412
num_frames
=
rng
.
randint
(
min_frames
,
max_frames
)
num_frames
=
rng
.
randint
(
min_frames
,
max_frames
)
num_frames
=
(
num_frames
//
2
)
*
2
w
,
h
=
rng
.
randint
(
min_wh
,
max_wh
,
size
=
(
2
,
))
w
,
h
=
rng
.
randint
(
min_wh
,
max_wh
,
size
=
(
2
,
))
return
rng
.
randint
(
0
,
255
,
size
=
(
num_frames
,
w
,
h
,
3
),
dtype
=
np
.
uint8
)
return
rng
.
randint
(
0
,
255
,
size
=
(
num_frames
,
w
,
h
,
3
),
dtype
=
np
.
uint8
)
...
...
tests/neuron/test_prefix_prefill.py
View file @
ec5e299c
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
import
random
from
typing
import
Optional
from
typing
import
Optional
import
pytest
import
pytest
...
@@ -171,12 +170,22 @@ def ref_context_attention(
...
@@ -171,12 +170,22 @@ def ref_context_attention(
return
output
return
output
@
pytest
.
mark
.
parametrize
(
"block_size, large_tile_size"
,
[
(
32
,
2048
),
# 64 blocks
(
32
,
4096
),
# 128 blocks
(
32
,
8192
),
# 256 blocks
(
64
,
8192
),
# 128 blocks
],
)
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"num_heads,num_queries_per_kv,head_size,mixed_precision"
,
"num_heads,num_queries_per_kv,head_size,mixed_precision"
,
[
[
(
4
,
2
,
8
,
False
),
(
4
,
2
,
8
,
False
),
(
4
,
2
,
8
,
True
),
(
4
,
2
,
8
,
True
),
(
32
,
8
,
64
,
True
),
(
32
,
8
,
64
,
True
),
(
16
,
2
,
128
,
True
),
],
],
)
)
@
torch
.
inference_mode
()
@
torch
.
inference_mode
()
...
@@ -184,6 +193,8 @@ def test_contexted_kv_attention(
...
@@ -184,6 +193,8 @@ def test_contexted_kv_attention(
num_heads
:
int
,
num_heads
:
int
,
num_queries_per_kv
:
int
,
num_queries_per_kv
:
int
,
head_size
:
int
,
head_size
:
int
,
block_size
:
int
,
large_tile_size
,
mixed_precision
:
bool
,
mixed_precision
:
bool
,
)
->
None
:
)
->
None
:
import
os
import
os
...
@@ -192,40 +203,46 @@ def test_contexted_kv_attention(
...
@@ -192,40 +203,46 @@ def test_contexted_kv_attention(
from
vllm.attention.ops.nki_flash_attn
import
flash_attn_varlen_nkifunc
from
vllm.attention.ops.nki_flash_attn
import
flash_attn_varlen_nkifunc
assert
large_tile_size
%
block_size
==
0
device
=
xm
.
xla_device
()
device
=
xm
.
xla_device
()
os
.
environ
[
"NEURON_CC_FLAGS"
]
=
(
compiler_flags
=
[
" --model-type=transformer -O1 "
"--model-type=transformer -O1"
,
" --internal-hlo2tensorizer-options='--verify-hlo' "
)
"--internal-hlo2tensorizer-options='--verify-hlo'"
,
"--retry_failed_compilation"
,
]
compiler_flags_str
=
" "
.
join
(
compiler_flags
)
os
.
environ
[
"NEURON_CC_FLAGS"
]
=
compiler_flags_str
random
.
seed
(
0
)
torch
.
manual_seed
(
0
)
torch
.
manual_seed
(
0
)
torch
.
set_printoptions
(
sci_mode
=
False
)
torch
.
set_printoptions
(
sci_mode
=
False
)
min_ctx_len
=
2
min_ctx_len
=
3
2
max_ctx_len
=
6
4
max_ctx_len
=
102
4
min_query_len
=
2
min_query_len
=
16
max_query_len
=
64
max_query_len
=
512
prefill_batch_size
=
2
prefill_batch_size
=
4
decode_batch_size
=
6
decode_batch_size
=
12
batch_size
=
prefill_batch_size
+
decode_batch_size
batch_size
=
prefill_batch_size
+
decode_batch_size
block_size
=
32
max_model_len
=
(
max_query_len
+
max_ctx_len
)
*
4
max_model_len
=
(
max_query_len
+
max_ctx_len
)
*
4
max_block_per_request
=
max_model_len
//
block_size
max_block_per_request
=
max_model_len
//
block_size
dtype
=
torch
.
float32
dtype
=
torch
.
float32
cache_size
=
(
batch_size
*
max_block_per_request
)
+
2
cache_size
=
(
batch_size
*
max_block_per_request
)
+
2
ctx_lens
=
[
prefill_ctx_lens
=
torch
.
randint
(
min_ctx_len
,
random
.
randint
(
min_ctx_len
,
max_ctx_len
)
max_ctx_len
+
1
,
(
prefill_batch_size
,
),
for
_
in
range
(
prefill_batch_size
)
dtype
=
torch
.
long
).
tolist
()
]
+
[
decode_ctx_lens
=
torch
.
randint
(
min_ctx_len
,
random
.
randint
(
min_ctx_len
,
max_ctx_len
)
max_ctx_len
+
1
,
(
decode_batch_size
,
),
for
_
in
range
(
decode_batch_size
)
dtype
=
torch
.
long
).
tolist
()
]
ctx_lens
=
prefill_ctx_lens
+
decode_ctx_lens
query_lens
=
[
query_lens
=
torch
.
randint
(
random
.
randint
(
min_query_len
,
max_query_len
)
min_query_len
,
for
_
in
range
(
prefill_batch_size
)
max_query_len
+
1
,
]
+
[
1
for
_
in
range
(
decode_batch_size
)]
(
prefill_batch_size
,
),
dtype
=
torch
.
long
,
).
tolist
()
+
[
1
for
_
in
range
(
decode_batch_size
)]
seq_lens
=
[
a
+
b
for
a
,
b
in
zip
(
query_lens
,
ctx_lens
)]
seq_lens
=
[
a
+
b
for
a
,
b
in
zip
(
query_lens
,
ctx_lens
)]
num_kv_heads
=
num_heads
//
num_queries_per_kv
num_kv_heads
=
num_heads
//
num_queries_per_kv
...
@@ -254,7 +271,6 @@ def test_contexted_kv_attention(
...
@@ -254,7 +271,6 @@ def test_contexted_kv_attention(
values
=
values
[
torch
.
randperm
(
cache_size
)]
values
=
values
[
torch
.
randperm
(
cache_size
)]
block_table
=
values
[:
batch_size
*
max_block_per_request
].
view
(
block_table
=
values
[:
batch_size
*
max_block_per_request
].
view
(
batch_size
,
max_block_per_request
)
batch_size
,
max_block_per_request
)
torch
.
tensor
(
seq_lens
,
dtype
=
torch
.
long
)
b_ctx_len
=
torch
.
tensor
(
ctx_lens
,
dtype
=
torch
.
long
)
b_ctx_len
=
torch
.
tensor
(
ctx_lens
,
dtype
=
torch
.
long
)
b_start_loc
=
torch
.
cumsum
(
torch
.
tensor
([
0
]
+
query_lens
[:
-
1
],
b_start_loc
=
torch
.
cumsum
(
torch
.
tensor
([
0
]
+
query_lens
[:
-
1
],
dtype
=
torch
.
long
),
dtype
=
torch
.
long
),
...
@@ -311,9 +327,7 @@ def test_contexted_kv_attention(
...
@@ -311,9 +327,7 @@ def test_contexted_kv_attention(
# build neuron program
# build neuron program
return_debug_tensors
=
False
return_debug_tensors
=
False
B_P_SIZE
=
128
B_P_SIZE
=
128
LARGE_TILE_SZ
=
2048
LARGE_TILE_SZ
=
large_tile_size
max_num_queries
=
(
(
sum
(
query_lens
)
+
block_size
-
1
)
//
block_size
)
*
block_size
def
get_active_block_tables
(
block_tables
,
query_lens
,
seq_lens
,
block_size
,
def
get_active_block_tables
(
block_tables
,
query_lens
,
seq_lens
,
block_size
,
num_blocks
):
num_blocks
):
...
@@ -332,26 +346,28 @@ def test_contexted_kv_attention(
...
@@ -332,26 +346,28 @@ def test_contexted_kv_attention(
0
,
0
,
)
)
def
shift_bit_length
(
x
):
def
ceil_div
(
a
,
b
):
return
1
<<
(
x
-
1
).
bit_length
()
return
(
a
+
b
-
1
)
//
b
def
pad_to_multiple
(
a
,
b
):
return
ceil_div
(
a
,
b
)
*
b
def
pad_to_next_power_of_2
(
a
):
assert
a
>
0
return
2
**
int
(
a
-
1
).
bit_length
()
# calculate input shapes
# calculate input shapes
max_num_queries_shifted
=
shift_bit_length
(
max_num_queries
)
max_num_queries
=
pad_to_multiple
(
sum
(
query_lens
),
block_size
)
max_num_queries_factor
=
B_P_SIZE
//
max_num_queries_shifted
max_num_queries
=
pad_to_next_power_of_2
(
max_num_queries
)
max_num_queries_padded
=
max_num_queries_shifted
*
max_num_queries_factor
assert
(
max_num_queries_padded
==
B_P_SIZE
),
"invalid {max_num_queries_padded=}"
head_size_padded
=
B_P_SIZE
head_size_padded
=
B_P_SIZE
assert
head_size_padded
>=
head_size
context_lens
=
torch
.
tensor
(
seq_lens
)
-
torch
.
tensor
(
query_lens
)
context_lens
=
torch
.
tensor
(
seq_lens
)
-
torch
.
tensor
(
query_lens
)
num_active_blocks_shifted
=
shift_bit_length
(
num_active_blocks
=
ceil_div
(
context_lens
,
block_size
).
sum
().
item
()
((
context_lens
+
block_size
-
1
)
//
block_size
).
sum
().
item
())
num_active_blocks
=
pad_to_multiple
(
num_active_blocks
,
num_active_blocks_factor
=
(
LARGE_TILE_SZ
//
block_size
//
LARGE_TILE_SZ
//
block_size
)
num_active_blocks_shifted
)
num_active_blocks
=
num_active_blocks_shifted
*
num_active_blocks_factor
assert
(
num_active_blocks
*
block_size
)
==
LARGE_TILE_SZ
,
"invalid {num_active_blocks=}"
context_kv_len
=
num_active_blocks
*
block_size
context_kv_len
=
num_active_blocks
*
block_size
assert
context_kv_len
==
LARGE_TILE_SZ
,
f
"invalid
{
context_kv_len
=
}
"
assert
(
context_kv_len
%
LARGE_TILE_SZ
==
0
),
f
"invalid context_kv_len=
{
context_kv_len
}
"
# pad QKV tensors
# pad QKV tensors
pad_dims
=
(
pad_dims
=
(
...
@@ -360,7 +376,7 @@ def test_contexted_kv_attention(
...
@@ -360,7 +376,7 @@ def test_contexted_kv_attention(
0
,
0
,
0
,
0
,
0
,
0
,
max_num_queries
_padded
-
query
.
shape
[
0
],
max_num_queries
-
query
.
shape
[
0
],
)
)
query
=
F
.
pad
(
query
,
pad_dims
,
"constant"
,
0
)
query
=
F
.
pad
(
query
,
pad_dims
,
"constant"
,
0
)
k
=
F
.
pad
(
k
,
pad_dims
,
"constant"
,
0
)
k
=
F
.
pad
(
k
,
pad_dims
,
"constant"
,
0
)
...
@@ -397,7 +413,7 @@ def test_contexted_kv_attention(
...
@@ -397,7 +413,7 @@ def test_contexted_kv_attention(
0
,
0
,
context_kv_len
-
prior_mask
.
shape
[
1
],
context_kv_len
-
prior_mask
.
shape
[
1
],
0
,
0
,
B_P_SIZE
-
prior_mask
.
shape
[
0
],
max_num_queries
-
prior_mask
.
shape
[
0
],
),
),
"constant"
,
"constant"
,
0
,
0
,
...
@@ -406,9 +422,9 @@ def test_contexted_kv_attention(
...
@@ -406,9 +422,9 @@ def test_contexted_kv_attention(
active_mask
,
active_mask
,
(
(
0
,
0
,
B_P_SIZE
-
active_mask
.
shape
[
1
],
max_num_queries
-
active_mask
.
shape
[
1
],
0
,
0
,
B_P_SIZE
-
active_mask
.
shape
[
0
],
max_num_queries
-
active_mask
.
shape
[
0
],
),
),
"constant"
,
"constant"
,
0
,
0
,
...
@@ -430,6 +446,8 @@ def test_contexted_kv_attention(
...
@@ -430,6 +446,8 @@ def test_contexted_kv_attention(
n_kv_head
=
num_kv_heads
,
n_kv_head
=
num_kv_heads
,
head_size
=
head_size
,
head_size
=
head_size
,
mixed_precision
=
mixed_precision
,
mixed_precision
=
mixed_precision
,
LARGE_TILE_SZ
=
LARGE_TILE_SZ
,
return_debug_tensors
=
return_debug_tensors
,
)
)
if
return_debug_tensors
:
if
return_debug_tensors
:
...
@@ -439,17 +457,15 @@ def test_contexted_kv_attention(
...
@@ -439,17 +457,15 @@ def test_contexted_kv_attention(
output_nki
=
flash_attn_varlen_nkifunc
(
*
input_args
,
**
input_kwargs
)
output_nki
=
flash_attn_varlen_nkifunc
(
*
input_args
,
**
input_kwargs
)
debug_tensors
=
[]
debug_tensors
=
[]
output_nki
=
torch
.
tensor
(
output_nki
).
cpu
()
debug_tensors
=
[
torch
.
tensor
(
dt
).
cpu
()
for
dt
in
debug_tensors
]
debug_tensors
=
[
torch
.
tensor
(
dt
).
cpu
()
for
dt
in
debug_tensors
]
num_actual_tokens
=
sum
(
query_lens
)
num_actual_tokens
=
sum
(
query_lens
)
print
(
f
"
{
num_actual_tokens
=
}
"
)
# - o: shape (bs, n_heads, seq_q, d) -> (bs, seq_q, n_heads, d)
# - o: shape (bs, n_heads, seq_q, d) -> (bs, seq_q, n_heads, d)
output_nki
=
output_nki
.
permute
(
output_nki
=
output_nki
.
cpu
().
permute
(
0
,
2
,
1
,
3
)[:,
:,
:,
:
head_size
]
0
,
2
,
1
,
3
)[:,
:,
:,
:
head_size
].
cpu
()
[
0
,
:
num_actual_tokens
,
:,
:]
output_nki
=
output_nki
[
0
,
:
num_actual_tokens
,
:,
:]
output_ref_padded
=
F
.
pad
(
output_ref_padded
=
F
.
pad
(
output_ref
,
output_ref
,
(
0
,
0
,
0
,
0
,
0
,
0
,
0
,
max_num_queries
_padded
-
output_ref
.
shape
[
0
]),
(
0
,
0
,
0
,
0
,
0
,
0
,
0
,
max_num_queries
-
output_ref
.
shape
[
0
]),
"constant"
,
"constant"
,
0
,
0
,
)
)
...
...
tests/plugins_tests/test_platform_plugins.py
View file @
ec5e299c
...
@@ -14,7 +14,7 @@ def test_platform_plugins():
...
@@ -14,7 +14,7 @@ def test_platform_plugins():
import
os
import
os
example_file
=
os
.
path
.
join
(
example_file
=
os
.
path
.
join
(
os
.
path
.
dirname
(
os
.
path
.
dirname
(
os
.
path
.
dirname
(
current_file
))),
os
.
path
.
dirname
(
os
.
path
.
dirname
(
os
.
path
.
dirname
(
current_file
))),
"examples"
,
"offline_inference/basic.py"
)
"examples"
,
"offline_inference/basic
/basic
.py"
)
runpy
.
run_path
(
example_file
)
runpy
.
run_path
(
example_file
)
# check if the plugin is loaded correctly
# check if the plugin is loaded correctly
...
...
tests/plugins_tests/test_scheduler_plugins.py
0 → 100644
View file @
ec5e299c
# SPDX-License-Identifier: Apache-2.0
from
vllm.core.scheduler
import
Scheduler
class
DummyScheduler
(
Scheduler
):
def
schedule
(
self
):
raise
Exception
(
"Exception raised by DummyScheduler"
)
def
test_scheduler_plugins
():
import
pytest
from
vllm.engine.arg_utils
import
EngineArgs
from
vllm.engine.llm_engine
import
LLMEngine
from
vllm.sampling_params
import
SamplingParams
with
pytest
.
raises
(
Exception
)
as
exception_info
:
engine_args
=
EngineArgs
(
model
=
"facebook/opt-125m"
,
enforce_eager
=
True
,
# reduce test time
scheduler_cls
=
DummyScheduler
,
)
engine
=
LLMEngine
.
from_engine_args
(
engine_args
=
engine_args
)
sampling_params
=
SamplingParams
(
max_tokens
=
1
)
engine
.
add_request
(
"0"
,
"foo"
,
sampling_params
)
engine
.
step
()
assert
str
(
exception_info
.
value
)
==
"Exception raised by DummyScheduler"
tests/quantization/test_compressed_tensors.py
View file @
ec5e299c
...
@@ -506,7 +506,6 @@ def test_compressed_tensors_2of4_quant_int8(vllm_runner, args_2of4):
...
@@ -506,7 +506,6 @@ def test_compressed_tensors_2of4_quant_int8(vllm_runner, args_2of4):
assert
output
assert
output
@
pytest
.
mark
.
skip
(
reason
=
"2of4 sparse w16a16 CUTLASS produces bad output."
)
@
pytest
.
mark
.
skipif
(
@
pytest
.
mark
.
skipif
(
not
sparse_cutlass_supported
(),
not
sparse_cutlass_supported
(),
reason
=
"2of4 Sparse is not yet supported on this GPU type."
,
reason
=
"2of4 Sparse is not yet supported on this GPU type."
,
...
...
tests/quantization/test_cpu_offload.py
View file @
ec5e299c
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# Expanded quantized model tests for CPU offloading
# Expanded quantized model tests for CPU offloading
# Base tests: tests/basic_correctness/test_cpu_offload.py
# Base tests: tests/basic_correctness/test_cpu_offload.py
...
@@ -16,14 +16,14 @@ from vllm.utils import is_hip
...
@@ -16,14 +16,14 @@ from vllm.utils import is_hip
reason
=
"fp8 is not supported on this GPU type."
)
reason
=
"fp8 is not supported on this GPU type."
)
def
test_cpu_offload_fp8
():
def
test_cpu_offload_fp8
():
# Test quantization of an unquantized checkpoint
# Test quantization of an unquantized checkpoint
compare_two_settings
(
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/
Meta-
Llama-3
-8
B-Instruct"
),
compare_two_settings
(
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3
.2-1
B-Instruct"
),
[
"--quantization"
,
"fp8"
],
[
"--quantization"
,
"fp8"
],
[
"--quantization"
,
"fp8"
,
"--cpu-offload-gb"
,
"
2
"
],
[
"--quantization"
,
"fp8"
,
"--cpu-offload-gb"
,
"
1
"
],
max_wait_seconds
=
480
)
max_wait_seconds
=
480
)
# Test loading a quantized checkpoint
# Test loading a quantized checkpoint
compare_two_settings
(
os
.
path
.
join
(
models_path_prefix
,
"neuralmagic/
Meta-Llama-3-8
B-Instruct-FP8"
),
[],
#
compare_two_settings(os.path.join(models_path_prefix, "neuralmagic/
Qwen2-1.5
B-Instruct-FP8"), [],
[
"--cpu-offload-gb"
,
"
2
"
],
#
["--cpu-offload-gb", "
1
"],
max_wait_seconds
=
480
)
#
max_wait_seconds=480)
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"gptq_marlin"
)
or
is_hip
(),
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"gptq_marlin"
)
or
is_hip
(),
...
...
tests/quantization/test_fp8.py
View file @
ec5e299c
...
@@ -58,10 +58,21 @@ def test_kv_cache_model_load_and_run(vllm_runner, model_id: str):
...
@@ -58,10 +58,21 @@ def test_kv_cache_model_load_and_run(vllm_runner, model_id: str):
assert
isinstance
(
attn
.
quant_method
,
Fp8KVCacheMethod
)
assert
isinstance
(
attn
.
quant_method
,
Fp8KVCacheMethod
)
# NOTE: it is valid for scales to be 1.0 (default value), but
if
not
current_platform
.
is_rocm
():
# we know these checkpoints have scales < 1.0
# NOTE: This code path requires validation on Non-CUDA platform
assert
0.0
<
attn
.
_k_scale
<
1.0
# NOTE: it is valid for scales to be 1.0 (default value), but
assert
0.0
<
attn
.
_v_scale
<
1.0
# we know these checkpoints have scales < 1.0
assert
0.0
<
attn
.
_k_scale
<
1.0
assert
0.0
<
attn
.
_v_scale
<
1.0
else
:
# NOTE: This code path is for ROCm platform
# NOTE: it is valid for scales to be 1.0 (default value), but
# we know these checkpoints have scales < 1.0
# However on ROCm platform, the _k_scale and _v_scale will be
# scaled by a factor of 2 as described in
# vllm/model_executor/layers/quantization/kv_cache.py
assert
0.0
<
attn
.
_k_scale
<
(
1.0
*
2.0
)
assert
0.0
<
attn
.
_v_scale
<
(
1.0
*
2.0
)
llm
.
apply_model
(
check_model
)
llm
.
apply_model
(
check_model
)
...
@@ -94,13 +105,29 @@ def test_load_fp16_model(vllm_runner, kv_cache_dtype: str, force_marlin: bool,
...
@@ -94,13 +105,29 @@ def test_load_fp16_model(vllm_runner, kv_cache_dtype: str, force_marlin: bool,
assert
attn
.
_k_scale
==
1.0
assert
attn
.
_k_scale
==
1.0
assert
attn
.
_v_scale
==
1.0
assert
attn
.
_v_scale
==
1.0
if
current_platform
.
has_device_capability
(
89
)
and
not
force_marlin
:
if
current_platform
.
is_cuda
():
# For GPUs with hardware support, we keep weights in fp8
if
current_platform
.
has_device_capability
(
assert
fc1
.
weight
.
dtype
==
torch
.
float8_e4m3fn
89
)
and
not
force_marlin
:
else
:
# For GPUs with hardware support, we keep weights in fp8
# For GPUs without hardware support, we pack the fp8 weights
assert
fc1
.
weight
.
dtype
==
torch
.
float8_e4m3fn
# for weight-only quantization using Marlin kernels
else
:
assert
fc1
.
weight
.
dtype
==
torch
.
int32
# For GPUs without hardware support, we pack the fp8 weights
# for weight-only quantization using Marlin kernels
assert
fc1
.
weight
.
dtype
==
torch
.
int32
elif
current_platform
.
is_rocm
():
# Only MI300 and above support quantization='fp8'
if
current_platform
.
has_device_capability
(
94
)
and
not
force_marlin
:
# For GPUs with hardware support, we keep weights in fp8
assert
fc1
.
weight
.
dtype
==
torch
.
float8_e4m3fnuz
else
:
# unsupported ROCm platform
pytest
.
skip
(
"Skip `test_load_fp16_model`. "
"It only runs on ROCm platform with FP8 compute."
" e.g. MI300X and above."
)
else
:
# unsupported platform
pytest
.
skip
(
"Skip `test_load_fp16_model`. "
"It only runs on CUDA and ROCm platform."
)
llm
.
apply_model
(
check_model
)
llm
.
apply_model
(
check_model
)
...
...
tests/quantization/test_gptq_dynamic.py
0 → 100644
View file @
ec5e299c
# SPDX-License-Identifier: Apache-2.0
"""Tests whether gptq models with dynamic quantized can be loaded.
Run `pytest tests/quantization/test_gptq_dynamic.py --forked`.
"""
import
pytest
import
torch
from
vllm.model_executor.layers.linear
import
UnquantizedLinearMethod
from
vllm.model_executor.layers.quantization.gptq
import
GPTQLinearMethod
from
vllm.model_executor.layers.quantization.gptq_marlin
import
(
GPTQMarlinLinearMethod
)
from
vllm.model_executor.layers.quantization.utils.gptq_utils
import
(
get_dynamic_override
)
PROMPT
=
"On the surface of Mars, we found"
# The first layer is quantized using bits=4, group_size=128
# The second layer is quantized using bits=8, group_size=32
# All other layers (layer index >= 2) are not quantized
MODEL_QUANT
=
[
(
"ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue"
,
True
),
(
"ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symFalse"
,
False
),
]
@
pytest
.
mark
.
parametrize
(
"model_id, use_marlin_kernel"
,
MODEL_QUANT
)
def
test_gptq_with_dynamic
(
vllm_runner
,
model_id
:
str
,
use_marlin_kernel
:
bool
):
vllm_model
=
vllm_runner
(
model_id
,
dtype
=
torch
.
float16
,
max_model_len
=
2048
)
linear_method_cls
=
GPTQMarlinLinearMethod
if
use_marlin_kernel
else
(
GPTQLinearMethod
)
for
name
,
submodule
in
(
vllm_model
.
model
.
llm_engine
.
model_executor
.
driver_worker
.
model_runner
.
model
.
named_modules
()):
if
name
==
"lm_head"
:
assert
isinstance
(
submodule
.
quant_method
,
linear_method_cls
)
elif
name
==
'model.layers.0.self_attn.qkv_proj'
:
# The first layer is quantized using bits=4, group_size=128
# desc_act=True
assert
isinstance
(
submodule
.
quant_method
,
linear_method_cls
)
config
=
submodule
.
quant_method
.
quant_config
assert
config
.
weight_bits
==
4
assert
config
.
group_size
==
128
assert
config
.
desc_act
elif
name
==
'model.layers.1.self_attn.qkv_proj'
:
# The second layer is quantized using bits=8, group_size=32
# desc_act=False
assert
isinstance
(
submodule
.
quant_method
,
linear_method_cls
)
config
=
submodule
.
quant_method
.
quant_config
assert
get_dynamic_override
(
config
,
layer_name
=
name
,
key
=
"bits"
)
==
8
assert
get_dynamic_override
(
config
,
layer_name
=
name
,
key
=
"group_size"
)
==
32
assert
not
get_dynamic_override
(
config
,
layer_name
=
name
,
key
=
"desc_act"
)
elif
(
name
==
'model.layers.2.self_attn.qkv_proj'
or
name
==
'model.layers.2.mlp.gate_up_proj'
):
# All other layers (layer index >= 2) are not quantized
assert
isinstance
(
submodule
.
quant_method
,
UnquantizedLinearMethod
)
del
vllm_model
tests/quantization/test_lm_head.py
View file @
ec5e299c
...
@@ -3,7 +3,6 @@
...
@@ -3,7 +3,6 @@
Run `pytest tests/quantization/test_quant_lm_head_true.py --forked`.
Run `pytest tests/quantization/test_quant_lm_head_true.py --forked`.
"""
"""
from
typing
import
Tuple
import
pytest
import
pytest
import
torch
import
torch
...
@@ -19,31 +18,31 @@ from ..utils import models_path_prefix
...
@@ -19,31 +18,31 @@ from ..utils import models_path_prefix
PROMPT
=
"On the surface of Mars, we found"
PROMPT
=
"On the surface of Mars, we found"
MODELS_QUANT
=
[(
MODELS_QUANT
=
[
os
.
path
.
join
(
models_path_prefix
,
"LnL-AI/TinyLlama-1.1B-intermediate-step-1341k-3T-autoround-lm_head-symFalse"
),
(
os
.
path
.
join
(
models_path_prefix
,
"ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head"
),
True
),
True
),
(
os
.
path
.
join
(
models_path_prefix
,
"TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"
),
False
)]
(
os
.
path
.
join
(
models_path_prefix
,
"ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit-10-25-2024"
),
False
),
# (os.path.join(models_path_prefix, "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"), False)]
(
os
.
path
.
join
(
models_path_prefix
,
"TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"
),
False
),
# (os.path.join(models_path_prefix, "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"), False)
]
@
pytest
.
mark
.
parametrize
(
"model_lm_head_quant"
,
MODELS_QUANT
)
@
pytest
.
mark
.
parametrize
(
"model_
id,
lm_head_quant
ized
"
,
MODELS_QUANT
)
def
test_lm_head
(
def
test_lm_head
(
vllm_runner
,
vllm_runner
,
model_lm_head_quant
:
Tuple
[
str
,
bool
],
model_id
:
str
,
lm_head_quantized
:
bool
,
)
->
None
:
)
->
None
:
model
,
lm_head_quantized
=
model_lm_head_quant
with
vllm_runner
(
model_id
,
dtype
=
torch
.
float16
,
with
vllm_runner
(
model
,
dtype
=
torch
.
float16
,
max_model_len
=
2048
)
as
vllm_model
:
max_model_len
=
2048
)
as
vllm_model
:
def
check_model
(
model
):
def
check_model
(
model
):
lm_head_layer
=
model
.
lm_head
lm_head_layer
=
model
.
lm_head
if
lm_head_quantized
:
if
lm_head_quantized
:
assert
isinstance
(
lm_head_layer
.
linear
_method
,
assert
isinstance
(
lm_head_layer
.
quant
_method
,
(
GPTQLinearMethod
,
GPTQMarlinLinearMethod
,
(
GPTQLinearMethod
,
GPTQMarlinLinearMethod
,
MarlinLinearMethod
))
MarlinLinearMethod
))
else
:
else
:
assert
isinstance
(
lm_head_layer
.
linear
_method
,
assert
isinstance
(
lm_head_layer
.
quant
_method
,
UnquantizedEmbeddingMethod
)
UnquantizedEmbeddingMethod
)
vllm_model
.
apply_model
(
check_model
)
vllm_model
.
apply_model
(
check_model
)
...
...
tests/quantization/test_ptpc_fp8.py
0 → 100644
View file @
ec5e299c
# SPDX-License-Identifier: Apache-2.0
"""Tests whether PTPC w8a8 FP8 computation is enabled correctly.
Run `pytest tests/quantization/test_ptpc_fp8.py --forked`.
"""
import
pytest
import
torch
from
tests.quantization.utils
import
is_quant_method_supported
from
vllm.model_executor.layers.quantization.fp8
import
Fp8KVCacheMethod
from
vllm.model_executor.layers.quantization.ptpc_fp8
import
(
PTPCFp8LinearMethod
)
from
vllm.platforms
import
current_platform
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"ptpc_fp8"
),
reason
=
"PTPC FP8 is not supported on this GPU type."
)
@
pytest
.
mark
.
skipif
(
not
current_platform
.
is_rocm
(),
reason
=
"This test is for ROCm GPU."
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"auto"
,
"bfloat16"
,
"float16"
])
@
pytest
.
mark
.
parametrize
(
"kv_cache_dtype"
,
[
"auto"
,
"fp8"
,
"fp8_e4m3"
])
def
test_ptpc_fp8_rocm
(
vllm_runner
,
dtype
:
str
,
kv_cache_dtype
:
str
)
->
None
:
try
:
with
vllm_runner
(
"facebook/opt-125m"
,
dtype
=
dtype
,
quantization
=
"ptpc_fp8"
,
kv_cache_dtype
=
kv_cache_dtype
)
as
llm
:
model
=
llm
.
model
.
llm_engine
.
model_executor
.
driver_worker
.
model_runner
.
model
# noqa: E501
fc1
=
model
.
model
.
decoder
.
layers
[
0
].
fc1
assert
isinstance
(
fc1
.
quant_method
,
PTPCFp8LinearMethod
)
if
kv_cache_dtype
==
"ptpc_fp8"
:
attn
=
model
.
model
.
decoder
.
layers
[
0
].
self_attn
.
attn
assert
isinstance
(
attn
.
quant_method
,
Fp8KVCacheMethod
)
assert
attn
.
_k_scale
==
1.0
assert
attn
.
_v_scale
==
1.0
if
current_platform
.
has_device_capability
(
94
):
# For GPUs with hardware support, we keep weights in fp8
assert
fc1
.
weight
.
dtype
==
torch
.
float8_e4m3fnuz
else
:
pytest
.
skip
()
output
=
llm
.
generate_greedy
(
"Hello my name is"
,
max_tokens
=
20
)
assert
output
except
AssertionError
as
e
:
if
str
(
e
)
==
"Currently torch._scaled_mm (hipBLASLt) rowwise gemm only support output dtype of bfloat16. torch.float16 is specified."
:
# noqa: E501
# If the error message matches, the test passes
pass
else
:
# If the error message does not match, re-raise the exception
raise
Prev
1
…
8
9
10
11
12
13
14
15
16
…
27
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment