Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
02462465
Unverified
Commit
02462465
authored
Nov 05, 2024
by
Michael Goin
Committed by
GitHub
Nov 05, 2024
Browse files
[CI] Prune tests/models/decoder_only/language/* tests (#9940)
Signed-off-by:
mgoin
<
michael@neuralmagic.com
>
parent
b9c64c0c
Changes
9
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
70 additions
and
270 deletions
+70
-270
.buildkite/test-pipeline.yaml
.buildkite/test-pipeline.yaml
+1
-2
tests/models/decoder_only/language/test_big_models.py
tests/models/decoder_only/language/test_big_models.py
+0
-93
tests/models/decoder_only/language/test_fp8.py
tests/models/decoder_only/language/test_fp8.py
+5
-5
tests/models/decoder_only/language/test_gptq_marlin.py
tests/models/decoder_only/language/test_gptq_marlin.py
+0
-13
tests/models/decoder_only/language/test_gptq_marlin_24.py
tests/models/decoder_only/language/test_gptq_marlin_24.py
+6
-6
tests/models/decoder_only/language/test_marlin.py
tests/models/decoder_only/language/test_marlin.py
+0
-69
tests/models/decoder_only/language/test_mistral.py
tests/models/decoder_only/language/test_mistral.py
+21
-16
tests/models/decoder_only/language/test_models.py
tests/models/decoder_only/language/test_models.py
+37
-32
tests/models/decoder_only/language/test_qwen.py
tests/models/decoder_only/language/test_qwen.py
+0
-34
No files found.
.buildkite/test-pipeline.yaml
View file @
02462465
...
...
@@ -321,7 +321,6 @@ steps:
-
tests/models/decoder_only/language
commands
:
-
pytest -v -s models/decoder_only/language/test_models.py
-
pytest -v -s models/decoder_only/language/test_big_models.py
-
label
:
Decoder-only Language Models Test (Extended)
# 1h20min
nightly
:
true
...
...
@@ -329,7 +328,7 @@ steps:
-
vllm/
-
tests/models/decoder_only/language
commands
:
-
pytest -v -s models/decoder_only/language --ignore=models/decoder_only/language/test_models.py
--ignore=models/decoder_only/language/test_big_models.py
-
pytest -v -s models/decoder_only/language --ignore=models/decoder_only/language/test_models.py
-
label
:
Decoder-only Multi-Modal Models Test (Standard)
#mirror_hardwares: [amd]
...
...
tests/models/decoder_only/language/test_big_models.py
deleted
100644 → 0
View file @
b9c64c0c
"""Compare the outputs of HF and vLLM when using greedy sampling.
This tests bigger models and use half precision.
Run `pytest tests/models/test_big_models.py`.
"""
import
pytest
from
vllm.platforms
import
current_platform
from
...utils
import
check_logprobs_close
,
check_outputs_equal
MODELS
=
[
"meta-llama/Llama-2-7b-hf"
,
# "mistralai/Mistral-7B-v0.1", # Tested by test_mistral.py
# "Deci/DeciLM-7b", # Broken
# "tiiuae/falcon-7b", # Broken
"EleutherAI/gpt-j-6b"
,
# "mosaicml/mpt-7b", # Broken
# "Qwen/Qwen1.5-0.5B" # Broken,
]
if
not
current_platform
.
is_cpu
():
MODELS
+=
[
# fused_moe which not supported on CPU
"openbmb/MiniCPM3-4B"
,
# Head size isn't supported on CPU
"h2oai/h2o-danube3-4b-base"
,
]
# TODO: remove this after CPU float16 support ready
target_dtype
=
"float"
if
current_platform
.
is_cpu
()
else
"half"
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
target_dtype
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
32
])
def
test_models
(
hf_runner
,
vllm_runner
,
example_prompts
,
model
:
str
,
dtype
:
str
,
max_tokens
:
int
,
)
->
None
:
if
model
==
"openbmb/MiniCPM3-4B"
:
# the output becomes slightly different when upgrading to
# pytorch 2.5 . Changing to logprobs checks instead of exact
# output checks.
NUM_LOG_PROBS
=
8
with
hf_runner
(
model
,
dtype
=
dtype
)
as
hf_model
:
hf_outputs
=
hf_model
.
generate_greedy_logprobs_limit
(
example_prompts
,
max_tokens
,
NUM_LOG_PROBS
)
with
vllm_runner
(
model
,
dtype
=
dtype
,
enforce_eager
=
True
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy_logprobs
(
example_prompts
,
max_tokens
,
NUM_LOG_PROBS
)
check_logprobs_close
(
outputs_0_lst
=
hf_outputs
,
outputs_1_lst
=
vllm_outputs
,
name_0
=
"hf"
,
name_1
=
"vllm"
,
)
else
:
with
hf_runner
(
model
,
dtype
=
dtype
)
as
hf_model
:
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
with
vllm_runner
(
model
,
dtype
=
dtype
,
enforce_eager
=
True
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
check_outputs_equal
(
outputs_0_lst
=
hf_outputs
,
outputs_1_lst
=
vllm_outputs
,
name_0
=
"hf"
,
name_1
=
"vllm"
,
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
target_dtype
])
def
test_model_print
(
vllm_runner
,
model
:
str
,
dtype
:
str
,
)
->
None
:
with
vllm_runner
(
model
,
dtype
=
dtype
,
enforce_eager
=
True
)
as
vllm_model
:
# This test is for verifying whether the model's extra_repr
# can be printed correctly.
print
(
vllm_model
.
model
.
llm_engine
.
model_executor
.
driver_worker
.
model_runner
.
model
)
tests/models/decoder_only/language/test_fp8.py
View file @
02462465
...
...
@@ -21,11 +21,11 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true"
"kv_cache_dtype,base_model,test_model,scale_path"
,
[
# Test FP8 checkpoint w. fp8_e4m3 kv-cache scaling factors.
(
"fp8_e4m3"
,
"meta-llama/
Meta-
Llama-3
-8
B-Instruct"
,
"nm-testing/
Meta-
Llama-3
-8
B-Instruct-FP8-KV"
,
None
),
(
"fp8_e4m3"
,
"meta-llama/Llama-3
.2-1
B-Instruct"
,
"nm-testing/Llama-3
.2-1
B-Instruct-FP8-KV"
,
None
),
# Test FP16 checkpoint w. fp8_e5m2 kv-cache.
(
"fp8_e5m2"
,
"meta-llama/
Meta-
Llama-3
-8
B-Instruct"
,
"meta-llama/
Meta-
Llama-3
-8
B-Instruct"
,
None
),
(
"fp8_e5m2"
,
"meta-llama/Llama-3
.2-1
B-Instruct"
,
"meta-llama/Llama-3
.2-1
B-Instruct"
,
None
),
# Test FP16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json.
(
"fp8_e4m3"
,
"meta-llama/Llama-2-7b-chat-hf"
,
"meta-llama/Llama-2-7b-chat-hf"
,
...
...
@@ -33,7 +33,7 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true"
])
# Due to low-precision numerical divergence, we only test logprob of 4 tokens
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
4
])
@
pytest
.
mark
.
parametrize
(
"enforce_eager"
,
[
False
,
True
])
@
pytest
.
mark
.
parametrize
(
"enforce_eager"
,
[
True
])
@
pytest
.
mark
.
parametrize
(
"backend"
,
[
"FLASH_ATTN"
,
"XFORMERS"
,
"FLASHINFER"
])
# NOTE: Increasing this in this suite will fail CI because we currently cannot
# reset distributed env properly. Use a value > 1 just when you test.
...
...
tests/models/decoder_only/language/test_gptq_marlin.py
View file @
02462465
...
...
@@ -22,24 +22,11 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true"
MAX_MODEL_LEN
=
1024
MODELS
=
[
# act_order==False, group_size=channelwise
(
"robertgshaw2/zephyr-7b-beta-channelwise-gptq"
,
"main"
),
# act_order==False, group_size=128
(
"TheBloke/Llama-2-7B-GPTQ"
,
"main"
),
# act_order==True, group_size=128
(
"TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"
,
"main"
),
# act_order==True, group_size=64
(
"TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"
,
"gptq-4bit-64g-actorder_True"
),
# act_order==True, group_size=32
(
"TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"
,
"gptq-4bit-32g-actorder_True"
),
# 8-bit, act_order==True, group_size=channelwise
(
"TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"
,
"gptq-8bit--1g-actorder_True"
),
# 8-bit, act_order==True, group_size=128
(
"TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"
,
"gptq-8bit-128g-actorder_True"
),
# 8-bit, act_order==True, group_size=32
(
"TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"
,
"gptq-8bit-32g-actorder_True"
),
# 4-bit, act_order==True, group_size=128
(
"TechxGenus/gemma-1.1-2b-it-GPTQ"
,
"main"
)
...
...
tests/models/decoder_only/language/test_gptq_marlin_24.py
View file @
02462465
...
...
@@ -25,16 +25,16 @@ model_pairs = [
# 4-bit, group_size == 128
ModelPair
(
model_marlin
=
"alexm-nm/tinyllama-24-marlin24-4bit-g128"
,
model_gptq
=
"alexm-nm/tinyllama-24-gptq-4bit-g128"
),
# 4-bit, group_size == channelwise
ModelPair
(
model_marlin
=
"alexm-nm/tinyllama-24-marlin24-4bit-channelwise"
,
model_gptq
=
"alexm-nm/tinyllama-24-gptq-4bit-channelwise"
),
#
#
4-bit, group_size == channelwise
#
ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-channelwise",
#
model_gptq="alexm-nm/tinyllama-24-gptq-4bit-channelwise"),
# 8-bit, group_size == 128
ModelPair
(
model_marlin
=
"alexm-nm/tinyllama-24-marlin24-8bit-g128"
,
model_gptq
=
"alexm-nm/tinyllama-24-gptq-8bit-g128"
),
# 8-bit, group_size == channelwise
ModelPair
(
model_marlin
=
"alexm-nm/tinyllama-24-marlin24-8bit-channelwise"
,
model_gptq
=
"alexm-nm/tinyllama-24-gptq-8bit-channelwise"
),
#
#
8-bit, group_size == channelwise
#
ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-channelwise",
#
model_gptq="alexm-nm/tinyllama-24-gptq-8bit-channelwise"),
]
...
...
tests/models/decoder_only/language/test_marlin.py
deleted
100644 → 0
View file @
b9c64c0c
"""Compare the outputs of a GPTQ model to a Marlin model.
Note: GPTQ and Marlin do not have bitwise correctness.
As a result, in this test, we just confirm that the top selected tokens of the
Marlin/GPTQ models are in the top 3 selections of each other.
Note: Marlin internally uses locks to synchronize the threads. This can
result in very slight nondeterminism for Marlin. As a result, we re-run the test
up to 3 times to see if we pass.
Run `pytest tests/models/test_marlin.py`.
"""
from
dataclasses
import
dataclass
import
pytest
from
tests.quantization.utils
import
is_quant_method_supported
from
...utils
import
check_logprobs_close
@
dataclass
class
ModelPair
:
model_marlin
:
str
model_gptq
:
str
model_pairs
=
[
ModelPair
(
model_marlin
=
"nm-testing/zephyr-beta-7b-marlin-g128"
,
model_gptq
=
"nm-testing/zephyr-beta-7b-gptq-g128"
),
ModelPair
(
model_marlin
=
"robertgshaw2/zephyr-7b-beta-channelwise-marlin"
,
model_gptq
=
"robertgshaw2/zephyr-7b-beta-channelwise-gptq"
),
ModelPair
(
model_marlin
=
"robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin"
,
model_gptq
=
"robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-gptq"
)
]
@
pytest
.
mark
.
flaky
(
reruns
=
2
)
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"marlin"
),
reason
=
"Marlin is not supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"model_pair"
,
model_pairs
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
32
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
5
])
def
test_models
(
vllm_runner
,
example_prompts
,
model_pair
:
ModelPair
,
dtype
:
str
,
max_tokens
:
int
,
num_logprobs
:
int
,
)
->
None
:
with
vllm_runner
(
model_pair
.
model_marlin
,
dtype
=
dtype
,
quantization
=
"marlin"
)
as
marlin_model
:
marlin_outputs
=
marlin_model
.
generate_greedy_logprobs
(
example_prompts
,
max_tokens
,
num_logprobs
)
with
vllm_runner
(
model_pair
.
model_gptq
,
dtype
=
dtype
,
quantization
=
"gptq"
)
as
gptq_model
:
gptq_outputs
=
gptq_model
.
generate_greedy_logprobs
(
example_prompts
,
max_tokens
,
num_logprobs
)
check_logprobs_close
(
outputs_0_lst
=
gptq_outputs
,
outputs_1_lst
=
marlin_outputs
,
name_0
=
"gptq"
,
name_1
=
"marlin"
,
)
tests/models/decoder_only/language/test_mistral.py
View file @
02462465
...
...
@@ -4,7 +4,7 @@ Run `pytest tests/models/test_mistral.py`.
"""
import
pytest
from
vllm
import
LLM
,
SamplingParams
from
vllm
import
SamplingParams
from
...utils
import
check_logprobs_close
...
...
@@ -15,6 +15,10 @@ MODELS = [
# "mistralai/Mistral-Nemo-Instruct-2407"
]
MISTRAL_FORMAT_MODELS
=
[
"mistralai/Mistral-7B-Instruct-v0.3"
,
]
SAMPLING_PARAMS
=
SamplingParams
(
max_tokens
=
512
,
temperature
=
0.0
,
logprobs
=
5
)
SYMBOLIC_LANG_PROMPTS
=
[
"勇敢な船乗りについての詩を書く"
,
# japanese
...
...
@@ -95,7 +99,7 @@ def test_models(
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
[
1
:]
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MISTRAL_FORMAT_
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"bfloat16"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
64
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
5
])
...
...
@@ -135,28 +139,29 @@ def test_mistral_format(
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
[
1
:]
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MISTRAL_FORMAT_
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"bfloat16"
])
@
pytest
.
mark
.
parametrize
(
"prompt"
,
SYMBOLIC_LANG_PROMPTS
)
def
test_mistral_symbolic_languages
(
vllm_runner
,
model
:
str
,
dtype
:
str
,
prompt
:
str
,
)
->
None
:
prompt
=
"hi"
msg
=
{
"role"
:
"user"
,
"content"
:
prompt
}
llm
=
LLM
(
model
=
model
,
dtype
=
dtype
,
max_model_len
=
8192
,
tokenizer_mode
=
"mistral"
,
config_format
=
"mistral"
,
load_format
=
"mistral"
)
outputs
=
llm
.
chat
([
msg
],
sampling_params
=
SAMPLING_PARAMS
)
assert
"�"
not
in
outputs
[
0
].
outputs
[
0
].
text
.
strip
()
with
vllm_runner
(
model
,
dtype
=
dtype
,
max_model_len
=
8192
,
tokenizer_mode
=
"mistral"
,
config_format
=
"mistral"
,
load_format
=
"mistral"
)
as
vllm_model
:
for
prompt
in
SYMBOLIC_LANG_PROMPTS
:
msg
=
{
"role"
:
"user"
,
"content"
:
prompt
}
outputs
=
vllm_model
.
model
.
chat
([
msg
],
sampling_params
=
SAMPLING_PARAMS
)
assert
"�"
not
in
outputs
[
0
].
outputs
[
0
].
text
.
strip
()
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"bfloat16"
])
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
[
1
:])
# v1 can't do func calling
@
pytest
.
mark
.
parametrize
(
"model"
,
MISTRAL_FORMAT_MODELS
)
# v1 can't do func calling
def
test_mistral_function_calling
(
vllm_runner
,
model
:
str
,
...
...
tests/models/decoder_only/language/test_models.py
View file @
02462465
...
...
@@ -7,25 +7,39 @@ Run `pytest tests/models/test_models.py`.
"""
import
pytest
from
...utils
import
check_outputs_equal
from
vllm.platforms
import
current_platform
from
...utils
import
check_logprobs_close
MODELS
=
[
"facebook/opt-125m"
,
"gpt2"
,
"bigcode/tiny_starcoder_py"
,
"EleutherAI/pythia-70m"
,
"bigscience/bloom-560m"
,
# Testing alibi slopes.
"microsoft/phi-2"
,
"stabilityai/stablelm-3b-4e1t"
,
# "allenai/OLMo-1B", # Broken
"bigcode/starcoder2-3b"
,
"google/gemma-1.1-2b-it"
,
"facebook/opt-125m"
,
# opt
"openai-community/gpt2"
,
# gpt2
# "Milos/slovak-gpt-j-405M", # gptj
# "bigcode/tiny_starcoder_py", # gpt_bigcode
# "EleutherAI/pythia-70m", # gpt_neox
"bigscience/bloom-560m"
,
# bloom - testing alibi slopes
"microsoft/phi-2"
,
# phi
# "stabilityai/stablelm-3b-4e1t", # stablelm
# "bigcode/starcoder2-3b", # starcoder2
"google/gemma-1.1-2b-it"
,
# gemma
"Qwen/Qwen2.5-0.5B-Instruct"
,
# qwen2
"meta-llama/Llama-3.2-1B-Instruct"
,
# llama
]
if
not
current_platform
.
is_cpu
():
MODELS
+=
[
# fused_moe which not supported on CPU
"openbmb/MiniCPM3-4B"
,
]
# TODO: remove this after CPU float16 support ready
target_dtype
=
"float"
if
current_platform
.
is_cpu
()
else
"half"
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"float"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
96
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
target_dtype
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
32
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
5
])
def
test_models
(
hf_runner
,
vllm_runner
,
...
...
@@ -33,33 +47,24 @@ def test_models(
model
:
str
,
dtype
:
str
,
max_tokens
:
int
,
num_logprobs
:
int
,
)
->
None
:
# To pass the small model tests, we need full precision.
assert
dtype
==
"float"
with
hf_runner
(
model
,
dtype
=
dtype
)
as
hf_model
:
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
hf_outputs
=
hf_model
.
generate_greedy_logprobs_limit
(
example_prompts
,
max_tokens
,
num_logprobs
)
with
vllm_runner
(
model
,
dtype
=
dtype
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
vllm_outputs
=
vllm_model
.
generate_greedy_logprobs
(
example_prompts
,
max_tokens
,
num_logprobs
)
# This test is for verifying whether the model's extra_repr
# can be printed correctly.
print
(
vllm_model
.
model
.
llm_engine
.
model_executor
.
driver_worker
.
model_runner
.
model
)
check_
outputs_equal
(
check_
logprobs_close
(
outputs_0_lst
=
hf_outputs
,
outputs_1_lst
=
vllm_outputs
,
name_0
=
"hf"
,
name_1
=
"vllm"
,
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"float"
])
def
test_model_print
(
vllm_runner
,
model
:
str
,
dtype
:
str
,
)
->
None
:
with
vllm_runner
(
model
,
dtype
=
dtype
)
as
vllm_model
:
# This test is for verifying whether the model's extra_repr
# can be printed correctly.
print
(
vllm_model
.
model
.
llm_engine
.
model_executor
.
driver_worker
.
model_runner
.
model
)
tests/models/decoder_only/language/test_qwen.py
deleted
100644 → 0
View file @
b9c64c0c
"""Ensure that a text-only Qwen model can be run without throwing an error.
We explicitly test this because Qwen is implemented as a multimodal and
supports a visual encoder for models like Qwen-VL.
"""
from
typing
import
List
,
Type
import
pytest
from
....conftest
import
VllmRunner
models
=
[
"Qwen/Qwen-7B-Chat"
# Has no visual encoder
]
@
pytest
.
mark
.
parametrize
(
"model"
,
models
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"bfloat16"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
32
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
5
])
def
test_text_only_qwen_model_can_be_loaded_and_run
(
vllm_runner
:
Type
[
VllmRunner
],
example_prompts
:
List
[
str
],
model
:
str
,
*
,
dtype
:
str
,
max_tokens
:
int
,
num_logprobs
:
int
,
):
with
vllm_runner
(
model
,
dtype
=
dtype
)
as
vllm_model
:
vllm_model
.
generate_greedy_logprobs
(
example_prompts
,
max_tokens
,
num_logprobs
=
num_logprobs
,
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment