Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
a84e598e
Unverified
Commit
a84e598e
authored
Sep 14, 2024
by
Cyrus Leung
Committed by
GitHub
Sep 13, 2024
Browse files
[CI/Build] Reorganize models tests (#7820)
parent
0a4806f0
Changes
54
Show whitespace changes
Inline
Side-by-side
Showing
14 changed files
with
137 additions
and
69 deletions
+137
-69
tests/models/decoder_only/vision_language/test_llava_next.py
tests/models/decoder_only/vision_language/test_llava_next.py
+4
-6
tests/models/decoder_only/vision_language/test_llava_next_video.py
...els/decoder_only/vision_language/test_llava_next_video.py
+2
-4
tests/models/decoder_only/vision_language/test_minicpmv.py
tests/models/decoder_only/vision_language/test_minicpmv.py
+3
-5
tests/models/decoder_only/vision_language/test_paligemma.py
tests/models/decoder_only/vision_language/test_paligemma.py
+3
-5
tests/models/decoder_only/vision_language/test_phi3v.py
tests/models/decoder_only/vision_language/test_phi3v.py
+3
-5
tests/models/decoder_only/vision_language/test_pixtral.py
tests/models/decoder_only/vision_language/test_pixtral.py
+16
-7
tests/models/decoder_only/vision_language/test_qwen.py
tests/models/decoder_only/vision_language/test_qwen.py
+3
-5
tests/models/embedding/__init__.py
tests/models/embedding/__init__.py
+0
-0
tests/models/embedding/language/__init__.py
tests/models/embedding/language/__init__.py
+0
-0
tests/models/embedding/language/test_embedding.py
tests/models/embedding/language/test_embedding.py
+0
-0
tests/models/encoder_decoder/__init__.py
tests/models/encoder_decoder/__init__.py
+0
-0
tests/models/encoder_decoder/language/__init__.py
tests/models/encoder_decoder/language/__init__.py
+0
-0
tests/models/encoder_decoder/language/test_bart.py
tests/models/encoder_decoder/language/test_bart.py
+84
-31
tests/utils.py
tests/utils.py
+19
-1
No files found.
tests/models/test_llava_next.py
→
tests/models/
decoder_only/vision_language/
test_llava_next.py
View file @
a84e598e
...
@@ -6,11 +6,9 @@ from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer
...
@@ -6,11 +6,9 @@ from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer
from
vllm.multimodal.utils
import
rescale_image_size
from
vllm.multimodal.utils
import
rescale_image_size
from
vllm.sequence
import
SampleLogprobs
from
vllm.sequence
import
SampleLogprobs
from
..conftest
import
(
IMAGE_ASSETS
,
HfRunner
,
PromptImageInput
,
VllmRunner
,
from
..
..conftest
import
(
IMAGE_ASSETS
,
HfRunner
,
PromptImageInput
,
VllmRunner
,
_ImageAssets
)
_ImageAssets
)
from
.utils
import
check_logprobs_close
from
...utils
import
check_logprobs_close
pytestmark
=
pytest
.
mark
.
vlm
_LIMIT_IMAGE_PER_PROMPT
=
4
_LIMIT_IMAGE_PER_PROMPT
=
4
...
@@ -197,7 +195,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
...
@@ -197,7 +195,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
dtype
,
max_tokens
,
num_logprobs
)
->
None
:
dtype
,
max_tokens
,
num_logprobs
)
->
None
:
"""Inference result should be the same between hf and vllm.
"""Inference result should be the same between hf and vllm.
All the image fixtures for the test
is under tests/images
.
All the image fixtures for the test
are from IMAGE_ASSETS
.
For huggingface runner, we provide the PIL images as input.
For huggingface runner, we provide the PIL images as input.
For vllm runner, we provide MultiModalDataDict objects
For vllm runner, we provide MultiModalDataDict objects
and corresponding MultiModalConfig as input.
and corresponding MultiModalConfig as input.
...
...
tests/models/test_llava_next_video.py
→
tests/models/
decoder_only/vision_language/
test_llava_next_video.py
View file @
a84e598e
...
@@ -8,10 +8,8 @@ from vllm.multimodal.utils import (rescale_video_size, resize_video,
...
@@ -8,10 +8,8 @@ from vllm.multimodal.utils import (rescale_video_size, resize_video,
sample_frames_from_video
)
sample_frames_from_video
)
from
vllm.sequence
import
SampleLogprobs
from
vllm.sequence
import
SampleLogprobs
from
..conftest
import
VIDEO_ASSETS
,
HfRunner
,
VllmRunner
,
_VideoAssets
from
....conftest
import
VIDEO_ASSETS
,
HfRunner
,
VllmRunner
,
_VideoAssets
from
.utils
import
check_logprobs_close
from
...utils
import
check_logprobs_close
pytestmark
=
pytest
.
mark
.
vlm
_PREFACE
=
(
_PREFACE
=
(
"A chat between a curious human and an artificial intelligence assistant. "
"A chat between a curious human and an artificial intelligence assistant. "
...
...
tests/models/test_minicpmv.py
→
tests/models/
decoder_only/vision_language/
test_minicpmv.py
View file @
a84e598e
...
@@ -9,10 +9,8 @@ from transformers import BatchEncoding
...
@@ -9,10 +9,8 @@ from transformers import BatchEncoding
from
vllm.multimodal.utils
import
rescale_image_size
from
vllm.multimodal.utils
import
rescale_image_size
from
vllm.sequence
import
SampleLogprobs
from
vllm.sequence
import
SampleLogprobs
from
..conftest
import
IMAGE_ASSETS
,
HfRunner
,
VllmRunner
from
....conftest
import
IMAGE_ASSETS
,
HfRunner
,
VllmRunner
from
.utils
import
check_logprobs_close
from
...utils
import
check_logprobs_close
pytestmark
=
pytest
.
mark
.
vlm
# The image token is placed before "user" on purpose so that the test can pass
# The image token is placed before "user" on purpose so that the test can pass
HF_IMAGE_PROMPTS
=
IMAGE_ASSETS
.
prompts
({
HF_IMAGE_PROMPTS
=
IMAGE_ASSETS
.
prompts
({
...
@@ -65,7 +63,7 @@ def run_test(
...
@@ -65,7 +63,7 @@ def run_test(
):
):
"""Inference result should be the same between hf and vllm.
"""Inference result should be the same between hf and vllm.
All the image fixtures for the test
is under tests/images
.
All the image fixtures for the test
are from IMAGE_ASSETS
.
For huggingface runner, we provide the PIL images as input.
For huggingface runner, we provide the PIL images as input.
For vllm runner, we provide MultiModalDataDict objects
For vllm runner, we provide MultiModalDataDict objects
and corresponding MultiModalConfig as input.
and corresponding MultiModalConfig as input.
...
...
tests/models/test_paligemma.py
→
tests/models/
decoder_only/vision_language/
test_paligemma.py
View file @
a84e598e
...
@@ -8,10 +8,8 @@ from vllm.multimodal.utils import rescale_image_size
...
@@ -8,10 +8,8 @@ from vllm.multimodal.utils import rescale_image_size
from
vllm.sequence
import
SampleLogprobs
from
vllm.sequence
import
SampleLogprobs
from
vllm.utils
import
is_hip
from
vllm.utils
import
is_hip
from
..conftest
import
IMAGE_ASSETS
,
HfRunner
,
VllmRunner
,
_ImageAssets
from
....conftest
import
IMAGE_ASSETS
,
HfRunner
,
VllmRunner
,
_ImageAssets
from
.utils
import
check_logprobs_close
from
...utils
import
check_logprobs_close
pytestmark
=
pytest
.
mark
.
vlm
HF_IMAGE_PROMPTS
=
IMAGE_ASSETS
.
prompts
({
HF_IMAGE_PROMPTS
=
IMAGE_ASSETS
.
prompts
({
"stop_sign"
:
"stop_sign"
:
...
@@ -69,7 +67,7 @@ def run_test(
...
@@ -69,7 +67,7 @@ def run_test(
):
):
"""Inference result should be the same between hf and vllm.
"""Inference result should be the same between hf and vllm.
All the image fixtures for the test
is under tests/images
.
All the image fixtures for the test
are from IMAGE_ASSETS
.
For huggingface runner, we provide the PIL images as input.
For huggingface runner, we provide the PIL images as input.
For vllm runner, we provide MultiModalDataDict objects
For vllm runner, we provide MultiModalDataDict objects
and corresponding MultiModalConfig as input.
and corresponding MultiModalConfig as input.
...
...
tests/models/test_phi3v.py
→
tests/models/
decoder_only/vision_language/
test_phi3v.py
View file @
a84e598e
...
@@ -9,10 +9,8 @@ from vllm.multimodal.utils import rescale_image_size
...
@@ -9,10 +9,8 @@ from vllm.multimodal.utils import rescale_image_size
from
vllm.sequence
import
SampleLogprobs
from
vllm.sequence
import
SampleLogprobs
from
vllm.utils
import
is_cpu
,
is_hip
from
vllm.utils
import
is_cpu
,
is_hip
from
..conftest
import
IMAGE_ASSETS
,
HfRunner
,
PromptImageInput
,
VllmRunner
from
....conftest
import
IMAGE_ASSETS
,
HfRunner
,
PromptImageInput
,
VllmRunner
from
.utils
import
check_logprobs_close
from
...utils
import
check_logprobs_close
pytestmark
=
pytest
.
mark
.
vlm
HF_IMAGE_PROMPTS
=
IMAGE_ASSETS
.
prompts
({
HF_IMAGE_PROMPTS
=
IMAGE_ASSETS
.
prompts
({
"stop_sign"
:
"stop_sign"
:
...
@@ -71,7 +69,7 @@ def run_test(
...
@@ -71,7 +69,7 @@ def run_test(
):
):
"""Inference result should be the same between hf and vllm.
"""Inference result should be the same between hf and vllm.
All the image fixtures for the test
is under tests/images
.
All the image fixtures for the test
are from IMAGE_ASSETS
.
For huggingface runner, we provide the PIL images as input.
For huggingface runner, we provide the PIL images as input.
For vllm runner, we provide MultiModalDataDict objects
For vllm runner, we provide MultiModalDataDict objects
and corresponding MultiModalConfig as input.
and corresponding MultiModalConfig as input.
...
...
tests/models/test_pixtral.py
→
tests/models/
decoder_only/vision_language/
test_pixtral.py
View file @
a84e598e
...
@@ -5,7 +5,7 @@ Run `pytest tests/models/test_mistral.py`.
...
@@ -5,7 +5,7 @@ Run `pytest tests/models/test_mistral.py`.
import
json
import
json
import
uuid
import
uuid
from
dataclasses
import
asdict
from
dataclasses
import
asdict
from
typing
import
Any
,
Dict
,
List
,
Optional
,
Tuple
from
typing
import
TYPE_CHECKING
,
Any
,
Dict
,
List
,
Optional
,
Tuple
import
pytest
import
pytest
from
mistral_common.protocol.instruct.messages
import
ImageURLChunk
from
mistral_common.protocol.instruct.messages
import
ImageURLChunk
...
@@ -17,9 +17,11 @@ from vllm import EngineArgs, LLMEngine, SamplingParams, TokensPrompt
...
@@ -17,9 +17,11 @@ from vllm import EngineArgs, LLMEngine, SamplingParams, TokensPrompt
from
vllm.multimodal
import
MultiModalDataBuiltins
from
vllm.multimodal
import
MultiModalDataBuiltins
from
vllm.sequence
import
Logprob
,
SampleLogprobs
from
vllm.sequence
import
Logprob
,
SampleLogprobs
from
.utils
import
check_logprobs_close
from
....utils
import
VLLM_PATH
from
...utils
import
check_logprobs_close
pytestmark
=
pytest
.
mark
.
vlm
if
TYPE_CHECKING
:
from
_typeshed
import
StrPath
MODELS
=
[
"mistralai/Pixtral-12B-2409"
]
MODELS
=
[
"mistralai/Pixtral-12B-2409"
]
IMG_URLS
=
[
IMG_URLS
=
[
...
@@ -83,14 +85,21 @@ SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5)
...
@@ -83,14 +85,21 @@ SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5)
LIMIT_MM_PER_PROMPT
=
dict
(
image
=
4
)
LIMIT_MM_PER_PROMPT
=
dict
(
image
=
4
)
MAX_MODEL_LEN
=
[
8192
,
65536
]
MAX_MODEL_LEN
=
[
8192
,
65536
]
FIXTURE_LOGPROBS_CHAT
=
"tests/models/fixtures/pixtral_chat.json"
FIXTURE_LOGPROBS_ENGINE
=
"tests/models/fixtures/pixtral_chat_engine.json"
FIXTURES_PATH
=
VLLM_PATH
/
"tests/models/fixtures"
assert
FIXTURES_PATH
.
exists
()
FIXTURE_LOGPROBS_CHAT
=
FIXTURES_PATH
/
"pixtral_chat.json"
FIXTURE_LOGPROBS_ENGINE
=
FIXTURES_PATH
/
"pixtral_chat_engine.json"
OutputsLogprobs
=
List
[
Tuple
[
List
[
int
],
str
,
Optional
[
SampleLogprobs
]]]
OutputsLogprobs
=
List
[
Tuple
[
List
[
int
],
str
,
Optional
[
SampleLogprobs
]]]
# For the test author to store golden output in JSON
# For the test author to store golden output in JSON
def
_dump_outputs_w_logprobs
(
outputs
:
OutputsLogprobs
,
filename
:
str
)
->
None
:
def
_dump_outputs_w_logprobs
(
outputs
:
OutputsLogprobs
,
filename
:
"StrPath"
,
)
->
None
:
json_data
=
[(
tokens
,
text
,
json_data
=
[(
tokens
,
text
,
[{
k
:
asdict
(
v
)
[{
k
:
asdict
(
v
)
for
k
,
v
in
token_logprobs
.
items
()}
for
k
,
v
in
token_logprobs
.
items
()}
...
@@ -101,7 +110,7 @@ def _dump_outputs_w_logprobs(outputs: OutputsLogprobs, filename: str) -> None:
...
@@ -101,7 +110,7 @@ def _dump_outputs_w_logprobs(outputs: OutputsLogprobs, filename: str) -> None:
json
.
dump
(
json_data
,
f
)
json
.
dump
(
json_data
,
f
)
def
load_outputs_w_logprobs
(
filename
:
str
)
->
OutputsLogprobs
:
def
load_outputs_w_logprobs
(
filename
:
"StrPath"
)
->
OutputsLogprobs
:
with
open
(
filename
,
"rb"
)
as
f
:
with
open
(
filename
,
"rb"
)
as
f
:
json_data
=
json
.
load
(
f
)
json_data
=
json
.
load
(
f
)
...
...
tests/models/test_qwen.py
→
tests/models/
decoder_only/vision_language/
test_qwen.py
View file @
a84e598e
...
@@ -10,11 +10,9 @@ from vllm.inputs import InputContext, LLMInputs
...
@@ -10,11 +10,9 @@ from vllm.inputs import InputContext, LLMInputs
from
vllm.multimodal.base
import
MultiModalInputs
from
vllm.multimodal.base
import
MultiModalInputs
from
vllm.multimodal.utils
import
cached_get_tokenizer
,
rescale_image_size
from
vllm.multimodal.utils
import
cached_get_tokenizer
,
rescale_image_size
from
..conftest
import
(
IMAGE_ASSETS
,
HfRunner
,
ImageAsset
,
PromptImageInput
,
from
..
..conftest
import
(
IMAGE_ASSETS
,
HfRunner
,
ImageAsset
,
PromptImageInput
,
VllmRunner
,
_ImageAssets
)
VllmRunner
,
_ImageAssets
)
from
.utils
import
check_logprobs_close
from
...utils
import
check_logprobs_close
pytestmark
=
pytest
.
mark
.
vlm
text_only_models
=
[
text_only_models
=
[
"Qwen/Qwen-7B-Chat"
# Has no visual component
"Qwen/Qwen-7B-Chat"
# Has no visual component
...
...
tests/models/embedding/__init__.py
0 → 100644
View file @
a84e598e
tests/models/embedding/language/__init__.py
0 → 100644
View file @
a84e598e
tests/models/test_embedding.py
→
tests/models/
embedding/language/
test_embedding.py
View file @
a84e598e
File moved
tests/models/encoder_decoder/__init__.py
0 → 100644
View file @
a84e598e
tests/models/encoder_decoder/language/__init__.py
0 → 100644
View file @
a84e598e
tests/models/test_bart.py
→
tests/models/
encoder_decoder/language/
test_bart.py
View file @
a84e598e
"""Compare the outputs of HF and vLLM for BART models using greedy sampling.
"""Compare the outputs of HF and vLLM for BART models using greedy sampling.
Run `pytest tests/models/test_bart.py`.
Run `pytest tests/models/
encoder_decoder/language/
test_bart.py`.
"""
"""
from
typing
import
List
,
Optional
,
Tuple
from
typing
import
List
,
Optional
,
Tuple
,
Type
from
vllm.utils
import
is_cpu
from
vllm.utils
import
is_cpu
...
@@ -16,8 +16,10 @@ if not is_cpu():
...
@@ -16,8 +16,10 @@ if not is_cpu():
from
vllm.sequence
import
SampleLogprobs
from
vllm.sequence
import
SampleLogprobs
from
..conftest
import
DecoderPromptType
from
....conftest
import
(
DecoderPromptType
,
ExplicitEncoderDecoderPrompt
,
from
.utils
import
check_logprobs_close
HfRunner
,
VllmRunner
)
from
....utils
import
multi_gpu_test
from
...utils
import
check_logprobs_close
MODELS
=
[
"facebook/bart-base"
,
"facebook/bart-large-cnn"
]
MODELS
=
[
"facebook/bart-base"
,
"facebook/bart-large-cnn"
]
...
@@ -34,20 +36,18 @@ if not is_cpu():
...
@@ -34,20 +36,18 @@ if not is_cpu():
return
output_ids
,
hf_output_str
,
out_logprobs
return
output_ids
,
hf_output_str
,
out_logprobs
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
def
run_test
(
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"float"
,
"bfloat16"
])
hf_runner
:
Type
[
HfRunner
],
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
64
])
vllm_runner
:
Type
[
VllmRunner
],
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
5
])
prompts
:
List
[
ExplicitEncoderDecoderPrompt
[
str
,
str
]],
@
pytest
.
mark
.
parametrize
(
"decoder_prompt_type"
,
list
(
DecoderPromptType
))
decoder_prompt_type
:
DecoderPromptType
,
def
test_models
(
hf_runner
,
vllm_runner
,
example_encoder_decoder_prompts
,
model
:
str
,
model
:
str
,
*
,
dtype
:
str
,
dtype
:
str
,
max_tokens
:
int
,
max_tokens
:
int
,
num_logprobs
:
int
,
num_logprobs
:
int
,
decoder_prompt_type
:
DecoderPromptType
,
tensor_parallel_size
:
int
,
distributed_executor_backend
:
Optional
[
str
]
=
None
,
)
->
None
:
)
->
None
:
'''
'''
Test the vLLM BART model for a variety of encoder/decoder input prompts,
Test the vLLM BART model for a variety of encoder/decoder input prompts,
...
@@ -116,8 +116,29 @@ if not is_cpu():
...
@@ -116,8 +116,29 @@ if not is_cpu():
token during the process of validating the vLLM decoded output.
token during the process of validating the vLLM decoded output.
'''
'''
test_case_prompts
=
example_encoder_decoder_prompts
[
# NOTE: take care of the order. run vLLM first, and then run HF.
decoder_prompt_type
]
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default).
# Note: currently encoder/decoder models are only compatible with
# enforce_eager=True. Normally this is not a problem because
# for encoder/decoder models vLLM will
# default to enforce_eager=True if enforce_eager
# is left unspecified. However, the
# VllmRunner test fixture (which wraps around the LLM class) defaults to
# enforce_eager=False (a behavior which a number of already-exisitng
# decoder-only unit tests expect), so when testing an encoder/decoder
# model we must explicitly specify enforce_eager=True in the VllmRunner
# constructor.
with
vllm_runner
(
model
,
dtype
=
dtype
,
tensor_parallel_size
=
tensor_parallel_size
,
distributed_executor_backend
=
distributed_executor_backend
,
enforce_eager
=
True
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_encoder_decoder_greedy_logprobs
(
prompts
,
max_tokens
,
num_logprobs
)
# Configuration settings for HF baseline
# Configuration settings for HF baseline
hf_kwargs
=
{
hf_kwargs
=
{
...
@@ -135,26 +156,12 @@ if not is_cpu():
...
@@ -135,26 +156,12 @@ if not is_cpu():
auto_cls
=
AutoModelForSeq2SeqLM
)
as
hf_model
:
auto_cls
=
AutoModelForSeq2SeqLM
)
as
hf_model
:
hf_outputs
=
(
hf_outputs
=
(
hf_model
.
generate_encoder_decoder_greedy_logprobs_limit
(
hf_model
.
generate_encoder_decoder_greedy_logprobs_limit
(
test_case_
prompts
,
prompts
,
max_tokens
,
max_tokens
,
num_logprobs
,
num_logprobs
,
**
hf_kwargs
,
**
hf_kwargs
,
))
))
# Note: currently encoder/decoder models are only compatible with
# enforce_eager=True. Normally this is not a problem because
# for encoder/decoder models vLLM will
# default to enforce_eager=True if enforce_eager
# is left unspecified. However, the
# VllmRunner test fixture (which wraps around the LLM class) defaults to
# enforce_eager=False (a behavior which a number of already-exisitng
# decoder-only unit tests expect), so when testing an encoder/decoder
# model we must explicitly specify enforce_eager=True in the VllmRunner
# constructor.
with
vllm_runner
(
model
,
dtype
=
dtype
,
enforce_eager
=
True
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_encoder_decoder_greedy_logprobs
(
test_case_prompts
,
max_tokens
,
num_logprobs
)
hf_skip_tokens
=
(
1
if
decoder_prompt_type
==
DecoderPromptType
.
NONE
hf_skip_tokens
=
(
1
if
decoder_prompt_type
==
DecoderPromptType
.
NONE
else
0
)
else
0
)
...
@@ -168,3 +175,49 @@ if not is_cpu():
...
@@ -168,3 +175,49 @@ if not is_cpu():
name_1
=
"vllm"
,
name_1
=
"vllm"
,
num_outputs_0_skip_tokens
=
hf_skip_tokens
,
num_outputs_0_skip_tokens
=
hf_skip_tokens
,
)
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"float"
,
"bfloat16"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
64
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
5
])
@
pytest
.
mark
.
parametrize
(
"decoder_prompt_type"
,
list
(
DecoderPromptType
))
def
test_models
(
hf_runner
,
vllm_runner
,
example_encoder_decoder_prompts
,
model
,
dtype
,
max_tokens
,
num_logprobs
,
decoder_prompt_type
)
->
None
:
run_test
(
hf_runner
,
vllm_runner
,
example_encoder_decoder_prompts
[
decoder_prompt_type
],
decoder_prompt_type
,
model
,
dtype
=
dtype
,
max_tokens
=
max_tokens
,
num_logprobs
=
num_logprobs
,
tensor_parallel_size
=
1
,
)
@
multi_gpu_test
(
num_gpus
=
2
)
@
pytest
.
mark
.
parametrize
(
"distributed_executor_backend"
,
[
"ray"
,
"mp"
])
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"facebook/bart-large-cnn"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"float"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
64
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
5
])
@
pytest
.
mark
.
parametrize
(
"decoder_prompt_type"
,
[
DecoderPromptType
.
CUSTOM
])
def
test_models_distributed
(
hf_runner
,
vllm_runner
,
example_encoder_decoder_prompts
,
distributed_executor_backend
,
model
,
dtype
,
max_tokens
,
num_logprobs
,
decoder_prompt_type
)
->
None
:
run_test
(
hf_runner
,
vllm_runner
,
example_encoder_decoder_prompts
[
decoder_prompt_type
],
decoder_prompt_type
,
model
,
dtype
=
dtype
,
max_tokens
=
max_tokens
,
num_logprobs
=
num_logprobs
,
tensor_parallel_size
=
2
,
distributed_executor_backend
=
distributed_executor_backend
,
)
tests/utils.py
View file @
a84e598e
...
@@ -10,6 +10,7 @@ from pathlib import Path
...
@@ -10,6 +10,7 @@ from pathlib import Path
from
typing
import
Any
,
Callable
,
Dict
,
List
,
Optional
from
typing
import
Any
,
Callable
,
Dict
,
List
,
Optional
import
openai
import
openai
import
pytest
import
requests
import
requests
from
openai.types.completion
import
Completion
from
openai.types.completion
import
Completion
from
transformers
import
AutoTokenizer
from
transformers
import
AutoTokenizer
...
@@ -22,7 +23,8 @@ from vllm.engine.arg_utils import AsyncEngineArgs
...
@@ -22,7 +23,8 @@ from vllm.engine.arg_utils import AsyncEngineArgs
from
vllm.entrypoints.openai.cli_args
import
make_arg_parser
from
vllm.entrypoints.openai.cli_args
import
make_arg_parser
from
vllm.model_executor.model_loader.loader
import
get_model_loader
from
vllm.model_executor.model_loader.loader
import
get_model_loader
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.utils
import
FlexibleArgumentParser
,
get_open_port
,
is_hip
from
vllm.utils
import
(
FlexibleArgumentParser
,
cuda_device_count_stateless
,
get_open_port
,
is_hip
)
if
current_platform
.
is_rocm
():
if
current_platform
.
is_rocm
():
from
amdsmi
import
(
amdsmi_get_gpu_vram_usage
,
from
amdsmi
import
(
amdsmi_get_gpu_vram_usage
,
...
@@ -452,6 +454,22 @@ def fork_new_process_for_each_test(
...
@@ -452,6 +454,22 @@ def fork_new_process_for_each_test(
return
wrapper
return
wrapper
def
multi_gpu_test
(
*
,
num_gpus
:
int
):
"""
Decorate a test to be run only when multiple GPUs are available.
"""
test_selector
=
getattr
(
pytest
.
mark
,
f
"distributed_
{
num_gpus
}
_gpus"
)
test_skipif
=
pytest
.
mark
.
skipif
(
cuda_device_count_stateless
()
<
num_gpus
,
reason
=
f
"Need at least
{
num_gpus
}
GPUs to run the test."
,
)
def
wrapper
(
f
:
Callable
[
_P
,
None
])
->
Callable
[
_P
,
None
]:
return
test_selector
(
test_skipif
(
fork_new_process_for_each_test
(
f
)))
return
wrapper
async
def
completions_with_server_args
(
async
def
completions_with_server_args
(
prompts
:
List
[
str
],
prompts
:
List
[
str
],
model_name
:
str
,
model_name
:
str
,
...
...
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment