Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
b489fc3c
Unverified
Commit
b489fc3c
authored
Nov 08, 2024
by
Cyrus Leung
Committed by
GitHub
Nov 08, 2024
Browse files
[CI/Build] Update CPU tests to include all "standard" tests (#5481)
Signed-off-by:
DarkLight1337
<
tlleungac@connect.ust.hk
>
parent
208ce622
Changes
14
Show whitespace changes
Inline
Side-by-side
Showing
14 changed files
with
63 additions
and
48 deletions
+63
-48
.buildkite/run-cpu-test-ppc64le.sh
.buildkite/run-cpu-test-ppc64le.sh
+13
-8
.buildkite/run-cpu-test.sh
.buildkite/run-cpu-test.sh
+17
-8
.buildkite/test-pipeline.yaml
.buildkite/test-pipeline.yaml
+1
-2
pyproject.toml
pyproject.toml
+2
-1
requirements-test.in
requirements-test.in
+0
-5
tests/models/decoder_only/audio_language/test_ultravox.py
tests/models/decoder_only/audio_language/test_ultravox.py
+13
-4
tests/models/decoder_only/vision_language/test_h2ovl.py
tests/models/decoder_only/vision_language/test_h2ovl.py
+0
-1
tests/models/decoder_only/vision_language/test_models.py
tests/models/decoder_only/vision_language/test_models.py
+4
-7
tests/models/decoder_only/vision_language/test_phi3v.py
tests/models/decoder_only/vision_language/test_phi3v.py
+0
-2
tests/models/utils.py
tests/models/utils.py
+1
-2
vllm/assets/image.py
vllm/assets/image.py
+1
-1
vllm/model_executor/models/ultravox.py
vllm/model_executor/models/ultravox.py
+2
-2
vllm/multimodal/utils.py
vllm/multimodal/utils.py
+4
-4
vllm/worker/cpu_worker.py
vllm/worker/cpu_worker.py
+5
-1
No files found.
.buildkite/run-cpu-test-ppc64le.sh
View file @
b489fc3c
...
...
@@ -19,17 +19,22 @@ docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/hugg
# Run basic model test
docker
exec
cpu-test bash
-c
"
pip install pytest matplotlib einops transformers_stream_generator
pytest -v -s tests/models -m
\"
not vlm
\"
\
--ignore=tests/models/test_embedding.py
\
--ignore=tests/models/test_oot_registration.py
\
--ignore=tests/models/test_registry.py
\
--ignore=tests/models/test_jamba.py
\
--ignore=tests/models/test_mamba.py
\
--ignore=tests/models/test_danube3_4b.py"
# Mamba kernels and Danube3-4B on CPU is not supported
set -e
pip install pytest pytest-asyncio
\
decord einops librosa peft Pillow sentence-transformers soundfile
\
transformers_stream_generator matplotlib datamodel_code_generator
pip install torchvision --index-url https://download.pytorch.org/whl/cpu
# Embedding models are not supported for CPU yet
# pytest -v -s tests/models/embedding/language
pytest -v -s tests/models/encoder_decoder/language
pytest -v -s tests/models/decoder_only/language/test_models.py
# Chunked prefill not supported for CPU yet
# pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
# online inference
docker
exec
cpu-test bash
-c
"
set -e
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m &
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
python3 benchmarks/benchmark_serving.py
\
...
...
.buildkite/run-cpu-test.sh
View file @
b489fc3c
...
...
@@ -20,32 +20,41 @@ docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/hugg
--cpuset-mems
=
1
--privileged
=
true
--network
host
-e
HF_TOKEN
--env
VLLM_CPU_KVCACHE_SPACE
=
4
--shm-size
=
4g
--name
cpu-test-avx2 cpu-test-avx2
# offline inference
docker
exec
cpu-test-avx2 bash
-c
"python3 examples/offline_inference.py"
docker
exec
cpu-test-avx2 bash
-c
"
set -e
python3 examples/offline_inference.py"
# Run basic model test
docker
exec
cpu-test bash
-c
"
pip install pytest matplotlib einops transformers_stream_generator datamodel_code_generator
set -e
pip install pytest pytest-asyncio
\
decord einops librosa peft Pillow sentence-transformers soundfile
\
transformers_stream_generator matplotlib datamodel_code_generator
pip install torchvision --index-url https://download.pytorch.org/whl/cpu
# Embedding models are not supported for CPU yet
# pytest -v -s tests/models/embedding/language
pytest -v -s tests/models/encoder_decoder/language
pytest -v -s tests/models/decoder_only/language
\
--ignore=tests/models/test_fp8.py
\
--ignore=tests/models/decoder_only/language/test_jamba.py
\
--ignore=tests/models/decoder_only/language/test_mamba.py
\
--ignore=tests/models/decoder_only/language/test_granitemoe.py
\
--ignore=tests/models/decoder_only/language/test_danube3_4b.py"
# Mamba and Danube3-4B on CPU is not supported
pytest -v -s tests/models/decoder_only/language/test_models.py
# Chunked prefill not supported for CPU yet
# pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
# Run compressed-tensor test
docker
exec
cpu-test bash
-c
"
set -e
pytest -s -v
\
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup
\
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
# Run AWQ test
docker
exec
cpu-test bash
-c
"
set -e
pytest -s -v
\
tests/quantization/test_ipex_quant.py"
# online inference
docker
exec
cpu-test bash
-c
"
set -e
export VLLM_CPU_KVCACHE_SPACE=10
export VLLM_CPU_OMP_THREADS_BIND=48-92
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
...
...
.buildkite/test-pipeline.yaml
View file @
b489fc3c
...
...
@@ -269,7 +269,6 @@ steps:
source_file_dependencies
:
-
benchmarks/
commands
:
-
pip install aiohttp
-
bash run-benchmarks.sh
-
label
:
Quantization Test
# 33min
...
...
@@ -331,7 +330,7 @@ steps:
commands
:
-
pytest -v -s models/decoder_only/language --ignore=models/decoder_only/language/test_models.py
-
label
:
Decoder-only Multi-Modal Models Test (Standard)
-
label
:
Decoder-only Multi-Modal Models Test (Standard)
# 26min
#mirror_hardwares: [amd]
source_file_dependencies
:
-
vllm/
...
...
pyproject.toml
View file @
b489fc3c
...
...
@@ -93,7 +93,8 @@ skip_gitignore = true
[tool.pytest.ini_options]
markers
=
[
"skip_global_cleanup"
,
"core_model: run this model test in each PR instead of just daily"
,
"core_model: enable this model test in each PR instead of only nightly"
,
"cpu_model: enable this model test in CPU tests"
,
"distributed_2_gpus: run this test only in distributed tests for 2 GPUs"
,
"skip_v1: do not run this test with v1"
,
]
requirements-test.in
View file @
b489fc3c
...
...
@@ -12,9 +12,7 @@ decord # required for video tests
einops # required for MPT, qwen-vl and Mamba
httpx
librosa # required for audio tests
opencv-python # required for video tests
peft
requests
ray[adag]==2.35
sentence-transformers # required for embedding tests
soundfile # required for audio tests
...
...
@@ -29,9 +27,6 @@ lm-eval[api]==0.4.4 # required for model evaluation test
# TODO: Add this after fully implementing llava(mantis)
# git+https://github.com/TIGER-AI-Lab/Mantis.git # required for llava(mantis) test
# Benchmarking
aiohttp
# quantization
bitsandbytes>=0.44.0
buildkite-test-collector==0.1.9
...
...
tests/models/decoder_only/audio_language/test_ultravox.py
View file @
b489fc3c
...
...
@@ -5,11 +5,11 @@ import pytest
import
pytest_asyncio
from
transformers
import
AutoModel
,
AutoTokenizer
,
BatchEncoding
from
tests.utils
import
RemoteOpenAIServer
from
vllm.sequence
import
SampleLogprobs
from
vllm.utils
import
STR_DTYPE_TO_TORCH_DTYPE
from
....conftest
import
HfRunner
,
VllmRunner
from
....utils
import
RemoteOpenAIServer
from
...utils
import
check_logprobs_close
MODEL_NAME
=
"fixie-ai/ultravox-v0_3"
...
...
@@ -39,7 +39,10 @@ def audio(request):
return
AudioAsset
(
request
.
param
)
@
pytest
.
fixture
(
params
=
({},
CHUNKED_PREFILL_KWARGS
))
@
pytest
.
fixture
(
params
=
[
pytest
.
param
({},
marks
=
pytest
.
mark
.
cpu_model
),
pytest
.
param
(
CHUNKED_PREFILL_KWARGS
),
])
def
server
(
request
,
audio_assets
):
args
=
[
"--dtype=bfloat16"
,
"--max-model-len=4096"
,
"--enforce-eager"
,
...
...
@@ -185,7 +188,10 @@ def run_multi_audio_test(
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
128
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
5
])
@
pytest
.
mark
.
parametrize
(
"vllm_kwargs"
,
[{},
CHUNKED_PREFILL_KWARGS
])
@
pytest
.
mark
.
parametrize
(
"vllm_kwargs"
,
[
pytest
.
param
({},
marks
=
pytest
.
mark
.
cpu_model
),
pytest
.
param
(
CHUNKED_PREFILL_KWARGS
),
])
def
test_models
(
hf_runner
,
vllm_runner
,
audio
,
dtype
:
str
,
max_tokens
:
int
,
num_logprobs
:
int
,
vllm_kwargs
:
dict
)
->
None
:
...
...
@@ -207,7 +213,10 @@ def test_models(hf_runner, vllm_runner, audio, dtype: str, max_tokens: int,
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
128
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
5
])
@
pytest
.
mark
.
parametrize
(
"vllm_kwargs"
,
[{},
CHUNKED_PREFILL_KWARGS
])
@
pytest
.
mark
.
parametrize
(
"vllm_kwargs"
,
[
pytest
.
param
({},
marks
=
pytest
.
mark
.
cpu_model
),
pytest
.
param
(
CHUNKED_PREFILL_KWARGS
),
])
def
test_models_with_multiple_audios
(
vllm_runner
,
audio_assets
,
dtype
:
str
,
max_tokens
:
int
,
num_logprobs
:
int
,
vllm_kwargs
:
dict
)
->
None
:
...
...
tests/models/decoder_only/vision_language/test_h2ovl.py
View file @
b489fc3c
...
...
@@ -14,7 +14,6 @@ models = [
"h2oai/h2ovl-mississippi-800m"
,
# Replace with your actual model names
"h2oai/h2ovl-mississippi-2b"
,
]
target_dtype
=
"bfloat16"
def
run_preprocessing_test
(
...
...
tests/models/decoder_only/vision_language/test_models.py
View file @
b489fc3c
...
...
@@ -94,7 +94,7 @@ VLM_TEST_SETTINGS = {
),
limit_mm_per_prompt
=
{
"image"
:
4
},
)],
marks
=
[
pytest
.
mark
.
core_model
],
marks
=
[
pytest
.
mark
.
core_model
,
pytest
.
mark
.
cpu_model
],
),
"paligemma"
:
VLMTestInfo
(
models
=
[
"google/paligemma-3b-mix-224"
],
...
...
@@ -111,7 +111,8 @@ VLM_TEST_SETTINGS = {
"pixel_values"
),
vllm_output_post_proc
=
model_utils
.
paligemma_vllm_to_hf_output
,
dtype
=
"half"
if
current_platform
.
is_rocm
()
else
(
"half"
,
"float"
),
dtype
=
(
"half"
if
current_platform
.
is_cpu
()
or
current_platform
.
is_rocm
()
else
(
"half"
,
"float"
)),
marks
=
[
pytest
.
mark
.
core_model
],
),
"qwen2_vl"
:
VLMTestInfo
(
...
...
@@ -128,7 +129,7 @@ VLM_TEST_SETTINGS = {
max_num_seqs
=
2
,
auto_cls
=
AutoModelForVision2Seq
,
vllm_output_post_proc
=
model_utils
.
qwen2_vllm_to_hf_output
,
marks
=
[
pytest
.
mark
.
core_model
],
marks
=
[
pytest
.
mark
.
core_model
,
pytest
.
mark
.
cpu_model
],
image_size_factors
=
[(),
(
0.25
,),
(
0.25
,
0.25
,
0.25
),
(
0.25
,
0.2
,
0.15
)],
),
#### Extended model tests
...
...
@@ -172,7 +173,6 @@ VLM_TEST_SETTINGS = {
use_tokenizer_eos
=
True
,
vllm_output_post_proc
=
model_utils
.
fuyu_vllm_to_hf_output
,
num_logprobs
=
10
,
dtype
=
"bfloat16"
if
current_platform
.
is_cpu
()
else
"half"
,
image_size_factors
=
[(),
(
0.25
,),
(
0.25
,
0.25
,
0.25
),
(
0.25
,
0.2
,
0.15
)],
),
"glm4"
:
VLMTestInfo
(
...
...
@@ -245,7 +245,6 @@ VLM_TEST_SETTINGS = {
models
=
[
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
],
test_type
=
VLMTestType
.
CUSTOM_INPUTS
,
prompt_formatter
=
lambda
vid_prompt
:
f
"<|im_start|>user
\n
{
vid_prompt
}
<|im_end|>
\n
<|im_start|>assistant
\n
"
,
# noqa: E501
dtype
=
"half"
,
num_video_frames
=
16
,
max_model_len
=
16384
,
postprocess_inputs
=
model_utils
.
get_key_type_post_processor
(
...
...
@@ -404,7 +403,6 @@ VLM_TEST_SETTINGS = {
prompt_formatter
=
lambda
img_prompt
:
f
"<|im_start|>User
\n
{
img_prompt
}
<|im_end|>
\n
<|im_start|>Assistant
\n
"
,
# noqa: E501
test_type
=
VLMTestType
.
CUSTOM_INPUTS
,
max_model_len
=
4096
,
dtype
=
"bfloat16"
if
current_platform
.
is_cpu
()
else
"half"
,
use_tokenizer_eos
=
True
,
patch_hf_runner
=
model_utils
.
internvl_patch_hf_runner
,
custom_test_opts
=
[
...
...
@@ -419,7 +417,6 @@ VLM_TEST_SETTINGS = {
test_type
=
VLMTestType
.
CUSTOM_INPUTS
,
max_model_len
=
16384
,
max_num_seqs
=
2
,
dtype
=
"half"
,
postprocess_inputs
=
model_utils
.
get_key_type_post_processor
(
"pixel_values"
),
...
...
tests/models/decoder_only/vision_language/test_phi3v.py
View file @
b489fc3c
...
...
@@ -44,8 +44,6 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
target_dtype
=
"half"
if
current_platform
.
is_cpu
():
target_dtype
=
"bfloat16"
# ROCm Triton FA can run into shared memory issues with these models,
# use other backends in the meantime
...
...
tests/models/utils.py
View file @
b489fc3c
...
...
@@ -5,7 +5,6 @@ import torch
from
vllm.config
import
ModelConfig
,
TaskOption
from
vllm.inputs
import
InputContext
from
vllm.platforms
import
current_platform
from
vllm.sequence
import
Logprob
,
PromptLogprobs
,
SampleLogprobs
TokensText
=
Tuple
[
List
[
int
],
str
]
...
...
@@ -270,7 +269,7 @@ def build_model_context(model_name: str,
if
tokenizer_name
is
None
:
tokenizer_name
=
model_name
if
dtype
is
None
:
dtype
=
"bfloat16"
if
current_platform
.
is_cpu
()
else
"half"
dtype
=
"half"
model_config
=
ModelConfig
(
model_name
,
...
...
vllm/assets/image.py
View file @
b489fc3c
...
...
@@ -27,4 +27,4 @@ class ImageAsset:
"""
image_path
=
get_vllm_public_assets
(
filename
=
f
"
{
self
.
name
}
.pt"
,
s3_prefix
=
VLM_IMAGES_DIR
)
return
torch
.
load
(
image_path
)
return
torch
.
load
(
image_path
,
map_location
=
"cpu"
)
vllm/model_executor/models/ultravox.py
View file @
b489fc3c
...
...
@@ -134,9 +134,9 @@ def input_mapper_for_ultravox(ctx: InputContext, data: object):
if
sr
!=
feature_extractor
.
sampling_rate
:
try
:
import
librosa
except
ImportError
:
except
ImportError
as
exc
:
raise
ImportError
(
"Please install vllm[audio] for audio support."
)
from
None
"Please install vllm[audio] for audio support."
)
from
exc
audio
=
librosa
.
resample
(
audio
,
orig_sr
=
sr
,
target_sr
=
feature_extractor
.
sampling_rate
)
...
...
vllm/multimodal/utils.py
View file @
b489fc3c
...
...
@@ -206,9 +206,9 @@ def try_import_audio_packages() -> Tuple[Any, Any]:
try
:
import
librosa
import
soundfile
except
ImportError
:
except
ImportError
as
exc
:
raise
ImportError
(
"Please install vllm[audio] for audio support."
)
from
None
"Please install vllm[audio] for audio support."
)
from
exc
return
librosa
,
soundfile
...
...
@@ -344,9 +344,9 @@ def try_import_video_packages() -> Any:
try
:
import
cv2
import
decord
except
ImportError
:
except
ImportError
as
exc
:
raise
ImportError
(
"Please install vllm[video] for video support."
)
from
None
"Please install vllm[video] for video support."
)
from
exc
return
cv2
,
decord
...
...
vllm/worker/cpu_worker.py
View file @
b489fc3c
...
...
@@ -151,7 +151,11 @@ class CPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
self
.
local_omp_cpuid
=
omp_cpuids
.
split
(
"|"
)[
rank
]
ModelRunnerClass
:
Type
[
CPUModelRunner
]
=
CPUModelRunner
if
self
.
model_config
.
is_encoder_decoder
:
if
self
.
model_config
.
task
==
"embedding"
:
raise
NotImplementedError
(
"Embedding models are not supported for CPU backend"
)
# ModelRunnerClass = CPUEmbeddingModelRunner
elif
self
.
model_config
.
is_encoder_decoder
:
ModelRunnerClass
=
CPUEncoderDecoderModelRunner
self
.
model_runner
:
CPUModelRunner
=
ModelRunnerClass
(
vllm_config
=
vllm_config
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment