Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
051eaf6d
Unverified
Commit
051eaf6d
authored
Oct 19, 2024
by
Cyrus Leung
Committed by
GitHub
Oct 18, 2024
Browse files
[Model] Add user-configurable task for models that support both generation and embedding (#9424)
parent
7dbe738d
Changes
33
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
218 additions
and
130 deletions
+218
-130
docs/source/models/supported_models.rst
docs/source/models/supported_models.rst
+8
-0
docs/source/models/vlm.rst
docs/source/models/vlm.rst
+2
-2
examples/offline_inference_vision_language_embedding.py
examples/offline_inference_vision_language_embedding.py
+1
-0
examples/openai_api_client_for_multimodal.py
examples/openai_api_client_for_multimodal.py
+2
-2
tests/conftest.py
tests/conftest.py
+3
-1
tests/core/test_chunked_prefill_scheduler.py
tests/core/test_chunked_prefill_scheduler.py
+13
-2
tests/core/test_scheduler.py
tests/core/test_scheduler.py
+32
-24
tests/core/test_scheduler_encoder_decoder.py
tests/core/test_scheduler_encoder_decoder.py
+6
-1
tests/distributed/test_pipeline_parallel.py
tests/distributed/test_pipeline_parallel.py
+19
-4
tests/entrypoints/llm/test_chat.py
tests/entrypoints/llm/test_chat.py
+92
-0
tests/entrypoints/llm/test_generate.py
tests/entrypoints/llm/test_generate.py
+0
-88
tests/entrypoints/llm/test_init.py
tests/entrypoints/llm/test_init.py
+22
-0
tests/entrypoints/openai/test_serving_chat.py
tests/entrypoints/openai/test_serving_chat.py
+1
-1
tests/entrypoints/openai/test_vision.py
tests/entrypoints/openai/test_vision.py
+2
-0
tests/entrypoints/test_chat_utils.py
tests/entrypoints/test_chat_utils.py
+2
-1
tests/lora/test_worker.py
tests/lora/test_worker.py
+3
-2
tests/models/decoder_only/vision_language/test_phi3v.py
tests/models/decoder_only/vision_language/test_phi3v.py
+1
-0
tests/models/embedding/vision_language/test_phi3v.py
tests/models/embedding/vision_language/test_phi3v.py
+1
-0
tests/models/utils.py
tests/models/utils.py
+4
-2
tests/multimodal/test_mapper.py
tests/multimodal/test_mapper.py
+4
-0
No files found.
docs/source/models/supported_models.rst
View file @
051eaf6d
...
...
@@ -294,6 +294,10 @@ Text Embedding
-
- ✅︎
.. important::
Some model architectures support both generation and embedding tasks.
In this case, you have to pass :code:`--task embedding` to run the model in embedding mode.
Reward Modeling
---------------
...
...
@@ -482,6 +486,10 @@ Multimodal Embedding
- 🚧
- ✅︎
.. important::
Some model architectures support both generation and embedding tasks.
In this case, you have to pass :code:`--task embedding` to run the model in embedding mode.
----
If your model uses one of the above model architectures, you can seamlessly run your model with vLLM.
...
...
docs/source/models/vlm.rst
View file @
051eaf6d
...
...
@@ -181,8 +181,8 @@ Below is an example on how to launch the same ``microsoft/Phi-3.5-vision-instruc
.. code-block:: bash
vllm serve microsoft/Phi-3.5-vision-instruct --
max-model-len 4096
\
--trust-remote-code --limit-mm-per-prompt image=2
vllm serve microsoft/Phi-3.5-vision-instruct --
task generate
\
--trust-remote-code
--max-model-len 4096
--limit-mm-per-prompt image=2
.. important::
Since OpenAI Vision API is based on `Chat Completions <https://platform.openai.com/docs/api-reference/chat>`_ API,
...
...
examples/offline_inference_vision_language_embedding.py
View file @
051eaf6d
...
...
@@ -7,6 +7,7 @@ prompt = "<|image_1|> Represent the given image with the following question: Wha
# Create an LLM.
llm
=
LLM
(
model
=
"TIGER-Lab/VLM2Vec-Full"
,
task
=
"embedding"
,
trust_remote_code
=
True
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
...
...
examples/openai_api_client_for_multimodal.py
View file @
051eaf6d
...
...
@@ -7,8 +7,8 @@ Launch the vLLM server with the following command:
vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja
(multi-image inference with Phi-3.5-vision-instruct)
vllm serve microsoft/Phi-3.5-vision-instruct --
max-model-len 4096
\
--trust-remote-code --limit-mm-per-prompt image=2
vllm serve microsoft/Phi-3.5-vision-instruct --
task generate
\
--trust-remote-code
--max-model-len 4096
--limit-mm-per-prompt image=2
(audio inference with Ultravox)
vllm serve fixie-ai/ultravox-v0_3 --max-model-len 4096
...
...
tests/conftest.py
View file @
051eaf6d
...
...
@@ -25,7 +25,7 @@ from tests.models.utils import (TokensTextLogprobs,
from
vllm
import
LLM
,
SamplingParams
from
vllm.assets.image
import
ImageAsset
from
vllm.assets.video
import
VideoAsset
from
vllm.config
import
TokenizerPoolConfig
from
vllm.config
import
TaskOption
,
TokenizerPoolConfig
from
vllm.connections
import
global_http_connection
from
vllm.distributed
import
(
destroy_distributed_environment
,
destroy_model_parallel
,
...
...
@@ -619,6 +619,7 @@ class VllmRunner:
def
__init__
(
self
,
model_name
:
str
,
task
:
TaskOption
=
"auto"
,
tokenizer_name
:
Optional
[
str
]
=
None
,
# Use smaller max model length, otherwise bigger model cannot run due
# to kv cache size limit.
...
...
@@ -634,6 +635,7 @@ class VllmRunner:
)
->
None
:
self
.
model
=
LLM
(
model
=
model_name
,
task
=
task
,
tokenizer
=
tokenizer_name
,
trust_remote_code
=
True
,
dtype
=
dtype
,
...
...
tests/core/test_chunked_prefill_scheduler.py
View file @
051eaf6d
...
...
@@ -33,7 +33,8 @@ def test_simple():
num_seq_group
=
4
max_model_len
=
16
max_num_batched_tokens
=
64
scheduler_config
=
SchedulerConfig
(
max_num_batched_tokens
,
scheduler_config
=
SchedulerConfig
(
"generate"
,
max_num_batched_tokens
,
num_seq_group
,
max_model_len
,
enable_chunked_prefill
=
True
)
...
...
@@ -78,6 +79,7 @@ def test_chunk():
max_model_len
=
80
max_num_batched_tokens
=
64
scheduler_config
=
SchedulerConfig
(
"generate"
,
max_num_batched_tokens
,
max_seqs
,
max_model_len
,
...
...
@@ -126,6 +128,7 @@ def test_complex():
max_model_len
=
80
max_num_batched_tokens
=
64
scheduler_config
=
SchedulerConfig
(
"generate"
,
max_num_batched_tokens
,
max_seqs
,
max_model_len
,
...
...
@@ -196,6 +199,7 @@ def test_maximal_decoding():
max_model_len
=
8
max_num_batched_tokens
=
2
scheduler_config
=
SchedulerConfig
(
"generate"
,
max_num_batched_tokens
,
max_seqs
,
max_model_len
,
...
...
@@ -289,6 +293,7 @@ def test_prompt_limit():
max_model_len
=
64
max_num_batched_tokens
=
32
scheduler_config
=
SchedulerConfig
(
"generate"
,
max_num_batched_tokens
,
max_seqs
,
max_model_len
,
...
...
@@ -321,7 +326,8 @@ def test_prompt_limit_exceed():
max_seqs
=
64
max_model_len
=
32
max_num_batched_tokens
=
64
scheduler_config
=
SchedulerConfig
(
max_num_batched_tokens
,
scheduler_config
=
SchedulerConfig
(
"generate"
,
max_num_batched_tokens
,
max_seqs
,
max_model_len
,
enable_chunked_prefill
=
True
)
...
...
@@ -348,6 +354,7 @@ def test_swap():
max_model_len
=
200
max_num_batched_tokens
=
30
scheduler_config
=
SchedulerConfig
(
"generate"
,
max_num_batched_tokens
,
max_seqs
,
max_model_len
,
...
...
@@ -404,6 +411,7 @@ def test_running_prefill_prioritized_over_swap():
max_model_len
=
200
max_num_batched_tokens
=
30
scheduler_config
=
SchedulerConfig
(
"generate"
,
max_num_batched_tokens
,
max_seqs
,
max_model_len
,
...
...
@@ -498,6 +506,7 @@ def test_chunked_prefill_preempt():
max_model_len
=
200
max_num_batched_tokens
=
30
scheduler_config
=
SchedulerConfig
(
"generate"
,
max_num_batched_tokens
,
max_seqs
,
max_model_len
,
...
...
@@ -563,6 +572,7 @@ def test_chunked_prefill_max_seqs():
max_model_len
=
80
max_num_batched_tokens
=
64
scheduler_config
=
SchedulerConfig
(
"generate"
,
max_num_batched_tokens
,
max_seqs
,
max_model_len
,
...
...
@@ -617,6 +627,7 @@ def test_perfix_caching():
max_model_len
=
80
max_num_batched_tokens
=
64
scheduler_config
=
SchedulerConfig
(
"generate"
,
max_num_batched_tokens
,
max_seqs
,
max_model_len
,
...
...
tests/core/test_scheduler.py
View file @
051eaf6d
...
...
@@ -20,9 +20,10 @@ from .utils import (append_new_token, append_new_token_seq_group,
def
test_scheduler_add_seq_group
():
block_size
=
4
scheduler_config
=
SchedulerConfig
(
100
,
64
,
1
,
"generate"
,
max_num_batched_tokens
=
100
,
max_num_seqs
=
64
,
max_model_len
=
1
,
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
cache_dtype
=
"auto"
)
cache_config
.
num_cpu_blocks
=
4
...
...
@@ -42,9 +43,10 @@ def test_scheduler_add_seq_group():
def
test_scheduler_abort_seq_group
():
block_size
=
4
scheduler_config
=
SchedulerConfig
(
100
,
64
,
1
,
"generate"
,
max_num_batched_tokens
=
100
,
max_num_seqs
=
64
,
max_model_len
=
1
,
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
4
...
...
@@ -70,9 +72,10 @@ def test_scheduler_schedule_simple():
num_seq_group
=
4
max_model_len
=
16
scheduler_config
=
SchedulerConfig
(
64
,
num_seq_group
,
max_model_len
,
"generate"
,
max_num_batched_tokens
=
64
,
max_num_seqs
=
num_seq_group
,
max_model_len
=
max_model_len
,
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
8
...
...
@@ -114,9 +117,10 @@ def test_scheduler_prefill_prioritized():
max_model_len
=
30
max_batched_num_tokens
=
30
scheduler_config
=
SchedulerConfig
(
max_batched_num_tokens
,
2
,
max_model_len
,
"generate"
,
max_num_batched_tokens
=
max_batched_num_tokens
,
max_num_seqs
=
2
,
max_model_len
=
max_model_len
,
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
16
...
...
@@ -145,9 +149,10 @@ def test_scheduler_schedule_preempt_abort():
block_size
=
4
max_model_len
=
16
scheduler_config
=
SchedulerConfig
(
64
,
2
,
max_model_len
,
"generate"
,
max_num_batched_tokens
=
64
,
max_num_seqs
=
2
,
max_model_len
=
max_model_len
,
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
2
...
...
@@ -204,9 +209,10 @@ def test_scheduler_max_seqs():
max_seq_group
=
2
max_model_len
=
16
scheduler_config
=
SchedulerConfig
(
64
,
max_seq_group
,
max_model_len
,
"generate"
,
max_num_batched_tokens
=
64
,
max_num_seqs
=
max_seq_group
,
max_model_len
=
max_model_len
,
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
8
...
...
@@ -248,9 +254,10 @@ def test_scheduler_max_seqs():
def
test_scheduler_delay_factor
():
block_size
=
4
scheduler_config
=
SchedulerConfig
(
100
,
64
,
16
,
"generate"
,
max_num_batched_tokens
=
100
,
max_num_seqs
=
64
,
max_model_len
=
16
,
delay_factor
=
0.5
,
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
...
...
@@ -350,9 +357,10 @@ def initialize_scheduler(
):
block_size
=
block_size
scheduler_config
=
SchedulerConfig
(
max_token_budget
,
max_num_seqs
,
max_model_len
,
"generate"
,
max_num_batched_tokens
=
max_token_budget
,
max_num_seqs
=
max_num_seqs
,
max_model_len
=
max_model_len
,
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
num_cpu_blocks
...
...
tests/core/test_scheduler_encoder_decoder.py
View file @
051eaf6d
...
...
@@ -36,7 +36,12 @@ def test_scheduler_schedule_simple_encoder_decoder():
block_size
=
4
num_seq_group
=
4
max_model_len
=
16
scheduler_config
=
SchedulerConfig
(
64
,
num_seq_group
,
max_model_len
)
scheduler_config
=
SchedulerConfig
(
task
=
"generate"
,
max_num_batched_tokens
=
64
,
max_num_seqs
=
num_seq_group
,
max_model_len
=
max_model_len
,
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
16
# enc and dec prompts per seq_group
cache_config
.
num_gpu_blocks
=
16
# enc and dec prompts per seq_group
...
...
tests/distributed/test_pipeline_parallel.py
View file @
051eaf6d
...
...
@@ -11,6 +11,7 @@ from typing import List, Literal, NamedTuple, Optional
import
pytest
from
vllm.config
import
TaskOption
from
vllm.logger
import
init_logger
from
..utils
import
compare_two_settings
,
fork_new_process_for_each_test
...
...
@@ -31,6 +32,7 @@ class ParallelSetup(NamedTuple):
class
PPTestSettings
:
parallel_setups
:
List
[
ParallelSetup
]
distributed_backends
:
List
[
str
]
task
:
TaskOption
trust_remote_code
:
bool
tokenizer_mode
:
Optional
[
str
]
...
...
@@ -39,6 +41,7 @@ class PPTestSettings:
*
,
tp_base
:
int
=
1
,
pp_base
:
int
=
2
,
task
:
TaskOption
=
"auto"
,
trust_remote_code
:
bool
=
False
,
tokenizer_mode
:
Optional
[
str
]
=
None
,
):
...
...
@@ -66,6 +69,7 @@ class PPTestSettings:
chunked_prefill
=
False
),
],
distributed_backends
=
[
"mp"
,
"ray"
],
task
=
task
,
trust_remote_code
=
trust_remote_code
,
tokenizer_mode
=
tokenizer_mode
,
)
...
...
@@ -75,6 +79,7 @@ class PPTestSettings:
*
,
tp_base
:
int
=
1
,
pp_base
:
int
=
2
,
task
:
TaskOption
=
"auto"
,
trust_remote_code
:
bool
=
False
,
tokenizer_mode
:
Optional
[
str
]
=
None
,
):
...
...
@@ -86,6 +91,7 @@ class PPTestSettings:
chunked_prefill
=
False
),
],
distributed_backends
=
[
"mp"
],
task
=
task
,
trust_remote_code
=
trust_remote_code
,
tokenizer_mode
=
tokenizer_mode
,
)
...
...
@@ -94,7 +100,7 @@ class PPTestSettings:
for
parallel_setup
in
self
.
parallel_setups
:
for
distributed_backend
in
self
.
distributed_backends
:
yield
(
model_name
,
parallel_setup
,
distributed_backend
,
self
.
trust_remote_code
,
self
.
tokenizer_mode
)
self
.
task
,
self
.
trust_remote_code
,
self
.
tokenizer_mode
)
# NOTE: You can adjust tp_base and/or pp_base locally to fit the model in GPU
...
...
@@ -213,6 +219,7 @@ def _compare_tp(
model_name
:
str
,
parallel_setup
:
ParallelSetup
,
distributed_backend
:
str
,
task
:
TaskOption
,
trust_remote_code
:
bool
,
tokenizer_mode
:
Optional
[
str
],
num_gpus_available
:
int
,
...
...
@@ -240,6 +247,8 @@ def _compare_tp(
common_args
.
append
(
"--enable-chunked-prefill"
)
if
eager_mode
:
common_args
.
append
(
"--enforce-eager"
)
if
task
!=
"auto"
:
common_args
.
extend
([
"--task"
,
task
])
if
trust_remote_code
:
common_args
.
append
(
"--trust-remote-code"
)
if
tokenizer_mode
:
...
...
@@ -297,7 +306,7 @@ def _compare_tp(
@
pytest
.
mark
.
parametrize
(
(
"model_name"
,
"parallel_setup"
,
"distributed_backend"
,
(
"model_name"
,
"parallel_setup"
,
"distributed_backend"
,
"task"
,
"trust_remote_code"
,
"tokenizer_mode"
),
[
params
for
model_name
,
settings
in
GENERATION_MODEL_SETTINGS
.
items
()
...
...
@@ -310,6 +319,7 @@ def test_tp_language_generation(
model_name
:
str
,
parallel_setup
:
ParallelSetup
,
distributed_backend
:
str
,
task
:
TaskOption
,
trust_remote_code
:
bool
,
tokenizer_mode
:
Optional
[
str
],
num_gpus_available
,
...
...
@@ -317,6 +327,7 @@ def test_tp_language_generation(
_compare_tp
(
model_name
,
parallel_setup
,
distributed_backend
,
task
,
trust_remote_code
,
tokenizer_mode
,
num_gpus_available
,
...
...
@@ -324,7 +335,7 @@ def test_tp_language_generation(
@
pytest
.
mark
.
parametrize
(
(
"model_name"
,
"parallel_setup"
,
"distributed_backend"
,
(
"model_name"
,
"parallel_setup"
,
"distributed_backend"
,
"task"
,
"trust_remote_code"
,
"tokenizer_mode"
),
[
params
for
model_name
,
settings
in
EMBEDDING_MODEL_SETTINGS
.
items
()
...
...
@@ -337,6 +348,7 @@ def test_tp_language_embedding(
model_name
:
str
,
parallel_setup
:
ParallelSetup
,
distributed_backend
:
str
,
task
:
TaskOption
,
trust_remote_code
:
bool
,
tokenizer_mode
:
Optional
[
str
],
num_gpus_available
,
...
...
@@ -344,6 +356,7 @@ def test_tp_language_embedding(
_compare_tp
(
model_name
,
parallel_setup
,
distributed_backend
,
task
,
trust_remote_code
,
tokenizer_mode
,
num_gpus_available
,
...
...
@@ -351,7 +364,7 @@ def test_tp_language_embedding(
@
pytest
.
mark
.
parametrize
(
(
"model_name"
,
"parallel_setup"
,
"distributed_backend"
,
(
"model_name"
,
"parallel_setup"
,
"distributed_backend"
,
"task"
,
"trust_remote_code"
,
"tokenizer_mode"
),
[
params
for
model_name
,
settings
in
MULTIMODAL_MODEL_SETTINGS
.
items
()
...
...
@@ -364,6 +377,7 @@ def test_tp_multimodal_generation(
model_name
:
str
,
parallel_setup
:
ParallelSetup
,
distributed_backend
:
str
,
task
:
TaskOption
,
trust_remote_code
:
bool
,
tokenizer_mode
:
Optional
[
str
],
num_gpus_available
,
...
...
@@ -371,6 +385,7 @@ def test_tp_multimodal_generation(
_compare_tp
(
model_name
,
parallel_setup
,
distributed_backend
,
task
,
trust_remote_code
,
tokenizer_mode
,
num_gpus_available
,
...
...
tests/entrypoints/llm/test_chat.py
0 → 100644
View file @
051eaf6d
from
typing
import
List
import
pytest
from
vllm
import
LLM
from
..openai.test_vision
import
TEST_IMAGE_URLS
def
test_chat
():
llm
=
LLM
(
model
=
"meta-llama/Meta-Llama-3-8B-Instruct"
)
prompt1
=
"Explain the concept of entropy."
messages
=
[
{
"role"
:
"system"
,
"content"
:
"You are a helpful assistant"
},
{
"role"
:
"user"
,
"content"
:
prompt1
},
]
outputs
=
llm
.
chat
(
messages
)
assert
len
(
outputs
)
==
1
def
test_multi_chat
():
llm
=
LLM
(
model
=
"meta-llama/Meta-Llama-3-8B-Instruct"
)
prompt1
=
"Explain the concept of entropy."
prompt2
=
"Explain what among us is."
conversation1
=
[
{
"role"
:
"system"
,
"content"
:
"You are a helpful assistant"
},
{
"role"
:
"user"
,
"content"
:
prompt1
},
]
conversation2
=
[
{
"role"
:
"system"
,
"content"
:
"You are a helpful assistant"
},
{
"role"
:
"user"
,
"content"
:
prompt2
},
]
messages
=
[
conversation1
,
conversation2
]
outputs
=
llm
.
chat
(
messages
)
assert
len
(
outputs
)
==
2
@
pytest
.
mark
.
parametrize
(
"image_urls"
,
[[
TEST_IMAGE_URLS
[
0
],
TEST_IMAGE_URLS
[
1
]]])
def
test_chat_multi_image
(
image_urls
:
List
[
str
]):
llm
=
LLM
(
model
=
"microsoft/Phi-3.5-vision-instruct"
,
dtype
=
"bfloat16"
,
max_model_len
=
4096
,
max_num_seqs
=
5
,
enforce_eager
=
True
,
trust_remote_code
=
True
,
limit_mm_per_prompt
=
{
"image"
:
2
},
)
messages
=
[{
"role"
:
"user"
,
"content"
:
[
*
({
"type"
:
"image_url"
,
"image_url"
:
{
"url"
:
image_url
}
}
for
image_url
in
image_urls
),
{
"type"
:
"text"
,
"text"
:
"What's in this image?"
},
],
}]
outputs
=
llm
.
chat
(
messages
)
assert
len
(
outputs
)
>=
0
tests/entrypoints/llm/test_generate.py
View file @
051eaf6d
...
...
@@ -6,7 +6,6 @@ import pytest
from
vllm
import
LLM
,
RequestOutput
,
SamplingParams
from
...conftest
import
cleanup
from
..openai.test_vision
import
TEST_IMAGE_URLS
MODEL_NAME
=
"facebook/opt-125m"
...
...
@@ -104,90 +103,3 @@ def test_multiple_sampling_params(llm: LLM):
# sampling_params is None, default params should be applied
outputs
=
llm
.
generate
(
PROMPTS
,
sampling_params
=
None
)
assert
len
(
PROMPTS
)
==
len
(
outputs
)
def
test_chat
():
llm
=
LLM
(
model
=
"meta-llama/Meta-Llama-3-8B-Instruct"
)
prompt1
=
"Explain the concept of entropy."
messages
=
[
{
"role"
:
"system"
,
"content"
:
"You are a helpful assistant"
},
{
"role"
:
"user"
,
"content"
:
prompt1
},
]
outputs
=
llm
.
chat
(
messages
)
assert
len
(
outputs
)
==
1
def
test_multi_chat
():
llm
=
LLM
(
model
=
"meta-llama/Meta-Llama-3-8B-Instruct"
)
prompt1
=
"Explain the concept of entropy."
prompt2
=
"Explain what among us is."
conversation1
=
[
{
"role"
:
"system"
,
"content"
:
"You are a helpful assistant"
},
{
"role"
:
"user"
,
"content"
:
prompt1
},
]
conversation2
=
[
{
"role"
:
"system"
,
"content"
:
"You are a helpful assistant"
},
{
"role"
:
"user"
,
"content"
:
prompt2
},
]
messages
=
[
conversation1
,
conversation2
]
outputs
=
llm
.
chat
(
messages
)
assert
len
(
outputs
)
==
2
@
pytest
.
mark
.
parametrize
(
"image_urls"
,
[[
TEST_IMAGE_URLS
[
0
],
TEST_IMAGE_URLS
[
1
]]])
def
test_chat_multi_image
(
image_urls
:
List
[
str
]):
llm
=
LLM
(
model
=
"microsoft/Phi-3.5-vision-instruct"
,
dtype
=
"bfloat16"
,
max_model_len
=
4096
,
max_num_seqs
=
5
,
enforce_eager
=
True
,
trust_remote_code
=
True
,
limit_mm_per_prompt
=
{
"image"
:
2
},
)
messages
=
[{
"role"
:
"user"
,
"content"
:
[
*
({
"type"
:
"image_url"
,
"image_url"
:
{
"url"
:
image_url
}
}
for
image_url
in
image_urls
),
{
"type"
:
"text"
,
"text"
:
"What's in this image?"
},
],
}]
outputs
=
llm
.
chat
(
messages
)
assert
len
(
outputs
)
>=
0
tests/entrypoints/llm/test_init.py
0 → 100644
View file @
051eaf6d
import
pytest
from
vllm
import
LLM
from
...utils
import
error_on_warning
MODEL_NAME
=
"facebook/opt-125m"
def
test_pos_args_deprecated
():
with
error_on_warning
(
DeprecationWarning
):
LLM
(
model
=
MODEL_NAME
,
tokenizer
=
MODEL_NAME
)
with
error_on_warning
(
DeprecationWarning
):
LLM
(
MODEL_NAME
,
tokenizer
=
MODEL_NAME
)
with
pytest
.
warns
(
DeprecationWarning
,
match
=
"'tokenizer'"
):
LLM
(
MODEL_NAME
,
MODEL_NAME
)
with
pytest
.
warns
(
DeprecationWarning
,
match
=
"'tokenizer', 'tokenizer_mode'"
):
LLM
(
MODEL_NAME
,
MODEL_NAME
,
"auto"
)
tests/entrypoints/openai/test_serving_chat.py
View file @
051eaf6d
...
...
@@ -22,12 +22,12 @@ class MockHFConfig:
@
dataclass
class
MockModelConfig
:
task
=
"generate"
tokenizer
=
MODEL_NAME
trust_remote_code
=
False
tokenizer_mode
=
"auto"
max_model_len
=
100
tokenizer_revision
=
None
embedding_mode
=
False
multimodal_config
=
MultiModalConfig
()
hf_config
=
MockHFConfig
()
...
...
tests/entrypoints/openai/test_vision.py
View file @
051eaf6d
...
...
@@ -23,6 +23,8 @@ TEST_IMAGE_URLS = [
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
():
args
=
[
"--task"
,
"generate"
,
"--dtype"
,
"bfloat16"
,
"--max-model-len"
,
...
...
tests/entrypoints/test_chat_utils.py
View file @
051eaf6d
...
...
@@ -18,7 +18,8 @@ PHI3V_MODEL_ID = "microsoft/Phi-3.5-vision-instruct"
@
pytest
.
fixture
(
scope
=
"module"
)
def
phi3v_model_config
():
return
ModelConfig
(
PHI3V_MODEL_ID
,
PHI3V_MODEL_ID
,
task
=
"generate"
,
tokenizer
=
PHI3V_MODEL_ID
,
tokenizer_mode
=
"auto"
,
trust_remote_code
=
True
,
dtype
=
"bfloat16"
,
...
...
tests/lora/test_worker.py
View file @
051eaf6d
...
...
@@ -15,7 +15,8 @@ def test_worker_apply_lora(sql_lora_files):
worker
=
Worker
(
model_config
=
ModelConfig
(
"meta-llama/Llama-2-7b-hf"
,
"meta-llama/Llama-2-7b-hf"
,
task
=
"auto"
,
tokenizer
=
"meta-llama/Llama-2-7b-hf"
,
tokenizer_mode
=
"auto"
,
trust_remote_code
=
False
,
seed
=
0
,
...
...
@@ -27,7 +28,7 @@ def test_worker_apply_lora(sql_lora_files):
load_format
=
"dummy"
,
),
parallel_config
=
ParallelConfig
(
1
,
1
,
False
),
scheduler_config
=
SchedulerConfig
(
32
,
32
,
32
),
scheduler_config
=
SchedulerConfig
(
"generate"
,
32
,
32
,
32
),
device_config
=
DeviceConfig
(
"cuda"
),
cache_config
=
CacheConfig
(
block_size
=
16
,
gpu_memory_utilization
=
1.
,
...
...
tests/models/decoder_only/vision_language/test_phi3v.py
View file @
051eaf6d
...
...
@@ -89,6 +89,7 @@ def run_test(
# max_model_len should be greater than image_feature_size
with
vllm_runner
(
model
,
task
=
"generate"
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
dtype
=
dtype
,
...
...
tests/models/embedding/vision_language/test_phi3v.py
View file @
051eaf6d
...
...
@@ -28,6 +28,7 @@ def test_models(
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
with
vllm_runner
(
model
,
task
=
"embedding"
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
dtype
=
dtype
,
...
...
tests/models/utils.py
View file @
051eaf6d
...
...
@@ -3,7 +3,7 @@ from typing import Dict, List, Optional, Sequence, Tuple, Union
import
torch
from
vllm.config
import
ModelConfig
from
vllm.config
import
ModelConfig
,
TaskOption
from
vllm.inputs
import
InputContext
from
vllm.sequence
import
Logprob
,
PromptLogprobs
,
SampleLogprobs
from
vllm.utils
import
is_cpu
...
...
@@ -248,6 +248,7 @@ def check_logprobs_close(
def
build_model_context
(
model_name
:
str
,
task
:
TaskOption
=
"auto"
,
tokenizer_name
:
Optional
[
str
]
=
None
,
trust_remote_code
:
bool
=
False
,
dtype
:
Optional
[
Union
[
str
,
torch
.
dtype
]]
=
None
,
...
...
@@ -273,7 +274,8 @@ def build_model_context(model_name: str,
model_config
=
ModelConfig
(
model_name
,
tokenizer_name
,
task
=
task
,
tokenizer
=
tokenizer_name
,
tokenizer_mode
=
"auto"
,
trust_remote_code
=
trust_remote_code
,
dtype
=
dtype
,
...
...
tests/multimodal/test_mapper.py
View file @
051eaf6d
...
...
@@ -24,6 +24,7 @@ def test_clip_image_processor(image_assets, mm_registry, dtype, size_factor):
model_config
=
ModelConfig
(
model
=
MODEL_NAME
,
task
=
"auto"
,
tokenizer
=
MODEL_NAME
,
tokenizer_mode
=
"auto"
,
trust_remote_code
=
False
,
...
...
@@ -67,6 +68,7 @@ def test_llava_next_image_processor(image_assets, mm_registry, dtype,
model_config
=
ModelConfig
(
model
=
MODEL_NAME
,
task
=
"auto"
,
tokenizer
=
MODEL_NAME
,
tokenizer_mode
=
"auto"
,
trust_remote_code
=
False
,
...
...
@@ -109,6 +111,7 @@ def test_mm_limits(image_assets, mm_registry, num_images, limit, is_valid):
model_config
=
ModelConfig
(
model
=
MODEL_NAME
,
task
=
"auto"
,
tokenizer
=
MODEL_NAME
,
tokenizer_mode
=
"auto"
,
trust_remote_code
=
False
,
...
...
@@ -139,6 +142,7 @@ def test_image_mapper_multi(image_assets, mm_registry, num_images):
model_config
=
ModelConfig
(
model
=
MODEL_NAME
,
task
=
"auto"
,
tokenizer
=
MODEL_NAME
,
tokenizer_mode
=
"auto"
,
trust_remote_code
=
False
,
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment