"examples/vscode:/vscode.git/clone" did not exist on "93529753cda1a8cba52fd7d5cc57d1273633337a"
Unverified commit 051eaf6d, authored by Cyrus Leung and committed by GitHub
Browse files

[Model] Add user-configurable task for models that support both generation and embedding (#9424)

parent 7dbe738d
...@@ -294,6 +294,10 @@ Text Embedding ...@@ -294,6 +294,10 @@ Text Embedding
- -
- ✅︎ - ✅︎
.. important::
Some model architectures support both generation and embedding tasks.
In this case, you have to pass :code:`--task embedding` to run the model in embedding mode.
Reward Modeling Reward Modeling
--------------- ---------------
...@@ -482,6 +486,10 @@ Multimodal Embedding ...@@ -482,6 +486,10 @@ Multimodal Embedding
- 🚧 - 🚧
- ✅︎ - ✅︎
.. important::
Some model architectures support both generation and embedding tasks.
In this case, you have to pass :code:`--task embedding` to run the model in embedding mode.
---- ----
If your model uses one of the above model architectures, you can seamlessly run your model with vLLM. If your model uses one of the above model architectures, you can seamlessly run your model with vLLM.
......
...@@ -181,8 +181,8 @@ Below is an example on how to launch the same ``microsoft/Phi-3.5-vision-instruc ...@@ -181,8 +181,8 @@ Below is an example on how to launch the same ``microsoft/Phi-3.5-vision-instruc
.. code-block:: bash .. code-block:: bash
vllm serve microsoft/Phi-3.5-vision-instruct --max-model-len 4096 \ vllm serve microsoft/Phi-3.5-vision-instruct --task generate \
--trust-remote-code --limit-mm-per-prompt image=2 --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt image=2
.. important:: .. important::
Since OpenAI Vision API is based on `Chat Completions <https://platform.openai.com/docs/api-reference/chat>`_ API, Since OpenAI Vision API is based on `Chat Completions <https://platform.openai.com/docs/api-reference/chat>`_ API,
......
...@@ -7,6 +7,7 @@ prompt = "<|image_1|> Represent the given image with the following question: Wha ...@@ -7,6 +7,7 @@ prompt = "<|image_1|> Represent the given image with the following question: Wha
# Create an LLM. # Create an LLM.
llm = LLM( llm = LLM(
model="TIGER-Lab/VLM2Vec-Full", model="TIGER-Lab/VLM2Vec-Full",
task="embedding",
trust_remote_code=True, trust_remote_code=True,
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
......
...@@ -7,8 +7,8 @@ Launch the vLLM server with the following command: ...@@ -7,8 +7,8 @@ Launch the vLLM server with the following command:
vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja
(multi-image inference with Phi-3.5-vision-instruct) (multi-image inference with Phi-3.5-vision-instruct)
vllm serve microsoft/Phi-3.5-vision-instruct --max-model-len 4096 \ vllm serve microsoft/Phi-3.5-vision-instruct --task generate \
--trust-remote-code --limit-mm-per-prompt image=2 --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt image=2
(audio inference with Ultravox) (audio inference with Ultravox)
vllm serve fixie-ai/ultravox-v0_3 --max-model-len 4096 vllm serve fixie-ai/ultravox-v0_3 --max-model-len 4096
......
...@@ -25,7 +25,7 @@ from tests.models.utils import (TokensTextLogprobs, ...@@ -25,7 +25,7 @@ from tests.models.utils import (TokensTextLogprobs,
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset from vllm.assets.video import VideoAsset
from vllm.config import TokenizerPoolConfig from vllm.config import TaskOption, TokenizerPoolConfig
from vllm.connections import global_http_connection from vllm.connections import global_http_connection
from vllm.distributed import (destroy_distributed_environment, from vllm.distributed import (destroy_distributed_environment,
destroy_model_parallel, destroy_model_parallel,
...@@ -619,6 +619,7 @@ class VllmRunner: ...@@ -619,6 +619,7 @@ class VllmRunner:
def __init__( def __init__(
self, self,
model_name: str, model_name: str,
task: TaskOption = "auto",
tokenizer_name: Optional[str] = None, tokenizer_name: Optional[str] = None,
# Use smaller max model length, otherwise bigger model cannot run due # Use smaller max model length, otherwise bigger model cannot run due
# to kv cache size limit. # to kv cache size limit.
...@@ -634,6 +635,7 @@ class VllmRunner: ...@@ -634,6 +635,7 @@ class VllmRunner:
) -> None: ) -> None:
self.model = LLM( self.model = LLM(
model=model_name, model=model_name,
task=task,
tokenizer=tokenizer_name, tokenizer=tokenizer_name,
trust_remote_code=True, trust_remote_code=True,
dtype=dtype, dtype=dtype,
......
...@@ -33,7 +33,8 @@ def test_simple(): ...@@ -33,7 +33,8 @@ def test_simple():
num_seq_group = 4 num_seq_group = 4
max_model_len = 16 max_model_len = 16
max_num_batched_tokens = 64 max_num_batched_tokens = 64
scheduler_config = SchedulerConfig(max_num_batched_tokens, scheduler_config = SchedulerConfig("generate",
max_num_batched_tokens,
num_seq_group, num_seq_group,
max_model_len, max_model_len,
enable_chunked_prefill=True) enable_chunked_prefill=True)
...@@ -78,6 +79,7 @@ def test_chunk(): ...@@ -78,6 +79,7 @@ def test_chunk():
max_model_len = 80 max_model_len = 80
max_num_batched_tokens = 64 max_num_batched_tokens = 64
scheduler_config = SchedulerConfig( scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens, max_num_batched_tokens,
max_seqs, max_seqs,
max_model_len, max_model_len,
...@@ -126,6 +128,7 @@ def test_complex(): ...@@ -126,6 +128,7 @@ def test_complex():
max_model_len = 80 max_model_len = 80
max_num_batched_tokens = 64 max_num_batched_tokens = 64
scheduler_config = SchedulerConfig( scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens, max_num_batched_tokens,
max_seqs, max_seqs,
max_model_len, max_model_len,
...@@ -196,6 +199,7 @@ def test_maximal_decoding(): ...@@ -196,6 +199,7 @@ def test_maximal_decoding():
max_model_len = 8 max_model_len = 8
max_num_batched_tokens = 2 max_num_batched_tokens = 2
scheduler_config = SchedulerConfig( scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens, max_num_batched_tokens,
max_seqs, max_seqs,
max_model_len, max_model_len,
...@@ -289,6 +293,7 @@ def test_prompt_limit(): ...@@ -289,6 +293,7 @@ def test_prompt_limit():
max_model_len = 64 max_model_len = 64
max_num_batched_tokens = 32 max_num_batched_tokens = 32
scheduler_config = SchedulerConfig( scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens, max_num_batched_tokens,
max_seqs, max_seqs,
max_model_len, max_model_len,
...@@ -321,7 +326,8 @@ def test_prompt_limit_exceed(): ...@@ -321,7 +326,8 @@ def test_prompt_limit_exceed():
max_seqs = 64 max_seqs = 64
max_model_len = 32 max_model_len = 32
max_num_batched_tokens = 64 max_num_batched_tokens = 64
scheduler_config = SchedulerConfig(max_num_batched_tokens, scheduler_config = SchedulerConfig("generate",
max_num_batched_tokens,
max_seqs, max_seqs,
max_model_len, max_model_len,
enable_chunked_prefill=True) enable_chunked_prefill=True)
...@@ -348,6 +354,7 @@ def test_swap(): ...@@ -348,6 +354,7 @@ def test_swap():
max_model_len = 200 max_model_len = 200
max_num_batched_tokens = 30 max_num_batched_tokens = 30
scheduler_config = SchedulerConfig( scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens, max_num_batched_tokens,
max_seqs, max_seqs,
max_model_len, max_model_len,
...@@ -404,6 +411,7 @@ def test_running_prefill_prioritized_over_swap(): ...@@ -404,6 +411,7 @@ def test_running_prefill_prioritized_over_swap():
max_model_len = 200 max_model_len = 200
max_num_batched_tokens = 30 max_num_batched_tokens = 30
scheduler_config = SchedulerConfig( scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens, max_num_batched_tokens,
max_seqs, max_seqs,
max_model_len, max_model_len,
...@@ -498,6 +506,7 @@ def test_chunked_prefill_preempt(): ...@@ -498,6 +506,7 @@ def test_chunked_prefill_preempt():
max_model_len = 200 max_model_len = 200
max_num_batched_tokens = 30 max_num_batched_tokens = 30
scheduler_config = SchedulerConfig( scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens, max_num_batched_tokens,
max_seqs, max_seqs,
max_model_len, max_model_len,
...@@ -563,6 +572,7 @@ def test_chunked_prefill_max_seqs(): ...@@ -563,6 +572,7 @@ def test_chunked_prefill_max_seqs():
max_model_len = 80 max_model_len = 80
max_num_batched_tokens = 64 max_num_batched_tokens = 64
scheduler_config = SchedulerConfig( scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens, max_num_batched_tokens,
max_seqs, max_seqs,
max_model_len, max_model_len,
...@@ -617,6 +627,7 @@ def test_perfix_caching(): ...@@ -617,6 +627,7 @@ def test_perfix_caching():
max_model_len = 80 max_model_len = 80
max_num_batched_tokens = 64 max_num_batched_tokens = 64
scheduler_config = SchedulerConfig( scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens, max_num_batched_tokens,
max_seqs, max_seqs,
max_model_len, max_model_len,
......
...@@ -20,9 +20,10 @@ from .utils import (append_new_token, append_new_token_seq_group, ...@@ -20,9 +20,10 @@ from .utils import (append_new_token, append_new_token_seq_group,
def test_scheduler_add_seq_group(): def test_scheduler_add_seq_group():
block_size = 4 block_size = 4
scheduler_config = SchedulerConfig( scheduler_config = SchedulerConfig(
100, "generate",
64, max_num_batched_tokens=100,
1, max_num_seqs=64,
max_model_len=1,
) )
cache_config = CacheConfig(block_size, 1.0, 1, cache_dtype="auto") cache_config = CacheConfig(block_size, 1.0, 1, cache_dtype="auto")
cache_config.num_cpu_blocks = 4 cache_config.num_cpu_blocks = 4
...@@ -42,9 +43,10 @@ def test_scheduler_add_seq_group(): ...@@ -42,9 +43,10 @@ def test_scheduler_add_seq_group():
def test_scheduler_abort_seq_group(): def test_scheduler_abort_seq_group():
block_size = 4 block_size = 4
scheduler_config = SchedulerConfig( scheduler_config = SchedulerConfig(
100, "generate",
64, max_num_batched_tokens=100,
1, max_num_seqs=64,
max_model_len=1,
) )
cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 4 cache_config.num_cpu_blocks = 4
...@@ -70,9 +72,10 @@ def test_scheduler_schedule_simple(): ...@@ -70,9 +72,10 @@ def test_scheduler_schedule_simple():
num_seq_group = 4 num_seq_group = 4
max_model_len = 16 max_model_len = 16
scheduler_config = SchedulerConfig( scheduler_config = SchedulerConfig(
64, "generate",
num_seq_group, max_num_batched_tokens=64,
max_model_len, max_num_seqs=num_seq_group,
max_model_len=max_model_len,
) )
cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 8 cache_config.num_cpu_blocks = 8
...@@ -114,9 +117,10 @@ def test_scheduler_prefill_prioritized(): ...@@ -114,9 +117,10 @@ def test_scheduler_prefill_prioritized():
max_model_len = 30 max_model_len = 30
max_batched_num_tokens = 30 max_batched_num_tokens = 30
scheduler_config = SchedulerConfig( scheduler_config = SchedulerConfig(
max_batched_num_tokens, "generate",
2, max_num_batched_tokens=max_batched_num_tokens,
max_model_len, max_num_seqs=2,
max_model_len=max_model_len,
) )
cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 16 cache_config.num_cpu_blocks = 16
...@@ -145,9 +149,10 @@ def test_scheduler_schedule_preempt_abort(): ...@@ -145,9 +149,10 @@ def test_scheduler_schedule_preempt_abort():
block_size = 4 block_size = 4
max_model_len = 16 max_model_len = 16
scheduler_config = SchedulerConfig( scheduler_config = SchedulerConfig(
64, "generate",
2, max_num_batched_tokens=64,
max_model_len, max_num_seqs=2,
max_model_len=max_model_len,
) )
cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 2 cache_config.num_cpu_blocks = 2
...@@ -204,9 +209,10 @@ def test_scheduler_max_seqs(): ...@@ -204,9 +209,10 @@ def test_scheduler_max_seqs():
max_seq_group = 2 max_seq_group = 2
max_model_len = 16 max_model_len = 16
scheduler_config = SchedulerConfig( scheduler_config = SchedulerConfig(
64, "generate",
max_seq_group, max_num_batched_tokens=64,
max_model_len, max_num_seqs=max_seq_group,
max_model_len=max_model_len,
) )
cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 8 cache_config.num_cpu_blocks = 8
...@@ -248,9 +254,10 @@ def test_scheduler_max_seqs(): ...@@ -248,9 +254,10 @@ def test_scheduler_max_seqs():
def test_scheduler_delay_factor(): def test_scheduler_delay_factor():
block_size = 4 block_size = 4
scheduler_config = SchedulerConfig( scheduler_config = SchedulerConfig(
100, "generate",
64, max_num_batched_tokens=100,
16, max_num_seqs=64,
max_model_len=16,
delay_factor=0.5, delay_factor=0.5,
) )
cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config = CacheConfig(block_size, 1.0, 1, "auto")
...@@ -350,9 +357,10 @@ def initialize_scheduler( ...@@ -350,9 +357,10 @@ def initialize_scheduler(
): ):
block_size = block_size block_size = block_size
scheduler_config = SchedulerConfig( scheduler_config = SchedulerConfig(
max_token_budget, "generate",
max_num_seqs, max_num_batched_tokens=max_token_budget,
max_model_len, max_num_seqs=max_num_seqs,
max_model_len=max_model_len,
) )
cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = num_cpu_blocks cache_config.num_cpu_blocks = num_cpu_blocks
......
...@@ -36,7 +36,12 @@ def test_scheduler_schedule_simple_encoder_decoder(): ...@@ -36,7 +36,12 @@ def test_scheduler_schedule_simple_encoder_decoder():
block_size = 4 block_size = 4
num_seq_group = 4 num_seq_group = 4
max_model_len = 16 max_model_len = 16
scheduler_config = SchedulerConfig(64, num_seq_group, max_model_len) scheduler_config = SchedulerConfig(
task="generate",
max_num_batched_tokens=64,
max_num_seqs=num_seq_group,
max_model_len=max_model_len,
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 16 # enc and dec prompts per seq_group cache_config.num_cpu_blocks = 16 # enc and dec prompts per seq_group
cache_config.num_gpu_blocks = 16 # enc and dec prompts per seq_group cache_config.num_gpu_blocks = 16 # enc and dec prompts per seq_group
......
...@@ -11,6 +11,7 @@ from typing import List, Literal, NamedTuple, Optional ...@@ -11,6 +11,7 @@ from typing import List, Literal, NamedTuple, Optional
import pytest import pytest
from vllm.config import TaskOption
from vllm.logger import init_logger from vllm.logger import init_logger
from ..utils import compare_two_settings, fork_new_process_for_each_test from ..utils import compare_two_settings, fork_new_process_for_each_test
...@@ -31,6 +32,7 @@ class ParallelSetup(NamedTuple): ...@@ -31,6 +32,7 @@ class ParallelSetup(NamedTuple):
class PPTestSettings: class PPTestSettings:
parallel_setups: List[ParallelSetup] parallel_setups: List[ParallelSetup]
distributed_backends: List[str] distributed_backends: List[str]
task: TaskOption
trust_remote_code: bool trust_remote_code: bool
tokenizer_mode: Optional[str] tokenizer_mode: Optional[str]
...@@ -39,6 +41,7 @@ class PPTestSettings: ...@@ -39,6 +41,7 @@ class PPTestSettings:
*, *,
tp_base: int = 1, tp_base: int = 1,
pp_base: int = 2, pp_base: int = 2,
task: TaskOption = "auto",
trust_remote_code: bool = False, trust_remote_code: bool = False,
tokenizer_mode: Optional[str] = None, tokenizer_mode: Optional[str] = None,
): ):
...@@ -66,6 +69,7 @@ class PPTestSettings: ...@@ -66,6 +69,7 @@ class PPTestSettings:
chunked_prefill=False), chunked_prefill=False),
], ],
distributed_backends=["mp", "ray"], distributed_backends=["mp", "ray"],
task=task,
trust_remote_code=trust_remote_code, trust_remote_code=trust_remote_code,
tokenizer_mode=tokenizer_mode, tokenizer_mode=tokenizer_mode,
) )
...@@ -75,6 +79,7 @@ class PPTestSettings: ...@@ -75,6 +79,7 @@ class PPTestSettings:
*, *,
tp_base: int = 1, tp_base: int = 1,
pp_base: int = 2, pp_base: int = 2,
task: TaskOption = "auto",
trust_remote_code: bool = False, trust_remote_code: bool = False,
tokenizer_mode: Optional[str] = None, tokenizer_mode: Optional[str] = None,
): ):
...@@ -86,6 +91,7 @@ class PPTestSettings: ...@@ -86,6 +91,7 @@ class PPTestSettings:
chunked_prefill=False), chunked_prefill=False),
], ],
distributed_backends=["mp"], distributed_backends=["mp"],
task=task,
trust_remote_code=trust_remote_code, trust_remote_code=trust_remote_code,
tokenizer_mode=tokenizer_mode, tokenizer_mode=tokenizer_mode,
) )
...@@ -94,7 +100,7 @@ class PPTestSettings: ...@@ -94,7 +100,7 @@ class PPTestSettings:
for parallel_setup in self.parallel_setups: for parallel_setup in self.parallel_setups:
for distributed_backend in self.distributed_backends: for distributed_backend in self.distributed_backends:
yield (model_name, parallel_setup, distributed_backend, yield (model_name, parallel_setup, distributed_backend,
self.trust_remote_code, self.tokenizer_mode) self.task, self.trust_remote_code, self.tokenizer_mode)
# NOTE: You can adjust tp_base and/or pp_base locally to fit the model in GPU # NOTE: You can adjust tp_base and/or pp_base locally to fit the model in GPU
...@@ -213,6 +219,7 @@ def _compare_tp( ...@@ -213,6 +219,7 @@ def _compare_tp(
model_name: str, model_name: str,
parallel_setup: ParallelSetup, parallel_setup: ParallelSetup,
distributed_backend: str, distributed_backend: str,
task: TaskOption,
trust_remote_code: bool, trust_remote_code: bool,
tokenizer_mode: Optional[str], tokenizer_mode: Optional[str],
num_gpus_available: int, num_gpus_available: int,
...@@ -240,6 +247,8 @@ def _compare_tp( ...@@ -240,6 +247,8 @@ def _compare_tp(
common_args.append("--enable-chunked-prefill") common_args.append("--enable-chunked-prefill")
if eager_mode: if eager_mode:
common_args.append("--enforce-eager") common_args.append("--enforce-eager")
if task != "auto":
common_args.extend(["--task", task])
if trust_remote_code: if trust_remote_code:
common_args.append("--trust-remote-code") common_args.append("--trust-remote-code")
if tokenizer_mode: if tokenizer_mode:
...@@ -297,7 +306,7 @@ def _compare_tp( ...@@ -297,7 +306,7 @@ def _compare_tp(
@pytest.mark.parametrize( @pytest.mark.parametrize(
("model_name", "parallel_setup", "distributed_backend", ("model_name", "parallel_setup", "distributed_backend", "task",
"trust_remote_code", "tokenizer_mode"), "trust_remote_code", "tokenizer_mode"),
[ [
params for model_name, settings in GENERATION_MODEL_SETTINGS.items() params for model_name, settings in GENERATION_MODEL_SETTINGS.items()
...@@ -310,6 +319,7 @@ def test_tp_language_generation( ...@@ -310,6 +319,7 @@ def test_tp_language_generation(
model_name: str, model_name: str,
parallel_setup: ParallelSetup, parallel_setup: ParallelSetup,
distributed_backend: str, distributed_backend: str,
task: TaskOption,
trust_remote_code: bool, trust_remote_code: bool,
tokenizer_mode: Optional[str], tokenizer_mode: Optional[str],
num_gpus_available, num_gpus_available,
...@@ -317,6 +327,7 @@ def test_tp_language_generation( ...@@ -317,6 +327,7 @@ def test_tp_language_generation(
_compare_tp(model_name, _compare_tp(model_name,
parallel_setup, parallel_setup,
distributed_backend, distributed_backend,
task,
trust_remote_code, trust_remote_code,
tokenizer_mode, tokenizer_mode,
num_gpus_available, num_gpus_available,
...@@ -324,7 +335,7 @@ def test_tp_language_generation( ...@@ -324,7 +335,7 @@ def test_tp_language_generation(
@pytest.mark.parametrize( @pytest.mark.parametrize(
("model_name", "parallel_setup", "distributed_backend", ("model_name", "parallel_setup", "distributed_backend", "task",
"trust_remote_code", "tokenizer_mode"), "trust_remote_code", "tokenizer_mode"),
[ [
params for model_name, settings in EMBEDDING_MODEL_SETTINGS.items() params for model_name, settings in EMBEDDING_MODEL_SETTINGS.items()
...@@ -337,6 +348,7 @@ def test_tp_language_embedding( ...@@ -337,6 +348,7 @@ def test_tp_language_embedding(
model_name: str, model_name: str,
parallel_setup: ParallelSetup, parallel_setup: ParallelSetup,
distributed_backend: str, distributed_backend: str,
task: TaskOption,
trust_remote_code: bool, trust_remote_code: bool,
tokenizer_mode: Optional[str], tokenizer_mode: Optional[str],
num_gpus_available, num_gpus_available,
...@@ -344,6 +356,7 @@ def test_tp_language_embedding( ...@@ -344,6 +356,7 @@ def test_tp_language_embedding(
_compare_tp(model_name, _compare_tp(model_name,
parallel_setup, parallel_setup,
distributed_backend, distributed_backend,
task,
trust_remote_code, trust_remote_code,
tokenizer_mode, tokenizer_mode,
num_gpus_available, num_gpus_available,
...@@ -351,7 +364,7 @@ def test_tp_language_embedding( ...@@ -351,7 +364,7 @@ def test_tp_language_embedding(
@pytest.mark.parametrize( @pytest.mark.parametrize(
("model_name", "parallel_setup", "distributed_backend", ("model_name", "parallel_setup", "distributed_backend", "task",
"trust_remote_code", "tokenizer_mode"), "trust_remote_code", "tokenizer_mode"),
[ [
params for model_name, settings in MULTIMODAL_MODEL_SETTINGS.items() params for model_name, settings in MULTIMODAL_MODEL_SETTINGS.items()
...@@ -364,6 +377,7 @@ def test_tp_multimodal_generation( ...@@ -364,6 +377,7 @@ def test_tp_multimodal_generation(
model_name: str, model_name: str,
parallel_setup: ParallelSetup, parallel_setup: ParallelSetup,
distributed_backend: str, distributed_backend: str,
task: TaskOption,
trust_remote_code: bool, trust_remote_code: bool,
tokenizer_mode: Optional[str], tokenizer_mode: Optional[str],
num_gpus_available, num_gpus_available,
...@@ -371,6 +385,7 @@ def test_tp_multimodal_generation( ...@@ -371,6 +385,7 @@ def test_tp_multimodal_generation(
_compare_tp(model_name, _compare_tp(model_name,
parallel_setup, parallel_setup,
distributed_backend, distributed_backend,
task,
trust_remote_code, trust_remote_code,
tokenizer_mode, tokenizer_mode,
num_gpus_available, num_gpus_available,
......
from typing import List
import pytest
from vllm import LLM
from ..openai.test_vision import TEST_IMAGE_URLS
def test_chat():
    """Check that one conversation passed to ``LLM.chat`` yields one output."""
    engine = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")

    # A single conversation: a system turn followed by one user turn.
    system_turn = {
        "role": "system",
        "content": "You are a helpful assistant",
    }
    user_turn = {
        "role": "user",
        "content": "Explain the concept of entropy.",
    }

    results = engine.chat([system_turn, user_turn])
    assert len(results) == 1
def test_multi_chat():
    """Check that two conversations passed to ``LLM.chat`` yield two outputs."""
    engine = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")

    user_prompts = (
        "Explain the concept of entropy.",
        "Explain what among us is.",
    )

    # Each conversation uses the same system turn and differs only in the
    # user prompt; a fresh list of fresh dicts is built per conversation.
    conversations = [
        [
            {
                "role": "system",
                "content": "You are a helpful assistant",
            },
            {
                "role": "user",
                "content": user_prompt,
            },
        ]
        for user_prompt in user_prompts
    ]

    results = engine.chat(conversations)
    assert len(results) == 2
@pytest.mark.parametrize("image_urls",
                         [[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]])
def test_chat_multi_image(image_urls: List[str]):
    """Check that ``LLM.chat`` handles one user turn carrying several images.

    Args:
        image_urls: URLs of the images attached to the single user message.
    """
    llm = LLM(
        model="microsoft/Phi-3.5-vision-instruct",
        dtype="bfloat16",
        max_model_len=4096,
        max_num_seqs=5,
        enforce_eager=True,
        trust_remote_code=True,
        # Two images per prompt; the parametrized case sends exactly two.
        limit_mm_per_prompt={"image": 2},
    )

    messages = [{
        "role":
        "user",
        "content": [
            # One image_url part per URL, followed by the text question.
            *({
                "type": "image_url",
                "image_url": {
                    "url": image_url
                }
            } for image_url in image_urls),
            {
                "type": "text",
                "text": "What's in this image?"
            },
        ],
    }]

    outputs = llm.chat(messages)
    # A single conversation is submitted, so exactly one output is expected,
    # mirroring the check in test_chat. The earlier `>= 0` comparison was
    # always true and therefore verified nothing.
    assert len(outputs) == 1
...@@ -6,7 +6,6 @@ import pytest ...@@ -6,7 +6,6 @@ import pytest
from vllm import LLM, RequestOutput, SamplingParams from vllm import LLM, RequestOutput, SamplingParams
from ...conftest import cleanup from ...conftest import cleanup
from ..openai.test_vision import TEST_IMAGE_URLS
MODEL_NAME = "facebook/opt-125m" MODEL_NAME = "facebook/opt-125m"
...@@ -104,90 +103,3 @@ def test_multiple_sampling_params(llm: LLM): ...@@ -104,90 +103,3 @@ def test_multiple_sampling_params(llm: LLM):
# sampling_params is None, default params should be applied # sampling_params is None, default params should be applied
outputs = llm.generate(PROMPTS, sampling_params=None) outputs = llm.generate(PROMPTS, sampling_params=None)
assert len(PROMPTS) == len(outputs) assert len(PROMPTS) == len(outputs)
def test_chat():
llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")
prompt1 = "Explain the concept of entropy."
messages = [
{
"role": "system",
"content": "You are a helpful assistant"
},
{
"role": "user",
"content": prompt1
},
]
outputs = llm.chat(messages)
assert len(outputs) == 1
def test_multi_chat():
llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct")
prompt1 = "Explain the concept of entropy."
prompt2 = "Explain what among us is."
conversation1 = [
{
"role": "system",
"content": "You are a helpful assistant"
},
{
"role": "user",
"content": prompt1
},
]
conversation2 = [
{
"role": "system",
"content": "You are a helpful assistant"
},
{
"role": "user",
"content": prompt2
},
]
messages = [conversation1, conversation2]
outputs = llm.chat(messages)
assert len(outputs) == 2
@pytest.mark.parametrize("image_urls",
[[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]])
def test_chat_multi_image(image_urls: List[str]):
llm = LLM(
model="microsoft/Phi-3.5-vision-instruct",
dtype="bfloat16",
max_model_len=4096,
max_num_seqs=5,
enforce_eager=True,
trust_remote_code=True,
limit_mm_per_prompt={"image": 2},
)
messages = [{
"role":
"user",
"content": [
*({
"type": "image_url",
"image_url": {
"url": image_url
}
} for image_url in image_urls),
{
"type": "text",
"text": "What's in this image?"
},
],
}]
outputs = llm.chat(messages)
assert len(outputs) >= 0
import pytest
from vllm import LLM
from ...utils import error_on_warning
MODEL_NAME = "facebook/opt-125m"
def test_pos_args_deprecated():
    """Exercise the deprecation warnings for positional ``LLM`` arguments.

    The first two constructions run under ``error_on_warning`` (presumably
    turning a ``DeprecationWarning`` into a failure -- confirm against the
    helper), so they are expected to emit none. The last two must emit a
    ``DeprecationWarning`` whose message matches the given pattern.
    """
    # Patterns the warning message is matched against for each case.
    tokenizer_only = "'tokenizer'"
    tokenizer_and_mode = "'tokenizer', 'tokenizer_mode'"

    # Everything passed by keyword.
    with error_on_warning(DeprecationWarning):
        LLM(model=MODEL_NAME, tokenizer=MODEL_NAME)

    # Only the model passed positionally.
    with error_on_warning(DeprecationWarning):
        LLM(MODEL_NAME, tokenizer=MODEL_NAME)

    # Tokenizer passed positionally as well.
    with pytest.warns(DeprecationWarning, match=tokenizer_only):
        LLM(MODEL_NAME, MODEL_NAME)

    # Tokenizer and tokenizer mode both passed positionally.
    with pytest.warns(DeprecationWarning, match=tokenizer_and_mode):
        LLM(MODEL_NAME, MODEL_NAME, "auto")
...@@ -22,12 +22,12 @@ class MockHFConfig: ...@@ -22,12 +22,12 @@ class MockHFConfig:
@dataclass @dataclass
class MockModelConfig: class MockModelConfig:
task = "generate"
tokenizer = MODEL_NAME tokenizer = MODEL_NAME
trust_remote_code = False trust_remote_code = False
tokenizer_mode = "auto" tokenizer_mode = "auto"
max_model_len = 100 max_model_len = 100
tokenizer_revision = None tokenizer_revision = None
embedding_mode = False
multimodal_config = MultiModalConfig() multimodal_config = MultiModalConfig()
hf_config = MockHFConfig() hf_config = MockHFConfig()
......
...@@ -23,6 +23,8 @@ TEST_IMAGE_URLS = [ ...@@ -23,6 +23,8 @@ TEST_IMAGE_URLS = [
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def server(): def server():
args = [ args = [
"--task",
"generate",
"--dtype", "--dtype",
"bfloat16", "bfloat16",
"--max-model-len", "--max-model-len",
......
...@@ -18,7 +18,8 @@ PHI3V_MODEL_ID = "microsoft/Phi-3.5-vision-instruct" ...@@ -18,7 +18,8 @@ PHI3V_MODEL_ID = "microsoft/Phi-3.5-vision-instruct"
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def phi3v_model_config(): def phi3v_model_config():
return ModelConfig(PHI3V_MODEL_ID, return ModelConfig(PHI3V_MODEL_ID,
PHI3V_MODEL_ID, task="generate",
tokenizer=PHI3V_MODEL_ID,
tokenizer_mode="auto", tokenizer_mode="auto",
trust_remote_code=True, trust_remote_code=True,
dtype="bfloat16", dtype="bfloat16",
......
...@@ -15,7 +15,8 @@ def test_worker_apply_lora(sql_lora_files): ...@@ -15,7 +15,8 @@ def test_worker_apply_lora(sql_lora_files):
worker = Worker( worker = Worker(
model_config=ModelConfig( model_config=ModelConfig(
"meta-llama/Llama-2-7b-hf", "meta-llama/Llama-2-7b-hf",
"meta-llama/Llama-2-7b-hf", task="auto",
tokenizer="meta-llama/Llama-2-7b-hf",
tokenizer_mode="auto", tokenizer_mode="auto",
trust_remote_code=False, trust_remote_code=False,
seed=0, seed=0,
...@@ -27,7 +28,7 @@ def test_worker_apply_lora(sql_lora_files): ...@@ -27,7 +28,7 @@ def test_worker_apply_lora(sql_lora_files):
load_format="dummy", load_format="dummy",
), ),
parallel_config=ParallelConfig(1, 1, False), parallel_config=ParallelConfig(1, 1, False),
scheduler_config=SchedulerConfig(32, 32, 32), scheduler_config=SchedulerConfig("generate", 32, 32, 32),
device_config=DeviceConfig("cuda"), device_config=DeviceConfig("cuda"),
cache_config=CacheConfig(block_size=16, cache_config=CacheConfig(block_size=16,
gpu_memory_utilization=1., gpu_memory_utilization=1.,
......
...@@ -89,6 +89,7 @@ def run_test( ...@@ -89,6 +89,7 @@ def run_test(
# max_model_len should be greater than image_feature_size # max_model_len should be greater than image_feature_size
with vllm_runner(model, with vllm_runner(model,
task="generate",
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
dtype=dtype, dtype=dtype,
......
...@@ -28,6 +28,7 @@ def test_models( ...@@ -28,6 +28,7 @@ def test_models(
# if we run HF first, the cuda initialization will be done and it # if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method). # will hurt multiprocessing backend with fork method (the default method).
with vllm_runner(model, with vllm_runner(model,
task="embedding",
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
dtype=dtype, dtype=dtype,
......
...@@ -3,7 +3,7 @@ from typing import Dict, List, Optional, Sequence, Tuple, Union ...@@ -3,7 +3,7 @@ from typing import Dict, List, Optional, Sequence, Tuple, Union
import torch import torch
from vllm.config import ModelConfig from vllm.config import ModelConfig, TaskOption
from vllm.inputs import InputContext from vllm.inputs import InputContext
from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs
from vllm.utils import is_cpu from vllm.utils import is_cpu
...@@ -248,6 +248,7 @@ def check_logprobs_close( ...@@ -248,6 +248,7 @@ def check_logprobs_close(
def build_model_context(model_name: str, def build_model_context(model_name: str,
task: TaskOption = "auto",
tokenizer_name: Optional[str] = None, tokenizer_name: Optional[str] = None,
trust_remote_code: bool = False, trust_remote_code: bool = False,
dtype: Optional[Union[str, torch.dtype]] = None, dtype: Optional[Union[str, torch.dtype]] = None,
...@@ -273,7 +274,8 @@ def build_model_context(model_name: str, ...@@ -273,7 +274,8 @@ def build_model_context(model_name: str,
model_config = ModelConfig( model_config = ModelConfig(
model_name, model_name,
tokenizer_name, task=task,
tokenizer=tokenizer_name,
tokenizer_mode="auto", tokenizer_mode="auto",
trust_remote_code=trust_remote_code, trust_remote_code=trust_remote_code,
dtype=dtype, dtype=dtype,
......
...@@ -24,6 +24,7 @@ def test_clip_image_processor(image_assets, mm_registry, dtype, size_factor): ...@@ -24,6 +24,7 @@ def test_clip_image_processor(image_assets, mm_registry, dtype, size_factor):
model_config = ModelConfig( model_config = ModelConfig(
model=MODEL_NAME, model=MODEL_NAME,
task="auto",
tokenizer=MODEL_NAME, tokenizer=MODEL_NAME,
tokenizer_mode="auto", tokenizer_mode="auto",
trust_remote_code=False, trust_remote_code=False,
...@@ -67,6 +68,7 @@ def test_llava_next_image_processor(image_assets, mm_registry, dtype, ...@@ -67,6 +68,7 @@ def test_llava_next_image_processor(image_assets, mm_registry, dtype,
model_config = ModelConfig( model_config = ModelConfig(
model=MODEL_NAME, model=MODEL_NAME,
task="auto",
tokenizer=MODEL_NAME, tokenizer=MODEL_NAME,
tokenizer_mode="auto", tokenizer_mode="auto",
trust_remote_code=False, trust_remote_code=False,
...@@ -109,6 +111,7 @@ def test_mm_limits(image_assets, mm_registry, num_images, limit, is_valid): ...@@ -109,6 +111,7 @@ def test_mm_limits(image_assets, mm_registry, num_images, limit, is_valid):
model_config = ModelConfig( model_config = ModelConfig(
model=MODEL_NAME, model=MODEL_NAME,
task="auto",
tokenizer=MODEL_NAME, tokenizer=MODEL_NAME,
tokenizer_mode="auto", tokenizer_mode="auto",
trust_remote_code=False, trust_remote_code=False,
...@@ -139,6 +142,7 @@ def test_image_mapper_multi(image_assets, mm_registry, num_images): ...@@ -139,6 +142,7 @@ def test_image_mapper_multi(image_assets, mm_registry, num_images):
model_config = ModelConfig( model_config = ModelConfig(
model=MODEL_NAME, model=MODEL_NAME,
task="auto",
tokenizer=MODEL_NAME, tokenizer=MODEL_NAME,
tokenizer_mode="auto", tokenizer_mode="auto",
trust_remote_code=False, trust_remote_code=False,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment