Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
051eaf6d
Unverified
Commit
051eaf6d
authored
Oct 19, 2024
by
Cyrus Leung
Committed by
GitHub
Oct 18, 2024
Browse files
[Model] Add user-configurable task for models that support both generation and embedding (#9424)
parent
7dbe738d
Changes
33
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
218 additions
and
130 deletions
+218
-130
docs/source/models/supported_models.rst
docs/source/models/supported_models.rst
+8
-0
docs/source/models/vlm.rst
docs/source/models/vlm.rst
+2
-2
examples/offline_inference_vision_language_embedding.py
examples/offline_inference_vision_language_embedding.py
+1
-0
examples/openai_api_client_for_multimodal.py
examples/openai_api_client_for_multimodal.py
+2
-2
tests/conftest.py
tests/conftest.py
+3
-1
tests/core/test_chunked_prefill_scheduler.py
tests/core/test_chunked_prefill_scheduler.py
+13
-2
tests/core/test_scheduler.py
tests/core/test_scheduler.py
+32
-24
tests/core/test_scheduler_encoder_decoder.py
tests/core/test_scheduler_encoder_decoder.py
+6
-1
tests/distributed/test_pipeline_parallel.py
tests/distributed/test_pipeline_parallel.py
+19
-4
tests/entrypoints/llm/test_chat.py
tests/entrypoints/llm/test_chat.py
+92
-0
tests/entrypoints/llm/test_generate.py
tests/entrypoints/llm/test_generate.py
+0
-88
tests/entrypoints/llm/test_init.py
tests/entrypoints/llm/test_init.py
+22
-0
tests/entrypoints/openai/test_serving_chat.py
tests/entrypoints/openai/test_serving_chat.py
+1
-1
tests/entrypoints/openai/test_vision.py
tests/entrypoints/openai/test_vision.py
+2
-0
tests/entrypoints/test_chat_utils.py
tests/entrypoints/test_chat_utils.py
+2
-1
tests/lora/test_worker.py
tests/lora/test_worker.py
+3
-2
tests/models/decoder_only/vision_language/test_phi3v.py
tests/models/decoder_only/vision_language/test_phi3v.py
+1
-0
tests/models/embedding/vision_language/test_phi3v.py
tests/models/embedding/vision_language/test_phi3v.py
+1
-0
tests/models/utils.py
tests/models/utils.py
+4
-2
tests/multimodal/test_mapper.py
tests/multimodal/test_mapper.py
+4
-0
No files found.
docs/source/models/supported_models.rst
View file @
051eaf6d
...
@@ -294,6 +294,10 @@ Text Embedding
...
@@ -294,6 +294,10 @@ Text Embedding
-
-
- ✅︎
- ✅︎
.. important::
Some model architectures support both generation and embedding tasks.
In this case, you have to pass :code:`--task embedding` to run the model in embedding mode.
Reward Modeling
Reward Modeling
---------------
---------------
...
@@ -482,6 +486,10 @@ Multimodal Embedding
...
@@ -482,6 +486,10 @@ Multimodal Embedding
- 🚧
- 🚧
- ✅︎
- ✅︎
.. important::
Some model architectures support both generation and embedding tasks.
In this case, you have to pass :code:`--task embedding` to run the model in embedding mode.
----
----
If your model uses one of the above model architectures, you can seamlessly run your model with vLLM.
If your model uses one of the above model architectures, you can seamlessly run your model with vLLM.
...
...
docs/source/models/vlm.rst
View file @
051eaf6d
...
@@ -181,8 +181,8 @@ Below is an example on how to launch the same ``microsoft/Phi-3.5-vision-instruc
...
@@ -181,8 +181,8 @@ Below is an example on how to launch the same ``microsoft/Phi-3.5-vision-instruc
.. code-block:: bash
.. code-block:: bash
vllm serve microsoft/Phi-3.5-vision-instruct --
max-model-len 4096
\
vllm serve microsoft/Phi-3.5-vision-instruct --
task generate
\
--trust-remote-code --limit-mm-per-prompt image=2
--trust-remote-code
--max-model-len 4096
--limit-mm-per-prompt image=2
.. important::
.. important::
Since OpenAI Vision API is based on `Chat Completions <https://platform.openai.com/docs/api-reference/chat>`_ API,
Since OpenAI Vision API is based on `Chat Completions <https://platform.openai.com/docs/api-reference/chat>`_ API,
...
...
examples/offline_inference_vision_language_embedding.py
View file @
051eaf6d
...
@@ -7,6 +7,7 @@ prompt = "<|image_1|> Represent the given image with the following question: Wha
...
@@ -7,6 +7,7 @@ prompt = "<|image_1|> Represent the given image with the following question: Wha
# Create an LLM.
# Create an LLM.
llm
=
LLM
(
llm
=
LLM
(
model
=
"TIGER-Lab/VLM2Vec-Full"
,
model
=
"TIGER-Lab/VLM2Vec-Full"
,
task
=
"embedding"
,
trust_remote_code
=
True
,
trust_remote_code
=
True
,
max_model_len
=
4096
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
max_num_seqs
=
2
,
...
...
examples/openai_api_client_for_multimodal.py
View file @
051eaf6d
...
@@ -7,8 +7,8 @@ Launch the vLLM server with the following command:
...
@@ -7,8 +7,8 @@ Launch the vLLM server with the following command:
vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja
vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja
(multi-image inference with Phi-3.5-vision-instruct)
(multi-image inference with Phi-3.5-vision-instruct)
vllm serve microsoft/Phi-3.5-vision-instruct --
max-model-len 4096
\
vllm serve microsoft/Phi-3.5-vision-instruct --
task generate
\
--trust-remote-code --limit-mm-per-prompt image=2
--trust-remote-code
--max-model-len 4096
--limit-mm-per-prompt image=2
(audio inference with Ultravox)
(audio inference with Ultravox)
vllm serve fixie-ai/ultravox-v0_3 --max-model-len 4096
vllm serve fixie-ai/ultravox-v0_3 --max-model-len 4096
...
...
tests/conftest.py
View file @
051eaf6d
...
@@ -25,7 +25,7 @@ from tests.models.utils import (TokensTextLogprobs,
...
@@ -25,7 +25,7 @@ from tests.models.utils import (TokensTextLogprobs,
from
vllm
import
LLM
,
SamplingParams
from
vllm
import
LLM
,
SamplingParams
from
vllm.assets.image
import
ImageAsset
from
vllm.assets.image
import
ImageAsset
from
vllm.assets.video
import
VideoAsset
from
vllm.assets.video
import
VideoAsset
from
vllm.config
import
TokenizerPoolConfig
from
vllm.config
import
TaskOption
,
TokenizerPoolConfig
from
vllm.connections
import
global_http_connection
from
vllm.connections
import
global_http_connection
from
vllm.distributed
import
(
destroy_distributed_environment
,
from
vllm.distributed
import
(
destroy_distributed_environment
,
destroy_model_parallel
,
destroy_model_parallel
,
...
@@ -619,6 +619,7 @@ class VllmRunner:
...
@@ -619,6 +619,7 @@ class VllmRunner:
def
__init__
(
def
__init__
(
self
,
self
,
model_name
:
str
,
model_name
:
str
,
task
:
TaskOption
=
"auto"
,
tokenizer_name
:
Optional
[
str
]
=
None
,
tokenizer_name
:
Optional
[
str
]
=
None
,
# Use smaller max model length, otherwise bigger model cannot run due
# Use smaller max model length, otherwise bigger model cannot run due
# to kv cache size limit.
# to kv cache size limit.
...
@@ -634,6 +635,7 @@ class VllmRunner:
...
@@ -634,6 +635,7 @@ class VllmRunner:
)
->
None
:
)
->
None
:
self
.
model
=
LLM
(
self
.
model
=
LLM
(
model
=
model_name
,
model
=
model_name
,
task
=
task
,
tokenizer
=
tokenizer_name
,
tokenizer
=
tokenizer_name
,
trust_remote_code
=
True
,
trust_remote_code
=
True
,
dtype
=
dtype
,
dtype
=
dtype
,
...
...
tests/core/test_chunked_prefill_scheduler.py
View file @
051eaf6d
...
@@ -33,7 +33,8 @@ def test_simple():
...
@@ -33,7 +33,8 @@ def test_simple():
num_seq_group
=
4
num_seq_group
=
4
max_model_len
=
16
max_model_len
=
16
max_num_batched_tokens
=
64
max_num_batched_tokens
=
64
scheduler_config
=
SchedulerConfig
(
max_num_batched_tokens
,
scheduler_config
=
SchedulerConfig
(
"generate"
,
max_num_batched_tokens
,
num_seq_group
,
num_seq_group
,
max_model_len
,
max_model_len
,
enable_chunked_prefill
=
True
)
enable_chunked_prefill
=
True
)
...
@@ -78,6 +79,7 @@ def test_chunk():
...
@@ -78,6 +79,7 @@ def test_chunk():
max_model_len
=
80
max_model_len
=
80
max_num_batched_tokens
=
64
max_num_batched_tokens
=
64
scheduler_config
=
SchedulerConfig
(
scheduler_config
=
SchedulerConfig
(
"generate"
,
max_num_batched_tokens
,
max_num_batched_tokens
,
max_seqs
,
max_seqs
,
max_model_len
,
max_model_len
,
...
@@ -126,6 +128,7 @@ def test_complex():
...
@@ -126,6 +128,7 @@ def test_complex():
max_model_len
=
80
max_model_len
=
80
max_num_batched_tokens
=
64
max_num_batched_tokens
=
64
scheduler_config
=
SchedulerConfig
(
scheduler_config
=
SchedulerConfig
(
"generate"
,
max_num_batched_tokens
,
max_num_batched_tokens
,
max_seqs
,
max_seqs
,
max_model_len
,
max_model_len
,
...
@@ -196,6 +199,7 @@ def test_maximal_decoding():
...
@@ -196,6 +199,7 @@ def test_maximal_decoding():
max_model_len
=
8
max_model_len
=
8
max_num_batched_tokens
=
2
max_num_batched_tokens
=
2
scheduler_config
=
SchedulerConfig
(
scheduler_config
=
SchedulerConfig
(
"generate"
,
max_num_batched_tokens
,
max_num_batched_tokens
,
max_seqs
,
max_seqs
,
max_model_len
,
max_model_len
,
...
@@ -289,6 +293,7 @@ def test_prompt_limit():
...
@@ -289,6 +293,7 @@ def test_prompt_limit():
max_model_len
=
64
max_model_len
=
64
max_num_batched_tokens
=
32
max_num_batched_tokens
=
32
scheduler_config
=
SchedulerConfig
(
scheduler_config
=
SchedulerConfig
(
"generate"
,
max_num_batched_tokens
,
max_num_batched_tokens
,
max_seqs
,
max_seqs
,
max_model_len
,
max_model_len
,
...
@@ -321,7 +326,8 @@ def test_prompt_limit_exceed():
...
@@ -321,7 +326,8 @@ def test_prompt_limit_exceed():
max_seqs
=
64
max_seqs
=
64
max_model_len
=
32
max_model_len
=
32
max_num_batched_tokens
=
64
max_num_batched_tokens
=
64
scheduler_config
=
SchedulerConfig
(
max_num_batched_tokens
,
scheduler_config
=
SchedulerConfig
(
"generate"
,
max_num_batched_tokens
,
max_seqs
,
max_seqs
,
max_model_len
,
max_model_len
,
enable_chunked_prefill
=
True
)
enable_chunked_prefill
=
True
)
...
@@ -348,6 +354,7 @@ def test_swap():
...
@@ -348,6 +354,7 @@ def test_swap():
max_model_len
=
200
max_model_len
=
200
max_num_batched_tokens
=
30
max_num_batched_tokens
=
30
scheduler_config
=
SchedulerConfig
(
scheduler_config
=
SchedulerConfig
(
"generate"
,
max_num_batched_tokens
,
max_num_batched_tokens
,
max_seqs
,
max_seqs
,
max_model_len
,
max_model_len
,
...
@@ -404,6 +411,7 @@ def test_running_prefill_prioritized_over_swap():
...
@@ -404,6 +411,7 @@ def test_running_prefill_prioritized_over_swap():
max_model_len
=
200
max_model_len
=
200
max_num_batched_tokens
=
30
max_num_batched_tokens
=
30
scheduler_config
=
SchedulerConfig
(
scheduler_config
=
SchedulerConfig
(
"generate"
,
max_num_batched_tokens
,
max_num_batched_tokens
,
max_seqs
,
max_seqs
,
max_model_len
,
max_model_len
,
...
@@ -498,6 +506,7 @@ def test_chunked_prefill_preempt():
...
@@ -498,6 +506,7 @@ def test_chunked_prefill_preempt():
max_model_len
=
200
max_model_len
=
200
max_num_batched_tokens
=
30
max_num_batched_tokens
=
30
scheduler_config
=
SchedulerConfig
(
scheduler_config
=
SchedulerConfig
(
"generate"
,
max_num_batched_tokens
,
max_num_batched_tokens
,
max_seqs
,
max_seqs
,
max_model_len
,
max_model_len
,
...
@@ -563,6 +572,7 @@ def test_chunked_prefill_max_seqs():
...
@@ -563,6 +572,7 @@ def test_chunked_prefill_max_seqs():
max_model_len
=
80
max_model_len
=
80
max_num_batched_tokens
=
64
max_num_batched_tokens
=
64
scheduler_config
=
SchedulerConfig
(
scheduler_config
=
SchedulerConfig
(
"generate"
,
max_num_batched_tokens
,
max_num_batched_tokens
,
max_seqs
,
max_seqs
,
max_model_len
,
max_model_len
,
...
@@ -617,6 +627,7 @@ def test_perfix_caching():
...
@@ -617,6 +627,7 @@ def test_perfix_caching():
max_model_len
=
80
max_model_len
=
80
max_num_batched_tokens
=
64
max_num_batched_tokens
=
64
scheduler_config
=
SchedulerConfig
(
scheduler_config
=
SchedulerConfig
(
"generate"
,
max_num_batched_tokens
,
max_num_batched_tokens
,
max_seqs
,
max_seqs
,
max_model_len
,
max_model_len
,
...
...
tests/core/test_scheduler.py
View file @
051eaf6d
...
@@ -20,9 +20,10 @@ from .utils import (append_new_token, append_new_token_seq_group,
...
@@ -20,9 +20,10 @@ from .utils import (append_new_token, append_new_token_seq_group,
def
test_scheduler_add_seq_group
():
def
test_scheduler_add_seq_group
():
block_size
=
4
block_size
=
4
scheduler_config
=
SchedulerConfig
(
scheduler_config
=
SchedulerConfig
(
100
,
"generate"
,
64
,
max_num_batched_tokens
=
100
,
1
,
max_num_seqs
=
64
,
max_model_len
=
1
,
)
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
cache_dtype
=
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
cache_dtype
=
"auto"
)
cache_config
.
num_cpu_blocks
=
4
cache_config
.
num_cpu_blocks
=
4
...
@@ -42,9 +43,10 @@ def test_scheduler_add_seq_group():
...
@@ -42,9 +43,10 @@ def test_scheduler_add_seq_group():
def
test_scheduler_abort_seq_group
():
def
test_scheduler_abort_seq_group
():
block_size
=
4
block_size
=
4
scheduler_config
=
SchedulerConfig
(
scheduler_config
=
SchedulerConfig
(
100
,
"generate"
,
64
,
max_num_batched_tokens
=
100
,
1
,
max_num_seqs
=
64
,
max_model_len
=
1
,
)
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
4
cache_config
.
num_cpu_blocks
=
4
...
@@ -70,9 +72,10 @@ def test_scheduler_schedule_simple():
...
@@ -70,9 +72,10 @@ def test_scheduler_schedule_simple():
num_seq_group
=
4
num_seq_group
=
4
max_model_len
=
16
max_model_len
=
16
scheduler_config
=
SchedulerConfig
(
scheduler_config
=
SchedulerConfig
(
64
,
"generate"
,
num_seq_group
,
max_num_batched_tokens
=
64
,
max_model_len
,
max_num_seqs
=
num_seq_group
,
max_model_len
=
max_model_len
,
)
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_cpu_blocks
=
8
...
@@ -114,9 +117,10 @@ def test_scheduler_prefill_prioritized():
...
@@ -114,9 +117,10 @@ def test_scheduler_prefill_prioritized():
max_model_len
=
30
max_model_len
=
30
max_batched_num_tokens
=
30
max_batched_num_tokens
=
30
scheduler_config
=
SchedulerConfig
(
scheduler_config
=
SchedulerConfig
(
max_batched_num_tokens
,
"generate"
,
2
,
max_num_batched_tokens
=
max_batched_num_tokens
,
max_model_len
,
max_num_seqs
=
2
,
max_model_len
=
max_model_len
,
)
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
16
cache_config
.
num_cpu_blocks
=
16
...
@@ -145,9 +149,10 @@ def test_scheduler_schedule_preempt_abort():
...
@@ -145,9 +149,10 @@ def test_scheduler_schedule_preempt_abort():
block_size
=
4
block_size
=
4
max_model_len
=
16
max_model_len
=
16
scheduler_config
=
SchedulerConfig
(
scheduler_config
=
SchedulerConfig
(
64
,
"generate"
,
2
,
max_num_batched_tokens
=
64
,
max_model_len
,
max_num_seqs
=
2
,
max_model_len
=
max_model_len
,
)
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
2
cache_config
.
num_cpu_blocks
=
2
...
@@ -204,9 +209,10 @@ def test_scheduler_max_seqs():
...
@@ -204,9 +209,10 @@ def test_scheduler_max_seqs():
max_seq_group
=
2
max_seq_group
=
2
max_model_len
=
16
max_model_len
=
16
scheduler_config
=
SchedulerConfig
(
scheduler_config
=
SchedulerConfig
(
64
,
"generate"
,
max_seq_group
,
max_num_batched_tokens
=
64
,
max_model_len
,
max_num_seqs
=
max_seq_group
,
max_model_len
=
max_model_len
,
)
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_cpu_blocks
=
8
...
@@ -248,9 +254,10 @@ def test_scheduler_max_seqs():
...
@@ -248,9 +254,10 @@ def test_scheduler_max_seqs():
def
test_scheduler_delay_factor
():
def
test_scheduler_delay_factor
():
block_size
=
4
block_size
=
4
scheduler_config
=
SchedulerConfig
(
scheduler_config
=
SchedulerConfig
(
100
,
"generate"
,
64
,
max_num_batched_tokens
=
100
,
16
,
max_num_seqs
=
64
,
max_model_len
=
16
,
delay_factor
=
0.5
,
delay_factor
=
0.5
,
)
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
...
@@ -350,9 +357,10 @@ def initialize_scheduler(
...
@@ -350,9 +357,10 @@ def initialize_scheduler(
):
):
block_size
=
block_size
block_size
=
block_size
scheduler_config
=
SchedulerConfig
(
scheduler_config
=
SchedulerConfig
(
max_token_budget
,
"generate"
,
max_num_seqs
,
max_num_batched_tokens
=
max_token_budget
,
max_model_len
,
max_num_seqs
=
max_num_seqs
,
max_model_len
=
max_model_len
,
)
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
num_cpu_blocks
cache_config
.
num_cpu_blocks
=
num_cpu_blocks
...
...
tests/core/test_scheduler_encoder_decoder.py
View file @
051eaf6d
...
@@ -36,7 +36,12 @@ def test_scheduler_schedule_simple_encoder_decoder():
...
@@ -36,7 +36,12 @@ def test_scheduler_schedule_simple_encoder_decoder():
block_size
=
4
block_size
=
4
num_seq_group
=
4
num_seq_group
=
4
max_model_len
=
16
max_model_len
=
16
scheduler_config
=
SchedulerConfig
(
64
,
num_seq_group
,
max_model_len
)
scheduler_config
=
SchedulerConfig
(
task
=
"generate"
,
max_num_batched_tokens
=
64
,
max_num_seqs
=
num_seq_group
,
max_model_len
=
max_model_len
,
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
16
# enc and dec prompts per seq_group
cache_config
.
num_cpu_blocks
=
16
# enc and dec prompts per seq_group
cache_config
.
num_gpu_blocks
=
16
# enc and dec prompts per seq_group
cache_config
.
num_gpu_blocks
=
16
# enc and dec prompts per seq_group
...
...
tests/distributed/test_pipeline_parallel.py
View file @
051eaf6d
...
@@ -11,6 +11,7 @@ from typing import List, Literal, NamedTuple, Optional
...
@@ -11,6 +11,7 @@ from typing import List, Literal, NamedTuple, Optional
import
pytest
import
pytest
from
vllm.config
import
TaskOption
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
..utils
import
compare_two_settings
,
fork_new_process_for_each_test
from
..utils
import
compare_two_settings
,
fork_new_process_for_each_test
...
@@ -31,6 +32,7 @@ class ParallelSetup(NamedTuple):
...
@@ -31,6 +32,7 @@ class ParallelSetup(NamedTuple):
class
PPTestSettings
:
class
PPTestSettings
:
parallel_setups
:
List
[
ParallelSetup
]
parallel_setups
:
List
[
ParallelSetup
]
distributed_backends
:
List
[
str
]
distributed_backends
:
List
[
str
]
task
:
TaskOption
trust_remote_code
:
bool
trust_remote_code
:
bool
tokenizer_mode
:
Optional
[
str
]
tokenizer_mode
:
Optional
[
str
]
...
@@ -39,6 +41,7 @@ class PPTestSettings:
...
@@ -39,6 +41,7 @@ class PPTestSettings:
*
,
*
,
tp_base
:
int
=
1
,
tp_base
:
int
=
1
,
pp_base
:
int
=
2
,
pp_base
:
int
=
2
,
task
:
TaskOption
=
"auto"
,
trust_remote_code
:
bool
=
False
,
trust_remote_code
:
bool
=
False
,
tokenizer_mode
:
Optional
[
str
]
=
None
,
tokenizer_mode
:
Optional
[
str
]
=
None
,
):
):
...
@@ -66,6 +69,7 @@ class PPTestSettings:
...
@@ -66,6 +69,7 @@ class PPTestSettings:
chunked_prefill
=
False
),
chunked_prefill
=
False
),
],
],
distributed_backends
=
[
"mp"
,
"ray"
],
distributed_backends
=
[
"mp"
,
"ray"
],
task
=
task
,
trust_remote_code
=
trust_remote_code
,
trust_remote_code
=
trust_remote_code
,
tokenizer_mode
=
tokenizer_mode
,
tokenizer_mode
=
tokenizer_mode
,
)
)
...
@@ -75,6 +79,7 @@ class PPTestSettings:
...
@@ -75,6 +79,7 @@ class PPTestSettings:
*
,
*
,
tp_base
:
int
=
1
,
tp_base
:
int
=
1
,
pp_base
:
int
=
2
,
pp_base
:
int
=
2
,
task
:
TaskOption
=
"auto"
,
trust_remote_code
:
bool
=
False
,
trust_remote_code
:
bool
=
False
,
tokenizer_mode
:
Optional
[
str
]
=
None
,
tokenizer_mode
:
Optional
[
str
]
=
None
,
):
):
...
@@ -86,6 +91,7 @@ class PPTestSettings:
...
@@ -86,6 +91,7 @@ class PPTestSettings:
chunked_prefill
=
False
),
chunked_prefill
=
False
),
],
],
distributed_backends
=
[
"mp"
],
distributed_backends
=
[
"mp"
],
task
=
task
,
trust_remote_code
=
trust_remote_code
,
trust_remote_code
=
trust_remote_code
,
tokenizer_mode
=
tokenizer_mode
,
tokenizer_mode
=
tokenizer_mode
,
)
)
...
@@ -94,7 +100,7 @@ class PPTestSettings:
...
@@ -94,7 +100,7 @@ class PPTestSettings:
for
parallel_setup
in
self
.
parallel_setups
:
for
parallel_setup
in
self
.
parallel_setups
:
for
distributed_backend
in
self
.
distributed_backends
:
for
distributed_backend
in
self
.
distributed_backends
:
yield
(
model_name
,
parallel_setup
,
distributed_backend
,
yield
(
model_name
,
parallel_setup
,
distributed_backend
,
self
.
trust_remote_code
,
self
.
tokenizer_mode
)
self
.
task
,
self
.
trust_remote_code
,
self
.
tokenizer_mode
)
# NOTE: You can adjust tp_base and/or pp_base locally to fit the model in GPU
# NOTE: You can adjust tp_base and/or pp_base locally to fit the model in GPU
...
@@ -213,6 +219,7 @@ def _compare_tp(
...
@@ -213,6 +219,7 @@ def _compare_tp(
model_name
:
str
,
model_name
:
str
,
parallel_setup
:
ParallelSetup
,
parallel_setup
:
ParallelSetup
,
distributed_backend
:
str
,
distributed_backend
:
str
,
task
:
TaskOption
,
trust_remote_code
:
bool
,
trust_remote_code
:
bool
,
tokenizer_mode
:
Optional
[
str
],
tokenizer_mode
:
Optional
[
str
],
num_gpus_available
:
int
,
num_gpus_available
:
int
,
...
@@ -240,6 +247,8 @@ def _compare_tp(
...
@@ -240,6 +247,8 @@ def _compare_tp(
common_args
.
append
(
"--enable-chunked-prefill"
)
common_args
.
append
(
"--enable-chunked-prefill"
)
if
eager_mode
:
if
eager_mode
:
common_args
.
append
(
"--enforce-eager"
)
common_args
.
append
(
"--enforce-eager"
)
if
task
!=
"auto"
:
common_args
.
extend
([
"--task"
,
task
])
if
trust_remote_code
:
if
trust_remote_code
:
common_args
.
append
(
"--trust-remote-code"
)
common_args
.
append
(
"--trust-remote-code"
)
if
tokenizer_mode
:
if
tokenizer_mode
:
...
@@ -297,7 +306,7 @@ def _compare_tp(
...
@@ -297,7 +306,7 @@ def _compare_tp(
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
(
"model_name"
,
"parallel_setup"
,
"distributed_backend"
,
(
"model_name"
,
"parallel_setup"
,
"distributed_backend"
,
"task"
,
"trust_remote_code"
,
"tokenizer_mode"
),
"trust_remote_code"
,
"tokenizer_mode"
),
[
[
params
for
model_name
,
settings
in
GENERATION_MODEL_SETTINGS
.
items
()
params
for
model_name
,
settings
in
GENERATION_MODEL_SETTINGS
.
items
()
...
@@ -310,6 +319,7 @@ def test_tp_language_generation(
...
@@ -310,6 +319,7 @@ def test_tp_language_generation(
model_name
:
str
,
model_name
:
str
,
parallel_setup
:
ParallelSetup
,
parallel_setup
:
ParallelSetup
,
distributed_backend
:
str
,
distributed_backend
:
str
,
task
:
TaskOption
,
trust_remote_code
:
bool
,
trust_remote_code
:
bool
,
tokenizer_mode
:
Optional
[
str
],
tokenizer_mode
:
Optional
[
str
],
num_gpus_available
,
num_gpus_available
,
...
@@ -317,6 +327,7 @@ def test_tp_language_generation(
...
@@ -317,6 +327,7 @@ def test_tp_language_generation(
_compare_tp
(
model_name
,
_compare_tp
(
model_name
,
parallel_setup
,
parallel_setup
,
distributed_backend
,
distributed_backend
,
task
,
trust_remote_code
,
trust_remote_code
,
tokenizer_mode
,
tokenizer_mode
,
num_gpus_available
,
num_gpus_available
,
...
@@ -324,7 +335,7 @@ def test_tp_language_generation(
...
@@ -324,7 +335,7 @@ def test_tp_language_generation(
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
(
"model_name"
,
"parallel_setup"
,
"distributed_backend"
,
(
"model_name"
,
"parallel_setup"
,
"distributed_backend"
,
"task"
,
"trust_remote_code"
,
"tokenizer_mode"
),
"trust_remote_code"
,
"tokenizer_mode"
),
[
[
params
for
model_name
,
settings
in
EMBEDDING_MODEL_SETTINGS
.
items
()
params
for
model_name
,
settings
in
EMBEDDING_MODEL_SETTINGS
.
items
()
...
@@ -337,6 +348,7 @@ def test_tp_language_embedding(
...
@@ -337,6 +348,7 @@ def test_tp_language_embedding(
model_name
:
str
,
model_name
:
str
,
parallel_setup
:
ParallelSetup
,
parallel_setup
:
ParallelSetup
,
distributed_backend
:
str
,
distributed_backend
:
str
,
task
:
TaskOption
,
trust_remote_code
:
bool
,
trust_remote_code
:
bool
,
tokenizer_mode
:
Optional
[
str
],
tokenizer_mode
:
Optional
[
str
],
num_gpus_available
,
num_gpus_available
,
...
@@ -344,6 +356,7 @@ def test_tp_language_embedding(
...
@@ -344,6 +356,7 @@ def test_tp_language_embedding(
_compare_tp
(
model_name
,
_compare_tp
(
model_name
,
parallel_setup
,
parallel_setup
,
distributed_backend
,
distributed_backend
,
task
,
trust_remote_code
,
trust_remote_code
,
tokenizer_mode
,
tokenizer_mode
,
num_gpus_available
,
num_gpus_available
,
...
@@ -351,7 +364,7 @@ def test_tp_language_embedding(
...
@@ -351,7 +364,7 @@ def test_tp_language_embedding(
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
(
"model_name"
,
"parallel_setup"
,
"distributed_backend"
,
(
"model_name"
,
"parallel_setup"
,
"distributed_backend"
,
"task"
,
"trust_remote_code"
,
"tokenizer_mode"
),
"trust_remote_code"
,
"tokenizer_mode"
),
[
[
params
for
model_name
,
settings
in
MULTIMODAL_MODEL_SETTINGS
.
items
()
params
for
model_name
,
settings
in
MULTIMODAL_MODEL_SETTINGS
.
items
()
...
@@ -364,6 +377,7 @@ def test_tp_multimodal_generation(
...
@@ -364,6 +377,7 @@ def test_tp_multimodal_generation(
model_name
:
str
,
model_name
:
str
,
parallel_setup
:
ParallelSetup
,
parallel_setup
:
ParallelSetup
,
distributed_backend
:
str
,
distributed_backend
:
str
,
task
:
TaskOption
,
trust_remote_code
:
bool
,
trust_remote_code
:
bool
,
tokenizer_mode
:
Optional
[
str
],
tokenizer_mode
:
Optional
[
str
],
num_gpus_available
,
num_gpus_available
,
...
@@ -371,6 +385,7 @@ def test_tp_multimodal_generation(
...
@@ -371,6 +385,7 @@ def test_tp_multimodal_generation(
_compare_tp
(
model_name
,
_compare_tp
(
model_name
,
parallel_setup
,
parallel_setup
,
distributed_backend
,
distributed_backend
,
task
,
trust_remote_code
,
trust_remote_code
,
tokenizer_mode
,
tokenizer_mode
,
num_gpus_available
,
num_gpus_available
,
...
...
tests/entrypoints/llm/test_chat.py
0 → 100644
View file @
051eaf6d
from
typing
import
List
import
pytest
from
vllm
import
LLM
from
..openai.test_vision
import
TEST_IMAGE_URLS
def
test_chat
():
llm
=
LLM
(
model
=
"meta-llama/Meta-Llama-3-8B-Instruct"
)
prompt1
=
"Explain the concept of entropy."
messages
=
[
{
"role"
:
"system"
,
"content"
:
"You are a helpful assistant"
},
{
"role"
:
"user"
,
"content"
:
prompt1
},
]
outputs
=
llm
.
chat
(
messages
)
assert
len
(
outputs
)
==
1
def
test_multi_chat
():
llm
=
LLM
(
model
=
"meta-llama/Meta-Llama-3-8B-Instruct"
)
prompt1
=
"Explain the concept of entropy."
prompt2
=
"Explain what among us is."
conversation1
=
[
{
"role"
:
"system"
,
"content"
:
"You are a helpful assistant"
},
{
"role"
:
"user"
,
"content"
:
prompt1
},
]
conversation2
=
[
{
"role"
:
"system"
,
"content"
:
"You are a helpful assistant"
},
{
"role"
:
"user"
,
"content"
:
prompt2
},
]
messages
=
[
conversation1
,
conversation2
]
outputs
=
llm
.
chat
(
messages
)
assert
len
(
outputs
)
==
2
@
pytest
.
mark
.
parametrize
(
"image_urls"
,
[[
TEST_IMAGE_URLS
[
0
],
TEST_IMAGE_URLS
[
1
]]])
def
test_chat_multi_image
(
image_urls
:
List
[
str
]):
llm
=
LLM
(
model
=
"microsoft/Phi-3.5-vision-instruct"
,
dtype
=
"bfloat16"
,
max_model_len
=
4096
,
max_num_seqs
=
5
,
enforce_eager
=
True
,
trust_remote_code
=
True
,
limit_mm_per_prompt
=
{
"image"
:
2
},
)
messages
=
[{
"role"
:
"user"
,
"content"
:
[
*
({
"type"
:
"image_url"
,
"image_url"
:
{
"url"
:
image_url
}
}
for
image_url
in
image_urls
),
{
"type"
:
"text"
,
"text"
:
"What's in this image?"
},
],
}]
outputs
=
llm
.
chat
(
messages
)
assert
len
(
outputs
)
>=
0
tests/entrypoints/llm/test_generate.py
View file @
051eaf6d
...
@@ -6,7 +6,6 @@ import pytest
...
@@ -6,7 +6,6 @@ import pytest
from
vllm
import
LLM
,
RequestOutput
,
SamplingParams
from
vllm
import
LLM
,
RequestOutput
,
SamplingParams
from
...conftest
import
cleanup
from
...conftest
import
cleanup
from
..openai.test_vision
import
TEST_IMAGE_URLS
MODEL_NAME
=
"facebook/opt-125m"
MODEL_NAME
=
"facebook/opt-125m"
...
@@ -104,90 +103,3 @@ def test_multiple_sampling_params(llm: LLM):
...
@@ -104,90 +103,3 @@ def test_multiple_sampling_params(llm: LLM):
# sampling_params is None, default params should be applied
# sampling_params is None, default params should be applied
outputs
=
llm
.
generate
(
PROMPTS
,
sampling_params
=
None
)
outputs
=
llm
.
generate
(
PROMPTS
,
sampling_params
=
None
)
assert
len
(
PROMPTS
)
==
len
(
outputs
)
assert
len
(
PROMPTS
)
==
len
(
outputs
)
def
test_chat
():
llm
=
LLM
(
model
=
"meta-llama/Meta-Llama-3-8B-Instruct"
)
prompt1
=
"Explain the concept of entropy."
messages
=
[
{
"role"
:
"system"
,
"content"
:
"You are a helpful assistant"
},
{
"role"
:
"user"
,
"content"
:
prompt1
},
]
outputs
=
llm
.
chat
(
messages
)
assert
len
(
outputs
)
==
1
def
test_multi_chat
():
llm
=
LLM
(
model
=
"meta-llama/Meta-Llama-3-8B-Instruct"
)
prompt1
=
"Explain the concept of entropy."
prompt2
=
"Explain what among us is."
conversation1
=
[
{
"role"
:
"system"
,
"content"
:
"You are a helpful assistant"
},
{
"role"
:
"user"
,
"content"
:
prompt1
},
]
conversation2
=
[
{
"role"
:
"system"
,
"content"
:
"You are a helpful assistant"
},
{
"role"
:
"user"
,
"content"
:
prompt2
},
]
messages
=
[
conversation1
,
conversation2
]
outputs
=
llm
.
chat
(
messages
)
assert
len
(
outputs
)
==
2
@
pytest
.
mark
.
parametrize
(
"image_urls"
,
[[
TEST_IMAGE_URLS
[
0
],
TEST_IMAGE_URLS
[
1
]]])
def
test_chat_multi_image
(
image_urls
:
List
[
str
]):
llm
=
LLM
(
model
=
"microsoft/Phi-3.5-vision-instruct"
,
dtype
=
"bfloat16"
,
max_model_len
=
4096
,
max_num_seqs
=
5
,
enforce_eager
=
True
,
trust_remote_code
=
True
,
limit_mm_per_prompt
=
{
"image"
:
2
},
)
messages
=
[{
"role"
:
"user"
,
"content"
:
[
*
({
"type"
:
"image_url"
,
"image_url"
:
{
"url"
:
image_url
}
}
for
image_url
in
image_urls
),
{
"type"
:
"text"
,
"text"
:
"What's in this image?"
},
],
}]
outputs
=
llm
.
chat
(
messages
)
assert
len
(
outputs
)
>=
0
tests/entrypoints/llm/test_init.py
0 → 100644
View file @
051eaf6d
import
pytest
from
vllm
import
LLM
from
...utils
import
error_on_warning
MODEL_NAME
=
"facebook/opt-125m"
def
test_pos_args_deprecated
():
with
error_on_warning
(
DeprecationWarning
):
LLM
(
model
=
MODEL_NAME
,
tokenizer
=
MODEL_NAME
)
with
error_on_warning
(
DeprecationWarning
):
LLM
(
MODEL_NAME
,
tokenizer
=
MODEL_NAME
)
with
pytest
.
warns
(
DeprecationWarning
,
match
=
"'tokenizer'"
):
LLM
(
MODEL_NAME
,
MODEL_NAME
)
with
pytest
.
warns
(
DeprecationWarning
,
match
=
"'tokenizer', 'tokenizer_mode'"
):
LLM
(
MODEL_NAME
,
MODEL_NAME
,
"auto"
)
tests/entrypoints/openai/test_serving_chat.py
View file @
051eaf6d
...
@@ -22,12 +22,12 @@ class MockHFConfig:
...
@@ -22,12 +22,12 @@ class MockHFConfig:
@
dataclass
@
dataclass
class
MockModelConfig
:
class
MockModelConfig
:
task
=
"generate"
tokenizer
=
MODEL_NAME
tokenizer
=
MODEL_NAME
trust_remote_code
=
False
trust_remote_code
=
False
tokenizer_mode
=
"auto"
tokenizer_mode
=
"auto"
max_model_len
=
100
max_model_len
=
100
tokenizer_revision
=
None
tokenizer_revision
=
None
embedding_mode
=
False
multimodal_config
=
MultiModalConfig
()
multimodal_config
=
MultiModalConfig
()
hf_config
=
MockHFConfig
()
hf_config
=
MockHFConfig
()
...
...
tests/entrypoints/openai/test_vision.py
View file @
051eaf6d
...
@@ -23,6 +23,8 @@ TEST_IMAGE_URLS = [
...
@@ -23,6 +23,8 @@ TEST_IMAGE_URLS = [
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
():
def
server
():
args
=
[
args
=
[
"--task"
,
"generate"
,
"--dtype"
,
"--dtype"
,
"bfloat16"
,
"bfloat16"
,
"--max-model-len"
,
"--max-model-len"
,
...
...
tests/entrypoints/test_chat_utils.py
View file @
051eaf6d
...
@@ -18,7 +18,8 @@ PHI3V_MODEL_ID = "microsoft/Phi-3.5-vision-instruct"
...
@@ -18,7 +18,8 @@ PHI3V_MODEL_ID = "microsoft/Phi-3.5-vision-instruct"
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
phi3v_model_config
():
def
phi3v_model_config
():
return
ModelConfig
(
PHI3V_MODEL_ID
,
return
ModelConfig
(
PHI3V_MODEL_ID
,
PHI3V_MODEL_ID
,
task
=
"generate"
,
tokenizer
=
PHI3V_MODEL_ID
,
tokenizer_mode
=
"auto"
,
tokenizer_mode
=
"auto"
,
trust_remote_code
=
True
,
trust_remote_code
=
True
,
dtype
=
"bfloat16"
,
dtype
=
"bfloat16"
,
...
...
tests/lora/test_worker.py
View file @
051eaf6d
...
@@ -15,7 +15,8 @@ def test_worker_apply_lora(sql_lora_files):
...
@@ -15,7 +15,8 @@ def test_worker_apply_lora(sql_lora_files):
worker
=
Worker
(
worker
=
Worker
(
model_config
=
ModelConfig
(
model_config
=
ModelConfig
(
"meta-llama/Llama-2-7b-hf"
,
"meta-llama/Llama-2-7b-hf"
,
"meta-llama/Llama-2-7b-hf"
,
task
=
"auto"
,
tokenizer
=
"meta-llama/Llama-2-7b-hf"
,
tokenizer_mode
=
"auto"
,
tokenizer_mode
=
"auto"
,
trust_remote_code
=
False
,
trust_remote_code
=
False
,
seed
=
0
,
seed
=
0
,
...
@@ -27,7 +28,7 @@ def test_worker_apply_lora(sql_lora_files):
...
@@ -27,7 +28,7 @@ def test_worker_apply_lora(sql_lora_files):
load_format
=
"dummy"
,
load_format
=
"dummy"
,
),
),
parallel_config
=
ParallelConfig
(
1
,
1
,
False
),
parallel_config
=
ParallelConfig
(
1
,
1
,
False
),
scheduler_config
=
SchedulerConfig
(
32
,
32
,
32
),
scheduler_config
=
SchedulerConfig
(
"generate"
,
32
,
32
,
32
),
device_config
=
DeviceConfig
(
"cuda"
),
device_config
=
DeviceConfig
(
"cuda"
),
cache_config
=
CacheConfig
(
block_size
=
16
,
cache_config
=
CacheConfig
(
block_size
=
16
,
gpu_memory_utilization
=
1.
,
gpu_memory_utilization
=
1.
,
...
...
tests/models/decoder_only/vision_language/test_phi3v.py
View file @
051eaf6d
...
@@ -89,6 +89,7 @@ def run_test(
...
@@ -89,6 +89,7 @@ def run_test(
# max_model_len should be greater than image_feature_size
# max_model_len should be greater than image_feature_size
with
vllm_runner
(
model
,
with
vllm_runner
(
model
,
task
=
"generate"
,
max_model_len
=
4096
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
max_num_seqs
=
2
,
dtype
=
dtype
,
dtype
=
dtype
,
...
...
tests/models/embedding/vision_language/test_phi3v.py
View file @
051eaf6d
...
@@ -28,6 +28,7 @@ def test_models(
...
@@ -28,6 +28,7 @@ def test_models(
# if we run HF first, the cuda initialization will be done and it
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
# will hurt multiprocessing backend with fork method (the default method).
with
vllm_runner
(
model
,
with
vllm_runner
(
model
,
task
=
"embedding"
,
max_model_len
=
4096
,
max_model_len
=
4096
,
max_num_seqs
=
2
,
max_num_seqs
=
2
,
dtype
=
dtype
,
dtype
=
dtype
,
...
...
tests/models/utils.py
View file @
051eaf6d
...
@@ -3,7 +3,7 @@ from typing import Dict, List, Optional, Sequence, Tuple, Union
...
@@ -3,7 +3,7 @@ from typing import Dict, List, Optional, Sequence, Tuple, Union
import
torch
import
torch
from
vllm.config
import
ModelConfig
from
vllm.config
import
ModelConfig
,
TaskOption
from
vllm.inputs
import
InputContext
from
vllm.inputs
import
InputContext
from
vllm.sequence
import
Logprob
,
PromptLogprobs
,
SampleLogprobs
from
vllm.sequence
import
Logprob
,
PromptLogprobs
,
SampleLogprobs
from
vllm.utils
import
is_cpu
from
vllm.utils
import
is_cpu
...
@@ -248,6 +248,7 @@ def check_logprobs_close(
...
@@ -248,6 +248,7 @@ def check_logprobs_close(
def
build_model_context
(
model_name
:
str
,
def
build_model_context
(
model_name
:
str
,
task
:
TaskOption
=
"auto"
,
tokenizer_name
:
Optional
[
str
]
=
None
,
tokenizer_name
:
Optional
[
str
]
=
None
,
trust_remote_code
:
bool
=
False
,
trust_remote_code
:
bool
=
False
,
dtype
:
Optional
[
Union
[
str
,
torch
.
dtype
]]
=
None
,
dtype
:
Optional
[
Union
[
str
,
torch
.
dtype
]]
=
None
,
...
@@ -273,7 +274,8 @@ def build_model_context(model_name: str,
...
@@ -273,7 +274,8 @@ def build_model_context(model_name: str,
model_config
=
ModelConfig
(
model_config
=
ModelConfig
(
model_name
,
model_name
,
tokenizer_name
,
task
=
task
,
tokenizer
=
tokenizer_name
,
tokenizer_mode
=
"auto"
,
tokenizer_mode
=
"auto"
,
trust_remote_code
=
trust_remote_code
,
trust_remote_code
=
trust_remote_code
,
dtype
=
dtype
,
dtype
=
dtype
,
...
...
tests/multimodal/test_mapper.py
View file @
051eaf6d
...
@@ -24,6 +24,7 @@ def test_clip_image_processor(image_assets, mm_registry, dtype, size_factor):
...
@@ -24,6 +24,7 @@ def test_clip_image_processor(image_assets, mm_registry, dtype, size_factor):
model_config
=
ModelConfig
(
model_config
=
ModelConfig
(
model
=
MODEL_NAME
,
model
=
MODEL_NAME
,
task
=
"auto"
,
tokenizer
=
MODEL_NAME
,
tokenizer
=
MODEL_NAME
,
tokenizer_mode
=
"auto"
,
tokenizer_mode
=
"auto"
,
trust_remote_code
=
False
,
trust_remote_code
=
False
,
...
@@ -67,6 +68,7 @@ def test_llava_next_image_processor(image_assets, mm_registry, dtype,
...
@@ -67,6 +68,7 @@ def test_llava_next_image_processor(image_assets, mm_registry, dtype,
model_config
=
ModelConfig
(
model_config
=
ModelConfig
(
model
=
MODEL_NAME
,
model
=
MODEL_NAME
,
task
=
"auto"
,
tokenizer
=
MODEL_NAME
,
tokenizer
=
MODEL_NAME
,
tokenizer_mode
=
"auto"
,
tokenizer_mode
=
"auto"
,
trust_remote_code
=
False
,
trust_remote_code
=
False
,
...
@@ -109,6 +111,7 @@ def test_mm_limits(image_assets, mm_registry, num_images, limit, is_valid):
...
@@ -109,6 +111,7 @@ def test_mm_limits(image_assets, mm_registry, num_images, limit, is_valid):
model_config
=
ModelConfig
(
model_config
=
ModelConfig
(
model
=
MODEL_NAME
,
model
=
MODEL_NAME
,
task
=
"auto"
,
tokenizer
=
MODEL_NAME
,
tokenizer
=
MODEL_NAME
,
tokenizer_mode
=
"auto"
,
tokenizer_mode
=
"auto"
,
trust_remote_code
=
False
,
trust_remote_code
=
False
,
...
@@ -139,6 +142,7 @@ def test_image_mapper_multi(image_assets, mm_registry, num_images):
...
@@ -139,6 +142,7 @@ def test_image_mapper_multi(image_assets, mm_registry, num_images):
model_config
=
ModelConfig
(
model_config
=
ModelConfig
(
model
=
MODEL_NAME
,
model
=
MODEL_NAME
,
task
=
"auto"
,
tokenizer
=
MODEL_NAME
,
tokenizer
=
MODEL_NAME
,
tokenizer_mode
=
"auto"
,
tokenizer_mode
=
"auto"
,
trust_remote_code
=
False
,
trust_remote_code
=
False
,
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment