Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
9ae2f603
Unverified
Commit
9ae2f603
authored
Dec 04, 2025
by
Cyrus Leung
Committed by
GitHub
Dec 04, 2025
Browse files
[Misc] Various cleanups for MM input processing (#29970)
Signed-off-by:
DarkLight1337
<
tlleungac@connect.ust.hk
>
parent
80f8af4b
Changes
14
Show whitespace changes
Inline
Side-by-side
Showing
14 changed files
with
67 additions
and
225 deletions
+67
-225
docs/features/multimodal_inputs.md
docs/features/multimodal_inputs.md
+3
-5
examples/online_serving/prompt_embed_inference_with_openai_client.py
...line_serving/prompt_embed_inference_with_openai_client.py
+3
-9
tests/entrypoints/openai/test_vision_embeds.py
tests/entrypoints/openai/test_vision_embeds.py
+31
-44
tests/entrypoints/test_chat_utils.py
tests/entrypoints/test_chat_utils.py
+4
-123
tests/v1/entrypoints/openai/test_completion_with_image_embeds.py
...1/entrypoints/openai/test_completion_with_image_embeds.py
+2
-15
vllm/entrypoints/chat_utils.py
vllm/entrypoints/chat_utils.py
+5
-8
vllm/entrypoints/llm.py
vllm/entrypoints/llm.py
+0
-1
vllm/entrypoints/openai/serving_engine.py
vllm/entrypoints/openai/serving_engine.py
+0
-6
vllm/entrypoints/score_utils.py
vllm/entrypoints/score_utils.py
+1
-4
vllm/model_executor/models/hunyuan_vision.py
vllm/model_executor/models/hunyuan_vision.py
+2
-1
vllm/model_executor/models/keye.py
vllm/model_executor/models/keye.py
+2
-2
vllm/model_executor/models/keye_vl1_5.py
vllm/model_executor/models/keye_vl1_5.py
+2
-2
vllm/multimodal/audio.py
vllm/multimodal/audio.py
+2
-5
vllm/utils/serial_utils.py
vllm/utils/serial_utils.py
+10
-0
No files found.
docs/features/multimodal_inputs.md
View file @
9ae2f603
...
@@ -795,14 +795,12 @@ The following example demonstrates how to pass image embeddings to the OpenAI se
...
@@ -795,14 +795,12 @@ The following example demonstrates how to pass image embeddings to the OpenAI se
??? code
??? code
```python
```python
from vllm.utils.serial_utils import tensor2base64
image_embedding = torch.load(...)
image_embedding = torch.load(...)
grid_thw = torch.load(...) # Required by Qwen/Qwen2-VL-2B-Instruct
grid_thw = torch.load(...) # Required by Qwen/Qwen2-VL-2B-Instruct
buffer = io.BytesIO()
base64_image_embedding = tensor2base64(image_embedding)
torch.save(image_embedding, buffer)
buffer.seek(0)
binary_data = buffer.read()
base64_image_embedding = base64.b64encode(binary_data).decode('utf-8')
client = OpenAI(
client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY")
# defaults to os.environ.get("OPENAI_API_KEY")
...
...
examples/online_serving/prompt_embed_inference_with_openai_client.py
View file @
9ae2f603
...
@@ -28,13 +28,11 @@ Dependencies:
...
@@ -28,13 +28,11 @@ Dependencies:
- openai
- openai
"""
"""
import
base64
import
io
import
torch
import
transformers
import
transformers
from
openai
import
OpenAI
from
openai
import
OpenAI
from
vllm.utils.serial_utils
import
tensor2base64
def
main
():
def
main
():
client
=
OpenAI
(
client
=
OpenAI
(
...
@@ -58,11 +56,7 @@ def main():
...
@@ -58,11 +56,7 @@ def main():
prompt_embeds
=
embedding_layer
(
token_ids
).
squeeze
(
0
)
prompt_embeds
=
embedding_layer
(
token_ids
).
squeeze
(
0
)
# Prompt embeddings
# Prompt embeddings
buffer
=
io
.
BytesIO
()
encoded_embeds
=
tensor2base64
(
prompt_embeds
)
torch
.
save
(
prompt_embeds
,
buffer
)
buffer
.
seek
(
0
)
binary_data
=
buffer
.
read
()
encoded_embeds
=
base64
.
b64encode
(
binary_data
).
decode
(
"utf-8"
)
completion
=
client
.
completions
.
create
(
completion
=
client
.
completions
.
create
(
model
=
model_name
,
model
=
model_name
,
...
...
tests/entrypoints/openai/test_vision_embeds.py
View file @
9ae2f603
...
@@ -2,64 +2,47 @@
...
@@ -2,64 +2,47 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
base64
import
base64
import
io
import
numpy
as
np
import
numpy
as
np
import
pytest
import
pytest
import
requests
import
requests
import
torch
import
torch
from
...utils
import
RemoteOpenAIServer
from
vllm.utils.serial_utils
import
tensor2base64
MODEL_NAME
=
"ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"
from
...utils
import
RemoteOpenAIServer
DTYPE
=
"float16"
def
_terratorch_dummy_
inputs
(
model_name
:
str
):
def
_terratorch_dummy_
messages
(
):
pixel_values
=
torch
.
full
((
6
,
512
,
512
),
1.0
,
dtype
=
torch
.
float16
)
pixel_values
=
torch
.
full
((
6
,
512
,
512
),
1.0
,
dtype
=
torch
.
float16
)
location_coords
=
torch
.
full
((
1
,
2
),
1.0
,
dtype
=
torch
.
float16
)
location_coords
=
torch
.
full
((
1
,
2
),
1.0
,
dtype
=
torch
.
float16
)
buffer_tiff
=
io
.
BytesIO
()
return
[
torch
.
save
(
pixel_values
,
buffer_tiff
)
buffer_tiff
.
seek
(
0
)
binary_data
=
buffer_tiff
.
read
()
base64_tensor_embedding
=
base64
.
b64encode
(
binary_data
).
decode
(
"utf-8"
)
buffer_coord
=
io
.
BytesIO
()
torch
.
save
(
location_coords
,
buffer_coord
)
buffer_coord
.
seek
(
0
)
binary_data
=
buffer_coord
.
read
()
base64_coord_embedding
=
base64
.
b64encode
(
binary_data
).
decode
(
"utf-8"
)
return
{
"model"
:
model_name
,
"additional_data"
:
{
"prompt_token_ids"
:
[
1
]},
"encoding_format"
:
"base64"
,
"messages"
:
[
{
{
"role"
:
"user"
,
"role"
:
"user"
,
"content"
:
[
"content"
:
[
{
{
"type"
:
"image_embeds"
,
"type"
:
"image_embeds"
,
"image_embeds"
:
{
"image_embeds"
:
{
"pixel_values"
:
base64_tensor_embedding
,
"pixel_values"
:
tensor2base64
(
pixel_values
)
,
"location_coords"
:
base64_coord_embedding
,
"location_coords"
:
tensor2base64
(
location_coords
)
,
},
},
}
}
],
],
}
}
],
]
}
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
async
def
test_single_request
(
model_name
:
str
):
"model_name"
,
[
"ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"
]
)
def
test_single_request
(
model_name
:
str
):
args
=
[
args
=
[
"--runner"
,
"--runner"
,
"pooling"
,
"pooling"
,
# use half precision for speed and memory savings in CI environment
# use half precision for speed and memory savings in CI environment
"--dtype"
,
"--dtype"
,
DTYPE
,
"float16"
,
"--enforce-eager"
,
"--enforce-eager"
,
"--trust-remote-code"
,
"--trust-remote-code"
,
"--max-num-seqs"
,
"--max-num-seqs"
,
...
@@ -70,11 +53,15 @@ async def test_single_request(model_name: str):
...
@@ -70,11 +53,15 @@ async def test_single_request(model_name: str):
"--enable-mm-embeds"
,
"--enable-mm-embeds"
,
]
]
with
RemoteOpenAIServer
(
MODEL_NAME
,
args
)
as
server
:
with
RemoteOpenAIServer
(
model_name
,
args
)
as
server
:
prompt
=
_terratorch_dummy_inputs
(
model_name
)
response
=
requests
.
post
(
server
.
url_for
(
"pooling"
),
# test single pooling
json
=
{
response
=
requests
.
post
(
server
.
url_for
(
"pooling"
),
json
=
prompt
)
"model"
:
model_name
,
"messages"
:
_terratorch_dummy_messages
(),
"encoding_format"
:
"base64"
,
},
)
response
.
raise_for_status
()
response
.
raise_for_status
()
output
=
response
.
json
()[
"data"
][
0
][
"data"
]
output
=
response
.
json
()[
"data"
][
0
][
"data"
]
...
...
tests/entrypoints/test_chat_utils.py
View file @
9ae2f603
...
@@ -29,6 +29,7 @@ from vllm.multimodal.utils import (
...
@@ -29,6 +29,7 @@ from vllm.multimodal.utils import (
encode_video_base64
,
encode_video_base64
,
)
)
from
vllm.tokenizers
import
MistralTokenizer
,
get_tokenizer
from
vllm.tokenizers
import
MistralTokenizer
,
get_tokenizer
from
vllm.utils.serial_utils
import
tensor2base64
from
..models.registry
import
HF_EXAMPLE_MODELS
from
..models.registry
import
HF_EXAMPLE_MODELS
from
..utils
import
VLLM_PATH
from
..utils
import
VLLM_PATH
...
@@ -85,11 +86,6 @@ def phi3v_model_config_image_embeds():
...
@@ -85,11 +86,6 @@ def phi3v_model_config_image_embeds():
)
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
phi3v_tokenizer
():
return
get_tokenizer
(
PHI3V_MODEL_ID
)
@
pytest
.
fixture
(
scope
=
"function"
)
@
pytest
.
fixture
(
scope
=
"function"
)
def
qwen2_audio_model_config
():
def
qwen2_audio_model_config
():
return
ModelConfig
(
return
ModelConfig
(
...
@@ -115,11 +111,6 @@ def audio_embeds_model_config():
...
@@ -115,11 +111,6 @@ def audio_embeds_model_config():
)
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
qwen2_audio_tokenizer
():
return
get_tokenizer
(
QWEN2AUDIO_MODEL_ID
)
@
pytest
.
fixture
(
scope
=
"function"
)
@
pytest
.
fixture
(
scope
=
"function"
)
def
qwen25omni_model_config_mm_interleaved
():
def
qwen25omni_model_config_mm_interleaved
():
return
ModelConfig
(
return
ModelConfig
(
...
@@ -134,11 +125,6 @@ def qwen25omni_model_config_mm_interleaved():
...
@@ -134,11 +125,6 @@ def qwen25omni_model_config_mm_interleaved():
)
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
qwen25omni_tokenizer
():
return
get_tokenizer
(
QWEN25OMNI_MODEL_ID
)
@
pytest
.
fixture
(
scope
=
"function"
)
@
pytest
.
fixture
(
scope
=
"function"
)
def
mistral_model_config
():
def
mistral_model_config
():
return
ModelConfig
(
return
ModelConfig
(
...
@@ -150,11 +136,6 @@ def mistral_model_config():
...
@@ -150,11 +136,6 @@ def mistral_model_config():
)
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
mistral_tokenizer
():
return
get_tokenizer
(
MISTRAL_MODEL_ID
)
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
image_url
():
def
image_url
():
image
=
ImageAsset
(
"cherry_blossom"
)
image
=
ImageAsset
(
"cherry_blossom"
)
...
@@ -239,7 +220,6 @@ def _assert_mm_data_inputs(
...
@@ -239,7 +220,6 @@ def _assert_mm_data_inputs(
def
test_parse_chat_messages_single_image
(
def
test_parse_chat_messages_single_image
(
phi3v_model_config
,
phi3v_model_config
,
phi3v_tokenizer
,
image_url
,
image_url
,
):
):
conversation
,
mm_data
,
mm_uuids
=
parse_chat_messages
(
conversation
,
mm_data
,
mm_uuids
=
parse_chat_messages
(
...
@@ -253,7 +233,6 @@ def test_parse_chat_messages_single_image(
...
@@ -253,7 +233,6 @@ def test_parse_chat_messages_single_image(
}
}
],
],
phi3v_model_config
,
phi3v_model_config
,
phi3v_tokenizer
,
content_format
=
"string"
,
content_format
=
"string"
,
)
)
...
@@ -266,7 +245,6 @@ def test_parse_chat_messages_single_image(
...
@@ -266,7 +245,6 @@ def test_parse_chat_messages_single_image(
def
test_parse_chat_messages_single_image_with_uuid
(
def
test_parse_chat_messages_single_image_with_uuid
(
phi3v_model_config
,
phi3v_model_config
,
phi3v_tokenizer
,
image_url
,
image_url
,
):
):
image_uuid
=
str
(
hash
(
image_url
))
image_uuid
=
str
(
hash
(
image_url
))
...
@@ -287,7 +265,6 @@ def test_parse_chat_messages_single_image_with_uuid(
...
@@ -287,7 +265,6 @@ def test_parse_chat_messages_single_image_with_uuid(
}
}
],
],
phi3v_model_config
,
phi3v_model_config
,
phi3v_tokenizer
,
content_format
=
"string"
,
content_format
=
"string"
,
)
)
...
@@ -300,7 +277,6 @@ def test_parse_chat_messages_single_image_with_uuid(
...
@@ -300,7 +277,6 @@ def test_parse_chat_messages_single_image_with_uuid(
def
test_parse_chat_messages_single_empty_image_with_uuid
(
def
test_parse_chat_messages_single_empty_image_with_uuid
(
phi3v_model_config
,
phi3v_model_config
,
phi3v_tokenizer
,
image_url
,
image_url
,
):
):
image_uuid
=
str
(
hash
(
image_url
))
image_uuid
=
str
(
hash
(
image_url
))
...
@@ -319,7 +295,6 @@ def test_parse_chat_messages_single_empty_image_with_uuid(
...
@@ -319,7 +295,6 @@ def test_parse_chat_messages_single_empty_image_with_uuid(
}
}
],
],
phi3v_model_config
,
phi3v_model_config
,
phi3v_tokenizer
,
content_format
=
"string"
,
content_format
=
"string"
,
)
)
...
@@ -332,7 +307,6 @@ def test_parse_chat_messages_single_empty_image_with_uuid(
...
@@ -332,7 +307,6 @@ def test_parse_chat_messages_single_empty_image_with_uuid(
def
test_parse_chat_messages_single_image_with_bad_uuid_format
(
def
test_parse_chat_messages_single_image_with_bad_uuid_format
(
phi3v_model_config
,
phi3v_model_config
,
phi3v_tokenizer
,
image_url
,
image_url
,
):
):
image_uuid
=
str
(
hash
(
image_url
))
image_uuid
=
str
(
hash
(
image_url
))
...
@@ -354,7 +328,6 @@ def test_parse_chat_messages_single_image_with_bad_uuid_format(
...
@@ -354,7 +328,6 @@ def test_parse_chat_messages_single_image_with_bad_uuid_format(
}
}
],
],
phi3v_model_config
,
phi3v_model_config
,
phi3v_tokenizer
,
content_format
=
"string"
,
content_format
=
"string"
,
)
)
...
@@ -367,7 +340,6 @@ def test_parse_chat_messages_single_image_with_bad_uuid_format(
...
@@ -367,7 +340,6 @@ def test_parse_chat_messages_single_image_with_bad_uuid_format(
def
test_parse_chat_messages_multiple_images_with_uuids
(
def
test_parse_chat_messages_multiple_images_with_uuids
(
phi3v_model_config
,
phi3v_model_config
,
phi3v_tokenizer
,
image_url
,
image_url
,
):
):
image_uuid1
=
"my_uuid_1"
image_uuid1
=
"my_uuid_1"
...
@@ -397,7 +369,6 @@ def test_parse_chat_messages_multiple_images_with_uuids(
...
@@ -397,7 +369,6 @@ def test_parse_chat_messages_multiple_images_with_uuids(
}
}
],
],
phi3v_model_config
,
phi3v_model_config
,
phi3v_tokenizer
,
content_format
=
"string"
,
content_format
=
"string"
,
)
)
...
@@ -413,7 +384,6 @@ def test_parse_chat_messages_multiple_images_with_uuids(
...
@@ -413,7 +384,6 @@ def test_parse_chat_messages_multiple_images_with_uuids(
def
test_parse_chat_messages_multiple_empty_images_with_uuids
(
def
test_parse_chat_messages_multiple_empty_images_with_uuids
(
phi3v_model_config
,
phi3v_model_config
,
phi3v_tokenizer
,
image_url
,
image_url
,
):
):
image_uuid1
=
"my_uuid_1"
image_uuid1
=
"my_uuid_1"
...
@@ -439,7 +409,6 @@ def test_parse_chat_messages_multiple_empty_images_with_uuids(
...
@@ -439,7 +409,6 @@ def test_parse_chat_messages_multiple_empty_images_with_uuids(
}
}
],
],
phi3v_model_config
,
phi3v_model_config
,
phi3v_tokenizer
,
content_format
=
"string"
,
content_format
=
"string"
,
)
)
...
@@ -455,7 +424,6 @@ def test_parse_chat_messages_multiple_empty_images_with_uuids(
...
@@ -455,7 +424,6 @@ def test_parse_chat_messages_multiple_empty_images_with_uuids(
def
test_parse_chat_messages_mixed_empty_images_with_uuids
(
def
test_parse_chat_messages_mixed_empty_images_with_uuids
(
phi3v_model_config
,
phi3v_model_config
,
phi3v_tokenizer
,
image_url
,
image_url
,
):
):
image_uuid1
=
"my_uuid_1"
image_uuid1
=
"my_uuid_1"
...
@@ -483,7 +451,6 @@ def test_parse_chat_messages_mixed_empty_images_with_uuids(
...
@@ -483,7 +451,6 @@ def test_parse_chat_messages_mixed_empty_images_with_uuids(
}
}
],
],
phi3v_model_config
,
phi3v_model_config
,
phi3v_tokenizer
,
content_format
=
"string"
,
content_format
=
"string"
,
)
)
...
@@ -500,7 +467,6 @@ def test_parse_chat_messages_mixed_empty_images_with_uuids(
...
@@ -500,7 +467,6 @@ def test_parse_chat_messages_mixed_empty_images_with_uuids(
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
async
def
test_parse_chat_messages_single_image_with_uuid_async
(
async
def
test_parse_chat_messages_single_image_with_uuid_async
(
phi3v_model_config
,
phi3v_model_config
,
phi3v_tokenizer
,
image_url
,
image_url
,
):
):
image_uuid
=
str
(
hash
(
image_url
))
image_uuid
=
str
(
hash
(
image_url
))
...
@@ -519,7 +485,6 @@ async def test_parse_chat_messages_single_image_with_uuid_async(
...
@@ -519,7 +485,6 @@ async def test_parse_chat_messages_single_image_with_uuid_async(
}
}
],
],
phi3v_model_config
,
phi3v_model_config
,
phi3v_tokenizer
,
content_format
=
"string"
,
content_format
=
"string"
,
)
)
...
@@ -533,7 +498,6 @@ async def test_parse_chat_messages_single_image_with_uuid_async(
...
@@ -533,7 +498,6 @@ async def test_parse_chat_messages_single_image_with_uuid_async(
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
async
def
test_parse_chat_messages_empty_image_with_uuid_async
(
async
def
test_parse_chat_messages_empty_image_with_uuid_async
(
phi3v_model_config
,
phi3v_model_config
,
phi3v_tokenizer
,
image_url
,
image_url
,
):
):
image_uuid
=
str
(
hash
(
image_url
))
image_uuid
=
str
(
hash
(
image_url
))
...
@@ -552,7 +516,6 @@ async def test_parse_chat_messages_empty_image_with_uuid_async(
...
@@ -552,7 +516,6 @@ async def test_parse_chat_messages_empty_image_with_uuid_async(
}
}
],
],
phi3v_model_config
,
phi3v_model_config
,
phi3v_tokenizer
,
content_format
=
"string"
,
content_format
=
"string"
,
)
)
...
@@ -566,7 +529,6 @@ async def test_parse_chat_messages_empty_image_with_uuid_async(
...
@@ -566,7 +529,6 @@ async def test_parse_chat_messages_empty_image_with_uuid_async(
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
async
def
test_parse_chat_messages_multiple_images_with_uuids_async
(
async
def
test_parse_chat_messages_multiple_images_with_uuids_async
(
phi3v_model_config
,
phi3v_model_config
,
phi3v_tokenizer
,
image_url
,
image_url
,
):
):
image_uuid1
=
"my_uuid_1"
image_uuid1
=
"my_uuid_1"
...
@@ -592,7 +554,6 @@ async def test_parse_chat_messages_multiple_images_with_uuids_async(
...
@@ -592,7 +554,6 @@ async def test_parse_chat_messages_multiple_images_with_uuids_async(
}
}
],
],
phi3v_model_config
,
phi3v_model_config
,
phi3v_tokenizer
,
content_format
=
"string"
,
content_format
=
"string"
,
)
)
...
@@ -609,7 +570,6 @@ async def test_parse_chat_messages_multiple_images_with_uuids_async(
...
@@ -609,7 +570,6 @@ async def test_parse_chat_messages_multiple_images_with_uuids_async(
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
async
def
test_parse_chat_messages_multiple_empty_images_with_uuids_async
(
async
def
test_parse_chat_messages_multiple_empty_images_with_uuids_async
(
phi3v_model_config
,
phi3v_model_config
,
phi3v_tokenizer
,
image_url
,
image_url
,
):
):
image_uuid1
=
"my_uuid_1"
image_uuid1
=
"my_uuid_1"
...
@@ -635,7 +595,6 @@ async def test_parse_chat_messages_multiple_empty_images_with_uuids_async(
...
@@ -635,7 +595,6 @@ async def test_parse_chat_messages_multiple_empty_images_with_uuids_async(
}
}
],
],
phi3v_model_config
,
phi3v_model_config
,
phi3v_tokenizer
,
content_format
=
"string"
,
content_format
=
"string"
,
)
)
...
@@ -652,7 +611,6 @@ async def test_parse_chat_messages_multiple_empty_images_with_uuids_async(
...
@@ -652,7 +611,6 @@ async def test_parse_chat_messages_multiple_empty_images_with_uuids_async(
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
async
def
test_parse_chat_messages_multiple_images_with_partial_uuids_async
(
async
def
test_parse_chat_messages_multiple_images_with_partial_uuids_async
(
phi3v_model_config
,
phi3v_model_config
,
phi3v_tokenizer
,
image_url
,
image_url
,
):
):
image_uuid2
=
"my_uuid_2"
image_uuid2
=
"my_uuid_2"
...
@@ -676,7 +634,6 @@ async def test_parse_chat_messages_multiple_images_with_partial_uuids_async(
...
@@ -676,7 +634,6 @@ async def test_parse_chat_messages_multiple_images_with_partial_uuids_async(
}
}
],
],
phi3v_model_config
,
phi3v_model_config
,
phi3v_tokenizer
,
content_format
=
"string"
,
content_format
=
"string"
,
)
)
...
@@ -692,7 +649,6 @@ async def test_parse_chat_messages_multiple_images_with_partial_uuids_async(
...
@@ -692,7 +649,6 @@ async def test_parse_chat_messages_multiple_images_with_partial_uuids_async(
def
test_parse_chat_messages_empty_system
(
def
test_parse_chat_messages_empty_system
(
mistral_model_config
,
mistral_model_config
,
mistral_tokenizer
,
):
):
# Test string format
# Test string format
conversation
,
_
,
_
=
parse_chat_messages
(
conversation
,
_
,
_
=
parse_chat_messages
(
...
@@ -704,7 +660,6 @@ def test_parse_chat_messages_empty_system(
...
@@ -704,7 +660,6 @@ def test_parse_chat_messages_empty_system(
},
},
],
],
mistral_model_config
,
mistral_model_config
,
mistral_tokenizer
,
content_format
=
"string"
,
content_format
=
"string"
,
)
)
assert
conversation
==
[
assert
conversation
==
[
...
@@ -722,7 +677,6 @@ def test_parse_chat_messages_empty_system(
...
@@ -722,7 +677,6 @@ def test_parse_chat_messages_empty_system(
},
},
],
],
mistral_model_config
,
mistral_model_config
,
mistral_tokenizer
,
content_format
=
"openai"
,
content_format
=
"openai"
,
)
)
assert
conversation
==
[
assert
conversation
==
[
...
@@ -734,7 +688,6 @@ def test_parse_chat_messages_empty_system(
...
@@ -734,7 +688,6 @@ def test_parse_chat_messages_empty_system(
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
async
def
test_parse_chat_messages_single_image_async
(
async
def
test_parse_chat_messages_single_image_async
(
phi3v_model_config
,
phi3v_model_config
,
phi3v_tokenizer
,
image_url
,
image_url
,
):
):
conversation
,
mm_future
,
mm_uuids
=
parse_chat_messages_futures
(
conversation
,
mm_future
,
mm_uuids
=
parse_chat_messages_futures
(
...
@@ -748,7 +701,6 @@ async def test_parse_chat_messages_single_image_async(
...
@@ -748,7 +701,6 @@ async def test_parse_chat_messages_single_image_async(
}
}
],
],
phi3v_model_config
,
phi3v_model_config
,
phi3v_tokenizer
,
content_format
=
"string"
,
content_format
=
"string"
,
)
)
...
@@ -761,7 +713,6 @@ async def test_parse_chat_messages_single_image_async(
...
@@ -761,7 +713,6 @@ async def test_parse_chat_messages_single_image_async(
def
test_parse_chat_messages_multiple_images
(
def
test_parse_chat_messages_multiple_images
(
phi3v_model_config
,
phi3v_model_config
,
phi3v_tokenizer
,
image_url
,
image_url
,
):
):
conversation
,
mm_data
,
mm_uuids
=
parse_chat_messages
(
conversation
,
mm_data
,
mm_uuids
=
parse_chat_messages
(
...
@@ -779,7 +730,6 @@ def test_parse_chat_messages_multiple_images(
...
@@ -779,7 +730,6 @@ def test_parse_chat_messages_multiple_images(
}
}
],
],
phi3v_model_config
,
phi3v_model_config
,
phi3v_tokenizer
,
content_format
=
"string"
,
content_format
=
"string"
,
)
)
...
@@ -795,7 +745,6 @@ def test_parse_chat_messages_multiple_images(
...
@@ -795,7 +745,6 @@ def test_parse_chat_messages_multiple_images(
def
test_parse_chat_messages_empty_pil_image_with_uuid
(
def
test_parse_chat_messages_empty_pil_image_with_uuid
(
phi3v_model_config
,
phi3v_model_config
,
phi3v_tokenizer
,
):
):
uuid
=
"abcd"
uuid
=
"abcd"
conversation
,
mm_data
,
mm_uuids
=
parse_chat_messages
(
conversation
,
mm_data
,
mm_uuids
=
parse_chat_messages
(
...
@@ -809,7 +758,6 @@ def test_parse_chat_messages_empty_pil_image_with_uuid(
...
@@ -809,7 +758,6 @@ def test_parse_chat_messages_empty_pil_image_with_uuid(
}
}
],
],
phi3v_model_config
,
phi3v_model_config
,
phi3v_tokenizer
,
content_format
=
"string"
,
content_format
=
"string"
,
)
)
...
@@ -825,7 +773,6 @@ def test_parse_chat_messages_empty_pil_image_with_uuid(
...
@@ -825,7 +773,6 @@ def test_parse_chat_messages_empty_pil_image_with_uuid(
def
test_parse_chat_messages_empty_image_embeds_with_uuid
(
def
test_parse_chat_messages_empty_image_embeds_with_uuid
(
phi3v_model_config_image_embeds
,
phi3v_model_config_image_embeds
,
phi3v_tokenizer
,
):
):
uuid
=
"abcd"
uuid
=
"abcd"
conversation
,
mm_data
,
mm_uuids
=
parse_chat_messages
(
conversation
,
mm_data
,
mm_uuids
=
parse_chat_messages
(
...
@@ -839,7 +786,6 @@ def test_parse_chat_messages_empty_image_embeds_with_uuid(
...
@@ -839,7 +786,6 @@ def test_parse_chat_messages_empty_image_embeds_with_uuid(
}
}
],
],
phi3v_model_config_image_embeds
,
phi3v_model_config_image_embeds
,
phi3v_tokenizer
,
content_format
=
"string"
,
content_format
=
"string"
,
)
)
...
@@ -857,7 +803,6 @@ def test_parse_chat_messages_empty_image_embeds_with_uuid(
...
@@ -857,7 +803,6 @@ def test_parse_chat_messages_empty_image_embeds_with_uuid(
def
test_parse_chat_messages_empty_audio_embeds_with_uuid
(
def
test_parse_chat_messages_empty_audio_embeds_with_uuid
(
audio_embeds_model_config
,
audio_embeds_model_config
,
qwen2_audio_tokenizer
,
):
):
"""Test audio_embeds with UUID (no actual embeds data)."""
"""Test audio_embeds with UUID (no actual embeds data)."""
uuid
=
"test-audio-uuid-123"
uuid
=
"test-audio-uuid-123"
...
@@ -873,7 +818,6 @@ def test_parse_chat_messages_empty_audio_embeds_with_uuid(
...
@@ -873,7 +818,6 @@ def test_parse_chat_messages_empty_audio_embeds_with_uuid(
}
}
],
],
audio_embeds_model_config
,
audio_embeds_model_config
,
qwen2_audio_tokenizer
,
content_format
=
"string"
,
content_format
=
"string"
,
)
)
...
@@ -889,11 +833,8 @@ def test_parse_chat_messages_empty_audio_embeds_with_uuid(
...
@@ -889,11 +833,8 @@ def test_parse_chat_messages_empty_audio_embeds_with_uuid(
def
test_parse_chat_messages_audio_embeds_with_string
(
def
test_parse_chat_messages_audio_embeds_with_string
(
audio_embeds_model_config
,
audio_embeds_model_config
,
qwen2_audio_tokenizer
,
):
):
"""Test audio_embeds with base64 string embedding data."""
"""Test audio_embeds with base64 string embedding data."""
import
base64
import
io
import
torch
import
torch
...
@@ -901,11 +842,7 @@ def test_parse_chat_messages_audio_embeds_with_string(
...
@@ -901,11 +842,7 @@ def test_parse_chat_messages_audio_embeds_with_string(
audio_embedding
=
torch
.
randn
(
1
,
128
,
768
)
audio_embedding
=
torch
.
randn
(
1
,
128
,
768
)
# Encode it as base64
# Encode it as base64
buffer
=
io
.
BytesIO
()
base64_audio_embedding
=
tensor2base64
(
audio_embedding
)
torch
.
save
(
audio_embedding
,
buffer
)
buffer
.
seek
(
0
)
binary_data
=
buffer
.
read
()
base64_audio_embedding
=
base64
.
b64encode
(
binary_data
).
decode
(
"utf-8"
)
conversation
,
mm_data
,
mm_uuids
=
parse_chat_messages
(
conversation
,
mm_data
,
mm_uuids
=
parse_chat_messages
(
[
[
...
@@ -921,7 +858,6 @@ def test_parse_chat_messages_audio_embeds_with_string(
...
@@ -921,7 +858,6 @@ def test_parse_chat_messages_audio_embeds_with_string(
}
}
],
],
audio_embeds_model_config
,
audio_embeds_model_config
,
qwen2_audio_tokenizer
,
content_format
=
"string"
,
content_format
=
"string"
,
)
)
...
@@ -939,11 +875,8 @@ def test_parse_chat_messages_audio_embeds_with_string(
...
@@ -939,11 +875,8 @@ def test_parse_chat_messages_audio_embeds_with_string(
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
async
def
test_parse_chat_messages_audio_embeds_async
(
async
def
test_parse_chat_messages_audio_embeds_async
(
audio_embeds_model_config
,
audio_embeds_model_config
,
qwen2_audio_tokenizer
,
):
):
"""Test audio_embeds with async futures."""
"""Test audio_embeds with async futures."""
import
base64
import
io
import
torch
import
torch
...
@@ -951,11 +884,7 @@ async def test_parse_chat_messages_audio_embeds_async(
...
@@ -951,11 +884,7 @@ async def test_parse_chat_messages_audio_embeds_async(
audio_embedding
=
torch
.
randn
(
1
,
128
,
768
)
audio_embedding
=
torch
.
randn
(
1
,
128
,
768
)
# Encode it as base64
# Encode it as base64
buffer
=
io
.
BytesIO
()
base64_audio_embedding
=
tensor2base64
(
audio_embedding
)
torch
.
save
(
audio_embedding
,
buffer
)
buffer
.
seek
(
0
)
binary_data
=
buffer
.
read
()
base64_audio_embedding
=
base64
.
b64encode
(
binary_data
).
decode
(
"utf-8"
)
conversation
,
mm_future
,
mm_uuids
=
parse_chat_messages_futures
(
conversation
,
mm_future
,
mm_uuids
=
parse_chat_messages_futures
(
[
[
...
@@ -971,7 +900,6 @@ async def test_parse_chat_messages_audio_embeds_async(
...
@@ -971,7 +900,6 @@ async def test_parse_chat_messages_audio_embeds_async(
}
}
],
],
audio_embeds_model_config
,
audio_embeds_model_config
,
qwen2_audio_tokenizer
,
content_format
=
"string"
,
content_format
=
"string"
,
)
)
...
@@ -990,7 +918,6 @@ async def test_parse_chat_messages_audio_embeds_async(
...
@@ -990,7 +918,6 @@ async def test_parse_chat_messages_audio_embeds_async(
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
async
def
test_parse_chat_messages_empty_image_embeds_with_uuid_async
(
async
def
test_parse_chat_messages_empty_image_embeds_with_uuid_async
(
phi3v_model_config_image_embeds
,
phi3v_model_config_image_embeds
,
phi3v_tokenizer
,
):
):
uuid
=
"abcd"
uuid
=
"abcd"
conversation
,
mm_future
,
mm_uuids
=
parse_chat_messages_futures
(
conversation
,
mm_future
,
mm_uuids
=
parse_chat_messages_futures
(
...
@@ -1004,7 +931,6 @@ async def test_parse_chat_messages_empty_image_embeds_with_uuid_async(
...
@@ -1004,7 +931,6 @@ async def test_parse_chat_messages_empty_image_embeds_with_uuid_async(
}
}
],
],
phi3v_model_config_image_embeds
,
phi3v_model_config_image_embeds
,
phi3v_tokenizer
,
content_format
=
"string"
,
content_format
=
"string"
,
)
)
...
@@ -1024,7 +950,6 @@ async def test_parse_chat_messages_empty_image_embeds_with_uuid_async(
...
@@ -1024,7 +950,6 @@ async def test_parse_chat_messages_empty_image_embeds_with_uuid_async(
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
async
def
test_parse_chat_messages_multiple_images_async
(
async
def
test_parse_chat_messages_multiple_images_async
(
phi3v_model_config
,
phi3v_model_config
,
phi3v_tokenizer
,
image_url
,
image_url
,
):
):
conversation
,
mm_future
,
mm_uuids
=
parse_chat_messages_futures
(
conversation
,
mm_future
,
mm_uuids
=
parse_chat_messages_futures
(
...
@@ -1042,7 +967,6 @@ async def test_parse_chat_messages_multiple_images_async(
...
@@ -1042,7 +967,6 @@ async def test_parse_chat_messages_multiple_images_async(
}
}
],
],
phi3v_model_config
,
phi3v_model_config
,
phi3v_tokenizer
,
content_format
=
"string"
,
content_format
=
"string"
,
)
)
...
@@ -1058,7 +982,6 @@ async def test_parse_chat_messages_multiple_images_async(
...
@@ -1058,7 +982,6 @@ async def test_parse_chat_messages_multiple_images_async(
def
test_parse_chat_messages_placeholder_already_in_prompt
(
def
test_parse_chat_messages_placeholder_already_in_prompt
(
phi3v_model_config
,
phi3v_model_config
,
phi3v_tokenizer
,
image_url
,
image_url
,
):
):
conversation
,
mm_data
,
mm_uuids
=
parse_chat_messages
(
conversation
,
mm_data
,
mm_uuids
=
parse_chat_messages
(
...
@@ -1076,7 +999,6 @@ def test_parse_chat_messages_placeholder_already_in_prompt(
...
@@ -1076,7 +999,6 @@ def test_parse_chat_messages_placeholder_already_in_prompt(
}
}
],
],
phi3v_model_config
,
phi3v_model_config
,
phi3v_tokenizer
,
content_format
=
"string"
,
content_format
=
"string"
,
)
)
assert
conversation
==
[
assert
conversation
==
[
...
@@ -1091,7 +1013,6 @@ def test_parse_chat_messages_placeholder_already_in_prompt(
...
@@ -1091,7 +1013,6 @@ def test_parse_chat_messages_placeholder_already_in_prompt(
def
test_parse_chat_messages_placeholder_one_already_in_prompt
(
def
test_parse_chat_messages_placeholder_one_already_in_prompt
(
phi3v_model_config
,
phi3v_model_config
,
phi3v_tokenizer
,
image_url
,
image_url
,
):
):
conversation
,
mm_data
,
mm_uuids
=
parse_chat_messages
(
conversation
,
mm_data
,
mm_uuids
=
parse_chat_messages
(
...
@@ -1110,7 +1031,6 @@ def test_parse_chat_messages_placeholder_one_already_in_prompt(
...
@@ -1110,7 +1031,6 @@ def test_parse_chat_messages_placeholder_one_already_in_prompt(
}
}
],
],
phi3v_model_config
,
phi3v_model_config
,
phi3v_tokenizer
,
content_format
=
"string"
,
content_format
=
"string"
,
)
)
...
@@ -1127,7 +1047,6 @@ def test_parse_chat_messages_placeholder_one_already_in_prompt(
...
@@ -1127,7 +1047,6 @@ def test_parse_chat_messages_placeholder_one_already_in_prompt(
def
test_parse_chat_messages_multiple_images_across_messages
(
def
test_parse_chat_messages_multiple_images_across_messages
(
phi3v_model_config
,
phi3v_model_config
,
phi3v_tokenizer
,
image_url
,
image_url
,
):
):
conversation
,
mm_data
,
mm_uuids
=
parse_chat_messages
(
conversation
,
mm_data
,
mm_uuids
=
parse_chat_messages
(
...
@@ -1149,7 +1068,6 @@ def test_parse_chat_messages_multiple_images_across_messages(
...
@@ -1149,7 +1068,6 @@ def test_parse_chat_messages_multiple_images_across_messages(
},
},
],
],
phi3v_model_config
,
phi3v_model_config
,
phi3v_tokenizer
,
content_format
=
"string"
,
content_format
=
"string"
,
)
)
...
@@ -1164,7 +1082,6 @@ def test_parse_chat_messages_multiple_images_across_messages(
...
@@ -1164,7 +1082,6 @@ def test_parse_chat_messages_multiple_images_across_messages(
def
test_parse_chat_messages_multiple_images_with_uuids_across_messages
(
def
test_parse_chat_messages_multiple_images_with_uuids_across_messages
(
phi3v_model_config
,
phi3v_model_config
,
phi3v_tokenizer
,
image_url
,
image_url
,
):
):
image_uuid
=
str
(
hash
(
image_url
))
image_uuid
=
str
(
hash
(
image_url
))
...
@@ -1195,7 +1112,6 @@ def test_parse_chat_messages_multiple_images_with_uuids_across_messages(
...
@@ -1195,7 +1112,6 @@ def test_parse_chat_messages_multiple_images_with_uuids_across_messages(
},
},
],
],
phi3v_model_config
,
phi3v_model_config
,
phi3v_tokenizer
,
content_format
=
"string"
,
content_format
=
"string"
,
)
)
...
@@ -1210,7 +1126,6 @@ def test_parse_chat_messages_multiple_images_with_uuids_across_messages(
...
@@ -1210,7 +1126,6 @@ def test_parse_chat_messages_multiple_images_with_uuids_across_messages(
def
test_parse_chat_messages_context_text_format
(
def
test_parse_chat_messages_context_text_format
(
phi3v_model_config
,
phi3v_model_config
,
phi3v_tokenizer
,
):
):
conversation
,
mm_data
,
mm_uuids
=
parse_chat_messages
(
conversation
,
mm_data
,
mm_uuids
=
parse_chat_messages
(
[
[
...
@@ -1222,7 +1137,6 @@ def test_parse_chat_messages_context_text_format(
...
@@ -1222,7 +1137,6 @@ def test_parse_chat_messages_context_text_format(
{
"role"
:
"user"
,
"content"
:
"What about this one?"
},
{
"role"
:
"user"
,
"content"
:
"What about this one?"
},
],
],
phi3v_model_config
,
phi3v_model_config
,
phi3v_tokenizer
,
content_format
=
"openai"
,
content_format
=
"openai"
,
)
)
...
@@ -1246,7 +1160,6 @@ def test_parse_chat_messages_context_text_format(
...
@@ -1246,7 +1160,6 @@ def test_parse_chat_messages_context_text_format(
def
test_parse_chat_messages_rejects_too_many_images_in_one_message
(
def
test_parse_chat_messages_rejects_too_many_images_in_one_message
(
phi3v_model_config
,
phi3v_model_config
,
phi3v_tokenizer
,
image_url
,
image_url
,
):
):
with
warnings
.
catch_warnings
():
with
warnings
.
catch_warnings
():
...
@@ -1277,14 +1190,12 @@ def test_parse_chat_messages_rejects_too_many_images_in_one_message(
...
@@ -1277,14 +1190,12 @@ def test_parse_chat_messages_rejects_too_many_images_in_one_message(
}
}
],
],
phi3v_model_config
,
phi3v_model_config
,
phi3v_tokenizer
,
content_format
=
"string"
,
content_format
=
"string"
,
)
)
def
test_parse_chat_messages_rejects_too_many_images_across_messages
(
def
test_parse_chat_messages_rejects_too_many_images_across_messages
(
phi3v_model_config
,
phi3v_model_config
,
phi3v_tokenizer
,
image_url
,
image_url
,
):
):
with
warnings
.
catch_warnings
():
with
warnings
.
catch_warnings
():
...
@@ -1322,14 +1233,12 @@ def test_parse_chat_messages_rejects_too_many_images_across_messages(
...
@@ -1322,14 +1233,12 @@ def test_parse_chat_messages_rejects_too_many_images_across_messages(
},
},
],
],
phi3v_model_config
,
phi3v_model_config
,
phi3v_tokenizer
,
content_format
=
"string"
,
content_format
=
"string"
,
)
)
def
test_parse_chat_messages_multiple_images_uncommon_input
(
def
test_parse_chat_messages_multiple_images_uncommon_input
(
phi3v_model_config
,
phi3v_model_config
,
phi3v_tokenizer
,
image_url
,
image_url
,
):
):
conversation
,
mm_data
,
mm_uuids
=
parse_chat_messages
(
conversation
,
mm_data
,
mm_uuids
=
parse_chat_messages
(
...
@@ -1344,7 +1253,6 @@ def test_parse_chat_messages_multiple_images_uncommon_input(
...
@@ -1344,7 +1253,6 @@ def test_parse_chat_messages_multiple_images_uncommon_input(
}
}
],
],
phi3v_model_config
,
phi3v_model_config
,
phi3v_tokenizer
,
content_format
=
"string"
,
content_format
=
"string"
,
)
)
...
@@ -1360,7 +1268,6 @@ def test_parse_chat_messages_multiple_images_uncommon_input(
...
@@ -1360,7 +1268,6 @@ def test_parse_chat_messages_multiple_images_uncommon_input(
def
test_parse_chat_messages_multiple_images_interleave
(
def
test_parse_chat_messages_multiple_images_interleave
(
phi3v_model_config_mm_interleaved
,
phi3v_model_config_mm_interleaved
,
phi3v_tokenizer
,
image_url
,
image_url
,
):
):
conversation
,
mm_data
,
mm_uuids
=
parse_chat_messages
(
conversation
,
mm_data
,
mm_uuids
=
parse_chat_messages
(
...
@@ -1380,7 +1287,6 @@ def test_parse_chat_messages_multiple_images_interleave(
...
@@ -1380,7 +1287,6 @@ def test_parse_chat_messages_multiple_images_interleave(
}
}
],
],
phi3v_model_config_mm_interleaved
,
phi3v_model_config_mm_interleaved
,
phi3v_tokenizer
,
content_format
=
"string"
,
content_format
=
"string"
,
)
)
...
@@ -1398,7 +1304,6 @@ def test_parse_chat_messages_multiple_images_interleave(
...
@@ -1398,7 +1304,6 @@ def test_parse_chat_messages_multiple_images_interleave(
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
async
def
test_parse_chat_messages_multiple_images_interleave_async
(
async
def
test_parse_chat_messages_multiple_images_interleave_async
(
phi3v_model_config_mm_interleaved
,
phi3v_model_config_mm_interleaved
,
phi3v_tokenizer
,
image_url
,
image_url
,
):
):
conversation
,
mm_data
,
mm_uuids
=
parse_chat_messages_futures
(
conversation
,
mm_data
,
mm_uuids
=
parse_chat_messages_futures
(
...
@@ -1418,7 +1323,6 @@ async def test_parse_chat_messages_multiple_images_interleave_async(
...
@@ -1418,7 +1323,6 @@ async def test_parse_chat_messages_multiple_images_interleave_async(
}
}
],
],
phi3v_model_config_mm_interleaved
,
phi3v_model_config_mm_interleaved
,
phi3v_tokenizer
,
content_format
=
"string"
,
content_format
=
"string"
,
)
)
...
@@ -1436,7 +1340,6 @@ async def test_parse_chat_messages_multiple_images_interleave_async(
...
@@ -1436,7 +1340,6 @@ async def test_parse_chat_messages_multiple_images_interleave_async(
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
async
def
test_parse_chat_messages_multiple_images_with_uuids_interleave_async
(
async
def
test_parse_chat_messages_multiple_images_with_uuids_interleave_async
(
phi3v_model_config_mm_interleaved
,
phi3v_model_config_mm_interleaved
,
phi3v_tokenizer
,
image_url
,
image_url
,
):
):
image_uuid
=
str
(
hash
(
image_url
))
image_uuid
=
str
(
hash
(
image_url
))
...
@@ -1465,7 +1368,6 @@ async def test_parse_chat_messages_multiple_images_with_uuids_interleave_async(
...
@@ -1465,7 +1368,6 @@ async def test_parse_chat_messages_multiple_images_with_uuids_interleave_async(
}
}
],
],
phi3v_model_config_mm_interleaved
,
phi3v_model_config_mm_interleaved
,
phi3v_tokenizer
,
content_format
=
"string"
,
content_format
=
"string"
,
)
)
...
@@ -1482,7 +1384,6 @@ async def test_parse_chat_messages_multiple_images_with_uuids_interleave_async(
...
@@ -1482,7 +1384,6 @@ async def test_parse_chat_messages_multiple_images_with_uuids_interleave_async(
def
test_parse_chat_messages_multiple_images_multiple_messages_interleave
(
def
test_parse_chat_messages_multiple_images_multiple_messages_interleave
(
phi3v_model_config_mm_interleaved
,
phi3v_model_config_mm_interleaved
,
phi3v_tokenizer
,
image_url
,
image_url
,
):
):
conversation
,
mm_data
,
mm_uuids
=
parse_chat_messages
(
conversation
,
mm_data
,
mm_uuids
=
parse_chat_messages
(
...
@@ -1505,7 +1406,6 @@ def test_parse_chat_messages_multiple_images_multiple_messages_interleave(
...
@@ -1505,7 +1406,6 @@ def test_parse_chat_messages_multiple_images_multiple_messages_interleave(
},
},
],
],
phi3v_model_config_mm_interleaved
,
phi3v_model_config_mm_interleaved
,
phi3v_tokenizer
,
content_format
=
"string"
,
content_format
=
"string"
,
)
)
...
@@ -1523,7 +1423,6 @@ def test_parse_chat_messages_multiple_images_multiple_messages_interleave(
...
@@ -1523,7 +1423,6 @@ def test_parse_chat_messages_multiple_images_multiple_messages_interleave(
def
test_parse_chat_messages_multiple_images_with_uuids_multiple_messages_interleave
(
def
test_parse_chat_messages_multiple_images_with_uuids_multiple_messages_interleave
(
phi3v_model_config_mm_interleaved
,
phi3v_model_config_mm_interleaved
,
phi3v_tokenizer
,
image_url
,
image_url
,
):
):
image_uuid
=
str
(
hash
(
image_url
))
image_uuid
=
str
(
hash
(
image_url
))
...
@@ -1555,7 +1454,6 @@ def test_parse_chat_messages_multiple_images_with_uuids_multiple_messages_interl
...
@@ -1555,7 +1454,6 @@ def test_parse_chat_messages_multiple_images_with_uuids_multiple_messages_interl
},
},
],
],
phi3v_model_config_mm_interleaved
,
phi3v_model_config_mm_interleaved
,
phi3v_tokenizer
,
content_format
=
"string"
,
content_format
=
"string"
,
)
)
...
@@ -1573,7 +1471,6 @@ def test_parse_chat_messages_multiple_images_with_uuids_multiple_messages_interl
...
@@ -1573,7 +1471,6 @@ def test_parse_chat_messages_multiple_images_with_uuids_multiple_messages_interl
def
test_parse_chat_messages_multiple_modals_multiple_messages_interleave
(
def
test_parse_chat_messages_multiple_modals_multiple_messages_interleave
(
qwen25omni_model_config_mm_interleaved
,
qwen25omni_model_config_mm_interleaved
,
qwen25omni_tokenizer
,
image_url
,
image_url
,
video_url
,
video_url
,
audio_url
,
audio_url
,
...
@@ -1601,7 +1498,6 @@ def test_parse_chat_messages_multiple_modals_multiple_messages_interleave(
...
@@ -1601,7 +1498,6 @@ def test_parse_chat_messages_multiple_modals_multiple_messages_interleave(
},
},
],
],
qwen25omni_model_config_mm_interleaved
,
qwen25omni_model_config_mm_interleaved
,
qwen25omni_tokenizer
,
content_format
=
"string"
,
content_format
=
"string"
,
)
)
...
@@ -1627,7 +1523,6 @@ def test_parse_chat_messages_multiple_modals_multiple_messages_interleave(
...
@@ -1627,7 +1523,6 @@ def test_parse_chat_messages_multiple_modals_multiple_messages_interleave(
def
test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interleave
(
def
test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interleave
(
qwen25omni_model_config_mm_interleaved
,
qwen25omni_model_config_mm_interleaved
,
qwen25omni_tokenizer
,
image_url
,
image_url
,
video_url
,
video_url
,
audio_url
,
audio_url
,
...
@@ -1671,7 +1566,6 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interl
...
@@ -1671,7 +1566,6 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interl
},
},
],
],
qwen25omni_model_config_mm_interleaved
,
qwen25omni_model_config_mm_interleaved
,
qwen25omni_tokenizer
,
content_format
=
"string"
,
content_format
=
"string"
,
)
)
...
@@ -1699,7 +1593,6 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interl
...
@@ -1699,7 +1593,6 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interl
def
test_parse_chat_messages_multiple_modals_with_uuids_multiple_empty_media_messages_interleave
(
# noqa: E501
def
test_parse_chat_messages_multiple_modals_with_uuids_multiple_empty_media_messages_interleave
(
# noqa: E501
qwen25omni_model_config_mm_interleaved
,
qwen25omni_model_config_mm_interleaved
,
qwen25omni_tokenizer
,
image_url
,
image_url
,
video_url
,
video_url
,
audio_url
,
audio_url
,
...
@@ -1743,7 +1636,6 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_empty_media_mes
...
@@ -1743,7 +1636,6 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_empty_media_mes
},
},
],
],
qwen25omni_model_config_mm_interleaved
,
qwen25omni_model_config_mm_interleaved
,
qwen25omni_tokenizer
,
content_format
=
"string"
,
content_format
=
"string"
,
)
)
...
@@ -1775,7 +1667,6 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_empty_media_mes
...
@@ -1775,7 +1667,6 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_empty_media_mes
def
test_parse_chat_messages_multiple_modals_with_partial_uuids_multiple_messages_interleave
(
# noqa: E501
def
test_parse_chat_messages_multiple_modals_with_partial_uuids_multiple_messages_interleave
(
# noqa: E501
qwen25omni_model_config_mm_interleaved
,
qwen25omni_model_config_mm_interleaved
,
qwen25omni_tokenizer
,
image_url
,
image_url
,
video_url
,
video_url
,
audio_url
,
audio_url
,
...
@@ -1811,7 +1702,6 @@ def test_parse_chat_messages_multiple_modals_with_partial_uuids_multiple_message
...
@@ -1811,7 +1702,6 @@ def test_parse_chat_messages_multiple_modals_with_partial_uuids_multiple_message
},
},
],
],
qwen25omni_model_config_mm_interleaved
,
qwen25omni_model_config_mm_interleaved
,
qwen25omni_tokenizer
,
content_format
=
"string"
,
content_format
=
"string"
,
)
)
...
@@ -1837,7 +1727,6 @@ def test_parse_chat_messages_multiple_modals_with_partial_uuids_multiple_message
...
@@ -1837,7 +1727,6 @@ def test_parse_chat_messages_multiple_modals_with_partial_uuids_multiple_message
def
test_parse_chat_messages_multiple_images_interleave_with_placeholders
(
def
test_parse_chat_messages_multiple_images_interleave_with_placeholders
(
phi3v_model_config_mm_interleaved
,
phi3v_model_config_mm_interleaved
,
phi3v_tokenizer
,
image_url
,
image_url
,
):
):
with
pytest
.
raises
(
with
pytest
.
raises
(
...
@@ -1861,7 +1750,6 @@ def test_parse_chat_messages_multiple_images_interleave_with_placeholders(
...
@@ -1861,7 +1750,6 @@ def test_parse_chat_messages_multiple_images_interleave_with_placeholders(
}
}
],
],
phi3v_model_config_mm_interleaved
,
phi3v_model_config_mm_interleaved
,
phi3v_tokenizer
,
content_format
=
"string"
,
content_format
=
"string"
,
)
)
...
@@ -2237,9 +2125,7 @@ def test_resolve_content_format_examples(template_path, expected_format):
...
@@ -2237,9 +2125,7 @@ def test_resolve_content_format_examples(template_path, expected_format):
assert
resolved_format
==
expected_format
assert
resolved_format
==
expected_format
def
test_parse_chat_messages_include_thinking_chunk
(
def
test_parse_chat_messages_include_thinking_chunk
(
mistral_model_config
):
mistral_model_config
,
mistral_tokenizer
):
messages
=
[
messages
=
[
{
{
"role"
:
"system"
,
"role"
:
"system"
,
...
@@ -2269,7 +2155,6 @@ def test_parse_chat_messages_include_thinking_chunk(
...
@@ -2269,7 +2155,6 @@ def test_parse_chat_messages_include_thinking_chunk(
conversation_with_thinking
,
_
,
_
=
parse_chat_messages
(
conversation_with_thinking
,
_
,
_
=
parse_chat_messages
(
messages
,
messages
,
mistral_model_config
,
mistral_model_config
,
mistral_tokenizer
,
content_format
=
"openai"
,
content_format
=
"openai"
,
)
)
...
@@ -2353,7 +2238,6 @@ def test_apply_mistral_chat_template_thinking_chunk():
...
@@ -2353,7 +2238,6 @@ def test_apply_mistral_chat_template_thinking_chunk():
def
test_parse_chat_messages_single_empty_audio_with_uuid
(
def
test_parse_chat_messages_single_empty_audio_with_uuid
(
qwen2_audio_model_config
,
qwen2_audio_model_config
,
qwen2_audio_tokenizer
,
):
):
audio_uuid
=
"abcd"
audio_uuid
=
"abcd"
conversation
,
mm_data
,
mm_uuids
=
parse_chat_messages
(
conversation
,
mm_data
,
mm_uuids
=
parse_chat_messages
(
...
@@ -2371,7 +2255,6 @@ def test_parse_chat_messages_single_empty_audio_with_uuid(
...
@@ -2371,7 +2255,6 @@ def test_parse_chat_messages_single_empty_audio_with_uuid(
}
}
],
],
qwen2_audio_model_config
,
qwen2_audio_model_config
,
qwen2_audio_tokenizer
,
content_format
=
"string"
,
content_format
=
"string"
,
)
)
...
@@ -2389,7 +2272,6 @@ def test_parse_chat_messages_single_empty_audio_with_uuid(
...
@@ -2389,7 +2272,6 @@ def test_parse_chat_messages_single_empty_audio_with_uuid(
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
async
def
test_parse_chat_messages_single_empty_audio_with_uuid_async
(
async
def
test_parse_chat_messages_single_empty_audio_with_uuid_async
(
qwen2_audio_model_config
,
qwen2_audio_model_config
,
qwen2_audio_tokenizer
,
):
):
audio_uuid
=
"abcd"
audio_uuid
=
"abcd"
conversation
,
mm_future
,
mm_uuids
=
parse_chat_messages_futures
(
conversation
,
mm_future
,
mm_uuids
=
parse_chat_messages_futures
(
...
@@ -2407,7 +2289,6 @@ async def test_parse_chat_messages_single_empty_audio_with_uuid_async(
...
@@ -2407,7 +2289,6 @@ async def test_parse_chat_messages_single_empty_audio_with_uuid_async(
}
}
],
],
qwen2_audio_model_config
,
qwen2_audio_model_config
,
qwen2_audio_tokenizer
,
content_format
=
"string"
,
content_format
=
"string"
,
)
)
...
...
tests/v1/entrypoints/openai/test_completion_with_image_embeds.py
View file @
9ae2f603
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
base64
import
io
import
json
import
json
import
openai
# use the official client for correctness check
import
openai
# use the official client for correctness check
...
@@ -13,6 +11,7 @@ from transformers import AutoConfig
...
@@ -13,6 +11,7 @@ from transformers import AutoConfig
from
tests.conftest
import
ImageTestAssets
from
tests.conftest
import
ImageTestAssets
from
tests.utils
import
RemoteOpenAIServer
from
tests.utils
import
RemoteOpenAIServer
from
vllm.utils.serial_utils
import
tensor2base64
# any model with a chat template should work here
# any model with a chat template should work here
MODEL_NAME
=
"llava-hf/llava-1.5-7b-hf"
MODEL_NAME
=
"llava-hf/llava-1.5-7b-hf"
...
@@ -50,18 +49,6 @@ async def client_with_image_embeds(server_with_image_embeds):
...
@@ -50,18 +49,6 @@ async def client_with_image_embeds(server_with_image_embeds):
yield
async_client
yield
async_client
def
encode_image_embedding_to_base64
(
image_embedding
)
->
str
:
"""
Encode image embedding to base64 string
"""
buffer
=
io
.
BytesIO
()
torch
.
save
(
image_embedding
,
buffer
)
buffer
.
seek
(
0
)
binary_data
=
buffer
.
read
()
base64_image_embedding
=
base64
.
b64encode
(
binary_data
).
decode
(
"utf-8"
)
return
base64_image_embedding
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"model_name"
,
[
MODEL_NAME
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
torch
.
half
,
torch
.
float16
,
torch
.
float32
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
torch
.
half
,
torch
.
float16
,
torch
.
float32
])
...
@@ -73,7 +60,7 @@ async def test_completions_with_image_embeds(
...
@@ -73,7 +60,7 @@ async def test_completions_with_image_embeds(
):
):
# Test case: Single image embeds input
# Test case: Single image embeds input
image_embeds
=
image_assets
[
0
].
image_embeds
.
to
(
dtype
=
dtype
)
image_embeds
=
image_assets
[
0
].
image_embeds
.
to
(
dtype
=
dtype
)
base64_image_embedding
=
en
code_image_embedding_to_
base64
(
image_embeds
)
base64_image_embedding
=
t
en
sor2
base64
(
image_embeds
)
chat_completion
=
await
client_with_image_embeds
.
chat
.
completions
.
create
(
chat_completion
=
await
client_with_image_embeds
.
chat
.
completions
.
create
(
messages
=
[
messages
=
[
{
"role"
:
"system"
,
"content"
:
"You are a helpful assistant."
},
{
"role"
:
"system"
,
"content"
:
"You are a helpful assistant."
},
...
...
vllm/entrypoints/chat_utils.py
View file @
9ae2f603
...
@@ -536,7 +536,7 @@ def resolve_hf_chat_template(
...
@@ -536,7 +536,7 @@ def resolve_hf_chat_template(
def
_resolve_chat_template_content_format
(
def
_resolve_chat_template_content_format
(
chat_template
:
str
|
None
,
chat_template
:
str
|
None
,
tools
:
list
[
dict
[
str
,
Any
]]
|
None
,
tools
:
list
[
dict
[
str
,
Any
]]
|
None
,
tokenizer
:
TokenizerLike
,
tokenizer
:
TokenizerLike
|
None
,
*
,
*
,
model_config
:
ModelConfig
,
model_config
:
ModelConfig
,
)
->
_ChatTemplateContentFormat
:
)
->
_ChatTemplateContentFormat
:
...
@@ -593,7 +593,7 @@ def resolve_chat_template_content_format(
...
@@ -593,7 +593,7 @@ def resolve_chat_template_content_format(
chat_template
:
str
|
None
,
chat_template
:
str
|
None
,
tools
:
list
[
dict
[
str
,
Any
]]
|
None
,
tools
:
list
[
dict
[
str
,
Any
]]
|
None
,
given_format
:
ChatTemplateContentFormatOption
,
given_format
:
ChatTemplateContentFormatOption
,
tokenizer
:
TokenizerLike
,
tokenizer
:
TokenizerLike
|
None
,
*
,
*
,
model_config
:
ModelConfig
,
model_config
:
ModelConfig
,
)
->
_ChatTemplateContentFormat
:
)
->
_ChatTemplateContentFormat
:
...
@@ -627,11 +627,10 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
...
@@ -627,11 +627,10 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
maximum per prompt.
maximum per prompt.
"""
"""
def
__init__
(
self
,
model_config
:
ModelConfig
,
tokenizer
:
TokenizerLike
):
def
__init__
(
self
,
model_config
:
ModelConfig
):
super
().
__init__
()
super
().
__init__
()
self
.
_model_config
=
model_config
self
.
_model_config
=
model_config
self
.
_tokenizer
=
tokenizer
self
.
_items_by_modality
=
defaultdict
[
str
,
list
[
_T
|
None
]](
list
)
self
.
_items_by_modality
=
defaultdict
[
str
,
list
[
_T
|
None
]](
list
)
self
.
_uuids_by_modality
=
defaultdict
[
str
,
list
[
str
|
None
]](
list
)
self
.
_uuids_by_modality
=
defaultdict
[
str
,
list
[
str
|
None
]](
list
)
...
@@ -1612,7 +1611,6 @@ def _postprocess_messages(messages: list[ConversationMessage]) -> None:
...
@@ -1612,7 +1611,6 @@ def _postprocess_messages(messages: list[ConversationMessage]) -> None:
def
parse_chat_messages
(
def
parse_chat_messages
(
messages
:
list
[
ChatCompletionMessageParam
],
messages
:
list
[
ChatCompletionMessageParam
],
model_config
:
ModelConfig
,
model_config
:
ModelConfig
,
tokenizer
:
TokenizerLike
,
content_format
:
_ChatTemplateContentFormat
,
content_format
:
_ChatTemplateContentFormat
,
)
->
tuple
[
)
->
tuple
[
list
[
ConversationMessage
],
list
[
ConversationMessage
],
...
@@ -1620,7 +1618,7 @@ def parse_chat_messages(
...
@@ -1620,7 +1618,7 @@ def parse_chat_messages(
MultiModalUUIDDict
|
None
,
MultiModalUUIDDict
|
None
,
]:
]:
conversation
:
list
[
ConversationMessage
]
=
[]
conversation
:
list
[
ConversationMessage
]
=
[]
mm_tracker
=
MultiModalItemTracker
(
model_config
,
tokenizer
)
mm_tracker
=
MultiModalItemTracker
(
model_config
)
for
msg
in
messages
:
for
msg
in
messages
:
sub_messages
=
_parse_chat_message_content
(
sub_messages
=
_parse_chat_message_content
(
...
@@ -1644,7 +1642,6 @@ def parse_chat_messages(
...
@@ -1644,7 +1642,6 @@ def parse_chat_messages(
def
parse_chat_messages_futures
(
def
parse_chat_messages_futures
(
messages
:
list
[
ChatCompletionMessageParam
],
messages
:
list
[
ChatCompletionMessageParam
],
model_config
:
ModelConfig
,
model_config
:
ModelConfig
,
tokenizer
:
TokenizerLike
,
content_format
:
_ChatTemplateContentFormat
,
content_format
:
_ChatTemplateContentFormat
,
)
->
tuple
[
)
->
tuple
[
list
[
ConversationMessage
],
list
[
ConversationMessage
],
...
@@ -1652,7 +1649,7 @@ def parse_chat_messages_futures(
...
@@ -1652,7 +1649,7 @@ def parse_chat_messages_futures(
MultiModalUUIDDict
|
None
,
MultiModalUUIDDict
|
None
,
]:
]:
conversation
:
list
[
ConversationMessage
]
=
[]
conversation
:
list
[
ConversationMessage
]
=
[]
mm_tracker
=
AsyncMultiModalItemTracker
(
model_config
,
tokenizer
)
mm_tracker
=
AsyncMultiModalItemTracker
(
model_config
)
for
msg
in
messages
:
for
msg
in
messages
:
sub_messages
=
_parse_chat_message_content
(
sub_messages
=
_parse_chat_message_content
(
...
...
vllm/entrypoints/llm.py
View file @
9ae2f603
...
@@ -834,7 +834,6 @@ class LLM:
...
@@ -834,7 +834,6 @@ class LLM:
conversation
,
mm_data
,
mm_uuids
=
parse_chat_messages
(
conversation
,
mm_data
,
mm_uuids
=
parse_chat_messages
(
msgs
,
msgs
,
model_config
,
model_config
,
tokenizer
,
content_format
=
resolved_content_format
,
content_format
=
resolved_content_format
,
)
)
...
...
vllm/entrypoints/openai/serving_engine.py
View file @
9ae2f603
...
@@ -1088,11 +1088,6 @@ class OpenAIServing:
...
@@ -1088,11 +1088,6 @@ class OpenAIServing:
Sequence
[
RequestPrompt
],
Sequence
[
RequestPrompt
],
list
[
EngineTokensPrompt
],
list
[
EngineTokensPrompt
],
]:
]:
if
tokenizer
is
None
:
raise
ValueError
(
"Unable to get tokenizer because `skip_tokenizer_init=True`"
)
model_config
=
self
.
model_config
model_config
=
self
.
model_config
resolved_content_format
=
resolve_chat_template_content_format
(
resolved_content_format
=
resolve_chat_template_content_format
(
...
@@ -1105,7 +1100,6 @@ class OpenAIServing:
...
@@ -1105,7 +1100,6 @@ class OpenAIServing:
conversation
,
mm_data_future
,
mm_uuids
=
parse_chat_messages_futures
(
conversation
,
mm_data_future
,
mm_uuids
=
parse_chat_messages_futures
(
messages
,
messages
,
model_config
,
model_config
,
tokenizer
,
content_format
=
resolved_content_format
,
content_format
=
resolved_content_format
,
)
)
...
...
vllm/entrypoints/score_utils.py
View file @
9ae2f603
...
@@ -89,12 +89,10 @@ def parse_score_data(
...
@@ -89,12 +89,10 @@ def parse_score_data(
data_1
:
str
|
ScoreContentPartParam
,
data_1
:
str
|
ScoreContentPartParam
,
data_2
:
str
|
ScoreContentPartParam
,
data_2
:
str
|
ScoreContentPartParam
,
model_config
:
ModelConfig
,
model_config
:
ModelConfig
,
tokenizer
:
TokenizerLike
,
)
->
tuple
[
str
,
str
,
MultiModalDataDict
|
None
]:
)
->
tuple
[
str
,
str
,
MultiModalDataDict
|
None
]:
mm_tracker
=
MultiModalItemTracker
(
model_config
,
tokenizer
)
mm_tracker
=
MultiModalItemTracker
(
model_config
)
content_1
=
_parse_score_content
(
data_1
,
mm_tracker
)
content_1
=
_parse_score_content
(
data_1
,
mm_tracker
)
content_2
=
_parse_score_content
(
data_2
,
mm_tracker
)
content_2
=
_parse_score_content
(
data_2
,
mm_tracker
)
def
ensure_str
(
content
:
_ContentPart
|
None
)
->
str
:
def
ensure_str
(
content
:
_ContentPart
|
None
)
->
str
:
...
@@ -188,7 +186,6 @@ def get_score_prompt(
...
@@ -188,7 +186,6 @@ def get_score_prompt(
data_1
,
data_1
,
data_2
,
data_2
,
model_config
,
model_config
,
tokenizer
,
)
)
from
vllm.model_executor.model_loader
import
get_model_cls
from
vllm.model_executor.model_loader
import
get_model_cls
...
...
vllm/model_executor/models/hunyuan_vision.py
View file @
9ae2f603
...
@@ -62,6 +62,7 @@ from vllm.multimodal.inputs import (
...
@@ -62,6 +62,7 @@ from vllm.multimodal.inputs import (
from
vllm.multimodal.parse
import
(
from
vllm.multimodal.parse
import
(
DictEmbeddingItems
,
DictEmbeddingItems
,
ImageSize
,
ImageSize
,
ModalityDataItems
,
MultiModalDataItems
,
MultiModalDataItems
,
MultiModalDataParser
,
MultiModalDataParser
,
)
)
...
@@ -570,7 +571,7 @@ class HunYuanVLMultiModalDataParser(MultiModalDataParser):
...
@@ -570,7 +571,7 @@ class HunYuanVLMultiModalDataParser(MultiModalDataParser):
def
_parse_image_data
(
def
_parse_image_data
(
self
,
self
,
data
:
dict
[
str
,
torch
.
Tensor
]
|
ModalityData
[
ImageItem
],
data
:
dict
[
str
,
torch
.
Tensor
]
|
ModalityData
[
ImageItem
],
):
)
->
ModalityDataItems
[
Any
,
Any
]
|
None
:
if
isinstance
(
data
,
dict
):
if
isinstance
(
data
,
dict
):
return
DictEmbeddingItems
(
return
DictEmbeddingItems
(
data
,
data
,
...
...
vllm/model_executor/models/keye.py
View file @
9ae2f603
...
@@ -1000,7 +1000,7 @@ class KeyeMultiModalDataParser(MultiModalDataParser):
...
@@ -1000,7 +1000,7 @@ class KeyeMultiModalDataParser(MultiModalDataParser):
def
_parse_image_data
(
def
_parse_image_data
(
self
,
self
,
data
:
dict
[
str
,
torch
.
Tensor
]
|
ModalityData
[
ImageItem
],
data
:
dict
[
str
,
torch
.
Tensor
]
|
ModalityData
[
ImageItem
],
)
->
ModalityDataItems
[
Any
,
Any
]:
)
->
ModalityDataItems
[
Any
,
Any
]
|
None
:
if
isinstance
(
data
,
dict
):
if
isinstance
(
data
,
dict
):
return
DictEmbeddingItems
(
return
DictEmbeddingItems
(
data
,
data
,
...
@@ -1017,7 +1017,7 @@ class KeyeMultiModalDataParser(MultiModalDataParser):
...
@@ -1017,7 +1017,7 @@ class KeyeMultiModalDataParser(MultiModalDataParser):
def
_parse_video_data
(
def
_parse_video_data
(
self
,
self
,
data
:
dict
[
str
,
torch
.
Tensor
]
|
ModalityData
[
VideoItem
],
data
:
dict
[
str
,
torch
.
Tensor
]
|
ModalityData
[
VideoItem
],
)
->
ModalityDataItems
[
Any
,
Any
]:
)
->
ModalityDataItems
[
Any
,
Any
]
|
None
:
if
isinstance
(
data
,
dict
):
if
isinstance
(
data
,
dict
):
return
DictEmbeddingItems
(
return
DictEmbeddingItems
(
data
,
data
,
...
...
vllm/model_executor/models/keye_vl1_5.py
View file @
9ae2f603
...
@@ -333,7 +333,7 @@ class KeyeVL1_5MultiModalDataParser(MultiModalDataParser):
...
@@ -333,7 +333,7 @@ class KeyeVL1_5MultiModalDataParser(MultiModalDataParser):
def
_parse_image_data
(
def
_parse_image_data
(
self
,
self
,
data
:
dict
[
str
,
torch
.
Tensor
]
|
ModalityData
[
ImageItem
],
data
:
dict
[
str
,
torch
.
Tensor
]
|
ModalityData
[
ImageItem
],
)
->
ModalityDataItems
[
Any
,
Any
]:
)
->
ModalityDataItems
[
Any
,
Any
]
|
None
:
if
isinstance
(
data
,
dict
):
if
isinstance
(
data
,
dict
):
return
DictEmbeddingItems
(
return
DictEmbeddingItems
(
data
,
data
,
...
@@ -350,7 +350,7 @@ class KeyeVL1_5MultiModalDataParser(MultiModalDataParser):
...
@@ -350,7 +350,7 @@ class KeyeVL1_5MultiModalDataParser(MultiModalDataParser):
def
_parse_video_data
(
def
_parse_video_data
(
self
,
self
,
data
:
dict
[
str
,
torch
.
Tensor
]
|
ModalityData
[
VideoItem
],
data
:
dict
[
str
,
torch
.
Tensor
]
|
ModalityData
[
VideoItem
],
)
->
ModalityDataItems
[
Any
,
Any
]:
)
->
ModalityDataItems
[
Any
,
Any
]
|
None
:
if
isinstance
(
data
,
dict
):
if
isinstance
(
data
,
dict
):
return
DictEmbeddingItems
(
return
DictEmbeddingItems
(
data
,
data
,
...
...
vllm/multimodal/audio.py
View file @
9ae2f603
...
@@ -11,6 +11,7 @@ import pybase64
...
@@ -11,6 +11,7 @@ import pybase64
import
torch
import
torch
from
vllm.utils.import_utils
import
PlaceholderModule
from
vllm.utils.import_utils
import
PlaceholderModule
from
vllm.utils.serial_utils
import
tensor2base64
from
.base
import
MediaIO
from
.base
import
MediaIO
...
@@ -135,8 +136,4 @@ class AudioEmbeddingMediaIO(MediaIO[torch.Tensor]):
...
@@ -135,8 +136,4 @@ class AudioEmbeddingMediaIO(MediaIO[torch.Tensor]):
return
torch
.
load
(
filepath
,
weights_only
=
True
)
return
torch
.
load
(
filepath
,
weights_only
=
True
)
def
encode_base64
(
self
,
media
:
torch
.
Tensor
)
->
str
:
def
encode_base64
(
self
,
media
:
torch
.
Tensor
)
->
str
:
buffer
=
BytesIO
()
return
tensor2base64
(
media
)
torch
.
save
(
media
,
buffer
)
buffer
.
seek
(
0
)
binary_data
=
buffer
.
read
()
return
pybase64
.
b64encode
(
binary_data
).
decode
(
"utf-8"
)
vllm/utils/serial_utils.py
View file @
9ae2f603
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
base64
import
base64
import
io
import
sys
import
sys
from
dataclasses
import
dataclass
from
dataclasses
import
dataclass
from
typing
import
Literal
from
typing
import
Literal
...
@@ -52,6 +53,15 @@ Endianness = Literal["native", "big", "little"]
...
@@ -52,6 +53,15 @@ Endianness = Literal["native", "big", "little"]
EncodingFormat
=
Literal
[
"float"
,
"base64"
,
"bytes"
]
EncodingFormat
=
Literal
[
"float"
,
"base64"
,
"bytes"
]
def
tensor2base64
(
x
:
torch
.
Tensor
)
->
str
:
with
io
.
BytesIO
()
as
buf
:
torch
.
save
(
x
,
buf
)
buf
.
seek
(
0
)
binary_data
=
buf
.
read
()
return
base64
.
b64encode
(
binary_data
).
decode
(
"utf-8"
)
def
tensor2binary
(
def
tensor2binary
(
tensor
:
torch
.
Tensor
,
embed_dtype
:
EmbedDType
,
endianness
:
Endianness
tensor
:
torch
.
Tensor
,
embed_dtype
:
EmbedDType
,
endianness
:
Endianness
)
->
bytes
:
)
->
bytes
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment