Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
fe63c17a
Unverified
Commit
fe63c17a
authored
Jul 18, 2025
by
Alec
Committed by
GitHub
Jul 19, 2025
Browse files
fix: Revert "feat: add vLLM v1 multi-modal example. Add llama4 Maverick ex… (#2017)
parent
bf1998f0
Changes
23
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
0 additions
and
326 deletions
+0
-326
examples/multimodal_v1/utils/logging.py
examples/multimodal_v1/utils/logging.py
+0
-45
examples/multimodal_v1/utils/model.py
examples/multimodal_v1/utils/model.py
+0
-105
examples/multimodal_v1/utils/protocol.py
examples/multimodal_v1/utils/protocol.py
+0
-176
No files found.
examples/multimodal_v1/utils/logging.py
deleted
100644 → 0
View file @
bf1998f0
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
asyncio
import
logging
from
dynamo._core
import
Client
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
async def check_required_workers(
    workers_client: Client, required_workers: int, on_change=True, poll_interval=0.5
):
    """Poll the client until at least ``required_workers`` instances are listed.

    Args:
        workers_client: Client whose ``instance_ids()`` lists the current workers.
        required_workers: Minimum number of instance ids to wait for.
        on_change: When true, a progress message is logged only if the count
            differs from the previous poll; when false, it is logged every poll.
        poll_interval: Seconds to sleep between polls.

    Returns:
        The instance ids returned by the last ``instance_ids()`` call.
    """
    ready_ids = workers_client.instance_ids()
    seen = len(ready_ids)

    while seen < required_workers:
        await asyncio.sleep(poll_interval)
        ready_ids = workers_client.instance_ids()
        current = len(ready_ids)

        count_changed = current != seen
        if count_changed or not on_change:
            logger.info(
                f"Waiting for more workers to be ready.\n"
                f" Current: {current},"
                f" Required: {required_workers}"
            )
        seen = current

    print(f"Workers ready: {ready_ids}")
    return ready_ids
examples/multimodal_v1/utils/model.py
deleted
100644 → 0
View file @
bf1998f0
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
logging
from
typing
import
Any
,
Dict
,
Tuple
import
torch
from
transformers
import
AutoConfig
from
utils.protocol
import
EncodeResponse
from
vllm
import
AsyncEngineArgs
from
vllm.utils
import
get_distributed_init_method
,
get_ip
,
get_open_port
from
vllm.worker.worker
import
Worker
# from transformers import AutoImageProcessor, LlavaForConditionalGeneration
# from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
def load_vision_model(model_id: str) -> torch.nn.Module:
    """Load the model for ``model_id`` through a single vLLM worker.

    An engine config is created from ``AsyncEngineArgs`` (with
    ``trust_remote_code=True``), one ``Worker`` is built at rank 0 as the
    driver, its device is initialised and the model is loaded.

    Args:
        model_id: HuggingFace model ID to load.

    Returns:
        The ``model`` attribute of the worker's model runner.
    """
    vllm_config = AsyncEngineArgs(
        model=model_id, trust_remote_code=True
    ).create_engine_config()
    init_method = get_distributed_init_method(get_ip(), get_open_port())

    # Single-process setup: this worker is both rank 0 and the driver.
    vision_worker = Worker(
        vllm_config=vllm_config,
        local_rank=0,
        rank=0,
        distributed_init_method=init_method,
        is_driver_worker=True,
    )

    # The device must be initialised before the model is loaded.
    vision_worker.init_device()
    vision_worker.load_model()
    return vision_worker.model_runner.model
# model = LlavaForConditionalGeneration.from_pretrained(
# model_id, device_map="auto", torch_dtype=torch.float16
# ).eval()
# model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
# model_id, torch_dtype="auto", device_map="auto"
# ).eval()
# return model
def get_vision_embeddings_info(
    model_id: str, num_patches: int
) -> Tuple[Tuple[int, int, int], torch.dtype]:
    """Calculate the vision embeddings shape and dtype from the model config.

    Args:
        model_id: Model identifier passed to ``AutoConfig.from_pretrained``.
        num_patches: Number of patches; must be positive.

    Returns:
        A tuple ``((1, num_patches, hidden_size), torch_dtype)``. The leading
        dimension is always 1. ``hidden_size`` falls back to 4096 (with a
        warning) when the config has no such attribute.

    Raises:
        ValueError: If ``num_patches`` is not positive, or if the loaded config
            has no ``torch_dtype`` attribute.
    """
    # Validate with an explicit exception rather than `assert`, which is
    # stripped under `python -O`. Checking first also avoids loading the
    # config for a request that is going to be rejected anyway.
    if num_patches <= 0:
        raise ValueError("Number of patches must be positive")

    config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)

    if not hasattr(config, "torch_dtype"):
        raise ValueError("Model config missing required 'torch_dtype' attribute")

    if not hasattr(config, "hidden_size"):
        logger.warning(
            "Model config missing required 'hidden_size' attribute, using 4096"
        )
        hidden_size = 4096
    else:
        hidden_size = config.hidden_size

    return (1, num_patches, hidden_size), config.torch_dtype
def construct_mm_data(
    model: str,
    encode_output: EncodeResponse,
    image_embeds: torch.Tensor,
    embeddings_dtype: torch.dtype,
) -> Dict[str, torch.Tensor | Dict[str, Any]]:
    """Build the multimodal data dict for a vLLM request.

    Some models need extra parameters next to the embeddings; the model is
    selected by substring match on ``model``.

    Args:
        model: Model name, checked for the substrings "Qwen2" and "MiniCPM-V".
        encode_output: Encoder response supplying ``image_grid_thw`` (Qwen2)
            or ``image_sizes`` (MiniCPM-V).
        image_embeds: Image embeddings tensor.
        embeddings_dtype: Dtype the embeddings are converted to.

    Returns:
        A dict with the single key ``"image"`` mapping either to the converted
        embeddings or to a dict holding the embeddings plus model-specific
        fields.
    """
    embeds = image_embeds.to(embeddings_dtype)

    if "Qwen2" in model:
        # Both tensors have dimension 0 squeezed for this model.
        squeezed_embeds = embeds.squeeze(0)
        grid_thw = torch.tensor(encode_output.image_grid_thw).squeeze(0)
        return {
            "image": {
                "image_embeds": squeezed_embeds,
                "image_grid_thw": grid_thw,
            }
        }

    if "MiniCPM-V" in model:
        return {
            "image": {
                "image_embeds": embeds,
                "image_sizes": encode_output.image_sizes,
            }
        }

    # Every other model takes the embeddings directly.
    return {"image": embeds}
examples/multimodal_v1/utils/protocol.py
deleted
100644 → 0
View file @
bf1998f0
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
json
from
typing
import
Any
,
List
,
Literal
,
Optional
,
Union
import
connect
import
msgspec
from
pydantic
import
BaseModel
,
ConfigDict
,
field_validator
from
pydantic_core
import
core_schema
from
typing_extensions
import
NotRequired
from
vllm.inputs.data
import
TokensPrompt
from
vllm.outputs
import
CompletionOutput
from
vllm.sampling_params
import
SamplingParams
from
vllm.sequence
import
PromptLogprobs
,
RequestMetrics
class Request(BaseModel):
    """Request carrying a text prompt and a dict of sampling parameters."""

    prompt: str
    sampling_params: dict
class Tokens(BaseModel):
    """Container for a list of integer tokens."""

    tokens: list[int]
class PrefillRequest(Request):
    """A ``Request`` that additionally carries a ``request_id``."""

    request_id: str
class Response(BaseModel):
    """Response carrying a single text payload."""

    text: str
class PrefillResponse(BaseModel):
    """Response carrying a boolean ``prefilled`` flag."""

    prefilled: bool
# Hack to override the type of multi_modal_data in TokensPrompt
# as pydantic doesn't understand generic types
# TokensPrompt is defined here: https://github.com/vllm-project/vllm/blob/a4c402a756fa3213caf9d2cde0e4ceb2d57727f2/vllm/inputs/data.py#L38
# multi_modal_data is defined here: https://github.com/vllm-project/vllm/blob/main/vllm/multimodal/inputs.py#L103
# ModalityData is defined here: https://github.com/vllm-project/vllm/blob/main/vllm/multimodal/inputs.py#L80
class PatchedTokensPrompt(TokensPrompt):
    """``TokensPrompt`` with ``multi_modal_data`` re-declared as an optional ``Any``.

    The override loosens the field's type so pydantic can handle it.
    """

    multi_modal_data: NotRequired[Optional[Any]]  # type: ignore
# Monkey-patch the SamplingParams type to add a dummy core schema so pydantic can validate it
# SamplingParams is a msgspec struct
# SamplingParams is defined here: https://github.com/vllm-project/vllm/blob/a4c402a756fa3213caf9d2cde0e4ceb2d57727f2/vllm/sampling_params.py#L88
def _sampling_params_core_schema(cls, source, handler):
    """Return pydantic_core's ``any_schema`` as the schema for ``SamplingParams``."""
    return core_schema.any_schema()


# Attach the hook as a classmethod so pydantic can build models that have
# SamplingParams-typed fields.
SamplingParams.__get_pydantic_core_schema__ = classmethod(
    _sampling_params_core_schema
)
class vLLMGenerateRequest(BaseModel):
    """
    Serializable class of all the fields vLLM engine requires for inference
    """

    # One ConfigDict holding both settings. Previously `model_config` was
    # assigned twice in the class body, so the later assignment (json_encoders
    # only) silently replaced the one enabling `arbitrary_types_allowed`.
    model_config = ConfigDict(
        arbitrary_types_allowed=True,
        json_encoders={SamplingParams: lambda v: msgspec.json.encode(v)},
    )

    engine_prompt: PatchedTokensPrompt
    sampling_params: SamplingParams
    request_id: str
    prefix_hit_rate: Optional[float] = 0.0

    @field_validator("sampling_params", mode="before")
    @classmethod
    def parse_sampling_params(cls, v: Any) -> SamplingParams:
        """Coerce a JSON string or a dict into ``SamplingParams``.

        A string is decoded with ``json.loads`` first; a dict (given directly
        or decoded) is expanded into ``SamplingParams(**v)``. Any other value
        is returned unchanged.
        """
        if isinstance(v, str):
            v = json.loads(v)
        if isinstance(v, dict):
            return SamplingParams(**v)
        return v
class TextContent(BaseModel):
    """Message content part whose ``type`` is the literal ``"text"``."""

    type: Literal["text"]
    text: str
class ImageURLDetail(BaseModel):
    """Wrapper holding an image URL string."""

    url: str
class ImageContent(BaseModel):
    """Message content part whose ``type`` is the literal ``"image_url"``."""

    type: Literal["image_url"]
    image_url: ImageURLDetail
# A message content part is either a text part or an image part.
MessageContent = Union[TextContent, ImageContent]
class ChatMessage(BaseModel):
    """Chat message: a role plus a list of content parts."""

    role: Literal["user", "system", "assistant"]
    content: List[MessageContent]
class MultiModalRequest(BaseModel):
    """Request carrying a model name, chat messages and optional generation settings.

    ``max_tokens`` and ``temperature`` default to ``None``; ``stream``
    defaults to ``True``.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    model: str
    messages: List[ChatMessage]
    max_tokens: Optional[int] = None
    temperature: Optional[float] = None
    stream: Optional[bool] = True
class vLLMMultimodalRequest(vLLMGenerateRequest):
    """``vLLMGenerateRequest`` extended with an optional image URL and an
    optional serialized ``connect`` request."""

    model_config = ConfigDict(arbitrary_types_allowed=True)

    image_url: Optional[str] = None
    # image_features: Optional[List[List[List[float]]]] = None # Remove once have NIXL support
    serialized_request: Optional[connect.SerializedRequest] = None
class EncodeRequest(BaseModel):
    """
    Serializable encode request: an image URL, a request id and an optional
    serialized ``connect`` request.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    image_url: str
    request_id: str
    serialized_request: Optional[connect.SerializedRequest] = None
class EncodeResponse(BaseModel):
    """Serializable encode response for a single request.

    ``image_grid_thw``, ``image_sizes`` and ``serialized_request`` are optional
    and default to ``None``; ``request_id`` and ``image_features`` are required.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    request_id: str
    image_grid_thw: Optional[List[Any]] = None
    image_sizes: Optional[List[Any]] = None
    serialized_request: Optional[connect.SerializedRequest] = None
    image_features: List[List[List[float]]]  # Remove once have NIXL support
class MyRequestOutput(BaseModel):
    """
    Serializable stand-in for vLLM's RequestOutput, which is not serializable by default
    https://github.com/vllm-project/vllm/blob/a4c402a756fa3213caf9d2cde0e4ceb2d57727f2/vllm/outputs.py#L85

    The model serializes the RequestOutput fields together with any recursively
    defined types. This relies on PromptLogprobs, RequestMetrics, and
    CompletionOutput all being serializable dataclasses.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    request_id: str
    prompt: Optional[str] = None
    prompt_token_ids: Optional[List[int]] = None
    prompt_logprobs: Optional[PromptLogprobs] = None
    outputs: List[CompletionOutput]
    finished: bool
    metrics: Optional[RequestMetrics] = None
    kv_transfer_params: Optional[dict[str, Any]] = None
    # lora_request: Optional[LoRARequest] = None
    # encoder_prompt: Optional[str] = None
    # encoder_prompt_token_ids: Optional[List[int]] = None
    # num_cached_tokens: Optional[int] = None
    # multi_modal_placeholders: Optional[MultiModalPlaceholderDict] = None
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment