Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
fe63c17a
"fern/vscode:/vscode.git/clone" did not exist on "5c7e66ece16e0dd9f6e19308970587a216360d83"
Unverified
Commit
fe63c17a
authored
Jul 18, 2025
by
Alec
Committed by
GitHub
Jul 19, 2025
Browse files
fix: Revert "feat: add vLLM v1 multi-modal example. Add llama4 Maverick ex… (#2017)
parent
bf1998f0
Changes
23
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
0 additions
and
326 deletions
+0
-326
examples/multimodal_v1/utils/logging.py
examples/multimodal_v1/utils/logging.py
+0
-45
examples/multimodal_v1/utils/model.py
examples/multimodal_v1/utils/model.py
+0
-105
examples/multimodal_v1/utils/protocol.py
examples/multimodal_v1/utils/protocol.py
+0
-176
No files found.
examples/multimodal_v1/utils/logging.py
deleted
100644 → 0
View file @
bf1998f0
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
asyncio
import
logging
from
dynamo._core
import
Client
logger
=
logging
.
getLogger
(
__name__
)
async def check_required_workers(
    workers_client: Client, required_workers: int, on_change=True, poll_interval=0.5
):
    """Block until at least ``required_workers`` instance IDs are reported.

    The client is queried once up front and then once per ``poll_interval``
    seconds for as long as the reported count is below ``required_workers``.

    Args:
        workers_client: Object whose ``instance_ids()`` returns the current
            collection of worker IDs.
        required_workers: Minimum number of IDs needed before returning.
        on_change: When truthy, a progress message is logged only if the count
            differs from the previous poll; when falsy, it is logged on every
            poll.
        poll_interval: Seconds to sleep between polls.

    Returns:
        The worker IDs from the final ``instance_ids()`` call.
    """
    ready_ids = workers_client.instance_ids()
    last_count = len(ready_ids)

    while last_count < required_workers:
        await asyncio.sleep(poll_interval)

        ready_ids = workers_client.instance_ids()
        current_count = len(ready_ids)

        count_changed = current_count != last_count
        if count_changed or not on_change:
            logger.info(
                f"Waiting for more workers to be ready.\n"
                f" Current: {current_count},"
                f" Required: {required_workers}"
            )
        last_count = current_count

    print(f"Workers ready: {ready_ids}")
    return ready_ids
examples/multimodal_v1/utils/model.py
deleted
100644 → 0
View file @
bf1998f0
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
logging
from
typing
import
Any
,
Dict
,
Tuple
import
torch
from
transformers
import
AutoConfig
from
utils.protocol
import
EncodeResponse
from
vllm
import
AsyncEngineArgs
from
vllm.utils
import
get_distributed_init_method
,
get_ip
,
get_open_port
from
vllm.worker.worker
import
Worker
# from transformers import AutoImageProcessor, LlavaForConditionalGeneration
# from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
logger
=
logging
.
getLogger
(
__name__
)
def load_vision_model(model_id: str) -> torch.nn.Module:
    """
    Load a vision model from a HuggingFace model ID.

    A vLLM engine config is built for ``model_id`` (with
    ``trust_remote_code=True``), a single driver ``Worker`` is created at
    rank 0, its device is initialised and the model is loaded through it.

    Args:
        model_id: Model identifier passed to ``AsyncEngineArgs``.

    Returns:
        The ``model`` attribute of the worker's model runner.
    """
    vllm_config = AsyncEngineArgs(
        model=model_id, trust_remote_code=True
    ).create_engine_config()

    init_method = get_distributed_init_method(get_ip(), get_open_port())

    driver_worker = Worker(
        vllm_config=vllm_config,
        local_rank=0,
        rank=0,
        distributed_init_method=init_method,
        is_driver_worker=True,
    )

    # The device must be initialised before the weights are loaded onto it.
    driver_worker.init_device()
    driver_worker.load_model()
    return driver_worker.model_runner.model

    # model = LlavaForConditionalGeneration.from_pretrained(
    #     model_id, device_map="auto", torch_dtype=torch.float16
    # ).eval()

    # model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    #     model_id, torch_dtype="auto", device_map="auto"
    # ).eval()
    # return model
def get_vision_embeddings_info(
    model_id: str, num_patches: int
) -> Tuple[Tuple[int, int, int], torch.dtype]:
    """Derive the vision-embedding shape and dtype from the model config.

    Args:
        model_id: Identifier passed to ``AutoConfig.from_pretrained`` (loaded
            with ``trust_remote_code=True``).
        num_patches: Number of patches; must be positive.

    Returns:
        ``((1, num_patches, hidden_size), torch_dtype)``, where ``hidden_size``
        falls back to 4096 (with a warning) if the config does not define it.

    Raises:
        AssertionError: If ``num_patches`` is not positive.
        ValueError: If the config has no ``torch_dtype`` attribute.
    """
    model_config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
    assert num_patches > 0, "Number of patches must be positive"

    if not hasattr(model_config, "torch_dtype"):
        raise ValueError("Model config missing required 'torch_dtype' attribute")

    if hasattr(model_config, "hidden_size"):
        hidden_dim = model_config.hidden_size
    else:
        logger.warning(
            "Model config missing required 'hidden_size' attribute, using 4096"
        )
        hidden_dim = 4096

    embeddings_shape = (1, num_patches, hidden_dim)
    return embeddings_shape, model_config.torch_dtype
def construct_mm_data(
    model: str,
    encode_output: EncodeResponse,
    image_embeds: torch.Tensor,
    embeddings_dtype: torch.dtype,
) -> Dict[str, torch.Tensor | Dict[str, Any]]:
    """Build the multimodal data dict for a vLLM request.

    The embeddings are first cast to ``embeddings_dtype``. The layout of the
    returned dict then depends on a substring match against ``model``:

    * contains ``"Qwen2"``: embeddings with dim 0 squeezed, plus
      ``image_grid_thw`` from ``encode_output`` as a tensor with dim 0 squeezed.
    * contains ``"MiniCPM-V"``: embeddings plus ``encode_output.image_sizes``.
    * anything else: the embeddings tensor directly.

    Returns:
        A dict with a single ``"image"`` key holding the data described above.
    """
    embeds = image_embeds.to(embeddings_dtype)

    if "Qwen2" in model:
        squeezed_embeds = embeds.squeeze(0)
        grid_thw = torch.tensor(encode_output.image_grid_thw).squeeze(0)
        return {
            "image": {
                "image_embeds": squeezed_embeds,
                "image_grid_thw": grid_thw,
            }
        }

    if "MiniCPM-V" in model:
        return {
            "image": {
                "image_embeds": embeds,
                "image_sizes": encode_output.image_sizes,
            }
        }

    return {"image": embeds}
examples/multimodal_v1/utils/protocol.py
deleted
100644 → 0
View file @
bf1998f0
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
json
from
typing
import
Any
,
List
,
Literal
,
Optional
,
Union
import
connect
import
msgspec
from
pydantic
import
BaseModel
,
ConfigDict
,
field_validator
from
pydantic_core
import
core_schema
from
typing_extensions
import
NotRequired
from
vllm.inputs.data
import
TokensPrompt
from
vllm.outputs
import
CompletionOutput
from
vllm.sampling_params
import
SamplingParams
from
vllm.sequence
import
PromptLogprobs
,
RequestMetrics
class Request(BaseModel):
    """Pydantic model holding a text prompt and a dict of sampling parameters."""

    prompt: str
    sampling_params: dict
class Tokens(BaseModel):
    """Pydantic model wrapping a list of integer tokens."""

    tokens: list[int]
class PrefillRequest(Request):
    """``Request`` extended with a string ``request_id``."""

    request_id: str
class Response(BaseModel):
    """Pydantic model holding a single ``text`` string."""

    text: str
class PrefillResponse(BaseModel):
    """Pydantic model holding a single boolean ``prefilled`` flag."""

    # NOTE(review): presumably True once prefill has completed — confirm against the producer.
    prefilled: bool
# Hack to override the type of multi_modal_data in TokensPrompt
# as pydantic doesn't understand generic types
# TokensPrompt is defined here: https://github.com/vllm-project/vllm/blob/a4c402a756fa3213caf9d2cde0e4ceb2d57727f2/vllm/inputs/data.py#L38
# multi_modal_data is defined here: https://github.com/vllm-project/vllm/blob/main/vllm/multimodal/inputs.py#L103
# ModalityData is defined here: https://github.com/vllm-project/vllm/blob/main/vllm/multimodal/inputs.py#L80
class PatchedTokensPrompt(TokensPrompt):
    # Re-declared as an optional key typed ``Optional[Any]`` so pydantic does
    # not have to resolve the generic type used by the parent declaration.
    multi_modal_data: NotRequired[Optional[Any]]  # type: ignore
# Monkey-patch the SamplingParams type to add a dummy core schema so pydantic can validate it
# SamplingParams is a msgspec struct
# SamplingParams is defined here: https://github.com/vllm-project/vllm/blob/a4c402a756fa3213caf9d2cde0e4ceb2d57727f2/vllm/sampling_params.py#L88
# The hook ignores its arguments and always returns ``any_schema()``, so
# pydantic itself applies no validation to SamplingParams values.
SamplingParams.__get_pydantic_core_schema__ = classmethod(
    lambda cls, source, handler: core_schema.any_schema()
)
class vLLMGenerateRequest(BaseModel):
    """
    Serializable class of all the fields vLLM engine requires for inference

    ``sampling_params`` may be supplied as a ``SamplingParams`` instance, a
    dict of its keyword arguments, or a JSON string encoding such a dict; the
    ``parse_sampling_params`` validator normalises the latter two.
    """

    # A class body keeps only the last value bound to ``model_config``. The
    # two settings were previously assigned separately, so the later
    # ``json_encoders`` config silently replaced ``arbitrary_types_allowed``.
    # They are declared together here so both take effect.
    model_config = ConfigDict(
        arbitrary_types_allowed=True,
        json_encoders={SamplingParams: lambda v: msgspec.json.encode(v)},
    )

    engine_prompt: PatchedTokensPrompt
    sampling_params: SamplingParams
    request_id: str
    prefix_hit_rate: Optional[float] = 0.0

    @field_validator("sampling_params", mode="before")
    @classmethod
    def parse_sampling_params(cls, v: Any) -> SamplingParams:
        """Coerce a JSON string or dict into ``SamplingParams``.

        A string is first decoded with ``json.loads``; a dict (given directly
        or decoded from the string) is expanded into ``SamplingParams(**v)``.
        Any other value is returned unchanged.
        """
        if isinstance(v, str):
            v = json.loads(v)
        if isinstance(v, dict):
            return SamplingParams(**v)
        return v
class TextContent(BaseModel):
    """Message content part whose ``type`` is the literal ``"text"``."""

    type: Literal["text"]
    text: str
class ImageURLDetail(BaseModel):
    """Pydantic model holding a single ``url`` string."""

    url: str
class ImageContent(BaseModel):
    """Message content part whose ``type`` is the literal ``"image_url"``."""

    type: Literal["image_url"]
    image_url: ImageURLDetail
# A message content part is either a text part or an image part.
MessageContent = Union[TextContent, ImageContent]
class ChatMessage(BaseModel):
    """One chat message: a role plus a list of text/image content parts."""

    role: Literal["user", "system", "assistant"]
    content: List[MessageContent]
class MultiModalRequest(BaseModel):
    """Request model carrying a model name, chat messages and optional generation settings.

    ``max_tokens`` and ``temperature`` default to ``None``; ``stream``
    defaults to ``True``.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)
    model: str
    messages: List[ChatMessage]
    max_tokens: Optional[int] = None
    temperature: Optional[float] = None
    stream: Optional[bool] = True
class vLLMMultimodalRequest(vLLMGenerateRequest):
    """``vLLMGenerateRequest`` extended with an optional image URL and an
    optional ``connect.SerializedRequest``; both default to ``None``.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)
    image_url: Optional[str] = None
    # image_features: Optional[List[List[List[float]]]] = None # Remove once have NIXL support
    serialized_request: Optional[connect.SerializedRequest] = None
class EncodeRequest(BaseModel):
    """
    Serializable request carrying an image URL and a request ID, plus an
    optional ``connect.SerializedRequest`` (defaults to ``None``).
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)
    image_url: str
    request_id: str
    serialized_request: Optional[connect.SerializedRequest] = None
class EncodeResponse(BaseModel):
    """Response model carrying image features and optional per-image metadata.

    ``request_id`` and ``image_features`` are required; ``image_grid_thw``,
    ``image_sizes`` and ``serialized_request`` default to ``None``.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)
    request_id: str
    image_grid_thw: Optional[List[Any]] = None
    image_sizes: Optional[List[Any]] = None
    serialized_request: Optional[connect.SerializedRequest] = None
    image_features: List[List[List[float]]]  # Remove once have NIXL support
class MyRequestOutput(BaseModel):
    """
    RequestOutput from vLLM is not serializable by default
    https://github.com/vllm-project/vllm/blob/a4c402a756fa3213caf9d2cde0e4ceb2d57727f2/vllm/outputs.py#L85

    This class is used to serialize the RequestOutput and any recursively defined types
    We can do this because PromptLogprobs, RequestMetrics, and CompletionOutput are all serializable dataclasses
    """

    # Lets pydantic accept field types it has no schema for (presumably the
    # vLLM types annotated below — confirm).
    model_config = ConfigDict(arbitrary_types_allowed=True)

    request_id: str
    prompt: Optional[str] = None
    prompt_token_ids: Optional[List[int]] = None
    prompt_logprobs: Optional[PromptLogprobs] = None
    outputs: List[CompletionOutput]
    finished: bool
    metrics: Optional[RequestMetrics] = None
    kv_transfer_params: Optional[dict[str, Any]] = None
    # lora_request: Optional[LoRARequest] = None
    # encoder_prompt: Optional[str] = None
    # encoder_prompt_token_ids: Optional[List[int]] = None
    # num_cached_tokens: Optional[int] = None
    # multi_modal_placeholders: Optional[MultiModalPlaceholderDict] = None
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment