Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
24700c34
Unverified
Commit
24700c34
authored
Feb 08, 2025
by
Woosuk Kwon
Committed by
GitHub
Feb 08, 2025
Browse files
[V1] Cache `uses_mrope` in GPUModelRunner (#12969)
parent
d366ccc4
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
13 additions
and
10 deletions
+13
-10
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+13
-10
No files found.
vllm/v1/worker/gpu_model_runner.py
View file @
24700c34
...
...
@@ -92,6 +92,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
# Multi-modal data support
self
.
input_registry
=
INPUT_REGISTRY
self
.
mm_registry
=
MULTIMODAL_REGISTRY
self
.
uses_mrope
=
model_config
.
uses_mrope
# NOTE: Initialized input mapper is only used for processing dummy
# multimodal data into multimodal kwargs for GPU memory profiling.
...
...
@@ -147,7 +148,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
device
=
self
.
device
)
# Only relevant for models using M-RoPE (e.g., Qwen2-VL)
if
self
.
model_config
.
uses_mrope
:
if
self
.
uses_mrope
:
# NOTE: `mrope_positions` is implemented with one additional dummy
# position on purpose to make it non-contiguous so that it can work
# with torch compile.
...
...
@@ -284,7 +285,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
)
# Only relevant for models using M-RoPE (e.g., Qwen2-VL)
if
self
.
model_config
.
uses_mrope
:
if
self
.
uses_mrope
:
image_grid_thw
=
[]
video_grid_thw
=
[]
second_per_grid_ts
=
[]
...
...
@@ -411,7 +412,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
# Calculate M-RoPE positions.
# Only relevant for models using M-RoPE (e.g., Qwen2-VL)
if
self
.
model_config
.
uses_mrope
:
if
self
.
uses_mrope
:
self
.
_calc_mrope_positions
(
scheduler_output
)
# Get token indices.
...
...
@@ -458,7 +459,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
# Copy the tensors to the GPU.
self
.
input_ids
[:
total_num_scheduled_tokens
].
copy_
(
self
.
input_ids_cpu
[:
total_num_scheduled_tokens
],
non_blocking
=
True
)
if
self
.
model_config
.
uses_mrope
:
if
self
.
uses_mrope
:
# Only relevant for models using M-RoPE (e.g., Qwen2-VL)
self
.
mrope_positions
[:,
:
total_num_scheduled_tokens
].
copy_
(
self
.
mrope_positions_cpu
[:,
:
total_num_scheduled_tokens
],
...
...
@@ -817,13 +818,14 @@ class GPUModelRunner(LoRAModelRunnerMixin):
# then the embedding layer is not included in the CUDA graph.
input_ids
=
self
.
input_ids
[:
num_input_tokens
]
inputs_embeds
=
None
if
self
.
uses_mrope
:
positions
=
self
.
mrope_positions
[:,
:
num_input_tokens
]
else
:
positions
=
self
.
positions
[:
num_input_tokens
]
# Run the decoder.
# Use persistent buffers for CUDA graphs.
with
set_forward_context
(
attn_metadata
,
self
.
vllm_config
):
positions
=
self
.
mrope_positions
[:,
:
num_input_tokens
]
\
if
self
.
model_config
.
uses_mrope
\
else
self
.
positions
[:
num_input_tokens
]
hidden_states
=
self
.
model
(
input_ids
=
input_ids
,
positions
=
positions
,
...
...
@@ -1001,10 +1003,11 @@ class GPUModelRunner(LoRAModelRunnerMixin):
else
:
input_ids
=
self
.
input_ids
[:
num_tokens
]
inputs_embeds
=
None
if
self
.
uses_mrope
:
positions
=
self
.
mrope_positions
[:,
:
num_tokens
]
else
:
positions
=
self
.
positions
[:
num_tokens
]
with
set_forward_context
(
None
,
self
.
vllm_config
):
positions
=
self
.
mrope_positions
[:,
:
num_tokens
]
\
if
self
.
model_config
.
uses_mrope
\
else
self
.
positions
[:
num_tokens
]
hidden_states
=
model
(
input_ids
=
input_ids
,
positions
=
positions
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment