Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
cd813c6d
Unverified
Commit
cd813c6d
authored
Feb 27, 2025
by
Woosuk Kwon
Committed by
GitHub
Feb 27, 2025
Browse files
[V1][Minor] Minor cleanup for GPU Model Runner (#13983)
Signed-off-by:
Woosuk Kwon
<
woosuk.kwon@berkeley.edu
>
parent
38acae6e
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
7 additions
and
6 deletions
+7
-6
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+7
-6
No files found.
vllm/v1/worker/gpu_model_runner.py
View file @
cd813c6d
...
@@ -1187,8 +1187,9 @@ class GPUModelRunner(LoRAModelRunnerMixin):
...
@@ -1187,8 +1187,9 @@ class GPUModelRunner(LoRAModelRunnerMixin):
# NOTE: Currently model is profiled with a single non-text
# NOTE: Currently model is profiled with a single non-text
# modality with the max possible input tokens even when
# modality with the max possible input tokens even when
# it supports multiple.
# it supports multiple.
max_tokens_by_modality_dict
=
MULTIMODAL_REGISTRY
.
get_max_tokens_per_item_by_nonzero_modality
(
# noqa: E501
max_tokens_by_modality_dict
=
(
self
.
model_config
)
MULTIMODAL_REGISTRY
.
get_max_tokens_per_item_by_nonzero_modality
(
self
.
model_config
))
dummy_data_modality
,
max_tokens_per_mm_item
=
max
(
dummy_data_modality
,
max_tokens_per_mm_item
=
max
(
max_tokens_by_modality_dict
.
items
(),
key
=
lambda
item
:
item
[
1
])
max_tokens_by_modality_dict
.
items
(),
key
=
lambda
item
:
item
[
1
])
...
@@ -1275,15 +1276,15 @@ class GPUModelRunner(LoRAModelRunnerMixin):
...
@@ -1275,15 +1276,15 @@ class GPUModelRunner(LoRAModelRunnerMixin):
# maximum num_tokens.
# maximum num_tokens.
num_reqs
=
self
.
scheduler_config
.
max_num_seqs
num_reqs
=
self
.
scheduler_config
.
max_num_seqs
num_tokens
=
self
.
max_num_tokens
num_tokens
=
self
.
max_num_tokens
min_tokens_per_req
:
int
=
num_tokens
//
num_reqs
min_tokens_per_req
=
num_tokens
//
num_reqs
num_scheduled_tokens_list
:
List
[
int
]
=
[
min_tokens_per_req
]
*
num_reqs
num_scheduled_tokens_list
=
[
min_tokens_per_req
]
*
num_reqs
num_scheduled_tokens_list
[
-
1
]
+=
num_tokens
%
num_reqs
num_scheduled_tokens_list
[
-
1
]
+=
num_tokens
%
num_reqs
assert
sum
(
num_scheduled_tokens_list
)
==
num_tokens
assert
sum
(
num_scheduled_tokens_list
)
==
num_tokens
assert
len
(
num_scheduled_tokens_list
)
==
num_reqs
assert
len
(
num_scheduled_tokens_list
)
==
num_reqs
num_scheduled_tokens
:
np
.
ndarray
=
np
.
array
(
num_scheduled_tokens_list
,
num_scheduled_tokens
=
np
.
array
(
num_scheduled_tokens_list
,
dtype
=
np
.
int32
)
dtype
=
np
.
int32
)
logit_indices
=
np
.
cumsum
(
num_scheduled_tokens
)
-
1
logit_indices
=
np
.
cumsum
(
num_scheduled_tokens
)
-
1
with
self
.
maybe_profile_with_lora
(
self
.
lora_config
,
with
self
.
maybe_profile_with_lora
(
self
.
lora_config
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment