Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
5e8ca973
"vllm/multimodal/processing/processor.py" did not exist on "0ad9951c416d33c5da4f7a504fb162cbe62386f5"
Unverified
Commit
5e8ca973
authored
Jul 23, 2024
by
William Lin
Committed by
GitHub
Jul 24, 2024
Browse files
[Bugfix] fix flashinfer cudagraph capture for PP (#6708)
parent
87525fab
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
31 additions
and
7 deletions
+31
-7
tests/distributed/test_pipeline_parallel.py
tests/distributed/test_pipeline_parallel.py
+24
-0
vllm/worker/model_runner.py
vllm/worker/model_runner.py
+7
-7
No files found.
tests/distributed/test_pipeline_parallel.py
View file @
5e8ca973
...
...
@@ -61,3 +61,27 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME,
tp_args
.
append
(
"--enforce-eager"
)
compare_two_settings
(
MODEL_NAME
,
pp_args
,
tp_args
)
@pytest.mark.parametrize("PP_SIZE, MODEL_NAME", [
    (2, "JackFram/llama-160m"),
])
@pytest.mark.parametrize("ATTN_BACKEND", [
    "FLASH_ATTN",
    "FLASHINFER",
])
def test_pp_cudagraph(PP_SIZE, MODEL_NAME, ATTN_BACKEND):
    """Run the model with and without ``--enforce-eager`` under pipeline
    parallelism, once per attention backend.

    Both argument lists are handed to ``compare_two_settings``; they differ
    only in the ``--enforce-eager`` flag, so the non-eager run exercises the
    CUDA graph path.

    Args:
        PP_SIZE: value passed as ``--pipeline-parallel-size``.
        MODEL_NAME: model identifier forwarded to ``compare_two_settings``.
        ATTN_BACKEND: value exported as ``VLLM_ATTENTION_BACKEND`` for the
            duration of the comparison.
    """
    cudagraph_args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "float16",
        "--pipeline-parallel-size",
        str(PP_SIZE),
        "--distributed-executor-backend",
        "ray",
    ]
    eager_args = cudagraph_args + ["--enforce-eager"]

    # The backend is selected through the process environment. Remember the
    # previous value so the override does not leak into tests that run later
    # in the same process.
    # NOTE(review): assumes compare_two_settings finishes with the
    # environment before it returns -- confirm against its implementation.
    env_key = "VLLM_ATTENTION_BACKEND"
    previous_backend = os.environ.get(env_key)
    os.environ[env_key] = ATTN_BACKEND
    try:
        compare_two_settings(MODEL_NAME, eager_args, cudagraph_args)
    finally:
        if previous_backend is None:
            os.environ.pop(env_key, None)
        else:
            os.environ[env_key] = previous_backend
vllm/worker/model_runner.py
View file @
5e8ca973
...
...
@@ -1040,8 +1040,8 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
self
.
parallel_config
.
pipeline_parallel_size
):
for
batch_size
in
reversed
(
batch_size_capture_list
):
if
self
.
attn_backend
.
get_name
()
==
"flashinfer"
:
indptr_buffer
=
indptr_buffer
[:
batch_size
+
1
]
last_page_len_buffer
=
last_page_len_buffer
[:
_
indptr_buffer
=
indptr_buffer
[:
batch_size
+
1
]
_
last_page_len_buffer
=
last_page_len_buffer
[:
batch_size
]
num_qo_heads
=
(
...
...
@@ -1055,8 +1055,8 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
use_tensor_cores
=
False
decode_wrapper
=
\
CUDAGraphBatchDecodeWithPagedKVCacheWrapper
(
decode_workspace_buffer
,
indptr_buffer
,
indices_buffer
,
last_page_len_buffer
,
"NHD"
,
decode_workspace_buffer
,
_
indptr_buffer
,
indices_buffer
,
_
last_page_len_buffer
,
"NHD"
,
use_tensor_cores
)
kv_cache_dtype
=
get_kv_cache_torch_dtype
(
self
.
kv_cache_dtype
,
self
.
model_config
.
dtype
)
...
...
@@ -1131,10 +1131,10 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
self
.
model
,
self
.
attn_backend
.
get_name
())
if
self
.
attn_backend
.
get_name
()
==
"flashinfer"
:
graph_runner
.
flashinfer_indptr_buffer
=
indptr_buffer
graph_runner
.
flashinfer_indptr_buffer
=
_
indptr_buffer
graph_runner
.
flashinfer_indices_buffer
=
indices_buffer
graph_runner
.
flashinfer_last_page_len_buffer
=
\
last_page_len_buffer
_
last_page_len_buffer
graph_runner
.
flashinfer_decode_workspace_buffer
=
\
decode_workspace_buffer
graph_runner
.
flashinfer_decode_wrapper
=
\
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment