Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
3ea2dc2e
Unverified
Commit
3ea2dc2e
authored
Oct 31, 2024
by
Roger Wang
Committed by
GitHub
Oct 31, 2024
Browse files
[Misc] Remove deprecated arg for cuda graph capture (#9864)
Signed-off-by:
Roger Wang
<
ywang@roblox.com
>
parent
d087bf86
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
1 addition
and
23 deletions
+1
-23
vllm/config.py
vllm/config.py
+0
-7
vllm/engine/arg_utils.py
vllm/engine/arg_utils.py
+0
-10
vllm/entrypoints/llm.py
vllm/entrypoints/llm.py
+0
-5
vllm/worker/model_runner.py
vllm/worker/model_runner.py
+1
-1
No files found.
vllm/config.py
View file @
3ea2dc2e
...
@@ -84,9 +84,6 @@ class ModelConfig:
...
@@ -84,9 +84,6 @@ class ModelConfig:
disable CUDA graph and always execute the model in eager mode.
disable CUDA graph and always execute the model in eager mode.
If False, we will use CUDA graph and eager execution in hybrid.
If False, we will use CUDA graph and eager execution in hybrid.
If None, the user did not specify, so default to False.
If None, the user did not specify, so default to False.
max_context_len_to_capture: Maximum context len covered by CUDA graphs.
When a sequence has context length larger than this, we fall back
to eager mode (DEPRECATED. Use max_seq_len_to_capture instead).
max_seq_len_to_capture: Maximum sequence len covered by CUDA graphs.
max_seq_len_to_capture: Maximum sequence len covered by CUDA graphs.
When a sequence has context length larger than this, we fall back
When a sequence has context length larger than this, we fall back
to eager mode. Additionally for encoder-decoder models, if the
to eager mode. Additionally for encoder-decoder models, if the
...
@@ -147,7 +144,6 @@ class ModelConfig:
...
@@ -147,7 +144,6 @@ class ModelConfig:
quantization
:
Optional
[
str
]
=
None
,
quantization
:
Optional
[
str
]
=
None
,
quantization_param_path
:
Optional
[
str
]
=
None
,
quantization_param_path
:
Optional
[
str
]
=
None
,
enforce_eager
:
Optional
[
bool
]
=
None
,
enforce_eager
:
Optional
[
bool
]
=
None
,
max_context_len_to_capture
:
Optional
[
int
]
=
None
,
max_seq_len_to_capture
:
Optional
[
int
]
=
None
,
max_seq_len_to_capture
:
Optional
[
int
]
=
None
,
max_logprobs
:
int
=
20
,
max_logprobs
:
int
=
20
,
disable_sliding_window
:
bool
=
False
,
disable_sliding_window
:
bool
=
False
,
...
@@ -181,9 +177,6 @@ class ModelConfig:
...
@@ -181,9 +177,6 @@ class ModelConfig:
self
.
quantization
=
quantization
self
.
quantization
=
quantization
self
.
quantization_param_path
=
quantization_param_path
self
.
quantization_param_path
=
quantization_param_path
self
.
enforce_eager
=
enforce_eager
self
.
enforce_eager
=
enforce_eager
if
max_context_len_to_capture
is
not
None
:
raise
ValueError
(
"`max_context_len_to_capture` is deprecated. "
"Use `max_seq_len_to_capture` instead."
)
self
.
max_seq_len_to_capture
=
max_seq_len_to_capture
self
.
max_seq_len_to_capture
=
max_seq_len_to_capture
self
.
max_logprobs
=
max_logprobs
self
.
max_logprobs
=
max_logprobs
self
.
disable_sliding_window
=
disable_sliding_window
self
.
disable_sliding_window
=
disable_sliding_window
...
...
vllm/engine/arg_utils.py
View file @
3ea2dc2e
...
@@ -126,7 +126,6 @@ class EngineArgs:
...
@@ -126,7 +126,6 @@ class EngineArgs:
tokenizer_revision
:
Optional
[
str
]
=
None
tokenizer_revision
:
Optional
[
str
]
=
None
quantization
:
Optional
[
str
]
=
None
quantization
:
Optional
[
str
]
=
None
enforce_eager
:
Optional
[
bool
]
=
None
enforce_eager
:
Optional
[
bool
]
=
None
max_context_len_to_capture
:
Optional
[
int
]
=
None
max_seq_len_to_capture
:
int
=
8192
max_seq_len_to_capture
:
int
=
8192
disable_custom_all_reduce
:
bool
=
False
disable_custom_all_reduce
:
bool
=
False
tokenizer_pool_size
:
int
=
0
tokenizer_pool_size
:
int
=
0
...
@@ -504,14 +503,6 @@ class EngineArgs:
...
@@ -504,14 +503,6 @@ class EngineArgs:
help
=
'Always use eager-mode PyTorch. If False, '
help
=
'Always use eager-mode PyTorch. If False, '
'will use eager mode and CUDA graph in hybrid '
'will use eager mode and CUDA graph in hybrid '
'for maximal performance and flexibility.'
)
'for maximal performance and flexibility.'
)
parser
.
add_argument
(
'--max-context-len-to-capture'
,
type
=
int
,
default
=
EngineArgs
.
max_context_len_to_capture
,
help
=
'Maximum context length covered by CUDA '
'graphs. When a sequence has context length '
'larger than this, we fall back to eager mode. '
'(DEPRECATED. Use --max-seq-len-to-capture instead'
')'
)
parser
.
add_argument
(
'--max-seq-len-to-capture'
,
parser
.
add_argument
(
'--max-seq-len-to-capture'
,
type
=
int
,
type
=
int
,
default
=
EngineArgs
.
max_seq_len_to_capture
,
default
=
EngineArgs
.
max_seq_len_to_capture
,
...
@@ -939,7 +930,6 @@ class EngineArgs:
...
@@ -939,7 +930,6 @@ class EngineArgs:
quantization
=
self
.
quantization
,
quantization
=
self
.
quantization
,
quantization_param_path
=
self
.
quantization_param_path
,
quantization_param_path
=
self
.
quantization_param_path
,
enforce_eager
=
self
.
enforce_eager
,
enforce_eager
=
self
.
enforce_eager
,
max_context_len_to_capture
=
self
.
max_context_len_to_capture
,
max_seq_len_to_capture
=
self
.
max_seq_len_to_capture
,
max_seq_len_to_capture
=
self
.
max_seq_len_to_capture
,
max_logprobs
=
self
.
max_logprobs
,
max_logprobs
=
self
.
max_logprobs
,
disable_sliding_window
=
self
.
disable_sliding_window
,
disable_sliding_window
=
self
.
disable_sliding_window
,
...
...
vllm/entrypoints/llm.py
View file @
3ea2dc2e
...
@@ -93,9 +93,6 @@ class LLM:
...
@@ -93,9 +93,6 @@ class LLM:
enforce_eager: Whether to enforce eager execution. If True, we will
enforce_eager: Whether to enforce eager execution. If True, we will
disable CUDA graph and always execute the model in eager mode.
disable CUDA graph and always execute the model in eager mode.
If False, we will use CUDA graph and eager execution in hybrid.
If False, we will use CUDA graph and eager execution in hybrid.
max_context_len_to_capture: Maximum context len covered by CUDA graphs.
When a sequence has context length larger than this, we fall back
to eager mode (DEPRECATED. Use `max_seq_len_to_capture` instead).
max_seq_len_to_capture: Maximum sequence len covered by CUDA graphs.
max_seq_len_to_capture: Maximum sequence len covered by CUDA graphs.
When a sequence has context length larger than this, we fall back
When a sequence has context length larger than this, we fall back
to eager mode. Additionally for encoder-decoder models, if the
to eager mode. Additionally for encoder-decoder models, if the
...
@@ -152,7 +149,6 @@ class LLM:
...
@@ -152,7 +149,6 @@ class LLM:
swap_space
:
float
=
4
,
swap_space
:
float
=
4
,
cpu_offload_gb
:
float
=
0
,
cpu_offload_gb
:
float
=
0
,
enforce_eager
:
Optional
[
bool
]
=
None
,
enforce_eager
:
Optional
[
bool
]
=
None
,
max_context_len_to_capture
:
Optional
[
int
]
=
None
,
max_seq_len_to_capture
:
int
=
8192
,
max_seq_len_to_capture
:
int
=
8192
,
disable_custom_all_reduce
:
bool
=
False
,
disable_custom_all_reduce
:
bool
=
False
,
disable_async_output_proc
:
bool
=
False
,
disable_async_output_proc
:
bool
=
False
,
...
@@ -193,7 +189,6 @@ class LLM:
...
@@ -193,7 +189,6 @@ class LLM:
swap_space
=
swap_space
,
swap_space
=
swap_space
,
cpu_offload_gb
=
cpu_offload_gb
,
cpu_offload_gb
=
cpu_offload_gb
,
enforce_eager
=
enforce_eager
,
enforce_eager
=
enforce_eager
,
max_context_len_to_capture
=
max_context_len_to_capture
,
max_seq_len_to_capture
=
max_seq_len_to_capture
,
max_seq_len_to_capture
=
max_seq_len_to_capture
,
disable_custom_all_reduce
=
disable_custom_all_reduce
,
disable_custom_all_reduce
=
disable_custom_all_reduce
,
disable_async_output_proc
=
disable_async_output_proc
,
disable_async_output_proc
=
disable_async_output_proc
,
...
...
vllm/worker/model_runner.py
View file @
3ea2dc2e
...
@@ -995,7 +995,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
...
@@ -995,7 +995,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
# Python can be expensive. To optimize this, we cache the block table
# Python can be expensive. To optimize this, we cache the block table
# in numpy and only copy the actual input content at every iteration.
# in numpy and only copy the actual input content at every iteration.
# The shape of the cached block table will be
# The shape of the cached block table will be
# (max batch size to capture, max
context
len to capture / block size).
# (max batch size to capture, max
seq
len to capture / block size).
self
.
graph_block_tables
=
np
.
zeros
(
self
.
graph_block_tables
=
np
.
zeros
(
(
self
.
max_batchsize_to_capture
,
self
.
get_max_block_per_batch
()),
(
self
.
max_batchsize_to_capture
,
self
.
get_max_block_per_batch
()),
dtype
=
np
.
int32
)
dtype
=
np
.
int32
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment