Commit 72273242 authored by zhuwenwen
Browse files

update max_seq_len_to_capture

parent 267cc5ff
......@@ -313,7 +313,8 @@ class ModelConfig:
graph and always execute the model in eager mode. If False, we will use
CUDA graph and eager execution in hybrid for maximal performance and
flexibility."""
max_seq_len_to_capture: int = 8192
# Defaults to None (previously a fixed 8192) so that _verify_cuda_graph can
# substitute max_model_len when no explicit value is given. The value is an
# integer sequence length or None, never a boolean.
# NOTE(review): requires `Optional` from `typing` to be in scope - confirm
# against this module's imports.
max_seq_len_to_capture: Optional[int] = None
"""Maximum sequence len covered by CUDA graphs. When a sequence has context
length larger than this, we fall back to eager mode. Additionally for
encoder-decoder models, if the sequence length of the encoder input is
......@@ -973,9 +974,11 @@ class ModelConfig:
"non-quantized models.", self.quantization)
def _verify_cuda_graph(self) -> None:
# self.max_seq_len_to_capture = min(self.max_seq_len_to_capture,
# self.max_model_len)
if self.max_seq_len_to_capture is None:
self.max_seq_len_to_capture = self.max_model_len
self.max_seq_len_to_capture = min(self.max_seq_len_to_capture,
self.max_model_len)
# self.max_seq_len_to_capture = self.max_model_len
# CUDAGraph capture not supported for enc-dec models and mllama on ROCm
ROCM_UNSUPPORTED_MODELS = ['mllama']
unsupported_rocm = (self.hf_config.model_type
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment