Commit b35835a1 authored by zhuwenwen
Browse files

[Fix] MLA only supports decode-only full CUDAGraph capture. Make sure all...

[Fix] MLA only supports decode-only full CUDAGraph capture. Make sure all cudagraph capture sizes <= max_num_seq.
parent e4a84fdc
...@@ -4745,7 +4745,10 @@ class VllmConfig: ...@@ -4745,7 +4745,10 @@ class VllmConfig:
batch_size_capture_list = [] batch_size_capture_list = []
if self.model_config is not None and \ if self.model_config is not None and \
not self.model_config.enforce_eager: not self.model_config.enforce_eager:
cuda_graph_sizes = self.scheduler_config.cuda_graph_sizes if self.model_config.use_mla and self.compilation_config.full_cuda_graph:
cuda_graph_sizes = [256]
else:
cuda_graph_sizes = self.scheduler_config.cuda_graph_sizes
if len(cuda_graph_sizes) == 1: if len(cuda_graph_sizes) == 1:
batch_size_capture_list = [1, 2, 4] + [ batch_size_capture_list = [1, 2, 4] + [
i for i in range(8, cuda_graph_sizes[0] + 1, 8) i for i in range(8, cuda_graph_sizes[0] + 1, 8)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment