Unverified commit 94e167ea, authored by Lianmin Zheng and committed by GitHub

Fix the default chunked prefill size (#2268)

parent 262e370f
@@ -253,6 +253,8 @@ class Scheduler:
         # Init chunked prefill
         self.chunked_prefill_size = server_args.chunked_prefill_size
+        if self.chunked_prefill_size <= 0:  # -1 means disable
+            self.chunked_prefill_size = None
         self.being_chunked_req = None
         self.is_mixed_chunk = (
             self.chunked_prefill_size is not None and server_args.enable_mixed_chunk
...
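The Scheduler hunk above normalizes the sentinel coming from the CLI: any non-positive `chunked_prefill_size` (`-1` is the documented "disable" value) is stored as `None` internally, so later checks such as `self.chunked_prefill_size is not None` keep working. A minimal sketch of that normalization, using a hypothetical `normalize_chunked_prefill_size` helper that is not part of sglang:

```python
from typing import Optional


def normalize_chunked_prefill_size(value: int) -> Optional[int]:
    """Hypothetical helper mirroring the Scheduler logic in the diff:
    any value <= 0 (e.g. -1) means "chunked prefill disabled" and is
    represented internally as None."""
    return None if value <= 0 else value


if __name__ == "__main__":
    assert normalize_chunked_prefill_size(-1) is None      # explicit disable
    assert normalize_chunked_prefill_size(0) is None       # also treated as disabled
    assert normalize_chunked_prefill_size(8192) == 8192    # regular chunk size
    print("sentinel normalization behaves as in the Scheduler hunk")
```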
@@ -118,7 +118,7 @@ class ModelRunner:
             logger.info(
                 "Automatically turn off --chunked-prefill-size and adjust --mem-fraction-static for multimodal models."
             )
-            server_args.chunked_prefill_size = None
+            server_args.chunked_prefill_size = -1
             self.mem_fraction_static *= 0.95
         # TODO: qwen2-vl does not support radix cache now, set disable_radix_cache=True automatically
         if self.model_config.hf_config.architectures == [
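Because `None` now means "pick a default automatically" on `ServerArgs`, the multimodal override above switches to `-1`, the explicit "disabled" sentinel that the Scheduler later maps back to `None`. A hedged usage sketch of the two paths, assuming sglang is installed and a GPU is visible (`__post_init__` queries GPU memory); the model path is only a placeholder:

```python
# Sketch only, not sglang test code.
from sglang.srt.server_args import ServerArgs

# Explicitly disable chunked prefill: -1 is the "disable" sentinel after this commit,
# and __post_init__ leaves non-None values untouched.
args = ServerArgs(
    model_path="meta-llama/Llama-3.1-8B-Instruct",  # placeholder model
    chunked_prefill_size=-1,
)
print(args.chunked_prefill_size)  # -1; the Scheduler treats any value <= 0 as disabled

# Leave it unset: __post_init__ resolves None to 2048 or 8192 depending on GPU memory,
# and cuda_graph_max_bs similarly becomes 8 or 160.
auto_args = ServerArgs(model_path="meta-llama/Llama-3.1-8B-Instruct")
print(auto_args.chunked_prefill_size, auto_args.cuda_graph_max_bs)
```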
@@ -148,12 +148,14 @@ class ModelRunner:
         set_cpu_offload_max_bytes(int(server_args.cpu_offload_gb * 1024**3))

-        # Init components
+        # Get memory before model loading
         min_per_gpu_memory = self.init_torch_distributed()
+
+        # Load the model
         self.sampler = Sampler()
         self.load_model()

-        # Apply torch TP if model supports it
+        # Apply torch TP if the model supports it
         supports_torch_tp = getattr(self.model, "supports_torch_tp", False)
         if self.tp_size > 1 and supports_torch_tp:
             self.apply_torch_tp()
@@ -161,6 +163,7 @@ class ModelRunner:
         else:
             self.torch_tp_applied = False

+        # Init memory pool and attention backends
         if server_args.lora_paths is not None:
             self.init_lora_manager()
         self.init_memory_pool(
...
@@ -58,7 +58,7 @@ class ServerArgs:
     mem_fraction_static: Optional[float] = None
     max_running_requests: Optional[int] = None
     max_total_tokens: Optional[int] = None
-    chunked_prefill_size: int = 8192
+    chunked_prefill_size: Optional[int] = None
     max_prefill_tokens: int = 16384
     schedule_policy: str = "lpm"
     schedule_conservativeness: float = 1.0
@@ -128,7 +128,7 @@ class ServerArgs:
     enable_dp_attention: bool = False
     enable_torch_compile: bool = False
     torch_compile_max_bs: int = 32
-    cuda_graph_max_bs: int = 160
+    cuda_graph_max_bs: Optional[int] = None
     torchao_config: str = ""
     enable_nan_detection: bool = False
     enable_p2p_check: bool = False
@@ -144,14 +144,15 @@ class ServerArgs:
         if self.served_model_name is None:
             self.served_model_name = self.model_path

-        if self.chunked_prefill_size is not None and self.chunked_prefill_size <= 0:
-            # Disable chunked prefill
-            self.chunked_prefill_size = None
-
         if self.random_seed is None:
             self.random_seed = random.randint(0, 1 << 30)

-        # Mem fraction depends on the tensor parallelism size
+        if is_hip():
+            gpu_mem = get_amdgpu_memory_capacity()
+        else:
+            gpu_mem = get_nvgpu_memory_capacity()
+
+        # Set mem fraction static, which depends on the tensor parallelism size
         if self.mem_fraction_static is None:
             if self.tp_size >= 16:
                 self.mem_fraction_static = 0.79
@@ -164,18 +165,21 @@ class ServerArgs:
             else:
                 self.mem_fraction_static = 0.88

-        # Adjust for GPUs with small memory capacities
-        if is_hip():
-            gpu_mem = get_amdgpu_memory_capacity()
-        else:
-            gpu_mem = get_nvgpu_memory_capacity()
-
-        if gpu_mem < 25000:
-            logger.warning(
-                "Your GPU has less than 25GB memory. You may want to set a smaller --chunked-prefill-size (e.g., 512) to improve performance."
-            )
+        # Set chunked prefill size, which depends on the gpu memory capacity
+        if self.chunked_prefill_size is None:
+            if gpu_mem < 25_000:
+                self.chunked_prefill_size = 2048
+            else:
+                self.chunked_prefill_size = 8192
+
+        # Set cuda graph max batch size
+        if self.cuda_graph_max_bs is None:
+            if gpu_mem < 25_000:
+                self.cuda_graph_max_bs = 8
+            else:
+                self.cuda_graph_max_bs = 160

-        # Choose kernel backends
+        # Set kernel backends
         if not is_flashinfer_available():
             self.attention_backend = "triton"
             self.sampling_backend = "pytorch"
...
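The `__post_init__` hunks make both `chunked_prefill_size` and `cuda_graph_max_bs` "auto" by default, resolved from the detected GPU memory capacity in MB (`get_amdgpu_memory_capacity()` / `get_nvgpu_memory_capacity()`): GPUs under roughly 25 GB get 2048 / 8, larger ones keep the old 8192 / 160. A self-contained sketch of the same resolution pattern, with a hypothetical `detect_gpu_memory_mb` stub standing in for sglang's detection helpers:

```python
from dataclasses import dataclass
from typing import Optional


def detect_gpu_memory_mb() -> float:
    """Hypothetical stand-in for sglang's GPU memory detection helpers,
    which report the GPU memory capacity in MB."""
    return 24_576.0  # pretend we found a 24 GB GPU


@dataclass
class PrefillDefaults:
    """Toy mirror of the two new auto-resolved ServerArgs fields."""

    chunked_prefill_size: Optional[int] = None  # None means "pick for me"
    cuda_graph_max_bs: Optional[int] = None

    def __post_init__(self):
        gpu_mem = detect_gpu_memory_mb()
        # Same thresholds as the diff: GPUs under 25_000 MB get smaller defaults.
        if self.chunked_prefill_size is None:
            self.chunked_prefill_size = 2048 if gpu_mem < 25_000 else 8192
        if self.cuda_graph_max_bs is None:
            self.cuda_graph_max_bs = 8 if gpu_mem < 25_000 else 160


if __name__ == "__main__":
    print(PrefillDefaults())                          # auto: 2048 / 8 on a 24 GB GPU
    print(PrefillDefaults(chunked_prefill_size=-1))   # explicit -1 (disable) is left untouched
```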