Unverified commit 6406408a, authored by Lianmin Zheng and committed by GitHub

Clean up server_args.py (#7037)

parent 019851d0
@@ -118,7 +118,7 @@ def _compile_warning_1():
     if not _IN_PRECOMPILE_STAGE and _IS_FIRST_RANK_ON_NODE:
         logger.warning(
             "Entering DeepGEMM JIT Pre-Compile session. "
-            "And it may takes a long time(Typically 10-20 mins) "
+            "It may take a long time (typically 10-20 mins) "
             "if you have not run `sglang.compile_deep_gemm`. "
             "It is recommended to run `sglang.compile_deep_gemm` with same args as `sglang.launch_server`"
             " for pre-compilation to reduce the overhead if you have not run it before. "
......
@@ -72,32 +72,33 @@ INIT_INCREMENTAL_DETOKENIZATION_OFFSET = 5
 GLOBAL_SERVER_ARGS_KEYS = [
     "attention_backend",
+    "mm_attention_backend",
     "debug_tensor_dump_inject",
     "debug_tensor_dump_output_folder",
     "chunked_prefill_size",
+    "deepep_mode",
     "device",
     "disable_chunked_prefix_cache",
     "disable_radix_cache",
+    "enable_deepep_moe",
     "enable_dp_attention",
     "enable_two_batch_overlap",
     "enable_dp_lm_head",
-    "enable_deepep_moe",
-    "deepep_mode",
     "enable_ep_moe",
+    "moe_dense_tp_size",
+    "ep_dispatch_algorithm",
     "deepep_config",
+    "ep_num_redundant_experts",
     "enable_nan_detection",
     "flashinfer_mla_disable_ragged",
     "max_micro_batch_size",
-    "moe_dense_tp_size",
-    "ep_dispatch_algorithm",
     "disable_shared_experts_fusion",
     "sampling_backend",
     "speculative_accept_threshold_acc",
     "speculative_accept_threshold_single",
     "torchao_config",
     "triton_attention_reduce_in_fp32",
-    "ep_num_redundant_experts",
-    "mm_attention_backend",
+    "num_reserved_decode_tokens",
 ]
 # Put some global args for easy access
......
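For context, `GLOBAL_SERVER_ARGS_KEYS` is the whitelist of server arguments that gets mirrored into a module-level dict (per the "Put some global args for easy access" comment above) so model code can read them without threading the full args object through every call. A minimal sketch of that pattern, assuming the dict is populated with `getattr` over these keys; `DemoArgs` below is an illustrative stand-in, not the repository's `ServerArgs`:

```python
from dataclasses import dataclass

# Illustrative subset; the real list and ServerArgs live in sglang.srt.
GLOBAL_SERVER_ARGS_KEYS = ["attention_backend", "num_reserved_decode_tokens"]


@dataclass
class DemoArgs:  # hypothetical stand-in for sglang's ServerArgs
    attention_backend: str = "flashinfer"
    num_reserved_decode_tokens: int = 512


def build_global_args_dict(args) -> dict:
    # Copy only the whitelisted keys so the globally shared state stays explicit.
    return {k: getattr(args, k) for k in GLOBAL_SERVER_ARGS_KEYS}


print(build_global_args_dict(DemoArgs()))
# -> {'attention_backend': 'flashinfer', 'num_reserved_decode_tokens': 512}
```

Keeping the whitelist sorted-ish and grouped, as this commit does, makes it easier to spot which new server arguments are actually exposed globally.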
@@ -17,12 +17,14 @@ from __future__ import annotations

 import bisect
 import inspect
+import logging
 import os
 from contextlib import contextmanager
 from typing import TYPE_CHECKING, Callable, Optional, Union

 import torch
 import tqdm
+from torch.profiler import ProfilerActivity, profile

 from sglang.srt.custom_op import CustomOp
 from sglang.srt.distributed import get_tensor_model_parallel_rank
@@ -40,11 +42,14 @@ from sglang.srt.model_executor.forward_batch_info import (
 from sglang.srt.patch_torch import monkey_patch_torch_compile
 from sglang.srt.two_batch_overlap import TboCudaGraphRunnerPlugin
 from sglang.srt.utils import (
+    empty_context,
     get_available_gpu_memory,
     get_device_memory_capacity,
     rank0_log,
 )

+logger = logging.getLogger(__name__)
+
 if TYPE_CHECKING:
     from sglang.srt.model_executor.model_runner import ModelRunner
@@ -207,6 +212,9 @@ class CudaGraphRunner:
             model_runner.server_args.enable_two_batch_overlap
         )
         self.speculative_algorithm = model_runner.server_args.speculative_algorithm
+        self.enable_profile_cuda_graph = (
+            model_runner.server_args.enable_profile_cuda_graph
+        )
         self.tp_size = model_runner.server_args.tp_size
         self.dp_size = model_runner.server_args.dp_size
         self.pp_size = model_runner.server_args.pp_size
@@ -339,44 +347,67 @@ class CudaGraphRunner:
         return is_bs_supported and is_encoder_lens_supported and is_tbo_supported

-    def capture(self):
-        with graph_capture() as graph_capture_context:
-            self.stream = graph_capture_context.stream
-            avail_mem = get_available_gpu_memory(
-                self.model_runner.device, self.model_runner.gpu_id, empty_cache=False
-            )
-            # Reverse the order to enable better memory sharing across cuda graphs.
-            capture_range = (
-                tqdm.tqdm(list(reversed(self.capture_bs)))
-                if get_tensor_model_parallel_rank() == 0
-                else reversed(self.capture_bs)
-            )
-            for bs in capture_range:
-                if get_tensor_model_parallel_rank() == 0:
-                    avail_mem = get_available_gpu_memory(
-                        self.model_runner.device,
-                        self.model_runner.gpu_id,
-                        empty_cache=False,
-                    )
-                    capture_range.set_description(
-                        f"Capturing batches ({avail_mem=:.2f} GB)"
-                    )
-                with patch_model(
-                    self.model_runner.model,
-                    bs in self.compile_bs,
-                    num_tokens=bs * self.num_tokens_per_bs,
-                    tp_group=self.model_runner.tp_group,
-                ) as forward:
-                    (
-                        graph,
-                        output_buffers,
-                    ) = self.capture_one_batch_size(bs, forward)
-                    self.graphs[bs] = graph
-                    self.output_buffers[bs] = output_buffers
-                # Save gemlite cache after each capture
-                save_gemlite_cache()
+    def capture(self) -> None:
+        profile_context = empty_context()
+        if self.enable_profile_cuda_graph:
+            profile_context = profile(
+                activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
+                record_shapes=True,
+            )
+        with graph_capture() as graph_capture_context:
+            with profile_context as prof:
+                self.stream = graph_capture_context.stream
+                avail_mem = get_available_gpu_memory(
+                    self.model_runner.device,
+                    self.model_runner.gpu_id,
+                    empty_cache=False,
+                )
+                # Reverse the order to enable better memory sharing across cuda graphs.
+                capture_range = (
+                    tqdm.tqdm(list(reversed(self.capture_bs)))
+                    if get_tensor_model_parallel_rank() == 0
+                    else reversed(self.capture_bs)
+                )
+                for i, bs in enumerate(capture_range):
+                    if get_tensor_model_parallel_rank() == 0:
+                        avail_mem = get_available_gpu_memory(
+                            self.model_runner.device,
+                            self.model_runner.gpu_id,
+                            empty_cache=False,
+                        )
+                        capture_range.set_description(
+                            f"Capturing batches ({avail_mem=:.2f} GB)"
+                        )
+                    with patch_model(
+                        self.model_runner.model,
+                        bs in self.compile_bs,
+                        num_tokens=bs * self.num_tokens_per_bs,
+                        tp_group=self.model_runner.tp_group,
+                    ) as forward:
+                        (
+                            graph,
+                            output_buffers,
+                        ) = self.capture_one_batch_size(bs, forward)
+                        self.graphs[bs] = graph
+                        self.output_buffers[bs] = output_buffers
+                    # Save gemlite cache after each capture
+                    save_gemlite_cache()
+
+        if self.enable_profile_cuda_graph:
+            log_message = (
+                "Sorted by CUDA Time:\n"
+                + prof.key_averages(group_by_input_shape=True).table(
+                    sort_by="cuda_time_total", row_limit=10
+                )
+                + "\n\nSorted by CPU Time:\n"
+                + prof.key_averages(group_by_input_shape=True).table(
+                    sort_by="cpu_time_total", row_limit=10
+                )
+            )
+            logger.info(log_message)

     def capture_one_batch_size(self, bs: int, forward: Callable):
         graph = torch.cuda.CUDAGraph()
@@ -443,7 +474,7 @@ class CudaGraphRunner:
             token_to_kv_pool=self.model_runner.token_to_kv_pool,
             attn_backend=self.model_runner.attn_backend,
             out_cache_loc=out_cache_loc,
-            seq_lens_sum=seq_lens.sum(),
+            seq_lens_sum=seq_lens.sum().item(),
             encoder_lens=encoder_lens,
             return_logprob=False,
             positions=positions,
......
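The new `--enable-profile-cuda-graph` path above wraps the whole capture loop in `torch.profiler.profile` and then logs two `key_averages()` tables. Below is a self-contained sketch of the same pattern, using `contextlib.nullcontext` in place of sglang's `empty_context` (assumed to behave like an equivalent no-op context manager) and a dummy workload instead of CUDA graph capture:

```python
from contextlib import nullcontext

import torch
from torch.profiler import ProfilerActivity, profile


def run_with_optional_profile(fn, enable_profile: bool = True):
    # No-op context unless profiling is requested, mirroring the diff's
    # `profile_context = empty_context()` fallback.
    profile_context = nullcontext()
    if enable_profile:
        activities = [ProfilerActivity.CPU]
        if torch.cuda.is_available():
            activities.append(ProfilerActivity.CUDA)
        profile_context = profile(activities=activities, record_shapes=True)

    with profile_context as prof:
        fn()

    if enable_profile:
        # "cuda_time_total" is also a valid sort key when CUDA events were recorded.
        print(
            prof.key_averages(group_by_input_shape=True).table(
                sort_by="cpu_time_total", row_limit=10
            )
        )


run_with_optional_profile(lambda: torch.randn(256, 256) @ torch.randn(256, 256))
```

Reassigning the context variable before the `with` block keeps the hot path free of profiler overhead when the flag is off, while the profiled path still shares the exact same capture code.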
@@ -112,14 +112,12 @@ class ServerArgs:
     file_storage_path: str = "sglang_storage"
     enable_cache_report: bool = False
     reasoning_parser: Optional[str] = None
+    tool_call_parser: Optional[str] = None

     # Data parallelism
     dp_size: int = 1
     load_balance_method: str = "round_robin"

-    # Expert parallelism
-    ep_size: int = 1
-
     # Multi-node distributed serving
     dist_init_addr: Optional[str] = None
     nnodes: int = 1
@@ -138,6 +136,7 @@ class ServerArgs:
     attention_backend: Optional[str] = None
     sampling_backend: Optional[str] = None
     grammar_backend: Optional[str] = None
+    mm_attention_backend: Optional[str] = None

     # Speculative decoding
     speculative_algorithm: Optional[str] = None
@@ -149,6 +148,26 @@ class ServerArgs:
     speculative_accept_threshold_acc: float = 1.0
     speculative_token_map: Optional[str] = None

+    # Expert parallelism
+    ep_size: int = 1
+    enable_ep_moe: bool = False
+    enable_deepep_moe: bool = False
+    deepep_mode: Optional[Literal["auto", "normal", "low_latency"]] = "auto"
+    ep_num_redundant_experts: int = 0
+    ep_dispatch_algorithm: Optional[Literal["static", "dynamic", "fake"]] = None
+    init_expert_location: str = "trivial"
+    enable_eplb: bool = False
+    eplb_algorithm: str = "auto"
+    eplb_rebalance_num_iterations: int = 1000
+    eplb_rebalance_layers_per_chunk: Optional[int] = None
+    expert_distribution_recorder_mode: Optional[
+        Literal["stat", "stat_approx", "per_pass", "per_token"]
+    ] = None
+    expert_distribution_recorder_buffer_size: Optional[int] = None
+    enable_expert_distribution_metrics: bool = False
+    deepep_config: Optional[str] = None
+    moe_dense_tp_size: Optional[int] = None
+
     # Double Sparsity
     enable_double_sparsity: bool = False
     ds_channel_config_path: Optional[str] = None
@@ -159,38 +178,24 @@ class ServerArgs:
     # Optimization/debug options
     disable_radix_cache: bool = False
+    cuda_graph_max_bs: Optional[int] = None
+    cuda_graph_bs: Optional[List[int]] = None
     disable_cuda_graph: bool = False
     disable_cuda_graph_padding: bool = False
+    enable_profile_cuda_graph: bool = False
     enable_nccl_nvls: bool = False
     enable_tokenizer_batch_encode: bool = False
     disable_outlines_disk_cache: bool = False
     disable_custom_all_reduce: bool = False
     enable_mscclpp: bool = False
     disable_overlap_schedule: bool = False
+    disable_overlap_cg_plan: bool = False
     enable_mixed_chunk: bool = False
     enable_dp_attention: bool = False
     enable_dp_lm_head: bool = False
     enable_two_batch_overlap: bool = False
-    enable_ep_moe: bool = False
-    enable_deepep_moe: bool = False
-    deepep_mode: Optional[Literal["auto", "normal", "low_latency"]] = "auto"
-    ep_num_redundant_experts: int = 0
-    ep_dispatch_algorithm: Optional[Literal["static", "dynamic", "fake"]] = None
-    init_expert_location: str = "trivial"
-    enable_eplb: bool = False
-    eplb_algorithm: str = "auto"
-    eplb_rebalance_num_iterations: int = 1000
-    eplb_rebalance_layers_per_chunk: Optional[int] = None
-    expert_distribution_recorder_mode: Optional[
-        Literal["stat", "stat_approx", "per_pass", "per_token"]
-    ] = None
-    expert_distribution_recorder_buffer_size: Optional[int] = None
-    enable_expert_distribution_metrics: bool = False
-    deepep_config: Optional[str] = None
     enable_torch_compile: bool = False
     torch_compile_max_bs: int = 32
-    cuda_graph_max_bs: Optional[int] = None
-    cuda_graph_bs: Optional[List[int]] = None
     torchao_config: str = ""
     enable_nan_detection: bool = False
     enable_p2p_check: bool = False
@@ -201,29 +206,28 @@ class ServerArgs:
     enable_memory_saver: bool = False
     allow_auto_truncate: bool = False
     enable_custom_logit_processor: bool = False
-    tool_call_parser: Optional[str] = None
     enable_hierarchical_cache: bool = False
     hicache_ratio: float = 2.0
     hicache_size: int = 0
     hicache_write_policy: str = "write_through_selective"
     flashinfer_mla_disable_ragged: bool = False
-    warmups: Optional[str] = None
-    moe_dense_tp_size: Optional[int] = None
     disable_shared_experts_fusion: bool = False
     disable_chunked_prefix_cache: bool = False
     disable_fast_image_processor: bool = False
-    mm_attention_backend: Optional[str] = None
+    warmups: Optional[str] = None

     # Debug tensor dumps
     debug_tensor_dump_output_folder: Optional[str] = None
     debug_tensor_dump_input_file: Optional[str] = None
     debug_tensor_dump_inject: bool = False
+    debug_tensor_dump_prefill_only: bool = False

     # For PD disaggregation: can be "null" (not disaggregated), "prefill" (prefill-only), or "decode" (decode-only)
     disaggregation_mode: str = "null"
-    disaggregation_bootstrap_port: int = 8998
     disaggregation_transfer_backend: str = "mooncake"
+    disaggregation_bootstrap_port: int = 8998
     disaggregation_ib_device: Optional[str] = None
+    num_reserved_decode_tokens: int = 512  # used for decode kv cache offload in PD
     pdlb_url: Optional[str] = None

     def __post_init__(self):
@@ -390,7 +394,7 @@ class ServerArgs:
         if self.enable_eplb and (self.expert_distribution_recorder_mode is None):
             self.expert_distribution_recorder_mode = "stat"
             logger.info(
-                f"EPLB is enabled. The expert_distribution_recorder_mode is automatically set."
+                "EPLB is enabled. The expert_distribution_recorder_mode is automatically set."
             )

         if (self.enable_eplb or (self.init_expert_location is not None)) and (
@@ -398,7 +402,7 @@ class ServerArgs:
         ):
             self.ep_dispatch_algorithm = "static"
             logger.info(
-                f"EPLB is enabled or init_expert_location is provided. ep_dispatch_algorithm is configured."
+                "EPLB is enabled or init_expert_location is provided. ep_dispatch_algorithm is configured."
             )

         if self.enable_expert_distribution_metrics and (
@@ -929,6 +933,13 @@ class ServerArgs:
             default=ServerArgs.reasoning_parser,
             help=f"Specify the parser for reasoning models, supported parsers are: {list(ReasoningParser.DetectorMap.keys())}.",
         )
+        parser.add_argument(
+            "--tool-call-parser",
+            type=str,
+            choices=["qwen25", "mistral", "llama3", "deepseekv3", "pythonic"],
+            default=ServerArgs.tool_call_parser,
+            help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', and 'pythonic'.",
+        )

         # Data parallelism
         parser.add_argument(
@@ -949,15 +960,6 @@ class ServerArgs:
             ],
         )

-        # Expert parallelism
-        parser.add_argument(
-            "--expert-parallel-size",
-            "--ep-size",
-            type=int,
-            default=ServerArgs.ep_size,
-            help="The expert parallelism size.",
-        )
-
         # Multi-node distributed serving
         parser.add_argument(
             "--dist-init-addr",
@@ -1038,21 +1040,6 @@ class ServerArgs:
             default=ServerArgs.grammar_backend,
             help="Choose the backend for grammar-guided decoding.",
         )
-        parser.add_argument(
-            "--enable-flashinfer-mla",
-            action=DeprecatedAction,
-            help="--enable-flashinfer-mla is deprecated. Please use '--attention-backend flashinfer' instead.",
-        )
-        parser.add_argument(
-            "--enable-flashmla",
-            action=DeprecatedAction,
-            help="--enable-flashmla is deprecated. Please use '--attention-backend flashmla' instead.",
-        )
-        parser.add_argument(
-            "--flashinfer-mla-disable-ragged",
-            action="store_true",
-            help="Not using ragged prefill wrapper when running flashinfer mla",
-        )

         # Speculative decoding
         parser.add_argument(
@@ -1102,6 +1089,109 @@ class ServerArgs:
             help="The path of the draft model's small vocab table.",
             default=ServerArgs.speculative_token_map,
         )
+        parser.add_argument(
+            "--mm-attention-backend",
+            type=str,
+            choices=["sdpa", "fa3", "triton_attn"],
+            default=ServerArgs.mm_attention_backend,
+            help="Set multimodal attention backend.",
+        )
+
+        # Expert parallelism
+        parser.add_argument(
+            "--expert-parallel-size",
+            "--ep-size",
+            type=int,
+            default=ServerArgs.ep_size,
+            help="The expert parallelism size.",
+        )
+        parser.add_argument(
+            "--enable-ep-moe",
+            action="store_true",
+            help="Enabling expert parallelism for moe. The ep size is equal to the tp size.",
+        )
+        parser.add_argument(
+            "--enable-deepep-moe",
+            action="store_true",
+            help="Enabling DeepEP MoE implementation for EP MoE.",
+        )
+        parser.add_argument(
+            "--deepep-mode",
+            type=str,
+            choices=["normal", "low_latency", "auto"],
+            default="auto",
+            help="Select the mode when enable DeepEP MoE, could be `normal`, `low_latency` or `auto`. Default is `auto`, which means `low_latency` for decode batch and `normal` for prefill batch.",
+        )
+        parser.add_argument(
+            "--ep-num-redundant-experts",
+            type=int,
+            default=ServerArgs.ep_num_redundant_experts,
+            help="Allocate this number of redundant experts in expert parallel.",
+        )
+        parser.add_argument(
+            "--ep-dispatch-algorithm",
+            type=str,
+            default=ServerArgs.ep_dispatch_algorithm,
+            help="The algorithm to choose ranks for redundant experts in expert parallel.",
+        )
+        parser.add_argument(
+            "--init-expert-location",
+            type=str,
+            default=ServerArgs.init_expert_location,
+            help="Initial location of EP experts.",
+        )
+        parser.add_argument(
+            "--enable-eplb",
+            action="store_true",
+            help="Enable EPLB algorithm",
+        )
+        parser.add_argument(
+            "--eplb-algorithm",
+            type=str,
+            default=ServerArgs.eplb_algorithm,
+            help="Chosen EPLB algorithm",
+        )
+        parser.add_argument(
+            "--eplb-rebalance-num-iterations",
+            type=int,
+            default=ServerArgs.eplb_rebalance_num_iterations,
+            help="Number of iterations to automatically trigger a EPLB re-balance.",
+        )
+        parser.add_argument(
+            "--eplb-rebalance-layers-per-chunk",
+            type=int,
+            default=ServerArgs.eplb_rebalance_layers_per_chunk,
+            help="Number of layers to rebalance per forward pass.",
+        )
+        parser.add_argument(
+            "--expert-distribution-recorder-mode",
+            type=str,
+            default=ServerArgs.expert_distribution_recorder_mode,
+            help="Mode of expert distribution recorder.",
+        )
+        parser.add_argument(
+            "--expert-distribution-recorder-buffer-size",
+            type=int,
+            default=ServerArgs.expert_distribution_recorder_buffer_size,
+            help="Circular buffer size of expert distribution recorder. Set to -1 to denote infinite buffer.",
+        )
+        parser.add_argument(
+            "--enable-expert-distribution-metrics",
+            action="store_true",
+            help="Enable logging metrics for expert balancedness",
+        )
+        parser.add_argument(
+            "--deepep-config",
+            type=str,
+            default=ServerArgs.deepep_config,
+            help="Tuned DeepEP config suitable for your own cluster. It can be either a string with JSON content or a file path.",
+        )
+        parser.add_argument(
+            "--moe-dense-tp-size",
+            type=int,
+            default=ServerArgs.moe_dense_tp_size,
+            help="TP size for MoE dense MLP layers. This flag is useful when, with large TP size, there are errors caused by weights in MLP layers having dimension smaller than the min dimension GEMM supports.",
+        )

         # Double Sparsity
         parser.add_argument(
@@ -1146,6 +1236,18 @@ class ServerArgs:
             action="store_true",
             help="Disable RadixAttention for prefix caching.",
         )
+        parser.add_argument(
+            "--cuda-graph-max-bs",
+            type=int,
+            default=ServerArgs.cuda_graph_max_bs,
+            help="Set the maximum batch size for cuda graph. It will extend the cuda graph capture batch size to this value.",
+        )
+        parser.add_argument(
+            "--cuda-graph-bs",
+            type=int,
+            nargs="+",
+            help="Set the list of batch sizes for cuda graph.",
+        )
         parser.add_argument(
             "--disable-cuda-graph",
             action="store_true",
@@ -1156,6 +1258,11 @@ class ServerArgs:
             action="store_true",
             help="Disable cuda graph when padding is needed. Still uses cuda graph when padding is not needed.",
         )
+        parser.add_argument(
+            "--enable-profile-cuda-graph",
+            action="store_true",
+            help="Enable profiling of cuda graph capture.",
+        )
         parser.add_argument(
             "--enable-nccl-nvls",
             action="store_true",
@@ -1186,6 +1293,11 @@ class ServerArgs:
             action="store_true",
             help="Disable the overlap scheduler, which overlaps the CPU scheduler with GPU model worker.",
         )
+        parser.add_argument(
+            "--disable-overlap-cg-plan",
+            action="store_true",
+            help="Disable the overlap optimization for cudagraph preparation in eagle verify.",
+        )
         parser.add_argument(
             "--enable-mixed-chunk",
             action="store_true",
@@ -1201,11 +1313,6 @@ class ServerArgs:
             action="store_true",
             help="Enable vocabulary parallel across the attention TP group to avoid all-gather across DP groups, optimizing performance under DP attention.",
         )
-        parser.add_argument(
-            "--enable-ep-moe",
-            action="store_true",
-            help="Enabling expert parallelism for moe. The ep size is equal to the tp size.",
-        )
         parser.add_argument(
             "--enable-two-batch-overlap",
             action="store_true",
@@ -1222,18 +1329,6 @@ class ServerArgs:
             default=ServerArgs.torch_compile_max_bs,
             help="Set the maximum batch size when using torch compile.",
         )
-        parser.add_argument(
-            "--cuda-graph-max-bs",
-            type=int,
-            default=ServerArgs.cuda_graph_max_bs,
-            help="Set the maximum batch size for cuda graph. It will extend the cuda graph capture batch size to this value.",
-        )
-        parser.add_argument(
-            "--cuda-graph-bs",
-            type=int,
-            nargs="+",
-            help="Set the list of batch sizes for cuda graph.",
-        )
         parser.add_argument(
             "--torchao-config",
             type=str,
@@ -1290,13 +1385,6 @@ class ServerArgs:
             action="store_true",
             help="Enable users to pass custom logit processors to the server (disabled by default for security)",
         )
-        parser.add_argument(
-            "--tool-call-parser",
-            type=str,
-            choices=["qwen25", "mistral", "llama3", "deepseekv3", "pythonic"],
-            default=ServerArgs.tool_call_parser,
-            help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', and 'pythonic'.",
-        )
         parser.add_argument(
             "--enable-hierarchical-cache",
             action="store_true",
@@ -1322,86 +1410,9 @@ class ServerArgs:
             help="The write policy of hierarchical cache.",
         )
-        parser.add_argument(
-            "--enable-deepep-moe",
-            action="store_true",
-            help="Enabling DeepEP MoE implementation for EP MoE.",
-        )
-        parser.add_argument(
-            "--moe-dense-tp-size",
-            type=int,
-            default=ServerArgs.moe_dense_tp_size,
-            help="TP size for MoE dense MLP layers. This flag is useful when, with large TP size, there are errors caused by weights in MLP layers having dimension smaller than the min dimension GEMM supports.",
-        )
-        parser.add_argument(
-            "--deepep-mode",
-            type=str,
-            choices=["normal", "low_latency", "auto"],
-            default="auto",
-            help="Select the mode when enable DeepEP MoE, could be `normal`, `low_latency` or `auto`. Default is `auto`, which means `low_latency` for decode batch and `normal` for prefill batch.",
-        )
-        parser.add_argument(
-            "--ep-num-redundant-experts",
-            type=int,
-            default=ServerArgs.ep_num_redundant_experts,
-            help="Allocate this number of redundant experts in expert parallel.",
-        )
-        parser.add_argument(
-            "--ep-dispatch-algorithm",
-            type=str,
-            default=ServerArgs.ep_dispatch_algorithm,
-            help="The algorithm to choose ranks for redundant experts in expert parallel.",
-        )
-        parser.add_argument(
-            "--init-expert-location",
-            type=str,
-            default=ServerArgs.init_expert_location,
-            help="Initial location of EP experts.",
-        )
-        parser.add_argument(
-            "--enable-eplb",
-            action="store_true",
-            help="Enable EPLB algorithm",
-        )
-        parser.add_argument(
-            "--eplb-algorithm",
-            type=str,
-            default=ServerArgs.eplb_algorithm,
-            help="Chosen EPLB algorithm",
-        )
-        parser.add_argument(
-            "--eplb-rebalance-num-iterations",
-            type=int,
-            default=ServerArgs.eplb_rebalance_num_iterations,
-            help="Number of iterations to automatically trigger a EPLB re-balance.",
-        )
-        parser.add_argument(
-            "--eplb-rebalance-layers-per-chunk",
-            type=int,
-            default=ServerArgs.eplb_rebalance_layers_per_chunk,
-            help="Number of layers to rebalance per forward pass.",
-        )
-        parser.add_argument(
-            "--expert-distribution-recorder-mode",
-            type=str,
-            default=ServerArgs.expert_distribution_recorder_mode,
-            help="Mode of expert distribution recorder.",
-        )
-        parser.add_argument(
-            "--expert-distribution-recorder-buffer-size",
-            type=int,
-            default=ServerArgs.expert_distribution_recorder_buffer_size,
-            help="Circular buffer size of expert distribution recorder. Set to -1 to denote infinite buffer.",
-        )
-        parser.add_argument(
-            "--enable-expert-distribution-metrics",
-            action="store_true",
-            help="Enable logging metrics for expert balancedness",
-        )
-        parser.add_argument(
-            "--deepep-config",
-            type=str,
-            default=ServerArgs.deepep_config,
-            help="Tuned DeepEP config suitable for your own cluster. It can be either a string with JSON content or a file path.",
-        )
+        parser.add_argument(
+            "--flashinfer-mla-disable-ragged",
+            action="store_true",
+            help="Not using ragged prefill wrapper when running flashinfer mla",
+        )
         parser.add_argument(
             "--disable-shared-experts-fusion",
@@ -1418,8 +1429,6 @@ class ServerArgs:
             action="store_true",
             help="Adopt base image processor instead of fast image processor.",
         )
-
-        # Server warmups
         parser.add_argument(
             "--warmups",
             type=str,
@@ -1447,6 +1456,11 @@ class ServerArgs:
             default=ServerArgs.debug_tensor_dump_inject,
             help="Inject the outputs from jax as the input of every layer.",
         )
+        parser.add_argument(
+            "--debug-tensor-dump-prefill-only",
+            action="store_true",
+            help="Only dump the tensors for prefill requests (i.e. batch size > 1).",
+        )

         # Disaggregation
         parser.add_argument(
@@ -1456,12 +1470,6 @@ class ServerArgs:
             choices=["null", "prefill", "decode"],
             help='Only used for PD disaggregation. "prefill" for prefill-only server, and "decode" for decode-only server. If not specified, it is not PD disaggregated',
         )
-        parser.add_argument(
-            "--disaggregation-bootstrap-port",
-            type=int,
-            default=ServerArgs.disaggregation_bootstrap_port,
-            help="Bootstrap server port on the prefill server. Default is 8998.",
-        )
         parser.add_argument(
             "--disaggregation-transfer-backend",
             type=str,
@@ -1469,6 +1477,12 @@ class ServerArgs:
             choices=["mooncake", "nixl"],
             help="The backend for disaggregation transfer. Default is mooncake.",
         )
+        parser.add_argument(
+            "--disaggregation-bootstrap-port",
+            type=int,
+            default=ServerArgs.disaggregation_bootstrap_port,
+            help="Bootstrap server port on the prefill server. Default is 8998.",
+        )
         parser.add_argument(
             "--disaggregation-ib-device",
             type=str,
@@ -1477,6 +1491,12 @@ class ServerArgs:
             "or multiple comma-separated devices (e.g., --disaggregation-ib-device mlx5_0,mlx5_1). "
             "Default is None, which triggers automatic device detection when mooncake backend is enabled.",
         )
+        parser.add_argument(
+            "--num-reserved-decode-tokens",
+            type=int,
+            default=ServerArgs.num_reserved_decode_tokens,
+            help="Number of decode tokens that will have memory reserved when adding new request to the running batch.",
+        )
         parser.add_argument(
             "--pdlb-url",
             type=str,
@@ -1484,14 +1504,6 @@ class ServerArgs:
             help="The URL of the PD disaggregation load balancer. If set, the prefill/decode server will register with the load balancer.",
         )
-        parser.add_argument(
-            "--mm-attention-backend",
-            type=str,
-            choices=["sdpa", "fa3", "triton_attn"],
-            default=ServerArgs.mm_attention_backend,
-            help="Set multimodal attention backend.",
-        )
-

     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
         args.tp_size = args.tensor_parallel_size
......
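Taken together, the server_args.py changes above are mostly regrouping: the expert-parallelism, CUDA-graph, and PD-disaggregation flags now live in their own sections, the deprecated `--enable-flashinfer-mla`/`--enable-flashmla` flags are dropped, and new flags such as `--enable-profile-cuda-graph`, `--disable-overlap-cg-plan`, `--debug-tensor-dump-prefill-only`, and `--num-reserved-decode-tokens` are introduced. The standalone sketch below mirrors a few of these flag definitions with plain argparse just to show how they parse; it is an illustration, not the repository's parser:

```python
import argparse

# Minimal mirror of a few flags grouped above; defaults follow the diff.
parser = argparse.ArgumentParser()
parser.add_argument("--expert-parallel-size", "--ep-size", type=int, default=1)
parser.add_argument("--enable-profile-cuda-graph", action="store_true")
parser.add_argument("--cuda-graph-bs", type=int, nargs="+")
parser.add_argument("--num-reserved-decode-tokens", type=int, default=512)

args = parser.parse_args(
    ["--ep-size", "8", "--enable-profile-cuda-graph", "--cuda-graph-bs", "1", "2", "4"]
)
print(args.expert_parallel_size)       # 8 (the --ep-size alias feeds the same dest)
print(args.enable_profile_cuda_graph)  # True
print(args.cuda_graph_bs)              # [1, 2, 4]
print(args.num_reserved_decode_tokens) # 512
```

Because argparse derives the destination from the first long option, `--ep-size` remains a convenient alias for `--expert-parallel-size` even after the flag moved to the new "Expert parallelism" section.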
@@ -41,6 +41,9 @@ class EAGLEDraftCudaGraphRunner:
         self.tp_size = self.model_runner.tp_size
         self.topk = model_runner.server_args.speculative_eagle_topk
         self.speculative_num_steps = model_runner.server_args.speculative_num_steps
+        self.enable_profile_cuda_graph = (
+            model_runner.server_args.enable_profile_cuda_graph
+        )
         server_args = model_runner.server_args

         # Batch sizes to capture
......
@@ -39,6 +39,9 @@ class EAGLEDraftExtendCudaGraphRunner:
         self.dp_size = model_runner.server_args.dp_size
         self.speculative_num_steps = model_runner.server_args.speculative_num_steps
         self.topk = model_runner.server_args.speculative_eagle_topk
+        self.enable_profile_cuda_graph = (
+            model_runner.server_args.enable_profile_cuda_graph
+        )
         self.capture_bs, self.compile_bs = get_batch_sizes_to_capture(model_runner)
         self.padded_static_len = -1
......
@@ -837,6 +837,7 @@ class CustomCacheManager(FileCacheManager):

 def set_ulimit(target_soft_limit=65535):
+    # number of open files
     resource_type = resource.RLIMIT_NOFILE
     current_soft, current_hard = resource.getrlimit(resource_type)

@@ -846,6 +847,18 @@ def set_ulimit(target_soft_limit=65535):
         except ValueError as e:
             logger.warning(f"Fail to set RLIMIT_NOFILE: {e}")

+    # stack size
+    resource_type = resource.RLIMIT_STACK
+    current_soft, current_hard = resource.getrlimit(resource_type)
+    target_soft_limit_stack_size = 1024 * target_soft_limit
+    if current_soft < target_soft_limit_stack_size:
+        try:
+            resource.setrlimit(
+                resource_type, (target_soft_limit_stack_size, current_hard)
+            )
+        except ValueError as e:
+            logger.warning(f"Fail to set RLIMIT_STACK: {e}")
+

 def add_api_key_middleware(app, api_key: str):
     @app.middleware("http")
......
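The new branch in `set_ulimit` raises the soft stack limit to `1024 * 65535` bytes (roughly 64 MiB) while leaving the hard limit untouched. A hedged standalone sketch of the same getrlimit/setrlimit pattern follows; unlike the diff, it also skips the case where the soft limit is already unlimited, since `resource.RLIM_INFINITY` compares as -1 on Linux and would otherwise be "raised" downward:

```python
import logging
import resource  # POSIX-only: the resource module is unavailable on Windows

logger = logging.getLogger(__name__)


def raise_stack_soft_limit(target_bytes: int = 1024 * 65535) -> None:
    # Same idea as the diff: only raise the soft limit, keep the hard limit.
    soft, hard = resource.getrlimit(resource.RLIMIT_STACK)
    if soft != resource.RLIM_INFINITY and soft < target_bytes:
        try:
            resource.setrlimit(resource.RLIMIT_STACK, (target_bytes, hard))
        except ValueError as e:
            logger.warning(f"Fail to set RLIMIT_STACK: {e}")


raise_stack_soft_limit()
print(resource.getrlimit(resource.RLIMIT_STACK))
```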