Orchestrates the handling of various server arguments, ensuring proper configuration and validation.
"""
# Handle deprecated arguments.
self._handle_deprecated_args()
# Set missing default values.
self._handle_missing_default_values()
# Get GPU memory capacity, which is a common dependency for several configuration steps.
gpu_mem = get_device_memory_capacity(self.device)
# Handle memory-related, chunked prefill, and CUDA graph batch size configurations.
self._handle_gpu_memory_settings(gpu_mem)
# Handle device-specific backends.
self._handle_hpu_backends()
self._handle_cpu_backends()
# Apply model-specific adjustments.
self._handle_model_specific_adjustments()
# Set kernel backends.
self._handle_sampling_backend()
self._handle_attention_backend_compatibility()
self._handle_page_size()
self._handle_amd_specifics()
self._handle_grammar_backend()
# Handle data parallelism.
self._handle_data_parallelism()
# Handle MoE configurations.
self._handle_moe_kernel_config()
self._handle_deepep_moe()
self._handle_eplb_and_dispatch()
self._handle_expert_distribution_metrics()
# Handle pipeline parallelism.
self._handle_pipeline_parallelism()
# Handle Hicache settings.
self._handle_hicache()
# Handle speculative decoding logic.
self._handle_speculative_decoding()
# Handle model loading format.
self._handle_load_format()
# Handle PD disaggregation.
self._handle_disaggregation()
# Validate tokenizer settings.
self._handle_tokenizer_batching()
# Propagate environment variables.
self._handle_environment_variables()
# Validate cache settings.
self._handle_cache_compatibility()
# Validate metrics labels.
self._handle_metrics_labels()
# Handle deterministic inference.
self._handle_deterministic_inference()
# Handle any other necessary validations.
self._handle_other_validations()

# Mamba cache
max_mamba_cache_size: Optional[int] = None
mamba_ssm_dtype: str = "float32"

# For deterministic inference
enable_deterministic_inference: bool = False

# NSA attention backend
nsa_prefill: str = NSA_DEFAULT_PREFILL
nsa_decode: str = NSA_DEFAULT_DECODE

# Deprecated arguments
enable_ep_moe: bool = False
enable_deepep_moe: bool = False
enable_flashinfer_cutlass_moe: bool = False
enable_flashinfer_cutedsl_moe: bool = False
enable_flashinfer_trtllm_moe: bool = False
enable_triton_kernel_moe: bool = False
enable_flashinfer_mxfp4_moe: bool = False
def _handle_deprecated_args(self):
    if self.enable_ep_moe:
        self.ep_size = self.tp_size
        print_deprecated_warning(
            "NOTE: --enable-ep-moe is deprecated. Please set `--ep-size` to the same value as `--tp-size` instead."
        )
    if self.enable_deepep_moe:
        self.moe_a2a_backend = "deepep"
        print_deprecated_warning(
            "NOTE: --enable-deepep-moe is deprecated. Please set `--moe-a2a-backend` to 'deepep' instead."
        )
    if self.enable_triton_kernel_moe:
        self.moe_runner_backend = "triton_kernel"
        print_deprecated_warning(
            "NOTE: --enable-triton-kernel-moe is deprecated. Please set `--moe-runner-backend` to 'triton_kernel' instead."
        )
    if self.enable_flashinfer_cutedsl_moe:
        self.moe_runner_backend = "flashinfer_cutedsl"
        print_deprecated_warning(
            "NOTE: --enable-flashinfer-cutedsl-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_cutedsl' instead."
        )
    if self.enable_flashinfer_cutlass_moe:
        self.moe_runner_backend = "flashinfer_cutlass"
        print_deprecated_warning(
            "NOTE: --enable-flashinfer-cutlass-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_cutlass' instead."
        )
    if self.enable_flashinfer_trtllm_moe:
        self.moe_runner_backend = "flashinfer_trtllm"
        print_deprecated_warning(
            "NOTE: --enable-flashinfer-trtllm-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_trtllm' instead."
        )
    if self.enable_flashinfer_mxfp4_moe:
        self.moe_runner_backend = "flashinfer_mxfp4"
        print_deprecated_warning(
            "NOTE: --enable-flashinfer-mxfp4-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_mxfp4' instead."
        )
def _handle_missing_default_values(self):
    if self.tokenizer_path is None:
...
...
if self.random_seed is None:
    self.random_seed = random.randint(0, 1 << 30)
def _handle_gpu_memory_settings(self, gpu_mem):
    """
    Configure GPU memory-dependent settings, including
    chunked_prefill_size, cuda_graph_max_bs, and mem_fraction_static.

    Here are our heuristics:
    - Set chunked_prefill_size and cuda_graph_max_bs based on the GPU memory capacity.
      Because GPUs with more memory are generally more powerful, we use a larger
      chunked_prefill_size and a larger cuda_graph_max_bs to fully utilize the GPU.
    - Then set mem_fraction_static based on chunked_prefill_size and cuda_graph_max_bs.
      GPU memory capacity = model weights + KV cache pool + activations + cuda graph buffers.
      The argument mem_fraction_static is defined as (model weights + KV cache pool) / GPU memory capacity,
      or equivalently, mem_fraction_static = (GPU memory capacity - activations - cuda graph buffers) / GPU memory capacity.
      To compute mem_fraction_static, we need to estimate the size of activations and cuda graph buffers:
      the activation memory is proportional to chunked_prefill_size, and the cuda graph memory is
      proportional to cuda_graph_max_bs. We therefore use
          reserved_mem = chunked_prefill_size * 1.5 + cuda_graph_max_bs * 2
      to estimate the size of activations and cuda graph buffers, and set
          mem_fraction_static = (GPU memory capacity - reserved_mem) / GPU memory capacity.
      The coefficient 1.5 is a heuristic value; in the future, we can do a better estimation by
      looking at the model type and hidden sizes, or even by doing a dummy run.
    """
def _handle_mem_fraction_static(self, gpu_mem):
    if self.mem_fraction_static is None:
        if gpu_mem is not None:
            # GPU memory capacity = model weights + KV cache pool + activations + cuda graph buffers.
            # Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with
            # HBM < 35GB, you can either disable cuda graph or set `cuda_graph_max_bs` to a very
            # small value to reduce the memory overhead of creating cuda graphs, with almost no
            # impact on performance. However, when serving models with TP4 or TP8, we need to
            # enable cuda graph to maintain high performance. In this case, we can set
            # `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory
            # overhead of creating cuda graphs. Looking at the logs from TP4 serving of qwen2-72b,
            # a value of 80 is sufficient and can reduce the memory overhead of creating cuda
            # graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
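            # A minimal sketch of the policy described above (illustrative only; the HBM threshold,
            # its unit, and the "very small" TP1/TP2 value are assumptions, while 80 and the
            # default 160 come from the comment above):
            #
            #     if gpu_mem < 35 * 1024:  # assuming gpu_mem is reported in MB
            #         if self.tp_size < 4:
            #             self.cuda_graph_max_bs = 8   # very small value; or disable cuda graph
            #         else:
            #             self.cuda_graph_max_bs = 80  # half of the default 160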
"At this moment Ascend attention backend only supports a page_size of 128, change page_size to 128."
)
...
...
def _handle_hicache(self):
    if self.hicache_storage_backend == "mooncake":
        if self.hicache_mem_layout == "layer_first":
            if self.hicache_io_backend == "direct":
                self.hicache_mem_layout = "page_first_direct"
            elif self.hicache_io_backend == "kernel":
                self.hicache_mem_layout = "page_first"
            logger.warning(
                f"Mooncake storage backend does not support layer_first layout, "
                f"switching to {self.hicache_mem_layout} layout for {self.hicache_io_backend} io backend"
            )
    if self.hicache_mem_layout == "page_first_direct":
        if self.hicache_io_backend != "direct":
...
...
model_arch = self.get_hf_config().architectures[0]
if model_arch in [
"DeepseekV32ForCausalLM",
"DeepseekV3ForCausalLM",
"Glm4MoeForCausalLM",
"BailingMoeForCausalLM",
...
...
"speculative_eagle_topk > 1 with page_size > 1 is unstable and produces incorrect results for paged attention backends. This combination is only supported for the 'flashinfer' backend."
)
if self.speculative_algorithm == "NGRAM":
    if not self.device.startswith("cuda"):
        raise ValueError(
            "Ngram speculative decoding only supports CUDA device."
        )
    logger.warning(
        "The overlap scheduler and mixed chunked prefill are disabled because of "
        "using ngram speculative decoding."
    )
    if (
...
...
"speculative_eagle_topk > 1 with page_size > 1 is unstable and produces incorrect results for paged attention backends. This combination is only supported for the 'flashinfer' backend."
)
if self.enable_dp_attention:
    # TODO: support dp attention for ngram speculative decoding
    raise ValueError(
        "Currently ngram speculative decoding does not support dp attention."
    )
def _handle_load_format(self):
...
...
"and cannot be used at the same time. Please use only one of them."
)
if (
    self.disaggregation_decode_enable_offload_kvcache
    and self.disaggregation_mode != "decode"
):
    raise ValueError(
        "The argument disaggregation-decode-enable-offload-kvcache is only supported for the decode side."
    )
def _handle_metrics_labels(self):
    if (
        not self.tokenizer_metrics_custom_labels_header
        and self.tokenizer_metrics_allowed_custom_labels
    ):
        raise ValueError(
            "Please set --tokenizer-metrics-custom-labels-header when setting --tokenizer-metrics-allowed-custom-labels."
        )
def _handle_deterministic_inference(self):
    if self.enable_deterministic_inference:
        # Check attention backend.
        if self.attention_backend not in DETERMINISTIC_ATTENTION_BACKEND_CHOICES:
            raise ValueError(
                f"Currently only {DETERMINISTIC_ATTENTION_BACKEND_CHOICES} attention backends are supported for deterministic inference."
            )
        # Check that batch_invariant_ops is available (a plain import check is assumed here;
        # the original availability check is elided in this excerpt).
        try:
            import batch_invariant_ops  # noqa: F401
        except ImportError:
            raise ImportError(
                "batch_invariant_ops is not installed. Please install it from https://github.com/thinking-machines-lab/batch_invariant_ops/."
            )
        # Check sampling backend.
        self.sampling_backend = "pytorch"
        logger.warning(
            "Sampling backend is set to pytorch for deterministic inference."
        )
        # Currently, only FA3 supports radix cache. Support for other backends is in progress.
        if self.attention_backend != "fa3":
            self.disable_radix_cache = True
            logger.warning(
                f"Currently radix cache is not compatible with {self.attention_backend} attention backend for deterministic inference. It will be supported in the future."
            )
        # Check TP size.
        if self.tp_size > 1:
            os.environ["NCCL_ALGO"] = "allreduce:tree"
            self.disable_custom_all_reduce = True
            logger.warning(
                "NCCL_ALGO is set to 'allreduce:tree' and custom all reduce is disabled for deterministic inference when TP size > 1."