help="TP size for MoE dense MLP layers. This flag is useful when, with large TP size, there are errors caused by weights in MLP layers having dimension smaller than the min dimension GEMM supports.",
)
# Mamba Cache
# Register the --max-mamba-cache-size integer option; when the flag is
# omitted, the value falls back to ServerArgs.max_mamba_cache_size.
parser.add_argument(
    "--max-mamba-cache-size",
    type=int,
    default=ServerArgs.max_mamba_cache_size,
    help="The maximum size of the mamba cache.",
)
parser.add_argument(
"--mamba-ssm-dtype",
type=str,
default=ServerArgs.mamba_ssm_dtype,
choices=["float32","bfloat16"],
help="It is used to tune mamba ssm dtype",
help="The data type of the SSM states in mamba cache.",