help="The fraction of the memory used for static allocation (model weights and KV cache memory pool). Use a smaller value if you see out-of-memory errors.",
help="The fraction of the memory used for static allocation (model weights and KV cache memory pool). Use a smaller value if you see out-of-memory errors.",
)
)
parser.add_argument(
parser.add_argument(
"--max-prefill-num-token",
"--max-prefill-tokens",
type=int,
type=int,
default=ServerArgs.max_prefill_num_token,
default=ServerArgs.max_prefill_tokens,
help="The maximum number of tokens in a prefill batch. The real bound will be the maximum of this value and the model's maximum context length.",
help="The maximum number of tokens in a prefill batch. The real bound will be the maximum of this value and the model's maximum context length.",