@@ -139,7 +139,7 @@ sky status --endpoint 30000 sglang
### Common Notes
- - [FlashInfer](https://github.com/flashinfer-ai/flashinfer) is currently one of the dependencies that must be installed for SGLang. It only supports sm75 and above. If you encounter any FlashInfer-related issues on sm75+ devices (e.g., T4, A10, A100, L4, L40S, H100), consider using Triton's kernel by `--disable-flashinfer --disable-flashinfer-sampling` and raise an issue.
+ - [FlashInfer](https://github.com/flashinfer-ai/flashinfer) is the default attention kernel backend. It only supports sm75 and above. If you encounter any FlashInfer-related issues on sm75+ devices (e.g., T4, A10, A100, L4, L40S, H100), please disable it by adding `--disable-flashinfer --disable-flashinfer-sampling` and open an issue on GitHub.
- If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.
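For reference, the Triton fallback mentioned in the note above is passed on the normal server launch command line. A minimal sketch, assuming the standard `python -m sglang.launch_server` entry point and an example model path (neither is part of this diff):

```bash
# Launch SGLang with FlashInfer disabled; attention and sampling then fall back
# to the Triton/PyTorch kernels. The model path below is only an example.
python -m sglang.launch_server \
  --model-path meta-llama/Meta-Llama-3-8B-Instruct \
  --port 30000 \
  --disable-flashinfer --disable-flashinfer-sampling
```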
...
@@ -92,5 +92,5 @@ sky status --endpoint 30000 sglang
</details>
### Common Notes
- - [FlashInfer](https://github.com/flashinfer-ai/flashinfer) is currently one of the dependencies that must be installed for SGLang. It only supports sm75 and above. If you encounter any FlashInfer-related issues on sm75+ devices (e.g., T4, A10, A100, L4, L40S, H100), consider using Triton's kernel by `--disable-flashinfer --disable-flashinfer-sampling` and raise an issue.
+ - [FlashInfer](https://github.com/flashinfer-ai/flashinfer) is the default attention kernel backend. It only supports sm75 and above. If you encounter any FlashInfer-related issues on sm75+ devices (e.g., T4, A10, A100, L4, L40S, H100), please disable it by adding `--disable-flashinfer --disable-flashinfer-sampling` and open an issue on GitHub.
- If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.
"Not sure why, the tokenizer will add an additional token at the end of the prompt when trust_remote_mode=True"
)
self.trust_remote_code=False
if"gemma-2"inself.model_path.lower():
logger.info("When using sliding window in gemma-2, turn on flashinfer.")
self.attention_backend="flashinfer"
@staticmethod
@staticmethod
defadd_cli_args(parser:argparse.ArgumentParser):
defadd_cli_args(parser:argparse.ArgumentParser):
parser.add_argument(
parser.add_argument(
...
@@ -214,11 +240,6 @@ class ServerArgs:
            action="store_true",
            help="Whether or not to allow for custom models defined on the Hub in their own modeling files.",
        )
-        parser.add_argument(
-            "--is-embedding",
-            action="store_true",
-            help="Whether to use a CausalLM as an embedding model.",
-        )
        parser.add_argument(
            "--context-length",
            type=int,
...
@@ -253,6 +274,11 @@ class ServerArgs:
            default=ServerArgs.chat_template,
            help="The builtin chat template name or the path of the chat template file. This is only used for OpenAI-compatible API server.",
        )
+        parser.add_argument(
+            "--is-embedding",
+            action="store_true",
+            help="Whether to use a CausalLM as an embedding model.",
+        )
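As a usage sketch for the `--is-embedding` flag added above, one would pass it at launch time; the entry point and model path here are assumptions for illustration, not part of this diff:

```bash
# Serve a CausalLM checkpoint as an embedding model via the new flag.
# The model path is only an example of an embedding-capable checkpoint.
python -m sglang.launch_server \
  --model-path Alibaba-NLP/gte-Qwen2-1.5B-instruct \
  --is-embedding \
  --port 30000
```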
        parser.add_argument(
            "--mem-fraction-static",
            type=float,
...
@@ -265,17 +291,12 @@ class ServerArgs:
            default=ServerArgs.max_running_requests,
            help="The maximum number of running requests.",
        )
-        parser.add_argument(
-            "--max-num-reqs",
-            type=int,
-            default=ServerArgs.max_num_reqs,
-            help="The maximum number of requests to serve in the memory pool. If the model has a large context length, you may need to decrease this value to avoid out-of-memory errors.",
-        )
        parser.add_argument(
            "--max-total-tokens",
            type=int,
            default=ServerArgs.max_total_tokens,
help="The maximum number of tokens in the memory pool. If not specified, it will be automatically calculated based on the memory usage fraction. This option is typically used for development and debugging purposes.",
help="The maximum number of tokens in the memory pool. If not specified, it will be automatically calculated based on the memory usage fraction. "
"This option is typically used for development and debugging purposes.",
        )
        parser.add_argument(
            "--chunked-prefill-size",
...
@@ -395,15 +416,29 @@ class ServerArgs:
        )

        # Optimization/debug options
+        parser.add_argument(
+            "--attention-backend",
+            type=str,
+            choices=["flashinfer", "triton"],
+            default=ServerArgs.attention_backend,
+            help="Choose the kernels for attention layers.",
+        )
+        parser.add_argument(
+            "--sampling-backend",
+            type=str,
+            choices=["flashinfer", "pytorch"],
+            default=ServerArgs.sampling_backend,
+            help="Choose the kernels for sampling layers.",
+        )
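As a usage sketch, the new backend flags would be passed at launch in place of the soon-to-be-deprecated disable flags; the entry point and model path are assumptions for illustration, not part of this diff:

```bash
# Select the Triton attention kernels and PyTorch sampling explicitly,
# instead of the deprecated --disable-flashinfer / --disable-flashinfer-sampling.
python -m sglang.launch_server \
  --model-path meta-llama/Meta-Llama-3-8B-Instruct \
  --attention-backend triton \
  --sampling-backend pytorch \
  --port 30000
```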
        parser.add_argument(
            "--disable-flashinfer",
            action="store_true",
help="Disable flashinfer attention kernels.",
help="Disable flashinfer attention kernels. This option will be deprecated in the next release. Please use '--attention-backend triton' instead.",
        )
        parser.add_argument(
            "--disable-flashinfer-sampling",
            action="store_true",
help="Disable flashinfer sampling kernels.",
help="Disable flashinfer sampling kernels. This option will be deprecated in the next release. Please use '--sampling-backend pytorch' instead.",