Commit 25e16eea authored by zhuwenwen
Browse files

set VLLM_USE_FLASH_ATTN_FP8=1 and VLLM_USE_FLASH_MLA_FP8=1

parent a50ece3a
......@@ -1069,7 +1069,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
# If set, vLLM will use FLASH ATTN fp8 attention optimizations.
"VLLM_USE_FLASH_ATTN_FP8":
-lambda: bool(int(os.getenv("VLLM_USE_FLASH_ATTN_FP8", "0"))),
+lambda: bool(int(os.getenv("VLLM_USE_FLASH_ATTN_FP8", "1"))),
# If set, vLLM will use FLASH MLA attention optimizations.
"VLLM_USE_FLASH_MLA":
......@@ -1077,7 +1077,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
# If set, vLLM will use FLASH MLA fp8 attention optimizations.
"VLLM_USE_FLASH_MLA_FP8":
-lambda: bool(int(os.getenv("VLLM_USE_FLASH_MLA_FP8", "0"))),
+lambda: bool(int(os.getenv("VLLM_USE_FLASH_MLA_FP8", "1"))),
# flag to control vllm to use optimized kernels
"VLLM_USE_OPT_OP":
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment