Commit 25e16eea authored by zhuwenwen
Browse files

Change the defaults of VLLM_USE_FLASH_ATTN_FP8 and VLLM_USE_FLASH_MLA_FP8 from 0 to 1

parent a50ece3a
...@@ -1069,7 +1069,7 @@ environment_variables: dict[str, Callable[[], Any]] = { ...@@ -1069,7 +1069,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
# If set, vLLM will use FLASH ATTN fp8 attention optimizations. # If set, vLLM will use FLASH ATTN fp8 attention optimizations.
"VLLM_USE_FLASH_ATTN_FP8": "VLLM_USE_FLASH_ATTN_FP8":
lambda: bool(int(os.getenv("VLLM_USE_FLASH_ATTN_FP8", "0"))), lambda: bool(int(os.getenv("VLLM_USE_FLASH_ATTN_FP8", "1"))),
# If set, vLLM will use FLASH MLA attention optimizations. # If set, vLLM will use FLASH MLA attention optimizations.
"VLLM_USE_FLASH_MLA": "VLLM_USE_FLASH_MLA":
...@@ -1077,7 +1077,7 @@ environment_variables: dict[str, Callable[[], Any]] = { ...@@ -1077,7 +1077,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
# If set, vLLM will use FLASH MLA fp8 attention optimizations. # If set, vLLM will use FLASH MLA fp8 attention optimizations.
"VLLM_USE_FLASH_MLA_FP8": "VLLM_USE_FLASH_MLA_FP8":
lambda: bool(int(os.getenv("VLLM_USE_FLASH_MLA_FP8", "0"))), lambda: bool(int(os.getenv("VLLM_USE_FLASH_MLA_FP8", "1"))),
# flag to control vllm to use optimized kernels # flag to control vllm to use optimized kernels
"VLLM_USE_OPT_OP": "VLLM_USE_OPT_OP":
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment