Unverified commit ee7a66dd, authored by Lucia Fang, committed by GitHub
Browse files

allow disable flashinfer prefill (#25276)


Signed-off-by: Lu Fang <fanglu@fb.com>
parent 431535b5
...@@ -32,6 +32,7 @@ if TYPE_CHECKING: ...@@ -32,6 +32,7 @@ if TYPE_CHECKING:
VLLM_CONFIG_ROOT: str = os.path.expanduser("~/.config/vllm") VLLM_CONFIG_ROOT: str = os.path.expanduser("~/.config/vllm")
VLLM_USAGE_STATS_SERVER: str = "https://stats.vllm.ai" VLLM_USAGE_STATS_SERVER: str = "https://stats.vllm.ai"
VLLM_NO_USAGE_STATS: bool = False VLLM_NO_USAGE_STATS: bool = False
VLLM_DISABLE_FLASHINFER_PREFILL: bool = False
VLLM_DO_NOT_TRACK: bool = False VLLM_DO_NOT_TRACK: bool = False
VLLM_USAGE_SOURCE: str = "" VLLM_USAGE_SOURCE: str = ""
VLLM_CONFIGURE_LOGGING: int = 1 VLLM_CONFIGURE_LOGGING: int = 1
...@@ -479,6 +480,8 @@ environment_variables: dict[str, Callable[[], Any]] = { ...@@ -479,6 +480,8 @@ environment_variables: dict[str, Callable[[], Any]] = {
lambda: os.environ.get("VLLM_USAGE_STATS_SERVER", "https://stats.vllm.ai"), lambda: os.environ.get("VLLM_USAGE_STATS_SERVER", "https://stats.vllm.ai"),
"VLLM_NO_USAGE_STATS": "VLLM_NO_USAGE_STATS":
lambda: os.environ.get("VLLM_NO_USAGE_STATS", "0") == "1", lambda: os.environ.get("VLLM_NO_USAGE_STATS", "0") == "1",
"VLLM_DISABLE_FLASHINFER_PREFILL":
lambda: os.environ.get("VLLM_DISABLE_FLASHINFER_PREFILL", "0") == "1",
"VLLM_DO_NOT_TRACK": "VLLM_DO_NOT_TRACK":
lambda: (os.environ.get("VLLM_DO_NOT_TRACK", None) or os.environ.get( lambda: (os.environ.get("VLLM_DO_NOT_TRACK", None) or os.environ.get(
"DO_NOT_TRACK", None) or "0") == "1", "DO_NOT_TRACK", None) or "0") == "1",
......
...@@ -412,7 +412,8 @@ M = TypeVar("M", bound=MLACommonMetadata) ...@@ -412,7 +412,8 @@ M = TypeVar("M", bound=MLACommonMetadata)
def use_flashinfer_prefill() -> bool: def use_flashinfer_prefill() -> bool:
# For blackwell default to flashinfer prefill if it's available since # For blackwell default to flashinfer prefill if it's available since
# it is faster than FA2. # it is faster than FA2.
return (flashinfer_available and not envs.VLLM_USE_CUDNN_PREFILL return (not envs.VLLM_DISABLE_FLASHINFER_PREFILL and flashinfer_available
and not envs.VLLM_USE_CUDNN_PREFILL
and current_platform.is_device_capability(100)) and current_platform.is_device_capability(100))
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment