Commit 66a7ebd8 authored by zhuwenwen
Browse files

skip arch reason

parent 49be5e62
......@@ -38,24 +38,24 @@ _ROCM_UNSUPPORTED_MODELS: List[str] = []
# Models partially supported by ROCm.
# Architecture -> Reason.
# Reason text shared by the architectures that are limited by sliding window
# attention (SWA) under Triton flash attention. The message itself names
# `VLLM_USE_TRITON_FLASH_ATTN=0` as the setting that selects CK flash attention.
_ROCM_SWA_REASON = ("Sliding window attention (SWA) is not yet supported in "
"Triton flash attention. For half-precision SWA support, "
"please use CK flash attention by setting "
"`VLLM_USE_TRITON_FLASH_ATTN=0`")
# NOTE(review): the commented-out copy below repeats the definition above
# word for word; presumably one of the two is the removed side of a diff.
# Confirm which version is current before relying on either.
# _ROCM_SWA_REASON = ("Sliding window attention (SWA) is not yet supported in "
# "Triton flash attention. For half-precision SWA support, "
# "please use CK flash attention by setting "
# "`VLLM_USE_TRITON_FLASH_ATTN=0`")
# Maps a model architecture name to a human-readable explanation of why that
# architecture is only partially supported on ROCm.
# NOTE(review): the Qwen2/Mistral/Mixtral and Phi3V entries appear both live
# and commented out in this text; this looks like the two sides of a diff shown
# together. Confirm which set of entries is meant to be active.
_ROCM_PARTIALLY_SUPPORTED_MODELS: Dict[str, str] = {
# These three architectures share the SWA reason string.
"Qwen2ForCausalLM":
_ROCM_SWA_REASON,
"MistralForCausalLM":
_ROCM_SWA_REASON,
"MixtralForCausalLM":
_ROCM_SWA_REASON,
# "Qwen2ForCausalLM":
# _ROCM_SWA_REASON,
# "MistralForCausalLM":
# _ROCM_SWA_REASON,
# "MixtralForCausalLM":
# _ROCM_SWA_REASON,
# PaliGemma: the limitation described is 32-bit precision in ROCm flash attention.
"PaliGemmaForConditionalGeneration":
("ROCm flash attention does not yet "
"fully support 32-bit precision on PaliGemma"),
# Phi3V: the message describes possible Triton compilation errors and the
# `VLLM_USE_TRITON_FLASH_ATTN=0` workaround.
"Phi3VForCausalLM":
("ROCm Triton flash attention may run into compilation errors due to "
"excessive use of shared memory. If this happens, disable Triton FA "
"by setting `VLLM_USE_TRITON_FLASH_ATTN=0`")
# "Phi3VForCausalLM":
# ("ROCm Triton flash attention may run into compilation errors due to "
# "excessive use of shared memory. If this happens, disable Triton FA "
# "by setting `VLLM_USE_TRITON_FLASH_ATTN=0`")
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment