[Bugfix] Fix try-catch conditions to import correct Flash Attention Backend in Draft Model (#9101)

23fea871 · TJian · GitHub · f4dd830e · 23fea871
Unverified Commit 23fea871 authored Oct 05, 2024 by TJian Committed by GitHub Oct 06, 2024
Show whitespace changes
Inline Side-by-side

Showing with 10 additions and 5 deletions

vllm/spec_decode/draft_model_runner.py vllm/spec_decode/draft_model_runner.py +10 -5

No files found.
--- a/vllm/spec_decode/draft_model_runner.py
+++ b/vllm/spec_decode/draft_model_runner.py
@@ -6,11 +6,16 @@ from vllm.forward_context import set_forward_context
 from vllm.model_executor.layers.sampler import SamplerOutput
 try:
+    try:
        from vllm.attention.backends.flash_attn import FlashAttentionMetadata
-except ModuleNotFoundError:
+    except (ModuleNotFoundError, ImportError):
-    # vllm_flash_attn is not installed, use the identical ROCm FA metadata
+        # vllm_flash_attn is not installed, try the ROCm FA metadata
        from vllm.attention.backends.rocm_flash_attn import (
            ROCmFlashAttentionMetadata as FlashAttentionMetadata)
+except (ModuleNotFoundError, ImportError) as err:
+    raise RuntimeError(
+        "Draft model speculative decoding currently only supports "
+        "CUDA and ROCm flash attention backend.") from err
 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
                         ModelConfig, ObservabilityConfig, ParallelConfig,