[Misc] Allow passing logits_soft_cap for xformers backend (#11252)

Signed-off-by: Isotr0py <2037008807@qq.com>

[Misc] Allow passing logits_soft_cap for xformers backend (#11252)
Signed-off-by: Isotr0py <2037008807@qq.com>
f9ecbb18 · Isotr0py · GitHub · 02222a02 · f9ecbb18
Unverified Commit f9ecbb18 authored Dec 17, 2024 by Isotr0py Committed by GitHub Dec 17, 2024
Show whitespace changes
Inline Side-by-side

Showing with 3 additions and 5 deletions

vllm/attention/backends/xformers.py vllm/attention/backends/xformers.py +3 -5

No files found.
--- a/vllm/attention/backends/xformers.py
+++ b/vllm/attention/backends/xformers.py
@@ -17,9 +17,7 @@ from vllm.attention.backends.utils import (
    is_all_cross_attn_metadata_set, is_all_encoder_attn_metadata_set)
 from vllm.attention.ops.paged_attn import (PagedAttention,
                                           PagedAttentionMetadata)
-from vllm.logger import init_logger
+from vllm.utils import print_warning_once
-logger = init_logger(__name__)
 class XFormersBackend(AttentionBackend):
@@ -386,8 +384,8 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]):
            raise ValueError(
                "XFormers does not support block-sparse attention.")
        if logits_soft_cap is not None:
-            raise ValueError(
-                "XFormers does not support attention logits soft capping.")
+            print_warning_once("XFormers does not support logits soft cap. "
+                               "Outputs may be slightly off.")
        self.num_heads = num_heads
        self.head_size = head_size
        self.scale = float(scale)