Commit 72501097 authored by xuxzh1

update rocm.py

parent d4bccff3
@@ -80,7 +80,6 @@ def paged_attention(
         _PARTITION_SIZE = _PARTITION_SIZE_V1V2
     else:
         _PARTITION_SIZE = _PARTITION_SIZE_CUSTOM
-    _PARTITION_SIZE = 512
     max_num_partitions = (max_s + _PARTITION_SIZE - 1) // _PARTITION_SIZE
     input_lengths = seqlen.input_lengths + seqlen.cache_lengths
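Note on this hunk: the deleted line pinned `_PARTITION_SIZE` to a hard-coded 512 immediately after the branch above had selected either `_PARTITION_SIZE_V1V2` or `_PARTITION_SIZE_CUSTOM`, so the selected value never took effect. Below is a minimal sketch of the partition-count arithmetic once the override is gone; the flag name `use_custom` and all numeric values are illustrative assumptions, not taken from this file.

# Sketch of how max_num_partitions is derived; values below are assumed for illustration.
_PARTITION_SIZE_V1V2 = 512    # assumed stand-in value
_PARTITION_SIZE_CUSTOM = 256  # assumed stand-in value
use_custom = True             # assumed: result of the kernel check above this hunk
max_s = 1000                  # assumed: longest sequence length in the batch

if not use_custom:
    _PARTITION_SIZE = _PARTITION_SIZE_V1V2
else:
    _PARTITION_SIZE = _PARTITION_SIZE_CUSTOM

# Ceiling division: number of KV-cache partitions each sequence is split into.
max_num_partitions = (max_s + _PARTITION_SIZE - 1) // _PARTITION_SIZE
print(max_num_partitions)  # 4 -> a 1000-token sequence spans four 256-token partitions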
@@ -234,7 +233,7 @@ def attention(
             softcap = 0.0
         # We do not need to check window_size_left (not supported) here, so it is already checked ahead of time at model load.
         return flash_attn_2_cuda.varlen_fwd(
             query,
             key,
             value,
@@ -257,7 +256,7 @@ def attention(
             False,
             None,
         )[0]
     elif ENGINE == "triton":
         from .flash_attn_triton import triton_attention
...