flashinfer: reminder to remove contiguous call in the future (#2685)

1b914f37 · Daniël de Kok · GitHub · 41c26237 · 1b914f37
Unverified commit 1b914f37, authored Oct 24, 2024 by Daniël de Kok; committed by GitHub on Oct 24, 2024
Hide whitespace changes
Inline Side-by-side

Showing 1 changed file with 2 additions and 0 deletions

server/text_generation_server/layers/attention/cuda.py server/text_generation_server/layers/attention/cuda.py +2 -0

No files found.
--- a/server/text_generation_server/layers/attention/cuda.py
+++ b/server/text_generation_server/layers/attention/cuda.py
@@ -55,6 +55,7 @@ def paged_attention(
        from text_generation_server.layers.attention.flashinfer import decode_state

        return decode_state.get().forward(
+            # TODO: remove `contiguous` call once https://github.com/flashinfer-ai/flashinfer/pull/553 is merged.
            query.contiguous(),
            paged_kv_cache=(kv_cache.key, kv_cache.value),
            logits_soft_cap=softcap,
@@ -220,6 +221,7 @@ def attention(
            softcap = 0.0

        return prefill_with_paged_kv_state.get().forward(
+            # TODO: remove `contiguous` call once https://github.com/flashinfer-ai/flashinfer/pull/553 is merged.
            query.contiguous(),
            causal=causal,
            paged_kv_cache=(kv_cache.key, kv_cache.value),