Unverified commit 5b6b74e2 authored by Daniël de Kok, committed by GitHub

Improve support for GPUs with capability < 8 (#2575)

* Improve support for GPUs with capability < 8

- For models that cannot use flashinfer, use flash-attn v1 + paged
  attention on GPUs with a compute capability older than 8 (a sketch of
  the resulting backend flag follows the commit metadata below).
- Disable prefix caching when using paged attention.
- When using flash-attn v1, pass the key/value, rather than the
  cache, since v1 cannot use block tables.

* nix: add flash-attn-v1 to the server environment

* Move disabling prefix caching into the block of exceptions

* Capability as `usize`s
parent 0aa66d69
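Every hunk below makes the same substitution: the backend check `SYSTEM != "ipex"` at the attention call sites is replaced by a single `PREFILL_IN_KV_CACHE` flag exported from `text_generation_server.layers.attention`. As a minimal sketch of the idea, assuming a simplified selection rule and a hypothetical `_compute_capability` helper (the real module also covers ipex, ROCm, and flashinfer), the flag answers one question: can this backend read prefill keys/values back out of the paged KV cache?

```python
# Minimal sketch of the PREFILL_IN_KV_CACHE idea; not the actual
# text_generation_server.layers.attention module, which also handles
# ipex, ROCm and flashinfer. The names below are illustrative assumptions.
import torch


def _compute_capability() -> int:
    # Fold (major, minor) into one comparable number, e.g. (7, 5) -> 75.
    major, minor = torch.cuda.get_device_capability()
    return major * 10 + minor


# Hypothetical fallback rule from the commit message: GPUs with a compute
# capability older than 8 use flash-attn v1 + paged attention.
USE_FLASH_ATTN_V1 = torch.cuda.is_available() and _compute_capability() < 80

# flash-attn v1 cannot use block tables, so during prefill it must attend
# over the freshly computed key/value tensors rather than the paged cache.
PREFILL_IN_KV_CACHE = not USE_FLASH_ATTN_V1
```

With the flag in scope, every model's call site reduces to the same one-line ternary, which is exactly the pattern repeated in the diffs that follow.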
@@ -38,6 +38,7 @@ from text_generation_server.layers import (
     SpeculativeHead,
     get_linear,
 )
+from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE
 from text_generation_server.layers.rotary import (
     PositionRotaryEmbedding,
 )
@@ -192,8 +193,8 @@ class FlashGPTJAttention(torch.nn.Module):
             # flash attention
             attn_output = attention(
                 query,
-                kv_cache[0] if SYSTEM != "ipex" else key,
-                kv_cache[1] if SYSTEM != "ipex" else value,
+                kv_cache[0] if PREFILL_IN_KV_CACHE else key,
+                kv_cache[1] if PREFILL_IN_KV_CACHE else value,
                 seqlen,
                 block_tables,
                 self.softmax_scale,
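The GPT-J hunk above is the first instance of the pattern. To make the key/value-versus-cache distinction concrete, here is a small self-contained PyTorch toy, with made-up shapes and a hypothetical `block_table`, showing why a block-table-aware kernel can prefill from `kv_cache` while flash-attn v1 has to be handed the contiguous `key`/`value` tensors directly:

```python
# Toy illustration (plain PyTorch, not the real kernels). All sizes and
# the block table below are made up for demonstration.
import torch

block_size, num_blocks, num_heads, head_dim = 4, 8, 2, 16
seq_len = 6  # prefill length of a single sequence

key = torch.randn(seq_len, num_heads, head_dim)
value = torch.randn(seq_len, num_heads, head_dim)

# Paged KV cache: [num_blocks, block_size, num_heads, head_dim].
key_cache = torch.zeros(num_blocks, block_size, num_heads, head_dim)
value_cache = torch.zeros(num_blocks, block_size, num_heads, head_dim)

# Block table for this sequence: which physical blocks hold its tokens.
block_table = torch.tensor([3, 5])  # ceil(6 / 4) == 2 blocks

# Analogue of reshape_and_cache: scatter each token into its block slot.
for pos in range(seq_len):
    block, offset = block_table[pos // block_size], pos % block_size
    key_cache[block, offset] = key[pos]
    value_cache[block, offset] = value[pos]

# A backend that understands block tables can recover the contiguous
# keys/values straight from the cache during prefill:
gathered_key = key_cache[block_table].reshape(-1, num_heads, head_dim)[:seq_len]
assert torch.equal(gathered_key, key)

# flash-attn v1 has no notion of block tables, so the model code keeps
# `key`/`value` around and hands them to the kernel directly, which is
# what the PREFILL_IN_KV_CACHE == False branches in these diffs do.
```

The remaining hunks apply the identical substitution to the other architectures.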
@@ -27,6 +27,7 @@ import torch.distributed
 from torch import nn
 from transformers.activations import ACT2FN
+from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE
 from text_generation_server.utils.import_utils import SYSTEM
 from text_generation_server.layers.attention import (
     paged_attention,
@@ -220,8 +221,8 @@ class FlashLlamaAttention(torch.nn.Module):
             # flash attention
             attn_output = attention(
                 query,
-                kv_cache[0] if SYSTEM != "ipex" else kv[:, 0],
-                kv_cache[1] if SYSTEM != "ipex" else kv[:, 1],
+                kv_cache[0] if PREFILL_IN_KV_CACHE else kv[:, 0],
+                kv_cache[1] if PREFILL_IN_KV_CACHE else kv[:, 1],
                 seqlen,
                 block_tables,
                 self.softmax_scale,
@@ -41,6 +41,7 @@ from text_generation_server.layers import (
     TensorParallelMultiAdapterLinear,
     TensorParallelAdapterRowLinear,
 )
+from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE
 from text_generation_server.layers.rotary import PositionRotaryEmbedding
 from text_generation_server.layers.layernorm import (
     FastRMSNorm,
@@ -218,8 +219,8 @@ class MistralAttention(torch.nn.Module):
             # flash attention
             attn_output = attention(
                 query,
-                kv_cache[0] if SYSTEM != "ipex" else kv_to_cache[:, 0],
-                kv_cache[1] if SYSTEM != "ipex" else kv_to_cache[:, 1],
+                kv_cache[0] if PREFILL_IN_KV_CACHE else kv_to_cache[:, 0],
+                kv_cache[1] if PREFILL_IN_KV_CACHE else kv_to_cache[:, 1],
                 seqlen,
                 block_tables,
                 self.softmax_scale,
@@ -39,10 +39,10 @@ from text_generation_server.layers.attention import (
     paged_attention,
     reshape_and_cache,
 )
+from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE
 from text_generation_server.layers.layernorm import FastRMSNorm
 from text_generation_server.layers.moe import DenseMoELayer, MoELayer, SparseMoELayer
 from text_generation_server.layers.rotary import PositionRotaryEmbedding
-from text_generation_server.utils.import_utils import SYSTEM
 from text_generation_server.utils.weights import UnquantizedWeight
@@ -267,8 +267,8 @@ class MixtralAttention(torch.nn.Module):
             # flash attention
             attn_output = attention(
                 query,
-                kv_cache[0] if SYSTEM != "ipex" else kv_to_cache[:, 0],
-                kv_cache[1] if SYSTEM != "ipex" else kv_to_cache[:, 1],
+                kv_cache[0] if PREFILL_IN_KV_CACHE else kv_to_cache[:, 0],
+                kv_cache[1] if PREFILL_IN_KV_CACHE else kv_to_cache[:, 1],
                 seqlen,
                 block_tables,
                 self.softmax_scale,
@@ -26,7 +26,6 @@ from transformers.activations import ACT2FN
 from transformers.modeling_utils import PreTrainedModel
 from transformers.models.gpt_neox import GPTNeoXConfig as TransformersGPTNeoXConfig
 from typing import Optional, List, Tuple
-from text_generation_server.utils.import_utils import SYSTEM
 from text_generation_server.layers.attention import (
     paged_attention,
     attention,
@@ -40,6 +39,7 @@ from text_generation_server.layers import (
     SpeculativeHead,
     get_linear,
 )
+from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE
 from text_generation_server.layers.layernorm import (
     FastLayerNorm,
 )
@@ -172,8 +172,8 @@ class FlashNeoxAttention(torch.nn.Module):
             # flash attention
             attn_output = attention(
                 qkv[:, 0],
-                kv_cache[0] if SYSTEM != "ipex" else qkv[:, 1],
-                kv_cache[1] if SYSTEM != "ipex" else qkv[:, 2],
+                kv_cache[0] if PREFILL_IN_KV_CACHE else qkv[:, 1],
+                kv_cache[1] if PREFILL_IN_KV_CACHE else qkv[:, 2],
                 seqlen,
                 block_tables,
                 self.softmax_scale,
@@ -19,13 +19,13 @@ from text_generation_server.layers import (
     SpeculativeHead,
     get_linear,
 )
+from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE
 from text_generation_server.layers.layernorm import (
     FastLayerNorm,
 )
 from text_generation_server.layers.rotary import (
     PositionRotaryEmbedding,
 )
-from text_generation_server.utils.import_utils import SYSTEM
 class PhiConfig(PretrainedConfig):
@@ -194,8 +194,8 @@ class FlashPhiAttention(torch.nn.Module):
         if cu_seqlen_prefill is not None:
             attn_output = attention(
                 query,
-                kv_cache[0] if SYSTEM != "ipex" else kv[:, 0],
-                kv_cache[1] if SYSTEM != "ipex" else kv[:, 1],
+                kv_cache[0] if PREFILL_IN_KV_CACHE else kv[:, 0],
+                kv_cache[1] if PREFILL_IN_KV_CACHE else kv[:, 1],
                 seqlen,
                 block_tables,
                 self.softmax_scale,
@@ -17,11 +17,11 @@ from text_generation_server.layers import (
     TensorParallelEmbedding,
     SpeculativeHead,
 )
+from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE
 from text_generation_server.layers.rotary import PositionRotaryEmbedding
 from text_generation_server.layers.layernorm import (
     FastRMSNorm,
 )
-from text_generation_server.utils.import_utils import SYSTEM
 def load_attention(config, prefix, weights):
@@ -137,8 +137,8 @@ class Qwen2Attention(torch.nn.Module):
             # flash attention
             attn_output = attention(
                 query,
-                kv_cache[0] if SYSTEM != "ipex" else kv_to_cache[:, 0],
-                kv_cache[1] if SYSTEM != "ipex" else kv_to_cache[:, 1],
+                kv_cache[0] if PREFILL_IN_KV_CACHE else kv_to_cache[:, 0],
+                kv_cache[1] if PREFILL_IN_KV_CACHE else kv_to_cache[:, 1],
                 seqlen,
                 block_tables,
                 self.softmax_scale,
@@ -5,7 +5,6 @@ import torch.distributed
 from torch import nn
 from transformers.configuration_utils import PretrainedConfig
 from transformers.modeling_utils import PreTrainedModel
-from text_generation_server.utils.import_utils import SYSTEM
 from text_generation_server.layers import (
     SpeculativeHead,
     TensorParallelColumnLinear,
@@ -13,6 +12,7 @@ from text_generation_server.layers import (
     TensorParallelRowLinear,
     get_linear,
 )
+from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE
 from text_generation_server.layers.layernorm import FastLayerNorm
 from text_generation_server.layers.rotary import PositionRotaryEmbedding
 from text_generation_server.layers.attention import (
@@ -207,8 +207,8 @@ class FlashRWAttention(torch.nn.Module):
             # flash attention
             attn_output = attention(
                 query,
-                kv_cache[0] if SYSTEM != "ipex" else kv[:, 0],
-                kv_cache[1] if SYSTEM != "ipex" else kv[:, 1],
+                kv_cache[0] if PREFILL_IN_KV_CACHE else kv[:, 0],
+                kv_cache[1] if PREFILL_IN_KV_CACHE else kv[:, 1],
                 seqlen,
                 block_tables,
                 self.softmax_scale,
@@ -325,8 +325,8 @@ class FlashRWLargeAttention(torch.nn.Module):
             # flash attention
             attn_output = attention(
                 query,
-                kv_cache[0] if SYSTEM != "ipex" else kv[:, :, 0].contiguous(),
-                kv_cache[1] if SYSTEM != "ipex" else kv[:, :, 1].contiguous(),
+                kv_cache[0] if PREFILL_IN_KV_CACHE else kv[:, :, 0].contiguous(),
+                kv_cache[1] if PREFILL_IN_KV_CACHE else kv[:, :, 1].contiguous(),
                 seqlen,
                 block_tables,
                 self.softmax_scale,
@@ -18,11 +18,11 @@ from text_generation_server.layers import (
     TensorParallelEmbedding,
     get_linear,
 )
+from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE
 from text_generation_server.layers.gptq import GPTQWeightsLoader
 from text_generation_server.layers.layernorm import (
     FastLayerNorm,
 )
-from text_generation_server.utils.import_utils import SYSTEM
 def load_multi_mqa(
@@ -293,8 +293,8 @@ class FlashMQAttention(torch.nn.Module):
             # flash attention
             attn_output = attention(
                 query,
-                kv_cache[0] if SYSTEM != "ipex" else key_value[:, 0],
-                kv_cache[1] if SYSTEM != "ipex" else key_value[:, 1],
+                kv_cache[0] if PREFILL_IN_KV_CACHE else key_value[:, 0],
+                kv_cache[1] if PREFILL_IN_KV_CACHE else key_value[:, 1],
                 seqlen,
                 block_tables,
                 self.softmax_scale,
@@ -39,6 +39,7 @@ from text_generation_server.layers import (
     SpeculativeHead,
     get_linear,
 )
+from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE
 from text_generation_server.layers.layernorm import (
     FastLayerNorm,
     FastRMSNorm,
@@ -47,7 +48,6 @@ from text_generation_server.layers.rotary import (
     PositionRotaryEmbedding,
 )
 from text_generation_server.utils.weights import UnquantizedWeight
-from text_generation_server.utils.import_utils import SYSTEM
 class Starcoder2Config(PretrainedConfig):
@@ -242,8 +242,8 @@ class Starcoder2Attention(torch.nn.Module):
             # flash attention
             attn_output = attention(
                 query,
-                kv_cache[0] if SYSTEM != "ipex" else kv_to_cache[:, 0],
-                kv_cache[1] if SYSTEM != "ipex" else kv_to_cache[:, 1],
+                kv_cache[0] if PREFILL_IN_KV_CACHE else kv_to_cache[:, 0],
+                kv_cache[1] if PREFILL_IN_KV_CACHE else kv_to_cache[:, 1],
                 seqlen,
                 block_tables,
                 self.softmax_scale,