Refactor dead code - Removing all `flash_xxx.py` files. (#2166)

* Refactor dead code. * First working step. * Remove a lot of duplicated code. * More dead code. * More cleanup. * Fix Santacoder test. * Fixing the simple tests. * Fixing sharding. * Fixes for VLM. * Fixing santacoder (num_kv_heads hardcoded). * Removing more dead code. * Fixing `config.n_head`. * Stopping earlier because of `<end_of_utterance>` in idefics2. * Addresses comments. * Removing the dead code. * Fuse back mistral into FlashCausalLM. * Finish removal. * Fixing docs + causal_lm `batch_class`. * Fixing docs + causal.lm. * Add default to Gemma Causality. * Default value for gemma/gemma2. * Wrong default.

Refactor dead code - Removing all `flash_xxx.py` files. (#2166)
* Refactor dead code. * First working step. * Remove a lot of duplicated code. * More dead code. * More cleanup. * Fix Santacoder test. * Fixing the simple tests. * Fixing sharding. * Fixes for VLM. * Fixing santacoder (num_kv_heads hardcoded). * Removing more dead code. * Fixing `config.n_head`. * Stopping earlier because of `<end_of_utterance>` in idefics2. * Addresses comments. * Removing the dead code. * Fuse back mistral into FlashCausalLM. * Finish removal. * Fixing docs + causal_lm `batch_class`. * Fixing docs + causal.lm. * Add default to Gemma Causality. * Default value for gemma/gemma2. * Wrong default.
fb2f74e2 · Nicolas Patry · GitHub · c6bcadf8 · fb2f74e2 · fb2f74e2
Unverified Commit fb2f74e2 authored Jul 05, 2024 by Nicolas Patry Committed by GitHub Jul 05, 2024
20 changed files
--- a/docs/openapi.json
+++ b/docs/openapi.json
@@ -10,7 +10,7 @@
      "name": "Apache 2.0",
      "url": "https://www.apache.org/licenses/LICENSE-2.0"
    },
-    "version": "2.1.1-dev0"
+    "version": "2.1.2-dev0"
  },
  "paths": {
    "/": {

--- a/docs/source/supported_models.md
+++ b/docs/source/supported_models.md
@@ -10,6 +10,7 @@ Text Generation Inference enables serving optimized models on specific hardware
 - [Llama](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct)
 - [Phi 3](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct)
 - [Gemma](https://huggingface.co/google/gemma-7b)
+- [PaliGemma](https://huggingface.co/google/paligemma-3b-pt-224)
 - [Gemma2](https://huggingface.co/google/gemma2-9b)
 - [Cohere](https://huggingface.co/CohereForAI/c4ai-command-r-plus)
 - [Dbrx](https://huggingface.co/databricks/dbrx-instruct)

--- a/integration-tests/models/__snapshots__/test_idefics2/test_flash_idefics2_two_images.json
+++ b/integration-tests/models/__snapshots__/test_idefics2/test_flash_idefics2_two_images.json
 {
  "details": {
    "best_of_sequences": null,
-    "finish_reason": "length",
+    "finish_reason": "eos_token",
-    "generated_tokens": 20,
+    "generated_tokens": 19,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 415,
-        "logprob": -0.039886475,
+        "logprob": -0.03665161,
        "special": false,
        "text": " The"
      },
      {
        "id": 12072,
-        "logprob": -0.1430664,
+        "logprob": -0.13549805,
        "special": false,
        "text": " cow"
      },
      {
        "id": 349,
-        "logprob": -0.056488037,
+        "logprob": -0.05819702,
        "special": false,
        "text": " is"
      },
      {
        "id": 6328,
-        "logprob": -0.6855469,
+        "logprob": -0.6826172,
        "special": false,
        "text": " standing"
      },
      {
        "id": 356,
-        "logprob": -0.1685791,
+        "logprob": -0.1607666,
        "special": false,
        "text": " on"
      },
      {
        "id": 272,
-        "logprob": -0.50097656,
+        "logprob": -0.5073242,
        "special": false,
        "text": " the"
      },
      {
        "id": 10305,
-        "logprob": -0.017303467,
+        "logprob": -0.016418457,
        "special": false,
        "text": " beach"
      },
      {
        "id": 304,
-        "logprob": -1.3564453,
+        "logprob": -1.3916016,
        "special": false,
        "text": " and"
      },
      {
        "id": 272,
-        "logprob": -0.017868042,
+        "logprob": -0.020217896,
        "special": false,
        "text": " the"
      },
      {
        "id": 13088,
-        "logprob": -0.0027103424,
+        "logprob": -0.0028133392,
        "special": false,
        "text": " chicken"
      },
      {
        "id": 349,
-        "logprob": -0.003156662,
+        "logprob": -0.003145218,
        "special": false,
        "text": " is"
      },
      {
        "id": 6398,
-        "logprob": -0.37304688,
+        "logprob": -0.37060547,
        "special": false,
        "text": " sitting"
      },
      {
        "id": 356,
-        "logprob": -0.034576416,
+        "logprob": -0.034851074,
        "special": false,
        "text": " on"
      },
      {
        "id": 264,
-        "logprob": -0.29418945,
+        "logprob": -0.2878418,
        "special": false,
        "text": " a"
      },
      {
        "id": 17972,
-        "logprob": -0.042877197,
+        "logprob": -0.046051025,
        "special": false,
        "text": " pile"
      },
      {
        "id": 302,
-        "logprob": -0.00028443336,
+        "logprob": -0.00028848648,
        "special": false,
        "text": " of"
      },
      {
        "id": 2445,
-        "logprob": -0.023223877,
+        "logprob": -0.025772095,
        "special": false,
        "text": " money"
      },
      {
        "id": 28723,
-        "logprob": -0.018157959,
+        "logprob": -0.018127441,
        "special": false,
        "text": "."
      },
      {
        "id": 32002,
-        "logprob": -0.00018393993,
+        "logprob": -0.00019824505,
        "special": true,
        "text": "<end_of_utterance>"
-      },
-      {
-        "id": 2,
-        "logprob": -1.1920929e-07,
-        "special": true,
-        "text": "</s>"
      }
    ],
    "top_tokens": null

--- a/integration-tests/models/test_idefics2.py
+++ b/integration-tests/models/test_idefics2.py
@@ -57,7 +57,7 @@ async def test_flash_idefics2_two_images(flash_idefics2_next, response_snapshot)
        response.generated_text
        == " The cow is standing on the beach and the chicken is sitting on a pile of money."
    ), f"{repr(response.generated_text)}"
-    assert response.details.generated_tokens == 20
+    assert response.details.generated_tokens == 19
    assert response == response_snapshot

--- a/server/tests/models/test_bloom.py
+++ b/server/tests/models/test_bloom.py
@@ -8,6 +8,9 @@ from text_generation_server.pb import generate_pb2
 from text_generation_server.models.causal_lm import CausalLMBatch
 from text_generation_server.utils import weight_hub_files, download_weights
 from text_generation_server.models.bloom import BloomCausalLMBatch, BLOOMSharded
+from text_generation_server.models.custom_modeling.bloom_modeling import (
+    BloomForCausalLM,
+)
 @pytest.fixture(scope="session")
@@ -16,7 +19,10 @@ def default_bloom():
    revision = "main"
    filenames = weight_hub_files(model_id, revision, ".safetensors")
    download_weights(filenames, model_id, revision)
-    return BLOOMSharded(model_id)
+    return BLOOMSharded(
+        model_id,
+        model_class=BloomForCausalLM,
+    )
 @pytest.fixture(scope="session")

--- a/server/tests/models/test_causal_lm.py
+++ b/server/tests/models/test_causal_lm.py
@@ -10,7 +10,7 @@ from text_generation_server.models.causal_lm import CausalLM, CausalLMBatch
 @pytest.fixture(scope="session")
 def default_causal_lm():
-    return CausalLM("gpt2")
+    return CausalLM.fallback("gpt2")
 @pytest.fixture(scope="session")

--- a/server/tests/models/test_santacoder.py
+++ b/server/tests/models/test_santacoder.py
 import pytest
 from text_generation_server.pb import generate_pb2
-from text_generation_server.models.causal_lm import CausalLMBatch
+from text_generation_server.models.causal_lm import CausalLMBatch, CausalLM
-from text_generation_server.models.santacoder import SantaCoder
 @pytest.fixture(scope="session")
 def default_santacoder():
-    return SantaCoder("bigcode/santacoder")
+    return CausalLM.fallback(model_id="bigcode/santacoder")
 @pytest.fixture

--- a/server/tests/models/test_seq2seq_lm.py
+++ b/server/tests/models/test_seq2seq_lm.py
@@ -20,7 +20,7 @@ def mt0_small_tokenizer():
 @pytest.fixture(scope="session")
 def default_seq2seq_lm():
-    return Seq2SeqLM("bigscience/mt0-small")
+    return Seq2SeqLM.fallback("bigscience/mt0-small")
 @pytest.fixture

--- a/server/text_generation_server/models/__init__.py
+++ b/server/text_generation_server/models/__init__.py
--- a/server/text_generation_server/models/bloom.py
+++ b/server/text_generation_server/models/bloom.py
@@ -4,22 +4,12 @@ import torch.distributed
 from typing import Optional, Type
 from transformers import (
-    AutoTokenizer,
-    AutoConfig,
    PreTrainedTokenizerBase,
 )
-from text_generation_server.models.custom_modeling.bloom_modeling import (
-    BloomForCausalLM,
-)
 from text_generation_server.models import CausalLM
 from text_generation_server.models.causal_lm import CausalLMBatch
 from text_generation_server.pb import generate_pb2
-from text_generation_server.utils import (
-    initialize_torch_distributed,
-    weight_files,
-    Weights,
-)
 class BloomCausalLMBatch(CausalLMBatch):
@@ -37,69 +27,6 @@ class BloomCausalLMBatch(CausalLMBatch):
 class BLOOMSharded(CausalLM):
-    def __init__(
-        self,
-        model_id: str,
-        revision: Optional[str] = None,
-        quantize: Optional[str] = None,
-        speculator: Optional[str] = None,
-        dtype: Optional[torch.dtype] = None,
-        trust_remote_code: bool = False,
-    ):
-        self.process_group, rank, world_size = initialize_torch_distributed()
-        if torch.cuda.is_available():
-            device = torch.device(f"cuda:{rank}")
-            dtype = torch.float16 if dtype is None else dtype
-        else:
-            device = torch.device("cpu")
-            dtype = torch.float32 if dtype is None else dtype
-        tokenizer = AutoTokenizer.from_pretrained(
-            model_id,
-            revision=revision,
-            padding_side="left",
-            truncation_side="left",
-            trust_remote_code=trust_remote_code,
-        )
-        config = AutoConfig.from_pretrained(
-            model_id,
-            revision=revision,
-            slow_but_exact=False,
-            tp_parallel=True,
-            trust_remote_code=trust_remote_code,
-        )
-        config.pad_token_id = 3
-        config.quantize = quantize
-        config.speculator = speculator
-        torch.distributed.barrier(group=self.process_group)
-        filenames = weight_files(model_id, revision=revision, extension=".safetensors")
-        weights = Weights(
-            filenames,
-            device=device,
-            dtype=dtype,
-            process_group=self.process_group,
-            prefix="transformer",
-        )
-        if config.quantize in ["gptq", "marlin"]:
-            weights._set_gptq_params(model_id, revision)
-        model = BloomForCausalLM(config, weights)
-        torch.distributed.barrier(group=self.process_group)
-        super(CausalLM, self).__init__(
-            model_id=model_id,
-            model=model,
-            tokenizer=tokenizer,
-            requires_padding=True,
-            dtype=dtype,
-            device=device,
-            rank=rank,
-            world_size=world_size,
-        )
    @property
    def batch_type(self) -> Type[CausalLMBatch]:
        return BloomCausalLMBatch

--- a/server/text_generation_server/models/causal_lm.py
+++ b/server/text_generation_server/models/causal_lm.py
 import torch
 import time
+import torch.distributed
 from dataclasses import dataclass
 from opentelemetry import trace
-from transformers import AutoTokenizer, AutoModelForCausalLM, PreTrainedTokenizerBase
+from transformers import (
+    AutoConfig,
+    AutoTokenizer,
+    AutoModelForCausalLM,
+    PreTrainedTokenizerBase,
+)
 from typing import Optional, Tuple, List, Type, Dict
+from text_generation_server.utils import (
+    initialize_torch_distributed,
+    weight_files,
+    Weights,
+)
 from text_generation_server.models import Model
 from text_generation_server.utils.chunks import concat_text_chunks
+from text_generation_server.utils.import_utils import SYSTEM
 from text_generation_server.utils.tokens import batch_top_tokens
 from text_generation_server.models.types import (
    Batch,
@@ -478,10 +490,87 @@ class CausalLMBatch(Batch):
        return len(self.requests)
+@dataclass
+class CausalLMBatchKeysLast(Batch):
+    keys_head_dim_last: bool = False
 class CausalLM(Model):
    def __init__(
        self,
        model_id: str,
+        model_class,
+        revision: Optional[str] = None,
+        quantize: Optional[str] = None,
+        speculator: Optional[str] = None,
+        dtype: Optional[torch.dtype] = None,
+        default_dtype=torch.float16,
+        trust_remote_code: bool = False,
+        tokenizer_class=AutoTokenizer,
+        config_class=AutoConfig,
+        batch_class=CausalLMBatch,
+    ):
+        self.batch_class = batch_class
+        self.process_group, rank, world_size = initialize_torch_distributed()
+        if torch.cuda.is_available():
+            device = torch.device(f"cuda:{rank}")
+            dtype = default_dtype if dtype is None else dtype
+        elif SYSTEM == "ipex":
+            if hasattr(torch, "xpu") and torch.xpu.is_available():
+                device = torch.device(f"xpu:{rank}")
+                dtype = default_dtype if dtype is None else dtype
+            else:
+                device = torch.device("cpu")
+                # Float16 doesn't exist on target.
+                dtype = torch.bfloat16 if dtype is None else dtype
+        else:
+            device = torch.device("cpu")
+            dtype = torch.float32 if dtype is None else dtype
+        tokenizer = tokenizer_class.from_pretrained(
+            model_id,
+            revision=revision,
+            padding_side="left",
+            truncation_side="left",
+            trust_remote_code=trust_remote_code,
+        )
+        config = config_class.from_pretrained(
+            model_id,
+            revision=revision,
+            trust_remote_code=trust_remote_code,
+        )
+        config.quantize = quantize
+        config.speculator = speculator
+        if tokenizer.pad_token_id is None:
+            tokenizer.pad_token_id = config.pad_token_id
+        torch.distributed.barrier(group=self.process_group)
+        filenames = weight_files(model_id, revision=revision, extension=".safetensors")
+        weights = Weights(
+            filenames, device=device, dtype=dtype, process_group=self.process_group
+        )
+        if config.quantize in ["awq", "exl2", "gptq", "marlin"]:
+            weights._set_gptq_params(model_id, revision)
+        model = model_class(config, weights)
+        torch.distributed.barrier(group=self.process_group)
+        super().__init__(
+            model_id=model_id,
+            model=model,
+            tokenizer=tokenizer,
+            requires_padding=True,
+            dtype=dtype,
+            device=device,
+            rank=rank,
+            world_size=world_size,
+        )
+    @classmethod
+    def fallback(
+        cls,
+        model_id: str,
        revision: Optional[str] = None,
        quantize: Optional[str] = None,
        speculator: Optional[str] = None,
@@ -537,7 +626,12 @@ class CausalLM(Model):
            else:
                tokenizer.add_special_tokens({"pad_token": "[PAD]"})
-        super(CausalLM, self).__init__(
+        self = cls.__new__(
+            cls,
+        )
+        self.batch_class = CausalLMBatch
+        super().__init__(
+            self,
            model_id=model_id,
            model=model,
            tokenizer=tokenizer,
@@ -545,15 +639,11 @@ class CausalLM(Model):
            dtype=dtype,
            device=device,
        )
+        return self
    @property
    def batch_type(self) -> Type[CausalLMBatch]:
-        return CausalLMBatch
+        return self.batch_class
-    def decode(self, generated_ids: List[int]) -> str:
-        return self.tokenizer.decode(
-            generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
-        )
    def forward(
        self, input_ids, attention_mask, position_ids, past_key_values: Optional = None

--- a/server/text_generation_server/models/custom_modeling/flash_gemma2_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_gemma2_modeling.py
@@ -442,7 +442,7 @@ class FlashGemma2Model(torch.nn.Module):
 class FlashGemma2ForCausalLM(torch.nn.Module):
-    def __init__(self, prefix, config, weights, causal: bool):
+    def __init__(self, prefix, config, weights, *, causal: bool = True):
        super().__init__()
        embed_norm = config.hidden_size**0.5

--- a/server/text_generation_server/models/custom_modeling/flash_gemma_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_gemma_modeling.py
@@ -419,7 +419,7 @@ class FlashGemmaModel(torch.nn.Module):
 class FlashGemmaForCausalLM(torch.nn.Module):
-    def __init__(self, prefix, config, weights, causal: bool):
+    def __init__(self, prefix, config, weights, *, causal: bool = True):
        super().__init__()
        embed_norm = config.hidden_size**0.5

--- a/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py
@@ -464,8 +464,9 @@ class FlashSantacoderModel(nn.Module):
 class FlashSantacoderForCausalLM(nn.Module):
-    def __init__(self, config, weights):
+    def __init__(self, prefix, config, weights):
        super().__init__()
+        config.transpose = config.architectures[0].startswith("GPT2")
        self.transformer = FlashSantacoderModel(config, weights)
        self.lm_head = SpeculativeHead.load(
            config, prefix="transformer.wte", weights=weights

--- a/server/text_generation_server/models/custom_modeling/llava_next.py
+++ b/server/text_generation_server/models/custom_modeling/llava_next.py
@@ -136,7 +136,7 @@ class LlavaNextForConditionalGeneration(nn.Module):
        self.config = config
        config.text_config.quantize = config.quantize
        config.text_config.speculator = config.speculator
-        self.language_model = load_text_model(
+        self.text_model = load_text_model(
            prefix="language_model" if not prefix else f"{prefix}.language_model",
            config=config.text_config,
            weights=weights,
@@ -180,7 +180,7 @@ class LlavaNextForConditionalGeneration(nn.Module):
        image_sizes: Optional[torch.LongTensor] = None,
        adapter_data: Optional[torch.Tensor] = None,
    ):
-        inputs_embeds = self.language_model.embed_tokens(input_ids)
+        inputs_embeds = self.text_model.embed_tokens(input_ids)
        if pixel_values is not None and len(pixel_values) > 0:
            # num_special_image_tokens = (input_ids == self.config.image_token_index).sum()
            # assert num_special_image_tokens == len(pixel_values), f"Received {num_special_image_tokens} for {len(pixel_values)} images, this is invalid"
@@ -269,7 +269,7 @@ class LlavaNextForConditionalGeneration(nn.Module):
                input_ids, inputs_embeds, image_features
            )
-        hidden_states = self.language_model.model(
+        hidden_states = self.text_model.model(
            inputs_embeds=inputs_embeds,
            position_ids=position_ids,
            cu_seqlen_prefill=cu_seqlen_prefill,
@@ -283,5 +283,5 @@ class LlavaNextForConditionalGeneration(nn.Module):
        )
        if lm_head_indices is not None:
            hidden_states = hidden_states[lm_head_indices]
-        logits, speculative_logits = self.language_model.lm_head(hidden_states)
+        logits, speculative_logits = self.text_model.lm_head(hidden_states)
        return logits, speculative_logits
--- a/server/text_generation_server/models/flash_causal_lm.py
+++ b/server/text_generation_server/models/flash_causal_lm.py
@@ -10,7 +10,12 @@ import numpy as np
 from loguru import logger
 from dataclasses import dataclass
 from opentelemetry import trace
-from transformers import PreTrainedTokenizerBase
+from transformers import (
+    PreTrainedTokenizerBase,
+    AutoConfig,
+    AutoTokenizer,
+    GenerationConfig,
+)
 from typing import Iterable, Optional, Tuple, List, Type, Dict
 from text_generation_server.adapters import AdapterBatchData, AdapterBatchMetadata
@@ -21,6 +26,12 @@ from text_generation_server.models import Model
 from text_generation_server.utils.tokens import batch_top_tokens
 from text_generation_server.utils.dist import RANK
 from text_generation_server.utils.speculate import get_speculate
+from text_generation_server.utils import (
+    initialize_torch_distributed,
+    weight_files,
+    Weights,
+    hub,
+)
 from text_generation_server.models.types import (
    Batch,
    Tokens,
@@ -799,29 +810,110 @@ class FlashCausalLMBatch(Batch):
        return len(self.requests)
+ADAPTER_LAYERS = [
+    "q_proj",
+    "k_proj",
+    "v_proj",
+    "o_proj",
+    "gate_proj",
+    "up_proj",
+    "down_proj",
+]
+ROW_PARALLEL = {"o_proj", "down_proj", "lm_head"}
 class FlashCausalLM(Model):
    def __init__(
        self,
        model_id: str,
-        model: torch.nn.Module,
+        model_class,
-        tokenizer: PreTrainedTokenizerBase,
+        revision: Optional[str] = None,
-        num_layers: int,
+        quantize: Optional[str] = None,
-        num_kv_heads: int,
+        speculator: Optional[str] = None,
-        head_size: int,
+        dtype: Optional[torch.dtype] = None,
-        dtype: torch.dtype,
+        trust_remote_code: bool = False,
-        device: torch.device,
+        lora_adapter_ids: Optional[list] = [],
-        rank: int = 0,
+        tokenizer_class: PreTrainedTokenizerBase = AutoTokenizer,
-        world_size: int = 1,
+        config_class: PreTrainedTokenizerBase = AutoConfig,
-        sliding_window: Optional[int] = None,
+        default_dtype=torch.float16,
+        aliases=None,
+        # Used for Santacoder override of config
+        num_kv_heads=None,
+        skip_special_tokens: bool = True,
    ):
-        self.num_layers = num_layers
+        self.process_group, rank, world_size = initialize_torch_distributed()
-        self.num_kv_heads = num_kv_heads
+        if torch.cuda.is_available():
-        self.head_size = head_size
+            device = torch.device(f"cuda:{rank}")
+            dtype = default_dtype if dtype is None else dtype
+        elif SYSTEM == "ipex":
+            if hasattr(torch, "xpu") and torch.xpu.is_available():
+                device = torch.device(f"xpu:{rank}")
+                dtype = default_dtype if dtype is None else dtype
+            else:
+                device = torch.device("cpu")
+                # Float16 doesn't exist on target.
+                dtype = torch.bfloat16 if dtype is None else dtype
+        else:
+            raise NotImplementedError(f"{model_class} is only available on GPU")
+        tokenizer = tokenizer_class.from_pretrained(
+            model_id,
+            revision=revision,
+            padding_side="left",
+            truncation_side="left",
+            trust_remote_code=trust_remote_code,
+        )
+        try:
+            generation_config = GenerationConfig.from_pretrained(
+                model_id, revision=revision, trust_remote_code=trust_remote_code
+            )
+            if isinstance(generation_config.eos_token_id, (list, set)):
+                # TODO Huge hack
+                tokenizer._eos_token_ids = set(generation_config.eos_token_id)
+        except Exception:
+            pass
+        config = config_class.from_pretrained(
+            model_id, revision=revision, trust_remote_code=trust_remote_code
+        )
+        config.quantize = quantize
+        config.speculator = speculator
+        if getattr(config, "sliding_window", None) is not None:
+            set_sliding_window(config.sliding_window)
+        else:
+            config.sliding_window = None
+        torch.distributed.barrier(group=self.process_group)
+        filenames = weight_files(model_id, revision=revision, extension=".safetensors")
+        weights = Weights(
+            filenames, device, dtype, process_group=self.process_group, aliases=aliases
+        )
+        if config.quantize in ["awq", "exl2", "gptq", "marlin"]:
+            weights._set_gptq_params(model_id, revision)
+        prefix = ""
+        model = model_class(prefix, config, weights)
+        torch.distributed.barrier(group=self.process_group)
+        # VLM models define the config we care about in their text_config
+        text_config = getattr(config, "text_config", None)
+        if text_config is not None:
+            config = text_config
+        self.num_layers = config.num_hidden_layers
+        # Validation is done in the model itself
+        if num_kv_heads is None:
+            num_kv_heads = getattr(config, "num_key_value_heads", None)
+            if num_kv_heads is None:
+                # Final overide for GPT2
+                num_kv_heads = config.n_head
+        self.num_kv_heads = num_kv_heads // self.process_group.size()
+        self.head_size = config.hidden_size // config.num_attention_heads
        self.cuda_graphs = {}
        self.kv_cache = []
-        super(FlashCausalLM, self).__init__(
+        super().__init__(
            model_id=model_id,
            model=model,
            tokenizer=tokenizer,
@@ -830,7 +922,7 @@ class FlashCausalLM(Model):
            device=device,
            rank=rank,
            world_size=world_size,
-            sliding_window=sliding_window,
+            sliding_window=config.sliding_window,
        )
    @property
@@ -1578,3 +1670,72 @@ class FlashCausalLM(Model):
        forward_ns = start_decode - start
        decode_ns = time.time_ns() - start_decode
        return generations, batch, (forward_ns, decode_ns)
+    @property
+    def supports_adapter_loading(self) -> bool:
+        return True
+    def adapter_target_to_layer(self) -> Dict[str, Tuple[str, torch.Tensor]]:
+        layer_weights = {}
+        prefix = "model.layers"
+        # This accounts for VLMs (e.g. LlavaNext, Idefics2)
+        # that have a language_model inside of the larger model.
+        if hasattr(self.model, "language_model"):
+            _model = self.model.language_model
+        elif hasattr(self.model, "text_model"):
+            _model = self.model.text_model
+        else:
+            _model = self.model
+        for i, layer in enumerate(_model.model.layers):
+            layer_weights[(i, "q_proj")] = (
+                f"{prefix}.{i}.self_attn.q_proj",
+                layer.self_attn.query_key_value,
+            )
+            layer_weights[(i, "k_proj")] = (
+                f"{prefix}.{i}.self_attn.k_proj",
+                layer.self_attn.query_key_value,
+            )
+            layer_weights[(i, "v_proj")] = (
+                f"{prefix}.{i}.self_attn.v_proj",
+                layer.self_attn.query_key_value,
+            )
+            layer_weights[(i, "o_proj")] = (
+                f"{prefix}.{i}.self_attn.o_proj",
+                layer.self_attn.o_proj,
+            )
+            # TODO: this is a hack to avoid the gate_proj for
+            # FlashStarcoder2 that doesnt have these layers
+            if hasattr(layer, "mlp") and hasattr(layer.mlp, "gate_up_proj"):
+                layer_weights[(i, "gate_proj")] = (
+                    f"{prefix}.{i}.mlp.gate_proj",
+                    layer.mlp.gate_up_proj,
+                )
+                layer_weights[(i, "up_proj")] = (
+                    f"{prefix}.{i}.mlp.up_proj",
+                    layer.mlp.gate_up_proj,
+                )
+                layer_weights[(i, "down_proj")] = (
+                    f"{prefix}.{i}.mlp.down_proj",
+                    layer.mlp.down_proj,
+                )
+        layer_weights[(0, "lm_head")] = ("lm_head", _model.lm_head)
+        return layer_weights
+    @property
+    def adapter_layers(self) -> List[str]:
+        return ADAPTER_LAYERS
+    @property
+    def default_traced_adapter_layers(self) -> List[str]:
+        return ["q_proj", "v_proj"]
+    def get_num_layers_for_type(self, layer_type: str) -> int:
+        return 1 if layer_type == "lm_head" else len(self.model.model.layers)
+    def is_row_parallel(self, layer_type: str) -> bool:
+        return layer_type in ROW_PARALLEL
--- a/server/text_generation_server/models/flash_cohere.py
+++ b/server/text_generation_server/models/flash_cohere.py
-import torch
-import torch.distributed
-from opentelemetry import trace
-from typing import Optional
-from transformers import AutoTokenizer, AutoConfig
-from text_generation_server.models import FlashCausalLM
-from text_generation_server.models.custom_modeling.flash_cohere_modeling import (
-    FlashCohereForCausalLM,
-)
-from text_generation_server.utils import (
-    initialize_torch_distributed,
-    weight_files,
-    Weights,
-)
-tracer = trace.get_tracer(__name__)
-class FlashCohere(FlashCausalLM):
-    def __init__(
-        self,
-        model_id: str,
-        revision: Optional[str] = None,
-        quantize: Optional[str] = None,
-        speculator: Optional[str] = None,
-        dtype: Optional[torch.dtype] = None,
-        trust_remote_code: bool = False,
-    ):
-        self.process_group, rank, world_size = initialize_torch_distributed()
-        if torch.cuda.is_available():
-            device = torch.device(f"cuda:{rank}")
-            dtype = torch.float16 if dtype is None else dtype
-        else:
-            raise NotImplementedError("FlashCohere is only available on GPU")
-        tokenizer = AutoTokenizer.from_pretrained(
-            model_id,
-            revision=revision,
-            padding_side="left",
-            truncation_side="left",
-            trust_remote_code=trust_remote_code,
-            use_fast=True,
-            from_slow=False,
-        )
-        config = AutoConfig.from_pretrained(
-            model_id, revision=revision, trust_remote_code=trust_remote_code
-        )
-        config.quantize = quantize
-        config.speculator = speculator
-        torch.distributed.barrier(group=self.process_group)
-        filenames = weight_files(model_id, revision=revision, extension=".safetensors")
-        weights = Weights(filenames, device, dtype, process_group=self.process_group)
-        if config.quantize in ["gptq", "awq", "marlin"]:
-            weights._set_gptq_params(model_id, revision)
-        model = FlashCohereForCausalLM(config, weights)
-        torch.distributed.barrier(group=self.process_group)
-        super(FlashCohere, self).__init__(
-            model_id=model_id,
-            model=model,
-            tokenizer=tokenizer,
-            num_layers=len(model.model.layers),
-            num_kv_heads=model.model.num_key_value_heads,
-            head_size=model.model.head_size,
-            dtype=dtype,
-            device=device,
-            rank=rank,
-            world_size=world_size,
-        )
--- a/server/text_generation_server/models/flash_dbrx.py
+++ b/server/text_generation_server/models/flash_dbrx.py
-import torch
-import torch.distributed
-from opentelemetry import trace
-from typing import Optional
-from transformers import AutoTokenizer
-from transformers.models.gpt2 import GPT2TokenizerFast
-from text_generation_server.models import FlashCausalLM
-from text_generation_server.models.custom_modeling.flash_dbrx_modeling import (
-    FlashDbrxForCausalLM,
-    DbrxConfig,
-)
-from text_generation_server.utils import (
-    initialize_torch_distributed,
-    weight_files,
-    Weights,
-)
-tracer = trace.get_tracer(__name__)
-class FlashDbrx(FlashCausalLM):
-    def __init__(
-        self,
-        model_id: str,
-        revision: Optional[str] = None,
-        quantize: Optional[str] = None,
-        speculator: Optional[str] = None,
-        dtype: Optional[torch.dtype] = None,
-        trust_remote_code: bool = False,
-    ):
-        self.process_group, rank, world_size = initialize_torch_distributed()
-        if torch.cuda.is_available():
-            device = torch.device(f"cuda:{rank}")
-            dtype = torch.bfloat16 if dtype is None else dtype
-        else:
-            raise NotImplementedError("FlashDBRX is only available on GPU")
-        try:
-            tokenizer = GPT2TokenizerFast.from_pretrained(
-                model_id,
-                revision=revision,
-                padding_side="left",
-                truncation_side="left",
-                trust_remote_code=trust_remote_code,
-                use_fast=True,
-                from_slow=False,
-            )
-        except:
-            try:
-                tokenizer = AutoTokenizer.from_pretrained(
-                    model_id,
-                    revision=revision,
-                    padding_side="left",
-                    truncation_side="left",
-                    trust_remote_code=trust_remote_code,
-                    use_fast=True,
-                    from_slow=False,
-                )
-            except:
-                # FIXME: change back to model id once the tokenizer.json is merged
-                tokenizer = GPT2TokenizerFast.from_pretrained(
-                    "Xenova/dbrx-instruct-tokenizer",
-                    revision=revision,
-                    padding_side="left",
-                    truncation_side="left",
-                    trust_remote_code=trust_remote_code,
-                    use_fast=True,
-                    from_slow=False,
-                )
-        config = DbrxConfig.from_pretrained(
-            model_id, revision=revision, trust_remote_code=trust_remote_code
-        )
-        config.quantize = quantize
-        config.speculator = speculator
-        torch.distributed.barrier(group=self.process_group)
-        filenames = weight_files(model_id, revision=revision, extension=".safetensors")
-        weights = Weights(filenames, device, dtype, process_group=self.process_group)
-        if config.quantize in ["gptq", "awq", "marlin"]:
-            weights._set_gptq_params(model_id, revision)
-        model = FlashDbrxForCausalLM(config, weights)
-        torch.distributed.barrier(group=self.process_group)
-        super(FlashDbrx, self).__init__(
-            model_id=model_id,
-            model=model,
-            tokenizer=tokenizer,
-            num_layers=len(model.model.layers),
-            num_kv_heads=model.model.num_key_value_heads,
-            head_size=model.model.head_size,
-            dtype=dtype,
-            device=device,
-            rank=rank,
-            world_size=world_size,
-        )
--- a/server/text_generation_server/models/flash_gemma.py
+++ b/server/text_generation_server/models/flash_gemma.py
-import torch
-import torch.distributed
-from opentelemetry import trace
-from typing import Optional
-from transformers import AutoConfig, AutoTokenizer
-from text_generation_server.models import FlashCausalLM
-from text_generation_server.models.custom_modeling.flash_gemma_modeling import (
-    FlashGemmaForCausalLM,
-)
-from text_generation_server.utils import (
-    initialize_torch_distributed,
-    weight_files,
-    Weights,
-)
-from text_generation_server.utils.import_utils import SYSTEM
-tracer = trace.get_tracer(__name__)
-class FlashGemma(FlashCausalLM):
-    def __init__(
-        self,
-        model_id: str,
-        revision: Optional[str] = None,
-        quantize: Optional[str] = None,
-        speculator: Optional[str] = None,
-        dtype: Optional[torch.dtype] = None,
-        trust_remote_code: bool = False,
-    ):
-        self.process_group, rank, world_size = initialize_torch_distributed()
-        if torch.cuda.is_available():
-            device = torch.device(f"cuda:{rank}")
-            dtype = torch.bfloat16 if dtype is None else dtype
-        elif SYSTEM == "ipex":
-            if hasattr(torch, "xpu") and torch.xpu.is_available():
-                device = torch.device(f"xpu:{rank}")
-                dtype = torch.float16 if dtype is None else dtype
-            else:
-                device = torch.device("cpu")
-                dtype = torch.bfloat16 if dtype is None else dtype
-        else:
-            raise NotImplementedError("FlashGemma is only available on GPU")
-        tokenizer = AutoTokenizer.from_pretrained(
-            model_id,
-            revision=revision,
-            padding_side="left",
-            truncation_side="left",
-            trust_remote_code=trust_remote_code,
-        )
-        config = AutoConfig.from_pretrained(
-            model_id, revision=revision, trust_remote_code=trust_remote_code
-        )
-        config.quantize = quantize
-        config.speculator = speculator
-        torch.distributed.barrier(group=self.process_group)
-        filenames = weight_files(model_id, revision=revision, extension=".safetensors")
-        weights = Weights(filenames, device, dtype, process_group=self.process_group)
-        if config.quantize in ["gptq", "awq", "marlin"]:
-            weights._set_gptq_params(model_id, revision)
-        # TODO hardcoded
-        prefix = ""
-        model = FlashGemmaForCausalLM(prefix, config, weights, causal=True)
-        torch.distributed.barrier(group=self.process_group)
-        super(FlashGemma, self).__init__(
-            model_id=model_id,
-            model=model,
-            tokenizer=tokenizer,
-            num_layers=len(model.model.layers),
-            num_kv_heads=model.model.num_key_value_heads,
-            head_size=model.model.head_size,
-            dtype=dtype,
-            device=device,
-            rank=rank,
-            world_size=world_size,
-        )
--- a/server/text_generation_server/models/flash_gemma2.py
+++ b/server/text_generation_server/models/flash_gemma2.py
-import torch
-import torch.distributed
-from opentelemetry import trace
-from typing import Optional
-from transformers import PretrainedConfig, AutoTokenizer
-from text_generation_server.models import FlashCausalLM
-from text_generation_server.models.custom_modeling.flash_gemma2_modeling import (
-    FlashGemma2ForCausalLM,
-)
-from text_generation_server.utils import (
-    initialize_torch_distributed,
-    weight_files,
-    Weights,
-)
-from text_generation_server.utils.import_utils import SYSTEM
-tracer = trace.get_tracer(__name__)
-class FlashGemma2(FlashCausalLM):
-    def __init__(
-        self,
-        model_id: str,
-        revision: Optional[str] = None,
-        quantize: Optional[str] = None,
-        speculator: Optional[str] = None,
-        dtype: Optional[torch.dtype] = None,
-        trust_remote_code: bool = False,
-    ):
-        self.process_group, rank, world_size = initialize_torch_distributed()
-        if torch.cuda.is_available():
-            device = torch.device(f"cuda:{rank}")
-            dtype = torch.bfloat16 if dtype is None else dtype
-        elif SYSTEM == "ipex":
-            if hasattr(torch, "xpu") and torch.xpu.is_available():
-                device = torch.device(f"xpu:{rank}")
-                dtype = torch.float16 if dtype is None else dtype
-            else:
-                device = torch.device("cpu")
-                dtype = torch.bfloat16 if dtype is None else dtype
-        else:
-            raise NotImplementedError("FlashGemma2 is only available on GPU")
-        tokenizer = AutoTokenizer.from_pretrained(
-            model_id,
-            revision=revision,
-            padding_side="left",
-            truncation_side="left",
-            trust_remote_code=trust_remote_code,
-        )
-        config = PretrainedConfig.from_pretrained(
-            model_id, revision=revision, trust_remote_code=trust_remote_code
-        )
-        config.quantize = quantize
-        config.speculator = speculator
-        torch.distributed.barrier(group=self.process_group)
-        filenames = weight_files(model_id, revision=revision, extension=".safetensors")
-        weights = Weights(filenames, device, dtype, process_group=self.process_group)
-        if config.quantize in ["gptq", "awq", "marlin"]:
-            weights._set_gptq_params(model_id, revision)
-        # TODO hardcoded
-        prefix = ""
-        model = FlashGemma2ForCausalLM(prefix, config, weights, causal=True)
-        torch.distributed.barrier(group=self.process_group)
-        super(FlashGemma2, self).__init__(
-            model_id=model_id,
-            model=model,
-            tokenizer=tokenizer,
-            num_layers=len(model.model.layers),
-            num_kv_heads=model.model.num_key_value_heads,
-            head_size=model.model.head_size,
-            dtype=dtype,
-            device=device,
-            rank=rank,
-            world_size=world_size,
-        )