# NOTE: On ROCm systems, we use a ROPE implementation adapted from vLLM, which launches a single kernel for both query and key, contrary to the flash-attn implementation used on NVIDIA systems.
# When compiling the flash-attn rotary kernel on ROCm, hipcc appears unable to unroll loops, resulting in inference that is even slower than eager mode: https://github.com/pytorch/pytorch/issues/113773
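For reference, the fused path amounts to applying one rotary transform to query and key together. Below is a minimal pure-PyTorch sketch of that idea; it is illustrative only (the function name is made up, it is not the HIP/CUDA kernel, and it uses the "rotate half" convention, which may differ from the kernel's layout):

import torch

def apply_rotary_to_qk(query, key, cos, sin):
    # Apply the same cos/sin tables to query and key in one call,
    # mirroring the single-kernel approach described above.
    def rotate_half(x):
        x1, x2 = x.chunk(2, dim=-1)
        return torch.cat((-x2, x1), dim=-1)

    q_rot = query * cos + rotate_half(query) * sin
    k_rot = key * cos + rotate_half(key) * sin
    return q_rot, k_rot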
"Your system seem to be not supported. Please check your install or open an issue at https://github.com/huggingface/text-generation-inference/issues with a clear reproduction."
# For ROCm, we always use float cos/sin to avoid a cast.
# For NVIDIA, for some reason, the flash-attn rotary kernel requires cos/sin and query/key to be of the same dtype: https://github.com/Dao-AILab/flash-attention/blob/017716451d446e464dde9aca3a3c1ed2209caaa9/csrc/rotary/rotary.cpp#L26
# But it later casts cos/sin to float anyway: https://github.com/Dao-AILab/flash-attention/blob/017716451d446e464dde9aca3a3c1ed2209caaa9/csrc/rotary/rotary_cuda.cu#L29, which looks suboptimal.
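A sketch of the dtype handling these comments describe, as a hypothetical helper (the name and signature are assumptions, not TGI's API):

import torch

def get_cos_sin_for_kernel(cos: torch.Tensor, sin: torch.Tensor,
                           query: torch.Tensor, system: str):
    # Hypothetical helper: pick the cos/sin dtype expected by the backend.
    if system == "rocm":
        # Keep float32 cos/sin so the fused ROCm kernel needs no cast.
        return cos.float(), sin.float()
    # The flash-attn rotary kernel requires cos/sin to match the query dtype,
    # even though it later upcasts them to float internally.
    return cos.to(query.dtype), sin.to(query.dtype)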
f"The backend {SYSTEM} does not support sliding window attention that is used by the model type {model_type}. To use this model nonetheless with the {SYSTEM} backend, please launch TGI with the argument `--max-input-tokens` smaller than sliding_window={sliding_window} (got here max_input_tokens={max_input_tokens})."
)
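The error above reflects a simple constraint: if the backend cannot honor the model's sliding window, the input must be short enough that the window is never exceeded. A minimal sketch of that guard, with hypothetical names (check_sliding_window_support, supports_window):

def check_sliding_window_support(supports_window: bool, sliding_window,
                                 max_input_tokens: int, system: str, model_type: str):
    # Hypothetical guard illustrating the check that raises the error above.
    if sliding_window is not None and not supports_window:
        if max_input_tokens > sliding_window:
            raise ValueError(
                f"The backend {system} does not support sliding window attention "
                f"used by the model type {model_type}; launch with "
                f"--max-input-tokens smaller than sliding_window={sliding_window} "
                f"(got max_input_tokens={max_input_tokens})."
            )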
     if model_type == MAMBA:
         return Mamba(
             model_id,
             revision,
             quantize=quantize,
-            use_medusa=use_medusa,
+            speculator=speculator,
             dtype=dtype,
             trust_remote_code=trust_remote_code,
         )
...
...
@@ -212,14 +449,14 @@ def get_model(
             model_id,
             revision,
             quantize=quantize,
-            use_medusa=use_medusa,
+            speculator=speculator,
             dtype=dtype,
             trust_remote_code=trust_remote_code,
         )
     if (
-        model_type == "gpt_bigcode"
-        or model_type == "gpt2"
+        model_type == GPT_BIGCODE
+        or model_type == GPT2
         and model_id.startswith("bigcode/")
     ):
         if FLASH_ATTENTION:
...
...
@@ -227,7 +464,7 @@ def get_model(
             model_id,
             revision,
             quantize=quantize,
-            use_medusa=use_medusa,
+            speculator=speculator,
             dtype=dtype,
             trust_remote_code=trust_remote_code,
         )
...
...
@@ -240,37 +477,69 @@ def get_model(
             model_id,
             revision,
             quantize=quantize,
-            use_medusa=use_medusa,
+            speculator=speculator,
             dtype=dtype,
             trust_remote_code=trust_remote_code,
         )
-    if model_type == "bloom":
+    if model_type == BLOOM:
         return BLOOMSharded(
             model_id,
             revision,
             quantize=quantize,
-            use_medusa=use_medusa,
+            speculator=speculator,
             dtype=dtype,
             trust_remote_code=trust_remote_code,
         )
-    elif model_type == "mpt":
+    elif model_type == MPT:
         return MPTSharded(
             model_id,
             revision,
             quantize=quantize,
-            use_medusa=use_medusa,
+            speculator=speculator,
             dtype=dtype,
             trust_remote_code=trust_remote_code,
         )
-    elif model_type == "gpt_neox":
+    elif model_type == GPT2:
+        if FLASH_ATTENTION:
+            try:
+                return FlashGPT2(
+                    model_id,
+                    revision,
+                    quantize=quantize,
+                    speculator=speculator,
+                    dtype=dtype,
+                    trust_remote_code=trust_remote_code,
+                )
+            except RuntimeError as e:
+                # Lots of legacy models with various weight names.
# NOTE: On ROCm systems, we use a ROPE implementation adapted from vLLM, which launches a single kernel for both query and key, contrary to the flash-attn implementation used on NVIDIA systems.
# When compiling the flash-attn rotary kernel on ROCm, hipcc appears unable to unroll loops, resulting in inference that is even slower than eager mode: https://github.com/pytorch/pytorch/issues/113773
...
...
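More generally, the GPT-2 branch above follows a try-the-flash-kernel-then-fall-back pattern. Here is a minimal, self-contained sketch of that pattern with hypothetical names (load_with_fallback, load_flash_model, load_eager_model); it is not the elided TGI code:

import logging

logger = logging.getLogger(__name__)

def load_with_fallback(load_flash_model, load_eager_model, model_id: str):
    # Prefer the flash-attention implementation, but fall back to an eager
    # implementation when legacy checkpoints use unexpected weight names.
    try:
        return load_flash_model(model_id)
    except RuntimeError as e:
        logger.warning("Falling back to the eager implementation for %s: %s",
                       model_id, e)
        return load_eager_model(model_id)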
@@ -73,7 +82,7 @@ class CohereRotary(PositionRotaryEmbedding):
"Your system seem to be not supported. Please check your install or open an issue at https://github.com/huggingface/text-generation-inference/issues with a clear reproduction."
...
...
@@ -90,7 +99,7 @@ class CohereLayerNorm(nn.Module):