Revamp medusa implementation so that every model can benefit. (#1588)

# What does this PR do?

Fixes # (issue)

## Before submitting

- [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case).
- [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section?
- [ ] Was this discussed/approved via a GitHub issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case.
- [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation).
- [ ] Did you write any new necessary tests?

## Who can review?

Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR.

Revamp medusa implementation so that every model can benefit. (#1588)
# What does this PR do?

Fixes # (issue)

## Before submitting

- [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case).
- [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section?
- [ ] Was this discussed/approved via a GitHub issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case.
- [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation).
- [ ] Did you write any new necessary tests?

## Who can review?

Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR.
bf700e7e · Nicolas Patry · GitHub · ac5a1c6f · bf700e7e · bf700e7e
Unverified Commit bf700e7e authored Feb 26, 2024 by Nicolas Patry Committed by GitHub Feb 26, 2024
20 changed files
--- a/integration-tests/conftest.py
+++ b/integration-tests/conftest.py
@@ -236,6 +236,7 @@ def launcher(event_loop):
        use_flash_attention: bool = True,
        disable_grammar_support: bool = False,
        dtype: Optional[str] = None,
+        revision: Optional[str] = None,
    ):
        port = random.randint(8000, 10_000)
        master_port = random.randint(10_000, 20_000)
@@ -268,6 +269,9 @@ def launcher(event_loop):
        if dtype is not None:
            args.append("--dtype")
            args.append(dtype)
+        if revision is not None:
+            args.append("--revision")
+            args.append(revision)
        if trust_remote_code:
            args.append("--trust-remote-code")
@@ -302,6 +306,7 @@ def launcher(event_loop):
        use_flash_attention: bool = True,
        disable_grammar_support: bool = False,
        dtype: Optional[str] = None,
+        revision: Optional[str] = None,
    ):
        port = random.randint(8000, 10_000)
@@ -317,6 +322,9 @@ def launcher(event_loop):
        if dtype is not None:
            args.append("--dtype")
            args.append(dtype)
+        if revision is not None:
+            args.append("--revision")
+            args.append(revision)
        if trust_remote_code:
            args.append("--trust-remote-code")

--- a/integration-tests/models/test_flash_medusa.py
+++ b/integration-tests/models/test_flash_medusa.py
@@ -3,7 +3,9 @@ import pytest
 @pytest.fixture(scope="module")
 def flash_medusa_handle(launcher):
-    with launcher("FasterDecoding/medusa-vicuna-7b-v1.3", num_shard=2) as handle:
+    with launcher(
+        "FasterDecoding/medusa-vicuna-7b-v1.3", num_shard=2, revision="refs/pr/1"
+    ) as handle:
        yield handle

--- a/server/text_generation_server/cli.py
+++ b/server/text_generation_server/cli.py
@@ -154,12 +154,8 @@ def download_weights(
            import json
            medusa_head = hf_hub_download(
-                model_id, revision=revision, filename="medusa_lm_head.pt"
+                model_id, revision=revision, filename="medusa_lm_head.safetensors"
            )
-            if auto_convert:
-                medusa_sf = Path(medusa_head[: -len(".pt")] + ".safetensors")
-                if not medusa_sf.exists():
-                    utils.convert_files([Path(medusa_head)], [medusa_sf], [])
            medusa_config = hf_hub_download(
                model_id, revision=revision, filename="config.json"
            )
@@ -198,16 +194,12 @@ def download_weights(
            if not extension == ".safetensors" or not auto_convert:
                raise e
-    elif (Path(model_id) / "medusa_lm_head.pt").exists():
+    elif (Path(model_id) / "medusa_lm_head.safetensors").exists():
        # Try to load as a local Medusa model
        try:
            import json
-            medusa_head = Path(model_id) / "medusa_lm_head.pt"
+            medusa_head = Path(model_id) / "medusa_lm_head.safetensors"
-            if auto_convert:
-                medusa_sf = Path(model_id) / "medusa_lm_head.safetensors"
-                if not medusa_sf.exists():
-                    utils.convert_files([Path(medusa_head)], [medusa_sf], [])
            medusa_config = Path(model_id) / "config.json"
            with open(medusa_config, "r") as f:
                config = json.load(f)

--- a/server/text_generation_server/models/__init__.py
+++ b/server/text_generation_server/models/__init__.py
@@ -3,7 +3,9 @@ import torch
 from loguru import logger
 from transformers.configuration_utils import PretrainedConfig
 from transformers.models.auto import modeling_auto
+from huggingface_hub import hf_hub_download
 from typing import Optional
+from pathlib import Path
 from text_generation_server.utils.speculate import get_speculate, set_speculate
 from text_generation_server.models.model import Model
@@ -115,44 +117,14 @@ def get_model(
    else:
        set_speculate(0)
-    if "facebook/galactica" in model_id:
-        return GalacticaSharded(
-            model_id,
-            revision,
-            quantize=quantize,
-            dtype=dtype,
-            trust_remote_code=trust_remote_code,
-        )
-    if model_id.startswith("bigcode/"):
-        if FLASH_ATTENTION:
-            return FlashSantacoderSharded(
-                model_id,
-                revision,
-                quantize=quantize,
-                dtype=dtype,
-                trust_remote_code=trust_remote_code,
-            )
-        elif sharded:
-            raise NotImplementedError(
-                FLASH_ATT_ERROR_MESSAGE.format("Sharded Santacoder")
-            )
-        else:
-            return SantaCoder(
-                model_id,
-                revision,
-                quantize=quantize,
-                dtype=dtype,
-                trust_remote_code=trust_remote_code,
-            )
    config_dict, _ = PretrainedConfig.get_config_dict(
        model_id, revision=revision, trust_remote_code=trust_remote_code
    )
    use_medusa = None
    if "medusa_num_heads" in config_dict:
-        use_medusa = model_id
+        medusa_model_id = model_id
+        medusa_revision = revision
        model_id = config_dict["base_model_name_or_path"]
        revision = "main"
        speculate_medusa = config_dict["medusa_num_heads"]
@@ -169,6 +141,20 @@ def get_model(
        config_dict, _ = PretrainedConfig.get_config_dict(
            model_id, revision=revision, trust_remote_code=trust_remote_code
        )
+        is_local = Path(medusa_model_id).exists()
+        if not is_local:
+            medusa_config = hf_hub_download(
+                medusa_model_id, revision=medusa_revision, filename="config.json"
+            )
+            hf_hub_download(
+                medusa_model_id,
+                revision=medusa_revision,
+                filename="medusa_lm_head.safetensors",
+            )
+            use_medusa = Path(medusa_config).parent
+        else:
+            use_medusa = Path(medusa_model_id)
        method = "medusa"
    else:
        method = "n-gram"
@@ -193,16 +179,22 @@ def get_model(
            model_id,
            revision,
            quantize=quantize,
+            use_medusa=use_medusa,
            dtype=dtype,
            trust_remote_code=trust_remote_code,
        )
-    if model_type == "gpt_bigcode":
+    if (
+        model_type == "gpt_bigcode"
+        or model_type == "gpt2"
+        and model_id.startswith("bigcode/")
+    ):
        if FLASH_ATTENTION:
            return FlashSantacoderSharded(
                model_id,
                revision,
                quantize=quantize,
+                use_medusa=use_medusa,
                dtype=dtype,
                trust_remote_code=trust_remote_code,
            )
@@ -215,6 +207,7 @@ def get_model(
                model_id,
                revision,
                quantize=quantize,
+                use_medusa=use_medusa,
                dtype=dtype,
                trust_remote_code=trust_remote_code,
            )
@@ -224,6 +217,7 @@ def get_model(
            model_id,
            revision,
            quantize=quantize,
+            use_medusa=use_medusa,
            dtype=dtype,
            trust_remote_code=trust_remote_code,
        )
@@ -232,6 +226,7 @@ def get_model(
            model_id,
            revision,
            quantize=quantize,
+            use_medusa=use_medusa,
            dtype=dtype,
            trust_remote_code=trust_remote_code,
        )
@@ -242,6 +237,7 @@ def get_model(
                model_id,
                revision,
                quantize=quantize,
+                use_medusa=use_medusa,
                dtype=dtype,
                trust_remote_code=trust_remote_code,
            )
@@ -250,6 +246,7 @@ def get_model(
                model_id,
                revision,
                quantize=quantize,
+                use_medusa=use_medusa,
                dtype=dtype,
                trust_remote_code=trust_remote_code,
            )
@@ -258,6 +255,7 @@ def get_model(
                model_id,
                revision,
                quantize=quantize,
+                use_medusa=use_medusa,
                dtype=dtype,
                trust_remote_code=trust_remote_code,
            )
@@ -268,15 +266,16 @@ def get_model(
                model_id,
                revision,
                quantize=quantize,
+                use_medusa=use_medusa,
                dtype=dtype,
                trust_remote_code=trust_remote_code,
-                use_medusa=use_medusa,
            )
        else:
            return CausalLM(
                model_id,
                revision,
                quantize=quantize,
+                use_medusa=use_medusa,
                dtype=dtype,
                trust_remote_code=trust_remote_code,
            )
@@ -291,6 +290,7 @@ def get_model(
                model_id,
                revision,
                quantize=quantize,
+                use_medusa=use_medusa,
                dtype=dtype,
                trust_remote_code=trust_remote_code,
            )
@@ -301,9 +301,9 @@ def get_model(
                model_id,
                revision,
                quantize=quantize,
+                use_medusa=use_medusa,
                dtype=dtype,
                trust_remote_code=trust_remote_code,
-                use_medusa=use_medusa,
            )
        elif sharded:
            raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded Llama"))
@@ -312,6 +312,7 @@ def get_model(
                model_id,
                revision,
                quantize=quantize,
+                use_medusa=use_medusa,
                dtype=dtype,
                trust_remote_code=trust_remote_code,
            )
@@ -321,9 +322,9 @@ def get_model(
                model_id,
                revision,
                quantize=quantize,
+                use_medusa=use_medusa,
                dtype=dtype,
                trust_remote_code=trust_remote_code,
-                use_medusa=use_medusa,
            )
        elif sharded:
            raise NotImplementedError(
@@ -334,6 +335,7 @@ def get_model(
                model_id,
                revision,
                quantize=quantize,
+                use_medusa=use_medusa,
                dtype=dtype,
                trust_remote_code=trust_remote_code,
            )
@@ -347,6 +349,7 @@ def get_model(
                    model_id,
                    revision,
                    quantize=quantize,
+                    use_medusa=use_medusa,
                    dtype=dtype,
                    trust_remote_code=trust_remote_code,
                )
@@ -357,6 +360,7 @@ def get_model(
                    model_id,
                    revision,
                    quantize=quantize,
+                    use_medusa=use_medusa,
                    dtype=dtype,
                    trust_remote_code=trust_remote_code,
                )
@@ -365,6 +369,7 @@ def get_model(
                    model_id,
                    revision,
                    quantize=quantize,
+                    use_medusa=use_medusa,
                    dtype=dtype,
                    trust_remote_code=trust_remote_code,
                )
@@ -378,6 +383,7 @@ def get_model(
                model_id,
                revision,
                quantize=quantize,
+                use_medusa=use_medusa,
                dtype=dtype,
                trust_remote_code=trust_remote_code,
            )
@@ -391,6 +397,7 @@ def get_model(
                model_id,
                revision,
                quantize=quantize,
+                use_medusa=use_medusa,
                dtype=dtype,
                trust_remote_code=trust_remote_code,
            )
@@ -400,6 +407,7 @@ def get_model(
            model_id,
            revision,
            quantize=quantize,
+            use_medusa=use_medusa,
            dtype=dtype,
            trust_remote_code=trust_remote_code,
        )
@@ -409,6 +417,7 @@ def get_model(
            model_id,
            revision,
            quantize=quantize,
+            use_medusa=use_medusa,
            dtype=dtype,
            trust_remote_code=trust_remote_code,
        )
@@ -418,6 +427,7 @@ def get_model(
                model_id,
                revision,
                quantize=quantize,
+                use_medusa=use_medusa,
                dtype=dtype,
                trust_remote_code=trust_remote_code,
            )
@@ -441,6 +451,7 @@ def get_model(
            model_id,
            revision,
            quantize=quantize,
+            use_medusa=use_medusa,
            dtype=dtype,
            trust_remote_code=trust_remote_code,
        )
@@ -449,6 +460,7 @@ def get_model(
            model_id,
            revision,
            quantize=quantize,
+            use_medusa=use_medusa,
            dtype=dtype,
            trust_remote_code=trust_remote_code,
        )
@@ -460,6 +472,7 @@ def get_model(
                model_id,
                revision,
                quantize=quantize,
+                use_medusa=use_medusa,
                dtype=dtype,
                trust_remote_code=trust_remote_code,
            )
@@ -468,6 +481,7 @@ def get_model(
                model_id,
                revision,
                quantize=quantize,
+                use_medusa=use_medusa,
                dtype=dtype,
                trust_remote_code=trust_remote_code,
            )

--- a/server/text_generation_server/models/bloom.py
+++ b/server/text_generation_server/models/bloom.py
@@ -42,6 +42,7 @@ class BLOOMSharded(CausalLM):
        model_id: str,
        revision: Optional[str] = None,
        quantize: Optional[str] = None,
+        use_medusa: Optional[str] = None,
        dtype: Optional[torch.dtype] = None,
        trust_remote_code: bool = False,
    ):
@@ -70,6 +71,7 @@ class BLOOMSharded(CausalLM):
        )
        config.pad_token_id = 3
        config.quantize = quantize
+        config.use_medusa = use_medusa
        torch.distributed.barrier(group=self.process_group)
        filenames = weight_files(model_id, revision=revision, extension=".safetensors")
@@ -103,7 +105,7 @@ class BLOOMSharded(CausalLM):
    def forward(
        self, input_ids, attention_mask, position_ids, past_key_values: Optional = None
    ):
-        outputs = self.model.forward(
+        outputs, speculative_logits = self.model.forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
@@ -112,4 +114,4 @@ class BLOOMSharded(CausalLM):
        )
        logits = outputs.logits
-        return logits, outputs.past_key_values
+        return logits, speculative_logits, outputs.past_key_values
--- a/server/text_generation_server/models/causal_lm.py
+++ b/server/text_generation_server/models/causal_lm.py
@@ -482,6 +482,7 @@ class CausalLM(Model):
        model_id: str,
        revision: Optional[str] = None,
        quantize: Optional[str] = None,
+        use_medusa: Optional[str] = None,
        dtype: Optional[torch.dtype] = None,
        trust_remote_code: bool = False,
    ):
@@ -550,7 +551,9 @@ class CausalLM(Model):
    def forward(
        self, input_ids, attention_mask, position_ids, past_key_values: Optional = None
-    ) -> Tuple[torch.Tensor, List[Tuple[torch.Tensor, torch.Tensor]]]:
+    ) -> Tuple[
+        torch.Tensor, Optional[torch.Tensor], List[Tuple[torch.Tensor, torch.Tensor]]
+    ]:
        # Model Forward
        kwargs = {
            "input_ids": input_ids,
@@ -563,7 +566,11 @@ class CausalLM(Model):
            kwargs["position_ids"] = position_ids
        outputs = self.model.forward(**kwargs)
-        return outputs.logits, outputs.past_key_values
+        if isinstance(outputs, tuple):
+            outputs, speculative_logits = outputs
+        else:
+            speculative_logits = None
+        return outputs.logits, speculative_logits, outputs.past_key_values
    @tracer.start_as_current_span("generate_token")
    def generate_token(
@@ -573,7 +580,7 @@ class CausalLM(Model):
        # slice the attention mask to the correct shape
        attention_mask = batch.attention_mask[:, : -batch.padding_right_offset]
-        logits, past = self.forward(
+        logits, speculative_logits, past = self.forward(
            batch.input_ids,
            attention_mask,
            batch.position_ids,

--- a/server/text_generation_server/models/custom_modeling/bloom_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/bloom_modeling.py
@@ -36,7 +36,7 @@ from text_generation_server.utils.layers import (
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    TensorParallelRowLinear,
-    TensorParallelHead,
+    SpeculativeHead,
 )
 CUSTOM_KERNELS_ENABLED = False
@@ -820,7 +820,7 @@ class BloomForCausalLM(BloomPreTrainedModel):
        super().__init__(config)
        self.transformer = BloomModel(config, weights)
-        self.lm_head = TensorParallelHead.load(
+        self.lm_head = SpeculativeHead.load(
            config,
            prefix="word_embeddings",
            weights=weights,
@@ -904,17 +904,20 @@ class BloomForCausalLM(BloomPreTrainedModel):
        )
        hidden_states = transformer_outputs[0]
-        lm_logits = self.lm_head(hidden_states)
+        logits, speculative_logits = self.lm_head(hidden_states)
        loss = None
        if not return_dict:
            output = (lm_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output
-        return CausalLMOutputWithCrossAttentions(
+        return (
+            CausalLMOutputWithCrossAttentions(
                loss=loss,
-            logits=lm_logits,
+                logits=logits,
                past_key_values=transformer_outputs.past_key_values,
                hidden_states=transformer_outputs.hidden_states,
                attentions=transformer_outputs.attentions,
+            ),
+            speculative_logits,
        )
--- a/server/text_generation_server/models/custom_modeling/flash_gemma_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_gemma_modeling.py
@@ -37,7 +37,7 @@ from text_generation_server.utils.layers import (
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    PositionRotaryEmbedding,
-    TensorParallelHead,
+    SpeculativeHead,
    get_linear,
    FastRMSNorm,
 )
@@ -575,7 +575,7 @@ class FlashGemmaForCausalLM(torch.nn.Module):
        super().__init__()
        self.model = FlashGemmaModel(config, weights)
-        self.lm_head = TensorParallelHead.load(
+        self.lm_head = SpeculativeHead.load(
            config,
            prefix="model.embed_tokens" if config.tie_word_embeddings else "lm_head",
            weights=weights,
@@ -592,7 +592,7 @@ class FlashGemmaForCausalLM(torch.nn.Module):
        input_lengths: torch.Tensor,
        max_s: int,
        lm_head_indices: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        hidden_states = self.model(
            input_ids,
            position_ids,
@@ -605,5 +605,5 @@ class FlashGemmaForCausalLM(torch.nn.Module):
        )
        if lm_head_indices is not None:
            hidden_states = hidden_states[lm_head_indices]
-        logits = self.lm_head(hidden_states)
+        logits, speculative_logits = self.lm_head(hidden_states)
-        return logits
+        return logits, speculative_logits
--- a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
@@ -32,7 +32,7 @@ from text_generation_server.utils.layers import (
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    PositionRotaryEmbedding,
-    TensorParallelHead,
+    SpeculativeHead,
    get_linear,
    FastRMSNorm,
 )
@@ -410,7 +410,7 @@ class FlashLlamaForCausalLM(torch.nn.Module):
        super().__init__()
        self.model = FlashLlamaModel(config, weights)
-        self.lm_head = TensorParallelHead.load(
+        self.lm_head = SpeculativeHead.load(
            config,
            prefix="lm_head",
            weights=weights,
@@ -427,7 +427,7 @@ class FlashLlamaForCausalLM(torch.nn.Module):
        input_lengths: torch.Tensor,
        max_s: int,
        lm_head_indices: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        hidden_states = self.model(
            input_ids,
            position_ids,
@@ -440,5 +440,5 @@ class FlashLlamaForCausalLM(torch.nn.Module):
        )
        if lm_head_indices is not None:
            hidden_states = hidden_states[lm_head_indices]
-        logits = self.lm_head(hidden_states)
+        logits, speculative_logits = self.lm_head(hidden_states)
-        return logits
+        return logits, speculative_logits
--- a/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py
@@ -32,7 +32,7 @@ from text_generation_server.utils.layers import (
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    PositionRotaryEmbedding,
-    TensorParallelHead,
+    SpeculativeHead,
    get_linear,
    FastRMSNorm,
 )
@@ -419,7 +419,7 @@ class FlashMistralForCausalLM(torch.nn.Module):
        super().__init__()
        self.model = MistralModel(config, weights)
-        self.lm_head = TensorParallelHead.load(
+        self.lm_head = SpeculativeHead.load(
            config,
            prefix="lm_head",
            weights=weights,

--- a/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py
@@ -37,7 +37,7 @@ from text_generation_server.utils.layers import (
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    PositionRotaryEmbedding,
-    TensorParallelHead,
+    SpeculativeHead,
    get_linear,
 )
@@ -810,7 +810,7 @@ class FlashMixtralForCausalLM(torch.nn.Module):
        super().__init__()
        self.model = MixtralModel(config, weights)
-        self.lm_head = TensorParallelHead.load(
+        self.lm_head = SpeculativeHead.load(
            config,
            prefix="lm_head",
            weights=weights,

--- a/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py
@@ -33,7 +33,7 @@ from text_generation_server.utils.layers import (
    TensorParallelRowLinear,
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
-    TensorParallelHead,
+    SpeculativeHead,
    FastLayerNorm,
    PositionRotaryEmbedding,
    get_linear,
@@ -369,7 +369,7 @@ class FlashGPTNeoXForCausalLM(FlashGPTNeoXPreTrainedModel):
        super().__init__(config)
        self.gpt_neox = FlashGPTNeoXModel(config, weights)
-        self.embed_out = TensorParallelHead.load(
+        self.embed_out = SpeculativeHead.load(
            config, prefix="embed_out", weights=weights
        )

--- a/server/text_generation_server/models/custom_modeling/flash_phi_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_phi_modeling.py
@@ -12,7 +12,7 @@ from text_generation_server.utils.layers import (
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    PositionRotaryEmbedding,
-    TensorParallelHead,
+    SpeculativeHead,
    get_linear,
    FastLayerNorm,
 )
@@ -376,7 +376,7 @@ class FlashPhiForCausalLM(torch.nn.Module):
        super().__init__()
        self.model = FlashPhiModel(config, weights)
-        self.lm_head = TensorParallelHead.load(
+        self.lm_head = SpeculativeHead.load(
            config,
            prefix="lm_head",
            weights=weights,

--- a/server/text_generation_server/models/custom_modeling/flash_rw_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_rw_modeling.py
@@ -12,7 +12,7 @@ from text_generation_server.utils.layers import (
    TensorParallelRowLinear,
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
-    TensorParallelHead,
+    SpeculativeHead,
    FastLayerNorm,
    PositionRotaryEmbedding,
    get_linear,
@@ -613,9 +613,7 @@ class FlashRWForCausalLM(FlashRWPreTrainedModel):
        self.transformer = FlashRWModel(config, weights)
-        self.lm_head = TensorParallelHead.load(
+        self.lm_head = SpeculativeHead.load(config, prefix="lm_head", weights=weights)
-            config, prefix="lm_head", weights=weights
-        )
    def forward(
        self,

--- a/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py
@@ -9,7 +9,7 @@ from text_generation_server.utils import paged_attention, flash_attn
 from text_generation_server.utils.layers import (
    TensorParallelRowLinear,
    TensorParallelColumnLinear,
-    TensorParallelHead,
+    SpeculativeHead,
    TensorParallelEmbedding,
    FastLayerNorm,
    get_linear,
@@ -453,7 +453,7 @@ class FlashSantacoderForCausalLM(nn.Module):
    def __init__(self, config, weights):
        super().__init__()
        self.transformer = FlashSantacoderModel(config, weights)
-        self.lm_head = TensorParallelHead.load(
+        self.lm_head = SpeculativeHead.load(
            config, prefix="transformer.wte", weights=weights
        )

--- a/server/text_generation_server/models/custom_modeling/idefics_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/idefics_modeling.py
@@ -51,7 +51,7 @@ from text_generation_server.utils.layers import (
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    TensorParallelRowLinear,
-    TensorParallelHead,
+    SpeculativeHead,
    PositionRotaryEmbedding,
    FastLinear,
 )
@@ -272,9 +272,7 @@ class IdeficsDecoupledTensorParallelLinear(nn.Module):
        weights,
    ) -> None:
        super().__init__()
-        self.fc = TensorParallelHead.load(
+        self.fc = SpeculativeHead.load(config=config, prefix="lm_head", weights=weights)
-            config=config, prefix="lm_head", weights=weights
-        )
        self.additional_fc = FastLinear.load(
            config=config,
            prefix="lm_head.additional_fc",
@@ -283,11 +281,11 @@ class IdeficsDecoupledTensorParallelLinear(nn.Module):
        )
    def forward(self, input: torch.Tensor) -> torch.Tensor:
-        output = self.fc(input)
+        output, speculative_logits = self.fc(input)
        additional_features = self.additional_fc(input)
        output = torch.cat((output, additional_features), -1)
-        return output
+        return output, speculative_logits
    def extra_repr(self) -> str:
        """Overwriting `nn.Linear.extra_repr` to include new parameters."""
@@ -1503,17 +1501,20 @@ class IdeficsForVisionText2Text(IdeficsPreTrainedModel):
        )
        hidden_states = outputs[0]
-        logits = self.lm_head(hidden_states)
+        logits, speculative_logits = self.lm_head(hidden_states)
        loss = None
-        return CausalLMOutputWithPastImage(
+        return (
+            CausalLMOutputWithPastImage(
                loss=loss,
                logits=logits,
                past_key_values=outputs.past_key_values,
                hidden_states=outputs.hidden_states,
                attentions=outputs.attentions,
                image_hidden_states=outputs.image_hidden_states,
+            ),
+            speculative_logits,
        )
    def prepare_inputs_for_generation(self, input_ids, past=None, **kwargs):

--- a/server/text_generation_server/models/custom_modeling/mamba_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/mamba_modeling.py
@@ -9,6 +9,7 @@ from transformers.configuration_utils import PretrainedConfig
 import torch.nn.functional as F
 from text_generation_server.utils.layers import (
+    SpeculativeHead,
    TensorParallelEmbedding,
    FastRMSNorm,
    FastLinear,
@@ -205,14 +206,12 @@ class MambaModel(nn.Module):
        self.norm_f = FastRMSNorm.load(
            f"{prefix}.norm_f", weights, eps=config.layer_norm_epsilon
        )
-        self.lm_head = FastLinear.load(
+        self.lm_head = SpeculativeHead.load(config, f"{prefix}.embedding", weights)
-            config, f"{prefix}.embedding", weights, bias=False
-        )
        self.config = config
    def forward(
        self, input_ids: torch.Tensor, inference_params=None, residual=None
-    ) -> Tuple[torch.Tensor, torch.Tensor, InferenceParams]:
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        hidden_states = self.embed_tokens(input_ids)
        for i, block in enumerate(self.blocks):
            hidden_states, residual, conv_state, ssm_state = block(
@@ -226,8 +225,8 @@ class MambaModel(nn.Module):
        )
        hidden_states, _ = self.norm_f(hidden_states.view(-1, hidden_states.size(-1)))
        hidden_states = hidden_states.view(residual.shape)
-        logits = self.lm_head(hidden_states)
+        logits, speculative_logits = self.lm_head(hidden_states)
        # update the offset for the next inference using these params
        inference_params.seqlen_offset += input_ids.size(1)
-        return logits
+        return logits, speculative_logits
--- a/server/text_generation_server/models/custom_modeling/mpt_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/mpt_modeling.py
@@ -21,7 +21,7 @@ from text_generation_server.utils.layers import (
    TensorParallelEmbedding,
    TensorParallelColumnLinear,
    TensorParallelRowLinear,
-    TensorParallelHead,
+    SpeculativeHead,
    get_linear,
 )
@@ -1090,7 +1090,7 @@ class MPTForCausalLM(MPTPreTrainedModel):
        if not config.tie_word_embeddings:
            raise ValueError("MPTForCausalLM only supports tied word embeddings")
        self.transformer = MPTModel(config, weights)
-        self.lm_head = TensorParallelHead.load(
+        self.lm_head = SpeculativeHead.load(
            config, prefix="transformer.wte", weights=weights
        )
        self.logit_scale = None
@@ -1133,7 +1133,7 @@ class MPTForCausalLM(MPTPreTrainedModel):
            output_hidden_states=output_hidden_states,
            use_cache=use_cache,
        )
-        logits = self.lm_head(outputs.last_hidden_state)
+        logits, speculative_logits = self.lm_head(outputs.last_hidden_state)
        if self.logit_scale is not None:
            if self.logit_scale == 0:
                warnings.warn(
@@ -1147,12 +1147,15 @@ class MPTForCausalLM(MPTPreTrainedModel):
            loss = F.cross_entropy(
                logits.view(-1, logits.size(-1)), labels.to(logits.device).view(-1)
            )
-        return CausalLMOutputWithPast(
+        return (
+            CausalLMOutputWithPast(
                loss=loss,
                logits=logits,
                past_key_values=outputs.past_key_values,
                hidden_states=outputs.hidden_states,
                attentions=outputs.attentions,
+            ),
+            speculative_logits,
        )
    def prepare_inputs_for_generation(

--- a/server/text_generation_server/models/custom_modeling/neox_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/neox_modeling.py
@@ -44,7 +44,7 @@ from text_generation_server.utils.layers import (
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    TensorParallelRowLinear,
-    TensorParallelHead,
+    SpeculativeHead,
 )
@@ -646,7 +646,7 @@ class GPTNeoxForCausalLM(GPTNeoXPreTrainedModel):
    def __init__(self, config, weights):
        super().__init__(config)
        self.gpt_neox = GPTNeoXModel(config, weights)
-        self.embed_out = TensorParallelHead.load(
+        self.embed_out = SpeculativeHead.load(
            config, prefix="embed_out", weights=weights
        )

--- a/server/text_generation_server/models/custom_modeling/opt_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/opt_modeling.py
@@ -32,7 +32,7 @@ from text_generation_server.utils.layers import (
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    TensorParallelRowLinear,
-    TensorParallelHead,
+    SpeculativeHead,
 )
 EPS = 1e-5
@@ -748,7 +748,7 @@ class OPTForCausalLM(OPTPreTrainedModel):
        self.model = OPTModel(config, weights)
-        self.lm_head = TensorParallelHead.load(
+        self.lm_head = SpeculativeHead.load(
            config, prefix="model.decoder.embed_tokens", weights=weights
        )