Unverified commit bf700e7e authored by Nicolas Patry, committed by GitHub

Revamp medusa implementation so that every model can benefit. (#1588)

# What does this PR do?

<!--
Congratulations! You've made it this far! You're not quite done yet
though.

Once merged, your PR is going to appear in the release notes with the
title you set, so make sure it's a great title that fully reflects the
extent of your awesome contribution.

Then, please replace this with a description of the change and which
issue is fixed (if applicable). Please also include relevant motivation
and context. List any dependencies (if any) that are required for this
change.

Once you're done, someone will review your PR shortly (see the section
"Who can review?" below to tag some potential reviewers). They may
suggest changes to make the code even better. If no one reviewed your PR
after a week has passed, don't hesitate to post a new comment
@-mentioning the same persons---sometimes notifications get lost.
-->

<!-- Remove if not applicable -->

Fixes # (issue)


## Before submitting
- [ ] This PR fixes a typo or improves the docs (you can dismiss the
other checks if that's the case).
- [ ] Did you read the [contributor
guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests),
      Pull Request section?
- [ ] Was this discussed/approved via a GitHub issue or the
[forum](https://discuss.huggingface.co/)? Please add a link
      to it if that's the case.
- [ ] Did you make sure to update the documentation with your changes?
Here are the
[documentation
guidelines](https://github.com/huggingface/transformers/tree/main/docs),
and
[here are tips on formatting
docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation).
- [ ] Did you write any new necessary tests?


## Who can review?

Anyone in the community is free to review the PR once the tests have
passed. Feel free to tag
members/contributors who may be interested in your PR.

<!-- Your PR will be replied to more quickly if you can figure out the
right person to tag with @


@OlivierDehaene OR @Narsil

 -->
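The change itself: Medusa support is lifted out of per-model loading code and into the shared layers, where `SpeculativeHead` replaces `TensorParallelHead` as the LM head. Every model's `forward` now returns a `(logits, speculative_logits)` pair, with `speculative_logits` being `None` whenever no Medusa checkpoint is configured, so any model wired through `SpeculativeHead` benefits. Below is a minimal sketch of that contract; it is illustrative only, not the actual `text_generation_server` implementation (the real head is built via `SpeculativeHead.load(config, prefix, weights)` and is tensor-parallel aware).

```python
from typing import Optional, Tuple

import torch
import torch.nn as nn


class SpeculativeHeadSketch(nn.Module):
    """Illustrative stand-in for SpeculativeHead: an ordinary LM head,
    plus an optional Medusa module producing extra speculative logits."""

    def __init__(self, lm_head: nn.Module, medusa: Optional[nn.Module] = None):
        super().__init__()
        self.lm_head = lm_head
        self.medusa = medusa

    def forward(
        self, hidden: torch.Tensor
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        # Callers can unconditionally unpack two values.
        logits = self.lm_head(hidden)
        speculative_logits = self.medusa(hidden) if self.medusa is not None else None
        return logits, speculative_logits


# Without a Medusa checkpoint the second element is simply None:
head = SpeculativeHeadSketch(nn.Linear(16, 32))
logits, speculative_logits = head(torch.randn(2, 16))
assert logits.shape == (2, 32) and speculative_logits is None
```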
parent ac5a1c6f
@@ -13,7 +13,7 @@ from text_generation_server.utils.layers import (
     TensorParallelRowLinear,
     TensorParallelColumnLinear,
     TensorParallelEmbedding,
-    TensorParallelHead,
+    SpeculativeHead,
     FastLinear,
 )
@@ -120,7 +120,7 @@ class PhiCausalLMHead(nn.Module):
             weights=weights,
             eps=config.layer_norm_epsilon,
         )
-        self.linear = TensorParallelHead.load(
+        self.linear = SpeculativeHead.load(
             config=config, prefix="lm_head.linear", weights=weights
         )
@@ -42,7 +42,7 @@ from text_generation_server.utils.layers import (
     TensorParallelColumnLinear,
     TensorParallelEmbedding,
     TensorParallelRowLinear,
-    TensorParallelHead,
+    SpeculativeHead,
 )
@@ -1033,14 +1033,14 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
         )
         try:
-            self.lm_head = TensorParallelHead.load(
+            self.lm_head = SpeculativeHead.load(
                 config, prefix="lm_head", weights=weights
             )
         except RuntimeError:
             # Some models like t5-small were saved with shared weights unlike flan
             # Since they are declared as the same arch we have no choice but hope
             # that this is OK instead of using a proper flag.
-            self.lm_head = TensorParallelHead.load(
+            self.lm_head = SpeculativeHead.load(
                 config, prefix="shared", weights=weights
             )
@@ -1126,7 +1126,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
             # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586
             sequence_output = sequence_output * (self.model_dim**-0.5)
-        lm_logits = self.lm_head(sequence_output)
+        logits, speculative_logits = self.lm_head(sequence_output)
         loss = None
         if labels is not None:
@@ -1140,9 +1140,10 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
             output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs
             return ((loss,) + output) if loss is not None else output

-        return Seq2SeqLMOutput(
+        return (
+            Seq2SeqLMOutput(
                 loss=loss,
-                logits=lm_logits,
+                logits=logits,
                 past_key_values=decoder_outputs.past_key_values,
                 decoder_hidden_states=decoder_outputs.hidden_states,
                 decoder_attentions=decoder_outputs.attentions,
@@ -1150,6 +1151,8 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
                 encoder_last_hidden_state=encoder_outputs.last_hidden_state,
                 encoder_hidden_states=encoder_outputs.hidden_states,
                 encoder_attentions=encoder_outputs.attentions,
+            ),
+            speculative_logits,
         )

     def prepare_inputs_for_generation(
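Note the new T5 contract above: `self.lm_head(...)` now yields a pair, and `forward` returns the `Seq2SeqLMOutput` together with the speculative logits instead of the output object alone. A toy sketch of the caller-side view (names are illustrative stand-ins, not the real classes):

```python
from typing import NamedTuple, Optional, Tuple

import torch


class Seq2SeqLMOutputToy(NamedTuple):
    # Stand-in for transformers.modeling_outputs.Seq2SeqLMOutput
    logits: torch.Tensor


def t5_forward_toy(
    hidden: torch.Tensor,
) -> Tuple[Seq2SeqLMOutputToy, Optional[torch.Tensor]]:
    # Mirrors the new return shape: (Seq2SeqLMOutput, speculative_logits)
    logits = hidden @ torch.randn(hidden.shape[-1], 8)
    return Seq2SeqLMOutputToy(logits=logits), None


output, speculative_logits = t5_forward_toy(torch.randn(2, 4))
assert output.logits.shape == (2, 8) and speculative_logits is None
```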
@@ -723,7 +723,7 @@ class FlashCausalLM(Model):
         torch.cuda.synchronize()
         with torch.cuda.graph(graph, pool=MEM_POOL):
-            self.cuda_graphs[bs]["logits"] = self.model.forward(
+            logits, speculative_logits = self.model.forward(
                 input_ids=input_ids,
                 position_ids=position_ids,
                 cu_seqlen_prefill=None,
@@ -734,6 +734,8 @@ class FlashCausalLM(Model):
                 max_s=max_s,
                 lm_head_indices=None,
             )
+            self.cuda_graphs[bs]["logits"] = logits
+            self.cuda_graphs[bs]["speculative_logits"] = speculative_logits
         torch.cuda.synchronize()

     def warmup(self, batch: FlashCausalLMBatch):
@@ -805,7 +807,9 @@ class FlashCausalLM(Model):
         return int(num_blocks * BLOCK_SIZE)

-    def forward(self, batch: FlashCausalLMBatch) -> torch.Tensor:
+    def forward(
+        self, batch: FlashCausalLMBatch
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
         # Model Forward
         if batch.speculative_ids is not None:
             input_ids = batch.input_ids
@@ -900,9 +904,14 @@ class FlashCausalLM(Model):
         # Replay the graph
         cuda_graph["graph"].replay()

         # Slice output to the correct shape
-        return cuda_graph["logits"][:bs]
+        speculative_logits = (
+            cuda_graph["speculative_logits"][:bs]
+            if cuda_graph["speculative_logits"] is not None
+            else None
+        )
+        logits = cuda_graph["logits"][:bs]
+        return logits, speculative_logits

     @tracer.start_as_current_span("generate_token")
     def generate_token(
@@ -926,16 +935,11 @@ class FlashCausalLM(Model):
         batch.slots = slots

         try:
-            out = self.forward(batch)
+            out, speculative_logits = self.forward(batch)
         except Exception as e:
             del batch
             raise e

-        if isinstance(out, tuple):
-            out, speculative_logits = out
-        else:
-            speculative_logits = None
-
         if prefill:
             next_token_logits = (
                 out[batch.prefill_next_token_indices] if prefill_logprobs else out
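The CUDA-graph changes above follow from graphs replaying into fixed buffers: both output tensors must be captured once, then sliced to the live batch size on every replay. A generic sketch of that pattern (toy model, CUDA device required; only the capture/replay/slice shape matches the code above):

```python
import torch

if torch.cuda.is_available():
    model = torch.nn.Linear(8, 8).cuda()
    static_input = torch.zeros(4, 8, device="cuda")

    # Warm up outside the graph so lazy allocations don't happen mid-capture.
    for _ in range(3):
        model(static_input)
    torch.cuda.synchronize()

    graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(graph):
        static_logits = model(static_input)  # captured static output buffer

    # Replay for a smaller live batch: fill inputs, replay, slice outputs.
    bs = 2
    static_input[:bs].copy_(torch.randn(bs, 8, device="cuda"))
    graph.replay()
    logits = static_logits[:bs]  # same idea as cuda_graph["logits"][:bs]
```

A missing speculative buffer stays `None`, which is why the replay path guards the slice with `if cuda_graph["speculative_logits"] is not None`.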
@@ -25,9 +25,9 @@ class FlashGemma(FlashCausalLM):
         model_id: str,
         revision: Optional[str] = None,
         quantize: Optional[str] = None,
+        use_medusa: Optional[str] = None,
         dtype: Optional[torch.dtype] = None,
         trust_remote_code: bool = False,
-        use_medusa: Optional[str] = None,
     ):
         self.process_group, rank, world_size = initialize_torch_distributed()
         if torch.cuda.is_available():
@@ -50,6 +50,7 @@ class FlashGemma(FlashCausalLM):
             model_id, revision=revision, trust_remote_code=trust_remote_code
         )
         config.quantize = quantize
+        config.use_medusa = use_medusa

         torch.distributed.barrier(group=self.process_group)
@@ -59,36 +60,6 @@ class FlashGemma(FlashCausalLM):
             weights._set_gptq_params(model_id, revision)

         model = FlashGemmaForCausalLM(config, weights)
-        if use_medusa:
-            from text_generation_server.utils.medusa import MedusaModel
-            from huggingface_hub import hf_hub_download
-            import json
-            import os
-            from pathlib import Path
-
-            is_local_model = (
-                Path(use_medusa).exists() and Path(use_medusa).is_dir()
-            ) or os.getenv("WEIGHTS_CACHE_OVERRIDE", None) is not None
-
-            if not is_local_model:
-                medusa_config = hf_hub_download(
-                    use_medusa, revision=revision, filename="config.json"
-                )
-                medusa_head = hf_hub_download(
-                    use_medusa, revision=revision, filename="medusa_lm_head.pt"
-                )
-            else:
-                medusa_config = str(Path(use_medusa) / "config.json")
-                medusa_head = str(Path(use_medusa) / "medusa_lm_head.pt")
-
-            with open(medusa_config, "r") as f:
-                config = json.load(f)
-            medusa_sf = medusa_head[: -len(".pt")] + ".safetensors"
-            weights = Weights(
-                [medusa_sf], device, dtype, process_group=self.process_group
-            )
-            lm_head = model.lm_head
-            model.lm_head = MedusaModel(config, weights, lm_head)

         torch.distributed.barrier(group=self.process_group)
         super(FlashGemma, self).__init__(
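The block deleted above (download `medusa_lm_head.pt`/`config.json`, rebuild `Weights`, wrap `model.lm_head` in `MedusaModel`) was copy-pasted per model; it now lives behind `SpeculativeHead.load`, keyed off `config.use_medusa`. For reference, here is a rough sketch of what a Medusa module computes, based on the published Medusa design rather than the exact `text_generation_server.utils.medusa` code: each speculative head is a small residual MLP feeding an `lm_head`-sized projection, one head per speculated position.

```python
import torch
import torch.nn as nn


class ResBlock(nn.Module):
    """Residual SiLU block, as in the Medusa reference design."""

    def __init__(self, hidden_size: int):
        super().__init__()
        self.linear = nn.Linear(hidden_size, hidden_size)
        self.act = nn.SiLU()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x + self.act(self.linear(x))


class MedusaSketch(nn.Module):
    def __init__(self, hidden_size: int, vocab_size: int, n_heads: int = 4):
        super().__init__()
        self.heads = nn.ModuleList(
            nn.Sequential(
                ResBlock(hidden_size),
                nn.Linear(hidden_size, vocab_size, bias=False),
            )
            for _ in range(n_heads)
        )

    def forward(self, hidden: torch.Tensor) -> torch.Tensor:
        # One logit set per speculated position: [batch, n_heads, vocab]
        return torch.stack([head(hidden) for head in self.heads], dim=1)


medusa = MedusaSketch(hidden_size=16, vocab_size=32)
assert medusa(torch.randn(2, 16)).shape == (2, 4, 32)
```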
@@ -26,9 +26,9 @@ class FlashLlama(FlashCausalLM):
         model_id: str,
         revision: Optional[str] = None,
         quantize: Optional[str] = None,
+        use_medusa: Optional[str] = None,
         dtype: Optional[torch.dtype] = None,
         trust_remote_code: bool = False,
-        use_medusa: Optional[str] = None,
     ):
         self.process_group, rank, world_size = initialize_torch_distributed()
         if torch.cuda.is_available():
@@ -58,6 +58,7 @@ class FlashLlama(FlashCausalLM):
             model_id, revision=revision, trust_remote_code=trust_remote_code
         )
         config.quantize = quantize
+        config.use_medusa = use_medusa

         torch.distributed.barrier(group=self.process_group)
@@ -67,37 +68,6 @@ class FlashLlama(FlashCausalLM):
             weights._set_gptq_params(model_id, revision)

         model = FlashLlamaForCausalLM(config, weights)
-        if use_medusa:
-            from text_generation_server.utils.medusa import MedusaModel
-            from huggingface_hub import hf_hub_download
-            import json
-            import os
-            from pathlib import Path
-
-            is_local_model = (
-                Path(use_medusa).exists() and Path(use_medusa).is_dir()
-            ) or os.getenv("WEIGHTS_CACHE_OVERRIDE", None) is not None
-
-            if not is_local_model:
-                medusa_config = hf_hub_download(
-                    use_medusa, revision=revision, filename="config.json"
-                )
-                medusa_head = hf_hub_download(
-                    use_medusa, revision=revision, filename="medusa_lm_head.pt"
-                )
-            else:
-                medusa_config = str(Path(use_medusa) / "config.json")
-                medusa_head = str(Path(use_medusa) / "medusa_lm_head.pt")
-
-            with open(medusa_config, "r") as f:
-                config = json.load(f)
-            medusa_sf = medusa_head[: -len(".pt")] + ".safetensors"
-            weights = Weights(
-                [medusa_sf], device, dtype, process_group=self.process_group
-            )
-            lm_head = model.lm_head
-            model.lm_head = MedusaModel(config, weights, lm_head)

         torch.distributed.barrier(group=self.process_group)
         super(FlashLlama, self).__init__(
             model=model,
@@ -294,6 +294,7 @@ class BaseFlashMistral(FlashCausalLM):
         model_id: str,
         revision: Optional[str] = None,
         quantize: Optional[str] = None,
+        use_medusa: Optional[str] = None,
         dtype: Optional[torch.dtype] = None,
         trust_remote_code: bool = False,
     ):
@@ -319,6 +320,7 @@ class BaseFlashMistral(FlashCausalLM):
             model_id, revision=revision, trust_remote_code=trust_remote_code
         )
         config.quantize = quantize
+        config.use_medusa = use_medusa

         # Set context windows
         if config.sliding_window is not None:
@@ -394,7 +396,7 @@ class BaseFlashMistral(FlashCausalLM):
         torch.cuda.synchronize()
         with torch.cuda.graph(graph, pool=MEM_POOL):
-            self.cuda_graphs[bs]["logits"] = self.model.forward(
+            logits, speculative_logits = self.model.forward(
                 input_ids=input_ids,
                 position_ids=position_ids,
                 cu_seqlen_prefill=None,
@@ -406,9 +408,13 @@ class BaseFlashMistral(FlashCausalLM):
                 prefill_cache_indices=None,
                 lm_head_indices=None,
             )
+            self.cuda_graphs[bs]["logits"] = logits
+            self.cuda_graphs[bs]["speculative_logits"] = speculative_logits
         torch.cuda.synchronize()

-    def forward(self, batch: FlashMistralBatch) -> Tuple[torch.Tensor, torch.Tensor]:
+    def forward(
+        self, batch: FlashMistralBatch
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
         # Model Forward
         if batch.speculative_ids is not None:
             input_ids = batch.input_ids
@@ -479,7 +485,7 @@ class BaseFlashMistral(FlashCausalLM):
         cuda_graph = self.cuda_graphs.get(padded_bs, None)
         if cu_seqlen_prefill is not None or cuda_graph is None:
-            logits = self.model.forward(
+            logits, speculative_logits = self.model.forward(
                 input_ids=input_ids,
                 position_ids=position_ids,
                 cu_seqlen_prefill=cu_seqlen_prefill,
@@ -493,7 +499,7 @@ class BaseFlashMistral(FlashCausalLM):
             )
             if batch.prefill_cache_indices is not None:
                 batch.prefill_cache_indices = None
-            return logits
+            return logits, speculative_logits

         # Copy inputs to the static inputs of the cuda graph
         # Static inputs are potentially padded
@@ -511,7 +517,13 @@ class BaseFlashMistral(FlashCausalLM):
         cuda_graph["graph"].replay()

         # Slice output to the correct shape
-        return cuda_graph["logits"][:bs]
+        speculative_logits = (
+            cuda_graph["speculative_logits"][:bs]
+            if cuda_graph["speculative_logits"] is not None
+            else None
+        )
+        logits = cuda_graph["logits"][:bs]
+        return logits, speculative_logits


 class FlashMistral(BaseFlashMistral):
@@ -520,6 +532,7 @@ class FlashMistral(BaseFlashMistral):
         model_id: str,
         revision: Optional[str] = None,
         quantize: Optional[str] = None,
+        use_medusa: Optional[str] = None,
         dtype: Optional[torch.dtype] = None,
         trust_remote_code: bool = False,
     ):
@@ -529,6 +542,7 @@ class FlashMistral(BaseFlashMistral):
             model_id=model_id,
             revision=revision,
             quantize=quantize,
+            use_medusa=use_medusa,
             dtype=dtype,
             trust_remote_code=trust_remote_code,
         )
@@ -15,6 +15,7 @@ class FlashMixtral(BaseFlashMistral):
         model_id: str,
         revision: Optional[str] = None,
         quantize: Optional[str] = None,
+        use_medusa: Optional[str] = None,
         dtype: Optional[torch.dtype] = None,
         trust_remote_code: bool = False,
     ):
@@ -24,6 +25,7 @@ class FlashMixtral(BaseFlashMistral):
             model_id=model_id,
             revision=revision,
             quantize=quantize,
+            use_medusa=use_medusa,
             dtype=dtype,
             trust_remote_code=trust_remote_code,
         )
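Every constructor in this PR gains the same `use_medusa: Optional[str]` keyword and forwards it into `config.use_medusa` before weights are loaded, so the head loader can see it. Caller-side that looks roughly like the following (illustrative values; requires a text-generation-inference install plus model and Medusa weights):

```python
from text_generation_server.models.flash_mistral import FlashMistral

# Hypothetical Medusa checkpoint location: a Hub repo id or local directory,
# matching the removed hf_hub_download/Path logic above. None disables it.
model = FlashMistral(
    model_id="mistralai/Mistral-7B-Instruct-v0.1",  # illustrative model id
    use_medusa="./my-medusa-heads",
)
```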
@@ -24,6 +24,7 @@ class FlashNeoXSharded(FlashCausalLM):
         model_id: str,
         revision: Optional[str] = None,
         quantize: Optional[str] = None,
+        use_medusa: Optional[str] = None,
         dtype: Optional[torch.dtype] = None,
         trust_remote_code: bool = False,
     ):
@@ -46,6 +47,7 @@ class FlashNeoXSharded(FlashCausalLM):
             model_id, revision=revision, trust_remote_code=trust_remote_code
         )
         config.quantize = quantize
+        config.use_medusa = use_medusa

         torch.distributed.barrier(group=self.process_group)
         filenames = weight_files(model_id, revision=revision, extension=".safetensors")
@@ -25,9 +25,9 @@ class FlashPhi(FlashCausalLM):
         model_id: str,
         revision: Optional[str] = None,
         quantize: Optional[str] = None,
+        use_medusa: Optional[str] = None,
         dtype: Optional[torch.dtype] = None,
         trust_remote_code: bool = False,
-        use_medusa: Optional[str] = None,
     ):
         self.process_group, rank, world_size = initialize_torch_distributed()
         if torch.cuda.is_available():
@@ -48,6 +48,7 @@ class FlashPhi(FlashCausalLM):
             model_id, revision=revision, trust_remote_code=trust_remote_code
         )
         config.quantize = quantize
+        config.use_medusa = use_medusa

         torch.distributed.barrier(group=self.process_group)
@@ -25,6 +25,7 @@ class FlashRWSharded(FlashCausalLM):
         model_id: str,
         revision: Optional[str] = None,
         quantize: Optional[str] = None,
+        use_medusa: Optional[str] = None,
         dtype: Optional[torch.dtype] = None,
         trust_remote_code: bool = False,
     ):
@@ -61,6 +62,7 @@ class FlashRWSharded(FlashCausalLM):
         )

         config.quantize = quantize
+        config.use_medusa = use_medusa

         if config.quantize == "gptq":
             weights._set_gptq_params(model_id, revision)
@@ -27,6 +27,7 @@ class FlashSantacoderSharded(FlashCausalLM):
         model_id: str,
         revision: Optional[str] = None,
         quantize: Optional[str] = None,
+        use_medusa: Optional[str] = None,
         dtype: Optional[torch.dtype] = None,
         trust_remote_code: bool = False,
     ):
@@ -51,6 +52,7 @@ class FlashSantacoderSharded(FlashCausalLM):
             trust_remote_code=True,
         )
         config.quantize = quantize
+        config.use_medusa = use_medusa
         config.transpose = config.architectures[0].startswith("GPT2")

         torch.distributed.barrier(group=self.process_group)
@@ -31,6 +31,7 @@ class IDEFICSSharded(IdeficsCausalLM):
         model_id: str,
         revision: Optional[str] = None,
         quantize: Optional[str] = None,
+        use_medusa: Optional[str] = None,
         dtype: Optional[torch.dtype] = None,
         trust_remote_code: bool = False,
     ):
@@ -51,6 +52,7 @@ class IDEFICSSharded(IdeficsCausalLM):
             trust_remote_code=trust_remote_code,
         )
         config.quantize = quantize
+        config.use_medusa = use_medusa
         config.vision_config.quantize = quantize

         tokenizer = LlamaTokenizerFast.from_pretrained(
@@ -662,8 +662,13 @@ class IdeficsCausalLM(Model):
         if self.has_position_ids:
             kwargs["position_ids"] = position_ids

-        outputs = self.model.forward(**kwargs)
-        return outputs.logits, outputs.past_key_values, outputs.image_hidden_states
+        outputs, speculative_logits = self.model.forward(**kwargs)
+        return (
+            outputs.logits,
+            speculative_logits,
+            outputs.past_key_values,
+            outputs.image_hidden_states,
+        )

     @tracer.start_as_current_span("generate_token")
     def generate_token(
@@ -686,7 +691,7 @@ class IdeficsCausalLM(Model):
             :, : -batch.padding_right_offset
         ]

-        logits, past, image_hidden_states = self.forward(
+        logits, speculative_logits, past, image_hidden_states = self.forward(
             input_ids=batch.input_ids,
             attention_mask=attention_mask,
             position_ids=batch.position_ids,
@@ -408,6 +408,7 @@ class Mamba(Model):
         model_id: str,
         revision: Optional[str] = None,
         quantize: Optional[str] = None,
+        use_medusa: Optional[str] = None,
         dtype: Optional[torch.dtype] = None,
         trust_remote_code: bool = False,
     ):
@@ -444,6 +445,7 @@ class Mamba(Model):
         tokenizer.pad_token = tokenizer.eos_token

         config.quantize = quantize
+        config.use_medusa = use_medusa
         torch.distributed.barrier(group=self.process_group)
         filenames = weight_files(model_id, revision=revision, extension=".safetensors")
         weights = Weights(filenames, device, dtype, process_group=self.process_group)
@@ -505,7 +507,7 @@ class Mamba(Model):
         torch.cuda.synchronize()
         with torch.cuda.graph(graph, pool=MEM_POOL):
-            logits = self.model.forward(
+            logits, speculative_logits = self.model.forward(
                 input_ids=input_ids, inference_params=inference_params
            )
         torch.cuda.synchronize()
@@ -514,6 +516,7 @@ class Mamba(Model):
             "inference_params": inference_params,
             "graph": graph,
             "logits": logits,
+            "speculative_logits": speculative_logits,
         }
         self.cuda_graphs[batch_size] = graph_dict
@@ -556,9 +559,14 @@ class Mamba(Model):
         inference_params.ssm_states.copy_(
             cuda_graph["inference_params"].ssm_states[:, :bs]
         )

         # Slice output to the correct shape
-        return cuda_graph["logits"][:bs]
+        speculative_logits = (
+            cuda_graph["speculative_logits"][:bs]
+            if cuda_graph["speculative_logits"] is not None
+            else None
+        )
+        logits = cuda_graph["logits"][:bs]
+        return logits, speculative_logits

     def generate_token(self, batch) -> Tuple[List[Any], Optional[Any], Tuple[int, int]]:
         start = time.time_ns()
@@ -589,7 +597,9 @@ class Mamba(Model):
         batch.inference_params = inference_params

         # Forward pass
-        logits = self.forward(input_ids, inference_params=batch.inference_params)
+        logits, speculative_logits = self.forward(
+            input_ids, inference_params=batch.inference_params
+        )

         # batch.inference_params = new_inference_params
         # Results
@@ -43,6 +43,7 @@ class MPTSharded(CausalLM):
         model_id: str,
         revision: Optional[str] = None,
         quantize: Optional[str] = None,
+        use_medusa: Optional[str] = None,
         dtype: Optional[torch.dtype] = None,
         trust_remote_code: bool = False,
     ):
@@ -75,6 +76,7 @@ class MPTSharded(CausalLM):
             config = json.load(f)
         config = PretrainedConfig(**config)
         config.quantize = quantize
+        config.use_medusa = use_medusa

         torch.distributed.barrier(group=self.process_group)
@@ -22,6 +22,7 @@ class OPTSharded(CausalLM):
         model_id: str,
         revision: Optional[str] = None,
         quantize: Optional[str] = None,
+        use_medusa: Optional[str] = None,
         dtype: Optional[torch.dtype] = None,
         trust_remote_code: bool = False,
     ):
@@ -47,6 +48,7 @@ class OPTSharded(CausalLM):
             trust_remote_code=trust_remote_code,
         )
         config.quantize = quantize
+        config.use_medusa = use_medusa
         tokenizer.pad_token_id = config.pad_token_id

         torch.distributed.barrier(group=self.process_group)
@@ -22,6 +22,7 @@ class Phi(CausalLM):
         model_id: str,
         revision: Optional[str] = None,
         quantize: Optional[str] = None,
+        use_medusa: Optional[str] = None,
         dtype: Optional[torch.dtype] = None,
         trust_remote_code: bool = False,
     ):
@@ -52,6 +53,7 @@ class Phi(CausalLM):
         tokenizer.pad_token = tokenizer.eos_token

         config.quantize = quantize
+        config.use_medusa = use_medusa
         torch.distributed.barrier(group=self.process_group)
         filenames = weight_files(model_id, revision=revision, extension=".safetensors")
         weights = Weights(filenames, device, dtype, process_group=self.process_group)
@@ -19,6 +19,7 @@ class SantaCoder(CausalLM):
         model_id: str,
         revision: Optional[str] = None,
         quantize: Optional[str] = None,
+        use_medusa: Optional[str] = None,
         dtype: Optional[torch.dtype] = None,
         trust_remote_code: bool = False,
     ):
@@ -532,6 +532,7 @@ class Seq2SeqLM(Model):
         model_id: str,
         revision: Optional[str] = None,
         quantize: Optional[str] = None,
+        use_medusa: Optional[str] = None,
         dtype: Optional[torch.dtype] = None,
         trust_remote_code: bool = False,
     ):
@@ -596,6 +597,7 @@ class Seq2SeqLM(Model):
         past_key_values: Optional = None,
     ) -> Tuple[
         torch.Tensor,
+        Optional[torch.Tensor],
         torch.Tensor,
         List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]],
     ]:
@@ -609,8 +611,15 @@ class Seq2SeqLM(Model):
             past_key_values=past_key_values,
             use_cache=True,
         )
+        if isinstance(outputs, tuple):
+            # Our custom models
+            outputs, speculative_logits = outputs
+        else:
+            # Generic transformers models
+            speculative_logits = None
         return (
             outputs.logits,
+            speculative_logits,
             outputs.encoder_last_hidden_state,
             outputs.past_key_values,
         )
@@ -635,7 +644,7 @@ class Seq2SeqLM(Model):
         else:
             encoder_last_hidden_state = None

-        logits, encoder_last_hidden_state, past = self.forward(
+        logits, speculative_logits, encoder_last_hidden_state, past = self.forward(
             batch.input_ids,
             batch.attention_mask,
             batch.decoder_input_ids,
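`Seq2SeqLM` serves both this repo's patched models, which now return an `(outputs, speculative_logits)` pair, and stock `transformers` models, which still return a plain `ModelOutput`; hence the `isinstance(outputs, tuple)` branch above. A toy illustration of that normalization (stand-in types, not the real classes):

```python
from dataclasses import dataclass
from typing import Optional, Tuple

import torch


@dataclass
class OutputToy:
    # Stand-in for a transformers ModelOutput (which is not a tuple)
    logits: torch.Tensor


def unwrap(outputs) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
    if isinstance(outputs, tuple):
        outputs, speculative_logits = outputs  # our custom models
    else:
        speculative_logits = None  # generic transformers models
    return outputs.logits, speculative_logits


logits = torch.zeros(1, 3)
a = unwrap(OutputToy(logits))
b = unwrap((OutputToy(logits), None))
assert torch.equal(a[0], b[0]) and a[1] is None and b[1] is None
```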
@@ -25,6 +25,7 @@ class T5Sharded(Seq2SeqLM):
         model_id: str,
         revision: Optional[str] = None,
         quantize: Optional[str] = None,
+        use_medusa: Optional[str] = None,
         dtype: Optional[torch.dtype] = None,
         trust_remote_code: bool = False,
     ):
@@ -42,6 +43,7 @@ class T5Sharded(Seq2SeqLM):
             trust_remote_code=trust_remote_code,
         )
         config.quantize = quantize
+        config.use_medusa = use_medusa

         tokenizer = AutoTokenizer.from_pretrained(
             model_id,
List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]],
]:
# Model Forward
outputs = self.model.forward(
outputs, speculative_logits = self.model.forward(
input_ids=input_ids,
attention_mask=attention_mask,
decoder_input_ids=decoder_input_ids,
......@@ -106,6 +108,7 @@ class T5Sharded(Seq2SeqLM):
return (
outputs.logits,
speculative_logits,
outputs.encoder_last_hidden_state,
outputs.past_key_values,
)