Fix more broken speculative decode tests (#17450)

Signed-off-by: Huy Do <huydhn@gmail.com>

Fix more broken speculative decode tests (#17450)
Signed-off-by: Huy Do <huydhn@gmail.com>
b74d888c · Huy Do · GitHub · 2007d4d5 · b74d888c · b74d888c
Unverified commit b74d888c, authored May 01, 2025 by Huy Do; committed by GitHub on May 01, 2025.
4 changed files
--- a/tests/spec_decode/e2e/test_medusa_correctness.py
+++ b/tests/spec_decode/e2e/test_medusa_correctness.py
@@ -205,7 +205,7 @@ def test_medusa_e2e_greedy_correctness_cuda_graph(
 @pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
-        "block_size": 8,
+        "block_size": 16,
        # 2 for small prompt, 256//8 for generated.
        "num_gpu_blocks_override": 2 + 256 // 8,
        "max_model_len": (2 + 256 // 8) * 8,

--- a/tests/spec_decode/e2e/test_mlp_correctness.py
+++ b/tests/spec_decode/e2e/test_mlp_correctness.py
@@ -267,7 +267,7 @@ def test_mlp_e2e_seeded_correctness(vllm_runner, common_llm_kwargs,
 @pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
-        "block_size": 8,
+        "block_size": 16,
        # 2 for small prompt, 256//8 for generated.
        "num_gpu_blocks_override": 2 + 256 // 8,
        "max_model_len": (2 + 256 // 8) * 8,
@@ -321,7 +321,7 @@ def test_mlp_e2e_greedy_correctness_with_preemption(
 @pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
-        "block_size": 8,
+        "block_size": 16,
        # 2 for small prompt, 256//8 for generated.
        "num_gpu_blocks_override": 2 + 256 // 8,
        "max_model_len": (2 + 256 // 8) * 8,

--- a/tests/spec_decode/e2e/test_ngram_correctness.py
+++ b/tests/spec_decode/e2e/test_ngram_correctness.py
@@ -152,7 +152,7 @@ def test_ngram_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
 @pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
-        "block_size": 8,
+        "block_size": 16,
        # 2 for small prompt, 256//8 for generated.
        "num_gpu_blocks_override": 2 + 256 // 8,
        "max_model_len": (2 + 256 // 8) * 8,

--- a/vllm/spec_decode/multi_step_worker.py
+++ b/vllm/spec_decode/multi_step_worker.py
@@ -51,9 +51,14 @@ class MultiStepWorker(ProposerWorkerBase, DelegateWorkerBase):
    def set_include_gpu_probs_tensor(self) -> None:
        # Need include_gpu_probs_tensor for MultiStepWorker
        self.model_runner.sampler.include_gpu_probs_tensor = True
+        if hasattr(self.model_runner.model, "sampler"):
+            (self.model_runner.model.sampler.include_gpu_probs_tensor) = True

    def set_should_modify_greedy_probs_inplace(self) -> None:
        self.model_runner.sampler.should_modify_greedy_probs_inplace = True
+        if hasattr(self.model_runner.model, "sampler"):
+            (self.model_runner.model.sampler.should_modify_greedy_probs_inplace
+             ) = True

    @torch.inference_mode()
    def sampler_output(