[Bugfix] Fix 2 Node and Spec Decode tests (#13341)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>

[Bugfix] Fix 2 Node and Spec Decode tests (#13341)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
5d2965b7 · Cyrus Leung · GitHub · a0231b7c · 5d2965b7 · 5d2965b7
Unverified commit 5d2965b7, authored Feb 16, 2025 by Cyrus Leung; committed by GitHub on Feb 16, 2025.
Hide whitespace changes
Inline Side-by-side

Showing 2 changed files with 17 additions and 9 deletions

tests/distributed/test_pipeline_parallel.py +5 -5

vllm/spec_decode/ngram_worker.py +12 -4

No files found.
--- a/tests/distributed/test_pipeline_parallel.py
+++ b/tests/distributed/test_pipeline_parallel.py
@@ -275,11 +275,11 @@ def _compare_tp(
    if load_format == "dummy":
        # Avoid OOM
        text_overrides = {
-            "num_layers": 1,
-            "num_hidden_layers": 1,
-            "num_experts": 2,
-            "num_experts_per_tok": 2,
-            "num_local_experts": 2,
+            "num_hidden_layers": 4,
+            "hidden_size": 512,
+            "intermediate_size": 800,
+            "num_attention_heads": 4,
+            "num_key_value_heads": 1,
        }

        if is_multimodal:

--- a/vllm/spec_decode/ngram_worker.py
+++ b/vllm/spec_decode/ngram_worker.py
@@ -6,6 +6,7 @@ from typing import List, Optional, Set, Tuple
 import torch
 import torch.nn as nn

+from vllm.config import VllmConfig
 from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.sequence import ExecuteModelRequest
 from vllm.spec_decode.interfaces import SpeculativeProposals
@@ -25,11 +26,18 @@ class NGramWorker(NonLLMProposerWorkerBase):
    which don't rely on LLM model to give proposals.
    """

-    def __init__(self, *args, **kwargs):
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        local_rank: int,
+        device_type: str = "cuda",
+        **kwargs,
+    ):
+        super().__init__(vllm_config)
+
        # Get local_rank/vocab_size from kwargs attribute
-        self.local_rank = kwargs["local_rank"]
-        self.vocab_size = kwargs["vllm_config"].model_config.get_vocab_size()
-        self.device_type = kwargs.get("device_type", "cuda")
+        self.local_rank = local_rank
+        self.device_type = device_type

        # Lazy initialization list.
        self._proposer: Top1Proposer