Merge branch 'v0.6.2-dev' of ssh://10.6.10.68:10022/dcutoolkit/deeplearing/vllm into v0.6.2-dev

Commit f6ce3afa (merge; related commits 78800ecf and 1a313afb, presumably its parents) authored Dec 05, 2024 by zhuwenwen
3 changed files
--- a/vllm/model_executor/models/medusa.py
+++ b/vllm/model_executor/models/medusa.py
@@ -244,7 +244,7 @@ class Medusa(nn.Module):
                                    default_weight_loader)
            weight_loader(param, loaded_weight)
-            if self.use_llama_nn and "lm_head" in name:
+            if self.use_llama_nn and os.environ['LM_NN'] == '1' and "lm_head" in name:
                _weight = torch.zeros_like(param.data)
                ori_shape =_weight.shape

--- a/vllm/model_executor/models/mlp_speculator.py
+++ b/vllm/model_executor/models/mlp_speculator.py
@@ -201,7 +201,7 @@ class MLPSpeculator(nn.Module):
                                        default_weight_loader)
                weight_loader(param, loaded_weight)
-                if self.use_llama_nn and "head" in name:
+                if self.use_llama_nn and os.environ['LM_NN'] == '1' and "head" in name:
                    _weight = torch.zeros_like(param.data)
                    ori_shape =_weight.shape

--- a/vllm/spec_decode/spec_decode_worker.py
+++ b/vllm/spec_decode/spec_decode_worker.py
@@ -531,6 +531,9 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
        not called, meaning that the kv-cache in proposer for requests is not
        updated, so they cannot enable spec decode in the rest decoding.
        """
+        if self.tree_style_spec_decoding and self.kvcache_slot_to_be_moved is not None:
+            execute_model_req.kvcache_slot_to_be_moved = self.kvcache_slot_to_be_moved
+            self.kvcache_slot_to_be_moved = None
        sampler_output = self.scorer_worker.execute_model(execute_model_req)
        assert len(sampler_output) == 1
@@ -734,7 +737,7 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
        # Get probabilities according to proposal method.
        proposal_probs = proposals.proposal_probs if proposals.proposal_probs is not None else None
-        if non_spec_indices:
+        if proposal_probs is not None and non_spec_indices:
            proposal_probs = proposal_probs[spec_indices]
        # Get proposed tokens.
@@ -744,7 +747,7 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
        # Get tree buffers.
        cart_candidates = proposals.cart_candidates if proposals.cart_candidates is not None else None
-        if non_spec_indices:
+        if cart_candidates is not None and non_spec_indices:
            cart_candidates = cart_candidates[spec_indices] 
        # Sampler arguments