Update default neuron config for speculation (#18274)

Signed-off-by: Elaine Zhao <elaineyz@amazon.com>
Co-authored-by: Shashwat Srijan <sssrijan@amazon.com>
Co-authored-by: Aakash Shetty <sheaak@amazon.com>

Update default neuron config for speculation (#18274)
Signed-off-by: Elaine Zhao <elaineyz@amazon.com>
Co-authored-by: Shashwat Srijan <sssrijan@amazon.com>
Co-authored-by: Aakash Shetty <sheaak@amazon.com>
ebed81fb · aws-elaineyz · GitHub · e2d7d312 · ebed81fb
Unverified commit ebed81fb, authored May 22, 2025 by aws-elaineyz; committed by GitHub on May 22, 2025.
Hide whitespace changes
Inline Side-by-side

Showing with 3 additions and 1 deletion

vllm/model_executor/model_loader/neuronx_distributed.py vllm/model_executor/model_loader/neuronx_distributed.py +3 -1

No files found.
--- a/vllm/model_executor/model_loader/neuronx_distributed.py
+++ b/vllm/model_executor/model_loader/neuronx_distributed.py
@@ -502,7 +502,7 @@ def _get_default_neuron_config(model_config: ModelConfig,
        max_context_length=scheduler_config.max_model_len,
        seq_len=scheduler_config.max_model_len,
        enable_bucketing=True,
-        is_continuous_batching=(batch_size > 1),
+        is_continuous_batching=True,
        quantized=False,
        torch_dtype=TORCH_DTYPE_TO_NEURON_AMP[model_config.dtype],
        padding_side="right",
@@ -520,6 +520,7 @@ def _get_default_speculation_config(model_config: ModelConfig,
    args."""
    neuron_config = dict(
        tp_degree=parallel_config.tensor_parallel_size,
+        ctx_batch_size=1,
        batch_size=scheduler_config.max_num_seqs,
        max_context_length=scheduler_config.max_model_len,
        seq_len=scheduler_config.max_model_len,
@@ -527,6 +528,7 @@ def _get_default_speculation_config(model_config: ModelConfig,
        trace_tokengen_model=False,
        enable_fused_speculation=True,
        enable_bucketing=True,
+        is_continuous_batching=True,
        quantized=False,
        torch_dtype=TORCH_DTYPE_TO_NEURON_AMP[model_config.dtype],
        on_device_sampling_config=dict(