[V0 deprecation] Deprecate V0 Neuron backend (#21159)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>

[V0 deprecation] Deprecate V0 Neuron backend (#21159)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
4172235a · Woosuk Kwon · GitHub · 848562bd · 848562bd · 848562bd
Unverified commit 4172235a, authored Sep 06, 2025 by Woosuk Kwon; committed by GitHub Sep 06, 2025
20 changed files
--- a/tests/neuron/1_core/test_rotary_embedding.py
+++ b/tests/neuron/1_core/test_rotary_embedding.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""
-Tests for miscellaneous utilities
-"""
-
-import pytest
-import torch
-
-from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
-from vllm.platforms import current_platform
-
-
-@pytest.mark.parametrize(
-    "max_position,is_neox_style,rotary_dim,head_size,seq_len,use_key", [
-        (16, False, 32, 32, 1024, True),
-        (16, False, 32, 128, 1024, True),
-        (16, True, 32, 32, 1024, True),
-        (16, True, 32, 128, 1024, True),
-        (16, False, 32, 128, 1024, False),
-        (16, True, 32, 128, 1024, False),
-    ])
-def test_rotary_embedding_opcheck(max_position, is_neox_style, rotary_dim,
-                                  head_size, seq_len, use_key):
-    import torch_xla.core.xla_model as xm
-
-    device = xm.xla_device()
-    current_platform.seed_everything(0)
-    torch.set_default_device("cpu")
-
-    batch_size = 1
-    base = 10000
-    num_heads = 8
-
-    rot = RotaryEmbedding(head_size, rotary_dim, max_position, base,
-                          is_neox_style, torch.float32)
-
-    positions = torch.randint(0,
-                              max_position, (batch_size, seq_len),
-                              device="cpu")
-    query = torch.randn(batch_size,
-                        seq_len,
-                        num_heads * head_size,
-                        dtype=torch.float32,
-                        device="cpu")
-    key = torch.randn_like(query) if use_key else None
-    assert positions.is_cpu, \
-        "reference input tensor is expected to be CPU tensor."
-    ref_query, ref_key = rot.to(device="cpu").forward_native(
-        positions, query, key)
-    out_query, out_key = rot.to(device=device).forward_neuron(
-        positions.to(device=device), query.to(device=device),
-        key.to(device=device) if key is not None else None)
-    if use_key:
-        assert out_query.is_xla and out_key.is_xla, \
-            "output tensor is expected to be XLA tensor"
-        torch.testing.assert_close(out_key.cpu(),
-                                   ref_key,
-                                   atol=1e-2,
-                                   rtol=1e-2)
-    else:
-        assert out_key is None, "expected returned key to be None"
-        assert out_query.is_xla, \
-            "output tensor is expected to be XLA tensor"
-    torch.testing.assert_close(out_query.cpu(),
-                               ref_query,
-                               atol=1e-2,
-                               rtol=1e-2)
--- a/tests/neuron/2_core/test_comm_ops.py
+++ b/tests/neuron/2_core/test_comm_ops.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import functools
-from typing import Callable
-from unittest.mock import patch
-
-import pytest
-import torch
-import torch_xla.distributed.xla_multiprocessing as xmp
-from typing_extensions import ParamSpec
-
-from vllm.distributed.communication_op import (
-    tensor_model_parallel_all_gather, tensor_model_parallel_all_reduce)
-from vllm.distributed.parallel_state import (ensure_model_parallel_initialized,
-                                             init_distributed_environment)
-from vllm.utils import get_distributed_init_method, get_open_port
-
-_P = ParamSpec("_P")
-
-
-def reinitialize_neuron_runtime(f: Callable[_P, None]) -> Callable[_P, None]:
-    """Decorator to reinitialize the Neuron Runtime before executing a test.
-    This is necessary for distributed tests which need to reallocate Neuron
-    Cores to separate subprocesses.
-    """
-
-    @functools.wraps(f)
-    def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> None:
-        runtime = torch.classes.neuron.Runtime()
-        runtime.initialize()
-        runtime.unsafe_close()
-
-        f(*args, **kwargs)
-        runtime.initialize()
-
-    return wrapper
-
-
-def all_gather_test_worker(index, tp_degree, distributed_init_method):
-    init_distributed_environment(tp_degree,
-                                 index,
-                                 distributed_init_method,
-                                 index,
-                                 backend="xla")
-    ensure_model_parallel_initialized(tp_degree, 1)
-
-    num_dimensions = 3
-    tensor_size = list(range(2, num_dimensions + 2))
-    total_size = 1
-    for s in tensor_size:
-        total_size *= s
-
-    all_gather_dimension = -1
-    all_tensors = [
-        torch.arange(total_size, dtype=torch.float32,
-                     device="xla").reshape(tensor_size) * (r + 1)
-        for r in range(tp_degree)
-    ]
-    expected = torch.cat(all_tensors, dim=all_gather_dimension)
-    t = all_tensors[index % tp_degree]
-    t = tensor_model_parallel_all_gather(t, all_gather_dimension)
-    torch.testing.assert_close(t, expected)
-
-
-def all_reduce_test_worker(index, tp_degree, distributed_init_method):
-    init_distributed_environment(tp_degree,
-                                 index,
-                                 distributed_init_method,
-                                 index,
-                                 backend="xla")
-    ensure_model_parallel_initialized(tp_degree, 1)
-
-    num_elements = 8
-    all_tensors = [
-        torch.arange(num_elements, dtype=torch.float32, device="xla") * (r + 1)
-        for r in range(tp_degree)
-    ]
-    expected = torch.sum(torch.stack(all_tensors, dim=0), dim=0)
-    t = all_tensors[index % tp_degree]
-    t = tensor_model_parallel_all_reduce(t)
-    torch.testing.assert_close(t, expected)
-
-
-@pytest.mark.parametrize("tp_size", [2])
-@pytest.mark.parametrize("test_target",
-                         [all_reduce_test_worker, all_gather_test_worker])
-@reinitialize_neuron_runtime
-def test_neuron_multi_process_tensor_parallel(monkeypatch, tp_size,
-                                              test_target):
-
-    with patch('torch_xla._XLAC._xla_runtime_is_initialized',
-               return_value=False):
-        distributed_init_method = get_distributed_init_method(
-            "127.0.0.1", get_open_port())
-
-        monkeypatch.setenv("VLLM_USE_V1", "1")
-        monkeypatch.setenv("NEURONCORE_NUM_DEVICES", str(tp_size))
-        monkeypatch.setenv("NEURON_PJRT_PROCESSES_NUM_DEVICES",
-                           ','.join(['1' for _ in range(tp_size)]))
-
-        xmp.spawn(test_target, args=(tp_size, distributed_init_method))
--- a/tests/neuron/2_core/test_eagle.py
+++ b/tests/neuron/2_core/test_eagle.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import json
-import os
-import shutil
-import tempfile
-
-import torch
-from huggingface_hub import snapshot_download
-from safetensors import safe_open
-
-from vllm import LLM, SamplingParams
-
-
-def patch_eagle_draft_with_lm_head(target_model_id: str,
-                                   draft_model_id: str) -> str:
-    # In NxDI, draft model checkpoint must include lm_head weights from target
-    # model. For more details see https://awsdocs-neuron.readthedocs-hosted.com
-    # /en/latest/libraries/nxd-inference/developer_guides/feature-guide.html
-    # #eagle-checkpoint-compatibility
-    final_draft_dir = "/tmp/patched_eagle_draft"
-
-    with tempfile.TemporaryDirectory() as tmp_dir:
-        target_dir = snapshot_download(repo_id=target_model_id,
-                                       local_dir=os.path.join(
-                                           tmp_dir, "target"))
-        draft_dir = snapshot_download(repo_id=draft_model_id,
-                                      local_dir=os.path.join(tmp_dir, "draft"))
-
-        lm_head_key = "lm_head.weight"
-        index_path = os.path.join(target_dir, "model.safetensors.index.json")
-        with open(index_path) as f:
-            index = json.load(f)
-        shard_name = index["weight_map"][lm_head_key]
-        target_safetensor_path = os.path.join(target_dir, shard_name)
-
-        with safe_open(target_safetensor_path, framework="pt") as f:
-            target_lm_head = f.get_tensor(lm_head_key)
-
-        draft_path = os.path.join(draft_dir, "pytorch_model.bin")
-        draft_state_dict = torch.load(draft_path, map_location="cpu")
-        draft_state_dict[lm_head_key] = target_lm_head.to(torch.float16)
-        torch.save(draft_state_dict, draft_path)
-
-        shutil.copytree(draft_dir, final_draft_dir, dirs_exist_ok=True)
-
-    return final_draft_dir
-
-
-def test_eagle():
-    patched_draft_path = patch_eagle_draft_with_lm_head(
-        target_model_id="meta-llama/Llama-2-7b-hf",
-        draft_model_id="yuhuili/EAGLE-llama2-chat-7B")
-    llm = LLM(
-        model="meta-llama/Llama-2-7b-hf",
-        speculative_config={
-            "model": patched_draft_path,
-            "num_speculative_tokens": 5,
-            "max_model_len": 128
-        },
-        max_num_seqs=1,
-        max_model_len=128,
-        tensor_parallel_size=2,
-        override_neuron_config={
-            "enable_eagle_speculation": True,
-            "enable_fused_speculation": True,
-            "fused_qkv": True
-        },
-    )
-    prompts = [
-        "The president of the United States is",
-    ]
-    outputs = llm.generate(prompts, SamplingParams(top_k=1))
-    expected_output = " the head of state and head of government of " \
-    "the United States. The president direct"
-
-    for output in outputs:
-        generated_text = output.outputs[0].text
-        print(f"Prompt: {output.prompt!r}, Generated text: {generated_text!r}")
-        assert (expected_output == generated_text)
-
-    print("Neuron Eagle speculation test passed.")
--- a/tests/neuron/2_core/test_mistral.py
+++ b/tests/neuron/2_core/test_mistral.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from vllm import LLM, SamplingParams
-
-
-def test_mistral():
-    llm = LLM(model="mistralai/Mistral-7B-v0.1",
-              tensor_parallel_size=2,
-              max_num_seqs=4,
-              max_model_len=128,
-              override_neuron_config={
-                  "sequence_parallel_enabled": False,
-                  "skip_warmup": True
-              })
-
-    # Send more prompts than the compiled batch size (4) and request
-    # varying generation lengths to test accuracy related to Neuron
-    # specific sequence id sorting.
-    prompts = [
-        "The president of the United States is",
-        "The capital of France is",
-        "What is Annapurna labs?",
-        "I believe the meaning of life is",
-        "Tell me a story about a brave knight",
-        "Hello, my name is Llama",
-    ]
-
-    sampling_params = [
-        SamplingParams(top_k=1, max_tokens=10),
-        SamplingParams(top_k=1, max_tokens=20),
-        SamplingParams(top_k=1, max_tokens=30),
-        SamplingParams(top_k=1, max_tokens=40),
-        SamplingParams(top_k=1, max_tokens=50),
-        SamplingParams(top_k=1, max_tokens=60)
-    ]
-
-    outputs = llm.generate(prompts, sampling_params)
-
-    expected_outputs = [
-        " the most powerful person in the world. He is",
-        " a city of many faces. It is a city of history, culture, art, "
-        "fashion, and",
-        "\n\nAnnapurna Labs is a semiconductor company that was founded "
-        "in 2013 by Amazon. The company is",
-        " to be happy.\n\nI believe that happiness is a choice.\n\nI "
-        "believe that happiness is a state of mind.\n\nI believe that "
-        "happiness is a journey.\n\nI believe",
-        " who rescued a princess from a dragon.\n\nTell me a story about"
-        " a princess who rescued herself from a dragon.\n\nTell me a "
-        "story about a princess who rescued herself from a dragon and "
-        "then rescued a knight from",
-        " and I am a 10 year old male. I am a very friendly and "
-        "affectionate boy who loves to be around people. I am a very "
-        "active boy who loves to play and run around. I am a very smart "
-        "boy who loves to learn new things. I am a very loyal boy"
-    ]
-
-    for expected_output, output in zip(expected_outputs, outputs):
-        generated_text = output.outputs[0].text
-        print(f"Prompt: {output.prompt!r}, Generated text: {generated_text!r}")
-        assert (expected_output == generated_text)
-
-    print("Neuron Mistral test passed.")
--- a/tests/neuron/2_core/test_multi_lora.py
+++ b/tests/neuron/2_core/test_multi_lora.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from huggingface_hub import snapshot_download
-
-from vllm import LLM, SamplingParams
-from vllm.lora.request import LoRARequest
-
-
-def test_llama_single_lora():
-    sql_lora_files = snapshot_download(
-        repo_id="yard1/llama-2-7b-sql-lora-test")
-    llm = LLM(model="meta-llama/Llama-2-7b-hf",
-              tensor_parallel_size=2,
-              max_num_seqs=4,
-              max_model_len=512,
-              override_neuron_config={
-                  "sequence_parallel_enabled": False,
-                  "skip_warmup": True,
-                  "lora_modules": [{
-                      "name": "lora_id_1",
-                      "path": sql_lora_files
-                  }]
-              },
-              enable_lora=True,
-              max_loras=1,
-              max_lora_rank=256,
-              device="neuron")
-    """For multi-lora requests using NxDI as the backend, only the lora_name 
-    needs to be specified. The lora_id and lora_path are supplied at the LLM 
-    class/server initialization, after which the paths are handled by NxDI"""
-    lora_req_1 = LoRARequest("lora_id_1", 0, " ")
-    prompts = [
-        "The president of the United States is",
-        "The capital of France is",
-    ]
-    outputs = llm.generate(prompts,
-                           SamplingParams(top_k=1),
-                           lora_request=[lora_req_1, lora_req_1])
-
-    expected_outputs = [
-        " the head of state and head of government of the United States. "
-        "The president direct",
-        " a city of contrasts. The city is home to the Eiffel Tower"
-    ]
-
-    for expected_output, output in zip(expected_outputs, outputs):
-        generated_text = output.outputs[0].text
-        assert (expected_output == generated_text)
-
-
-def test_llama_multiple_lora():
-    sql_lora_files = snapshot_download(
-        repo_id="yard1/llama-2-7b-sql-lora-test")
-    llm = LLM(model="meta-llama/Llama-2-7b-hf",
-              tensor_parallel_size=2,
-              max_num_seqs=4,
-              max_model_len=512,
-              override_neuron_config={
-                  "sequence_parallel_enabled":
-                  False,
-                  "skip_warmup":
-                  True,
-                  "lora_modules": [{
-                      "name": "lora_id_1",
-                      "path": sql_lora_files
-                  }, {
-                      "name": "lora_id_2",
-                      "path": sql_lora_files
-                  }]
-              },
-              enable_lora=True,
-              max_loras=2,
-              max_lora_rank=256,
-              device="neuron")
-    """For multi-lora requests using NxDI as the backend, only the lora_name 
-    needs to be specified. The lora_id and lora_path are supplied at the LLM 
-    class/server initialization, after which the paths are handled by NxDI"""
-    lora_req_1 = LoRARequest("lora_id_1", 0, " ")
-    lora_req_2 = LoRARequest("lora_id_2", 1, " ")
-    prompts = [
-        "The president of the United States is",
-        "The capital of France is",
-    ]
-    outputs = llm.generate(prompts,
-                           SamplingParams(top_k=1),
-                           lora_request=[lora_req_1, lora_req_2])
-
-    expected_outputs = [
-        " the head of state and head of government of the United States. "
-        "The president direct",
-        " a city of contrasts. The city is home to the Eiffel Tower"
-    ]
-
-    for expected_output, output in zip(expected_outputs, outputs):
-        generated_text = output.outputs[0].text
-        assert (expected_output == generated_text)
--- a/vllm/attention/ops/nki_flash_attn.py
+++ b/vllm/attention/ops/nki_flash_attn.py
--- a/vllm/collect_env.py
+++ b/vllm/collect_env.py
@@ -54,7 +54,6 @@ SystemEnv = namedtuple(
        'is_xnnpack_available',
        'cpu_info',
        'rocm_version',  # vllm specific field
-        'neuron_sdk_version',  # vllm specific field
        'vllm_version',  # vllm specific field
        'vllm_build_flags',  # vllm specific field
        'gpu_topo',  # vllm specific field
@@ -275,15 +274,6 @@ def get_rocm_version(run_lambda):
                                     r'HIP version: (\S+)')


-def get_neuron_sdk_version(run_lambda):
-    # Adapted from your install script
-    try:
-        result = run_lambda(["neuron-ls"])
-        return result if result[0] == 0 else 'N/A'
-    except Exception:
-        return 'N/A'
-
-
 def get_vllm_version():
    from vllm import __version__, __version_tuple__

@@ -306,10 +296,9 @@ def get_vllm_version():

 def summarize_vllm_build_flags():
    # This could be a static method if the flags are constant, or dynamic if you need to check environment variables, etc.
-    return 'CUDA Archs: {}; ROCm: {}; Neuron: {}'.format(
+    return 'CUDA Archs: {}; ROCm: {}'.format(
        os.environ.get('TORCH_CUDA_ARCH_LIST', 'Not Set'),
        'Enabled' if os.environ.get('ROCM_HOME') else 'Disabled',
-        'Enabled' if os.environ.get('NEURON_CORES') else 'Disabled',
    )


@@ -601,7 +590,6 @@ def get_env_info():
    conda_packages = get_conda_packages(run_lambda)

    rocm_version = get_rocm_version(run_lambda)
-    neuron_sdk_version = get_neuron_sdk_version(run_lambda)
    vllm_version = get_vllm_version()
    vllm_build_flags = summarize_vllm_build_flags()
    gpu_topo = get_gpu_topo(run_lambda)
@@ -635,7 +623,6 @@ def get_env_info():
        is_xnnpack_available=is_xnnpack_available(),
        cpu_info=get_cpu_info(run_lambda),
        rocm_version=rocm_version,
-        neuron_sdk_version=neuron_sdk_version,
        vllm_version=vllm_version,
        vllm_build_flags=vllm_build_flags,
        gpu_topo=gpu_topo,
@@ -702,7 +689,6 @@ env_info_fmt += """
         vLLM Info
 ==============================
 ROCM Version                 : {rocm_version}
-Neuron SDK Version           : {neuron_sdk_version}
 vLLM Version                 : {vllm_version}
 vLLM Build Flags:
  {vllm_build_flags}

--- a/vllm/config/__init__.py
+++ b/vllm/config/__init__.py
@@ -461,11 +461,6 @@ class ModelConfig:
        DP (which is controlled by `--data-parallel-size`).
        This is only supported on a per-model basis and falls back to
        `"weights"` if the encoder does not support DP."""
-    override_neuron_config: dict[str, Any] = field(default_factory=dict)
-    """Initialize non-default neuron config or override default neuron config
-    that are specific to Neuron devices, this argument will be used to
-    configure the neuron config that can not be gathered from the vllm
-    arguments. e.g. `{"cast_logits_dtype": "bfloat16"}`."""
    pooler_config: Optional["PoolerConfig"] = field(init=False)
    """Pooler config which controls the behaviour of output pooling in pooling
    models."""
@@ -785,10 +780,6 @@ class ModelConfig:
        if not self.skip_tokenizer_init:
            self._verify_tokenizer_mode()

-        if (not current_platform.is_neuron() and self.override_neuron_config):
-            raise ValueError(
-                "`override_neuron_config` is only supported on Neuron.")
-
        # Avoid running try_verify_and_update_config multiple times
        self.config_updated = False

@@ -1696,13 +1687,7 @@ class ModelConfig:
        """
        For Mllama, VLLM overrides HF's is_encoder_decoder flag and sets it to
        True to enable cross-attention
-        Neuron needs all multimodal data to be in the decoder and does not
-        need to explicitly enable cross-attention
        """
-        if (current_platform.is_neuron()
-                and self.hf_config.model_type == "mllama"):
-            return False
-
        return is_encoder_decoder(self.hf_config)

    @property
@@ -1871,7 +1856,7 @@ class LoadConfig:
            self.ignore_patterns = ["original/**/*"]


-Device = Literal["auto", "cuda", "neuron", "cpu", "tpu", "xpu"]
+Device = Literal["auto", "cuda", "cpu", "tpu", "xpu"]


 @config
@@ -1927,9 +1912,7 @@ class DeviceConfig:
                self.device_type = self.device.type

        # Some device types require processing inputs on CPU
-        if self.device_type in ["neuron"]:
-            self.device = torch.device("cpu")
-        elif self.device_type in ["tpu"]:
+        if self.device_type in ["tpu"]:
            self.device = None
        else:
            # Set device with device type
@@ -3941,7 +3924,6 @@ class VllmConfig:
            f"skip_tokenizer_init={self.model_config.skip_tokenizer_init}, "
            f"tokenizer_mode={self.model_config.tokenizer_mode}, "
            f"revision={self.model_config.revision}, "
-            f"override_neuron_config={self.model_config.override_neuron_config}, "  # noqa
            f"tokenizer_revision={self.model_config.tokenizer_revision}, "
            f"trust_remote_code={self.model_config.trust_remote_code}, "
            f"dtype={self.model_config.dtype}, "

--- a/vllm/config/cache.py
+++ b/vllm/config/cache.py
@@ -33,9 +33,8 @@ class CacheConfig:
    """Configuration for the KV cache."""

    block_size: SkipValidation[BlockSize] = None  # type: ignore
-    """Size of a contiguous cache block in number of tokens. This is ignored on
-    neuron devices and set to `--max-model-len`. On CUDA devices, only block
-    sizes up to 32 are supported. On HPU devices, block size defaults to 128.
+    """Size of a contiguous cache block in number of tokens. On CUDA devices,
+    only block sizes up to 32 are supported.

    This config has no static default. If left unspecified by the user, it will
    be set in `Platform.check_and_update_config()` based on the current

--- a/vllm/config/parallel.py
+++ b/vllm/config/parallel.py
@@ -377,10 +377,7 @@ class ParallelConfig:
            from vllm.executor import ray_utils
            backend: DistributedExecutorBackend = "mp"
            ray_found = ray_utils.ray_is_available()
-            if current_platform.is_neuron():
-                # neuron uses single process to control multiple devices
-                backend = "uni"
-            elif current_platform.is_tpu() and envs.VLLM_XLA_USE_SPMD:
+            if current_platform.is_tpu() and envs.VLLM_XLA_USE_SPMD:
                backend = "uni"
            elif (current_platform.is_cuda()
                  and cuda_device_count_stateless() < self.world_size):

--- a/vllm/distributed/device_communicators/neuron_communicator.py
+++ b/vllm/distributed/device_communicators/neuron_communicator.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import torch
-
-from vllm.distributed.device_communicators.base_device_communicator import (
-    DeviceCommunicatorBase)
-from vllm.platforms import current_platform
-
-if current_platform.is_neuron():
-    import torch_xla.core.xla_model as xm
-
-
-class NeuronCommunicator(DeviceCommunicatorBase):
-
-    def all_reduce(self, x: torch.Tensor) -> torch.Tensor:
-        return xm.all_reduce(xm.REDUCE_SUM, x)
-
-    def all_gather(self, x: torch.Tensor, dim: int = -1) -> torch.Tensor:
-        assert dim == -1, "Neuron only supports dim=-1 for all-gather."
-        return xm.all_gather(x, dim=dim)
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -419,8 +419,6 @@ class EngineArgs:
    scheduling_policy: SchedulerPolicy = SchedulerConfig.policy
    scheduler_cls: Union[str, Type[object]] = SchedulerConfig.scheduler_cls

-    override_neuron_config: dict[str, Any] = \
-        get_field(ModelConfig, "override_neuron_config")
    override_pooler_config: Optional[Union[dict, PoolerConfig]] = \
        ModelConfig.override_pooler_config
    compilation_config: CompilationConfig = \
@@ -561,8 +559,6 @@ class EngineArgs:
                                 help=model_kwargs["hf_token"]["help"])
        model_group.add_argument("--hf-overrides",
                                 **model_kwargs["hf_overrides"])
-        model_group.add_argument("--override-neuron-config",
-                                 **model_kwargs["override_neuron_config"])
        model_group.add_argument("--override-pooler-config",
                                 **model_kwargs["override_pooler_config"])
        model_group.add_argument("--logits-processor-pattern",
@@ -992,7 +988,6 @@ class EngineArgs:
            mm_processor_kwargs=self.mm_processor_kwargs,
            mm_processor_cache_gb=self.mm_processor_cache_gb,
            mm_encoder_tp_mode=self.mm_encoder_tp_mode,
-            override_neuron_config=self.override_neuron_config,
            override_pooler_config=self.override_pooler_config,
            logits_processor_pattern=self.logits_processor_pattern,
            generation_config=self.generation_config,

--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -236,7 +236,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
    # ================== Installation Time Env Vars ==================

    # Target device of vLLM, supporting [cuda (by default),
-    # rocm, neuron, cpu]
+    # rocm, cpu]
    "VLLM_TARGET_DEVICE":
    lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda").lower(),


--- a/vllm/model_executor/custom_op.py
+++ b/vllm/model_executor/custom_op.py
@@ -73,11 +73,6 @@ class CustomOp(nn.Module):
        # NOTE(woosuk): This is a placeholder for future extensions.
        return self.forward_native(*args, **kwargs)

-    def forward_neuron(self, *args, **kwargs):
-        # By default, we assume that Neuron ops are compatible with the
-        # PyTorch-native implementation.
-        return self.forward_native(*args, **kwargs)
-
    def forward_oot(self, *args, **kwargs):
        # By default, we assume that OOT ops are compatible with the
        # PyTorch-native implementation.
@@ -105,8 +100,6 @@ class CustomOp(nn.Module):
            return self.forward_tpu
        elif current_platform.is_xpu():
            return self.forward_xpu
-        elif current_platform.is_neuron():
-            return self.forward_neuron
        elif current_platform.is_out_of_tree():
            return self.forward_oot
        else:

--- a/vllm/model_executor/layers/activation.py
+++ b/vllm/model_executor/layers/activation.py
@@ -95,13 +95,6 @@ class SiluAndMul(CustomOp):
        self.op(out, x)
        return out

-    def forward_neuron(self, x: torch.Tensor) -> torch.Tensor:
-        d = x.shape[-1] // 2
-        x_reshaped = x.view(-1, x.shape[-1])
-        s = x_reshaped[:, :d] * F.sigmoid(x_reshaped[:, :d])
-        result = s * x_reshaped[:, d:]
-        return result.view(*x.shape[:-1], d)
-

 @CustomOp.register("mul_and_silu")
 class MulAndSilu(CustomOp):

--- a/vllm/model_executor/layers/quantization/__init__.py
+++ b/vllm/model_executor/layers/quantization/__init__.py
@@ -26,7 +26,6 @@ QuantizationMethods = Literal[
    "bitsandbytes",
    "hqq",
    "experts_int8",
-    "neuron_quant",
    "ipex",
    "quark",
    "moe_wna16",
@@ -108,7 +107,6 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]:
    from .modelopt import ModelOptFp8Config, ModelOptNvFp4Config
    from .moe_wna16 import MoeWNA16Config
    from .mxfp4 import Mxfp4Config
-    from .neuron_quant import NeuronQuantConfig
    from .petit import PetitNvFp4Config
    from .ptpc_fp8 import PTPCFp8Config
    from .rtn import RTNConfig
@@ -135,7 +133,6 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]:
        "ptpc_fp8": PTPCFp8Config,
        "hqq": HQQMarlinConfig,
        "experts_int8": ExpertsInt8Config,
-        "neuron_quant": NeuronQuantConfig,
        "ipex": IPEXConfig,
        "quark": QuarkConfig,
        "moe_wna16": MoeWNA16Config,

--- a/vllm/model_executor/layers/quantization/neuron_quant.py
+++ b/vllm/model_executor/layers/quantization/neuron_quant.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import os
-from importlib.util import find_spec
-from typing import Any, Optional
-
-from torch.nn import Module
-
-from vllm.model_executor.layers.quantization import QuantizationMethods
-from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig)
-
-SUPPORTED_QUANT_DTYPE_LIST = ['s8', 'f8e4m3fn']
-
-
-class AlwaysSupportedDtypes(list):
-
-    def __contains__(self, item):
-        return True
-
-
-class NeuronQuantConfig(QuantizationConfig):
-    """Int8 Quantization Config class for Neuron Backend."""
-
-    def __init__(
-        self,
-        dequant_dtype: str = "f16",
-        quantize_method: str = "vector_dynamic",
-    ) -> None:
-        super().__init__()
-        self.quant_dtype = os.getenv("NEURON_QUANT_DTYPE", "s8")
-        if self.quant_dtype not in SUPPORTED_QUANT_DTYPE_LIST:
-            raise ValueError(
-                f"Neuron quantization datatype {self.quant_dtype} is not valid,"
-                f" the quantization datatype should match one of the below "
-                f"types {SUPPORTED_QUANT_DTYPE_LIST}")
-        self.dequant_dtype = dequant_dtype
-        self.quantize_method = quantize_method
-
-    def get_name(self) -> QuantizationMethods:
-        return "neuron_quant"
-
-    def get_supported_act_dtypes(self) -> list[str]:
-        # Neuron implements custom handling logic for quantization support
-        return AlwaysSupportedDtypes()
-
-    @classmethod
-    def get_min_capability(cls) -> int:
-        raise NotImplementedError(
-            "This function should not be called with Neuron Backend")
-
-    @staticmethod
-    def get_config_filenames() -> list[str]:
-        return []
-
-    @classmethod
-    def from_config(cls, config: dict[str, Any]) -> "NeuronQuantConfig":
-        quantize_method = cls.get_from_keys(config, ["quantize_method"])
-        dequant_dtype = cls.get_from_keys(config, ["dequant_dtype"])
-        return cls(dequant_dtype=dequant_dtype,
-                   quantize_method=quantize_method)
-
-    def get_quant_method(self, layer: Module, prefix: str) -> Optional[Any]:
-        if find_spec("transformers_neuronx") is not None:
-            return self.get_quantization_config()
-        else:
-            raise NotImplementedError(
-                "Neuron Quantization is only supported through"
-                " transformers_neuronx.")
-
-    def get_quantization_config(self):
-        from transformers_neuronx.config import QuantizationConfig
-        return QuantizationConfig(quant_dtype=self.quant_dtype,
-                                  dequant_dtype=self.dequant_dtype,
-                                  quantize_method=self.quantize_method)
--- a/vllm/model_executor/layers/rotary_embedding/base.py
+++ b/vllm/model_executor/layers/rotary_embedding/base.py
@@ -7,7 +7,7 @@ import torch

 from vllm.model_executor.custom_op import CustomOp

-from .common import apply_rotary_emb_dispatch, apply_rotary_emb_torch
+from .common import apply_rotary_emb_torch


 @CustomOp.register("rotary_embedding")
@@ -149,87 +149,6 @@ class RotaryEmbedding(CustomOp):
                                     self.cos_sin_cache, self.is_neox_style)
        return query, key

-    def forward_neuron(
-        self,
-        positions: torch.Tensor,
-        query: torch.Tensor,
-        key: Optional[torch.Tensor] = None,
-        offsets: Optional[torch.Tensor] = None,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
-
-        def _apply_rotary_emb_neuron(
-            x: torch.Tensor,
-            cos: torch.Tensor,
-            sin: torch.Tensor,
-            is_neox_style: bool,
-        ) -> torch.Tensor:
-            cos = cos.unsqueeze(-2).to(x.dtype)
-            sin = sin.unsqueeze(-2).to(x.dtype)
-            if is_neox_style:
-                x1, x2 = torch.chunk(x, 2, dim=-1)
-            else:
-                # x1 = x[..., ::2]
-
-                # x2 = x[..., 1::2]
-                d = x.shape[-1] // 2
-                x_reshaped = x.view(-1, x.shape[-1])
-                x1 = x_reshaped[:, ::2].view(*x.shape[:-1], d)
-                x2 = x_reshaped[:, 1::2].view(*x.shape[:-1], d)
-            o1 = x1 * cos - x2 * sin
-            o2 = x2 * cos + x1 * sin
-            if is_neox_style:
-                return torch.cat((o1, o2), dim=-1)
-            else:
-                return torch.stack((o1, o2), dim=-1).flatten(-2)
-
-        if offsets is not None:
-            positions = positions + offsets
-
-        self.cos_sin_cache = self.cos_sin_cache.to(query.device,
-                                                   dtype=query.dtype)
-
-        positions = positions.flatten()
-        num_tokens = positions.shape[0]
-        cos_sin = self.cos_sin_cache.index_select(0, positions)
-        cos, sin = cos_sin.chunk(2, dim=-1)
-
-        query_shape = query.shape
-        query = query.view(num_tokens, -1, self.head_size)
-        if key is not None:
-            key_shape = key.shape
-            key = key.view(num_tokens, -1, self.head_size)
-
-        if self.rotary_dim == self.head_size:
-            query = apply_rotary_emb_dispatch(query, cos, sin,
-                                              self.is_neox_style)
-            query = query.reshape(query_shape)
-            if key is not None:
-                key = apply_rotary_emb_dispatch(key, cos, sin,
-                                                self.is_neox_style)
-                key = key.reshape(key_shape)
-        else:
-            head_size = query.shape[-1]
-            query_reshaped = query.view(-1, head_size)
-            query_pass = query_reshaped[:, self.rotary_dim:].view(
-                *query.shape[:-1], head_size - self.rotary_dim)
-            query_rot = query_reshaped[:, :self.rotary_dim].view(
-                *query.shape[:-1], self.rotary_dim)
-            query_rot = _apply_rotary_emb_neuron(query_rot, cos, sin,
-                                                 self.is_neox_style)
-            query = torch.cat((query_rot, query_pass),
-                              dim=-1).reshape(query_shape)
-
-            if key is not None:
-                key_reshaped = key.view(-1, head_size)
-                key_pass = key_reshaped[:, self.rotary_dim:].view(
-                    *key.shape[:-1], head_size - self.rotary_dim)
-                key_rot = key_reshaped[:, :self.rotary_dim].view(
-                    *key.shape[:-1], self.rotary_dim)
-                key_rot = _apply_rotary_emb_neuron(key_rot, cos, sin,
-                                                   self.is_neox_style)
-                key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape)
-        return query, key
-
    def extra_repr(self) -> str:
        s = f"head_size={self.head_size}, rotary_dim={self.rotary_dim}"
        s += f", max_position_embeddings={self.max_position_embeddings}"

--- a/vllm/model_executor/model_loader/neuron.py
+++ b/vllm/model_executor/model_loader/neuron.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Utilities for selecting and loading Neuron models in transformers-neuronx
-framework."""
-import ast
-import copy
-import importlib
-import os
-from typing import Optional
-
-import torch
-import torch.nn as nn
-from transformers import PretrainedConfig
-
-from vllm.config import (ModelConfig, ParallelConfig, SchedulerConfig,
-                         SpeculativeConfig)
-from vllm.logprobs import Logprob
-from vllm.model_executor.layers.logits_processor import LogitsProcessor
-from vllm.model_executor.layers.quantization import get_quantization_config
-from vllm.model_executor.layers.sampler import Sampler, SamplerOutput
-from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.sequence import CompletionSequenceGroupOutput, SequenceOutput
-
-TORCH_DTYPE_TO_NEURON_AMP = {
-    "auto": "f32",
-    "half": "f16",
-    "float16": "f16",
-    "bfloat16": "bf16",
-    "float": "f32",
-    "float32": "f32",
-    torch.float16: "f16",
-    torch.bfloat16: "bf16",
-    torch.float32: "f32",
-}
-
-# Models supported by Neuron.
-_NEURON_SUPPORTED_MODELS: dict[str, tuple[str, str, str]] = {
-    "LlamaForCausalLM": ("transformers_neuronx.llama.model",
-                         "LlamaForSampling", "LlamaForCausalLM"),
-    "MistralForCausalLM": ("transformers_neuronx.mistral.model",
-                           "MistralForSampling", "MistralForCausalLM")
-}
-
-
-class NeuronCausalLM(nn.Module):
-
-    def __init__(self,
-                 config: PretrainedConfig,
-                 on_device_sampling_disabled: bool = False) -> None:
-        super().__init__()
-        self.config = config
-        self.logits_processor = LogitsProcessor(config.vocab_size,
-                                                logits_as_input=True)
-
-        self.on_device_sampling_disabled = on_device_sampling_disabled
-        if self.on_device_sampling_disabled:
-            # Use default sampler
-            self.sampler = Sampler()
-
-        # Lazy initialized
-        self.model: nn.Module
-
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        positions: torch.Tensor,
-        input_block_ids: torch.Tensor,
-    ) -> torch.Tensor:
-        logits = self.model(input_ids,
-                            cache_ids=positions,
-                            start_ids=input_block_ids)
-        return logits
-
-    def compute_logits(self, hidden_states: torch.Tensor,
-                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
-        logits = self.logits_processor(None, hidden_states, sampling_metadata)
-        return logits
-
-    def sample(
-        self,
-        logits: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[SamplerOutput]:
-
-        if self.on_device_sampling_disabled:
-            next_tokens = self.sampler(logits, sampling_metadata)
-            return next_tokens
-
-        # On-device sampling outputs the token ids directly.
-        sampled_token_ids = logits.flatten()
-        next_tokens = []
-        sample_idx = 0
-        for seq_group in sampling_metadata.seq_groups:
-            samples = []
-            for seq_id in seq_group.seq_ids:
-                token_id = sampled_token_ids[sample_idx].item()
-                samples.append(
-                    SequenceOutput(parent_seq_id=seq_id,
-                                   output_token=token_id,
-                                   logprobs={token_id: Logprob(token_id)}))
-                sample_idx += 1
-            next_tokens.append(
-                CompletionSequenceGroupOutput(samples=samples,
-                                              prompt_logprobs=None))
-
-        return SamplerOutput(outputs=next_tokens)
-
-    def load_weights(self, model_name_or_path: str, **kwargs):
-        arch = _get_model_architecture(self.config)
-        neuronx_module_path, neuronx_model_cls_name, hf_model_cls_name = (
-            _NEURON_SUPPORTED_MODELS[arch])
-        neuronx_module = importlib.import_module(neuronx_module_path)
-        neuronx_model_cls = getattr(neuronx_module, neuronx_model_cls_name)
-
-        self.model = neuronx_model_cls.from_pretrained(model_name_or_path,
-                                                       **kwargs)
-        self.model.to_neuron()
-
-
-class NeuronSpeculationCausalLM(nn.Module):
-    """A Neuron-optimized causal language model with speculative decoding."""
-
-    SPECULATION_TERMINATION_ID = -1
-
-    def __init__(self, speculation_model) -> None:
-        super().__init__()
-        self.model = speculation_model
-
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        positions: torch.Tensor,
-        input_block_ids: torch.Tensor,
-    ) -> torch.Tensor:
-        tokens, counts = self.model.speculative_iteration(
-            input_ids, positions, input_block_ids)
-
-        # Mark the end of accepted speculative tokens for each sequence with the
-        # speculation termination id.
-        batch_size, steps = tokens.shape
-        mask = torch.arange(steps).expand(batch_size, -1) >= counts
-        tokens[mask] = self.SPECULATION_TERMINATION_ID
-
-        return tokens
-
-    def sample(
-        self,
-        logits: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
-    ) -> Optional[list[SamplerOutput]]:
-        batch_size, num_steps = logits.shape
-        seq_ids = [
-            seq_id for sg in sampling_metadata.seq_groups
-            for seq_id in sg.seq_ids
-        ]
-        # Organize input tensors by step instead of by sequence.
-        accepted_token_ids_by_step = logits.transpose(0, 1)
-        accepted_token_ids_by_step = accepted_token_ids_by_step.tolist()
-
-        sampler_output_list = []
-        for step_index in range(num_steps):
-            if all(token_id == self.SPECULATION_TERMINATION_ID
-                   for token_id in accepted_token_ids_by_step[step_index]):
-                break
-            step_output_token_ids = []
-            for sequence_index in range(batch_size):
-                token_id = accepted_token_ids_by_step[step_index][
-                    sequence_index]
-                step_output_token_ids.append(
-                    CompletionSequenceGroupOutput(samples=[
-                        SequenceOutput(parent_seq_id=seq_ids[sequence_index],
-                                       output_token=token_id,
-                                       logprobs={token_id: Logprob(token_id)})
-                    ],
-                                                  prompt_logprobs=None))
-            sampler_output_list.append(
-                SamplerOutput(outputs=step_output_token_ids))
-        return sampler_output_list
-
-
-def _get_model_architecture(config: PretrainedConfig) -> str:
-    architectures = getattr(config, "architectures", [])
-    for arch in architectures:
-        if arch in _NEURON_SUPPORTED_MODELS:
-            return arch
-    raise ValueError(
-        f"Model architectures {architectures} are not supported on Neuron "
-        f"for now. Supported architectures: "
-        f"{list(_NEURON_SUPPORTED_MODELS.keys())}")
-
-
-def _get_buckets(env: str, default_value: list[int]) -> list[int]:
-    env_value = os.getenv(env)
-    if env_value is None:
-        return default_value
-    buckets_remove_empty = filter(
-        lambda x: x is not None and len(x.strip()) > 0, env_value.split(","))
-    buckets_int = map(int, buckets_remove_empty)
-    buckets_list = list(buckets_int)
-    return buckets_list
-
-
-def _get_default_neuron_config(model_config: ModelConfig,
-                               parallel_config: ParallelConfig,
-                               scheduler_config: SchedulerConfig):
-    """Generate a neuron config based on vllm config args."""
-    from transformers_neuronx.config import ContinuousBatchingConfig
-    from transformers_neuronx.constants import LAYOUT_BSH
-
-    continuous_batching_config = ContinuousBatchingConfig(
-        batch_size_for_shared_caches=scheduler_config.max_num_seqs)
-    quant_config = dict(
-        dequant_dtype=TORCH_DTYPE_TO_NEURON_AMP[model_config.dtype],
-        quantize_method="vector_dynamic")
-    neuron_quantization_config_builder = lambda quant: get_quantization_config(
-        quant).from_config(quant_config).get_quant_method(None, "")
-    # TODO: Add Paged attention config to the default neuron arguments.
-    default_neuron_args = dict(
-        collectives_layout=LAYOUT_BSH,
-        attention_layout=LAYOUT_BSH,
-        fuse_qkv=True,
-        quant=neuron_quantization_config_builder(model_config.quantization)
-        if model_config.quantization else None,
-        continuous_batching=continuous_batching_config,
-        weight_tiling=bool(model_config.quantization),
-        on_device_generation=_get_neuron_on_device_generation_config(
-            model_config))
-    return default_neuron_args
-
-
-def _get_default_neuron_config_for_speculation(
-        model_config: ModelConfig, parallel_config: ParallelConfig,
-        scheduler_config: SchedulerConfig):
-    """Generate a neuron config for speculative decoding based on
-    vllm config args."""
-    from transformers_neuronx.config import ContinuousBatchingConfig
-    from transformers_neuronx.constants import LAYOUT_BSH
-
-    continuous_batching_config = ContinuousBatchingConfig(
-        batch_size_for_shared_caches=scheduler_config.max_num_seqs)
-
-    default_neuron_args = dict(collectives_layout=LAYOUT_BSH,
-                               attention_layout=LAYOUT_BSH,
-                               fuse_qkv=True,
-                               on_device_embedding=True,
-                               continuous_batching=continuous_batching_config,
-                               on_device_generation=copy.deepcopy(
-                                   model_config.neuron_sampling_params))
-    return default_neuron_args
-
-
-def _get_neuron_on_device_generation_config(model_config: ModelConfig):
-    if not _is_neuron_on_device_sampling_disabled(model_config):
-        return copy.deepcopy(model_config.neuron_sampling_params)
-    return None
-
-
-def _is_neuron_on_device_sampling_disabled(model_config: ModelConfig) -> bool:
-    return not getattr(model_config, "neuron_sampling_params", None)
-
-
-def _get_neuron_config_after_override(default_neuron_config,
-                                      overridden_neuron_config):
-    from transformers_neuronx.config import (ContinuousBatchingConfig,
-                                             GenerationConfig,
-                                             KVCacheQuantizationConfig,
-                                             NeuronConfig, QuantizationConfig,
-                                             SparseAttnConfig)
-
-    sparse_attn = overridden_neuron_config.pop("sparse_attn", {})
-    if sparse_attn:
-        overridden_neuron_config["sparse_attn"] = SparseAttnConfig(
-            **sparse_attn)
-
-    kv_cache_quant = overridden_neuron_config.pop("kv_cache_quant", {})
-    if kv_cache_quant:
-        overridden_neuron_config["kv_cache_quant"] = KVCacheQuantizationConfig(
-            **kv_cache_quant)
-
-    continuous_batching = overridden_neuron_config.pop("continuous_batching",
-                                                       {})
-    if continuous_batching:
-        overridden_neuron_config[
-            "continuous_batching"] = ContinuousBatchingConfig(
-                **continuous_batching)
-
-    quant = overridden_neuron_config.pop("quant", {})
-    if quant:
-        overridden_neuron_config["quant"] = QuantizationConfig(**quant)
-
-    on_device_generation = overridden_neuron_config.pop(
-        "on_device_generation", {})
-    if on_device_generation:
-        overridden_neuron_config["on_device_generation"] = GenerationConfig(
-            **on_device_generation)
-    default_neuron_config.update(overridden_neuron_config)
-    return NeuronConfig(**default_neuron_config)
-
-
-def get_neuron_model(model_config: ModelConfig,
-                     parallel_config: ParallelConfig,
-                     scheduler_config: SchedulerConfig) -> nn.Module:
-    """Initializes a neuron-optimized model for inference."""
-    # Create a model instance.
-    model = NeuronCausalLM(
-        model_config.hf_config,
-        _is_neuron_on_device_sampling_disabled(model_config))
-
-    default_neuron_config_args = _get_default_neuron_config(
-        model_config, parallel_config, scheduler_config)
-
-    neuron_config = _get_neuron_config_after_override(
-        default_neuron_config_args, model_config.override_neuron_config)
-
-    context_length_estimates = _get_buckets("NEURON_CONTEXT_LENGTH_BUCKETS",
-                                            [scheduler_config.max_model_len])
-    n_positions = _get_buckets("NEURON_TOKEN_GEN_BUCKETS",
-                               [scheduler_config.max_model_len])
-
-    model.load_weights(model_config.model,
-                       tp_degree=parallel_config.tensor_parallel_size,
-                       amp=TORCH_DTYPE_TO_NEURON_AMP[model_config.dtype],
-                       neuron_config=neuron_config,
-                       context_length_estimate=context_length_estimates,
-                       n_positions=n_positions,
-                       batch_size=scheduler_config.max_num_seqs)
-
-    return model.eval()
-
-
-def get_neuron_speculation_model(model_config: ModelConfig,
-                                 parallel_config: ParallelConfig,
-                                 scheduler_config: SchedulerConfig,
-                                 speculation_config: SpeculativeConfig):
-    """Initializes a neuron-optimized speculation model for inference.
-
-    This method is only applicable for speculation with a standalone draft model
-    """
-    from transformers_neuronx.fused_speculation import FusedSpeculativeDecoder
-
-    # For Eagle SD, we need to pass in additional parameters in neuron config.
-    is_eagle = getattr(speculation_config.draft_model_config.hf_config,
-                       "is_eagle", False)
-
-    # Create target model instance.
-    target_model = NeuronCausalLM(model_config.hf_config)
-
-    default_neuron_config_args = _get_default_neuron_config_for_speculation(
-        model_config, parallel_config, scheduler_config)
-    if is_eagle:
-        default_neuron_config_args['is_eagle_target'] = True
-
-    neuron_config = _get_neuron_config_after_override(
-        default_neuron_config_args, model_config.override_neuron_config)
-
-    context_length_estimates = _get_buckets("NEURON_CONTEXT_LENGTH_BUCKETS",
-                                            [scheduler_config.max_model_len])
-    n_positions = _get_buckets("NEURON_TOKEN_GEN_BUCKETS",
-                               [scheduler_config.max_model_len])
-
-    target_model.load_weights(
-        model_config.model,
-        tp_degree=parallel_config.tensor_parallel_size,
-        amp=TORCH_DTYPE_TO_NEURON_AMP[model_config.dtype],
-        neuron_config=neuron_config,
-        context_length_estimate=context_length_estimates,
-        n_positions=n_positions,
-        batch_size=scheduler_config.max_num_seqs)
-
-    target_model.eval()
-
-    # Create draft model instance.
-    draft_model = NeuronCausalLM(
-        speculation_config.draft_model_config.hf_config)
-
-    default_draft_neuron_config_args = (
-        _get_default_neuron_config_for_speculation(
-            speculation_config.draft_model_config, parallel_config,
-            scheduler_config))
-    if is_eagle:
-        default_draft_neuron_config_args['is_eagle_draft'] = True
-        default_draft_neuron_config_args['has_pre_attention_norm'] = False
-
-    draft_neuron_config = _get_neuron_config_after_override(
-        default_draft_neuron_config_args,
-        speculation_config.draft_model_config.override_neuron_config)
-
-    draft_model.load_weights(speculation_config.draft_model_config.model,
-                             tp_degree=speculation_config.
-                             draft_parallel_config.tensor_parallel_size,
-                             amp=TORCH_DTYPE_TO_NEURON_AMP[
-                                 speculation_config.draft_model_config.dtype],
-                             neuron_config=draft_neuron_config,
-                             context_length_estimate=context_length_estimates,
-                             n_positions=n_positions,
-                             batch_size=scheduler_config.max_num_seqs)
-
-    draft_model.eval()
-
-    num_speculative_tokens = speculation_config.num_speculative_tokens
-    # Create speculation model instance.
-    speculation_model = FusedSpeculativeDecoder(draft_model.model,
-                                                target_model.model,
-                                                num_speculative_tokens)
-    speculation_model.to_neuron()
-
-    return NeuronSpeculationCausalLM(speculation_model)
-
-
-def get_neuron_eagle_speculation_model(model_config: ModelConfig,
-                                       parallel_config: ParallelConfig,
-                                       scheduler_config: SchedulerConfig,
-                                       speculation_config: SpeculativeConfig):
-    """Initializes a neuron-optimized EAGLE speculation model for inference."""
-    from transformers_neuronx.eagle_speculation import EagleSpeculativeDecoder
-
-    # Create target model instance.
-    target_model = NeuronCausalLM(model_config.hf_config)
-
-    default_neuron_config_args = _get_default_neuron_config_for_speculation(
-        model_config, parallel_config, scheduler_config)
-    default_neuron_config_args['is_eagle_target'] = True
-    neuron_config = _get_neuron_config_after_override(
-        default_neuron_config_args, model_config.override_neuron_config)
-
-    context_length_estimates = _get_buckets("NEURON_CONTEXT_LENGTH_BUCKETS",
-                                            [scheduler_config.max_model_len])
-    n_positions = _get_buckets("NEURON_TOKEN_GEN_BUCKETS",
-                               [scheduler_config.max_model_len])
-
-    target_model.load_weights(
-        model_config.model,
-        tp_degree=parallel_config.tensor_parallel_size,
-        amp=TORCH_DTYPE_TO_NEURON_AMP[model_config.dtype],
-        neuron_config=neuron_config,
-        context_length_estimate=context_length_estimates,
-        n_positions=n_positions,
-        batch_size=scheduler_config.max_num_seqs)
-
-    target_model.eval()
-
-    # Create draft model instance.
-    draft_model = NeuronCausalLM(
-        speculation_config.draft_model_config.hf_config)
-
-    default_draft_neuron_config_args = (
-        _get_default_neuron_config_for_speculation(
-            speculation_config.draft_model_config, parallel_config,
-            scheduler_config))
-    default_draft_neuron_config_args['is_eagle_draft'] = True
-    default_draft_neuron_config_args['has_pre_attention_norm'] = False
-    draft_neuron_config = _get_neuron_config_after_override(
-        default_draft_neuron_config_args,
-        speculation_config.draft_model_config.override_neuron_config)
-
-    draft_model.load_weights(speculation_config.draft_model_config.model,
-                             tp_degree=speculation_config.
-                             draft_parallel_config.tensor_parallel_size,
-                             amp=TORCH_DTYPE_TO_NEURON_AMP[
-                                 speculation_config.draft_model_config.dtype],
-                             neuron_config=draft_neuron_config,
-                             context_length_estimate=context_length_estimates,
-                             n_positions=n_positions,
-                             batch_size=scheduler_config.max_num_seqs)
-
-    draft_model.eval()
-
-    token_tree: dict[int, list[int]] = ast.literal_eval(
-        speculation_config.speculative_token_tree)
-
-    speculation_model = EagleSpeculativeDecoder(draft_model.model,
-                                                target_model.model,
-                                                token_tree=token_tree)
-    speculation_model.to_neuron()
-
-    return NeuronSpeculationCausalLM(speculation_model)
--- a/vllm/model_executor/model_loader/neuronx_distributed.py
+++ b/vllm/model_executor/model_loader/neuronx_distributed.py