[V0 Deprecation] Remove MQLLMEngine (#25019)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai>

[V0 Deprecation] Remove MQLLMEngine (#25019)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai>
5801e497 · Woosuk Kwon · GitHub · 58d4c705 · 5801e497 · 5801e497
Unverified Commit 5801e497 authored Sep 16, 2025 by Woosuk Kwon Committed by GitHub Sep 16, 2025
15 changed files
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -46,7 +46,6 @@ steps:
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/
-  - tests/mq_llm_engine
  - tests/async_engine
  - tests/test_inputs.py
  - tests/test_outputs.py
@@ -57,7 +56,6 @@ steps:
  - tests/transformers_utils
  commands:
  - python3 standalone_tests/lazy_imports.py
-  - pytest -v -s mq_llm_engine # MQLLMEngine
  - pytest -v -s async_engine # AsyncLLMEngine
  - pytest -v -s test_inputs.py
  - pytest -v -s test_outputs.py

--- a/tests/entrypoints/openai/test_lora_resolvers.py
+++ b/tests/entrypoints/openai/test_lora_resolvers.py
@@ -10,7 +10,6 @@ from unittest.mock import MagicMock
 import pytest

 from vllm.config.multimodal import MultiModalConfig
-from vllm.engine.multiprocessing.client import MQLLMEngineClient
 from vllm.entrypoints.openai.protocol import CompletionRequest, ErrorResponse
 from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
 from vllm.entrypoints.openai.serving_models import (BaseModelPath,
@@ -18,6 +17,7 @@ from vllm.entrypoints.openai.serving_models import (BaseModelPath,
 from vllm.lora.request import LoRARequest
 from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry
 from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.v1.engine.async_llm import AsyncLLM

 MODEL_NAME = "openai-community/gpt2"
 BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)]
@@ -82,7 +82,7 @@ def register_mock_resolver():
 @pytest.fixture
 def mock_serving_setup():
    """Provides a mocked engine and serving completion instance."""
-    mock_engine = MagicMock(spec=MQLLMEngineClient)
+    mock_engine = MagicMock(spec=AsyncLLM)
    mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
    mock_engine.errored = False


--- a/tests/entrypoints/openai/test_serving_chat.py
+++ b/tests/entrypoints/openai/test_serving_chat.py
@@ -13,12 +13,12 @@ import pytest
 import pytest_asyncio

 from vllm.config.multimodal import MultiModalConfig
-from vllm.engine.multiprocessing.client import MQLLMEngineClient
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.entrypoints.openai.serving_models import (BaseModelPath,
                                                    OpenAIServingModels)
 from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.v1.engine.async_llm import AsyncLLM

 from ...utils import RemoteOpenAIServer

@@ -276,7 +276,7 @@ def test_async_serving_chat_init():

 @pytest.mark.asyncio
 async def test_serving_chat_returns_correct_model_name():
-    mock_engine = MagicMock(spec=MQLLMEngineClient)
+    mock_engine = MagicMock(spec=AsyncLLM)
    mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
    mock_engine.errored = False

@@ -312,7 +312,7 @@ async def test_serving_chat_returns_correct_model_name():

 @pytest.mark.asyncio
 async def test_serving_chat_should_set_correct_max_tokens():
-    mock_engine = MagicMock(spec=MQLLMEngineClient)
+    mock_engine = MagicMock(spec=AsyncLLM)
    mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
    mock_engine.errored = False

@@ -355,7 +355,7 @@ async def test_serving_chat_should_set_correct_max_tokens():
    }

    # Reinitialize the engine with new settings
-    mock_engine = MagicMock(spec=MQLLMEngineClient)
+    mock_engine = MagicMock(spec=AsyncLLM)
    mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
    mock_engine.errored = False

@@ -410,7 +410,7 @@ async def test_serving_chat_should_set_correct_max_tokens():
    }

    # Reinitialize the engine with new settings
-    mock_engine = MagicMock(spec=MQLLMEngineClient)
+    mock_engine = MagicMock(spec=AsyncLLM)
    mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
    mock_engine.errored = False

@@ -467,7 +467,7 @@ async def test_serving_chat_could_load_correct_generation_config():
        "repetition_penalty": 1.05
    }

-    mock_engine = MagicMock(spec=MQLLMEngineClient)
+    mock_engine = MagicMock(spec=AsyncLLM)
    mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
    mock_engine.errored = False

@@ -523,7 +523,7 @@ async def test_serving_chat_did_set_correct_cache_salt(model_type):
    mock_model_config = MockModelConfig()
    mock_model_config.hf_config.model_type = model_type

-    mock_engine = MagicMock(spec=MQLLMEngineClient)
+    mock_engine = MagicMock(spec=AsyncLLM)
    mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
    mock_engine.errored = False


--- a/tests/mq_llm_engine/__init__.py
+++ b/tests/mq_llm_engine/__init__.py
--- a/tests/mq_llm_engine/conftest.py
+++ b/tests/mq_llm_engine/conftest.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import pytest
-
-
-@pytest.fixture(scope="function", autouse=True)
-def use_v0_only(monkeypatch):
-    """
-    Since this module is V0 only, set VLLM_USE_V1=0 for
-    all tests in the module.
-    """
-    monkeypatch.setenv('VLLM_USE_V1', '0')
--- a/tests/mq_llm_engine/test_abort.py
+++ b/tests/mq_llm_engine/test_abort.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Test that aborting is handled properly."""
-
-import asyncio
-import tempfile
-import uuid
-
-import pytest
-
-from tests.mq_llm_engine.utils import RemoteMQLLMEngine, generate
-from vllm.engine.arg_utils import AsyncEngineArgs
-
-MODEL = "google/gemma-1.1-2b-it"
-ENGINE_ARGS = AsyncEngineArgs(model=MODEL)
-RAISED_ERROR = KeyError
-RAISED_VALUE = "foo"
-EXPECTED_TOKENS = 250
-
-
-@pytest.fixture(scope="function")
-def tmp_socket():
-    with tempfile.TemporaryDirectory() as td:
-        yield f"ipc://{td}/{uuid.uuid4()}"
-
-
-@pytest.mark.asyncio
-async def test_abort(tmp_socket):
-    with RemoteMQLLMEngine(engine_args=ENGINE_ARGS,
-                           ipc_path=tmp_socket) as engine:
-
-        client = await engine.make_client()
-
-        request_id_to_be_aborted = "request-aborted"
-        request_ids_a = [f"request-a-{idx}" for idx in range(10)]
-        request_ids_b = [f"request-b-{idx}" for idx in range(10)]
-
-        # Requests started before one to be aborted.
-        tasks = []
-        for request_id in request_ids_a:
-            tasks.append(
-                asyncio.create_task(
-                    generate(client, request_id, EXPECTED_TOKENS)))
-
-        # Aborted.
-        task_aborted = asyncio.create_task(
-            generate(client, request_id_to_be_aborted, EXPECTED_TOKENS))
-
-        # Requests started after one to be aborted.
-        for request_id in request_ids_b:
-            tasks.append(
-                asyncio.create_task(
-                    generate(client, request_id, EXPECTED_TOKENS)))
-
-        # Actually abort.
-        await asyncio.sleep(0.5)
-        await client.abort(request_id_to_be_aborted)
-
-        # Confirm that we got all the EXPECTED tokens from the requests.
-        for task in tasks:
-            count, request_id = await task
-            assert count == EXPECTED_TOKENS, (
-                f"{request_id} generated only {count} tokens")
-
-        # Cancel task (this will hang indefinitely if not).
-        task_aborted.cancel()
-
-        # Shutdown.
-        client.close()
--- a/tests/mq_llm_engine/test_error_handling.py
+++ b/tests/mq_llm_engine/test_error_handling.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Test that various errors are handled properly."""
-
-import asyncio
-import tempfile
-import time
-import uuid
-from unittest.mock import Mock
-
-import pytest
-
-from tests.mq_llm_engine.utils import RemoteMQLLMEngine
-from vllm import SamplingParams
-from vllm.engine.arg_utils import AsyncEngineArgs
-from vllm.engine.llm_engine import LLMEngine
-from vllm.engine.multiprocessing import MQEngineDeadError
-from vllm.engine.multiprocessing.engine import MQLLMEngine
-from vllm.entrypoints.openai.api_server import build_async_engine_client
-from vllm.entrypoints.openai.cli_args import make_arg_parser
-from vllm.lora.request import LoRARequest
-from vllm.sequence import SequenceGroupMetadata
-from vllm.usage.usage_lib import UsageContext
-from vllm.utils import FlexibleArgumentParser
-
-MODEL = "google/gemma-1.1-2b-it"
-ENGINE_ARGS = AsyncEngineArgs(model=MODEL, enforce_eager=True)
-RAISED_ERROR = KeyError
-RAISED_VALUE = "foo"
-
-
-@pytest.fixture(scope="function")
-def tmp_socket():
-    with tempfile.TemporaryDirectory() as td:
-        yield f"ipc://{td}/{uuid.uuid4()}"
-
-
-def run_with_evil_forward(engine_args: AsyncEngineArgs, ipc_path: str):
-    # Make engine.
-    engine = MQLLMEngine.from_engine_args(
-        engine_args=engine_args,
-        usage_context=UsageContext.UNKNOWN_CONTEXT,
-        ipc_path=ipc_path)
-
-    # Raise error during first forward pass.
-    engine.engine.model_executor.execute_model = Mock(
-        side_effect=RAISED_ERROR(RAISED_VALUE))
-
-    # Run engine.
-    engine.start()
-
-
-@pytest.mark.asyncio
-async def test_evil_forward(tmp_socket):
-    with RemoteMQLLMEngine(engine_args=ENGINE_ARGS,
-                           ipc_path=tmp_socket,
-                           run_fn=run_with_evil_forward) as engine:
-
-        client = await engine.make_client()
-
-        # Server should be healthy after initial probe.
-        await asyncio.sleep(2.0)
-        await client.check_health()
-
-        # Throws an error that should get ENGINE_DEAD_ERROR.
-        with pytest.raises(MQEngineDeadError):
-            async for _ in client.generate(prompt="Hello my name is",
-                                           sampling_params=SamplingParams(),
-                                           request_id=str(uuid.uuid4())):
-                pass
-        assert client.errored
-
-        await asyncio.sleep(1.0)
-        with pytest.raises(RAISED_ERROR):
-            await client.check_health()
-        assert client.errored
-
-        # Shutdown.
-        client.close()
-
-
-def run_with_evil_model_executor_health(engine_args: AsyncEngineArgs,
-                                        ipc_path: str):
-    # Make engine.
-    engine = MQLLMEngine.from_engine_args(
-        engine_args=engine_args,
-        usage_context=UsageContext.UNKNOWN_CONTEXT,
-        ipc_path=ipc_path)
-
-    # Raise error during first forward pass.
-    engine.engine.model_executor.check_health = Mock(side_effect=RAISED_ERROR)
-
-    # Run engine.
-    engine.start()
-
-
-@pytest.mark.asyncio
-async def test_failed_health_check(tmp_socket):
-    with RemoteMQLLMEngine(
-            engine_args=ENGINE_ARGS,
-            ipc_path=tmp_socket,
-            run_fn=run_with_evil_model_executor_health) as engine:
-
-        client = await engine.make_client()
-        assert client.is_running
-
-        # Health probe should throw RAISED_ERROR.
-        await asyncio.sleep(15.)
-
-        with pytest.raises(RAISED_ERROR):
-            await client.check_health()
-        assert client.errored
-
-        # Generate call should throw ENGINE_DEAD_ERROR
-        with pytest.raises(MQEngineDeadError):
-            async for _ in client.generate(prompt="Hello my name is",
-                                           sampling_params=SamplingParams(),
-                                           request_id=str(uuid.uuid4())):
-                pass
-
-        client.close()
-
-
-def run_with_evil_abort(engine_args: AsyncEngineArgs, ipc_path: str):
-    # Make engine.
-    engine = MQLLMEngine.from_engine_args(
-        engine_args=engine_args,
-        usage_context=UsageContext.UNKNOWN_CONTEXT,
-        ipc_path=ipc_path)
-
-    # Raise error during abort call.
-    engine.engine.abort_request = Mock(side_effect=RAISED_ERROR)
-
-    # Run engine.
-    engine.start()
-
-
-@pytest.mark.asyncio
-async def test_failed_abort(tmp_socket):
-    with RemoteMQLLMEngine(engine_args=ENGINE_ARGS,
-                           ipc_path=tmp_socket,
-                           run_fn=run_with_evil_abort) as engine:
-
-        client = await engine.make_client()
-        assert client.is_running
-
-        # First check health should work.
-        await client.check_health()
-
-        # Trigger an abort on the client side.
-        # This request ID does not exist, and will cause the engine to error
-        await client.abort(request_id="foo")
-
-        # Future generation requests will now fail
-        # with reference to the original KeyError("foo")
-        with pytest.raises(MQEngineDeadError) as execinfo:
-            async for _ in client.generate(
-                    prompt="Hello my name is",
-                    sampling_params=SamplingParams(max_tokens=10),
-                    request_id=str(uuid.uuid4())):
-                pass
-        assert "KeyError" in repr(execinfo.value)
-        assert client.errored
-
-        # This should raise the original error.
-        with pytest.raises(RAISED_ERROR):
-            await client.check_health()
-
-        client.close()
-
-
-@pytest.mark.asyncio
-async def test_batch_error(tmp_socket):
-    with RemoteMQLLMEngine(engine_args=ENGINE_ARGS,
-                           ipc_path=tmp_socket,
-                           run_fn=run_with_evil_abort) as engine:
-
-        client = await engine.make_client()
-        assert client.is_running
-
-        # First check health should work.
-        await client.check_health()
-
-        # Batch of requests
-        async def do_generate(client):
-            # min_tokens=2048 to keep busy the engine busy
-            # to get enough time to get process a request
-            # that will crash the engine
-            params = SamplingParams(min_tokens=2048, max_tokens=2048)
-            async for _ in client.generate(prompt="Hello my name is",
-                                           sampling_params=params,
-                                           request_id=str(uuid.uuid4())):
-                pass
-
-        tasks = [asyncio.create_task(do_generate(client)) for _ in range(10)]
-
-        # This request will force a processing batch to raise
-        # an exception and next the engine get errored
-        await client.abort(request_id="foo")
-
-        # The batch of those request failed, then they
-        # should get the same exception as a MQEngineDeadError.
-        errors = await asyncio.gather(*tasks, return_exceptions=True)
-        for e in errors:
-            assert isinstance(e, MQEngineDeadError)
-            assert "KeyError" in repr(e)
-
-        client.close()
-
-
-@pytest.mark.asyncio
-async def test_bad_request(tmp_socket):
-    with RemoteMQLLMEngine(engine_args=ENGINE_ARGS,
-                           ipc_path=tmp_socket) as engine:
-
-        client = await engine.make_client()
-
-        # Invalid request should fail, but not crash the server.
-        with pytest.raises(ValueError):
-            async for _ in client.generate(prompt="Hello my name is",
-                                           sampling_params=SamplingParams(),
-                                           request_id="abcd-1",
-                                           lora_request=LoRARequest(
-                                               "invalid-lora", 1,
-                                               "invalid-path")):
-                pass
-
-        # This request should be okay.
-        async for _ in client.generate(prompt="Hello my name is",
-                                       sampling_params=SamplingParams(),
-                                       request_id="abcd-2"):
-            pass
-
-        # Shutdown.
-        client.close()
-
-
-@pytest.mark.asyncio
-async def test_mp_crash_detection(monkeypatch: pytest.MonkeyPatch):
-    with monkeypatch.context() as m:
-
-        parser = FlexibleArgumentParser(
-            description="vLLM's remote OpenAI server.")
-        parser = make_arg_parser(parser)
-        args = parser.parse_args([])
-
-        # When LLMEngine is loaded, it will crash.
-        def mock_init():
-            raise ValueError
-
-        m.setattr(LLMEngine, "__init__", mock_init)
-
-        start = time.perf_counter()
-        async with build_async_engine_client(args):
-            pass
-        end = time.perf_counter()
-
-        assert end - start < 100, (
-            "Expected vLLM to gracefully shutdown in <100s "
-            "if there is an error in the startup.")
-
-
-@pytest.mark.asyncio
-async def test_mp_cuda_init():
-    # it should not crash, when cuda is initialized
-    # in the API server process
-    import torch
-    torch.cuda.init()
-    parser = FlexibleArgumentParser(description="vLLM's remote OpenAI server.")
-    parser = make_arg_parser(parser)
-    args = parser.parse_args([])
-
-    async with build_async_engine_client(args):
-        pass
-
-
-@pytest.mark.asyncio
-async def test_engine_process_death(tmp_socket):
-    with RemoteMQLLMEngine(engine_args=ENGINE_ARGS,
-                           ipc_path=tmp_socket) as engine:
-
-        client = await engine.make_client()
-        assert client.is_running
-
-        # kill the engine process
-        engine.proc.kill()
-
-        # Generate call should fail
-        with pytest.raises(MQEngineDeadError):
-            async for _ in client.generate(prompt="Hello my name is",
-                                           sampling_params=SamplingParams(),
-                                           request_id=str(uuid.uuid4())):
-                pass
-
-        # And the health check should show the engine is dead
-        with pytest.raises(RuntimeError, match="Engine process .* died"):
-            await client.check_health()
-
-        client.close()
-
-
-def run_with_evil_input_processing(engine_args: AsyncEngineArgs,
-                                   ipc_path: str):
-    """Simulate an exception while preparing inputs for the model.
-    In the wild, this could be something like a multimodal input processor
-    failing on invalid image data."""
-
-    # Make engine.
-    engine = MQLLMEngine.from_engine_args(
-        engine_args=engine_args,
-        usage_context=UsageContext.UNKNOWN_CONTEXT,
-        ipc_path=ipc_path)
-
-    runner = engine.engine.model_executor.driver_worker.worker.model_runner
-
-    # Raise error in the model runner when adding a sequence group.
-    # See class ModelInputForGPUBuilder
-    def raiser(_, seq_group_metadata: SequenceGroupMetadata):
-        if seq_group_metadata.request_id.startswith("evil"):
-            raise RAISED_ERROR(RAISED_VALUE)
-
-    runner.builder.per_seq_group_compute_fns.append(raiser)
-
-    # Run engine.
-    engine.start()
-
-
-@pytest.mark.asyncio
-async def test_failed_inputs(tmp_socket):
-    with RemoteMQLLMEngine(engine_args=ENGINE_ARGS,
-                           ipc_path=tmp_socket,
-                           run_fn=run_with_evil_input_processing) as engine:
-
-        client = await engine.make_client()
-        assert client.is_running
-
-        # Engine should be healthy
-        await client.check_health()
-
-        async def run_failing_request():
-            async for _ in client.generate(
-                    prompt="Hello my name is",
-                    sampling_params=SamplingParams(max_tokens=10),
-                    request_id="evil" + str(uuid.uuid4())):
-                pass
-
-        async def run_passing_request():
-            async for _ in client.generate(
-                    prompt="Hello my name is",
-                    sampling_params=SamplingParams(max_tokens=10),
-                    request_id=str(uuid.uuid4())):
-                pass
-
-        passing_tasks = [
-            asyncio.create_task(run_passing_request()) for _ in range(10)
-        ]
-        failing_tasks = [
-            asyncio.create_task(run_failing_request()) for _ in range(10)
-        ]
-        await asyncio.gather(*failing_tasks, return_exceptions=True)
-        await asyncio.gather(*passing_tasks)
-
-        # All the bad inputs should have raised
-        for task in failing_tasks:
-            with pytest.raises(RAISED_ERROR):
-                task.result()
-
-        # But all good inputs should have still succeeded
-        for task in passing_tasks:
-            task.result()
-
-        # And the engine should remain healthy
-        assert not client.errored
-        await client.check_health()
-
-        client.close()
--- a/tests/mq_llm_engine/test_load.py
+++ b/tests/mq_llm_engine/test_load.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Test that the MQLLMEngine is able to handle 10k concurrent requests."""
-
-import asyncio
-import tempfile
-import uuid
-
-import pytest
-
-from tests.mq_llm_engine.utils import RemoteMQLLMEngine, generate
-from vllm.engine.arg_utils import AsyncEngineArgs
-
-MODEL = "google/gemma-1.1-2b-it"
-NUM_EXPECTED_TOKENS = 10
-NUM_REQUESTS = 10000
-
-# Scenarios to test for num generated token.
-ENGINE_ARGS = AsyncEngineArgs(model=MODEL)
-
-
-@pytest.fixture(scope="function")
-def tmp_socket():
-    with tempfile.TemporaryDirectory() as td:
-        yield f"ipc://{td}/{uuid.uuid4()}"
-
-
-@pytest.mark.asyncio
-async def test_load(tmp_socket):
-    with RemoteMQLLMEngine(engine_args=ENGINE_ARGS,
-                           ipc_path=tmp_socket) as engine:
-
-        client = await engine.make_client()
-
-        request_ids = [f"request-{i}" for i in range(NUM_REQUESTS)]
-
-        # Create concurrent requests.
-        tasks = []
-        for request_id in request_ids:
-            tasks.append(
-                asyncio.create_task(
-                    generate(client, request_id, NUM_EXPECTED_TOKENS)))
-
-        # Confirm that we got all the EXPECTED tokens from the requests.
-        failed_request_id = None
-        tokens = None
-        for task in tasks:
-            num_generated_tokens, request_id = await task
-            if (num_generated_tokens != NUM_EXPECTED_TOKENS
-                    and failed_request_id is None):
-                failed_request_id = request_id
-                tokens = num_generated_tokens
-
-        assert failed_request_id is None, (
-            f"{failed_request_id} generated {tokens} but "
-            f"expected {NUM_EXPECTED_TOKENS}")
-
-        # Shutdown.
-        client.close()
--- a/tests/mq_llm_engine/utils.py
+++ b/tests/mq_llm_engine/utils.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import asyncio
-import multiprocessing
-from typing import Callable, Union
-
-from vllm import SamplingParams
-from vllm.engine.arg_utils import AsyncEngineArgs
-from vllm.engine.multiprocessing.client import MQLLMEngineClient
-from vllm.engine.multiprocessing.engine import MQLLMEngine
-from vllm.outputs import RequestOutput
-from vllm.usage.usage_lib import UsageContext
-
-
-async def generate(
-        client: MQLLMEngineClient,
-        request_id: str,
-        num_tokens: int,
-        return_output: bool = False) -> Union[RequestOutput, tuple[int, str]]:
-
-    final_output = None
-    count = 0
-    async for out in client.generate(
-            request_id=request_id,
-            prompt="Hello my name is Robert and",
-            sampling_params=SamplingParams(max_tokens=num_tokens,
-                                           temperature=0)):
-
-        count += 1
-        final_output = out
-        await asyncio.sleep(0.)
-
-    if return_output:
-        return final_output
-
-    # Confirm we generated all the tokens we expected.
-    return count, request_id
-
-
-def run_normal(engine_args: AsyncEngineArgs, ipc_path: str):
-    # Make engine.
-    engine = MQLLMEngine.from_engine_args(
-        engine_args=engine_args,
-        usage_context=UsageContext.UNKNOWN_CONTEXT,
-        ipc_path=ipc_path)
-
-    # Run engine.
-    engine.start()
-
-
-class RemoteMQLLMEngine:
-
-    def __init__(self,
-                 engine_args: AsyncEngineArgs,
-                 ipc_path: str,
-                 run_fn: Callable = run_normal) -> None:
-
-        self.engine_args = engine_args
-        self.ipc_path = ipc_path
-        context = multiprocessing.get_context("spawn")
-        self.proc = context.Process(target=run_fn,
-                                    args=(engine_args, ipc_path))
-        self.proc.start()
-
-    def __enter__(self):
-        return self
-
-    def __exit__(self, exc_type, exc_value, traceback):
-        self.proc.kill()
-
-    async def make_client(self) -> MQLLMEngineClient:
-        engine_config = self.engine_args.create_engine_config()
-        client = MQLLMEngineClient(self.ipc_path, engine_config, self.proc.pid)
-        while True:
-            try:
-                await client.setup()
-                break
-            except TimeoutError:
-                assert self.proc.is_alive()
-        return client
--- a/vllm/engine/multiprocessing/__init__.py
+++ b/vllm/engine/multiprocessing/__init__.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import uuid
-from dataclasses import dataclass, field
-from enum import Enum
-from typing import List, Mapping, Optional, Union
-
-from vllm import PoolingParams
-from vllm.inputs import PromptType
-from vllm.lora.request import LoRARequest
-from vllm.outputs import RequestOutput
-from vllm.sampling_params import SamplingParams
-from vllm.utils import Device
-
-VLLM_RPC_SUCCESS_STR = "SUCCESS"
-
-IPC_INPUT_EXT = "_input_socket"
-IPC_OUTPUT_EXT = "_output_socket"
-IPC_HEALTH_EXT = "_health_socket"
-IPC_DATA_EXT = "_data_socket"
-
-
-class MQEngineDeadError(RuntimeError):
-    pass
-
-
-@dataclass
-class RPCProcessRequest:
-    prompt: PromptType
-    params: Union[SamplingParams, PoolingParams]
-    request_id: str
-    lora_request: Optional[LoRARequest] = None
-    trace_headers: Optional[Mapping[str, str]] = None
-    priority: int = 0
-
-    def __init__(
-        self,
-        prompt: PromptType,
-        params: Union[SamplingParams, PoolingParams],
-        request_id: str,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        priority: int = 0,
-    ) -> None:
-        super().__init__()
-
-        self.prompt = prompt
-        self.params = params
-        self.request_id = request_id
-        self.lora_request = lora_request
-        self.trace_headers = trace_headers
-        self.priority = priority
-
-
-@dataclass
-class RPCError:
-    request_id: Optional[str]
-    is_engine_errored: bool
-    exception: BaseException
-
-
-@dataclass
-class RPCAbortRequest:
-    request_id: str
-
-
-class RPCStartupRequest(Enum):
-    IS_SERVER_READY = 1
-
-
-@dataclass
-class RPCStartupResponse:
-    tracing_enabled: bool
-
-
-class RPCUProfileRequest(Enum):
-    START_PROFILE = 1
-    STOP_PROFILE = 2
-
-
-class RPCResetMultiModalCacheRequest(Enum):
-    RESET = 1
-
-
-@dataclass
-class RPCResetPrefixCacheRequest:
-    device: Device
-
-
-class RPCSleepRequest(Enum):
-    SLEEP_LEVEL_1 = 1
-    SLEEP_LEVEL_2 = 2
-
-
-@dataclass
-class RPCWakeUpRequest:
-    tags: Optional[list[str]] = None
-
-
-@dataclass
-class RPCIsSleepingRequest:
-    # Set the default value of request_id to a new UUID
-    request_id: str = field(default_factory=lambda: str(uuid.uuid4()))
-
-
-@dataclass
-class RPCIsSleepingResponse:
-    request_id: str
-    is_sleeping: bool
-
-
-@dataclass
-class RPCLoadAdapterRequest:
-    lora_request: LoRARequest
-    # Set the default value of request_id to a new UUID
-    request_id: str = field(default_factory=lambda: str(uuid.uuid4()))
-
-
-@dataclass
-class RPCAdapterLoadedResponse:
-    request_id: str
-    lora_loaded: bool
-
-
-RPC_REQUEST_T = Union[RPCProcessRequest, RPCAbortRequest, RPCStartupRequest,
-                      RPCUProfileRequest, RPCLoadAdapterRequest,
-                      RPCResetMultiModalCacheRequest,
-                      RPCResetPrefixCacheRequest, RPCSleepRequest,
-                      RPCWakeUpRequest, RPCIsSleepingRequest]
-
-REQUEST_OUTPUTS_T = Union[List[RequestOutput], RPCAdapterLoadedResponse,
-                          RPCIsSleepingResponse, RPCError]
-
-
-def ENGINE_DEAD_ERROR(
-        error: Optional[BaseException] = None) -> MQEngineDeadError:
-    if error is None:
-        return MQEngineDeadError(
-            "Engine loop is not running. Inspect the stacktrace to "
-            "find the original error")
-
-    return MQEngineDeadError(
-        "Engine loop is not running. Inspect the stacktrace to "
-        f"find the original error: {repr(error)}.")
--- a/vllm/engine/multiprocessing/client.py
+++ b/vllm/engine/multiprocessing/client.py
--- a/vllm/engine/multiprocessing/engine.py
+++ b/vllm/engine/multiprocessing/engine.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import pickle
-import signal
-from contextlib import contextmanager
-from typing import Iterator, List, Optional, Union
-
-import cloudpickle
-import zmq
-
-from vllm import AsyncEngineArgs, SamplingParams
-from vllm.config import VllmConfig
-from vllm.engine.llm_engine import LLMEngine
-# yapf conflicts with isort for this block
-# yapf: disable
-from vllm.engine.multiprocessing import (ENGINE_DEAD_ERROR, IPC_DATA_EXT,
-                                         IPC_HEALTH_EXT, IPC_INPUT_EXT,
-                                         IPC_OUTPUT_EXT, REQUEST_OUTPUTS_T,
-                                         VLLM_RPC_SUCCESS_STR, RPCAbortRequest,
-                                         RPCAdapterLoadedResponse, RPCError,
-                                         RPCIsSleepingRequest,
-                                         RPCIsSleepingResponse,
-                                         RPCLoadAdapterRequest,
-                                         RPCProcessRequest,
-                                         RPCResetMultiModalCacheRequest,
-                                         RPCResetPrefixCacheRequest,
-                                         RPCSleepRequest, RPCStartupRequest,
-                                         RPCStartupResponse,
-                                         RPCUProfileRequest, RPCWakeUpRequest)
-# yapf: enable
-from vllm.logger import init_logger
-from vllm.outputs import RequestOutput
-from vllm.transformers_utils.config import (
-    maybe_register_config_serialize_by_value)
-from vllm.usage.usage_lib import UsageContext
-from vllm.utils import deprecate_kwargs
-from vllm.worker.model_runner_base import InputProcessingError
-
-logger = init_logger(__name__)
-
-POLLING_TIMEOUT_MS = 10000
-HEALTHY_RESPONSE = (pickle.dumps(VLLM_RPC_SUCCESS_STR), )
-
-
-class MQLLMEngine:
-    """A multiprocessing wrapper for
-    [`LLMEngine`][vllm.engine.llm_engine.LLMEngine].
-
-    This class is used to wrap the
-    [`LLMEngine`][vllm.engine.llm_engine.LLMEngine] class to enable use
-    in concurrent manner. It runs a background loop and uses zeromq to
-    receive new requests and stream outputs incrementally via ipc.
-
-    The [`LLMEngine`][vllm.engine.llm_engine.LLMEngine] generate or encode
-    process is kicked off when a new RPCProcessRequest is received by the
-    input_socket.
-
-    The self.engine_loop checks the input_socket for new requests,
-    adds them to the LLMEngine if there are any, calls the internal
-    [`LLMEngine.step()`][vllm.engine.llm_engine.LLMEngine.step], and sends
-    the RequestOutputs back over the output_socket.
-
-    If use_async_sockets is set, the logic associated with reading new
-    requests from the socket and sending data to the socket is passed
-    as a callback to the llm_engine, which calls the logic asynchronously
-    such that the IPC can be overlapped with the GPU.
-
-    Args:
-        ipc_path: Base path for zeromq interprocess messaging
-        use_async_sockets: Whether to make send/recv async with GPU
-        log_requests: Whether to log the requests.
-        *args: Arguments for [`LLMEngine`][vllm.engine.llm_engine.LLMEngine].
-        **kwargs: Arguments for [`LLMEngine`][vllm.engine.llm_engine.LLMEngine].
-    """
-
-    def __init__(self,
-                 ipc_path: str,
-                 use_async_sockets: bool,
-                 *args,
-                 log_requests: bool = True,
-                 **kwargs) -> None:
-        # For MQLLMEngine, we can use cached outputs, since each new request
-        # output is immediately pickled and send over the socket, which frees
-        # the python object to be reused again.
-        kwargs['use_cached_outputs'] = True
-
-        self.engine = LLMEngine(*args, **kwargs)
-        self.log_requests = log_requests
-
-        self.use_async_sockets = use_async_sockets
-        if self.use_async_sockets:
-            self.engine.process_request_outputs_callback = \
-                self._async_socket_engine_callback
-
-        self.ctx = zmq.Context()  # type: ignore[attr-defined]
-
-        # Receive input from the client.
-        self.input_socket = self.ctx.socket(zmq.constants.PULL)
-        self.input_socket.bind(f"{ipc_path}{IPC_INPUT_EXT}")
-
-        # Send output stream back to client.
-        self.output_socket = self.ctx.socket(zmq.constants.PUSH)
-        self.output_socket.bind(f"{ipc_path}{IPC_OUTPUT_EXT}")
-
-        # Send heartbeats back to client.
-        self.heartbeat_socket = self.ctx.socket(zmq.constants.PUSH)
-        self.heartbeat_socket.bind(f"{ipc_path}{IPC_HEALTH_EXT}")
-
-        # IPC path for the data socket.
-        self.data_ipc_path = f"{ipc_path}{IPC_DATA_EXT}"
-
-        # Error state.
-        self._errored_with: Optional[BaseException] = None
-
-    @property
-    def dead_error(self) -> BaseException:
-        if self._errored_with is not None:
-            return ENGINE_DEAD_ERROR(self._errored_with)
-        else:
-            return ENGINE_DEAD_ERROR()
-
-    @classmethod
-    @deprecate_kwargs(
-        "disable_log_requests",
-        additional_message=("This argument will have no effect. "
-                            "Use `enable_log_requests` instead."),
-    )
-    def from_vllm_config(
-            cls,
-            vllm_config: VllmConfig,
-            usage_context: UsageContext,
-            enable_log_requests: bool,
-            disable_log_stats: bool,
-            ipc_path: str,
-            disable_log_requests: bool = True,  # Deprecated, will be removed
-    ) -> "MQLLMEngine":
-        # Setup plugins for each process
-        from vllm.plugins import load_general_plugins
-        load_general_plugins()
-
-        use_async_sockets = vllm_config.model_config.use_async_output_proc
-
-        return cls(
-            vllm_config=vllm_config,
-            executor_class=LLMEngine._get_executor_cls(vllm_config),
-            ipc_path=ipc_path,
-            usage_context=usage_context,
-            use_async_sockets=use_async_sockets,
-            log_requests=enable_log_requests,
-            log_stats=(not disable_log_stats),
-        )
-
-    @staticmethod
-    def from_engine_args(engine_args: AsyncEngineArgs,
-                         usage_context: UsageContext, ipc_path: str):
-        """Creates an MQLLMEngine from the engine arguments."""
-
-        vllm_config = engine_args.create_engine_config(usage_context)
-        return MQLLMEngine.from_vllm_config(
-            ipc_path=ipc_path,
-            vllm_config=vllm_config,
-            usage_context=usage_context,
-            enable_log_requests=engine_args.enable_log_requests,
-            disable_log_stats=engine_args.disable_log_stats,
-        )
-
-    def start(self):
-        try:
-            try:
-                logger.debug("Starting Startup Loop.")
-                self.run_startup_loop()
-                logger.debug("Starting Engine Loop.")
-                self.run_engine_loop()
-            except Exception as e:
-                logger.exception(repr(e))
-        except KeyboardInterrupt:
-            logger.debug("Shutting down MQLLMEngine.")
-        finally:
-            logger.debug("MQLLMEngine is shut down.")
-            self.cleanup()
-
-    def cleanup(self):
-        """Cleanup zeromq state on shutdown."""
-        # Closes all sockets and destroys context.
-        self.ctx.destroy(linger=0)
-        del self.engine
-
-    @contextmanager
-    def make_data_socket(
-            self) -> Iterator[zmq.Socket]:  # type: ignore[name-defined]
-        socket = self.ctx.socket(zmq.constants.ROUTER)
-        try:
-            socket.bind(self.data_ipc_path)
-            yield socket
-        finally:
-            socket.close(linger=0)
-
-    def run_startup_loop(self) -> None:
-        """Startup loop for sending data from Engine -> Client."""
-
-        with self.make_data_socket() as socket:
-            response: Union[RPCStartupResponse, BaseException]
-            try:
-                identity, message = socket.recv_multipart(copy=False)
-                request: RPCStartupRequest = pickle.loads(message.buffer)
-
-                # Handle the query from the Client.
-                if request == RPCStartupRequest.IS_SERVER_READY:
-                    tracing_enabled = self.engine.is_tracing_enabled()
-                    response = RPCStartupResponse(
-                        tracing_enabled=tracing_enabled)
-
-            except Exception as e:
-                response = e
-
-            socket.send_multipart((identity, pickle.dumps(response)),
-                                  copy=False)
-
-    def run_engine_loop(self):
-        """Core busy loop of the LLMEngine."""
-
-        while True:
-            if not self.engine.has_unfinished_requests():
-                # Poll until there is work to do.
-                while self.input_socket.poll(timeout=POLLING_TIMEOUT_MS) == 0:
-                    # When there's no work, check on engine health and send
-                    # health status back to client
-                    self._health_check()
-                    self.engine.do_log_stats()
-                    logger.debug("Waiting for new requests in engine loop.")
-
-            # Handle any input from the client.
-            self.handle_new_input()
-
-            # Engine step.
-            request_outputs = self.engine_step()
-
-            # Send request outputs (if async, done in engine_step callback).
-            if not self.use_async_sockets:
-                self._send_outputs(request_outputs)
-
-    def engine_step(self) -> List[RequestOutput]:
-        """Engine step wrapper with error handling."""
-        try:
-            return self.engine.step()
-        except SystemExit:
-            raise
-        except InputProcessingError as e:
-            # Special case where we handle an error preparing the inputs for
-            # a single request in the batch
-            rpc_err = RPCError(request_id=e.request_id,
-                               is_engine_errored=False,
-                               exception=e.__cause__)
-            self._send_outputs(rpc_err)
-            return []
-        except BaseException as e:
-            self._set_errored(e)
-            rpc_err = RPCError(request_id=None,
-                               is_engine_errored=True,
-                               exception=e)
-            self._send_outputs(rpc_err)
-            raise e
-
-    def handle_new_input(self):
-        """Handle new input from the socket"""
-        try:
-            while self.input_socket.poll(timeout=0) != 0:
-                frames = self.input_socket.recv_multipart(copy=False)
-                request = pickle.loads(frames[0].buffer)
-
-                if isinstance(request, RPCProcessRequest):
-                    if len(frames) > 1:
-                        # Use cloudpickle for logits processors
-                        assert isinstance(request.params, SamplingParams)
-                        lprocs = cloudpickle.loads(frames[1].buffer)
-                        request.params.logits_processors = lprocs
-                    self._handle_process_request(request)
-                elif isinstance(request, RPCAbortRequest):
-                    self._handle_abort_request(request)
-                elif isinstance(request, RPCUProfileRequest):
-                    if request == RPCUProfileRequest.START_PROFILE:
-                        self.start_profile()
-                    else:
-                        self.stop_profile()
-                elif isinstance(request, RPCLoadAdapterRequest):
-                    self._handle_load_adapter_request(request)
-                elif isinstance(request, RPCResetMultiModalCacheRequest):
-                    self.reset_mm_cache()
-                elif isinstance(request, RPCResetPrefixCacheRequest):
-                    self.reset_prefix_cache()
-                elif isinstance(request, RPCSleepRequest):
-                    self.sleep(request.value)
-                elif isinstance(request, RPCWakeUpRequest):
-                    self.wake_up(request.tags)
-                elif isinstance(request, RPCIsSleepingRequest):
-                    self._handle_is_sleeping_request(request)
-                else:
-                    raise ValueError("Unknown RPCRequest Type: "
-                                     f"{type(request)}")
-
-        except Exception as e:
-            self._set_errored(e)
-            self._send_unhealthy(e)
-            raise e from None
-
-    def _handle_process_request(self, request: RPCProcessRequest):
-        """Handle RPCProcessRequest by adding it to the LLMEngine."""
-        request_id = request.request_id
-
-        if self._errored_with is not None:
-            rpc_err = RPCError(request_id=request_id,
-                               is_engine_errored=True,
-                               exception=ENGINE_DEAD_ERROR(self._errored_with))
-            self._send_outputs(rpc_err)
-
-        try:
-            self.engine.add_request(request_id=request_id,
-                                    prompt=request.prompt,
-                                    params=request.params,
-                                    lora_request=request.lora_request,
-                                    trace_headers=request.trace_headers,
-                                    priority=request.priority)
-
-            if self.log_requests:
-                logger.info("Added request %s.", request.request_id)
-
-        except Exception as e:
-            # We do not set self._errored = True here, since the error
-            # is due to an issue adding this request to the engine,
-            # rather than an issue with the engine itself.
-            logger.debug("Failed to add request %s to engine. %s",
-                         request.request_id, e)
-            is_errored = self._errored_with is not None
-            rpc_err = RPCError(request_id=request_id,
-                               is_engine_errored=is_errored,
-                               exception=e)
-            self._send_outputs(rpc_err)
-
-            # Remove request from the engine.
-            self.engine.abort_request(request_id)
-
-    def _handle_abort_request(self, request: RPCAbortRequest):
-        self.engine.abort_request(request.request_id)
-        if self.log_requests:
-            logger.info("Aborted request %s.", request.request_id)
-
-    def _handle_load_adapter_request(self, request: RPCLoadAdapterRequest):
-        try:
-            lora_loaded = self.engine.add_lora(request.lora_request)
-        except BaseException as e:
-            # Send back an error if the adater fails to load
-            rpc_err = RPCError(request_id=request.request_id,
-                               is_engine_errored=False,
-                               exception=e)
-            self._send_outputs(rpc_err)
-            return
-        # Otherwise, send back the successful load message
-        self._send_outputs(
-            RPCAdapterLoadedResponse(request_id=request.request_id,
-                                     lora_loaded=lora_loaded))
-
-    def _handle_is_sleeping_request(self, request: RPCIsSleepingRequest):
-        is_sleeping = self.is_sleeping()
-        self._send_outputs(
-            RPCIsSleepingResponse(request_id=request.request_id,
-                                  is_sleeping=is_sleeping))
-
-    def _health_check(self):
-        # Send unhealthy if engine has already errored
-        if self._errored_with is not None:
-            self._send_unhealthy(self._errored_with)
-        try:
-            self.engine.check_health()
-            self._send_healthy()
-        except Exception as e:
-            self._set_errored(e)
-            self._send_unhealthy(e)
-
-    def _send_outputs(self, outputs: REQUEST_OUTPUTS_T):
-        """Send outputs back to the engine client. These can be:
-        - Exceptions
-        - A list of generation outputs
-        - A response from loading a lora adapter
-        """
-        if outputs:
-            try:
-                from ray.exceptions import RayTaskError
-
-                # RayTaskError might not pickelable here. We need to unpack the
-                # underlying exception as the real exception in the output.
-                if (isinstance(outputs, RPCError)
-                        and isinstance(outputs.exception, RayTaskError)):
-                    outputs.exception = outputs.exception.cause
-            except ImportError:
-                pass
-
-            output_bytes = pickle.dumps(outputs)
-            self.output_socket.send_multipart((output_bytes, ), copy=False)
-
-    def _send_healthy(self):
-        """Send HEALTHY message to RPCClient."""
-        if not self.heartbeat_socket.closed:
-            self.heartbeat_socket.send_multipart(HEALTHY_RESPONSE, copy=False)
-
-    def _send_unhealthy(self, error: BaseException):
-        """Send UNHEALTHY message to RPCClient."""
-        if not self.heartbeat_socket.closed:
-            error_bytes = pickle.dumps(error)
-            self.heartbeat_socket.send_multipart((error_bytes, ), copy=False)
-
-    def _async_socket_engine_callback(self,
-                                      request_outputs: REQUEST_OUTPUTS_T):
-        """Callback used by engine to make socket handling async with GPU."""
-        self._send_outputs(request_outputs)
-        self.handle_new_input()
-
-    def _set_errored(self, e: BaseException):
-        """Log and set errored status if this is the first issue."""
-        if self._errored_with is None:
-            self._errored_with = e
-
-    def start_profile(self) -> None:
-        self.engine.start_profile()
-
-    def stop_profile(self) -> None:
-        self.engine.stop_profile()
-
-    def reset_mm_cache(self) -> bool:
-        return self.engine.reset_mm_cache()
-
-    def reset_prefix_cache(self) -> bool:
-        return self.engine.reset_prefix_cache()
-
-    def sleep(self, level: int = 1) -> None:
-        self.engine.sleep(level)
-
-    def wake_up(self, tags: Optional[list[str]] = None) -> None:
-        self.engine.wake_up(tags)
-
-    def is_sleeping(self) -> bool:
-        return self.engine.is_sleeping()
-
-
-def signal_handler(*_) -> None:
-    raise KeyboardInterrupt("MQLLMEngine terminated")
-
-
-def run_mp_engine(vllm_config: VllmConfig, usage_context: UsageContext,
-                  ipc_path: str, disable_log_stats: bool,
-                  enable_log_requests: bool, engine_alive):
-    try:
-        # Ensure we can serialize transformer config before spawning
-        maybe_register_config_serialize_by_value()
-
-        engine = MQLLMEngine.from_vllm_config(
-            vllm_config=vllm_config,
-            usage_context=usage_context,
-            disable_log_stats=disable_log_stats,
-            enable_log_requests=enable_log_requests,
-            ipc_path=ipc_path)
-
-        signal.signal(signal.SIGTERM, signal_handler)
-
-        engine.start()
-
-    except BaseException as e:
-        logger.exception(e)
-        engine_alive.value = False
-        raise e from None
--- a/vllm/entrypoints/launcher.py
+++ b/vllm/entrypoints/launcher.py
@@ -12,7 +12,6 @@ from fastapi import FastAPI, Request, Response

 from vllm import envs
 from vllm.engine.async_llm_engine import AsyncEngineDeadError
-from vllm.engine.multiprocessing import MQEngineDeadError
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.constants import (H11_MAX_HEADER_COUNT_DEFAULT,
                                        H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT)
@@ -156,7 +155,6 @@ def _add_shutdown_handlers(app: FastAPI, server: uvicorn.Server) -> None:

    @app.exception_handler(RuntimeError)
    @app.exception_handler(AsyncEngineDeadError)
-    @app.exception_handler(MQEngineDeadError)
    @app.exception_handler(EngineDeadError)
    @app.exception_handler(EngineGenerateError)
    async def runtime_exception_handler(request: Request, __):

--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import asyncio
-import atexit
 import gc
 import importlib
 import inspect
@@ -17,7 +16,6 @@ import uuid
 from argparse import Namespace
 from collections.abc import AsyncGenerator, AsyncIterator, Awaitable
 from contextlib import asynccontextmanager
-from functools import partial
 from http import HTTPStatus
 from typing import Annotated, Any, Callable, Optional

@@ -42,8 +40,6 @@ import vllm.envs as envs
 from vllm.config import VllmConfig
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine  # type: ignore
-from vllm.engine.multiprocessing.client import MQLLMEngineClient
-from vllm.engine.multiprocessing.engine import run_mp_engine
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.chat_utils import (load_chat_template,
                                         resolve_hf_chat_template,
@@ -102,13 +98,10 @@ from vllm.entrypoints.utils import (cli_env_setup, load_aware_call,
                                    log_non_default_args, with_cancellation)
 from vllm.logger import init_logger
 from vllm.reasoning import ReasoningParserManager
-from vllm.transformers_utils.config import (
-    maybe_register_config_serialize_by_value)
 from vllm.transformers_utils.tokenizer import MistralTokenizer
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils import (Device, FlexibleArgumentParser, decorate_logs,
-                        get_open_zmq_ipc_path, is_valid_ipv6_address,
-                        set_ulimit)
+                        is_valid_ipv6_address, set_ulimit)
 from vllm.v1.metrics.prometheus import get_prometheus_registry
 from vllm.version import __version__ as VLLM_VERSION

@@ -237,8 +230,7 @@ async def build_async_engine_client_from_engine_args(
                async_llm.shutdown()

    # V0 AsyncLLM.
-    elif (MQLLMEngineClient.is_unsupported_config(vllm_config)
-          or disable_frontend_multiprocessing):
+    else:

        engine_client: Optional[EngineClient] = None
        try:
@@ -252,96 +244,6 @@ async def build_async_engine_client_from_engine_args(
            if engine_client and hasattr(engine_client, "shutdown"):
                engine_client.shutdown()

-    # V0MQLLMEngine.
-    else:
-        if "PROMETHEUS_MULTIPROC_DIR" not in os.environ:
-            # Make TemporaryDirectory for prometheus multiprocessing
-            # Note: global TemporaryDirectory will be automatically
-            #   cleaned up upon exit.
-            global prometheus_multiproc_dir
-            prometheus_multiproc_dir = tempfile.TemporaryDirectory()
-            os.environ[
-                "PROMETHEUS_MULTIPROC_DIR"] = prometheus_multiproc_dir.name
-        else:
-            logger.warning(
-                "Found PROMETHEUS_MULTIPROC_DIR was set by user. "
-                "This directory must be wiped between vLLM runs or "
-                "you will find inaccurate metrics. Unset the variable "
-                "and vLLM will properly handle cleanup.")
-
-        # Select random path for IPC.
-        ipc_path = get_open_zmq_ipc_path()
-        logger.debug("Multiprocessing frontend to use %s for IPC Path.",
-                     ipc_path)
-
-        # Start RPCServer in separate process (holds the LLMEngine).
-        # the current process might have CUDA context,
-        # so we need to spawn a new process
-        context = multiprocessing.get_context("spawn")
-
-        # Ensure we can serialize transformer config before spawning
-        maybe_register_config_serialize_by_value()
-
-        # The Process can raise an exception during startup, which may
-        # not actually result in an exitcode being reported. As a result
-        # we use a shared variable to communicate the information.
-        engine_alive = multiprocessing.Value('b', True, lock=False)
-        engine_process = context.Process(
-            target=run_mp_engine,
-            args=(vllm_config, UsageContext.OPENAI_API_SERVER, ipc_path,
-                  engine_args.disable_log_stats,
-                  engine_args.enable_log_requests, engine_alive))
-        engine_process.start()
-        engine_pid = engine_process.pid
-        assert engine_pid is not None, "Engine process failed to start."
-        logger.info("Started engine process with PID %d", engine_pid)
-
-        def _cleanup_ipc_path():
-            socket_path = ipc_path.replace("ipc://", "")
-            if os.path.exists(socket_path):
-                os.remove(socket_path)
-
-        # Ensure we clean up the local IPC socket file on exit.
-        atexit.register(_cleanup_ipc_path)
-
-        # Build RPCClient, which conforms to EngineClient Protocol.
-        build_client = partial(MQLLMEngineClient, ipc_path, vllm_config,
-                               engine_pid)
-        mq_engine_client = await asyncio.get_running_loop().run_in_executor(
-            None, build_client)
-        try:
-            while True:
-                try:
-                    await mq_engine_client.setup()
-                    break
-                except TimeoutError:
-                    if (not engine_process.is_alive()
-                            or not engine_alive.value):
-                        raise RuntimeError(
-                            "Engine process failed to start. See stack "
-                            "trace for the root cause.") from None
-
-            yield mq_engine_client  # type: ignore[misc]
-        finally:
-            # Ensure rpc server process was terminated
-            engine_process.terminate()
-
-            # Close all open connections to the backend
-            mq_engine_client.close()
-
-            # Wait for engine process to join
-            engine_process.join(4)
-            if engine_process.exitcode is None:
-                # Kill if taking longer than 5 seconds to stop
-                engine_process.kill()
-
-            # Lazy import for prometheus multiprocessing.
-            # We need to set PROMETHEUS_MULTIPROC_DIR environment variable
-            # before prometheus_client is imported.
-            # See https://prometheus.github.io/client_python/multiprocess/
-            from prometheus_client import multiprocess
-            multiprocess.mark_process_dead(engine_process.pid)
-

 async def validate_json_request(raw_request: Request):
    content_type = raw_request.headers.get("content-type", "").lower()

--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -191,7 +191,7 @@ class RocmPlatform(Platform):
                             kv_cache_dtype, block_size, use_v1, use_mla,
                             has_sink) -> str:
        if use_mla:
-            from vllm.attention.backends.rocm_aiter_mla import (
+            from vllm.v1.attention.backends.mla.rocm_aiter_mla import (
                is_aiter_mla_enabled)

            if selected_backend is None: