Merge branch 'v0.5.4-dtk24.04.1'

e7c1b7f3 · zhuwenwen · 7462218e · 04c62b93 · 7462218e · e7c1b7f3
Commit e7c1b7f3 authored Sep 06, 2024 by zhuwenwen
20 changed files
--- a/tests/tokenization/test_image_processor.py
+++ b/tests/tokenization/test_image_processor.py
-import pytest
-from transformers.image_processing_utils import BaseImageProcessor
-
-from vllm.transformers_utils.image_processor import get_image_processor
-
-IMAGE_PROCESSOR_NAMES = [
-    "llava-hf/llava-1.5-7b-hf",
-    "llava-hf/llava-v1.6-34b-hf",
-]
-
-
-@pytest.mark.parametrize("processor_name", IMAGE_PROCESSOR_NAMES)
-def test_image_processor_revision(processor_name: str):
-    # Assume that "main" branch always exists
-    image_processor = get_image_processor(processor_name, revision="main")
-    assert isinstance(image_processor, BaseImageProcessor)
-
-    # Assume that "never" branch always does not exist
-    with pytest.raises(OSError, match='not a valid git identifier'):
-        get_image_processor(processor_name, revision="never")
--- a/tests/tokenization/test_tokenizer_group.py
+++ b/tests/tokenization/test_tokenizer_group.py
 import asyncio
 import os
+import sys
+from typing import List, Optional
 from unittest.mock import patch

 import pytest
 from transformers import AutoTokenizer, PreTrainedTokenizerBase

-from vllm.transformers_utils.tokenizer_group import get_tokenizer_group
+from vllm.transformers_utils.tokenizer_group import (TokenizerGroup,
+                                                     get_tokenizer_group)
 from vllm.transformers_utils.tokenizer_group.ray_tokenizer_group import (
    RayTokenizerGroupPool)
-from vllm.transformers_utils.tokenizer_group.tokenizer_group import (
-    TokenizerGroup)

 from ..conftest import get_tokenizer_pool_config


+class CustomTokenizerGroup(TokenizerGroup):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._i = 0
+
+    def encode(self, *args, **kwargs):
+        self._i += 1
+        return super().encode(*args, **kwargs)
+
+
 @pytest.mark.asyncio
-@pytest.mark.parametrize("tokenizer_group_type", [None, "ray"])
+@pytest.mark.parametrize("tokenizer_group_type",
+                         [None, "ray", CustomTokenizerGroup])
 async def test_tokenizer_group(tokenizer_group_type):
    reference_tokenizer = AutoTokenizer.from_pretrained("gpt2")
    tokenizer_group = get_tokenizer_group(
@@ -34,6 +47,8 @@ async def test_tokenizer_group(tokenizer_group_type):
                      PreTrainedTokenizerBase)
    assert tokenizer_group.get_lora_tokenizer(
        None) == await tokenizer_group.get_lora_tokenizer_async(None)
+    if tokenizer_group_type is CustomTokenizerGroup:
+        assert tokenizer_group._i > 0


 @pytest.mark.asyncio
@@ -100,3 +115,100 @@ async def test_tokenizer_group_ray_pool_env_var_propagation(
            max_num_seqs=1,
            max_input_length=None)
        tokenizer_pool.ping()
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("tokenizer_group_type", ["ray"])
+async def test_tokenizer_group_ray_pool_fault_tolerance(tokenizer_group_type):
+    """Test that Ray tokenizer pool group can recover from failures and
+    if that's not possible, mark itself as unhealthy."""
+
+    class FailingTokenizerGroup(TokenizerGroup):
+
+        def __init__(self,
+                     *args,
+                     fail_at: Optional[List[int]] = None,
+                     **kwargs):
+            super().__init__(*args, **kwargs)
+            self.i = 0
+            self.fail_at = fail_at or []
+
+        def encode(self, *args, **kwargs):
+            self.i += 1
+            if self.i in self.fail_at:
+                sys.exit(1)
+            return super().encode(*args, **kwargs)
+
+    class FailingRayTokenizerGroupPool(RayTokenizerGroupPool):
+        _worker_cls = FailingTokenizerGroup
+
+    # Fail at first iteration
+    fail_at = [1]
+    tokenizer_pool_config = get_tokenizer_pool_config(tokenizer_group_type)
+    tokenizer_group_pool = FailingRayTokenizerGroupPool.from_config(
+        tokenizer_pool_config,
+        tokenizer_id="gpt2",
+        enable_lora=False,
+        max_num_seqs=1,
+        max_input_length=None,
+        fail_at=fail_at)
+    tokenizer_actors = tokenizer_group_pool.tokenizer_actors.copy()
+
+    # Modify fail at to not fail at all (will be re-read when actor is
+    # re-initialized).
+    fail_at[0] = 1000
+
+    # We should recover successfully.
+    await tokenizer_group_pool.encode_async(request_id="1",
+                                            prompt="prompt",
+                                            lora_request=None)
+    await tokenizer_group_pool.encode_async(request_id="1",
+                                            prompt="prompt",
+                                            lora_request=None)
+
+    # Check that we have a new actor
+    assert len(tokenizer_group_pool.tokenizer_actors) == len(tokenizer_actors)
+    assert tokenizer_group_pool.tokenizer_actors != tokenizer_actors
+
+    # Fail at first iteration
+    fail_at = [1]
+    tokenizer_group_pool = FailingRayTokenizerGroupPool.from_config(
+        tokenizer_pool_config,
+        tokenizer_id="gpt2",
+        enable_lora=False,
+        max_num_seqs=1,
+        max_input_length=None,
+        fail_at=fail_at)
+
+    # We should fail after re-initialization.
+    with pytest.raises(RuntimeError):
+        await tokenizer_group_pool.encode_async(request_id="1",
+                                                prompt="prompt",
+                                                lora_request=None)
+
+    # check_health should raise the same thing
+    with pytest.raises(RuntimeError):
+        tokenizer_group_pool.check_health()
+
+    # Ensure that non-ActorDiedErrors are still propagated correctly and do not
+    # cause a re-initialization.
+    fail_at = []
+    tokenizer_group_pool = FailingRayTokenizerGroupPool.from_config(
+        tokenizer_pool_config,
+        tokenizer_id="gpt2",
+        enable_lora=False,
+        max_num_seqs=1,
+        max_input_length=2,
+        fail_at=fail_at)
+    tokenizer_actors = tokenizer_group_pool.tokenizer_actors.copy()
+
+    # Prompt too long error
+    with pytest.raises(ValueError):
+        await tokenizer_group_pool.encode_async(request_id="1",
+                                                prompt="prompt" * 100,
+                                                lora_request=None)
+    await tokenizer_group_pool.encode_async(request_id="1",
+                                            prompt="prompt",
+                                            lora_request=None)
+    # Actors should stay the same.
+    assert tokenizer_group_pool.tokenizer_actors == tokenizer_actors
--- a/tests/tracing/__init__.py
+++ b/tests/tracing/__init__.py
--- a/tests/tracing/test_tracing.py
+++ b/tests/tracing/test_tracing.py
+import os
+import threading
+from concurrent import futures
+from typing import Callable, Dict, Iterable, Literal
+
+import grpc
+import pytest
+from opentelemetry.proto.collector.trace.v1.trace_service_pb2 import (
+    ExportTraceServiceResponse)
+from opentelemetry.proto.collector.trace.v1.trace_service_pb2_grpc import (
+    TraceServiceServicer, add_TraceServiceServicer_to_server)
+from opentelemetry.proto.common.v1.common_pb2 import AnyValue, KeyValue
+from opentelemetry.sdk.environment_variables import (
+    OTEL_EXPORTER_OTLP_TRACES_INSECURE)
+
+from vllm import LLM, SamplingParams
+from vllm.tracing import SpanAttributes
+
+FAKE_TRACE_SERVER_ADDRESS = "localhost:4317"
+
+FieldName = Literal['bool_value', 'string_value', 'int_value', 'double_value',
+                    'array_value']
+
+
+def decode_value(value: AnyValue):
+    field_decoders: Dict[FieldName, Callable] = {
+        "bool_value": (lambda v: v.bool_value),
+        "string_value": (lambda v: v.string_value),
+        "int_value": (lambda v: v.int_value),
+        "double_value": (lambda v: v.double_value),
+        "array_value":
+        (lambda v: [decode_value(item) for item in v.array_value.values]),
+    }
+    for field, decoder in field_decoders.items():
+        if value.HasField(field):
+            return decoder(value)
+    raise ValueError(f"Couldn't decode value: {value}")
+
+
+def decode_attributes(attributes: Iterable[KeyValue]):
+    return {kv.key: decode_value(kv.value) for kv in attributes}
+
+
+class FakeTraceService(TraceServiceServicer):
+
+    def __init__(self):
+        self.request = None
+        self.evt = threading.Event()
+
+    def Export(self, request, context):
+        self.request = request
+        self.evt.set()
+        return ExportTraceServiceResponse()
+
+
+@pytest.fixture
+def trace_service():
+    """Fixture to set up a fake gRPC trace service"""
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=1))
+    service = FakeTraceService()
+    add_TraceServiceServicer_to_server(service, server)
+    server.add_insecure_port(FAKE_TRACE_SERVER_ADDRESS)
+    server.start()
+
+    yield service
+
+    server.stop(None)
+
+
+def test_traces(trace_service):
+    os.environ[OTEL_EXPORTER_OTLP_TRACES_INSECURE] = "true"
+
+    sampling_params = SamplingParams(temperature=0.01,
+                                     top_p=0.1,
+                                     max_tokens=256)
+    model = "facebook/opt-125m"
+    llm = LLM(
+        model=model,
+        otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS,
+    )
+    prompts = ["This is a short prompt"]
+    outputs = llm.generate(prompts, sampling_params=sampling_params)
+
+    timeout = 5
+    if not trace_service.evt.wait(timeout):
+        raise TimeoutError(
+            f"The fake trace service didn't receive a trace within "
+            f"the {timeout} seconds timeout")
+
+    attributes = decode_attributes(trace_service.request.resource_spans[0].
+                                   scope_spans[0].spans[0].attributes)
+    assert attributes.get(SpanAttributes.LLM_RESPONSE_MODEL) == model
+    assert attributes.get(
+        SpanAttributes.LLM_REQUEST_ID) == outputs[0].request_id
+    assert attributes.get(
+        SpanAttributes.LLM_REQUEST_TEMPERATURE) == sampling_params.temperature
+    assert attributes.get(
+        SpanAttributes.LLM_REQUEST_TOP_P) == sampling_params.top_p
+    assert attributes.get(
+        SpanAttributes.LLM_REQUEST_MAX_TOKENS) == sampling_params.max_tokens
+    assert attributes.get(
+        SpanAttributes.LLM_REQUEST_BEST_OF) == sampling_params.best_of
+    assert attributes.get(SpanAttributes.LLM_REQUEST_N) == sampling_params.n
+    assert attributes.get(SpanAttributes.LLM_USAGE_PROMPT_TOKENS) == len(
+        outputs[0].prompt_token_ids)
+    completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs)
+    assert attributes.get(
+        SpanAttributes.LLM_USAGE_COMPLETION_TOKENS) == completion_tokens
+    metrics = outputs[0].metrics
+    assert attributes.get(
+        SpanAttributes.LLM_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue
+    ttft = metrics.first_token_time - metrics.arrival_time
+    assert attributes.get(
+        SpanAttributes.LLM_LATENCY_TIME_TO_FIRST_TOKEN) == ttft
+    e2e_time = metrics.finished_time - metrics.arrival_time
+    assert attributes.get(SpanAttributes.LLM_LATENCY_E2E) == e2e_time
--- a/tests/utils.py
+++ b/tests/utils.py
+import functools
 import os
+import signal
 import subprocess
 import sys
 import time
 import warnings
 from contextlib import contextmanager
-from typing import List
+from pathlib import Path
+from typing import Any, Dict, List, Optional

 import openai
 import ray
 import requests
+from transformers import AutoTokenizer

 from vllm.distributed import (ensure_model_parallel_initialized,
                              init_distributed_environment)
 from vllm.entrypoints.openai.cli_args import make_arg_parser
-from vllm.utils import get_open_port
+from vllm.utils import FlexibleArgumentParser, get_open_port, is_hip

-# Path to root of repository so that utilities can be imported by ray workers
-VLLM_PATH = os.path.abspath(os.path.join(__file__, os.pardir, os.pardir))
+if is_hip():
+    from amdsmi import (amdsmi_get_gpu_vram_usage,
+                        amdsmi_get_processor_handles, amdsmi_init,
+                        amdsmi_shut_down)
+
+    @contextmanager
+    def _nvml():
+        try:
+            amdsmi_init()
+            yield
+        finally:
+            amdsmi_shut_down()
+else:
+    from pynvml import (nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo,
+                        nvmlInit, nvmlShutdown)
+
+    @contextmanager
+    def _nvml():
+        try:
+            nvmlInit()
+            yield
+        finally:
+            nvmlShutdown()
+
+
+VLLM_PATH = Path(__file__).parent.parent
+"""Path to root of the vLLM repository."""


 class RemoteOpenAIServer:
    DUMMY_API_KEY = "token-abc123"  # vLLM's OpenAI server does not need API key
-    MAX_SERVER_START_WAIT_S = 600  # wait for server to start for 60 seconds
-
-    @ray.remote(num_gpus=1)
-    class _RemoteRunner:
-
-        def __init__(self, cli_args: List[str], *, wait_url: str,
-                     wait_timeout: float) -> None:
-            env = os.environ.copy()
-            env["PYTHONUNBUFFERED"] = "1"
-            self.proc = subprocess.Popen(
-                [
-                    sys.executable, "-m", "vllm.entrypoints.openai.api_server",
-                    *cli_args
-                ],
-                env=env,
-                stdout=sys.stdout,
-                stderr=sys.stderr,
-            )
+    MAX_SERVER_START_WAIT_S = 120  # wait for server to start for 120 seconds

-            self._wait_for_server(url=wait_url, timeout=wait_timeout)
-
-        def ready(self):
-            return True
-
-        def _wait_for_server(self, *, url: str, timeout: float):
-            # run health check
-            start = time.time()
-            while True:
-                try:
-                    if requests.get(url).status_code == 200:
-                        break
-                except Exception as err:
-                    if self.proc.poll() is not None:
-                        raise RuntimeError(
-                            "Server exited unexpectedly.") from err
-
-                    time.sleep(0.5)
-                    if time.time() - start > timeout:
-                        raise RuntimeError(
-                            "Server failed to start in time.") from err
-
-        def __del__(self):
-            if hasattr(self, "proc"):
-                self.proc.terminate()
-
-    def __init__(self, cli_args: List[str], *, auto_port: bool = True) -> None:
+    def __init__(
+        self,
+        model: str,
+        cli_args: List[str],
+        *,
+        env_dict: Optional[Dict[str, str]] = None,
+        auto_port: bool = True,
+    ) -> None:
        if auto_port:
            if "-p" in cli_args or "--port" in cli_args:
                raise ValueError("You have manually specified the port"
@@ -74,17 +67,53 @@ class RemoteOpenAIServer:

            cli_args = cli_args + ["--port", str(get_open_port())]

-        parser = make_arg_parser()
+        parser = FlexibleArgumentParser(
+            description="vLLM's remote OpenAI server.")
+        parser = make_arg_parser(parser)
        args = parser.parse_args(cli_args)
        self.host = str(args.host or 'localhost')
        self.port = int(args.port)

-        self._runner = self._RemoteRunner.remote(
-            cli_args,
-            wait_url=self.url_for("health"),
-            wait_timeout=self.MAX_SERVER_START_WAIT_S)
+        env = os.environ.copy()
+        # the current process might initialize cuda,
+        # to be safe, we should use spawn method
+        env['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
+        if env_dict is not None:
+            env.update(env_dict)
+        self.proc = subprocess.Popen(["vllm", "serve"] + [model] + cli_args,
+                                     env=env,
+                                     stdout=sys.stdout,
+                                     stderr=sys.stderr)
+        self._wait_for_server(url=self.url_for("health"),
+                              timeout=self.MAX_SERVER_START_WAIT_S)
+
+    def __enter__(self):
+        return self

-        self._wait_until_ready()
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.proc.terminate()
+        try:
+            self.proc.wait(3)
+        except subprocess.TimeoutExpired:
+            # force kill if needed
+            self.proc.kill()
+
+    def _wait_for_server(self, *, url: str, timeout: float):
+        # run health check
+        start = time.time()
+        while True:
+            try:
+                if requests.get(url).status_code == 200:
+                    break
+            except Exception as err:
+                result = self.proc.poll()
+                if result is not None and result != 0:
+                    raise RuntimeError("Server exited unexpectedly.") from err
+
+                time.sleep(0.5)
+                if time.time() - start > timeout:
+                    raise RuntimeError(
+                        "Server failed to start in time.") from err

    @property
    def url_root(self) -> str:
@@ -93,9 +122,6 @@ class RemoteOpenAIServer:
    def url_for(self, *parts: str) -> str:
        return self.url_root + "/" + "/".join(parts)

-    def _wait_until_ready(self) -> None:
-        ray.get(self._runner.ready.remote())
-
    def get_client(self):
        return openai.OpenAI(
            base_url=self.url_for("v1"),
@@ -109,6 +135,141 @@ class RemoteOpenAIServer:
        )


+def compare_two_settings(model: str,
+                         arg1: List[str],
+                         arg2: List[str],
+                         env1: Optional[Dict[str, str]] = None,
+                         env2: Optional[Dict[str, str]] = None):
+    """
+    Launch API server with two different sets of arguments/environments
+    and compare the results of the API calls.
+
+    Args:
+        model: The model to test.
+        arg1: The first set of arguments to pass to the API server.
+        arg2: The second set of arguments to pass to the API server.
+        env1: The first set of environment variables to pass to the API server.
+        env2: The second set of environment variables to pass to the API server.
+    """
+
+    tokenizer = AutoTokenizer.from_pretrained(model)
+
+    prompt = "Hello, my name is"
+    token_ids = tokenizer(prompt)["input_ids"]
+    results = []
+    for args, env in ((arg1, env1), (arg2, env2)):
+        with RemoteOpenAIServer(model, args, env_dict=env) as server:
+            client = server.get_client()
+
+            # test models list
+            models = client.models.list()
+            models = models.data
+            served_model = models[0]
+            results.append({
+                "test": "models_list",
+                "id": served_model.id,
+                "root": served_model.root,
+            })
+
+            # test with text prompt
+            completion = client.completions.create(model=model,
+                                                   prompt=prompt,
+                                                   max_tokens=5,
+                                                   temperature=0.0)
+
+            results.append({
+                "test": "single_completion",
+                "text": completion.choices[0].text,
+                "finish_reason": completion.choices[0].finish_reason,
+                "usage": completion.usage,
+            })
+
+            # test using token IDs
+            completion = client.completions.create(
+                model=model,
+                prompt=token_ids,
+                max_tokens=5,
+                temperature=0.0,
+            )
+
+            results.append({
+                "test": "token_ids",
+                "text": completion.choices[0].text,
+                "finish_reason": completion.choices[0].finish_reason,
+                "usage": completion.usage,
+            })
+
+            # test seeded random sampling
+            completion = client.completions.create(model=model,
+                                                   prompt=prompt,
+                                                   max_tokens=5,
+                                                   seed=33,
+                                                   temperature=1.0)
+
+            results.append({
+                "test": "seeded_sampling",
+                "text": completion.choices[0].text,
+                "finish_reason": completion.choices[0].finish_reason,
+                "usage": completion.usage,
+            })
+
+            # test seeded random sampling with multiple prompts
+            completion = client.completions.create(model=model,
+                                                   prompt=[prompt, prompt],
+                                                   max_tokens=5,
+                                                   seed=33,
+                                                   temperature=1.0)
+
+            results.append({
+                "test":
+                "seeded_sampling",
+                "text": [choice.text for choice in completion.choices],
+                "finish_reason":
+                [choice.finish_reason for choice in completion.choices],
+                "usage":
+                completion.usage,
+            })
+
+            # test simple list
+            batch = client.completions.create(
+                model=model,
+                prompt=[prompt, prompt],
+                max_tokens=5,
+                temperature=0.0,
+            )
+
+            results.append({
+                "test": "simple_list",
+                "text0": batch.choices[0].text,
+                "text1": batch.choices[1].text,
+            })
+
+            # test streaming
+            batch = client.completions.create(
+                model=model,
+                prompt=[prompt, prompt],
+                max_tokens=5,
+                temperature=0.0,
+                stream=True,
+            )
+            texts = [""] * 2
+            for chunk in batch:
+                assert len(chunk.choices) == 1
+                choice = chunk.choices[0]
+                texts[choice.index] += choice.text
+            results.append({
+                "test": "streaming",
+                "texts": texts,
+            })
+
+    n = len(results) // 2
+    arg1_results = results[:n]
+    arg2_results = results[n:]
+    for arg1_result, arg2_result in zip(arg1_results, arg2_results):
+        assert arg1_result == arg2_result, \
+            f"Results for {model=} are not the same with {arg1=} and {arg2=}"
+
+
 def init_test_distributed_environment(
    tp_size: int,
    pp_size: int,
@@ -125,13 +286,15 @@ def init_test_distributed_environment(
    ensure_model_parallel_initialized(tp_size, pp_size)


-def multi_process_tensor_parallel(
+def multi_process_parallel(
    tp_size: int,
    pp_size: int,
-    test_target,
+    test_target: Any,
 ) -> None:
    # Using ray helps debugging the error when it failed
    # as compared to multiprocessing.
+    # NOTE: We need to set working_dir for distributed tests,
+    # otherwise we may get import errors on ray workers
    ray.init(runtime_env={"working_dir": VLLM_PATH})

    distributed_init_port = get_open_port()
@@ -154,3 +317,83 @@ def error_on_warning():
        warnings.simplefilter("error")

        yield
+
+
+@_nvml()
+def wait_for_gpu_memory_to_clear(devices: List[int],
+                                 threshold_bytes: int,
+                                 timeout_s: float = 120) -> None:
+    # Use nvml instead of pytorch to reduce measurement error from torch cuda
+    # context.
+    start_time = time.time()
+    while True:
+        output: Dict[int, str] = {}
+        output_raw: Dict[int, float] = {}
+        for device in devices:
+            if is_hip():
+                dev_handle = amdsmi_get_processor_handles()[device]
+                mem_info = amdsmi_get_gpu_vram_usage(dev_handle)
+                gb_used = mem_info["vram_used"] / 2**10
+            else:
+                dev_handle = nvmlDeviceGetHandleByIndex(device)
+                mem_info = nvmlDeviceGetMemoryInfo(dev_handle)
+                gb_used = mem_info.used / 2**30
+            output_raw[device] = gb_used
+            output[device] = f'{gb_used:.02f}'
+
+        print('gpu memory used (GB): ', end='')
+        for k, v in output.items():
+            print(f'{k}={v}; ', end='')
+        print('')
+
+        dur_s = time.time() - start_time
+        if all(v <= (threshold_bytes / 2**30) for v in output_raw.values()):
+            print(f'Done waiting for free GPU memory on devices {devices=} '
+                  f'({threshold_bytes/2**30=}) {dur_s=:.02f}')
+            break
+
+        if dur_s >= timeout_s:
+            raise ValueError(f'Memory of devices {devices=} not free after '
+                             f'{dur_s=:.02f} ({threshold_bytes/2**30=})')
+
+        time.sleep(5)
+
+
+def fork_new_process_for_each_test(f):
+    """Decorator to fork a new process for each test function.
+    See https://github.com/vllm-project/vllm/issues/7053 for more details.
+    """
+
+    @functools.wraps(f)
+    def wrapper(*args, **kwargs):
+        # Make the process the leader of its own process group
+        # to avoid sending SIGTERM to the parent process
+        os.setpgrp()
+        from _pytest.outcomes import Skipped
+        pid = os.fork()
+        if pid == 0:
+            try:
+                f(*args, **kwargs)
+            except Skipped as e:
+                # convert Skipped to exit code 0
+                print(str(e))
+                os._exit(0)
+            except Exception:
+                import traceback
+                traceback.print_exc()
+                os._exit(1)
+            else:
+                os._exit(0)
+        else:
+            pgid = os.getpgid(pid)
+            _pid, _exitcode = os.waitpid(pid, 0)
+            # ignore SIGTERM signal itself
+            old_singla_handler = signal.signal(signal.SIGTERM, signal.SIG_IGN)
+            # kill all child processes
+            os.killpg(pgid, signal.SIGTERM)
+            # restore the signal handler
+            signal.signal(signal.SIGTERM, old_singla_handler)
+            assert _exitcode == 0, (f"function {f} failed when called with"
+                                    f" args {args} and kwargs {kwargs}")
+
+    return wrapper
--- a/tests/worker/test_model_input.py
+++ b/tests/worker/test_model_input.py
+import dataclasses
+from typing import List, Tuple, Type
+
+import torch
+
+from vllm.attention import AttentionMetadata, AttentionMetadataBuilder
+from vllm.attention.backends.abstract import AttentionBackend
+from vllm.model_executor import SamplingMetadata
+from vllm.model_executor.pooling_metadata import PoolingMetadata
+from vllm.worker.embedding_model_runner import (
+    ModelInputForGPUWithPoolingMetadata)
+from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata
+
+
+class MockAttentionBackend(AttentionBackend):
+
+    @staticmethod
+    def get_name() -> str:
+        raise NotImplementedError
+
+    @staticmethod
+    def get_impl_cls():
+        raise NotImplementedError
+
+    @staticmethod
+    def get_metadata_cls() -> Type["AttentionMetadata"]:
+        return AttentionMetadata
+
+    @staticmethod
+    def get_builder_cls() -> Type["AttentionMetadataBuilder"]:
+        raise AttentionMetadataBuilder
+
+    @staticmethod
+    def get_kv_cache_shape(
+        num_blocks: int,
+        block_size: int,
+        num_kv_heads: int,
+        head_size: int,
+    ) -> Tuple[int, ...]:
+        raise NotImplementedError
+
+    @staticmethod
+    def swap_blocks(
+        src_kv_cache: torch.Tensor,
+        dst_kv_cache: torch.Tensor,
+        src_to_dst: torch.Tensor,
+    ) -> None:
+        pass
+
+    @staticmethod
+    def copy_blocks(
+        kv_caches: List[torch.Tensor],
+        src_to_dists: torch.Tensor,
+    ) -> None:
+        pass
+
+
+def test_model_runner_input():
+    sampling_metadata = SamplingMetadata(
+        ["seq_group"],
+        "selected_token_indices",
+        "categorized_sample_indices",
+        "num_prompts",
+    )
+    attn_metadata = AttentionMetadata(
+        num_prefills=1,
+        num_prefill_tokens=2,
+        num_decode_tokens=3,
+        slot_mapping=torch.zeros(1),
+    )
+    model_input = ModelInputForGPUWithSamplingMetadata(
+        input_tokens=torch.ones(10),
+        input_positions=torch.ones(10),
+        sampling_metadata=sampling_metadata,
+        attn_metadata=attn_metadata)
+
+    assert isinstance(model_input, ModelInputForGPUWithSamplingMetadata)
+
+    # Test round trip serialization.
+    tensor_dict = model_input.as_broadcastable_tensor_dict()
+    attn_backend = MockAttentionBackend()
+    received_model_input = (
+        ModelInputForGPUWithSamplingMetadata.from_broadcasted_tensor_dict(
+            tensor_dict, attn_backend=attn_backend))
+    # Check that received copy has correct values.
+    assert isinstance(received_model_input,
+                      ModelInputForGPUWithSamplingMetadata)
+    assert received_model_input.input_tokens is not None
+    assert (
+        received_model_input.input_tokens == model_input.input_tokens).all()
+    assert received_model_input.input_positions is not None
+    assert (received_model_input.input_positions == model_input.input_positions
+            ).all()
+    assert received_model_input.multi_modal_kwargs is None
+    assert (received_model_input.multi_modal_kwargs ==
+            model_input.multi_modal_kwargs)
+    assert received_model_input.lora_requests is None
+    assert received_model_input.lora_requests == model_input.lora_requests
+    assert received_model_input.lora_mapping is None
+    assert received_model_input.lora_mapping == model_input.lora_mapping
+    for field in dataclasses.fields(AttentionMetadata):
+        assert getattr(received_model_input.attn_metadata, field.name,
+                       None) == getattr(attn_metadata, field.name, None)
+    # For sampling metadata, only selected_token_indices is copied.
+    assert (received_model_input.sampling_metadata.selected_token_indices ==
+            sampling_metadata.selected_token_indices)
+    assert received_model_input.sampling_metadata.seq_groups is None
+
+
+def test_embedding_model_runner_input():
+    pooling_metadata = PoolingMetadata(
+        seq_groups=[[0]],
+        seq_data={},
+        prompt_lens=[1],
+    )
+    attn_metadata = AttentionMetadata(
+        num_prefills=1,
+        num_prefill_tokens=2,
+        num_decode_tokens=3,
+        slot_mapping=torch.zeros(1),
+    )
+    model_input = ModelInputForGPUWithPoolingMetadata(
+        input_tokens=torch.ones(10),
+        input_positions=torch.ones(10),
+        pooling_metadata=pooling_metadata,
+        attn_metadata=attn_metadata)
+
+    assert isinstance(model_input, ModelInputForGPUWithPoolingMetadata)
+
+    # Test round trip serialization.
+    tensor_dict = model_input.as_broadcastable_tensor_dict()
+    attn_backend = MockAttentionBackend()
+    received_model_input = (
+        ModelInputForGPUWithPoolingMetadata.from_broadcasted_tensor_dict(
+            tensor_dict, attn_backend=attn_backend))
+    # Check that received copy has correct values.
+    assert isinstance(received_model_input,
+                      ModelInputForGPUWithPoolingMetadata)
+    assert received_model_input.input_tokens is not None
+    assert (
+        received_model_input.input_tokens == model_input.input_tokens).all()
+    assert received_model_input.input_positions is not None
+    assert (received_model_input.input_positions == model_input.input_positions
+            ).all()
+    assert received_model_input.multi_modal_kwargs is None
+    assert (received_model_input.multi_modal_kwargs ==
+            model_input.multi_modal_kwargs)
+    assert received_model_input.lora_requests is None
+    assert received_model_input.lora_requests == model_input.lora_requests
+    assert received_model_input.lora_mapping is None
+    assert received_model_input.lora_mapping == model_input.lora_mapping
+    for field in dataclasses.fields(AttentionMetadata):
+        assert getattr(received_model_input.attn_metadata, field.name,
+                       None) == getattr(attn_metadata, field.name, None)
+    # Pooling metadata is not broadcast.
+    assert received_model_input.pooling_metadata is None
--- a/tests/worker/test_model_runner.py
+++ b/tests/worker/test_model_runner.py
+from typing import List
+
 import pytest
 import torch

@@ -21,6 +23,7 @@ def _create_model_runner(model: str, *args, **kwargs) -> ModelRunner:
        cache_config=engine_config.cache_config,
        load_config=engine_config.load_config,
        lora_config=engine_config.lora_config,
+        prompt_adapter_config=engine_config.prompt_adapter_config,
        is_driver_worker=True,
    )
    return model_runner
@@ -35,8 +38,8 @@ def test_prepare_prompt(batch_size):
        enable_chunked_prefill=False,
    )

-    seq_lens = []
-    seq_group_metadata_list = []
+    seq_lens: List[int] = []
+    seq_group_metadata_list: List[SequenceGroupMetadata] = []
    block_tables = {0: [1]}
    for i in range(batch_size):
        # make sure all tokens fit into one block
@@ -59,12 +62,13 @@ def test_prepare_prompt(batch_size):
        expected_selected_token_indices.append(selected_token_start_idx +
                                               seq_len - 1)
        selected_token_start_idx += seq_len
-    model_input = model_runner._prepare_model_input(seq_group_metadata_list)
+    model_input = model_runner._prepare_model_input_tensors(
+        seq_group_metadata_list)
    input_tokens = model_input.input_tokens
    input_positions = model_input.input_positions
    attn_metadata = model_input.attn_metadata
    return_seq_lens = model_input.seq_lens
-    slot_mapping = model_input.slot_mapping
+    slot_mapping = attn_metadata.slot_mapping
    assert return_seq_lens == seq_lens
    assert len(slot_mapping) == len(input_tokens)

@@ -151,15 +155,14 @@ def test_prepare_decode_cuda_graph(batch_size):
        enable_chunked_prefill=False,
    )

-    context_lens = []
-    seq_group_metadata_list = []
+    context_lens: List[int] = []
+    seq_group_metadata_list: List[SequenceGroupMetadata] = []
    # Assume each seq group finishes prefill.
    for i in range(batch_size):
        # make sure all tokens fit into one block
        context_len = i % (model_runner.block_size - 1) + 1
        context_lens.append(context_len)
-        seq_data = list(range(context_len))
-        seq_data = SequenceData(seq_data)
+        seq_data = SequenceData(list(range(context_len)))
        seq_data.update_num_computed_tokens(context_len)
        # Append one token ID since prefill is finished.
        seq_data.append_token_id(1, 0)
@@ -173,10 +176,11 @@ def test_prepare_decode_cuda_graph(batch_size):
        assert seq_group_metadata.token_chunk_size == 1
        seq_group_metadata_list.append(seq_group_metadata)

-    model_input = model_runner._prepare_model_input(seq_group_metadata_list)
+    model_input = model_runner._prepare_model_input_tensors(
+        seq_group_metadata_list)
    input_tokens, input_positions, attn_metadata, slot_mapping = (
        model_input.input_tokens, model_input.input_positions,
-        model_input.attn_metadata, model_input.slot_mapping)
+        model_input.attn_metadata, model_input.attn_metadata.slot_mapping)
    assert len(slot_mapping) == len(input_tokens)

    expected_bs = _get_graph_batch_size(len(seq_group_metadata_list))
@@ -189,6 +193,7 @@ def test_prepare_decode_cuda_graph(batch_size):
    for _ in range(expected_bs - len(seq_lens)):
        seq_lens.append(1)
    assert attn_metadata.seq_lens == seq_lens
+    assert attn_metadata.num_decode_tokens == len(seq_lens)
    start_idx = 0
    start_loc = [start_idx]
    for _ in context_lens:
@@ -257,33 +262,30 @@ def test_empty_seq_group():
        dtype="float16",
        enforce_eager=False,
    )
-    seq_group_metadata_list = []
-    model_input = model_runner._prepare_model_input(seq_group_metadata_list)
-    input_tokens, input_positions, attn_metadata, slot_mapping = (
+    seq_group_metadata_list: List[SequenceGroupMetadata] = []
+    model_input = model_runner._prepare_model_input_tensors(
+        seq_group_metadata_list)
+    input_tokens, input_positions, attn_metadata = (
        model_input.input_tokens,
        model_input.input_positions,
        model_input.attn_metadata,
-        model_input.slot_mapping,
    )
-    assert len(input_tokens) == 0
-    assert len(input_positions) == 0
+    assert input_tokens is None
+    assert input_positions is None
    assert attn_metadata is None
-    assert len(slot_mapping) == 0
-
-    model_input = model_runner._prepare_model_input(seq_group_metadata_list)
-    (input_tokens, input_positions, attn_metadata, slot_mapping,
-     return_seq_lens) = (
-         model_input.input_tokens,
-         model_input.input_positions,
-         model_input.attn_metadata,
-         model_input.slot_mapping,
-         model_input.seq_lens,
-     )
-    assert len(input_tokens) == 0
-    assert len(input_positions) == 0
+
+    model_input = model_runner._prepare_model_input_tensors(
+        seq_group_metadata_list)
+    (input_tokens, input_positions, attn_metadata, return_seq_lens) = (
+        model_input.input_tokens,
+        model_input.input_positions,
+        model_input.attn_metadata,
+        model_input.seq_lens,
+    )
+    assert input_tokens is None
+    assert input_positions is None
    assert attn_metadata is None
-    assert len(slot_mapping) == 0
-    assert len(return_seq_lens) == 0
+    assert return_seq_lens is None


 @pytest.fixture
@@ -310,10 +312,10 @@ def test_hybrid_batches(batch_size, enforce_eager, distributed_init):
    )

    # Add prefill requests.
-    seq_lens = []
-    seq_group_metadata_list = []
-    prefill_metadata_list = []
-    decode_metadata_list = []
+    seq_lens: List[int] = []
+    seq_group_metadata_list: List[SequenceGroupMetadata] = []
+    prefill_metadata_list: List[SequenceGroupMetadata] = []
+    decode_metadata_list: List[SequenceGroupMetadata] = []
    block_tables = {0: [1]}
    prefill_batch_size = batch_size // 2
    decode_batch_size = batch_size - prefill_batch_size
@@ -352,8 +354,12 @@ def test_hybrid_batches(batch_size, enforce_eager, distributed_init):
        seq_group_metadata_list.append(seq_group_metadata)
        decode_metadata_list.append(seq_group_metadata)

-    (input_tokens, input_positions, attn_metadata, _, _, _,
-     _) = model_runner.prepare_input_tensors(seq_group_metadata_list)
+    model_input = model_runner.prepare_model_input(seq_group_metadata_list)
+    (input_tokens, input_positions, attn_metadata) = (
+        model_input.input_tokens,
+        model_input.input_positions,
+        model_input.attn_metadata,
+    )

    prefill_meta_actual = attn_metadata.prefill_metadata
    decode_meta_actual = attn_metadata.decode_metadata
@@ -366,7 +372,7 @@ def test_hybrid_batches(batch_size, enforce_eager, distributed_init):

    # Verify attn metadata is consistent. We don't need to test individual
    # values here because they are tested above.
-    attn_metadata = model_runner._prepare_model_input(
+    attn_metadata = model_runner._prepare_model_input_tensors(
        seq_group_metadata_list).attn_metadata

    for attr_expected, attr_actual in zip(vars(attn_metadata.prefill_metadata),

--- a/tests/worker/test_swap.py
+++ b/tests/worker/test_swap.py
@@ -39,8 +39,8 @@ def test_swap() -> None:
        num_cpu_blocks=engine_config.cache_config.num_cpu_blocks)

    # Randomly initialize the cache.
-    gpu_cache = worker.cache_engine.gpu_cache
-    cpu_cache = worker.cache_engine.cpu_cache
+    gpu_cache = worker.cache_engine[0].gpu_cache
+    cpu_cache = worker.cache_engine[0].cpu_cache
    num_layers = len(gpu_cache)
    for i in range(num_layers):
        gpu_key_cache, gpu_value_cache = gpu_cache[i]

--- a/vllm/__init__.py
+++ b/vllm/__init__.py
@@ -5,21 +5,20 @@ from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.engine.llm_engine import LLMEngine
 from vllm.entrypoints.llm import LLM
 from vllm.executor.ray_utils import initialize_ray_cluster
-from vllm.inputs import PromptStrictInputs, TextPrompt, TokensPrompt
+from vllm.inputs import PromptInputs, TextPrompt, TokensPrompt
 from vllm.model_executor.models import ModelRegistry
 from vllm.outputs import (CompletionOutput, EmbeddingOutput,
                          EmbeddingRequestOutput, RequestOutput)
 from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import SamplingParams
-from vllm.version import __dcu_version__
-
-from .version import __version__
+from vllm.version import __commit__, __version__, __dcu_version__

 __all__ = [
+    "__commit__",
    "__version__",
    "LLM",
    "ModelRegistry",
-    "PromptStrictInputs",
+    "PromptInputs",
    "TextPrompt",
    "TokensPrompt",
    "SamplingParams",

--- a/vllm/_core_ext.py
+++ b/vllm/_core_ext.py
+import importlib.util
+from enum import Enum
+from typing import TYPE_CHECKING, Optional, Union
+
+import torch
+
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+core_C_available = importlib.util.find_spec('._core_C', 'vllm') is not None
+
+
+# Mirrors enum in `core/scalar_type.hpp`
+class NanRepr(Enum):
+    NONE = 0  # nans are not supported
+    IEEE_754 = 1  # nans are: Exp all 1s, mantissa not all 0s
+    EXTD_RANGE_MAX_MIN = 2  # nans are: Exp all 1s, mantissa all 1s
+
+
+if TYPE_CHECKING or not core_C_available:
+    # On platforms were we cannot use/build the C++ core extension (i.e. namely
+    # neuron and tpu), we define the mock ScalarType class here that partially
+    # mimics the C++ ScalarType class.
+    #
+    # We also use this provide type signatures to the Python LSP for the methods
+    # in the C++ ScalarType class. So these type signatures should be kept
+    # in sync with csrc/core/scalar_type.hpp
+
+    from dataclasses import dataclass
+
+    @dataclass(frozen=True)
+    class ScalarType:
+        """
+        ScalarType can represent a wide range of floating point and integer 
+        types, in particular it can be used to represent sub-byte data types 
+        (something that torch.dtype currently does not support). It is also 
+        capable of  representing types with a bias, i.e.:
+          `stored_value = value + bias`, 
+        this is useful for quantized types (e.g. standard GPTQ 4bit uses a bias 
+        of 8). The implementation for this class can be found in 
+        csrc/core/scalar_type.hpp, these type signatures should be kept in sync 
+        with that file.
+        """
+
+        exponent: int
+        """
+        Number of bits in the exponent if this is a floating point type
+        (zero if this an integer type)
+        """
+
+        mantissa: int
+        """
+        Number of bits in the mantissa if this is a floating point type,
+        or the number bits representing an integer excluding the sign bit if 
+        this an integer type.
+        """
+
+        bias: int
+        """
+        bias used to encode the values in this scalar type 
+        (value = stored_value - bias, default 0) for example if we store the 
+        type as an unsigned integer with a bias of 128 then the value 0 will be 
+        stored as 128 and -1 will be stored as 127 and 1 will be stored as 129.
+        """
+
+        signed: bool
+        "If the type is signed (i.e. has a sign bit)"
+
+        _finite_values_only: bool = False
+        """
+        Private: if NANs are supported, used `has_infs()` instead.
+        """
+
+        nan_repr: int = NanRepr.IEEE_754.value
+        """
+        How NaNs are represent in this scalar type, returns NanRepr value. 
+        (not applicable for integer types)
+        """
+
+        @property
+        def size_bits(self):
+            return self.exponent + self.mantissa + int(self.signed)
+
+        def min(self) -> Union[int, float]:
+            """
+            Min representable value for this scalar type. 
+            (accounting for bias if there is one)
+            """
+            raise NotImplementedError
+
+        def max(self) -> Union[int, float]:
+            """
+            Max representable value for this scalar type. 
+            (accounting for bias if there is one)
+            """
+            raise NotImplementedError
+
+        def is_signed(self) -> bool:
+            """
+            If the type is signed (i.e. has a sign bit), same as `signed`
+            added for consistency with:
+            https://pytorch.org/docs/stable/generated/torch.Tensor.is_signed.html
+            """
+            ...
+
+        def is_floating_point(self):
+            "If the type is a floating point type"
+            return self.exponent != 0
+
+        def is_integer(self):
+            "If the type is an integer type"
+            return self.exponent == 0
+
+        def has_bias(self):
+            "If the type has a non-zero bias"
+            return self.bias != 0
+
+        def has_infs(self):
+            "If the type is floating point and supports infinity"
+            return not self._finite_values_only
+
+        def has_nans(self):
+            return self.nan_repr != NanRepr.NONE.value
+
+        def is_ieee_754(self) -> bool:
+            """
+            If the type is a floating point type that follows IEEE 754 
+            conventions
+            """
+            return self.nan_repr == NanRepr.IEEE_754.value and \
+                not self._finite_values_only
+
+        def __str__(self) -> str:
+            raise NotImplementedError
+
+        def __repr__(self) -> str:
+            raise NotImplementedError
+
+        #
+        # Convenience Constructors
+        #
+
+        @classmethod
+        def int_(cls, size_bits: int, bias: Optional[int]) -> 'ScalarType':
+            "Create a signed integer scalar type (size_bits includes sign-bit)."
+            return cls(size_bits - 1, size_bits, bias if bias else 0, True)
+
+        @classmethod
+        def uint(cls, size_bits: int, bias: Optional[int]) -> 'ScalarType':
+            """Create a unsigned integer scalar type."""
+            return cls(size_bits, size_bits, bias if bias else 0, False)
+
+        @classmethod
+        def float_IEEE754(cls, exponent: int, mantissa: int) -> 'ScalarType':
+            """
+            Create a standard floating point type 
+            (i.e. follows IEEE 754 conventions).
+            """
+            return cls(exponent, mantissa, 0, True)
+
+        @classmethod
+        def float_(cls, exponent: int, mantissa: int, finite_values_only: bool,
+                   nan_repr: int):
+            """
+            Create a non-standard floating point type 
+            (i.e. does not follow IEEE 754 conventions).
+            """
+            return cls(exponent, mantissa, 0, True, finite_values_only,
+                       nan_repr)
+
+elif core_C_available:
+    try:
+        import vllm._core_C  # noqa: F401
+    except ImportError as e:
+        logger.warning("Failed to import from vllm._core_C with %r", e)
+
+    ScalarType = torch.classes._core_C.ScalarType
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
 import contextlib
 import functools
-from typing import List, Optional, Tuple, Type
+from typing import List, Optional, Tuple, Union
+
 import torch

+from vllm._core_ext import ScalarType
 from vllm.logger import init_logger

-logger = init_logger(__name__)
-
 try:
    from lmslim import quant_ops 
 except Exception:
    print("INFO: Please install lmslim if you want to infer gptq or awq model.\n") 

+logger = init_logger(__name__)
+
 try:
    import vllm._C
 except ImportError as e:
    logger.warning("Failed to import from vllm._C with %r", e)

-with contextlib.suppress(ImportError):
-    import vllm._moe_C
-
 with contextlib.suppress(ImportError):
    # ruff: noqa: F401
-    import vllm._punica_C
+    import vllm._moe_C


 def is_custom_op_supported(op_name: str) -> bool:
@@ -61,7 +60,7 @@ def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
 def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
    torch.ops._C.gelu_tanh_and_mul(out, x)
    
-
+    
 def silu_and_mul_opt(out: torch.Tensor, x: torch.Tensor) -> None:
    torch.ops._C.silu_and_mul_opt(out, x)

@@ -82,6 +81,10 @@ def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None:
    torch.ops._C.gelu_new(out, x)


+def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None:
+    torch.ops._C.gelu_quick(out, x)
+
+
 # page attention ops
 def paged_attention_v1(
    out: torch.Tensor,
@@ -96,7 +99,8 @@ def paged_attention_v1(
    max_seq_len: int,
    alibi_slopes: Optional[torch.Tensor],
    kv_cache_dtype: str,
-    kv_scale: float,
+    k_scale: float,
+    v_scale: float,
    tp_rank: int = 0,
    blocksparse_local_blocks: int = 0,
    blocksparse_vert_stride: int = 0,
@@ -106,8 +110,9 @@ def paged_attention_v1(
    torch.ops._C.paged_attention_v1(
        out, query, key_cache, value_cache, num_kv_heads, scale, block_tables,
        seq_lens, block_size, max_seq_len, alibi_slopes, kv_cache_dtype,
-        kv_scale, tp_rank, blocksparse_local_blocks, blocksparse_vert_stride,
-        blocksparse_block_size, blocksparse_head_sliding_step)
+        k_scale, v_scale, tp_rank, blocksparse_local_blocks,
+        blocksparse_vert_stride, blocksparse_block_size,
+        blocksparse_head_sliding_step)


 def paged_attention_v2(
@@ -126,7 +131,8 @@ def paged_attention_v2(
    max_seq_len: int,
    alibi_slopes: Optional[torch.Tensor],
    kv_cache_dtype: str,
-    kv_scale: float,
+    k_scale: float,
+    v_scale: float,
    tp_rank: int = 0,
    blocksparse_local_blocks: int = 0,
    blocksparse_vert_stride: int = 0,
@@ -136,7 +142,7 @@ def paged_attention_v2(
    torch.ops._C.paged_attention_v2(
        out, exp_sum, max_logits, tmp_out, query, key_cache, value_cache,
        num_kv_heads, scale, block_tables, seq_lens, block_size, max_seq_len,
-        alibi_slopes, kv_cache_dtype, kv_scale, tp_rank,
+        alibi_slopes, kv_cache_dtype, k_scale, v_scale, tp_rank,
        blocksparse_local_blocks, blocksparse_vert_stride,
        blocksparse_block_size, blocksparse_head_sliding_step)

@@ -155,7 +161,8 @@ def paged_attention_v1_opt(
    max_seq_len: int,
    alibi_slopes: Optional[torch.Tensor],
    kv_cache_dtype: str,
-    kv_scale: float,
+    k_scale: float,
+    v_scale: float,
    tp_rank: int = 0,
    blocksparse_local_blocks: int = 0,
    blocksparse_vert_stride: int = 0,
@@ -165,8 +172,9 @@ def paged_attention_v1_opt(
    torch.ops._C.paged_attention_v1_opt(
        out, query, key_cache, value_cache, num_kv_heads, scale, block_tables,
        seq_lens, block_size, max_seq_len, alibi_slopes, kv_cache_dtype,
-        kv_scale, tp_rank, blocksparse_local_blocks, blocksparse_vert_stride,
-        blocksparse_block_size, blocksparse_head_sliding_step)
+        k_scale, v_scale, tp_rank, blocksparse_local_blocks,
+        blocksparse_vert_stride, blocksparse_block_size,
+        blocksparse_head_sliding_step)


 def paged_attention_v2_opt(
@@ -185,7 +193,8 @@ def paged_attention_v2_opt(
    max_seq_len: int,
    alibi_slopes: Optional[torch.Tensor],
    kv_cache_dtype: str,
-    kv_scale: float,
+    k_scale: float,
+    v_scale: float,
    tp_rank: int = 0,
    blocksparse_local_blocks: int = 0,
    blocksparse_vert_stride: int = 0,
@@ -195,7 +204,7 @@ def paged_attention_v2_opt(
    torch.ops._C.paged_attention_v2_opt(
        out, exp_sum, max_logits, tmp_out, query, key_cache, value_cache,
        num_kv_heads, scale, block_tables, seq_lens, block_size, max_seq_len,
-        alibi_slopes, kv_cache_dtype, kv_scale, tp_rank,
+        alibi_slopes, kv_cache_dtype, k_scale, v_scale, tp_rank,
        blocksparse_local_blocks, blocksparse_vert_stride,
        blocksparse_block_size, blocksparse_head_sliding_step)
    
@@ -233,6 +242,7 @@ def fused_add_rms_norm(input: torch.Tensor, residual: torch.Tensor,
                       weight: torch.Tensor, epsilon: float) -> None:
    torch.ops._C.fused_add_rms_norm(input, residual, weight, epsilon)
    
+
 # layer norm ops (opt)
 def rms_norm_opt(out: torch.Tensor, input: torch.Tensor, weight: torch.Tensor,
             epsilon: float) -> None:
@@ -242,12 +252,24 @@ def rms_norm_opt(out: torch.Tensor, input: torch.Tensor, weight: torch.Tensor,
 def fused_add_rms_norm_opt(input: torch.Tensor, residual: torch.Tensor,
                       weight: torch.Tensor, epsilon: float) -> None:
    torch.ops._C.fused_add_rms_norm_opt(input, residual, weight, epsilon)
-    
+
+
+def advance_step(num_seqs: int, num_queries: int, block_size: int,
+                 input_tokens: torch.Tensor, sampled_token_ids: torch.Tensor,
+                 input_positions: torch.Tensor, seq_lens: torch.Tensor,
+                 slot_mapping: torch.Tensor,
+                 block_tables: torch.Tensor) -> None:
+    """Advance a step on GPU for existing inputs for a multi-step runner"""
+    return torch.ops._C.advance_step(num_seqs, num_queries, block_size,
+                                     input_tokens, sampled_token_ids,
+                                     input_positions, seq_lens, slot_mapping,
+                                     block_tables)
+
 # trans_w16
 def trans_w16_gemm(dst: torch.Tensor, src: torch.Tensor,
                row:int, col:int) -> None :
    torch.ops._C.trans_w16_gemm(dst,src,row,col)
-
+    

 # quantization ops
 # awq
@@ -314,7 +336,7 @@ def gptq_shuffle(q_weight: torch.Tensor, q_perm: torch.Tensor,
                 bit: int) -> None:
    quant_ops.gptq_shuffle(q_weight, q_perm, bit)
    # torch.ops._C.gptq_shuffle(q_weight, q_perm, bit)
-    
+

 # squeezellm
 def squeezellm_gemm(vec: torch.Tensor, mat: torch.Tensor, mul: torch.Tensor,
@@ -333,17 +355,24 @@ def marlin_gemm(a: torch.Tensor, b_q_weight: torch.Tensor,
 # marlin_24
 def gptq_marlin_24_gemm(a: torch.Tensor, b_q_weight: torch.Tensor,
                        b_meta: torch.Tensor, b_scales: torch.Tensor,
-                        workspace: torch.Tensor, num_bits: int, size_m: int,
-                        size_n: int, size_k: int) -> torch.Tensor:
+                        workspace: torch.Tensor, b_q_type: ScalarType,
+                        size_m: int, size_n: int, size_k: int) -> torch.Tensor:
    return torch.ops._C.gptq_marlin_24_gemm(a, b_q_weight, b_meta, b_scales,
-                                            workspace, num_bits, size_m,
+                                            workspace, b_q_type, size_m,
                                            size_n, size_k)


 # cutlass
-def cutlass_scaled_mm(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor,
+def cutlass_scaled_mm_supports_fp8(cuda_device_capability: int) -> bool:
+    return torch.ops._C.cutlass_scaled_mm_supports_fp8(cuda_device_capability)
+
+
+def cutlass_scaled_mm(a: torch.Tensor,
+                      b: torch.Tensor,
+                      scale_a: torch.Tensor,
                      scale_b: torch.Tensor,
-                      out_dtype: Type[torch.dtype]) -> torch.Tensor:
+                      out_dtype: torch.dtype,
+                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
    assert (b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0)
    assert (out_dtype is torch.bfloat16 or out_dtype is torch.float16)

@@ -351,7 +380,8 @@ def cutlass_scaled_mm(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor,
    n = b.shape[1]
    out = torch.empty((m, n), dtype=out_dtype, device=a.device)

-    torch.ops._C.cutlass_scaled_mm(out, a, b, scale_a, scale_b)
+    torch.ops._C.cutlass_scaled_mm(out, a, b, scale_a, scale_b, bias)
+
    return out


@@ -378,21 +408,48 @@ def gptq_marlin_repack(b_q_weight: torch.Tensor, perm: torch.Tensor,
                                           num_bits)


-def gptq_marlin_gemm(a: torch.Tensor, b_q_weight: torch.Tensor,
-                     b_scales: torch.Tensor, g_idx: torch.Tensor,
-                     perm: torch.Tensor, workspace: torch.Tensor,
-                     num_bits: int, size_m: int, size_n: int, size_k: int,
-                     is_k_full: bool) -> torch.Tensor:
-    return torch.ops._C.gptq_marlin_gemm(a, b_q_weight, b_scales, g_idx, perm,
-                                         workspace, num_bits, size_m, size_n,
-                                         size_k, is_k_full)
+# gptq_marlin
+def awq_marlin_repack(b_q_weight: torch.Tensor, size_k: int, size_n: int,
+                      num_bits: int) -> torch.Tensor:
+    return torch.ops._C.awq_marlin_repack(b_q_weight, size_k, size_n, num_bits)
+
+
+def gptq_marlin_gemm(a: torch.Tensor,
+                     b_q_weight: torch.Tensor,
+                     b_scales: torch.Tensor,
+                     b_zeros: torch.Tensor,
+                     g_idx: torch.Tensor,
+                     perm: torch.Tensor,
+                     workspace: torch.Tensor,
+                     b_q_type: ScalarType,
+                     size_m: int,
+                     size_n: int,
+                     size_k: int,
+                     is_k_full: bool,
+                     has_zp: bool = False,
+                     use_fp32_reduce: bool = False) -> torch.Tensor:
+    return torch.ops._C.gptq_marlin_gemm(a, b_q_weight, b_scales, b_zeros,
+                                         g_idx, perm, workspace, b_q_type,
+                                         size_m, size_n, size_k, is_k_full,
+                                         has_zp, use_fp32_reduce)
+
+
+# fp8 marlin
+def fp8_marlin_gemm(a: torch.Tensor, b_q_weight: torch.Tensor,
+                    b_scales: torch.Tensor, workspace: torch.Tensor,
+                    num_bits: int, size_m: int, size_n: int,
+                    size_k: int) -> torch.Tensor:
+    return torch.ops._C.fp8_marlin_gemm(a, b_q_weight, b_scales, workspace,
+                                        num_bits, size_m, size_n, size_k)


 # fp8
 # def scaled_fp8_quant(
 #     input: torch.Tensor,
 #     scale: Optional[torch.Tensor] = None,
-#     batch_dim_padding: Optional[int] = None,
+#     num_token_padding: Optional[int] = None,
+#     scale_ub: Optional[torch.Tensor] = None,
+#     use_per_token_if_dynamic: bool = False,
 # ) -> Tuple[torch.Tensor, torch.Tensor]:
 #     """
 #     Quantize input tensor to FP8 and return quantized tensor and scale.
@@ -400,31 +457,45 @@ def gptq_marlin_gemm(a: torch.Tensor, b_q_weight: torch.Tensor,
 #     This function supports both static and dynamic quantization: If you
 #     provide the scale, it will use static scaling and if you omit it,
 #     the scale will be determined dynamically. The function also allows
-#     optional padding of the output tensor for downstream kernels that
+#     optional padding of the output tensors for downstream kernels that
 #     will benefit from padding.

 #     Args:
 #         input: The input tensor to be quantized to FP8
 #         scale: Optional scaling factor for the FP8 quantization
-#         batch_dim_padding: If specified, pad the first dimension
+#         scale_ub: Optional upper bound for scaling factor in dynamic 
+#             per token case
+#         num_token_padding: If specified, pad the first dimension
 #             of the output to at least this value.
+#         use_per_token_if_dynamic: Whether to do per_tensor or per_token 
+#             in the dynamic quantization case.

 #     Returns:
 #         Tuple[torch.Tensor, torch.Tensor]: The output tensor in FP8 and
 #             scaling factor.
 #     """
-#     if batch_dim_padding:
-#         shape = (max(batch_dim_padding, input.shape[0]), *input.shape[1:])
-#         output = torch.empty(shape,
-#                              device=input.device,
-#                              dtype=torch.float8_e4m3fn)
-#     else:
-#         output = torch.empty_like(input, dtype=torch.float8_e4m3fn)
+#     # This code assumes batch_dim and num_tokens are flattened
+#     assert (input.ndim == 2)
+#     shape: Union[Tuple[int, int], torch.Size] = input.shape
+#     if num_token_padding:
+#         shape = (max(num_token_padding, input.shape[0]), shape[1])
+#     output = torch.empty(shape, device=input.device, dtype=torch.float8_e4m3fn)
+
 #     if scale is None:
-#         scale = torch.zeros(1, device=input.device, dtype=torch.float32)
-#         torch.ops._C.dynamic_scaled_fp8_quant(output, input, scale)
+#         if use_per_token_if_dynamic:
+#             scale = torch.empty((shape[0], 1),
+#                                 device=input.device,
+#                                 dtype=torch.float32)
+#             torch.ops._C.dynamic_per_token_scaled_fp8_quant(
+#                 output, input, scale, scale_ub)
+#         else:
+#             scale = torch.zeros(1, device=input.device, dtype=torch.float32)
+#             torch.ops._C.dynamic_scaled_fp8_quant(output, input, scale)
 #     else:
+#         # num_token_padding not implemented for this case
+#         assert (scale.numel() == 1 or num_token_padding is None)
 #         torch.ops._C.static_scaled_fp8_quant(output, input, scale)
+
 #     return output, scale


@@ -458,6 +529,15 @@ def scaled_int8_quant(
    return output, input_scales


+# qqq ops
+def marlin_qqq_gemm(a: torch.Tensor, b_q_weight: torch.Tensor,
+                    s_tok: torch.Tensor, s_ch: torch.Tensor,
+                    s_group: torch.Tensor, workspace: torch.Tensor,
+                    size_m: int, size_n: int, size_k: int) -> torch.Tensor:
+    return torch.ops._C.marlin_qqq_gemm(a, b_q_weight, s_tok, s_ch, s_group,
+                                        workspace, size_m, size_n, size_k)
+
+
 # moe
 def moe_align_block_size(topk_ids: torch.Tensor, num_experts: int,
                         block_size: int, sorted_token_ids: torch.Tensor,
@@ -482,11 +562,12 @@ def reshape_and_cache(
    value_cache: torch.Tensor,
    slot_mapping: torch.Tensor,
    kv_cache_dtype: str,
-    kv_scale: float,
+    k_scale: float,
+    v_scale: float,
 ) -> None:
    torch.ops._C_cache_ops.reshape_and_cache(key, value, key_cache,
                                             value_cache, slot_mapping,
-                                             kv_cache_dtype, kv_scale)
+                                             kv_cache_dtype, k_scale, v_scale)


 def reshape_and_cache_flash(
@@ -496,13 +577,17 @@ def reshape_and_cache_flash(
    value_cache: torch.Tensor,
    slot_mapping: torch.Tensor,
    kv_cache_dtype: str,
+    k_scale: float,
+    v_scale: float,
 ) -> None:
    torch.ops._C_cache_ops.reshape_and_cache_flash(key, value, key_cache,
                                                   value_cache, slot_mapping,
-                                                   kv_cache_dtype)
+                                                   kv_cache_dtype, k_scale,
+                                                   v_scale)


-def copy_blocks(key_caches: torch.Tensor, value_caches: torch.Tensor,
+def copy_blocks(key_caches: List[torch.Tensor],
+                value_caches: List[torch.Tensor],
                block_mapping: torch.Tensor) -> None:
    torch.ops._C_cache_ops.copy_blocks(key_caches, value_caches, block_mapping)

@@ -574,43 +659,6 @@ def register_graph_buffers(fa: int, handles: List[str],
    torch.ops._C_custom_ar.register_graph_buffers(fa, handles, offsets)


-# punica
-def dispatch_bgmv(
-    y: torch.Tensor,
-    x: torch.Tensor,
-    w_t_all: torch.Tensor,
-    indicies: torch.Tensor,
-    layer_idx: int,
-    scale: float,
-) -> None:
-    torch.ops._punica_C.dispatch_bgmv(y, x, w_t_all, indicies, layer_idx,
-                                      scale)
-
-
-def dispatch_bgmv_low_level(
-    y: torch.Tensor,
-    x: torch.Tensor,
-    w_t_all: torch.Tensor,
-    indicies: torch.Tensor,
-    layer_idx: int,
-    scale: float,
-    h_in: int,
-    h_out: int,
-    y_offset: int,
-) -> None:
-    torch.ops._punica_C.dispatch_bgmv_low_level(
-        y,
-        x,
-        w_t_all,
-        indicies,
-        layer_idx,
-        scale,
-        h_in,
-        h_out,
-        y_offset,
-    )
-
-
 # temporary fix for https://github.com/vllm-project/vllm/issues/5456
 # TODO: remove this in v0.6.0
 names_and_values = globals()
@@ -630,4 +678,4 @@ for k, v in names_and_values.items():
        names_and_values_to_update[k] = hint_on_error(v)

 names_and_values.update(names_and_values_to_update)
-del names_and_values_to_update, names_and_values, v, k, fn_type
+del names_and_values_to_update, names_and_values, v, k, fn_type
\ No newline at end of file
--- a/vllm/_ipex_ops.py
+++ b/vllm/_ipex_ops.py
+from typing import List, Optional, Tuple
+
+import torch
+
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+try:
+    import intel_extension_for_pytorch as ipex
+except ImportError as e:
+    logger.warning("Import error msg: %s", e.msg)
+
+
+class ipex_ops:
+
+    @staticmethod
+    def _reshape_activation_tensor(
+            x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        num = x.size(0)
+        d = x.size(1) // 2
+        x = x.reshape(num, 2, d)
+        x1, x2 = torch.chunk(x, chunks=2, dim=1)
+        x1 = x1.reshape(num, d)
+        x2 = x2.reshape(num, d)
+        return x1, x2
+
+    @staticmethod
+    def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
+        x1, x2 = ipex_ops._reshape_activation_tensor(x)
+        ipex.llm.functional.silu_mul(x1, x2, out)
+
+    @staticmethod
+    def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
+        x1, x2 = ipex_ops._reshape_activation_tensor(x)
+        ipex.llm.functional.gelu_mul(x1, x2, out, "none")
+
+    @staticmethod
+    def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
+        x1, x2 = ipex_ops._reshape_activation_tensor(x)
+        ipex.llm.functional.gelu_mul(x1, x2, out, "tanh")
+
+    @staticmethod
+    def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None:
+        out.copy_(torch.nn.functional.gelu(x))
+
+    @staticmethod
+    def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None:
+        out.copy_(torch.nn.functional.gelu(x))
+
+    # TODO add implementation of gelu_quick here
+    # def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None:
+
+    @staticmethod
+    def paged_attention_v1(
+        out: torch.Tensor,
+        query: torch.Tensor,
+        key_cache: torch.Tensor,
+        value_cache: torch.Tensor,
+        num_kv_heads: int,
+        scale: float,
+        block_tables: torch.Tensor,
+        context_lens: torch.Tensor,
+        block_size: int,
+        max_context_len: int,
+        alibi_slopes: Optional[torch.Tensor],
+        kv_cache_dtype: str,
+        k_scale: float,
+        v_scale: float,
+        tp_rank: int = 0,
+        blocksparse_local_blocks: int = 0,
+        blocksparse_vert_stride: int = 0,
+        blocksparse_block_size: int = 64,
+        blocksparse_head_sliding_step: int = 0,
+    ) -> None:
+        assert kv_cache_dtype == "auto"
+        num_heads = out.size(1)
+        num_queries_per_tokens = num_heads // num_kv_heads
+        head_mapping = torch.arange(
+            0,
+            num_kv_heads,
+            device=query.device,
+            dtype=torch.int32,
+        ).view(num_kv_heads,
+               1).repeat_interleave(num_queries_per_tokens).flatten()
+        # todo: ipex will refactor namespace
+        torch.xpu.paged_attention_v1(  # type: ignore
+            out,
+            query.contiguous(),
+            key_cache.view_as(value_cache),
+            value_cache,
+            head_mapping,
+            scale,
+            block_tables,
+            context_lens,
+            block_size,
+            max_context_len,
+            alibi_slopes,
+        )
+
+    @staticmethod
+    def paged_attention_v2(
+        out: torch.Tensor,
+        exp_sum: torch.Tensor,
+        max_logits: torch.Tensor,
+        tmp_out: torch.Tensor,
+        query: torch.Tensor,
+        key_cache: torch.Tensor,
+        value_cache: torch.Tensor,
+        num_kv_heads: int,
+        scale: float,
+        block_tables: torch.Tensor,
+        context_lens: torch.Tensor,
+        block_size: int,
+        max_context_len: int,
+        alibi_slopes: Optional[torch.Tensor],
+        kv_cache_dtype: str,
+        k_scale: float,
+        v_scale: float,
+        tp_rank: int = 0,
+        blocksparse_local_blocks: int = 0,
+        blocksparse_vert_stride: int = 0,
+        blocksparse_block_size: int = 64,
+        blocksparse_head_sliding_step: int = 0,
+    ) -> None:
+        assert kv_cache_dtype == "auto"
+        num_heads = out.size(1)
+        num_queries_per_tokens = num_heads // num_kv_heads
+        head_mapping = torch.arange(
+            0,
+            num_kv_heads,
+            dtype=torch.int32,
+            device=query.device,
+        ).view(num_kv_heads,
+               1).repeat_interleave(num_queries_per_tokens).flatten()
+        # todo: ipex will refactor namespace
+        torch.xpu.paged_attention_v2(  # type: ignore
+            out,
+            exp_sum,
+            max_logits,
+            tmp_out,
+            query.contiguous(),
+            key_cache.view_as(value_cache),
+            value_cache,
+            head_mapping,
+            block_tables,
+            context_lens,
+            scale,
+            block_size,
+            max_context_len,
+            alibi_slopes,
+        )
+
+    @staticmethod
+    def rotary_embedding(
+        positions: torch.Tensor,  # [batch_size, seq_len]
+        query: torch.Tensor,  # [batch_size, seq_len, num_heads*head_size]
+        key: torch.Tensor,  # [batch_size, seq_len, num_kv_heads*head_size]
+        head_size: int,
+        cos_sin_cache: torch.Tensor,  # [cos_sin_dim, rot_dim]
+        is_neox: bool,
+    ) -> None:
+        if positions.dim() == 1:
+            positions = positions.unsqueeze(0)
+            query = query.unsqueeze(0)
+            key = key.unsqueeze(0)
+
+        rotary_dim = cos_sin_cache.size(1)
+        query = query.view(*query.shape[:-1], -1, head_size)
+        key = key.view(*key.shape[:-1], -1, head_size)
+
+        query_rot = query[..., :rotary_dim]
+        key_rot = key[..., :rotary_dim]
+
+        cos_sin = cos_sin_cache[positions.long()]
+        cos, sin = cos_sin.chunk(2, dim=-1)
+
+        if is_neox:
+            cos = cos.repeat(1, 1, 2).unsqueeze(-2)
+            sin = sin.repeat(1, 1, 2).unsqueeze(-2)
+        else:
+            cos = cos.repeat_interleave(2, dim=-1).unsqueeze(-2)
+            sin = sin.repeat_interleave(2, dim=-1).unsqueeze(-2)
+        ipex.llm.functional.rotary_embedding(query_rot, key_rot, sin, cos,
+                                             rotary_dim, is_neox, positions)
+
+    @staticmethod
+    def batched_rotary_embedding(positions: torch.Tensor, query: torch.Tensor,
+                                 key: torch.Tensor, head_size: int,
+                                 cos_sin_cache: torch.Tensor, is_neox: bool,
+                                 rot_dim: int,
+                                 cos_sin_cache_offsets: torch.Tensor) -> None:
+        if positions.dim() == 1:
+            positions = positions.unsqueeze(0)
+            query = query.unsqueeze(0)
+            key = key.unsqueeze(0)
+        cos_sin_cache_offsets = cos_sin_cache_offsets.view_as(positions)
+        rotary_dim = cos_sin_cache.size(1)
+        query = query.view(*query.shape[:-1], -1, head_size)
+        key = key.view(*key.shape[:-1], -1, head_size)
+
+        query_rot = query[..., :rotary_dim]
+        key_rot = key[..., :rotary_dim]
+
+        cos_sin = cos_sin_cache[torch.add(positions,
+                                          cos_sin_cache_offsets).long()]
+        cos, sin = cos_sin.chunk(2, dim=-1)
+
+        if is_neox:
+            cos = cos.repeat(1, 1, 2).unsqueeze(-2)
+            sin = sin.repeat(1, 1, 2).unsqueeze(-2)
+        else:
+            cos = cos.repeat_interleave(2, dim=-1).unsqueeze(-2)
+            sin = sin.repeat_interleave(2, dim=-1).unsqueeze(-2)
+
+        ipex.llm.functional.rotary_embedding(query_rot, key_rot, sin, cos,
+                                             rotary_dim, is_neox, positions)
+
+    @staticmethod
+    def rms_norm(out: torch.Tensor, input: torch.Tensor, weight: torch.Tensor,
+                 epsilon: float) -> None:
+        tmp = ipex.llm.functional.rms_norm(input, weight, epsilon)
+        out.copy_(tmp)
+
+    @staticmethod
+    def fused_add_rms_norm(input: torch.Tensor, residual: torch.Tensor,
+                           weight: torch.Tensor, epsilon: float) -> None:
+        tmp = ipex.llm.functional.add_rms_norm(residual, input, weight, None,
+                                               epsilon, True)
+        input.copy_(tmp)
+
+    @staticmethod
+    def varlen_attention(
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        out: torch.Tensor,
+        seqlen_q: torch.Tensor,
+        seqlen_k: torch.Tensor,
+        max_seqlen_q: int,
+        max_seqlen_k: int,
+        pdropout: float,
+        softmax_scale: float,
+        zero_tensors: bool,
+        is_causal: bool,
+        return_softmax: bool,
+        gen_: torch.Generator,
+    ) -> None:
+        ipex.llm.functional.varlen_attention(query, key, value, out, seqlen_q,
+                                             seqlen_k, max_seqlen_q,
+                                             max_seqlen_k, pdropout,
+                                             softmax_scale, zero_tensors,
+                                             is_causal, return_softmax, gen_)
+
+    @staticmethod
+    def reshape_and_cache(
+        key: torch.Tensor,
+        value: torch.Tensor,
+        key_cache: torch.Tensor,
+        value_cache: torch.Tensor,
+        slot_mapping: torch.Tensor,
+        kv_cache_dtype: str,
+        k_scale: float,
+        v_scale: float,
+    ) -> None:
+        assert kv_cache_dtype == "auto"
+        ipex.llm.modules.PagedAttention.reshape_and_cache(
+            key, value, key_cache, value_cache, slot_mapping)
+
+    @staticmethod
+    def copy_blocks(key_caches: List[torch.Tensor],
+                    value_caches: List[torch.Tensor],
+                    block_mapping: torch.Tensor) -> None:
+        torch.xpu.copy_blocks(  # type: ignore
+            key_caches,
+            value_caches,
+            block_mapping,
+        )
+
+    @staticmethod
+    def swap_blocks(src: torch.Tensor, dst: torch.Tensor,
+                    block_mapping: torch.Tensor) -> None:
+        torch.xpu.swap_blocks(src, dst, block_mapping)  # type: ignore
--- a/vllm/adapter_commons/__init__.py
+++ b/vllm/adapter_commons/__init__.py
--- a/vllm/adapter_commons/layers.py
+++ b/vllm/adapter_commons/layers.py
+from dataclasses import dataclass
+from typing import Tuple
+
+
+@dataclass
+class AdapterMapping:
+    # Per every token in input_ids:
+    index_mapping: Tuple[int, ...]
+    # Per sampled token:
+    prompt_mapping: Tuple[int, ...]
+
+    def __post_init__(self):
+        self.index_mapping = tuple(self.index_mapping)
+        self.prompt_mapping = tuple(self.prompt_mapping)
\ No newline at end of file
--- a/vllm/adapter_commons/models.py
+++ b/vllm/adapter_commons/models.py
+from abc import ABC, abstractmethod
+from typing import Any, Callable, Dict, Hashable, Optional, TypeVar
+
+from torch import nn
+
+from vllm.logger import init_logger
+from vllm.utils import LRUCache
+
+logger = init_logger(__name__)
+
+
+class AdapterModel(ABC):
+
+    def __init__(self, model_id=None):
+        self.id = model_id
+
+    @abstractmethod
+    def from_local_checkpoint(cls, model_dir, model_id=None, **kwargs):
+        # Common initialization code
+        # Load weights or embeddings from local checkpoint
+        raise NotImplementedError("Subclasses must implement this method.")
+
+
+T = TypeVar('T')
+
+
+class AdapterLRUCache(LRUCache[T]):
+
+    def __init__(self, capacity: int, deactivate_fn: Callable[[Hashable],
+                                                              None]):
+        super().__init__(capacity)
+        self.deactivate_fn = deactivate_fn
+
+    def _on_remove(self, key: Hashable, value: Optional[T]):
+        logger.debug("Removing adapter int id: %d", key)
+        self.deactivate_fn(key)
+        return super()._on_remove(key, value)
+
+
+class AdapterModelManager(ABC):
+
+    def __init__(
+        self,
+        model: nn.Module,
+    ):
+        """Create a AdapterModelManager and adapter for a given model.
+        Args:
+            model: the model to be adapted.
+        """
+        self.model: nn.Module = model
+        self._registered_adapters: Dict[int, Any] = {}
+        # Dict instead of a Set for compatibility with LRUCache.
+        self._active_adapters: Dict[int, None] = {}
+        self.adapter_type = 'Adapter'
+        self._last_mapping = None
+
+    def __len__(self) -> int:
+        return len(self._registered_adapters)
+
+    @property
+    @abstractmethod
+    def adapter_slots(self) -> int:
+        raise NotImplementedError
+
+    @property
+    @abstractmethod
+    def capacity(self) -> int:
+        raise NotImplementedError
+
+    @abstractmethod
+    def activate_adapter(self, adapter_id: int) -> bool:
+        raise NotImplementedError
+
+    @abstractmethod
+    def deactivate_adapter(self, adapter_id: int) -> bool:
+        raise NotImplementedError
+
+    @abstractmethod
+    def add_adapter(self, adapter: Any) -> bool:
+        raise NotImplementedError
+
+    @abstractmethod
+    def set_adapter_mapping(self, mapping: Any) -> None:
+        raise NotImplementedError
+
+    @abstractmethod
+    def remove_adapter(self, adapter_id: int) -> bool:
+        raise NotImplementedError
+
+    @abstractmethod
+    def remove_all_adapters(self) -> None:
+        raise NotImplementedError
+
+    @abstractmethod
+    def get_adapter(self, adapter_id: int) -> Optional[Any]:
+        raise NotImplementedError
+
+    @abstractmethod
+    def list_adapters(self) -> Dict[int, Any]:
+        raise NotImplementedError
+
+    @abstractmethod
+    def pin_adapter(self, adapter_id: int) -> bool:
+        raise NotImplementedError
--- a/vllm/adapter_commons/request.py
+++ b/vllm/adapter_commons/request.py
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+
+
+@dataclass
+class AdapterRequest(ABC):
+    """
+    Base class for adapter requests.
+    """
+
+    @property
+    @abstractmethod
+    def adapter_id(self) -> int:
+        raise NotImplementedError
+
+    def __post_init__(self) -> None:
+        if self.adapter_id < 1:
+            raise ValueError(f"id must be > 0, got {self.adapter_id}")
+
+    def __eq__(self, value: object) -> bool:
+        return isinstance(
+            value, self.__class__) and self.adapter_id == value.adapter_id
+
+    def __hash__(self) -> int:
+        return hash(self.adapter_id)
--- a/vllm/adapter_commons/utils.py
+++ b/vllm/adapter_commons/utils.py
+from typing import Any, Callable, Dict, Optional, Set
+
+
+## model functions
+def deactivate_adapter(adapter_id: int, active_adapters: Dict[int, None],
+                       deactivate_func: Callable) -> bool:
+    if adapter_id in active_adapters:
+        deactivate_func(adapter_id)
+        active_adapters.pop(adapter_id)
+        return True
+    return False
+
+
+def add_adapter(adapter: Any, registered_adapters: Dict[int, Any],
+                capacity: int, add_func: Callable) -> bool:
+    if adapter.id not in registered_adapters:
+        if len(registered_adapters) >= capacity:
+            raise RuntimeError('No free adapter slots.')
+        add_func(adapter)
+        registered_adapters[adapter.id] = adapter
+        return True
+    return False
+
+
+def set_adapter_mapping(mapping: Any, last_mapping: Any,
+                        set_mapping_func: Callable) -> Any:
+    if last_mapping != mapping:
+        set_mapping_func(mapping)
+        return mapping
+    return last_mapping
+
+
+def remove_adapter(adapter_id: int, registered_adapters: Dict[int, Any],
+                   deactivate_func: Callable) -> bool:
+    deactivate_func(adapter_id)
+    return bool(registered_adapters.pop(adapter_id, None))
+
+
+def list_adapters(registered_adapters: Dict[int, Any]) -> Dict[int, Any]:
+    return dict(registered_adapters)
+
+
+def get_adapter(adapter_id: int,
+                registered_adapters: Dict[int, Any]) -> Optional[Any]:
+    return registered_adapters.get(adapter_id, None)
+
+
+## worker functions
+def set_active_adapters_worker(requests: Set[Any], mapping: Optional[Any],
+                               apply_adapters_func,
+                               set_adapter_mapping_func) -> None:
+    apply_adapters_func(requests)
+    set_adapter_mapping_func(mapping)
+
+
+def add_adapter_worker(adapter_request: Any, list_adapters_func,
+                       load_adapter_func, add_adapter_func,
+                       activate_adapter_func) -> bool:
+    if adapter_request.adapter_id in list_adapters_func():
+        return False
+    loaded_adapter = load_adapter_func(adapter_request)
+    loaded = add_adapter_func(loaded_adapter)
+    activate_adapter_func(loaded_adapter.id)
+    return loaded
+
+
+def apply_adapters_worker(adapter_requests: Set[Any], list_adapters_func,
+                          adapter_slots: int, remove_adapter_func,
+                          add_adapter_func) -> None:
+    models_that_exist = list_adapters_func()
+    models_map = {
+        adapter_request.adapter_id: adapter_request
+        for adapter_request in adapter_requests if adapter_request
+    }
+    if len(models_map) > adapter_slots:
+        raise RuntimeError(
+            f"Number of requested models ({len(models_map)}) is greater "
+            f"than the number of GPU model slots "
+            f"({adapter_slots}).")
+    new_models = set(models_map)
+    models_to_add = new_models - models_that_exist
+    models_to_remove = models_that_exist - new_models
+    for adapter_id in models_to_remove:
+        remove_adapter_func(adapter_id)
+    for adapter_id in models_to_add:
+        add_adapter_func(models_map[adapter_id])
+
+
+def list_adapters_worker(adapter_manager_list_adapters_func) -> Set[int]:
+    return set(adapter_manager_list_adapters_func())
--- a/vllm/adapter_commons/worker_manager.py
+++ b/vllm/adapter_commons/worker_manager.py
+from abc import ABC, abstractmethod
+from typing import Any, Optional, Set
+
+import torch
+
+
+class AbstractWorkerManager(ABC):
+
+    def __init__(self, device: torch.device):
+        self.device = device
+
+    @property
+    @abstractmethod
+    def is_enabled(self) -> bool:
+        raise NotImplementedError
+
+    @abstractmethod
+    def set_active_adapters(self, requests: Set[Any],
+                            mapping: Optional[Any]) -> None:
+        raise NotImplementedError
+
+    @abstractmethod
+    def add_adapter(self, adapter_request: Any) -> bool:
+        raise NotImplementedError
+
+    @abstractmethod
+    def remove_adapter(self, adapter_id: int) -> bool:
+        raise NotImplementedError
+
+    @abstractmethod
+    def remove_all_adapters(self) -> None:
+        raise NotImplementedError
+
+    @abstractmethod
+    def list_adapters(self) -> Set[int]:
+        raise NotImplementedError
--- a/vllm/assets/__init__.py
+++ b/vllm/assets/__init__.py
--- a/vllm/assets/base.py
+++ b/vllm/assets/base.py
+from pathlib import Path
+
+import vllm.envs as envs
+
+
+def get_cache_dir():
+    """Get the path to the cache for storing downloaded assets."""
+    path = Path(envs.VLLM_ASSETS_CACHE)
+    path.mkdir(parents=True, exist_ok=True)
+
+    return path