merge v0.4.3

b9e12416 · zhuwenwen · e5d707db · e9d3aa04 · b9e12416 · b9e12416
Commit b9e12416 authored May 31, 2024 by zhuwenwen
20 changed files
--- a/tests/spec_decode/e2e/test_compatibility.py
+++ b/tests/spec_decode/e2e/test_compatibility.py
@@ -5,56 +5,6 @@ from vllm import SamplingParams
 from .conftest import get_output_from_llm_generator


-@pytest.mark.parametrize(
-    "common_llm_kwargs",
-    [{
-        "model": "JackFram/llama-68m",
-        "speculative_model": "JackFram/llama-68m",
-        "num_speculative_tokens": 5,
-
-        # Required for spec decode.
-        "use_v2_block_manager": True
-    }])
-@pytest.mark.parametrize(
-    "per_test_common_llm_kwargs",
-    [
-        {
-            # Expect failure as spec decode not supported by
-            # Ray backend.
-            "worker_use_ray": True,
-        },
-    ])
-@pytest.mark.parametrize("test_llm_kwargs", [{}])
-@pytest.mark.parametrize("seed", [1])
-def test_spec_decode_xfail_ray(test_llm_generator):
-    """Verify that speculative decoding with Ray fails.
-    """
-    output_len = 128
-    temperature = 0.0
-
-    prompts = [
-        "Hello, my name is",
-    ]
-
-    sampling_params = SamplingParams(
-        max_tokens=output_len,
-        ignore_eos=True,
-        temperature=temperature,
-    )
-
-    try:
-        with pytest.raises(
-                AssertionError,
-                match="Speculative decoding not yet supported for "):
-            get_output_from_llm_generator(test_llm_generator, prompts,
-                                          sampling_params)
-    finally:
-        # we need to free up ray resource,
-        # so that latter test could use the gpu we allocated here
-        import ray
-        ray.shutdown()
-
-
 @pytest.mark.parametrize(
    "common_llm_kwargs",
    [{

--- a/tests/spec_decode/e2e/test_integration.py
+++ b/tests/spec_decode/e2e/test_integration.py
+"""Tests which cover integration of the speculative decoding framework with
+other features, e.g. cuda graphs.
+"""
+
+import pytest
+
+from .conftest import run_greedy_equality_correctness_test
+
+
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        # Required for spec decode.
+        "use_v2_block_manager": True,
+
+        # Verify equality when cuda graphs allowed.
+        "enforce_eager": False,
+        "model": "JackFram/llama-68m",
+    }])
+@pytest.mark.parametrize(
+    "per_test_common_llm_kwargs",
+    [
+        {
+            # Identical models.
+            "speculative_model": "JackFram/llama-68m",
+            "num_speculative_tokens": 5,
+        },
+    ])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs", [{}])
+@pytest.mark.parametrize("batch_size", [8])
+@pytest.mark.parametrize("output_len", [32])
+@pytest.mark.parametrize("seed", [1])
+def test_spec_decode_cuda_graph(baseline_llm_generator, test_llm_generator,
+                                batch_size, output_len):
+    """Verify spec decode equality when cuda graphs are enabled.
+    """
+    run_greedy_equality_correctness_test(
+        baseline_llm_generator,
+        test_llm_generator,
+        batch_size,
+        max_output_len=output_len,
+        force_output_len=True,
+    )
--- a/tests/spec_decode/e2e/test_integration_dist.py
+++ b/tests/spec_decode/e2e/test_integration_dist.py
+"""Tests which cover integration of the speculative decoding framework with
+tensor parallelism.
+"""
+
+import pytest
+import torch
+
+from vllm.utils import is_hip
+
+from .conftest import run_greedy_equality_correctness_test
+
+
+@pytest.mark.skipif(torch.cuda.device_count() < 2,
+                    reason="Need at least 2 GPUs to run the test.")
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        "model": "JackFram/llama-68m",
+
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
+
+        # Required for spec decode.
+        "use_v2_block_manager": True,
+        "tensor_parallel_size": 2,
+
+        # Use AsyncLLM engine, so that the engine runs in its own process.
+        # Otherwise, since vLLM does not follow true SPMD, the test runner
+        # process will have both the engine and the rank0 worker. NCCL is not
+        # cleaned up properly, and its server host thread leaks, causing the
+        # second run of the test to fail with internal NCCL error.
+        "use_async": True,
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs", [
+    {
+        "speculative_model": "JackFram/llama-68m",
+        "num_speculative_tokens": 3,
+    },
+    {
+        "speculative_model": "[ngram]",
+        "num_speculative_tokens": 5,
+        "ngram_prompt_lookup_max": 3,
+    },
+])
+@pytest.mark.parametrize("batch_size", [2])
+@pytest.mark.parametrize(
+    "output_len",
+    [
+        # Use smaller output len for fast test.
+        32,
+    ])
+@pytest.mark.parametrize("seed", [1])
+def test_target_model_tp_gt_1(baseline_llm_generator, test_llm_generator,
+                              batch_size: int, output_len: int):
+    """Verify greedy equality when tensor parallelism is used.
+    """
+    if is_hip():
+        pytest.skip("hip is not well-supported yet")
+    run_greedy_equality_correctness_test(baseline_llm_generator,
+                                         test_llm_generator,
+                                         batch_size,
+                                         max_output_len=output_len,
+                                         force_output_len=True)
--- a/tests/spec_decode/e2e/test_multistep_correctness.py
+++ b/tests/spec_decode/e2e/test_multistep_correctness.py
@@ -536,6 +536,40 @@ def test_skip_speculation(baseline_llm_generator, test_llm_generator,
                                         force_output_len=True)


+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        "model": "JackFram/llama-160m",
+
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
+
+        # Required for spec decode.
+        "use_v2_block_manager": True
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs", [
+    {
+        "speculative_model": "JackFram/llama-68m",
+        "num_speculative_tokens": 5,
+        "speculative_disable_by_batch_size": 2,
+    },
+])
+@pytest.mark.parametrize("batch_size", [8])
+@pytest.mark.parametrize("output_len", [10])
+@pytest.mark.parametrize("seed", [1])
+def test_disable_speculation(baseline_llm_generator, test_llm_generator,
+                             batch_size: int, output_len: int):
+    """Verify greedy equality when all sequences disable speculation.
+    """
+    run_greedy_equality_correctness_test(baseline_llm_generator,
+                                         test_llm_generator,
+                                         batch_size,
+                                         max_output_len=output_len,
+                                         force_output_len=True)
+
+
 @pytest.mark.parametrize(
    "common_llm_kwargs",
    [{

--- a/tests/spec_decode/e2e/test_ngram_correctness.py
+++ b/tests/spec_decode/e2e/test_ngram_correctness.py
@@ -57,7 +57,7 @@ from .conftest import run_greedy_equality_correctness_test
 @pytest.mark.parametrize("output_len", [
    256,
 ])
-@pytest.mark.parametrize("batch_size", [1, 64])
+@pytest.mark.parametrize("batch_size", [1, 32])
 @pytest.mark.parametrize("seed", [1])
 def test_ngram_e2e_greedy_correctness(baseline_llm_generator,
                                      test_llm_generator, batch_size: int,
@@ -170,3 +170,44 @@ def test_ngram_different_k(baseline_llm_generator, test_llm_generator,
                                         batch_size,
                                         max_output_len=output_len,
                                         force_output_len=True)
+
+
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        "model": "JackFram/llama-68m",
+
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
+
+        # Required for spec decode.
+        "use_v2_block_manager": True
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs",
+                         [{
+                             "speculative_model": "[ngram]",
+                             "num_speculative_tokens": 5,
+                             "ngram_prompt_lookup_max": 3,
+                             "speculative_disable_by_batch_size": 4
+                         }])
+@pytest.mark.parametrize("batch_size", [1, 5])
+@pytest.mark.parametrize(
+    "output_len",
+    [
+        # Use smaller output len for fast test.
+        32,
+    ])
+@pytest.mark.parametrize("seed", [1])
+def test_ngram_disable_queue(baseline_llm_generator, test_llm_generator,
+                             batch_size: int, output_len: int):
+    """Verify that ngram speculative decoding produces exact equality
+    to without spec decode when the batch size is below and when it is
+    above the speculative_disable_by_batch_size threshold.
+    """
+    run_greedy_equality_correctness_test(baseline_llm_generator,
+                                         test_llm_generator,
+                                         batch_size,
+                                         max_output_len=output_len,
+                                         force_output_len=True)
--- a/tests/spec_decode/test_dynamic_spec_decode.py
+++ b/tests/spec_decode/test_dynamic_spec_decode.py
+from unittest.mock import MagicMock, patch
+
+import pytest
+import torch
+
+from vllm.model_executor.layers.rejection_sampler import RejectionSampler
+from vllm.sequence import ExecuteModelRequest
+from vllm.spec_decode.metrics import AsyncMetricsCollector
+from vllm.spec_decode.multi_step_worker import MultiStepWorker
+from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker
+from vllm.spec_decode.top1_proposer import Top1Proposer
+
+from .utils import create_batch, mock_worker
+
+
+@pytest.mark.parametrize('queue_size', [4])
+@pytest.mark.parametrize('batch_size', [1])
+@pytest.mark.parametrize('k', [1])
+@torch.inference_mode()
+def test_disable_spec_tokens(queue_size: int, batch_size: int, k: int):
+    """Verify that speculative tokens are disabled when the batch size
+    exceeds the threshold.
+    """
+    disable_by_batch_size = 3
+
+    draft_worker = mock_worker(cls=MultiStepWorker)
+    target_worker = mock_worker()
+    rejection_sampler = MagicMock(spec=RejectionSampler)
+    metrics_collector = MagicMock(spec=AsyncMetricsCollector)
+    worker = SpecDecodeWorker(proposer_worker=draft_worker,
+                              scorer_worker=target_worker,
+                              rejection_sampler=rejection_sampler,
+                              metrics_collector=metrics_collector,
+                              disable_by_batch_size=disable_by_batch_size)
+
+    exception_secret = 'artificial stop'
+    draft_worker.get_spec_proposals.side_effect = ValueError(exception_secret)
+
+    seq_group_metadata_list, _, _ = create_batch(batch_size, k)
+    execute_model_req = ExecuteModelRequest(
+        seq_group_metadata_list=seq_group_metadata_list,
+        num_lookahead_slots=k,
+        running_queue_size=queue_size)
+
+    if queue_size > disable_by_batch_size:
+        with patch.object(worker,
+                          '_run_no_spec',
+                          side_effect=ValueError(exception_secret)), \
+            pytest.raises(ValueError, match=exception_secret):
+            worker.execute_model(execute_model_req=execute_model_req)
+
+    # When the batch size is larger than the threshold,
+    # we expect no speculative tokens (0).
+    expected_num_spec_tokens = None if queue_size < disable_by_batch_size else 0
+    assert seq_group_metadata_list[
+        0].num_speculative_tokens == expected_num_spec_tokens
+
+    draft_worker.sampler_output.side_effect = ValueError(exception_secret)
+
+    proposer = Top1Proposer(
+        worker=draft_worker,
+        device='cpu',  # not used
+        vocab_size=100,  # not used
+        # Must be long enough to avoid being skipped due to length.
+        max_proposal_len=1024,
+    )
+
+    if queue_size < disable_by_batch_size:
+        # Should raise exception when executing the mocked draft model.
+        with pytest.raises(ValueError, match=exception_secret):
+            proposer.get_proposals(execute_model_req=ExecuteModelRequest(
+                seq_group_metadata_list=seq_group_metadata_list,
+                num_lookahead_slots=k), )
+    else:
+        # Should not execute the draft model because spec decode is disabled
+        # for all requests. Accordingly, the proposal length should be 0.
+        proposals = proposer.get_proposals(
+            execute_model_req=ExecuteModelRequest(
+                seq_group_metadata_list=seq_group_metadata_list,
+                num_lookahead_slots=k), )
+        assert proposals.proposal_lens.tolist() == [0] * batch_size
--- a/tests/spec_decode/test_ngram_worker.py
+++ b/tests/spec_decode/test_ngram_worker.py
@@ -34,8 +34,8 @@ def test_ngram_algo_correctness_for_single_no_match():
        max_proposal_len=20,
    )

-    # set ngram window (0, 3], which is window=1/2/3
-    ngram_worker.set_ngram_window_size(0, 3)
+    # set ngram window [1, 3], which is window=1/2/3
+    ngram_worker.set_ngram_window_size(1, 3)

    prompts = [
        # shall find no candidate
@@ -90,8 +90,8 @@ def test_ngram_algo_correctness_for_batches_not_match_all():
        max_proposal_len=20,
    )

-    # set ngram window (0, 3], which is window=1/2/3
-    ngram_worker.set_ngram_window_size(0, 3)
+    # set ngram window [1, 3], which is window=1/2/3
+    ngram_worker.set_ngram_window_size(1, 3)

    prompts = [
        # shall find no candidate
@@ -128,11 +128,12 @@ def test_ngram_algo_correctness_for_batches_not_match_all():
    assert proposals.proposal_probs.shape[:-1] == torch.Size([5, proposal_len])
    assert proposals.proposal_lens.shape == torch.Size([5])

+    # the first sequence has no match so proposal_len should be overwritten to 0
    assert proposals.proposal_lens.tolist(
-    ) == [proposal_len for _ in range(4)] + [0]
+    ) == [0] + [proposal_len for _ in range(3)] + [0]

    for i in range(proposal_len):
-        assert proposals.proposal_token_ids[0][i] == 0
+        assert proposals.proposal_token_ids[0][i] == -1
        assert proposals.proposal_token_ids[1][i] == prompts[1][i + 1]
        assert proposals.proposal_token_ids[2][i] == prompts[2][i + 3]
        assert proposals.proposal_token_ids[3][i] == prompts[3][i + 5]
@@ -167,8 +168,8 @@ def test_ngram_algo_correctness_for_batches_match_all():
        max_proposal_len=20,
    )

-    # set ngram window (0, 3], which is window=1/2/3
-    ngram_worker.set_ngram_window_size(0, 3)
+    # set ngram window [1, 3], which is window=1/2/3
+    ngram_worker.set_ngram_window_size(1, 3)

    prompts = [
        # shall find candidate 12,13,14,15,16

--- a/tests/spec_decode/utils.py
+++ b/tests/spec_decode/utils.py
@@ -7,8 +7,8 @@ import torch
 from vllm.engine.arg_utils import EngineArgs
 from vllm.model_executor.utils import set_random_seed
 from vllm.sampling_params import SamplingParams
-from vllm.sequence import (Logprob, SamplerOutput, SequenceData,
-                           SequenceGroupMetadata, SequenceGroupOutput,
+from vllm.sequence import (CompletionSequenceGroupOutput, Logprob,
+                           SamplerOutput, SequenceData, SequenceGroupMetadata,
                           SequenceOutput)
 from vllm.utils import get_distributed_init_method, get_ip, get_open_port
 from vllm.worker.cache_engine import CacheEngine
@@ -170,7 +170,7 @@ def create_sampler_output_list(

    return [
        SamplerOutput(outputs=[
-            SequenceGroupOutput(
+            CompletionSequenceGroupOutput(
                samples=[
                    SequenceOutput(
                        output_token=token_id,

--- a/tests/tensorizer_loader/tensorize_vllm_model_for_testing.py
+++ b/tests/tensorizer_loader/tensorize_vllm_model_for_testing.py
-import argparse
-import dataclasses
-import os
-import time
-import uuid
-from functools import partial
-from typing import Type
-
-import torch.nn as nn
-from tensorizer import (DecryptionParams, EncryptionParams, TensorDeserializer,
-                        TensorSerializer, stream_io)
-from tensorizer.utils import convert_bytes, get_mem_usage, no_init_or_tensor
-from transformers import AutoConfig, PretrainedConfig
-
-from vllm.distributed import (init_distributed_environment,
-                              initialize_model_parallel)
-from vllm.engine.arg_utils import EngineArgs
-from vllm.engine.llm_engine import LLMEngine
-from vllm.model_executor.model_loader.tensorizer import TensorizerArgs
-from vllm.model_executor.models import ModelRegistry
-
-# yapf conflicts with isort for this docstring
-# yapf: disable
-"""
-tensorize_vllm_model.py is a script that can be used to serialize and 
-deserialize vLLM models. These models can be loaded using tensorizer directly 
-to the GPU extremely quickly. Tensor encryption and decryption is also 
-supported, although libsodium must be installed to use it. Install
-vllm with tensorizer support using `pip install vllm[tensorizer]`.
-
-To serialize a model, you can run something like this:
-
-python tensorize_vllm_model.py \
-   --model EleutherAI/gpt-j-6B \
-   --dtype float16 \
-   serialize \
-   --serialized-directory s3://my-bucket/ \
-   --suffix vllm
-
-Which downloads the model from HuggingFace, loads it into vLLM, serializes it,
-and saves it to your S3 bucket. A local directory can also be used.
-
-You can also encrypt the model weights with a randomly-generated key by 
-providing a `--keyfile` argument.
-
-To deserialize a model, you can run something like this:
-
-python tensorize_vllm_model.py \
-   --model EleutherAI/gpt-j-6B \
-   --dtype float16 \
-   deserialize \
-   --path-to-tensors s3://my-bucket/vllm/EleutherAI/gpt-j-6B/vllm/model.tensors
-
-Which downloads the model tensors from your S3 bucket and deserializes them.
-To provide S3 credentials, you can provide `--s3-access-key-id` and 
-`--s3-secret-access-key`, as well as `--s3-endpoint` as CLI args to this script,
-the OpenAI entrypoint, as arguments for LLM(), or as environment variables
-in the form of `S3_ACCESS_KEY_ID`, `S3_SECRET_ACCESS_KEY`, and `S3_ENDPOINT`.
-
-
-You can also provide a `--keyfile` argument to decrypt the model weights if 
-they were serialized with encryption.
-
-For more information on the available arguments, run 
-`python tensorize_vllm_model.py --help`.
-"""
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(
-        description="An example script that can be used to serialize and "
-                    "deserialize vLLM models. These models "
-                    "can be loaded using tensorizer directly to the GPU "
-                    "extremely quickly. Tensor encryption and decryption is "
-                    "also supported, although libsodium must be installed to "
-                    "use it.")
-    parser = TensorizerArgs.add_cli_args(EngineArgs.add_cli_args(parser))
-    subparsers = parser.add_subparsers(dest='command')
-
-    serialize_parser = subparsers.add_parser(
-        'serialize', help="Serialize a model to `--serialized-directory`")
-
-    serialize_parser.add_argument(
-        "--suffix",
-        type=str,
-        required=False,
-        help=(
-            "The suffix to append to the serialized model directory, which is "
-            "used to construct the location of the serialized model tensors, "
-            "e.g. if `--serialized-directory` is `s3://my-bucket/` and "
-            "`--suffix` is `v1`, the serialized model tensors will be "
-            "saved to "
-            "`s3://my-bucket/vllm/EleutherAI/gpt-j-6B/v1/model.tensors`. "
-            "If none is provided, a random UUID will be used."))
-    serialize_parser.add_argument(
-        "--serialized-directory",
-        type=str,
-        required=True)
-
-    serialize_parser.add_argument(
-        "--keyfile",
-        type=str,
-        required=False,
-        help=("Encrypt the model weights with a randomly-generated binary key,"
-              " and save the key at this path"))
-
-    deserialize_parser = subparsers.add_parser(
-        'deserialize',
-        help=("Deserialize a model from `--path-to-tensors`"
-              " to verify it can be loaded and used."))
-
-    deserialize_parser.add_argument(
-        "--path-to-tensors",
-        type=str,
-        required=True,
-        help="The local path or S3 URI to the model tensors to deserialize. ")
-
-    deserialize_parser.add_argument(
-        "--keyfile",
-        type=str,
-        required=False,
-        help=("Path to a binary key to use to decrypt the model weights,"
-              " if the model was serialized with encryption"))
-
-    return parser.parse_args()
-
-
-def make_model_contiguous(model):
-    # Ensure tensors are saved in memory contiguously
-    for param in model.parameters():
-        param.data = param.data.contiguous()
-
-
-def _get_vllm_model_architecture(config: PretrainedConfig) -> Type[nn.Module]:
-    architectures = getattr(config, "architectures", [])
-    for arch in architectures:
-        model_cls = ModelRegistry.load_model_cls(arch)
-        if model_cls is not None:
-            return model_cls
-    raise ValueError(
-        f"Model architectures {architectures} are not supported for now. "
-        f"Supported architectures: {ModelRegistry.get_supported_archs()}")
-
-
-def serialize():
-    eng_args_dict = {f.name: getattr(args, f.name) for f in
-                     dataclasses.fields(EngineArgs)}
-    engine_args = EngineArgs.from_cli_args(argparse.Namespace(**eng_args_dict))
-    engine = LLMEngine.from_engine_args(engine_args)
-
-    model = (engine.model_executor.driver_worker.
-             model_runner.model)
-
-    encryption_params = EncryptionParams.random() if keyfile else None
-    if keyfile:
-        with _write_stream(keyfile) as stream:
-            stream.write(encryption_params.key)
-
-    with _write_stream(model_path) as stream:
-        serializer = TensorSerializer(stream, encryption=encryption_params)
-        serializer.write_module(model)
-        serializer.close()
-
-    print("Serialization complete. Model tensors saved to", model_path)
-    if keyfile:
-        print("Key saved to", keyfile)
-
-
-def deserialize():
-    config = AutoConfig.from_pretrained(model_ref)
-
-    with no_init_or_tensor():
-        model_class = _get_vllm_model_architecture(config)
-        model = model_class(config)
-
-    before_mem = get_mem_usage()
-    start = time.time()
-
-    if keyfile:
-        with _read_stream(keyfile) as stream:
-            key = stream.read()
-            decryption_params = DecryptionParams.from_key(key)
-            tensorizer_args.deserializer_params['encryption'] = \
-                decryption_params
-
-    with (_read_stream(model_path)) as stream, TensorDeserializer(
-            stream, **tensorizer_args.deserializer_params) as deserializer:
-        deserializer.load_into_module(model)
-        end = time.time()
-
-    # Brag about how fast we are.
-    total_bytes_str = convert_bytes(deserializer.total_tensor_bytes)
-    duration = end - start
-    per_second = convert_bytes(deserializer.total_tensor_bytes / duration)
-    after_mem = get_mem_usage()
-    print(
-        f"Deserialized {total_bytes_str} in {end - start:0.2f}s, {per_second}/s"
-    )
-    print(f"Memory usage before: {before_mem}")
-    print(f"Memory usage after: {after_mem}")
-
-    return model
-
-
-args = parse_args()
-
-s3_access_key_id = (args.s3_access_key_id or os.environ.get("S3_ACCESS_KEY_ID")
-                    or None)
-s3_secret_access_key = (args.s3_secret_access_key
-                        or os.environ.get("S3_SECRET_ACCESS_KEY") or None)
-
-s3_endpoint = (args.s3_endpoint or os.environ.get("S3_ENDPOINT_URL") or None)
-
-_read_stream, _write_stream = (partial(
-    stream_io.open_stream,
-    mode=mode,
-    s3_access_key_id=s3_access_key_id,
-    s3_secret_access_key=s3_secret_access_key,
-    s3_endpoint=s3_endpoint,
-) for mode in ("rb", "wb+"))
-
-model_ref = args.model
-
-model_name = model_ref.split("/")[1]
-
-os.environ["MASTER_ADDR"] = "127.0.0.1"
-os.environ["MASTER_PORT"] = "8080"
-
-init_distributed_environment(world_size=1, rank=0, local_rank=0)
-initialize_model_parallel()
-
-keyfile = args.keyfile if args.keyfile else None
-
-if args.command == "serialize":
-    input_dir = args.serialized_directory.rstrip('/')
-    suffix = args.suffix if args.suffix else uuid.uuid4().hex
-    base_path = f"{input_dir}/vllm/{model_ref}/{suffix}"
-    model_path = f"{base_path}/model.tensors"
-    serialize()
-elif args.command == "deserialize":
-    tensorizer_args = TensorizerArgs.from_cli_args(args)
-    model_path = args.path_to_tensors
-    deserialize()
-else:
-    raise ValueError("Either serialize or deserialize must be specified.")
--- a/tests/tensorizer_loader/test_tensorizer.py
+++ b/tests/tensorizer_loader/test_tensorizer.py
@@ -9,11 +9,19 @@ import pytest
 import ray
 import torch

-from tests.entrypoints.test_openai_server import ServerRunner
 from vllm import SamplingParams
-from vllm.model_executor.model_loader.tensorizer import (
-    EncryptionParams, TensorizerConfig, TensorSerializer,
-    is_vllm_serialized_tensorizer, load_with_tensorizer, open_stream)
+# yapf: disable
+from vllm.model_executor.model_loader.tensorizer import (TensorizerConfig,
+                                                         TensorSerializer,
+                                                         is_vllm_tensorized,
+                                                         load_with_tensorizer,
+                                                         open_stream,
+                                                         serialize_vllm_model)
+
+from ..utils import ServerRunner
+
+# yapf conflicts with isort for this docstring
+

 prompts = [
    "Hello, my name is",
@@ -39,7 +47,7 @@ def is_curl_installed():

 @pytest.fixture(autouse=True)
 def tensorizer_config():
-    config = TensorizerConfig(tensorizer_uri="vllm", vllm_tensorized=True)
+    config = TensorizerConfig(tensorizer_uri="vllm")
    return config


@@ -58,47 +66,6 @@ def test_load_with_tensorizer(mock_agent, tensorizer_config):
    assert result == mock_agent_instance.deserialize.return_value


-def test_is_vllm_model_with_vllm_in_uri(tensorizer_config):
-    tensorizer_config.vllm_tensorized = True
-
-    result = is_vllm_serialized_tensorizer(tensorizer_config)
-
-    assert result is True
-
-
-def test_is_vllm_model_without_vllm_in_uri(tensorizer_config):
-    tensorizer_config.vllm_tensorized = False
-
-    result = is_vllm_serialized_tensorizer(tensorizer_config)
-
-    assert result is False
-
-
-def test_deserialized_vllm_model_has_same_outputs(vllm_runner, tmp_path):
-    vllm_model = vllm_runner(model_ref)
-    model_path = tmp_path / (model_ref + ".tensors")
-    outputs = vllm_model.generate(prompts, sampling_params)
-    model = (vllm_model.model.llm_engine.model_executor.driver_worker.
-             model_runner.model)
-    with open_stream(model_path, "wb+") as stream:
-        serializer = TensorSerializer(stream)
-        serializer.write_module(model)
-    del vllm_model, model
-    gc.collect()
-    torch.cuda.empty_cache()
-    loaded_vllm_model = vllm_runner(
-        model_ref,
-        load_format="tensorizer",
-        model_loader_extra_config=TensorizerConfig(tensorizer_uri=model_path,
-                                                   num_readers=1,
-                                                   vllm_tensorized=True),
-    )
-    deserialized_outputs = loaded_vllm_model.generate(prompts, sampling_params)
-
-    # Assumes SamplingParams being seeded ensures the outputs are deterministic
-    assert outputs == deserialized_outputs
-
-
 @pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed")
 def test_can_deserialize_s3(vllm_runner):
    model_ref = "EleutherAI/pythia-1.4b"
@@ -109,7 +76,6 @@ def test_can_deserialize_s3(vllm_runner):
                                  model_loader_extra_config=TensorizerConfig(
                                      tensorizer_uri=tensorized_path,
                                      num_readers=1,
-                                      vllm_tensorized=False,
                                      s3_endpoint="object.ord1.coreweave.com",
                                  ))

@@ -125,29 +91,26 @@ def test_deserialized_encrypted_vllm_model_has_same_outputs(
    model_path = tmp_path / (model_ref + ".tensors")
    key_path = tmp_path / (model_ref + ".key")
    outputs = vllm_model.generate(prompts, sampling_params)
-    model = (vllm_model.model.llm_engine.model_executor.driver_worker.
-             model_runner.model)

-    encryption_params = EncryptionParams.random()
-    with open_stream(model_path, "wb+") as stream:
-        serializer = TensorSerializer(stream, encryption=encryption_params)
-        serializer.write_module(model)
-    with open_stream(key_path, "wb+") as stream:
-        stream.write(encryption_params.key)
-    del vllm_model, model
+    config_for_serializing = TensorizerConfig(tensorizer_uri=model_path)
+    serialize_vllm_model(vllm_model.model.llm_engine,
+                         config_for_serializing,
+                         encryption_key_path=key_path)
+
+    del vllm_model
    gc.collect()
    torch.cuda.empty_cache()
-    loaded_vllm_model = vllm_runner(model_ref,
-                                    load_format="tensorizer",
-                                    model_loader_extra_config=TensorizerConfig(
-                                        tensorizer_uri=model_path,
-                                        encryption_keyfile=key_path,
-                                        num_readers=1,
-                                        vllm_tensorized=True))
+
+    config_for_deserializing = TensorizerConfig(tensorizer_uri=model_path,
+                                                encryption_keyfile=key_path)
+
+    loaded_vllm_model = vllm_runner(
+        model_ref,
+        load_format="tensorizer",
+        model_loader_extra_config=config_for_deserializing)

    deserialized_outputs = loaded_vllm_model.generate(prompts, sampling_params)

-    # Assumes SamplingParams being seeded ensures the outputs are deterministic
    assert outputs == deserialized_outputs


@@ -168,7 +131,7 @@ def test_deserialized_hf_model_has_same_outputs(hf_runner, vllm_runner,
                                  model_loader_extra_config=TensorizerConfig(
                                      tensorizer_uri=model_path,
                                      num_readers=1,
-                                      vllm_tensorized=False))
+                                  ))

    deserialized_outputs = loaded_hf_model.generate_greedy(
        prompts, max_tokens=max_tokens)
@@ -189,12 +152,11 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
    # Serialize model before deserializing and binding LoRA adapters
    vllm_model = vllm_runner(model_ref, )
    model_path = tmp_path / (model_ref + ".tensors")
-    model = (vllm_model.model.llm_engine.model_executor.driver_worker.
-             model_runner.model)
-    with open_stream(model_path, "wb+") as stream:
-        serializer = TensorSerializer(stream)
-        serializer.write_module(model)
-    del vllm_model, model
+
+    serialize_vllm_model(vllm_model.model.llm_engine,
+                         TensorizerConfig(tensorizer_uri=model_path))
+
+    del vllm_model
    gc.collect()
    torch.cuda.empty_cache()
    loaded_vllm_model = vllm_runner(
@@ -203,7 +165,6 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
        model_loader_extra_config=TensorizerConfig(
            tensorizer_uri=model_path,
            num_readers=1,
-            vllm_tensorized=True,
        ),
        enable_lora=True,
        max_loras=1,
@@ -219,58 +180,28 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):

 def test_load_without_tensorizer_load_format(vllm_runner):
    with pytest.raises(ValueError):
-        vllm_runner(model_ref,
-                    model_loader_extra_config=TensorizerConfig(
-                        tensorizer_uri="test", vllm_tensorized=False))
-
-
-@pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed")
-def test_tensorize_vllm_model(tmp_path):
-    # Test serialize command
-    serialize_args = [
-        "python3", tensorize_model_for_testing_script, "--model", model_ref,
-        "--dtype", "float16", "serialize", "--serialized-directory", tmp_path,
-        "--suffix", "tests"
-    ]
-    result = subprocess.run(serialize_args, capture_output=True, text=True)
-    print(result.stdout)  # Print the output of the serialize command
-
-    assert result.returncode == 0, (f"Serialize command failed with output:"
-                                    f"\n{result.stdout}\n{result.stderr}")
-
-    path_to_tensors = f"{tmp_path}/vllm/{model_ref}/tests/model.tensors"
-
-    # Test deserialize command
-    deserialize_args = [
-        "python3", tensorize_model_for_testing_script, "--model", model_ref,
-        "--dtype", "float16", "deserialize", "--path-to-tensors",
-        path_to_tensors
-    ]
-    result = subprocess.run(deserialize_args, capture_output=True, text=True)
-    assert result.returncode == 0, (f"Deserialize command failed with output:"
-                                    f"\n{result.stdout}\n{result.stderr}")
+        vllm_runner(
+            model_ref,
+            model_loader_extra_config=TensorizerConfig(tensorizer_uri="test"))


 @pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed")
-def test_openai_apiserver_with_tensorizer(tmp_path):
+def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path):
    ## Serialize model
-    serialize_args = [
-        "python3", tensorize_model_for_testing_script, "--model", model_ref,
-        "--dtype", "float16", "serialize", "--serialized-directory", tmp_path,
-        "--suffix", "tests"
-    ]
-    result = subprocess.run(serialize_args, capture_output=True, text=True)
-    print(result.stdout)  # Print the output of the serialize command
+    vllm_model = vllm_runner(model_ref, )
+    model_path = tmp_path / (model_ref + ".tensors")

-    assert result.returncode == 0, (f"Serialize command failed with output:"
-                                    f"\n{result.stdout}\n{result.stderr}")
+    serialize_vllm_model(vllm_model.model.llm_engine,
+                         TensorizerConfig(tensorizer_uri=model_path))

-    path_to_tensors = f"{tmp_path}/vllm/{model_ref}/tests/model.tensors"
    model_loader_extra_config = {
-        "tensorizer_uri": path_to_tensors,
-        "vllm_tensorized": True
+        "tensorizer_uri": str(model_path),
    }

+    del vllm_model
+    gc.collect()
+    torch.cuda.empty_cache()
+
    ## Start OpenAI API server
    openai_args = [
        "--model", model_ref, "--dtype", "float16", "--load-format",
@@ -303,10 +234,10 @@ def test_openai_apiserver_with_tensorizer(tmp_path):

 def test_raise_value_error_on_invalid_load_format(vllm_runner):
    with pytest.raises(ValueError):
-        vllm_runner(model_ref,
-                    load_format="safetensors",
-                    model_loader_extra_config=TensorizerConfig(
-                        tensorizer_uri="test", vllm_tensorized=False))
+        vllm_runner(
+            model_ref,
+            load_format="safetensors",
+            model_loader_extra_config=TensorizerConfig(tensorizer_uri="test"))


 def test_tensorizer_with_tp(vllm_runner):
@@ -320,8 +251,29 @@ def test_tensorizer_with_tp(vllm_runner):
            model_loader_extra_config=TensorizerConfig(
                tensorizer_uri=tensorized_path,
                num_readers=1,
-                vllm_tensorized=False,
                s3_endpoint="object.ord1.coreweave.com",
            ),
            tensor_parallel_size=2,
        )
+
+
+def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path):
+    """Serialize a vLLM model with tensorizer, reload it, compare outputs.
+
+    The reference outputs come from the model that is serialized; the second
+    set comes from a model loaded with ``load_format="tensorizer"`` and the
+    same ``TensorizerConfig``. Both runs use the module-level ``prompts`` and
+    ``sampling_params``.
+    """
+    model_ref = "facebook/opt-125m"
+    tensors_file = tmp_path / (model_ref + ".tensors")
+    tensorizer_config = TensorizerConfig(tensorizer_uri=str(tensors_file))
+
+    # Produce the reference outputs, then serialize that same engine.
+    reference_model = vllm_runner(model_ref)
+    expected_outputs = reference_model.generate(prompts, sampling_params)
+    serialize_vllm_model(reference_model.model.llm_engine, tensorizer_config)
+    assert is_vllm_tensorized(tensorizer_config)
+
+    # Drop the first model and empty the CUDA cache before loading again.
+    del reference_model
+    gc.collect()
+    torch.cuda.empty_cache()
+
+    reloaded_model = vllm_runner(
+        model_ref,
+        load_format="tensorizer",
+        model_loader_extra_config=tensorizer_config,
+    )
+    actual_outputs = reloaded_model.generate(prompts, sampling_params)
+
+    assert expected_outputs == actual_outputs
--- a/tests/test_cache_block_hashing.py
+++ b/tests/test_cache_block_hashing.py
@@ -70,8 +70,14 @@ def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int,
            for prompt in prompts:
                hashes[-1].append([])
                prompt_token_ids = tokenizer.encode(prompt)
-                seq = Sequence(seq_id, prompt, prompt_token_ids, block_size,
-                               tokenizer.tokenizer.eos_token_id, lora_request)
+                seq = Sequence(seq_id,
+                               inputs={
+                                   "prompt": prompt,
+                                   "prompt_token_ids": prompt_token_ids,
+                               },
+                               block_size=block_size,
+                               eos_token_id=tokenizer.tokenizer.eos_token_id,
+                               lora_request=lora_request)

                num_blocks = len(prompt_token_ids) // block_size
                for idx in range(num_blocks):

--- a/tests/test_config.py
+++ b/tests/test_config.py
+import pytest
+
 from vllm.config import ModelConfig

+# (model id, expected ``max_model_len``) pairs consumed by
+# ``test_disable_sliding_window``.
+MODEL_IDS_EXPECTED = [
+    ("Qwen/Qwen1.5-7B", 32768),
+    ("mistralai/Mistral-7B-v0.1", 4096),
+    ("mistralai/Mistral-7B-Instruct-v0.2", 32768),
+]
+
+
+@pytest.mark.parametrize("model_id_expected", MODEL_IDS_EXPECTED)
+def test_disable_sliding_window(model_id_expected):
+    """Check ``max_model_len`` when ``disable_sliding_window=True``.
+
+    ``model_id_expected`` is one ``(model id, expected max_model_len)`` pair;
+    the model id is passed as both the model and the tokenizer argument.
+    """
+    model_id, expected_max_len = model_id_expected
+    config_kwargs = {
+        "tokenizer_mode": "auto",
+        "trust_remote_code": False,
+        "seed": 0,
+        "dtype": "float16",
+        "revision": None,
+        "disable_sliding_window": True,
+    }
+    model_config = ModelConfig(model_id, model_id, **config_kwargs)
+    assert model_config.max_model_len == expected_max_len
+
+

 def test_get_sliding_window():
    TEST_SLIDING_WINDOW = 4096
@@ -36,4 +60,58 @@ def test_get_sliding_window():
    assert mistral_model_config.get_sliding_window() is None

    mistral_model_config.hf_config.sliding_window = TEST_SLIDING_WINDOW
-    assert mistral_model_config.get_sliding_window() == TEST_SLIDING_WINDOW
\ No newline at end of file
+    assert mistral_model_config.get_sliding_window() == TEST_SLIDING_WINDOW
+
+
+def test_rope_scaling():
+    """Check ``rope_scaling`` and ``max_model_len`` with and without override.
+
+    Each model is built twice: once with its own configuration and once with
+    ``rope_scaling=TEST_ROPE_SCALING``. After each build the test checks the
+    ``rope_scaling`` found on ``hf_config`` and the ``max_model_len``.
+    """
+    TEST_ROPE_SCALING = {"type": "dynamic", "factor": 2.0}
+    LONGCHAT_ROPE_SCALING = {"type": "linear", "factor": 8.0}
+
+    def build_config(model_name, **overrides):
+        # The same name is used for the model and the tokenizer arguments.
+        return ModelConfig(
+            model_name,
+            model_name,
+            tokenizer_mode="auto",
+            trust_remote_code=False,
+            dtype="float16",
+            seed=0,
+            **overrides,
+        )
+
+    def rope_scaling_of(model_config):
+        # ``None`` when the HF config carries no ``rope_scaling`` attribute.
+        return getattr(model_config.hf_config, "rope_scaling", None)
+
+    llama_name = "meta-llama/Meta-Llama-3-8B-Instruct"
+    longchat_name = "lmsys/longchat-13b-16k"
+
+    # Llama 3 without an override.
+    llama_model_config = build_config(llama_name)
+    assert rope_scaling_of(llama_model_config) is None
+    assert llama_model_config.max_model_len == 8192
+
+    # Llama 3 with the override.
+    llama_model_config = build_config(llama_name,
+                                      rope_scaling=TEST_ROPE_SCALING)
+    assert rope_scaling_of(llama_model_config) == TEST_ROPE_SCALING
+    assert llama_model_config.max_model_len == 16384
+
+    # LongChat without an override.
+    longchat_model_config = build_config(longchat_name)
+    assert rope_scaling_of(longchat_model_config) == LONGCHAT_ROPE_SCALING
+    assert longchat_model_config.max_model_len == 16384
+
+    # LongChat with the override.
+    longchat_model_config = build_config(longchat_name,
+                                         rope_scaling=TEST_ROPE_SCALING)
+    assert rope_scaling_of(longchat_model_config) == TEST_ROPE_SCALING
+    assert longchat_model_config.max_model_len == 4096
--- a/tests/test_inputs.py
+++ b/tests/test_inputs.py
+from typing import List
+
+import pytest
+
+from vllm.inputs import parse_and_batch_prompt
+
+# String prompts, from the empty string up to four words. Used one at a time
+# by test_parse_single_batch_string_consistent and as a whole list by
+# test_parse_single_batch_string_slice.
+STRING_INPUTS = [
+    '',
+    'foo',
+    'foo bar',
+    'foo baz bar',
+    'foo bar qux baz',
+]
+
+# Token-id prompts of one to four ids. Used one at a time by
+# test_parse_single_batch_token_consistent.
+TOKEN_INPUTS = [
+    [-1],
+    [1],
+    [1, 2],
+    [1, 3, 4],
+    [1, 2, 4, 3],
+]
+
+# Slices (reversed, every second item, every second item reversed) applied
+# to STRING_INPUTS by test_parse_single_batch_string_slice.
+INPUTS_SLICES = [
+    slice(None, None, -1),
+    slice(None, None, 2),
+    slice(None, None, -2),
+]
+
+
+def test_parse_single_batch_empty():
+    """An empty list and a list holding one empty list are both rejected."""
+    for empty_batch in ([], [[]]):
+        with pytest.raises(ValueError, match="at least one prompt"):
+            parse_and_batch_prompt(empty_batch)
+
+
+@pytest.mark.parametrize('string_input', STRING_INPUTS)
+def test_parse_single_batch_string_consistent(string_input: str):
+    """A bare string parses the same as a one-element list holding it."""
+    parsed_bare = parse_and_batch_prompt(string_input)
+    parsed_wrapped = parse_and_batch_prompt([string_input])
+    assert parsed_bare == parsed_wrapped
+
+
+@pytest.mark.parametrize('token_input', TOKEN_INPUTS)
+def test_parse_single_batch_token_consistent(token_input: List[int]):
+    """A bare token-id list parses the same as a one-element list of it."""
+    parsed_bare = parse_and_batch_prompt(token_input)
+    parsed_wrapped = parse_and_batch_prompt([token_input])
+    assert parsed_bare == parsed_wrapped
+
+
+@pytest.mark.parametrize('inputs_slice', INPUTS_SLICES)
+def test_parse_single_batch_string_slice(inputs_slice: slice):
+    """Slicing the parsed batch equals parsing the sliced inputs."""
+    sliced_after_parsing = parse_and_batch_prompt(STRING_INPUTS)[inputs_slice]
+    parsed_after_slicing = parse_and_batch_prompt(STRING_INPUTS[inputs_slice])
+    assert sliced_after_parsing == parsed_after_slicing
--- a/tests/test_logits_processor.py
+++ b/tests/test_logits_processor.py
@@ -9,7 +9,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.model_executor.utils import set_random_seed
 from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata
-from vllm.worker.model_runner import ModelRunner
+from vllm.utils import is_pin_memory_available


 class MockLogitsProcessor(LogitsProcessor):
@@ -30,21 +30,15 @@ class MockLogitsProcessor(LogitsProcessor):


 def _prepare_test(
-    batch_size: int
-) -> Tuple[torch.Tensor, torch.Tensor, MockLogitsProcessor, ModelRunner]:
+        batch_size: int
+) -> Tuple[torch.Tensor, torch.Tensor, MockLogitsProcessor]:
    vocab_size = 32000
    input_tensor = torch.rand((batch_size, 1024), dtype=torch.float16)
    fake_logits = torch.full((batch_size, vocab_size),
                             1e-2,
                             dtype=input_tensor.dtype)
    logits_processor = MockLogitsProcessor(32000, 0.5, fake_logits)
-    model_runner = ModelRunner(model_config=None,
-                               parallel_config=None,
-                               scheduler_config=None,
-                               device_config=None,
-                               load_config=None,
-                               lora_config=None)
-    return input_tensor, fake_logits, logits_processor, model_runner
+    return input_tensor, fake_logits, logits_processor


 RANDOM_SEEDS = list(range(128))
@@ -59,8 +53,7 @@ def test_logits_processors(seed: int, device: str):
    set_random_seed(seed)
    torch.set_default_device(device)
    batch_size = random.randint(1, 256)
-    input_tensor, fake_logits, logits_processor, model_runner = _prepare_test(
-        batch_size)
+    input_tensor, fake_logits, logits_processor = _prepare_test(batch_size)

    # This sample logits processor gives infinite score to the i-th token,
    # where i is the length of the input sequence.
@@ -87,8 +80,8 @@ def test_logits_processors(seed: int, device: str):
        seq_group_metadata_list,
        seq_lens,
        query_lens=seq_lens,
-        device=model_runner.device,
-        pin_memory=model_runner.pin_memory)
+        device=device,
+        pin_memory=is_pin_memory_available())
    logits_processor_output = logits_processor(
        embedding=None,
        hidden_states=input_tensor,
@@ -99,5 +92,3 @@ def test_logits_processors(seed: int, device: str):
    fake_logits *= logits_processor.scale
    assert torch.allclose(logits_processor_output[:, 1], fake_logits[:, 1],
                          1e-4)
-
-    del model_runner
--- a/tests/test_sequence.py
+++ b/tests/test_sequence.py
-import time
-from typing import Optional
-
 import pytest

-from vllm import SamplingParams
-from vllm.lora.request import LoRARequest
-from vllm.sequence import (SamplerOutput, Sequence, SequenceData,
-                           SequenceGroup, SequenceGroupOutput, SequenceOutput)
-
-
-def create_dummy_prompt(
-    request_id: str,
-    prompt_length: int,
-    block_size: Optional[int] = None,
-    lora_request: Optional[LoRARequest] = None,
-    use_beam_search: bool = False,
-    best_of: int = 1,
-) -> SequenceGroup:
-    if not block_size:
-        block_size = prompt_length
-
-    # Create dummy prompt sequence with tokens 0...block_size-1
-    # and prompt "0 ... block_size".
-    prompt_tokens = list(range(prompt_length))
-    prompt_str = " ".join([str(t) for t in prompt_tokens])
-    prompt = Sequence(int(request_id), prompt_str, prompt_tokens, block_size)
-    seq_group = SequenceGroup(
-        request_id, [prompt],
-        SamplingParams(use_beam_search=use_beam_search, best_of=best_of),
-        time.time(), lora_request)
+from vllm.sequence import (CompletionSequenceGroupOutput, SamplerOutput,
+                           SequenceData, SequenceOutput)

-    return seq_group
+from .core.utils import create_dummy_prompt


 @pytest.fixture
 def sample_outputs():
    return [
-        SequenceGroupOutput(samples=[
+        CompletionSequenceGroupOutput(samples=[
            SequenceOutput(parent_seq_id=0, output_token=i, logprobs={})
        ],
-                            prompt_logprobs=None) for i in range(5)
+                                      prompt_logprobs=None) for i in range(5)
    ]


@@ -60,10 +33,10 @@ def test_sampler_output_getitem(sampler_output, sample_outputs):


 def test_sampler_output_setitem(sampler_output):
-    new_output = SequenceGroupOutput(samples=[
+    new_output = CompletionSequenceGroupOutput(samples=[
        SequenceOutput(parent_seq_id=0, output_token=99, logprobs={})
    ],
-                                     prompt_logprobs=None)
+                                               prompt_logprobs=None)
    sampler_output[2] = new_output
    assert sampler_output[2] == new_output

@@ -102,7 +75,7 @@ def test_sequence_data_prefill():


 def test_sequence_group_stage():
-    seq_group = create_dummy_prompt("1", 12)
+    _, seq_group = create_dummy_prompt("1", 12)
    assert seq_group.is_prefill() is True
    seq_group.update_num_computed_tokens(6)
    assert seq_group.is_prefill() is True

--- a/tests/test_sharded_state_loader.py
+++ b/tests/test_sharded_state_loader.py
+import os
+import shutil
+from tempfile import TemporaryDirectory
+
+import pytest
+import torch
+from huggingface_hub import snapshot_download
+
+from vllm import LLM, SamplingParams
+from vllm.model_executor.model_loader.loader import ShardedStateLoader
+
+prompts = [
+    "Hello, my name is",
+    "The president of the United States is",
+    "The capital of France is",
+    "The future of AI is",
+]
+
+# Create a sampling params object.
+sampling_params = SamplingParams(
+    temperature=0.8,
+    top_p=0.95,
+    seed=0,
+    max_tokens=256,
+    ignore_eos=True,
+)
+
+
+def test_filter_subtensors():
+    """Check that ``_filter_subtensors`` keeps only ``a``, ``b`` and ``c``.
+
+    The input maps ``a``, ``b`` and ``c`` to freshly allocated tensors, and
+    ``x``, ``y`` and ``z`` to ``b`` itself and to two slices of ``c``.
+    """
+    vec = torch.empty(2)
+    mat = torch.empty((2, 4))
+    cube = torch.empty((2, 4, 8))
+    state_dict = {
+        "a": vec,
+        "b": mat,
+        "c": cube,
+        # The entries below reuse the tensors above: one alias, two slices.
+        "x": mat,
+        "y": cube[1, 2, :],
+        "z": cube[1, :, 4],
+    }
+
+    filtered_state_dict = ShardedStateLoader._filter_subtensors(state_dict)
+
+    assert tuple(filtered_state_dict.keys()) == ("a", "b", "c")
+    for name, kept in filtered_state_dict.items():
+        assert kept.equal(state_dict[name])
+
+
+@pytest.mark.parametrize("enable_lora", [False, True])
+def test_sharded_state_loader(enable_lora):
+    """Compare generations from the original and the sharded-state checkpoint.
+
+    Steps:
+      1. Download the model snapshot and load it once to dump the worker
+         state into ``output_dir`` via ``save_sharded_state``.
+      2. Copy the snapshot's non-weight files next to the dumped state.
+      3. Generate with a model loaded from the snapshot and with a model
+         loaded from ``output_dir`` using ``load_format="sharded_state"``.
+      4. Assert that both produce identical first outputs.
+
+    Args:
+        enable_lora: Forwarded as ``enable_lora`` to both compared ``LLM``
+            instances (the one used only for dumping state does not set it).
+    """
+    # Local import: glob-style matching for the weight-file patterns below.
+    import fnmatch
+
+    weights_patterns = ("*.bin", "*.pt", "*.safetensors")
+
+    with TemporaryDirectory() as cache_dir, TemporaryDirectory() as output_dir:
+        input_dir = snapshot_download("meta-llama/Llama-2-7b-hf",
+                                      cache_dir=cache_dir)
+
+        llm = LLM(
+            model=input_dir,
+            worker_use_ray=True,
+            gpu_memory_utilization=0.3,
+        )
+
+        # Dump worker states to output directory
+        model_executor = llm.llm_engine.model_executor
+        model_executor.save_sharded_state(path=output_dir)
+        # Copy metadata files to output directory.
+        # The patterns are globs, so they must be matched with fnmatch:
+        # ``str.endswith("*.bin")`` looks for a literal "*" and never matches,
+        # which would copy the original weight files as well.
+        for file in os.listdir(input_dir):
+            is_weight_file = any(
+                fnmatch.fnmatch(file, pattern) for pattern in weights_patterns)
+            if not is_weight_file:
+                shutil.copy(os.path.join(input_dir, file), output_dir)
+        del llm.llm_engine.model_executor
+
+        llm_before = LLM(
+            model=input_dir,
+            worker_use_ray=True,
+            enable_lora=enable_lora,
+            gpu_memory_utilization=0.3,
+        )
+        gen_before = llm_before.generate(prompts, sampling_params)
+        out_before = [gen.outputs[0].__dict__ for gen in gen_before]
+        del llm_before.llm_engine.model_executor
+
+        llm_after = LLM(
+            model=output_dir,
+            worker_use_ray=True,
+            enable_lora=enable_lora,
+            gpu_memory_utilization=0.3,
+            load_format="sharded_state",
+        )
+        gen_after = llm_after.generate(prompts, sampling_params)
+        out_after = [gen.outputs[0].__dict__ for gen in gen_after]
+        del llm_after.llm_engine.model_executor
+
+        assert out_before == out_after
--- a/tests/async_engine/test_merge_async_iterators.py
+++ b/tests/async_engine/test_merge_async_iterators.py
 import asyncio
-from typing import AsyncIterator, Tuple
+import sys
+from typing import (TYPE_CHECKING, Any, AsyncIterator, Awaitable, Protocol,
+                    Tuple, TypeVar)

 import pytest

-from vllm.utils import merge_async_iterators
+from vllm.utils import deprecate_kwargs, merge_async_iterators
+
+from .utils import error_on_warning
+
+# The ``anext`` builtin only exists from Python 3.10 on, so older
+# interpreters get a module-level stand-in with the same name.
+if sys.version_info < (3, 10):
+    if TYPE_CHECKING:
+        # Defined for static type checkers only; the annotations on ``anext``
+        # below are string literals, so these names are not needed at runtime.
+        _AwaitableT = TypeVar("_AwaitableT", bound=Awaitable[Any])
+        _AwaitableT_co = TypeVar("_AwaitableT_co",
+                                 bound=Awaitable[Any],
+                                 covariant=True)
+
+        # Structural type: anything with an ``__anext__`` method.
+        class _SupportsSynchronousAnext(Protocol[_AwaitableT_co]):
+
+            def __anext__(self) -> _AwaitableT_co:
+                ...
+
+    def anext(i: "_SupportsSynchronousAnext[_AwaitableT]", /) -> "_AwaitableT":
+        """Return ``i.__anext__()``; ``i`` is positional-only."""
+        return i.__anext__()


 @pytest.mark.asyncio
@@ -39,3 +58,61 @@ async def test_merge_async_iterators():
            print("Iterator was cancelled normally")
        except (Exception, asyncio.CancelledError) as e:
            raise AssertionError() from e
+
+
+def test_deprecate_kwargs_always():
+    """With ``is_deprecated=True`` only the deprecated keyword warns."""
+
+    @deprecate_kwargs("old_arg", is_deprecated=True)
+    def target(*, old_arg: object = None, new_arg: object = None):
+        pass
+
+    # Passing the deprecated keyword must warn and name it.
+    with pytest.warns(DeprecationWarning, match="'old_arg'"):
+        target(old_arg=1)
+
+    # Passing the other keyword must not emit any warning.
+    with error_on_warning():
+        target(new_arg=1)
+
+
+def test_deprecate_kwargs_never():
+    """With ``is_deprecated=False`` neither keyword emits a warning."""
+
+    @deprecate_kwargs("old_arg", is_deprecated=False)
+    def target(*, old_arg: object = None, new_arg: object = None):
+        pass
+
+    for call_kwargs in ({"old_arg": 1}, {"new_arg": 1}):
+        with error_on_warning():
+            target(**call_kwargs)
+
+
+def test_deprecate_kwargs_dynamic():
+    """A callable ``is_deprecated`` is consulted again on later calls."""
+    deprecated_now = True
+
+    # The lambda reads ``deprecated_now`` when called, so rebinding the
+    # variable below changes what the decorator sees.
+    @deprecate_kwargs("old_arg", is_deprecated=lambda: deprecated_now)
+    def target(*, old_arg: object = None, new_arg: object = None):
+        pass
+
+    # While the flag is set, only the deprecated keyword warns.
+    with pytest.warns(DeprecationWarning, match="'old_arg'"):
+        target(old_arg=1)
+
+    with error_on_warning():
+        target(new_arg=1)
+
+    deprecated_now = False
+
+    # Once the flag is cleared, neither keyword warns.
+    for call_kwargs in ({"old_arg": 1}, {"new_arg": 1}):
+        with error_on_warning():
+            target(**call_kwargs)
+
+
+def test_deprecate_kwargs_additional_message():
+    """The ``additional_message`` text shows up in the emitted warning."""
+
+    @deprecate_kwargs("old_arg", is_deprecated=True, additional_message="abcd")
+    def target(*, old_arg: object = None, new_arg: object = None):
+        pass
+
+    with pytest.warns(DeprecationWarning, match="abcd"):
+        target(old_arg=1)
--- a/tests/tokenization/test_detokenize.py
+++ b/tests/tokenization/test_detokenize.py
@@ -123,8 +123,10 @@ def create_sequence(prompt_token_ids=None):
    prompt_token_ids = prompt_token_ids or [1]
    return Sequence(
        seq_id=0,
-        prompt="<s>",
-        prompt_token_ids=prompt_token_ids,
+        inputs={
+            "prompt": "<s>",
+            "prompt_token_ids": prompt_token_ids,
+        },
        block_size=16,
    )


--- a/tests/utils.py
+++ b/tests/utils.py
+import os
+import subprocess
+import sys
+import time
+import warnings
+from contextlib import contextmanager
+
+import ray
+import requests
+
+from vllm.distributed import (ensure_model_parallel_initialized,
+                              init_distributed_environment)
+from vllm.utils import get_open_port
+
+# Path to root of repository so that utilities can be imported by ray workers
+VLLM_PATH = os.path.abspath(os.path.join(__file__, os.pardir, os.pardir))
+
+
+@ray.remote(num_gpus=1)
+class ServerRunner:
+    """Ray remote class that runs the OpenAI API server in a subprocess.
+
+    The constructor starts ``vllm.entrypoints.openai.api_server`` with the
+    given command-line arguments and blocks until its health endpoint
+    answers with HTTP 200. The subprocess is terminated in ``__del__``.
+    """
+
+    # Upper bound, in seconds, on how long to wait for the server to start.
+    MAX_SERVER_START_WAIT_S = 600
+
+    def __init__(self, args):
+        """Start the server subprocess and wait for it to become healthy.
+
+        Args:
+            args: List of command-line arguments appended to the server
+                invocation.
+
+        Raises:
+            RuntimeError: If the server exits or does not become healthy
+                within ``MAX_SERVER_START_WAIT_S`` seconds.
+        """
+        env = os.environ.copy()
+        # Unbuffered output so server logs show up immediately.
+        env["PYTHONUNBUFFERED"] = "1"
+        self.proc = subprocess.Popen(
+            ["python3", "-m", "vllm.entrypoints.openai.api_server"] + args,
+            env=env,
+            stdout=sys.stdout,
+            stderr=sys.stderr,
+        )
+        self._wait_for_server()
+
+    def ready(self):
+        """Return ``True``; callable only once construction has finished."""
+        return True
+
+    def _wait_for_server(self):
+        """Poll the health endpoint until it returns HTTP 200.
+
+        Raises:
+            RuntimeError: If the subprocess has exited, or if the deadline
+                of ``MAX_SERVER_START_WAIT_S`` seconds has passed.
+        """
+        # run health check
+        start = time.time()
+        while True:
+            last_err = None
+            try:
+                if requests.get(
+                        "http://localhost:8000/health").status_code == 200:
+                    return
+            except Exception as err:
+                last_err = err
+
+            # Reached after a failed request AND after a non-200 answer, so
+            # an unhealthy-but-responding server can neither spin this loop
+            # without sleeping nor wait past the deadline.
+            if self.proc.poll() is not None:
+                raise RuntimeError(
+                    "Server exited unexpectedly.") from last_err
+
+            time.sleep(0.5)
+            if time.time() - start > self.MAX_SERVER_START_WAIT_S:
+                raise RuntimeError(
+                    "Server failed to start in time.") from last_err
+
+    def __del__(self):
+        # ``proc`` is missing when ``Popen`` itself failed in ``__init__``.
+        if hasattr(self, "proc"):
+            self.proc.terminate()
+
+
+def init_test_distributed_environment(
+    tp_size: int,
+    pp_size: int,
+    rank: int,
+    distributed_init_port: str,
+    local_rank: int = -1,
+) -> None:
+    """Initialize the distributed environment and model-parallel state.
+
+    Args:
+        tp_size: Tensor-parallel size.
+        pp_size: Pipeline-parallel size.
+        rank: Rank of the calling process.
+        distributed_init_port: Port used in the ``tcp://localhost`` init URL.
+        local_rank: Passed through to ``init_distributed_environment``.
+    """
+    dist_kwargs = {
+        # One process per (pipeline stage, tensor shard) pair.
+        "world_size": pp_size * tp_size,
+        "rank": rank,
+        "distributed_init_method": f"tcp://localhost:{distributed_init_port}",
+        "local_rank": local_rank,
+    }
+    init_distributed_environment(**dist_kwargs)
+    ensure_model_parallel_initialized(tp_size, pp_size)
+
+
+def multi_process_tensor_parallel(
+    tp_size: int,
+    pp_size: int,
+    test_target,
+) -> None:
+    """Run ``test_target`` once per rank as Ray remote tasks and wait.
+
+    Args:
+        tp_size: Tensor-parallel size.
+        pp_size: Pipeline-parallel size.
+        test_target: Object exposing ``.remote(tp_size, pp_size, rank,
+            distributed_init_port)``; it is invoked for every rank in
+            ``range(tp_size * pp_size)``.
+
+    Raises:
+        Whatever ``ray.get`` raises when one of the remote calls fails.
+    """
+    # Using ray helps debugging the error when it failed
+    # as compared to multiprocessing.
+    ray.init(runtime_env={"working_dir": VLLM_PATH})
+    try:
+        distributed_init_port = get_open_port()
+        refs = [
+            test_target.remote(tp_size, pp_size, rank, distributed_init_port)
+            for rank in range(tp_size * pp_size)
+        ]
+        ray.get(refs)
+    finally:
+        # Shut Ray down even when a remote call raised, so a failing test
+        # does not leave a live Ray session behind for the tests after it.
+        ray.shutdown()
+
+
+@contextmanager
+def error_on_warning():
+    """
+    Context manager that turns every warning raised in its body into an
+    exception, so a test fails as soon as any warning is emitted.
+
+    The previous warning filters are restored on exit.
+    """
+    with warnings.catch_warnings():
+        # Escalate all warnings to errors for the duration of the block.
+        warnings.simplefilter(action="error")
+        yield
--- a/tests/worker/test_model_runner.py
+++ b/tests/worker/test_model_runner.py
 import pytest
 import torch

-from vllm.config import ModelConfig, SchedulerConfig
 from vllm.distributed.parallel_state import init_distributed_environment
+from vllm.engine.arg_utils import EngineArgs
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata
 from vllm.utils import get_open_port
 from vllm.worker.model_runner import ModelRunner, _get_graph_batch_size


+def _create_model_runner(model: str, *args, **kwargs) -> ModelRunner:
+    """Build a driver-worker ``ModelRunner`` from ``EngineArgs``.
+
+    ``model`` and any extra positional/keyword arguments are forwarded to
+    ``EngineArgs``; the engine config it creates supplies every sub-config
+    handed to ``ModelRunner``.
+    """
+    config = EngineArgs(model, *args, **kwargs).create_engine_config()
+    return ModelRunner(
+        model_config=config.model_config,
+        parallel_config=config.parallel_config,
+        scheduler_config=config.scheduler_config,
+        device_config=config.device_config,
+        cache_config=config.cache_config,
+        load_config=config.load_config,
+        lora_config=config.lora_config,
+        is_driver_worker=True,
+    )
+
+
 @pytest.mark.parametrize("batch_size", list(range(1, 257)))
 def test_prepare_prompt(batch_size):
-    scheduler_config = SchedulerConfig(100000,
-                                       100000,
-                                       100000,
-                                       enable_chunked_prefill=False)
-    model_runner = ModelRunner(model_config=None,
-                               parallel_config=None,
-                               scheduler_config=scheduler_config,
-                               device_config=None,
-                               load_config=None,
-                               lora_config=None)
-    model_runner.set_block_size(16)
+    model_runner = _create_model_runner(
+        "facebook/opt-125m",
+        max_num_batched_tokens=100000,
+        max_num_seqs=100000,
+        enable_chunked_prefill=False,
+    )

    seq_lens = []
    seq_group_metadata_list = []
@@ -47,19 +58,25 @@ def test_prepare_prompt(batch_size):
        expected_selected_token_indices.append(selected_token_start_idx +
                                               seq_len - 1)
        selected_token_start_idx += seq_len
-    (input_tokens, input_positions, attn_metadata, return_seq_lens, _, _, _, _,
-     _, slot_mapping) = (model_runner._prepare_prompt(seq_group_metadata_list))
+    model_input = model_runner._prepare_model_input(seq_group_metadata_list)
+    input_tokens = model_input.input_tokens
+    input_positions = model_input.input_positions
+    attn_metadata = model_input.attn_metadata
+    return_seq_lens = model_input.seq_lens
+    slot_mapping = model_input.slot_mapping
    assert return_seq_lens == seq_lens
    assert len(slot_mapping) == len(input_tokens)

    # Verify input metadata is correct for prompts.
    device = model_runner.device
-    assert attn_metadata.is_prompt is True
+    assert attn_metadata.num_prefills > 0
+    assert attn_metadata.num_decode_tokens == 0
    assert torch.allclose(
        attn_metadata.seq_lens_tensor,
        torch.tensor(seq_lens, device=device, dtype=torch.int))
    assert attn_metadata.seq_lens == seq_lens
-    assert attn_metadata.max_seq_len == max(seq_lens)
+    assert attn_metadata.max_prefill_seq_len == max(seq_lens)
+    assert attn_metadata.max_decode_seq_len == 0

    # Test subquery start locs.
    start_idx = 0
@@ -68,11 +85,11 @@ def test_prepare_prompt(batch_size):
        start_idx += seq_len
        start_loc.append(start_idx)
    assert torch.allclose(
-        attn_metadata.subquery_start_loc,
+        attn_metadata.query_start_loc,
        torch.tensor(start_loc, dtype=torch.int32, device=device))

    # Test seq start locs. Note that for normal prefill it is
-    # equivalent to subquery_start_loc.
+    # equivalent to query_start_loc.
    start_idx = 0
    seq_start_loc = [start_idx]
    for seq_len in seq_lens:
@@ -112,7 +129,7 @@ def test_prepare_prompt(batch_size):
                            device=actual.device,
                            dtype=actual.dtype)
    torch.testing.assert_close(actual, expected)
-    assert input_tokens == input_positions
+    torch.allclose(input_tokens, input_positions)

    actual = sampling_metadata.selected_token_indices
    expected = torch.tensor(expected_selected_token_indices,
@@ -123,36 +140,28 @@ def test_prepare_prompt(batch_size):

 @pytest.mark.parametrize("batch_size", list(range(1, 257)))
 def test_prepare_decode_cuda_graph(batch_size):
-    model_config = ModelConfig(
+    model_runner = _create_model_runner(
        "facebook/opt-125m",
-        "facebook/opt-125m",
-        tokenizer_mode="auto",
-        trust_remote_code=False,
        seed=0,
        dtype="float16",
-        revision=None,
        enforce_eager=False,
+        max_num_batched_tokens=100000,
+        max_num_seqs=100000,
+        enable_chunked_prefill=False,
    )
-    scheduler_config = SchedulerConfig(100000,
-                                       100000,
-                                       100000,
-                                       enable_chunked_prefill=False)
-    model_runner = ModelRunner(model_config=model_config,
-                               parallel_config=None,
-                               scheduler_config=scheduler_config,
-                               device_config=None,
-                               load_config=None,
-                               lora_config=None)
-    model_runner.set_block_size(16)

-    seq_lens = []
+    context_lens = []
    seq_group_metadata_list = []
+    # Assume each seq group finishes prefill.
    for i in range(batch_size):
        # make sure all tokens fit into one block
-        seq_len = i % (model_runner.block_size - 1) + 1
-        seq_lens.append(seq_len)
-        seq_data = list(range(seq_len))
+        context_len = i % (model_runner.block_size - 1) + 1
+        context_lens.append(context_len)
+        seq_data = list(range(context_len))
        seq_data = SequenceData(seq_data)
+        seq_data.update_num_computed_tokens(context_len)
+        # Append one token ID since prefill is finished.
+        seq_data.append_token_id(1, 0)
        seq_group_metadata = SequenceGroupMetadata(
            request_id=f"test_{i}",
            is_prompt=False,
@@ -163,18 +172,45 @@ def test_prepare_decode_cuda_graph(batch_size):
        assert seq_group_metadata.token_chunk_size == 1
        seq_group_metadata_list.append(seq_group_metadata)

-    input_tokens, input_positions, attn_metadata, _, _, _, slot_mapping = (
-        model_runner._prepare_decode(seq_group_metadata_list))
+    model_input = model_runner._prepare_model_input(seq_group_metadata_list)
+    input_tokens, input_positions, attn_metadata, slot_mapping = (
+        model_input.input_tokens, model_input.input_positions,
+        model_input.attn_metadata, model_input.slot_mapping)
    assert len(slot_mapping) == len(input_tokens)

    expected_bs = _get_graph_batch_size(len(seq_group_metadata_list))
    # Verify input metadata is correct for prompts.
    device = model_runner.device
-    assert attn_metadata.is_prompt is False
-    assert attn_metadata.seq_lens is None
-    assert attn_metadata.subquery_start_loc is None
-    assert attn_metadata.seq_start_loc is None
-    assert attn_metadata.max_seq_len == max(seq_lens)
+    assert attn_metadata.num_prefills == 0
+    assert attn_metadata.num_prefill_tokens == 0
+    seq_lens = [context_len + 1 for context_len in context_lens]
+    # seq_lens are padded to expected_bs
+    for _ in range(expected_bs - len(seq_lens)):
+        seq_lens.append(1)
+    assert attn_metadata.seq_lens == seq_lens
+    start_idx = 0
+    start_loc = [start_idx]
+    for _ in context_lens:
+        # decode has only 1 token for query.
+        start_idx += 1
+        start_loc.append(start_idx)
+    assert torch.allclose(
+        attn_metadata.query_start_loc,
+        torch.tensor(start_loc, dtype=torch.int32, device=device))
+
+    start_idx = 0
+    seq_start_loc = [start_idx]
+    for seq_len in seq_lens:
+        start_idx += seq_len
+        seq_start_loc.append(start_idx)
+    assert torch.allclose(
+        attn_metadata.seq_start_loc,
+        torch.tensor(seq_start_loc, dtype=torch.int32, device=device))
+
+    assert torch.allclose(
+        attn_metadata.context_lens_tensor,
+        torch.tensor(context_lens, dtype=torch.int, device=device))
+    assert attn_metadata.max_decode_seq_len == max(seq_lens)
    assert torch.allclose(
        attn_metadata.seq_lens_tensor[:len(seq_lens)],
        torch.tensor(seq_lens, dtype=torch.int, device=device))
@@ -186,23 +222,23 @@ def test_prepare_decode_cuda_graph(batch_size):
    # It is padded up to
    assert attn_metadata.block_tables.shape[1] == (
        model_runner.get_max_block_per_batch())
-    # Cuda graph should not be used for prerill.
    assert attn_metadata.use_cuda_graph is True

    assert len(input_tokens) == expected_bs
    assert len(input_positions) == expected_bs
-    assert input_tokens == input_positions
+    torch.allclose(input_tokens, input_positions)

    # Verify Sampling
    expected_selected_token_indices = []
    selected_token_start_idx = 0
-    for seq_len in seq_lens:
+    for _ in context_lens:
        expected_selected_token_indices.append(selected_token_start_idx)
        selected_token_start_idx += 1
    sampling_metadata = SamplingMetadata.prepare(
        seq_group_metadata_list,
        seq_lens,
-        query_lens=seq_lens,
+        # query lens is all 1 for decode.
+        query_lens=[1 for _ in range(len(context_lens))],
        device=model_runner.device,
        pin_memory=model_runner.pin_memory)
    actual = sampling_metadata.selected_token_indices
@@ -214,33 +250,34 @@ def test_prepare_decode_cuda_graph(batch_size):

 def test_empty_seq_group():
    """Verify prepare prompt and decode returns empty output."""
-    model_config = ModelConfig(
-        "facebook/opt-125m",
+    model_runner = _create_model_runner(
        "facebook/opt-125m",
-        tokenizer_mode="auto",
-        trust_remote_code=False,
        seed=0,
        dtype="float16",
-        revision=None,
        enforce_eager=False,
    )
-    model_runner = ModelRunner(model_config=model_config,
-                               parallel_config=None,
-                               scheduler_config=None,
-                               device_config=None,
-                               load_config=None,
-                               lora_config=None)
-    model_runner.set_block_size(16)
    seq_group_metadata_list = []
-    input_tokens, input_positions, attn_metadata, _, _, _, slot_mapping = (
-        model_runner._prepare_decode(seq_group_metadata_list))
+    model_input = model_runner._prepare_model_input(seq_group_metadata_list)
+    input_tokens, input_positions, attn_metadata, slot_mapping = (
+        model_input.input_tokens,
+        model_input.input_positions,
+        model_input.attn_metadata,
+        model_input.slot_mapping,
+    )
    assert len(input_tokens) == 0
    assert len(input_positions) == 0
    assert attn_metadata is None
    assert len(slot_mapping) == 0

-    (input_tokens, input_positions, attn_metadata, return_seq_lens, _, _, _, _,
-     _, slot_mapping) = (model_runner._prepare_prompt(seq_group_metadata_list))
+    model_input = model_runner._prepare_model_input(seq_group_metadata_list)
+    (input_tokens, input_positions, attn_metadata, slot_mapping,
+     return_seq_lens) = (
+         model_input.input_tokens,
+         model_input.input_positions,
+         model_input.attn_metadata,
+         model_input.slot_mapping,
+         model_input.seq_lens,
+     )
    assert len(input_tokens) == 0
    assert len(input_positions) == 0
    assert attn_metadata is None
@@ -260,29 +297,15 @@ def distributed_init():
 @pytest.mark.parametrize("batch_size", list(range(2, 128)))
 @pytest.mark.parametrize("enforce_eager", [True, False])
 def test_hybrid_batches(batch_size, enforce_eager, distributed_init):
-
-    model_config = ModelConfig(
+    model_runner = _create_model_runner(
        "facebook/opt-125m",
-        "facebook/opt-125m",
-        tokenizer_mode="auto",
-        trust_remote_code=False,
        seed=0,
        dtype="float16",
-        revision=None,
        enforce_eager=enforce_eager,
+        max_num_batched_tokens=100000,
+        max_num_seqs=100000,
+        enable_chunked_prefill=True,
    )
-    scheduler_config = SchedulerConfig(100000,
-                                       100000,
-                                       100000,
-                                       enable_chunked_prefill=True)
-    model_runner = ModelRunner(model_config=model_config,
-                               parallel_config=None,
-                               scheduler_config=scheduler_config,
-                               device_config=None,
-                               load_config=None,
-                               lora_config=None,
-                               is_driver_worker=True)
-    model_runner.set_block_size(16)

    # Add prefill requests.
    seq_lens = []
@@ -311,9 +334,11 @@ def test_hybrid_batches(batch_size, enforce_eager, distributed_init):
    # Add decode requests
    for i in range(prefill_batch_size, batch_size):
        # make sure all tokens fit into one block
-        seq_len = i % (model_runner.block_size - 1) + 1
-        prompt_toks = list(range(seq_len))
+        context_len = i % (model_runner.block_size - 1) + 1
+        prompt_toks = list(range(context_len))
        seq_data = SequenceData(prompt_toks)
+        seq_data.append_token_id(1, 0)
+        seq_data.update_num_computed_tokens(context_len)
        seq_group_metadata = SequenceGroupMetadata(
            request_id=f"test_{i}",
            is_prompt=False,
@@ -333,25 +358,18 @@ def test_hybrid_batches(batch_size, enforce_eager, distributed_init):

    assert len(attn_metadata.slot_mapping) == len(input_tokens)
    assert len(input_positions) == len(input_tokens)
-    assert attn_metadata.kv_cache_dtype == "auto"
    assert attn_metadata.num_prefills == prefill_batch_size
-    if enforce_eager:
-        assert attn_metadata.num_decode_tokens == decode_batch_size
-    else:
-        assert attn_metadata.num_decode_tokens == _get_graph_batch_size(
-            decode_batch_size)
+    assert attn_metadata.num_decode_tokens == decode_batch_size
    assert attn_metadata.num_prefill_tokens == sum(seq_lens)

    # Verify attn metadata is consistent. We don't need to test individual
    # values here because they are tested above.
-    prefill_meta = model_runner._prepare_prompt(
-        prefill_metadata_list).attn_metadata
-    decode_meta = model_runner._prepare_decode(
-        decode_metadata_list).attn_metadata
+    attn_metadata = model_runner._prepare_model_input(
+        seq_group_metadata_list).attn_metadata

-    for attr_expected, attr_actual in zip(vars(prefill_meta),
+    for attr_expected, attr_actual in zip(vars(attn_metadata.prefill_metadata),
                                          vars(prefill_meta_actual)):
        assert attr_expected[1] == attr_actual[1]
-    for attr_expected, attr_actual in zip(vars(decode_meta),
+    for attr_expected, attr_actual in zip(vars(attn_metadata.decode_metadata),
                                          vars(decode_meta_actual)):
        assert attr_expected[1] == attr_actual[1]