Merge tag 'v0.7.3' into v0.7.3-dev

ec5e299c · zhuwenwen · 47bd229c · ed6e9075 · ec5e299c · ec5e299c
Commit ec5e299c authored Feb 21, 2025 by zhuwenwen
20 changed files
--- a/tests/quantization/test_register_quantization_config.py
+++ b/tests/quantization/test_register_quantization_config.py
@@ -99,7 +99,7 @@ def test_register_quantization_config():

 @pytest.mark.parametrize(argnames="model",
                         argvalues=[
-                             "meta-llama/Meta-Llama-3-8B-Instruct",
+                             "meta-llama/Llama-3.2-1B-Instruct",
                         ])
 def test_custom_quant(vllm_runner, model):
    """Test infer with the custom quantization method."""

--- a/tests/runai_model_streamer_test/__init__.py
+++ b/tests/runai_model_streamer_test/__init__.py
--- a/tests/runai_model_streamer/test_runai_model_streamer_loader.py
+++ b/tests/runai_model_streamer/test_runai_model_streamer_loader.py
--- a/tests/runai_model_streamer/test_weight_utils.py
+++ b/tests/runai_model_streamer/test_weight_utils.py
--- a/tests/samplers/test_ignore_eos.py
+++ b/tests/samplers/test_ignore_eos.py
@@ -12,7 +12,7 @@ from ..utils import models_path_prefix

 # We also test with llama because it has generation_config to specify EOS
 # (past regression).
-MODELS = [os.path.join(models_path_prefix, "facebook/opt-125m"), os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf")]
+MODELS = [os.path.join(models_path_prefix, "distilbert/distilgpt2"), os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B")]


 @pytest.mark.parametrize("model", MODELS)

--- a/tests/samplers/test_logits_processor.py
+++ b/tests/samplers/test_logits_processor.py
@@ -7,7 +7,7 @@ import os
 from vllm import SamplingParams
 from ..utils import models_path_prefix

-MODELS = [os.path.join(models_path_prefix, "facebook/opt-125m")]
+MODELS = [os.path.join(models_path_prefix, "distilbert/distilgpt2")]


 @pytest.mark.parametrize("model", MODELS)

--- a/tests/samplers/test_logprobs.py
+++ b/tests/samplers/test_logprobs.py
@@ -11,7 +11,7 @@ from vllm import SamplingParams
 from ..conftest import VllmRunner
 from ..utils import models_path_prefix

-MODELS = [os.path.join(models_path_prefix, "facebook/opt-125m")]
+MODELS = [os.path.join(models_path_prefix, "distilbert/distilgpt2")]


 @pytest.mark.parametrize("model", MODELS)

--- a/tests/samplers/test_no_bad_words.py
+++ b/tests/samplers/test_no_bad_words.py
@@ -77,7 +77,7 @@ class TestOneTokenBadWord:

 class TestTwoTokenBadWord:
    # Another model (with a different tokenizer behaviour)
-    MODEL = os.path.join(models_path_prefix, "openai-community/gpt2")
+    MODEL = os.path.join(models_path_prefix, "distilbert/distilgpt2")

    PROMPT = "How old are you? I am 10"
    TARGET_TOKEN1 = "years"

--- a/tests/samplers/test_ranks.py
+++ b/tests/samplers/test_ranks.py
@@ -6,7 +6,7 @@ import os
 from vllm import SamplingParams
 from ..utils import models_path_prefix

-MODELS = [os.path.join(models_path_prefix, "facebook/opt-125m")]
+MODELS = [os.path.join(models_path_prefix, "distilbert/distilgpt2")]


 @pytest.mark.parametrize("model", MODELS)

--- a/tests/spec_decode/e2e/test_compatibility.py
+++ b/tests/spec_decode/e2e/test_compatibility.py
@@ -10,7 +10,7 @@ from ...utils import models_path_prefix


 @pytest.mark.parametrize("common_llm_kwargs", [{
-    "model": os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-chat-hf"),
+    "model": os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"),
    "speculative_model": "JackFram/llama-68m",
    "num_speculative_tokens": 5,
 }])
@@ -29,8 +29,8 @@ from ...utils import models_path_prefix
        },
        {
            # Speculative max model len > target max model len should raise.
-            # https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/blob/f5db02db724555f92da89c216ac04704f23d4590/config.json#L12
-            "speculative_max_model_len": 4096 + 1,
+            # https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct/blob/9213176726f574b556790deb65791e0c5aa438b6/config.json#L18
+            "speculative_max_model_len": 131072 + 1,
        },
    ])
 @pytest.mark.parametrize("test_llm_kwargs", [{}])

--- a/tests/spec_decode/e2e/test_eagle_correctness.py
+++ b/tests/spec_decode/e2e/test_eagle_correctness.py
@@ -308,6 +308,150 @@ def test_eagle_disable_queue(vllm_runner, common_llm_kwargs,
                                  batch_size, output_len, seed)


+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
+
+        # Print spec metrics.
+        "disable_log_stats": False,
+
+        # Precision
+        "dtype": "float16",
+
+        # Main model
+        "model_name": "meta-llama/Llama-2-7b-chat-hf",
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs", [
+    {
+        "speculative_model": "yuhuili/EAGLE-llama2-chat-7B",
+        "num_speculative_tokens": MAX_SPEC_TOKENS,
+    },
+])
+@pytest.mark.parametrize(
+    "output_len",
+    [
+        # Use smaller output len for fast test.
+        32,
+    ])
+@pytest.mark.parametrize("batch_size", [1, 5])
+@pytest.mark.parametrize("seed", [1])
+def test_llama2_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
+                                             per_test_common_llm_kwargs,
+                                             baseline_llm_kwargs,
+                                             test_llm_kwargs, batch_size: int,
+                                             output_len: int, seed: int):
+
+    run_equality_correctness_test(vllm_runner,
+                                  common_llm_kwargs,
+                                  per_test_common_llm_kwargs,
+                                  baseline_llm_kwargs,
+                                  test_llm_kwargs,
+                                  batch_size,
+                                  output_len,
+                                  seed,
+                                  temperature=0.0)
+
+
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
+
+        # Print spec metrics.
+        "disable_log_stats": False,
+
+        # Precision
+        "dtype": "float16",
+
+        # Main model
+        "model_name": "meta-llama/Meta-Llama-3-8B-Instruct",
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs", [
+    {
+        "speculative_model": "yuhuili/EAGLE-LLaMA3-Instruct-8B",
+        "num_speculative_tokens": MAX_SPEC_TOKENS,
+    },
+])
+@pytest.mark.parametrize(
+    "output_len",
+    [
+        # Use smaller output len for fast test.
+        32,
+    ])
+@pytest.mark.parametrize("batch_size", [1, 5])
+@pytest.mark.parametrize("seed", [1])
+def test_llama3_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
+                                             per_test_common_llm_kwargs,
+                                             baseline_llm_kwargs,
+                                             test_llm_kwargs, batch_size: int,
+                                             output_len: int, seed: int):
+
+    run_equality_correctness_test(vllm_runner,
+                                  common_llm_kwargs,
+                                  per_test_common_llm_kwargs,
+                                  baseline_llm_kwargs,
+                                  test_llm_kwargs,
+                                  batch_size,
+                                  output_len,
+                                  seed,
+                                  temperature=0.0)
+
+
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
+
+        # Print spec metrics.
+        "disable_log_stats": False,
+
+        # Precision
+        "dtype": "float16",
+
+        # Main model
+        "model_name": "Qwen/Qwen2-7B-Instruct",
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs", [
+    {
+        "speculative_model": "yuhuili/EAGLE-Qwen2-7B-Instruct",
+        "num_speculative_tokens": MAX_SPEC_TOKENS,
+    },
+])
+@pytest.mark.parametrize(
+    "output_len",
+    [
+        # Use smaller output len for fast test.
+        32,
+    ])
+@pytest.mark.parametrize("batch_size", [1, 5])
+@pytest.mark.parametrize("seed", [1])
+def test_qwen2_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
+                                            per_test_common_llm_kwargs,
+                                            baseline_llm_kwargs,
+                                            test_llm_kwargs, batch_size: int,
+                                            output_len: int, seed: int):
+
+    run_equality_correctness_test(vllm_runner,
+                                  common_llm_kwargs,
+                                  per_test_common_llm_kwargs,
+                                  baseline_llm_kwargs,
+                                  test_llm_kwargs,
+                                  batch_size,
+                                  output_len,
+                                  seed,
+                                  temperature=0.0)
+
+
 if __name__ == "__main__":
    import pytest
    pytest.main([__file__])
--- a/tests/spec_decode/e2e/test_mtp_correctness.py
+++ b/tests/spec_decode/e2e/test_mtp_correctness.py
+# SPDX-License-Identifier: Apache-2.0
+"""This docstring details important information on the testing methodology.
+
+Most of the tests rely on "greedy equality", where we expect the output of
+speculative decoding on a sequence to exactly match the output of normal non-
+speculative decoding.
+
+Since speculative decoding with rejection sampling guarantees that the output
+distribution matches the target model's output distribution (up to hardware
+numerics, see https://arxiv.org/pdf/2302.01318.pdf), we can expect greedy
+equality.
+
+However, we still need to verify below scenario could be passed:
+    * Batch size 1 greedy equality
+    * Batch size >1 greedy equality
+    * Test greedy equality under preemption
+    * Test greedy equality under various number of speculative tokens.
+
+With those tests, we can say at least, mtp would not break the
+correctess for the target model outputs.
+"""
+
+import pytest
+
+from .conftest import run_equality_correctness_test
+
+# main model
+MAIN_MODEL = "luccafong/deepseek_mtp_main_random"
+
+# max. number of speculative tokens: this corresponds to
+# num_nextn_predict_layers in the config.json of the speculator model.
+MAX_SPEC_TOKENS = 1
+
+# precision
+PRECISION = "bfloat16"
+
+
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
+
+        # Print spec metrics.
+        "disable_log_stats": False,
+
+        # Precision
+        "dtype": PRECISION,
+
+        # Main model
+        "model_name": MAIN_MODEL,
+
+        # GPU memory utilization
+        "gpu_memory_utilization": 0.85
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs", [
+    {
+        "num_speculative_tokens": MAX_SPEC_TOKENS,
+    },
+])
+@pytest.mark.parametrize("output_len", [
+    128,
+])
+@pytest.mark.parametrize("batch_size", [1, 32])
+@pytest.mark.parametrize("seed", [1])
+def test_mtp_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
+                                    per_test_common_llm_kwargs,
+                                    baseline_llm_kwargs, test_llm_kwargs,
+                                    batch_size: int, output_len: int,
+                                    seed: int):
+
+    run_equality_correctness_test(vllm_runner, common_llm_kwargs,
+                                  per_test_common_llm_kwargs,
+                                  baseline_llm_kwargs, test_llm_kwargs,
+                                  batch_size, output_len, seed)
+
+
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
+
+        # Print spec metrics.
+        "disable_log_stats": False,
+
+        # Precision
+        "dtype": PRECISION,
+
+        # Main model
+        "model_name": MAIN_MODEL,
+
+        # GPU memory utilization
+        "gpu_memory_utilization": 0.85
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs", [
+    {
+        "num_speculative_tokens": MAX_SPEC_TOKENS,
+        "disable_logprobs_during_spec_decoding": False,
+    },
+    {
+        "num_speculative_tokens": MAX_SPEC_TOKENS,
+        "disable_logprobs_during_spec_decoding": True,
+    },
+])
+@pytest.mark.parametrize("output_len", [
+    128,
+])
+@pytest.mark.parametrize("batch_size", [8])
+@pytest.mark.parametrize("seed", [1])
+@pytest.mark.parametrize("logprobs", [1, 6])
+def test_mtp_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
+                                 per_test_common_llm_kwargs,
+                                 baseline_llm_kwargs, test_llm_kwargs,
+                                 batch_size: int, output_len: int, seed: int,
+                                 logprobs: int):
+
+    run_equality_correctness_test(vllm_runner,
+                                  common_llm_kwargs,
+                                  per_test_common_llm_kwargs,
+                                  baseline_llm_kwargs,
+                                  test_llm_kwargs,
+                                  batch_size,
+                                  output_len,
+                                  seed,
+                                  logprobs=logprobs,
+                                  prompt_logprobs=logprobs,
+                                  disable_logprobs=test_llm_kwargs[
+                                      'disable_logprobs_during_spec_decoding'])
+
+
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        "enforce_eager": False,
+
+        # Print spec metrics.
+        "disable_log_stats": False,
+
+        # Precision
+        "dtype": PRECISION,
+
+        # Main model
+        "model_name": MAIN_MODEL,
+        "gpu_memory_utilization": 0.85
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs", [
+    {
+        "num_speculative_tokens": MAX_SPEC_TOKENS,
+    },
+])
+@pytest.mark.parametrize("output_len", [
+    128,
+])
+@pytest.mark.parametrize("batch_size", [1, 32])
+@pytest.mark.parametrize("seed", [1])
+def test_mtp_e2e_greedy_correctness_cuda_graph(vllm_runner, common_llm_kwargs,
+                                               per_test_common_llm_kwargs,
+                                               baseline_llm_kwargs,
+                                               test_llm_kwargs,
+                                               batch_size: int,
+                                               output_len: int, seed: int):
+    """Verify greedy equality with cuda graph enabled and different
+    batch sizes."""
+    run_equality_correctness_test(vllm_runner, common_llm_kwargs,
+                                  per_test_common_llm_kwargs,
+                                  baseline_llm_kwargs, test_llm_kwargs,
+                                  batch_size, output_len, seed)
+
+
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        "block_size": 8,
+        # 2 for small prompt, 256//8 for generated.
+        "num_gpu_blocks_override": 2 + 256 // 8,
+        "max_model_len": (2 + 256 // 8) * 8,
+
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
+
+        # Precision
+        "dtype": PRECISION,
+
+        # Main model
+        "model_name": MAIN_MODEL,
+
+        # GPU memory utilization
+        "gpu_memory_utilization": 0.9
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs", [
+    {
+        "num_speculative_tokens": MAX_SPEC_TOKENS,
+    },
+])
+@pytest.mark.parametrize(
+    "output_len",
+    [
+        # Use small output len for fast test.
+        128,
+    ])
+@pytest.mark.parametrize("batch_size", [4])
+@pytest.mark.parametrize("seed", [1])
+def test_mtp_e2e_greedy_correctness_with_preemption(
+        vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
+        baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
+        seed: int):
+    """Verify greedy equality, even when some sequences are preempted mid-
+    generation.
+    """
+    run_equality_correctness_test(vllm_runner, common_llm_kwargs,
+                                  per_test_common_llm_kwargs,
+                                  baseline_llm_kwargs, test_llm_kwargs,
+                                  batch_size, output_len, seed)
+
+
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
+
+        # Precision
+        "dtype": PRECISION,
+
+        # Main model
+        "model_name": MAIN_MODEL,
+
+        # GPU memory utilization
+        "gpu_memory_utilization": 0.9
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize(
+    "test_llm_kwargs",
+    [
+        {
+            "num_speculative_tokens": k,
+        }
+        # Try a range of num. speculative tokens
+        for k in range(1, 1 + MAX_SPEC_TOKENS)
+    ])
+@pytest.mark.parametrize("batch_size", [2])
+@pytest.mark.parametrize(
+    "output_len",
+    [
+        # Use smaller output len for fast test.
+        32,
+    ])
+@pytest.mark.parametrize("seed", [1])
+def test_mtp_different_k(vllm_runner, common_llm_kwargs,
+                         per_test_common_llm_kwargs, baseline_llm_kwargs,
+                         test_llm_kwargs, batch_size: int, output_len: int,
+                         seed: int):
+    """Verify that mtp speculative decoding produces exact equality
+    to without spec decode with different values of num_speculative_tokens.
+    """
+    run_equality_correctness_test(vllm_runner, common_llm_kwargs,
+                                  per_test_common_llm_kwargs,
+                                  baseline_llm_kwargs, test_llm_kwargs,
+                                  batch_size, output_len, seed)
+
+
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
+
+        # Precision
+        "dtype": PRECISION,
+
+        # Main model
+        "model_name": MAIN_MODEL,
+
+        # GPU memory utilization
+        "gpu_memory_utilization": 0.9
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs",
+                         [{
+                             "num_speculative_tokens": MAX_SPEC_TOKENS,
+                             "speculative_disable_by_batch_size": 4
+                         }])
+@pytest.mark.parametrize("batch_size", [1, 5])
+@pytest.mark.parametrize(
+    "output_len",
+    [
+        # Use smaller output len for fast test.
+        32,
+    ])
+@pytest.mark.parametrize("seed", [1])
+def test_mtp_disable_queue(vllm_runner, common_llm_kwargs,
+                           per_test_common_llm_kwargs, baseline_llm_kwargs,
+                           test_llm_kwargs, batch_size: int, output_len: int,
+                           seed: int):
+    """Verify that mtp speculative decoding produces exact equality
+    to without spec decode when speculation is disabled for large
+    batch sizes.
+    """
+    run_equality_correctness_test(vllm_runner, common_llm_kwargs,
+                                  per_test_common_llm_kwargs,
+                                  baseline_llm_kwargs, test_llm_kwargs,
+                                  batch_size, output_len, seed)
+
+
+if __name__ == "__main__":
+    import pytest
+    pytest.main([__file__])
--- a/tests/spec_decode/test_spec_decode_worker.py
+++ b/tests/spec_decode/test_spec_decode_worker.py
@@ -13,15 +13,18 @@ from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.model_executor.utils import set_random_seed
 from vllm.sequence import ExecuteModelRequest, SequenceOutput
 from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer
+from vllm.spec_decode.draft_model_runner import TP1DraftModelRunner
 from vllm.spec_decode.interfaces import SpeculativeProposals
 from vllm.spec_decode.metrics import (AsyncMetricsCollector,
                                      SpecDecodeWorkerMetrics)
 from vllm.spec_decode.multi_step_worker import MultiStepWorker
 from vllm.spec_decode.spec_decode_worker import (SpecDecodeWorker,
                                                 split_num_cache_blocks_evenly)
+from vllm.worker.worker import Worker

 from .test_utils import mock_spec_decode_sampler
-from .utils import create_batch, create_sampler_output_list, mock_worker
+from .utils import (create_batch, create_sampler_output_list, create_worker,
+                    mock_worker)


 @pytest.mark.parametrize('k', [1, 2, 6])
@@ -905,3 +908,38 @@ def test_chunked_prefill_flow(k: int, batch_size: int, batch_composition: str):
            worker.execute_model(execute_model_req=execute_model_req)
        # but first draft still counted
        assert draft_worker.get_spec_proposals.call_count == 1
+
+
+def test_correctly_load_weight_for_eagle():
+    """
+        Verify SpecDecodeWorker loads lm_head weight for eagle correctly.
+    """
+    seed = 100
+    block_size = 32
+    num_gpu_blocks = 8096 // block_size
+    target_worker = create_worker(
+        Worker,
+        "JackFram/llama-68m",
+        block_size,
+        num_gpu_blocks,
+        seed,
+    )
+    draft_worker = create_worker(
+        MultiStepWorker,
+        "abhigoyal/vllm-eagle-llama-68m-random",
+        block_size,
+        num_gpu_blocks,
+        seed,
+        model_runner_cls=TP1DraftModelRunner,
+    )
+
+    spec_decode_sampler = mock_spec_decode_sampler("rejection_sampler")
+    worker = SpecDecodeWorker(draft_worker,
+                              target_worker,
+                              spec_decode_sampler,
+                              disable_logprobs=False)
+    worker.proposer_worker.maybe_load_lm_head_weight(
+        target_worker.model_runner.model.lm_head.weight.data)
+    assert torch.allclose(
+        worker.proposer_worker.worker.model_runner.model.lm_head.weight.data,
+        worker.scorer_worker.model_runner.model.lm_head.weight.data)
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -10,15 +10,22 @@ from vllm.model_executor.layers.pooler import PoolingType
 from vllm.platforms import current_platform
 from utils import models_path_prefix

+from .conftest import MODEL_WEIGHTS_S3_BUCKET
+

 @pytest.mark.parametrize(
    ("model_id", "expected_runner_type", "expected_task"),
    [
-        (os.path.join(models_path_prefix, "facebook/opt-125m"), "generate", "generate"),
-        (os.path.join(models_path_prefix, "intfloat/e5-mistral-7b-instruct"), "pooling", "embed"),
-        (os.path.join(models_path_prefix, "jason9693/Qwen2.5-1.5B-apeach"), "pooling", "classify"),
-        (os.path.join(models_path_prefix, "cross-encoder/ms-marco-MiniLM-L-6-v2"), "pooling", "score"),
+        (f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2", "generate",
+         "generate"),
+        (f"{MODEL_WEIGHTS_S3_BUCKET}/intfloat/e5-mistral-7b-instruct",
+         "pooling", "embed"),
+        (f"{MODEL_WEIGHTS_S3_BUCKET}/jason9693/Qwen2.5-1.5B-apeach", "pooling",
+         "classify"),
+        (f"{MODEL_WEIGHTS_S3_BUCKET}/cross-encoder/ms-marco-MiniLM-L-6-v2",
+         "pooling", "score"),
        (os.path.join(models_path_prefix, "Qwen/Qwen2.5-Math-RM-72B"), "pooling", "reward"),
+        (os.path.join(models_path_prefix, "openai/whisper-small"), "transcription", "transcription"),
    ],
 )
 def test_auto_task(model_id, expected_runner_type, expected_task):
@@ -252,7 +259,7 @@ def test_rope_customization():
 @pytest.mark.parametrize(("model_id", "is_encoder_decoder"), [
    ("facebook/opt-125m", False),
    ("facebook/bart-base", True),
-    ("meta-llama/Llama-3.2-1B", False),
+    ("meta-llama/Llama-3.2-1B-Instruct", False),
    ("meta-llama/Llama-3.2-11B-Vision", True),
 ])
 def test_is_encoder_decoder(model_id, is_encoder_decoder):

--- a/tests/test_regression.py
+++ b/tests/test_regression.py
@@ -12,6 +12,9 @@ import torch
 from vllm import LLM, SamplingParams
 from utils import models_path_prefix
 import os
+from vllm.config import LoadFormat
+
+from .conftest import MODEL_WEIGHTS_S3_BUCKET


 def test_duplicated_ignored_sequence_group():
@@ -20,7 +23,8 @@ def test_duplicated_ignored_sequence_group():
    sampling_params = SamplingParams(temperature=0.01,
                                     top_p=0.1,
                                     max_tokens=256)
-    llm = LLM(model=os.path.join(models_path_prefix, "facebook/opt-125m"),
+    llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2",
+              load_format=LoadFormat.RUNAI_STREAMER,
              max_num_batched_tokens=4096,
              tensor_parallel_size=1)
    prompts = ["This is a short prompt", "This is a very long prompt " * 1000]
@@ -33,7 +37,8 @@ def test_max_tokens_none():
    sampling_params = SamplingParams(temperature=0.01,
                                     top_p=0.1,
                                     max_tokens=None)
-    llm = LLM(model=os.path.join(models_path_prefix, "facebook/opt-125m"),
+    llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2",
+              load_format=LoadFormat.RUNAI_STREAMER,
              max_num_batched_tokens=4096,
              tensor_parallel_size=1)
    prompts = ["Just say hello!"]
@@ -43,7 +48,9 @@ def test_max_tokens_none():


 def test_gc():
-    llm = LLM(os.path.join(models_path_prefix, "facebook/opt-125m"), enforce_eager=True)
+    llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2",
+              load_format=LoadFormat.RUNAI_STREAMER,
+              enforce_eager=True)
    del llm

    gc.collect()

--- a/tests/test_scalartype.py
+++ b/tests/test_scalartype.py
@@ -11,6 +11,7 @@ from vllm.scalar_type import scalar_types
    (0, 15, scalar_types.uint4),
    (-8, 7, scalar_types.uint4b8),
    (-128, 127, scalar_types.uint8b128),
+    (-6., 6., scalar_types.float4_e2m1fn),
    (-28., 28., scalar_types.float6_e3m2f),
    (torch.int8, scalar_types.int8),
    (torch.uint8, scalar_types.uint8),

--- a/tests/test_seed_behavior.py
+++ b/tests/test_seed_behavior.py
+# SPDX-License-Identifier: Apache-2.0
+import random
+
+import numpy as np
+import torch
+
+from vllm.platforms.interface import Platform
+
+
+def test_seed_behavior():
+    # Test with a specific seed
+    Platform.seed_everything(42)
+    random_value_1 = random.randint(0, 100)
+    np_random_value_1 = np.random.randint(0, 100)
+    torch_random_value_1 = torch.randint(0, 100, (1, )).item()
+
+    Platform.seed_everything(42)
+    random_value_2 = random.randint(0, 100)
+    np_random_value_2 = np.random.randint(0, 100)
+    torch_random_value_2 = torch.randint(0, 100, (1, )).item()
+
+    assert random_value_1 == random_value_2
+    assert np_random_value_1 == np_random_value_2
+    assert torch_random_value_1 == torch_random_value_2
--- a/tests/test_sharded_state_loader.py
+++ b/tests/test_sharded_state_loader.py
@@ -47,12 +47,12 @@ def test_filter_subtensors():


 @pytest.fixture(scope="module")
-def llama_2_7b_files():
+def llama_3p2_1b_files():
    with TemporaryDirectory() as cache_dir:
-        # input_dir = snapshot_download("meta-llama/Llama-3.2-1B",
+        # input_dir = snapshot_download("meta-llama/Llama-3.2-1B-Instruct",
        #                               cache_dir=cache_dir,
        #                               ignore_patterns=["*.bin*", "original/*"])
-        input_dir = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B")
+        input_dir = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct")
        yield input_dir


@@ -82,13 +82,13 @@ def _run_generate(input_dir, queue: mp.Queue, **kwargs):
 @pytest.mark.parametrize("enable_lora", [False, True])
 @pytest.mark.parametrize("tp_size", [1, 2])
 def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available,
-                              llama_2_7b_files):
+                              llama_3p2_1b_files):
    if num_gpus_available < tp_size:
        pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")

    weights_patterns = ("*.safetensors", )
    gpu_memory_utilization = 0.8
-    input_dir = llama_2_7b_files
+    input_dir = llama_3p2_1b_files
    ctx = mp.get_context("spawn")

    # Run in separate processes for memory & CUDA isolation

--- a/tests/tokenization/test_detokenize.py
+++ b/tests/tokenization/test_detokenize.py
@@ -34,7 +34,7 @@ TOKENIZERS = [
    os.path.join(models_path_prefix, "bigscience/bloom-560m"),
    os.path.join(models_path_prefix, "mosaicml/mpt-7b"),
    os.path.join(models_path_prefix, "tiiuae/falcon-7b"),
-    os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf"),
+    os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"),
    os.path.join(models_path_prefix, "codellama/CodeLlama-7b-hf"),
    os.path.join(models_path_prefix, "mistralai/Pixtral-12B-2409"),
 ]

--- a/tests/tokenization/test_get_eos.py
+++ b/tests/tokenization/test_get_eos.py
@@ -23,7 +23,7 @@ def test_get_llama3_eos_token():
    generation_config = try_get_generation_config(model_name,
                                                  trust_remote_code=False)
    assert generation_config is not None
-    assert generation_config.eos_token_id == [128001, 128009]
+    assert generation_config.eos_token_id == [128001, 128008, 128009]


 def test_get_blip2_eos_token():