Commit 40542023 authored by zhuwenwen

merge v0.3.2

parents 5e5b497d 8fbd84bf
......@@ -11,8 +11,16 @@ steps:
- label: AsyncEngine Test
command: pytest -v -s async_engine
- label: Distributed Test
command: pytest -v -s test_comm_ops.py
- label: Basic Correctness Test
command: pytest -v -s --forked basic_correctness
- label: Distributed Comm Ops Test
command: pytest -v -s --forked test_comm_ops.py
working_dir: "/vllm-workspace/tests/distributed"
num_gpus: 2 # only support 1 or 2 for now.
- label: Distributed Correctness Test
command: pytest -v -s --forked test_basic_distributed_correctness.py
working_dir: "/vllm-workspace/tests/distributed"
num_gpus: 2 # only support 1 or 2 for now.
......
......@@ -60,6 +60,7 @@ vLLM seamlessly supports many Hugging Face models, including the following archi
- ChatGLM (`THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, etc.)
- DeciLM (`Deci/DeciLM-7B`, `Deci/DeciLM-7B-instruct`, etc.)
- Falcon (`tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc.)
- Gemma (`google/gemma-2b`, `google/gemma-7b`, etc.)
- GPT-2 (`gpt2`, `gpt2-xl`, etc.)
- GPT BigCode (`bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, etc.)
- GPT-J (`EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc.)
......@@ -70,6 +71,7 @@ vLLM seamlessly supports many Hugging Face models, including the following archi
- Mistral (`mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc.)
- Mixtral (`mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, etc.)
- MPT (`mosaicml/mpt-7b`, `mosaicml/mpt-30b`, etc.)
- OLMo (`allenai/OLMo-1B`, `allenai/OLMo-7B`, etc.)
- OPT (`facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc.)
- Phi (`microsoft/phi-1_5`, `microsoft/phi-2`, etc.)
- Qwen (`Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc.)
......
......@@ -49,4 +49,43 @@ the third parameter is the path to the LoRA adapter.
Check out `examples/multilora_inference.py <https://github.com/vllm-project/vllm/blob/main/examples/multilora_inference.py>`_
for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options.
\ No newline at end of file
for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options.
Serving LoRA Adapters
---------------------
LoRA-adapted models can also be served with the OpenAI-compatible vLLM server. To do so, we use
``--lora-modules {name}={path} {name}={path}`` to specify each LoRA module when we kick off the server:
.. code-block:: bash
python -m vllm.entrypoints.openai.api_server \
--model meta-llama/Llama-2-7b-hf \
--enable-lora \
--lora-modules sql-lora=~/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/
The server entrypoint accepts all other LoRA configuration parameters (``max_loras``, ``max_lora_rank``, ``max_cpu_loras``,
etc.), which will apply to all forthcoming requests. Upon querying the ``/models`` endpoint, we should see our LoRA along
with its base model:
.. code-block:: bash
curl localhost:8000/v1/models | jq .
{
"object": "list",
"data": [
{
"id": "meta-llama/Llama-2-7b-hf",
"object": "model",
...
},
{
"id": "sql-lora",
"object": "model",
...
}
]
}
Requests can specify the LoRA adapter as if it were any other model via the ``model`` request parameter. The requests will be
processed according to the server-wide LoRA configuration (i.e. in parallel with base model requests, and potentially other
LoRA adapter requests if they were provided and ``max_loras`` is set high enough).
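For instance, a completion request can target the adapter simply by naming it in the ``model`` field. The sketch below is illustrative only: it uses the official ``openai`` Python client (the same client used by the server tests in this commit) and assumes the OpenAI-compatible server from the example above is listening on ``localhost:8000``; the prompt, token limit, and placeholder API key are not from the original docs.
.. code-block:: python
    from openai import OpenAI
    # Point the official OpenAI client at the locally running vLLM server.
    # The API key is a placeholder; the client requires one even if the
    # server does not check it.
    client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
    # "sql-lora" is the adapter name registered via --lora-modules above;
    # passing the base model name instead would skip the adapter.
    completion = client.completions.create(
        model="sql-lora",
        prompt="Write a SQL query that lists all users.",  # illustrative prompt
        max_tokens=32,
        temperature=0.0,
    )
    print(completion.choices[0].text)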
......@@ -32,6 +32,9 @@ Alongside each architecture, we include some popular models that use it.
* - :code:`FalconForCausalLM`
- Falcon
- :code:`tiiuae/falcon-7b`, :code:`tiiuae/falcon-40b`, :code:`tiiuae/falcon-rw-7b`, etc.
* - :code:`GemmaForCausalLM`
- Gemma
- :code:`google/gemma-2b`, :code:`google/gemma-7b`, etc.
* - :code:`GPT2LMHeadModel`
- GPT-2
- :code:`gpt2`, :code:`gpt2-xl`, etc.
......@@ -62,6 +65,9 @@ Alongside each architecture, we include some popular models that use it.
* - :code:`MPTForCausalLM`
- MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter
- :code:`mosaicml/mpt-7b`, :code:`mosaicml/mpt-7b-storywriter`, :code:`mosaicml/mpt-30b`, etc.
* - :code:`OLMoForCausalLM`
- OLMo
- :code:`allenai/OLMo-1B`, :code:`allenai/OLMo-7B`, etc.
* - :code:`OPTForCausalLM`
- OPT, OPT-IML
- :code:`facebook/opt-66b`, :code:`facebook/opt-iml-max-30b`, etc.
......
......@@ -12,7 +12,9 @@ from vllm import EngineArgs, LLMEngine, SamplingParams, RequestOutput
from vllm.lora.request import LoRARequest
def create_test_prompts(lora_path: str) -> List[Tuple[str, SamplingParams]]:
def create_test_prompts(
lora_path: str
) -> List[Tuple[str, SamplingParams, Optional[LoRARequest]]]:
"""Create a list of test prompts with their sampling parameters.
2 requests for base model, 4 requests for the LoRA. We define 2
......
......@@ -6,7 +6,7 @@ ray >= 2.9
sentencepiece # Required for LLaMA tokenizer.
numpy
tokenizers>=0.15.0
transformers >= 4.37.0 # Required for Mixtral.
transformers >= 4.38.0 # Required for Gemma.
fastapi
uvicorn[standard]
pydantic >= 2.0 # Required for OpenAI server.
......
......@@ -4,7 +4,7 @@ ray >= 2.9
sentencepiece # Required for LLaMA tokenizer.
numpy
torch == 2.1.2
transformers >= 4.37.0 # Required for Qwen2
transformers >= 4.38.0 # Required for Gemma.
xformers == 0.0.23.post1 # Required for CUDA 12.1.
fastapi
uvicorn[standard]
......
......@@ -28,7 +28,7 @@ MAIN_CUDA_VERSION = "12.1"
# Supported NVIDIA GPU architectures.
NVIDIA_SUPPORTED_ARCHS = {"7.0", "7.5", "8.0", "8.6", "8.9", "9.0"}
ROCM_SUPPORTED_ARCHS = {"gfx90a", "gfx942", "gfx926","gfx1100"}
ROCM_SUPPORTED_ARCHS = {"gfx908", "gfx90a", "gfx926", "gfx942", "gfx1100"}
# SUPPORTED_ARCHS = NVIDIA_SUPPORTED_ARCHS.union(ROCM_SUPPORTED_ARCHS)
......@@ -430,8 +430,8 @@ def get_version_add(sha: Optional[str] = None) -> str:
version += ".torch" + torch.__version__[:3]
with open(add_version_path, encoding="utf-8",mode="w") as file:
file.write("__version__='0.3.1'\n")
file.write("__dcu_version__='0.3.1+{}'\n".format(version))
file.write("__version__='0.3.2'\n")
file.write("__dcu_version__='0.3.2+{}'\n".format(version))
file.close()
......
......@@ -64,7 +64,7 @@ def test_request_tracker():
stream_5 = tracker.add_request("5")
assert tracker.new_requests_event.flag
tracker.process_request_output(
RequestOutput("2", "output", [], [], [], finished=True))
RequestOutput("2", "output", [], [], [], bool(finished)))
new, finished = tracker.get_new_and_finished_requests()
assert not tracker.new_requests_event.flag
assert len(finished) == 1
......
"""Compare the short outputs of HF and vLLM when using greedy sampling.
Run `pytest tests/basic_correctness/test_basic_correctness.py --forked`.
"""
import pytest
MODELS = [
"facebook/opt-125m",
"meta-llama/Llama-2-7b-hf",
]
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [5])
def test_models(
hf_runner,
vllm_runner,
example_prompts,
model: str,
dtype: str,
max_tokens: int,
) -> None:
hf_model = hf_runner(model, dtype=dtype)
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
del hf_model
vllm_model = vllm_runner(model, dtype=dtype)
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
del vllm_model
for i in range(len(example_prompts)):
hf_output_ids, hf_output_str = hf_outputs[i]
vllm_output_ids, vllm_output_str = vllm_outputs[i]
assert hf_output_str == vllm_output_str, (
f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
assert hf_output_ids == vllm_output_ids, (
f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
......@@ -13,12 +13,10 @@ _TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")]
_LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")]
def _read_prompts(filename: str) -> str:
prompts = []
def _read_prompts(filename: str) -> List[str]:
with open(filename, "r") as f:
prompt = f.readline()
prompts.append(prompt)
return prompts
prompts = f.readlines()
return prompts
@pytest.fixture
......@@ -165,6 +163,8 @@ class VllmRunner:
model_name: str,
tokenizer_name: Optional[str] = None,
dtype: str = "half",
disable_log_stats: bool = True,
tensor_parallel_size: int = 1,
) -> None:
self.model = LLM(
model=model_name,
......@@ -172,6 +172,8 @@ class VllmRunner:
trust_remote_code=True,
dtype=dtype,
swap_space=0,
disable_log_stats=disable_log_stats,
tensor_parallel_size=tensor_parallel_size,
)
def generate(
......
"""Compare the outputs of HF and distributed vLLM when using greedy sampling.
Run `pytest tests/distributed/test_basic_distributed_correctness.py --forked`.
"""
import pytest
import torch
MODELS = [
"facebook/opt-125m",
"meta-llama/Llama-2-7b-hf",
]
@pytest.mark.skipif(torch.cuda.device_count() < 2,
reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [5])
def test_models(
hf_runner,
vllm_runner,
example_prompts,
model: str,
dtype: str,
max_tokens: int,
) -> None:
hf_model = hf_runner(model, dtype=dtype)
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
del hf_model
vllm_model = vllm_runner(model, dtype=dtype, tensor_parallel_size=2)
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
del vllm_model
for i in range(len(example_prompts)):
hf_output_ids, hf_output_str = hf_outputs[i]
vllm_output_ids, vllm_output_str = vllm_outputs[i]
assert hf_output_str == vllm_output_str, (
f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
assert hf_output_ids == vllm_output_ids, (
f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
......@@ -7,9 +7,11 @@ import pytest
import requests
import ray # using Ray for overall ease of process management, parallel requests, and debugging.
import openai # use the official client for correctness check
from huggingface_hub import snapshot_download # downloading lora to test lora requests
MAX_SERVER_START_WAIT_S = 600 # wait for server to start for 600 seconds
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" # any model with a chat template should work here
LORA_NAME = "typeof/zephyr-7b-beta-lora" # technically this needs Mistral-7B-v0.1 as base, but we're not testing generation quality here
pytestmark = pytest.mark.asyncio
......@@ -54,7 +56,12 @@ class ServerRunner:
@pytest.fixture(scope="session")
def server():
def zephyr_lora_files():
return snapshot_download(repo_id=LORA_NAME)
@pytest.fixture(scope="session")
def server(zephyr_lora_files):
ray.init()
server_runner = ServerRunner.remote([
"--model",
......@@ -64,6 +71,17 @@ def server():
"--max-model-len",
"8192",
"--enforce-eager",
# lora config below
"--enable-lora",
"--lora-modules",
f"zephyr-lora={zephyr_lora_files}",
f"zephyr-lora2={zephyr_lora_files}",
"--max-lora-rank",
"64",
"--max-cpu-loras",
"2",
"--max-num-seqs",
"128"
])
ray.get(server_runner.ready.remote())
yield server_runner
......@@ -79,8 +97,25 @@ def client():
yield client
async def test_single_completion(server, client: openai.AsyncOpenAI):
completion = await client.completions.create(model=MODEL_NAME,
async def test_check_models(server, client: openai.AsyncOpenAI):
models = await client.models.list()
models = models.data
served_model = models[0]
lora_models = models[1:]
assert served_model.id == MODEL_NAME
assert all(model.root == MODEL_NAME for model in models)
assert lora_models[0].id == "zephyr-lora"
assert lora_models[1].id == "zephyr-lora2"
@pytest.mark.parametrize(
# first test base model, then test loras
"model_name",
[MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
)
async def test_single_completion(server, client: openai.AsyncOpenAI,
model_name: str):
completion = await client.completions.create(model=model_name,
prompt="Hello, my name is",
max_tokens=5,
temperature=0.0)
......@@ -104,7 +139,13 @@ async def test_single_completion(server, client: openai.AsyncOpenAI):
completion.choices[0].text) >= 5
async def test_single_chat_session(server, client: openai.AsyncOpenAI):
@pytest.mark.parametrize(
# just test 1 lora hereafter
"model_name",
[MODEL_NAME, "zephyr-lora"],
)
async def test_single_chat_session(server, client: openai.AsyncOpenAI,
model_name: str):
messages = [{
"role": "system",
"content": "you are a helpful assistant"
......@@ -115,7 +156,7 @@ async def test_single_chat_session(server, client: openai.AsyncOpenAI):
# test single completion
chat_completion = await client.chat.completions.create(
model=MODEL_NAME,
model=model_name,
messages=messages,
max_tokens=10,
)
......@@ -139,11 +180,17 @@ async def test_single_chat_session(server, client: openai.AsyncOpenAI):
assert message.content is not None and len(message.content) >= 0
async def test_completion_streaming(server, client: openai.AsyncOpenAI):
@pytest.mark.parametrize(
# just test 1 lora hereafter
"model_name",
[MODEL_NAME, "zephyr-lora"],
)
async def test_completion_streaming(server, client: openai.AsyncOpenAI,
model_name: str):
prompt = "What is an LLM?"
single_completion = await client.completions.create(
model=MODEL_NAME,
model=model_name,
prompt=prompt,
max_tokens=5,
temperature=0.0,
......@@ -152,7 +199,7 @@ async def test_completion_streaming(server, client: openai.AsyncOpenAI):
single_usage = single_completion.usage
stream = await client.completions.create(
model=MODEL_NAME,
model=model_name,
prompt=prompt,
max_tokens=5,
temperature=0.0,
......@@ -166,7 +213,13 @@ async def test_completion_streaming(server, client: openai.AsyncOpenAI):
assert "".join(chunks) == single_output
async def test_chat_streaming(server, client: openai.AsyncOpenAI):
@pytest.mark.parametrize(
# just test 1 lora hereafter
"model_name",
[MODEL_NAME, "zephyr-lora"],
)
async def test_chat_streaming(server, client: openai.AsyncOpenAI,
model_name: str):
messages = [{
"role": "system",
"content": "you are a helpful assistant"
......@@ -177,7 +230,7 @@ async def test_chat_streaming(server, client: openai.AsyncOpenAI):
# test single completion
chat_completion = await client.chat.completions.create(
model=MODEL_NAME,
model=model_name,
messages=messages,
max_tokens=10,
temperature=0.0,
......@@ -187,7 +240,7 @@ async def test_chat_streaming(server, client: openai.AsyncOpenAI):
# test streaming
stream = await client.chat.completions.create(
model=MODEL_NAME,
model=model_name,
messages=messages,
max_tokens=10,
temperature=0.0,
......@@ -204,10 +257,16 @@ async def test_chat_streaming(server, client: openai.AsyncOpenAI):
assert "".join(chunks) == output
async def test_batch_completions(server, client: openai.AsyncOpenAI):
@pytest.mark.parametrize(
# just test 1 lora hereafter
"model_name",
[MODEL_NAME, "zephyr-lora"],
)
async def test_batch_completions(server, client: openai.AsyncOpenAI,
model_name: str):
# test simple list
batch = await client.completions.create(
model=MODEL_NAME,
model=model_name,
prompt=["Hello, my name is", "Hello, my name is"],
max_tokens=5,
temperature=0.0,
......@@ -217,7 +276,7 @@ async def test_batch_completions(server, client: openai.AsyncOpenAI):
# test n = 2
batch = await client.completions.create(
model=MODEL_NAME,
model=model_name,
prompt=["Hello, my name is", "Hello, my name is"],
n=2,
max_tokens=5,
......@@ -236,7 +295,7 @@ async def test_batch_completions(server, client: openai.AsyncOpenAI):
# test streaming
batch = await client.completions.create(
model=MODEL_NAME,
model=model_name,
prompt=["Hello, my name is", "Hello, my name is"],
max_tokens=5,
temperature=0.0,
......
import pytest
import vllm.engine.metrics
MODELS = [
"facebook/opt-125m",
]
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [128])
def test_metrics(
vllm_runner,
example_prompts,
model: str,
dtype: str,
max_tokens: int,
) -> None:
vllm_model = vllm_runner(model, dtype=dtype, disable_log_stats=False)
tokenizer = vllm_model.model.get_tokenizer()
prompt_token_counts = [len(tokenizer.encode(p)) for p in example_prompts]
# This test needs at least 2 prompts in a batch of different lengths to verify their token count is correct despite padding.
assert len(example_prompts) > 1, "at least 2 prompts are required"
assert prompt_token_counts[0] != prompt_token_counts[1], (
"prompts of different lengths are required")
vllm_prompt_token_count = sum(prompt_token_counts)
_ = vllm_model.generate_greedy(example_prompts, max_tokens)
metric_count = vllm.engine.metrics.counter_prompt_tokens.get_value({})
assert vllm_prompt_token_count == metric_count, (
f"prompt token count: {vllm_prompt_token_count!r}\nmetric: {metric_count!r}"
)
......@@ -5,11 +5,20 @@ Run `pytest tests/models/test_models.py --forked`.
import pytest
MODELS = [
"facebook/opt-125m", "meta-llama/Llama-2-7b-hf",
"mistralai/Mistral-7B-v0.1", "Deci/DeciLM-7b", "tiiuae/falcon-7b", "gpt2",
"bigcode/tiny_starcoder_py", "EleutherAI/gpt-j-6b",
"EleutherAI/pythia-70m", "bigscience/bloom-560m", "mosaicml/mpt-7b",
"microsoft/phi-2", "stabilityai/stablelm-3b-4e1t"
"facebook/opt-125m",
"meta-llama/Llama-2-7b-hf",
"mistralai/Mistral-7B-v0.1",
"Deci/DeciLM-7b",
"tiiuae/falcon-7b",
"gpt2",
"bigcode/tiny_starcoder_py",
"EleutherAI/gpt-j-6b",
"EleutherAI/pythia-70m",
"bigscience/bloom-560m",
"mosaicml/mpt-7b",
"microsoft/phi-2",
"stabilityai/stablelm-3b-4e1t",
"allenai/OLMo-1B",
]
......
......@@ -26,6 +26,7 @@ def test_beam_search_single_input(
max_tokens: int,
beam_width: int,
) -> None:
example_prompts = example_prompts[:1]
hf_model = hf_runner(model, dtype=dtype)
hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width,
max_tokens)
......
import random
from typing import Tuple
from typing import Tuple, List
from unittest.mock import patch
import pytest
import torch
from transformers import GenerationConfig, GenerationMixin
from typing import Optional
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.utils import set_random_seed
......@@ -46,15 +47,13 @@ CUDA_DEVICES = [
]
@pytest.mark.parametrize("seed", RANDOM_SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
def test_sampler_all_greedy(seed: int, device: str):
set_random_seed(seed)
torch.set_default_device(device)
batch_size = random.randint(1, 256)
input_tensor, fake_logits, sampler, model_runner = _prepare_test(
batch_size)
def _do_sample(
batch_size: int,
input_tensor: torch.Tensor,
sampler: MockLogitsSampler,
model_runner: ModelRunner,
sampling_params: SamplingParams,
):
seq_group_metadata_list = []
prompt_lens = []
for i in range(batch_size):
......@@ -63,7 +62,7 @@ def test_sampler_all_greedy(seed: int, device: str):
request_id=f"test_{i}",
is_prompt=True,
seq_data={0: SequenceData([1, 2, 3])},
sampling_params=SamplingParams(temperature=0, ),
sampling_params=sampling_params,
block_tables={0: [1]},
))
prompt_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len())
......@@ -71,9 +70,23 @@ def test_sampler_all_greedy(seed: int, device: str):
sampling_metadata = model_runner._prepare_sample(seq_group_metadata_list,
prompt_lens,
subquery_lens=prompt_lens)
sampler_output = sampler(embedding=None,
hidden_states=input_tensor,
sampling_metadata=sampling_metadata)
return sampler(embedding=None,
hidden_states=input_tensor,
sampling_metadata=sampling_metadata)
@pytest.mark.parametrize("seed", RANDOM_SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
def test_sampler_all_greedy(seed: int, device: str):
set_random_seed(seed)
torch.set_default_device(device)
batch_size = random.randint(1, 256)
input_tensor, fake_logits, sampler, model_runner = _prepare_test(
batch_size)
sampling_params = SamplingParams(temperature=0)
sampler_output = _do_sample(batch_size, input_tensor, sampler,
model_runner, sampling_params)
expected = torch.argmax(fake_logits, dim=-1)
for i, sequence_output in enumerate(sampler_output):
for nth_output in sequence_output.samples:
......@@ -94,28 +107,40 @@ def test_sampler_all_random(seed: int, device: str):
for i in range(batch_size):
fake_logits[i, i] = 1e2
seq_group_metadata_list = []
prompt_lens = []
sampling_params = SamplingParams(
temperature=1.0,
n=random.randint(1, 10),
)
sampler_output = _do_sample(batch_size, input_tensor, sampler,
model_runner, sampling_params)
for i, sequence_output in enumerate(sampler_output):
for nth_output in sequence_output.samples:
assert nth_output.output_token == i
del model_runner
@pytest.mark.parametrize("seed", RANDOM_SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
def test_sampler_all_random_seed(seed: int, device: str):
set_random_seed(seed)
torch.set_default_device(device)
batch_size = random.randint(1, 256)
input_tensor, fake_logits, sampler, model_runner = _prepare_test(
batch_size)
for i in range(batch_size):
seq_group_metadata_list.append(
SequenceGroupMetadata(
request_id=f"test_{i}",
is_prompt=True,
seq_data={0: SequenceData([1, 2, 3])},
sampling_params=SamplingParams(
temperature=1.0,
n=random.randint(1, 10),
),
block_tables={0: [1]},
))
prompt_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len())
fake_logits[i, i] = 1e2
sampling_params = SamplingParams(
temperature=1.0,
n=random.randint(1, 10),
seed=random.randint(0, 10000),
)
sampler_output = _do_sample(batch_size, input_tensor, sampler,
model_runner, sampling_params)
sampling_metadata = model_runner._prepare_sample(seq_group_metadata_list,
prompt_lens,
subquery_lens=prompt_lens)
sampler_output = sampler(embedding=None,
hidden_states=input_tensor,
sampling_metadata=sampling_metadata)
for i, sequence_output in enumerate(sampler_output):
for nth_output in sequence_output.samples:
assert nth_output.output_token == i
......@@ -123,6 +148,31 @@ def test_sampler_all_random(seed: int, device: str):
del model_runner
@pytest.mark.parametrize("seed", RANDOM_SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
def test_sampler_all_random_seed_deterministic(seed: int, device: str):
set_random_seed(seed)
torch.set_default_device(device)
batch_size = random.randint(1, 256)
input_tensor, fake_logits, sampler, model_runner = _prepare_test(
batch_size)
sampling_params = SamplingParams(
temperature=1.0,
n=random.randint(1, 10),
seed=random.randint(0, 10000),
)
first_sampler_output = _do_sample(batch_size, input_tensor, sampler,
model_runner, sampling_params)
second_sampler_output = _do_sample(batch_size, input_tensor, sampler,
model_runner, sampling_params)
assert first_sampler_output == second_sampler_output
del model_runner
@pytest.mark.parametrize("seed", RANDOM_SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
def test_sampler_all_beam(seed: int, device: str):
......@@ -131,29 +181,13 @@ def test_sampler_all_beam(seed: int, device: str):
batch_size = random.randint(1, 256)
input_tensor, _, sampler, model_runner = _prepare_test(batch_size)
seq_group_metadata_list = []
prompt_lens = []
for i in range(batch_size):
seq_group_metadata_list.append(
SequenceGroupMetadata(
request_id=f"test_{i}",
is_prompt=True,
seq_data={0: SequenceData([1, 2, 3])},
sampling_params=SamplingParams(
temperature=0,
best_of=2,
use_beam_search=True,
),
block_tables={0: [1]},
))
prompt_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len())
sampling_metadata = model_runner._prepare_sample(seq_group_metadata_list,
prompt_lens,
subquery_lens=prompt_lens)
sampler(embedding=None,
hidden_states=input_tensor,
sampling_metadata=sampling_metadata)
sampling_params = SamplingParams(
temperature=0,
best_of=2,
use_beam_search=True,
)
_do_sample(batch_size, input_tensor, sampler, model_runner,
sampling_params)
# no assertion here as I am not sure how to determine whether
# the outputs are expected - in other words, this just tests
# whether there are no exceptions in the sampler
......@@ -171,14 +205,15 @@ def test_sampler_mixed(seed: int, device: str):
batch_size)
seq_group_metadata_list = []
expected_tokens = []
expected_tokens: List[Optional[List[int]]] = []
prompt_lens = []
for i in range(batch_size):
n = 1
sampling_type = random.randint(0, 2)
expected: Optional[List[int]] = None
sampling_type = random.randint(0, 3)
if sampling_type == 0:
sampling_params = SamplingParams(temperature=0)
elif sampling_type == 1:
expected = [torch.argmax(fake_logits[i], dim=-1).item()]
elif sampling_type in (1, 2):
n = random.randint(1, 10)
sampling_params = SamplingParams(
temperature=random.random() + 0.1,
......@@ -187,13 +222,17 @@ def test_sampler_mixed(seed: int, device: str):
n=n,
presence_penalty=random.randint(0, 1),
)
if sampling_type == 2:
sampling_params.seed = random.randint(0, 10000)
else:
for idx in range(n):
fake_logits[i, i + idx] = 1e2
expected = list(range(i, i + n))
else:
sampling_params = SamplingParams(temperature=0,
use_beam_search=True,
best_of=2)
for idx in range(n):
fake_logits[i, i + idx] = 1e2
expected_tokens.append(i + idx)
expected_tokens.append(expected)
seq_group_metadata_list.append(
SequenceGroupMetadata(
request_id=f"test_{i}",
......@@ -204,17 +243,50 @@ def test_sampler_mixed(seed: int, device: str):
))
prompt_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len())
sampling_metadata = model_runner._prepare_sample(seq_group_metadata_list,
prompt_lens,
subquery_lens=prompt_lens)
sampler_output = sampler(embedding=None,
hidden_states=input_tensor,
sampling_metadata=sampling_metadata)
for i, sequence_output in enumerate(sampler_output):
if seq_group_metadata_list[i].sampling_params.use_beam_search:
continue
for nth_output in sequence_output.samples:
assert nth_output.output_token in expected_tokens
def test_sampling(model_runner: ModelRunner):
sampling_metadata = model_runner._prepare_sample(
seq_group_metadata_list, prompt_lens, subquery_lens=prompt_lens)
sampler_output = sampler(embedding=None,
hidden_states=input_tensor,
sampling_metadata=sampling_metadata)
for i, (sequence_output, metadata) in enumerate(
zip(sampler_output, seq_group_metadata_list)):
if metadata.sampling_params.use_beam_search:
continue
if metadata.sampling_params.seed is not None \
and expected_tokens[i] is None:
# Record seeded random result to compare with results of second invocation
expected_tokens[i] = [
nth_output.output_token
for nth_output in sequence_output.samples
]
continue
for n, nth_output in enumerate(sequence_output.samples):
if metadata.sampling_params.temperature == 0 or metadata.sampling_params.seed is not None:
# Ensure exact matches for greedy or random with seed
assert nth_output.output_token == expected_tokens[i][n]
else:
# For non-seeded random, check that one of the high-logit tokens was chosen
assert nth_output.output_token in expected_tokens[i]
# Test batch
test_sampling(model_runner)
# Shuffle the batch and resample
target_index = list(range(batch_size))
for list_to_shuffle in (target_index, seq_group_metadata_list,
expected_tokens, prompt_lens):
random.Random(seed).shuffle(list_to_shuffle)
target_index = torch.tensor(target_index)
input_tensor.data = input_tensor.index_select(0, target_index)
fake_logits.data = fake_logits.index_select(0, target_index)
# This time, results of seeded random samples will be compared with the corresponding
# sample in the pre-shuffled batch
test_sampling(model_runner)
del model_runner
......
"""Verify that seeded random sampling is deterministic.
Run `pytest tests/samplers/test_seeded_generate.py --forked`.
"""
import copy
import random
from itertools import combinations
import pytest
from vllm.model_executor.utils import set_random_seed
from vllm import SamplingParams
MODEL = "facebook/opt-125m"
RANDOM_SEEDS = list(range(5))
@pytest.fixture
def vllm_model(vllm_runner):
vllm_model = vllm_runner(MODEL, dtype="half")
yield vllm_model
del vllm_model
@pytest.mark.parametrize("seed", RANDOM_SEEDS)
def test_random_sample_with_seed(
vllm_model,
example_prompts,
seed: int,
) -> None:
set_random_seed(seed)
sampling_params = SamplingParams(
# Parameters to ensure sufficient randomness
temperature=2.0,
top_p=min(random.random() + 0.3, 1),
top_k=random.randint(5, 20),
n=random.randint(1, 10),
presence_penalty=random.randint(0, 1),
max_tokens=8,
ignore_eos=True,
)
sampling_params_seed_1 = copy.deepcopy(sampling_params)
sampling_params_seed_1.seed = 100
sampling_params_seed_2 = copy.deepcopy(sampling_params)
sampling_params_seed_2.seed = 200
llm = vllm_model.model
for prompt in example_prompts:
for params in (
sampling_params,
sampling_params_seed_1,
sampling_params_seed_2,
sampling_params,
sampling_params_seed_1,
sampling_params_seed_2,
):
llm._add_request(
prompt=prompt,
prompt_token_ids=None,
sampling_params=params,
)
results = llm._run_engine(use_tqdm=False)
all_outputs = [[out.token_ids for out in output.outputs]
for output in results]
for i in range(0, len(example_prompts), 6):
outputs = all_outputs[i:i + 6]
# verify all non-seeded requests differ
for output_a, output_b in combinations(
(outputs[0], outputs[1], outputs[2], outputs[3]),
2,
):
assert output_a != output_b
# verify requests with the same seed match
assert outputs[1] == outputs[4]
assert outputs[2] == outputs[5]
......@@ -9,7 +9,7 @@ from vllm.outputs import CompletionOutput, RequestOutput
from vllm.sampling_params import SamplingParams
from vllm.version import __dcu_version__
__version__ = "0.3.1"
__version__ = "0.3.2"
__all__ = [
"LLM",
......
......@@ -44,6 +44,9 @@ class ModelConfig:
revision: The specific model version to use. It can be a branch name,
a tag name, or a commit id. If unspecified, will use the default
version.
code_revision: The specific revision to use for the model code on
Hugging Face Hub. It can be a branch name, a tag name, or a
commit id. If unspecified, will use the default version.
tokenizer_revision: The specific tokenizer version to use. It can be a
branch name, a tag name, or a commit id. If unspecified, will use
the default version.
......@@ -70,6 +73,7 @@ class ModelConfig:
dtype: Union[str, torch.dtype],
seed: int,
revision: Optional[str] = None,
code_revision: Optional[str] = None,
tokenizer_revision: Optional[str] = None,
max_model_len: Optional[int] = None,
quantization: Optional[str] = None,
......@@ -84,6 +88,7 @@ class ModelConfig:
self.load_format = load_format
self.seed = seed
self.revision = revision
self.code_revision = code_revision
self.tokenizer_revision = tokenizer_revision
self.quantization = quantization
self.enforce_eager = enforce_eager
......@@ -103,7 +108,8 @@ class ModelConfig:
self.download_dir = model_path
self.tokenizer = model_path
self.hf_config = get_config(self.model, trust_remote_code, revision)
self.hf_config = get_config(self.model, trust_remote_code, revision,
code_revision)
self.dtype = _get_and_verify_dtype(self.hf_config, dtype)
self.max_model_len = _get_and_verify_max_len(self.hf_config,
max_model_len)
......
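As the updated ``ModelConfig`` docstring above notes, ``code_revision`` is separate from ``revision``: ``revision`` pins the model weights and configuration files, while ``code_revision`` pins the custom modeling code fetched for ``trust_remote_code`` models. A minimal sketch of the distinction using Hugging Face's ``AutoConfig`` (roughly where vLLM's ``get_config`` helper forwards these arguments; the repository id and revision values are placeholders):
.. code-block:: python
    from transformers import AutoConfig
    # Hypothetical remote-code model; both revision values are placeholders.
    config = AutoConfig.from_pretrained(
        "some-org/custom-model",   # placeholder repo id
        trust_remote_code=True,
        revision="main",           # snapshot of the weights/config to load
        code_revision="main",      # snapshot of the modeling *code* to load
    )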