Merge tag 'v0.5.3.post1' into v0.5.3.post1-dtk24.04.1

500b93c8 · zhuwenwen · 99426767 · 38c4b7e8 · 500b93c8 · 500b93c8
Commit 500b93c8 authored Jul 25, 2024 by zhuwenwen
20 changed files
--- a/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu
+++ b/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu
@@ -30,7 +30,7 @@ inline std::string str(T x) {
  return std::to_string(x);
 }

-namespace marlin {
+namespace marlin_dense {

 // Integer division of a by b, rounded up (intended for positive operands).
 constexpr int ceildiv(int a, int b) { return (b + a - 1) / b; }

@@ -1040,7 +1040,7 @@ void marlin_cuda(const void* A, const void* B, void* C, void* s, int prob_m,
  }
 }

-}  // namespace marlin
+}  // namespace marlin_dense

 torch::Tensor marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
                          torch::Tensor& b_scales, torch::Tensor& workspace,
@@ -1054,24 +1054,25 @@ torch::Tensor marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
  TORCH_CHECK(size_k == a.size(1),
              "Shape mismatch: a.size(1) = " + str(a.size(1)) +
                  ", size_k = " + str(size_k));
-  TORCH_CHECK(size_k % marlin::tile_size == 0,
-              "size_k = " + str(size_k) +
-                  " is not divisible by tile_size = " + str(marlin::tile_size));
-  TORCH_CHECK((size_k / marlin::tile_size) == b_q_weight.size(0),
+  TORCH_CHECK(size_k % marlin_dense::tile_size == 0,
+              "size_k = " + str(size_k) + " is not divisible by tile_size = " +
+                  str(marlin_dense::tile_size));
+  TORCH_CHECK((size_k / marlin_dense::tile_size) == b_q_weight.size(0),
              "Shape mismatch: b_q_weight.size(0) = " +
                  str(b_q_weight.size(0)) + ", size_k = " + str(size_k) +
-                  ", tile_size = " + str(marlin::tile_size));
+                  ", tile_size = " + str(marlin_dense::tile_size));

  // Verify N
  TORCH_CHECK(b_scales.size(1) == size_n,
              "b_scales.size(1) = " + str(b_scales.size(1)) +
                  ", size_n = " + str(size_n));
-  TORCH_CHECK(b_q_weight.size(1) % marlin::tile_size == 0,
-              "b_q_weight.size(1) = " + str(b_q_weight.size(1)) +
-                  " is not divisible by tile_size = " + str(marlin::tile_size));
+  TORCH_CHECK(
+      b_q_weight.size(1) % marlin_dense::tile_size == 0,
+      "b_q_weight.size(1) = " + str(b_q_weight.size(1)) +
+          " is not divisible by tile_size = " + str(marlin_dense::tile_size));

-  int actual_size_n =
-      (b_q_weight.size(1) / marlin::tile_size) * marlin::pack_factor_4bit;
+  int actual_size_n = (b_q_weight.size(1) / marlin_dense::tile_size) *
+                      marlin_dense::pack_factor_4bit;
  TORCH_CHECK(
      size_n == actual_size_n,
      "size_n = " + str(size_n) + ", actual_size_n = " + str(actual_size_n));
@@ -1116,21 +1117,22 @@ torch::Tensor marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
              "Unexpected groupsize = " + str(groupsize));

  // Verify workspace size
-  TORCH_CHECK(
-      size_n % marlin::min_thread_n == 0,
-      "size_n = " + str(size_n) +
-          ", is not divisible by min_thread_n = " + str(marlin::min_thread_n));
-  int min_workspace_size = (size_n / marlin::min_thread_n) * marlin::max_par;
+  TORCH_CHECK(size_n % marlin_dense::min_thread_n == 0,
+              "size_n = " + str(size_n) +
+                  ", is not divisible by min_thread_n = " +
+                  str(marlin_dense::min_thread_n));
+  int min_workspace_size =
+      (size_n / marlin_dense::min_thread_n) * marlin_dense::max_par;
  TORCH_CHECK(workspace.numel() >= min_workspace_size,
              "workspace.numel = " + str(workspace.numel()) +
                  " is below min_workspace_size = " + str(min_workspace_size));

  int dev = a.get_device();
-  marlin::marlin_cuda(a.data_ptr(), b_q_weight.data_ptr(), c.data_ptr(),
-                      b_scales.data_ptr(), size_m, size_n, size_k,
-                      workspace.data_ptr(), groupsize, dev,
-                      at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n,
-                      sms, marlin::max_par);
+  marlin_dense::marlin_cuda(a.data_ptr(), b_q_weight.data_ptr(), c.data_ptr(),
+                            b_scales.data_ptr(), size_m, size_n, size_k,
+                            workspace.data_ptr(), groupsize, dev,
+                            at::cuda::getCurrentCUDAStream(dev), thread_k,
+                            thread_n, sms, marlin_dense::max_par);

  return c;
 }
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -27,8 +27,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      "    Tensor value_cache, int num_kv_heads, float scale,"
      "    Tensor block_tables, Tensor seq_lens, int block_size,"
      "    int max_seq_len, Tensor? alibi_slopes,"
-      "    str kv_cache_dtype, float kv_scale, int tp_rank,"
-      "    int blocksparse_local_blocks,"
+      "    str kv_cache_dtype, float k_scale, float v_scale,"
+      "    int tp_rank, int blocksparse_local_blocks,"
      "    int blocksparse_vert_stride, int blocksparse_block_size,"
      "    int blocksparse_head_sliding_step) -> ()");
  ops.impl("paged_attention_v1", torch::kCUDA, &paged_attention_v1);
@@ -41,8 +41,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      "    Tensor value_cache, int num_kv_heads, float scale,"
      "    Tensor block_tables, Tensor seq_lens, int block_size,"
      "    int max_seq_len, Tensor? alibi_slopes,"
-      "    str kv_cache_dtype, float kv_scale, int tp_rank,"
-      "    int blocksparse_local_blocks,"
+      "    str kv_cache_dtype, float k_scale, float v_scale,"
+      "    int tp_rank, int blocksparse_local_blocks,"
      "    int blocksparse_vert_stride, int blocksparse_block_size,"
      "    int blocksparse_head_sliding_step) -> ()");
  ops.impl("paged_attention_v2", torch::kCUDA, &paged_attention_v2);
@@ -72,6 +72,10 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  ops.def("gelu_quick(Tensor! out, Tensor input) -> ()");
  ops.impl("gelu_quick", torch::kCUDA, &gelu_quick);

+  // prepare_inputs advance_step
+  ops.def("advance_step", &advance_step);
+  ops.impl("advance_step", torch::kCUDA, &advance_step);
+
  // Layernorm
  // Apply Root Mean Square (RMS) Normalization to the input tensor.
  ops.def(
@@ -146,6 +150,10 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  ops.def("gptq_marlin_repack", &gptq_marlin_repack);
  ops.impl("gptq_marlin_repack", torch::kCUDA, &gptq_marlin_repack);

+  // awq_marlin repack from AWQ.
+  ops.def("awq_marlin_repack", &awq_marlin_repack);
+  ops.impl("awq_marlin_repack", torch::kCUDA, &awq_marlin_repack);
+
  // fp8_marlin Optimized Quantized GEMM for FP8 weight-only.
  ops.def("fp8_marlin_gemm", &fp8_marlin_gemm);
  ops.impl("fp8_marlin_gemm", torch::kCUDA, &fp8_marlin_gemm);
@@ -188,12 +196,21 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
 //       "static_scaled_fp8_quant(Tensor! out, Tensor input, Tensor scale) -> ()");
 //   ops.impl("static_scaled_fp8_quant", torch::kCUDA, &static_scaled_fp8_quant);

-  // Compute FP8 quantized tensor and scaling factor.
+
+  // Compute dynamic-per-tensor FP8 quantized tensor and scaling factor.
 //   ops.def(
 //       "dynamic_scaled_fp8_quant(Tensor! out, Tensor input, Tensor! scale) -> "
 //       "()");
 //   ops.impl("dynamic_scaled_fp8_quant", torch::kCUDA, &dynamic_scaled_fp8_quant);

+  // Compute dynamic-per-token FP8 quantized tensor and scaling factor.
+//   ops.def(
+//       "dynamic_per_token_scaled_fp8_quant(Tensor! out, Tensor input, Tensor! "
+//       "scale, Tensor? scale_ub) -> "
+//       "()");
+//   ops.impl("dynamic_per_token_scaled_fp8_quant", torch::kCUDA,
+//            &dynamic_per_token_scaled_fp8_quant);
+
  // Aligning the number of tokens to be processed by each expert such
  // that it is divisible by the block size.
  ops.def(
@@ -236,7 +253,7 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
      "                  Tensor! key_cache, Tensor! value_cache,"
      "                  Tensor slot_mapping,"
      "                  str kv_cache_dtype,"
-      "                  float kv_scale) -> ()");
+      "                  float k_scale, float v_scale) -> ()");
  cache_ops.impl("reshape_and_cache", torch::kCUDA, &reshape_and_cache);

  // Reshape the key and value tensors and cache them.

--- a/docs/requirements-docs.txt
+++ b/docs/requirements-docs.txt
@@ -2,7 +2,7 @@ sphinx==6.2.1
 sphinx-book-theme==1.0.1
 sphinx-copybutton==0.5.2
 myst-parser==2.0.0
-sphinx-argparse
+sphinx-argparse==0.4.0

 # packages to install to build the documentation
 pydantic

--- a/docs/source/community/sponsors.md
+++ b/docs/source/community/sponsors.md
@@ -13,6 +13,7 @@ vLLM is a community project. Our compute resources for development and testing a
 - Databricks
 - DeepInfra
 - Dropbox
+- Google Cloud
 - Lambda Lab
 - NVIDIA
 - Replicate

--- a/docs/source/dev/multimodal/multimodal_index.rst
+++ b/docs/source/dev/multimodal/multimodal_index.rst
@@ -8,7 +8,7 @@ Multi-Modality
 vLLM provides experimental support for multi-modal models through the :mod:`vllm.multimodal` package.

 Multi-modal inputs can be passed alongside text and token prompts to :ref:`supported models <supported_vlms>`
-via the ``multi_modal_data`` field in :class:`vllm.inputs.PromptStrictInputs`.
+via the ``multi_modal_data`` field in :class:`vllm.inputs.PromptInputs`.

 Currently, vLLM only has built-in support for image data. You can extend vLLM to process additional modalities
 by following :ref:`this guide <adding_multimodal_plugin>`.

--- a/docs/source/dev/offline_inference/llm_inputs.rst
+++ b/docs/source/dev/offline_inference/llm_inputs.rst
 LLM Inputs
 ==========

-.. autodata:: vllm.inputs.PromptStrictInputs
+.. autodata:: vllm.inputs.PromptInputs

 .. autoclass:: vllm.inputs.TextPrompt
    :show-inheritance:

--- a/docs/source/getting_started/amd-installation.rst
+++ b/docs/source/getting_started/amd-installation.rst
@@ -3,7 +3,7 @@
 Installation with ROCm
 ======================

-vLLM supports AMD GPUs with ROCm 5.7 and 6.0.
+vLLM supports AMD GPUs with ROCm 6.1.

 Requirements
 ------------
@@ -11,7 +11,7 @@ Requirements
 * OS: Linux
 * Python: 3.8 -- 3.11
 * GPU: MI200s (gfx90a), MI300 (gfx942), Radeon RX 7900 series (gfx1100)
-* ROCm 6.0 and ROCm 5.7
+* ROCm 6.1

 Installation options:

@@ -27,10 +27,10 @@ You can build and install vLLM from source.

 First, build a docker image from `Dockerfile.rocm <https://github.com/vllm-project/vllm/blob/main/Dockerfile.rocm>`_ and launch a docker container from the image.

-`Dockerfile.rocm <https://github.com/vllm-project/vllm/blob/main/Dockerfile.rocm>`_ uses ROCm 6.0 by default, but also supports ROCm 5.7.
+`Dockerfile.rocm <https://github.com/vllm-project/vllm/blob/main/Dockerfile.rocm>`_ uses ROCm 6.1 by default, but also supports ROCm 5.7 and 6.0 in older vLLM branches.
 It provides flexibility to customize the build of docker image using the following arguments:

-* `BASE_IMAGE`: specifies the base image used when running ``docker build``, specifically the PyTorch on ROCm base image. We have tested ROCm 5.7 and ROCm 6.0. The default is `rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1`
+* `BASE_IMAGE`: specifies the base image used when running ``docker build``, specifically the PyTorch on ROCm base image.
 * `BUILD_FA`: specifies whether to build CK flash-attention. The default is 1. For `Radeon RX 7900 series (gfx1100) <https://rocm.docs.amd.com/projects/radeon/en/latest/index.html>`_, this should be set to 0 before flash-attention supports this target.
 * `FX_GFX_ARCHS`: specifies the GFX architecture that is used to build CK flash-attention, for example, `gfx90a;gfx942` for MI200 and MI300. The default is `gfx90a;gfx942`
 * `FA_BRANCH`: specifies the branch used to build the CK flash-attention in `ROCm's flash-attention repo <https://github.com/ROCmSoftwarePlatform/flash-attention>`_. The default is `ae7928c`
@@ -39,24 +39,17 @@ It provides flexibility to customize the build of docker image using the followi
 Their values can be passed in when running ``docker build`` with ``--build-arg`` options.


-To build vllm on ROCm 6.0 for MI200 and MI300 series, you can use the default:
+To build vllm on ROCm 6.1 for MI200 and MI300 series, you can use the default:

 .. code-block:: console

-    $ docker build -f Dockerfile.rocm -t vllm-rocm .
+    $ DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm -t vllm-rocm .

-To build vllm on ROCm 6.0 for Radeon RX7900 series (gfx1100), you should specify ``BUILD_FA`` as below:
+To build vllm on ROCm 6.1 for Radeon RX7900 series (gfx1100), you should specify ``BUILD_FA`` as below:

 .. code-block:: console

-    $ docker build --build-arg BUILD_FA="0" -f Dockerfile.rocm -t vllm-rocm .
-
-To build docker image for vllm on ROCm 5.7, you can specify ``BASE_IMAGE`` as below:
-
-.. code-block:: console
-
-    $ docker build --build-arg BASE_IMAGE="rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" \
-       -f Dockerfile.rocm -t vllm-rocm . 
+    $ DOCKER_BUILDKIT=1 docker build --build-arg BUILD_FA="0" -f Dockerfile.rocm -t vllm-rocm .

 To run the above docker image ``vllm-rocm``, use the below command:

@@ -85,39 +78,24 @@ Option 2: Build from source
 0. Install prerequisites (skip if you are already in an environment/docker with the following installed):

 - `ROCm <https://rocm.docs.amd.com/en/latest/deploy/linux/index.html>`_
- `Pytorch <https://pytorch.org/>`_
+- `PyTorch <https://pytorch.org/>`_
 - `hipBLAS <https://rocm.docs.amd.com/projects/hipBLAS/en/latest/install.html>`_

-For installing PyTorch, you can start from a fresh docker image, e.g, `rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging`, `rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1`, `rocm/pytorch-nightly`.
-
-Alternatively, you can install pytorch using pytorch wheels. You can check Pytorch installation guild in Pytorch `Getting Started <https://pytorch.org/get-started/locally/>`_
-
-For rocm6.0:
-
-.. code-block:: console
-
-    $ pip3 install torch --index-url https://download.pytorch.org/whl/rocm6.0
-
-
-For rocm5.7:
-
-.. code-block:: console
+For installing PyTorch, you can start from a fresh docker image, e.g., `rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging`, `rocm/pytorch-nightly`.

-    $ pip install torch --index-url https://download.pytorch.org/whl/rocm5.7
+Alternatively, you can install PyTorch using PyTorch wheels. You can check the PyTorch installation guide in PyTorch `Getting Started <https://pytorch.org/get-started/locally/>`_


 1. Install `Triton flash attention for ROCm <https://github.com/ROCm/triton>`_

 Install ROCm's Triton flash attention (the default triton-mlir branch) following the instructions from `ROCm/triton <https://github.com/ROCm/triton/blob/triton-mlir/README.md>`_

-2. Optionally, if you choose to use CK flash attention, you can install `flash attention for ROCm <https://github.com/ROCm/flash-attention/tree/flash_attention_for_rocm>`_
+2. Optionally, if you choose to use CK flash attention, you can install `flash attention for ROCm <https://github.com/ROCm/flash-attention/tree/ck_tile>`_

-Install ROCm's flash attention (v2.0.4) following the instructions from `ROCm/flash-attention <https://github.com/ROCm/flash-attention/tree/flash_attention_for_rocm#amd-gpurocm-support>`_
+Install ROCm's flash attention (v2.5.9.post1) following the instructions from `ROCm/flash-attention <https://github.com/ROCm/flash-attention/tree/ck_tile#amd-gpurocm-support>`_
+Alternatively, wheels intended for vLLM use can be accessed under the releases.

 .. note::
-    - If you are using rocm5.7 with pytorch 2.1.0 onwards, you don't need to apply the `hipify_python.patch`. You can build the ROCm flash attention directly.
-    - If you fail to install `ROCm/flash-attention`, try cloning from the commit `6fd2f8e572805681cd67ef8596c7e2ce521ed3c6`.
-    - ROCm's Flash-attention-2 (v2.0.4) does not support sliding windows attention.
    - You might need to downgrade the "ninja" version to 1.10 it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`)

 3. Build vLLM.
@@ -131,7 +109,7 @@ Install ROCm's flash attention (v2.0.4) following the instructions from `ROCm/fl

 .. tip::

-    - You may need to turn on the ``--enforce-eager`` flag if you experience process hang when running the `benchmark_thoughput.py` script to test your installation.
    - Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers.
-    - To use CK flash-attention, please use this flag ``export VLLM_USE_TRITON_FLASH_ATTN=0`` to turn off triton flash attention. 
-    - The ROCm version of pytorch, ideally, should match the ROCm driver version.
+    - Triton flash attention does not currently support sliding window attention. If using half precision, please use CK flash-attention for sliding window support.
+    - To use CK flash-attention or PyTorch naive attention, please use this flag ``export VLLM_USE_TRITON_FLASH_ATTN=0`` to turn off triton flash attention. 
+    - The ROCm version of PyTorch, ideally, should match the ROCm driver version.
--- a/docs/source/getting_started/debugging.rst
+++ b/docs/source/getting_started/debugging.rst
@@ -19,9 +19,6 @@ If you have already taken care of the above issues, but the vLLM instance still
 - Set the environment variable ``export NCCL_DEBUG=TRACE`` to turn on more logging for NCCL.
 - Set the environment variable ``export VLLM_TRACE_FUNCTION=1``. All the function calls in vLLM will be recorded. Inspect these log files, and tell which function crashes or hangs.

-  .. warning::
-    vLLM function tracing will generate a lot of logs and slow down the system. Only use it for debugging purposes.
-
 With more logging, hopefully you can find the root cause of the issue.

 If it crashes, and the error trace shows somewhere around ``self.graph.replay()`` in ``vllm/worker/model_runner.py``, it is a cuda error inside cudagraph. To know the particular cuda operation that causes the error, you can add ``--enforce-eager`` to the command line, or ``enforce_eager=True`` to the ``LLM`` class, to disable the cudagraph optimization. This way, you can locate the exact cuda operation that causes the error.
@@ -67,3 +64,7 @@ Here are some common issues that can cause hangs:
    If the script runs successfully, you should see the message ``sanity check is successful!``.

 If the problem persists, feel free to `open an issue on GitHub <https://github.com/vllm-project/vllm/issues/new/choose>`_, with a detailed description of the issue, your environment, and the logs.
+
+.. warning::
+
+    After you find the root cause and solve the issue, remember to turn off all the debugging environment variables defined above, or simply start a new shell to avoid being affected by the debugging settings. If you don't do this, the system might be slow because many debugging functionalities are turned on.
--- a/docs/source/getting_started/installation.rst
+++ b/docs/source/getting_started/installation.rst
@@ -49,11 +49,10 @@ You can install vLLM using pip:
    .. code-block:: console

        $ export VLLM_VERSION=0.5.2 # vLLM's main branch version is currently set to latest released tag
-        $ export PYTHON_VERSION=310
-        $ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-${VLLM_VERSION}-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl
+        $ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-${VLLM_VERSION}-cp38-abi3-manylinux1_x86_64.whl
        $ # You can also access a specific commit
        $ # export VLLM_COMMIT=...
-        $ # pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_COMMIT}/vllm-${VLLM_VERSION}-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl
+        $ # pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_COMMIT}/vllm-${VLLM_VERSION}-cp38-abi3-manylinux1_x86_64.whl


 .. _build_from_source:

--- a/docs/source/getting_started/quickstart.rst
+++ b/docs/source/getting_started/quickstart.rst
@@ -73,16 +73,13 @@ Start the server:

 .. code-block:: console

-    $ python -m vllm.entrypoints.openai.api_server \
-    $     --model facebook/opt-125m
+    $ vllm serve facebook/opt-125m

 By default, the server uses a predefined chat template stored in the tokenizer. You can override this template by using the ``--chat-template`` argument:

 .. code-block:: console

-   $ python -m vllm.entrypoints.openai.api_server \
-   $     --model facebook/opt-125m \
-   $     --chat-template ./examples/template_chatml.jinja
+    $ vllm serve facebook/opt-125m --chat-template ./examples/template_chatml.jinja

 This server can be queried in the same format as OpenAI API. For example, list the models:


--- a/docs/source/getting_started/tpu-installation.rst
+++ b/docs/source/getting_started/tpu-installation.rst
@@ -56,7 +56,7 @@ First, install the dependencies:
    $ pip uninstall torch torch-xla -y

    $ # Install PyTorch and PyTorch XLA.
-    $ export DATE="+20240601"
+    $ export DATE="+20240713"
    $ pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-nightly${DATE}-cp310-cp310-linux_x86_64.whl
    $ pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-nightly${DATE}-cp310-cp310-linux_x86_64.whl

@@ -85,7 +85,7 @@ Next, build vLLM from source. This will only take a few seconds:
        ImportError: libopenblas.so.0: cannot open shared object file: No such file or directory


-    You can install OpenBLAS with the following command:
+    Please install OpenBLAS with the following command:

    .. code-block:: console


--- a/docs/source/models/adding_model.rst
+++ b/docs/source/models/adding_model.rst
@@ -114,7 +114,7 @@ Just add the following lines in your code:
    from your_code import YourModelForCausalLM
    ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM)

-If you are running api server with `python -m vllm.entrypoints.openai.api_server args`, you can wrap the entrypoint with the following code:
+If you are running api server with :code:`vllm serve <args>`, you can wrap the entrypoint with the following code:

 .. code-block:: python

@@ -124,4 +124,4 @@ If you are running api server with `python -m vllm.entrypoints.openai.api_server
    import runpy
    runpy.run_module('vllm.entrypoints.openai.api_server', run_name='__main__')

-Save the above code in a file and run it with `python your_file.py args`.
+Save the above code in a file and run it with :code:`python your_file.py <args>`.
--- a/docs/source/models/engine_args.rst
+++ b/docs/source/models/engine_args.rst
@@ -8,7 +8,7 @@ Below, you can find an explanation of every engine argument for vLLM:
 .. argparse::
    :module: vllm.engine.arg_utils
    :func: _engine_args_parser
-    :prog: -m vllm.entrypoints.openai.api_server
+    :prog: vllm serve
    :nodefaultconst:

 Async Engine Arguments
@@ -19,5 +19,5 @@ Below are the additional arguments related to the asynchronous engine:
 .. argparse::
    :module: vllm.engine.arg_utils
    :func: _async_engine_args_parser
-    :prog: -m vllm.entrypoints.openai.api_server
+    :prog: vllm serve
    :nodefaultconst:
\ No newline at end of file
--- a/docs/source/models/lora.rst
+++ b/docs/source/models/lora.rst
@@ -61,10 +61,12 @@ LoRA adapted models can also be served with the Open-AI compatible vLLM server.

 .. code-block:: bash

-    python -m vllm.entrypoints.openai.api_server \
-        --model meta-llama/Llama-2-7b-hf \
+    vllm serve meta-llama/Llama-2-7b-hf \
        --enable-lora \
-        --lora-modules sql-lora=~/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/
+        --lora-modules sql-lora=$HOME/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/
+
+.. note::
+   The commit ID `0dfa347e8877a4d4ed19ee56c140fa518470028c` may change over time. Please check the latest commit ID in your environment to ensure you are using the correct one.

 The server entrypoint accepts all other LoRA configuration parameters (``max_loras``, ``max_lora_rank``, ``max_cpu_loras``,
 etc.), which will apply to all forthcoming requests. Upon querying the ``/models`` endpoint, we should see our LoRA along

--- a/docs/source/models/supported_models.rst
+++ b/docs/source/models/supported_models.rst
@@ -94,8 +94,8 @@ Decoder-only Language Models
    - :code:`ai21labs/Jamba-v0.1`, etc.
    - ✅︎
  * - :code:`LlamaForCausalLM`
-    - LLaMA, Llama 2, Meta Llama 3, Vicuna, Alpaca, Yi
-    - :code:`meta-llama/Meta-Llama-3-8B-Instruct`, :code:`meta-llama/Meta-Llama-3-70B-Instruct`, :code:`meta-llama/Llama-2-13b-hf`, :code:`meta-llama/Llama-2-70b-hf`, :code:`openlm-research/open_llama_13b`, :code:`lmsys/vicuna-13b-v1.3`, :code:`01-ai/Yi-6B`, :code:`01-ai/Yi-34B`, etc.
+    - Llama 3.1, Llama 3, Llama 2, LLaMA, Yi
+    - :code:`meta-llama/Meta-Llama-3.1-405B-Instruct`, :code:`meta-llama/Meta-Llama-3.1-70B`, :code:`meta-llama/Meta-Llama-3-70B-Instruct`, :code:`meta-llama/Llama-2-70b-hf`, :code:`01-ai/Yi-34B`, etc.
    - ✅︎
  * - :code:`MiniCPMForCausalLM`
    - MiniCPM
@@ -182,6 +182,10 @@ Vision Language Models
    - Models
    - Example HuggingFace Models
    - :ref:`LoRA <lora>`
+  * - :code:`ChameleonForConditionalGeneration`
+    - Chameleon
+    - :code:`facebook/chameleon-7b` etc.
+    - 
  * - :code:`FuyuForCausalLM`
    - Fuyu
    - :code:`adept/fuyu-8b` etc.

--- a/docs/source/models/vlm.rst
+++ b/docs/source/models/vlm.rst
@@ -30,7 +30,7 @@ To initialize a VLM, the aforementioned arguments must be passed to the ``LLM``
    internally for each model.


-To pass an image to the model, note the following in :class:`vllm.inputs.PromptStrictInputs`:
+To pass an image to the model, note the following in :class:`vllm.inputs.PromptInputs`:

 * ``prompt``: The prompt should follow the format that is documented on HuggingFace.
 * ``multi_modal_data``: This is a dictionary that follows the schema defined in :class:`vllm.multimodal.MultiModalDataDict`. 
@@ -94,9 +94,7 @@ Below is an example on how to launch the same ``llava-hf/llava-1.5-7b-hf`` with

 .. code-block:: bash

-    python -m vllm.entrypoints.openai.api_server \
-        --model llava-hf/llava-1.5-7b-hf \
-        --chat-template template_llava.jinja
+    vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja

 .. important::
    We have removed all vision language related CLI args in the ``0.5.1`` release. **This is a breaking change**, so please update your code to follow

--- a/docs/source/serving/deploying_with_cerebrium.rst
+++ b/docs/source/serving/deploying_with_cerebrium.rst
@@ -28,6 +28,9 @@ Next, to install the required packages, add the following to your cerebrium.toml

 .. code-block:: toml

+    [cerebrium.deployment]
+    docker_base_image_url = "nvidia/cuda:12.1.1-runtime-ubuntu22.04"
+
    [cerebrium.dependencies.pip]
    vllm = "latest"


--- a/docs/source/serving/deploying_with_dstack.rst
+++ b/docs/source/serving/deploying_with_dstack.rst
@@ -40,7 +40,7 @@ Next, to provision a VM instance with LLM of your choice(`NousResearch/Llama-2-7
        gpu: 24GB
    commands:
        - pip install vllm
-        - python -m vllm.entrypoints.openai.api_server --model $MODEL --port 8000
+        - vllm serve $MODEL --port 8000
    model:
        format: openai
        type: chat

--- a/docs/source/serving/distributed_serving.rst
+++ b/docs/source/serving/distributed_serving.rst
 .. _distributed_serving:

+Distributed Inference and Serving
+=================================
+
 How to decide the distributed inference strategy?
-=================================================
+-------------------------------------------------

 Before going into the details of distributed inference and serving, let's first make it clear when to use distributed inference and what are the strategies available. The common practice is:

@@ -16,8 +19,8 @@ After adding enough GPUs and nodes to hold the model, you can run vLLM first, wh
 .. note::
    There is one edge case: if the model fits in a single node with multiple GPUs, but the number of GPUs cannot divide the model size evenly, you can use pipeline parallelism, which splits the model along layers and supports uneven splits. In this case, the tensor parallel size should be 1 and the pipeline parallel size should be the number of GPUs.

-Distributed Inference and Serving
-=================================
+Details for Distributed Inference and Serving
+----------------------------------------------

 vLLM supports distributed tensor-parallel inference and serving. Currently, we support `Megatron-LM's tensor parallel algorithm <https://arxiv.org/pdf/1909.08053.pdf>`_.  We also support pipeline parallel as a beta feature for online serving. We manage the distributed runtime with either `Ray <https://github.com/ray-project/ray>`_ or python native multiprocessing. Multiprocessing can be used when deploying on a single node, multi-node inferencing currently requires Ray.

@@ -35,36 +38,73 @@ To run multi-GPU serving, pass in the :code:`--tensor-parallel-size` argument wh

 .. code-block:: console

-    $ python -m vllm.entrypoints.openai.api_server \
-    $     --model facebook/opt-13b \
+    $ vllm serve facebook/opt-13b \
    $     --tensor-parallel-size 4

 You can also additionally specify :code:`--pipeline-parallel-size` to enable pipeline parallelism. For example, to run API server on 8 GPUs with pipeline parallelism and tensor parallelism:

 .. code-block:: console

-    $ python -m vllm.entrypoints.openai.api_server \
-    $     --model gpt2 \
+    $ vllm serve gpt2 \
    $     --tensor-parallel-size 4 \
-    $     --pipeline-parallel-size 2 \
-    $     --distributed-executor-backend ray
+    $     --pipeline-parallel-size 2

 .. note::
-    Pipeline parallel is a beta feature. It is only supported for online serving and the ray backend for now, as well as LLaMa and GPT2 style models.
+    Pipeline parallel is a beta feature. It is only supported for online serving, and only for LLaMa, GPT2, and Mixtral style models.
+
+Multi-Node Inference and Serving
+--------------------------------
+
+If a single node does not have enough GPUs to hold the model, you can run the model using multiple nodes. It is important to make sure the execution environment is the same on all nodes, including the model path and the Python environment. The recommended way is to use docker images to ensure the same environment, and hide the heterogeneity of the host machines by mapping them into the same docker configuration.
+
+The first step is to start containers and organize them into a cluster. We have provided a helper `script <https://github.com/vllm-project/vllm/tree/main/examples/run_cluster.sh>`_ to start the cluster.
+
+Pick a node as the head node, and run the following command:
+
+.. code-block:: console
+
+    $ bash run_cluster.sh \
+    $                   vllm/vllm-openai \
+    $                   ip_of_head_node \
+    $                   --head \
+    $                   /path/to/the/huggingface/home/in/this/node

-To scale vLLM beyond a single machine, install and start a `Ray runtime <https://docs.ray.io/en/latest/ray-core/starting-ray.html>`_ via CLI before running vLLM:
+On the rest of the worker nodes, run the following command:

 .. code-block:: console

-    $ pip install ray
+    $ bash run_cluster.sh \
+    $                   vllm/vllm-openai \
+    $                   ip_of_head_node \
+    $                   --worker \
+    $                   /path/to/the/huggingface/home/in/this/node

-    $ # On head node
-    $ ray start --head
+Then you get a ray cluster of containers. Note that you need to keep the shells running these commands alive to hold the cluster. Any shell disconnect will terminate the cluster.

-    $ # On worker nodes
-    $ ray start --address=<ray-head-address>
+Then, on any node, use ``docker exec -it node /bin/bash`` to enter the container, and execute ``ray status`` to check the status of the Ray cluster. You should see the right number of nodes and GPUs.

-After that, you can run inference and serving on multiple machines by launching the vLLM process on the head node by setting :code:`tensor_parallel_size` multiplied by :code:`pipeline_parallel_size` to the number of GPUs to be the total number of GPUs across all machines.
+After that, on any node, you can use vLLM as usual, just as if you had all the GPUs on one node. The common practice is to set the tensor parallel size to the number of GPUs in each node, and the pipeline parallel size to the number of nodes. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2:
+
+.. code-block:: console
+
+    $ vllm serve /path/to/the/model/in/the/container \
+    $     --tensor-parallel-size 8 \
+    $     --pipeline-parallel-size 2
+
+You can also use tensor parallel without pipeline parallel; just set the tensor parallel size to the number of GPUs in the cluster. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 16:
+
+.. code-block:: console
+
+    $ vllm serve /path/to/the/model/in/the/container \
+    $     --tensor-parallel-size 16
+
+To make tensor parallel performant, you should make sure the communication between nodes is efficient, e.g. using high-speed network cards like Infiniband. To correctly set up the cluster to use Infiniband, append additional arguments like ``--privileged -e NCCL_IB_HCA=mlx5`` to the ``run_cluster.sh`` script. Please contact your system administrator for more information on how to set up the flags. One way to confirm if the Infiniband is working is to run vLLM with ``NCCL_DEBUG=TRACE`` environment variable set, e.g. ``NCCL_DEBUG=TRACE vllm serve ...`` and check the logs for the NCCL version and the network used. If you find ``[send] via NET/Socket`` in the logs, it means NCCL uses raw TCP Socket, which is not efficient for cross-node tensor parallel. If you find ``[send] via NET/IB/GDRDMA`` in the logs, it means NCCL uses Infiniband with GPU-Direct RDMA, which is efficient.
+
+.. warning::
+    After you start the Ray cluster, you should also check the GPU-GPU communication between nodes. It can be non-trivial to set up. Please refer to the `sanity check script <https://docs.vllm.ai/en/latest/getting_started/debugging.html>`_ for more information.

 .. warning::
-    Please make sure you downloaded the model to all the nodes, or the model is downloaded to some distributed file system that is accessible by all nodes.
+
+    Please make sure you downloaded the model to all the nodes (with the same path), or the model is downloaded to some distributed file system that is accessible by all nodes.
+
+    When you use huggingface repo id to refer to the model, you should append your huggingface token to the ``run_cluster.sh`` script, e.g. ``-e HF_TOKEN=``. The recommended way is to download the model first, and then use the path to refer to the model.
--- a/docs/source/serving/openai_compatible_server.md
+++ b/docs/source/serving/openai_compatible_server.md
@@ -4,7 +4,7 @@ vLLM provides an HTTP server that implements OpenAI's [Completions](https://plat

 You can start the server using Python, or using [Docker](deploying_with_docker.rst):
 ```bash
-python -m vllm.entrypoints.openai.api_server --model NousResearch/Meta-Llama-3-8B-Instruct --dtype auto --api-key token-abc123
+vllm serve NousResearch/Meta-Llama-3-8B-Instruct --dtype auto --api-key token-abc123
 ```

 To call the server, you can use the official OpenAI Python client library, or any other HTTP client.
@@ -97,9 +97,7 @@ template, or the template in string form. Without a chat template, the server wi
 and all chat requests will error.

 ```bash
-python -m vllm.entrypoints.openai.api_server \
-  --model ... \
-  --chat-template ./path-to-chat-template.jinja
+vllm serve <model> --chat-template ./path-to-chat-template.jinja
 ```

 vLLM community provides a set of chat templates for popular models. You can find them in the examples
@@ -110,7 +108,7 @@ directory [here](https://github.com/vllm-project/vllm/tree/main/examples/)
 ```{argparse}
 :module: vllm.entrypoints.openai.cli_args
 :func: create_parser_for_docs
-:prog: -m vllm.entrypoints.openai.api_server
+:prog: vllm serve
 ```

 ## Tool calling in the chat completion API