Merge tag 'v0.5.2' into v0.5.2-dtk24.04.1

705f6a35 · zhuwenwen · af837396 · 4cf256ae · 705f6a35 · 705f6a35
Commit 705f6a35 authored Jul 16, 2024 by zhuwenwen
20 changed files
--- a/.gitignore
+++ b/.gitignore
+# vllm commit id, generated by setup.py
+vllm/commit_id.py
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,7 +2,8 @@ cmake_minimum_required(VERSION 3.21)

 project(vllm_extensions LANGUAGES CXX)

-option(VLLM_TARGET_DEVICE "Target device backend for vLLM" "cuda")
+# CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
+set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM")

 message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
 message(STATUS "Target device: ${VLLM_TARGET_DEVICE}")
@@ -31,9 +32,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx11
 # requirements.txt files and should be kept consistent.  The ROCm torch
 # versions are derived from Dockerfile.rocm
 #
-set(TORCH_SUPPORTED_VERSION_CUDA "2.3.0")
-set(TORCH_SUPPORTED_VERSION_ROCM_5X "2.0.1")
-set(TORCH_SUPPORTED_VERSION_ROCM_6X "2.1.1")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.3.1")
+set(TORCH_SUPPORTED_VERSION_ROCM "2.4.0")

 #
 # Try to find python package with an executable that exactly matches
@@ -98,18 +98,11 @@ elseif(HIP_FOUND)
  # .hip extension automatically, HIP must be enabled explicitly.
  enable_language(HIP)

-  # ROCm 5.x
-  if (ROCM_VERSION_DEV_MAJOR EQUAL 5 AND
-      NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM_5X})
-    message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM_5X} "
-      "expected for ROCMm 5.x build, saw ${Torch_VERSION} instead.")
-  endif()
-
-  # ROCm 6.x
-  if (ROCM_VERSION_DEV_MAJOR EQUAL 6 AND
-      NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM_6X})
-    message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM_6X} "
-      "expected for ROCMm 6.x build, saw ${Torch_VERSION} instead.")
+  # ROCm 5.X and 6.X
+  if (ROCM_VERSION_DEV_MAJOR GREATER_EQUAL 5 AND
+      NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM})
+    message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM} "
+      "expected for ROCm build, saw ${Torch_VERSION} instead.")
  endif()
 else()
  message(FATAL_ERROR "Can't find CUDA or HIP installation.")
@@ -178,10 +171,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
    "csrc/quantization/gptq_marlin/gptq_marlin.cu"
    "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
+    "csrc/quantization/fp8/fp8_marlin.cu"
    "csrc/custom_all_reduce.cu"
-    "csrc/quantization/cutlass_w8a8/scaled_mm_dq_entry.cu"
-    "csrc/quantization/cutlass_w8a8/scaled_mm_dq_c2x.cu"
-    "csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu")
+    "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
+    "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu"
+    "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")

  #
  # The CUTLASS kernels for Hopper require sm90a to be enabled.
@@ -189,7 +183,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  # That adds an extra 17MB to compiled binary, so instead we selectively enable it.
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0)
    set_source_files_properties(
-          "csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu"
+          "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu"
          PROPERTIES
          COMPILE_FLAGS
          "-gencode arch=compute_90a,code=sm_90a")

--- a/Dockerfile
+++ b/Dockerfile
@@ -5,18 +5,35 @@
 # docs/source/dev/dockerfile/dockerfile.rst and
 # docs/source/assets/dev/dockerfile-stages-dependency.png

+ARG CUDA_VERSION=12.4.1
 #################### BASE BUILD IMAGE ####################
 # prepare basic build environment
-FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS dev
+FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS base
+
+ARG CUDA_VERSION=12.4.1
+ARG PYTHON_VERSION=3
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
+    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
+    && apt-get update -y \
+    && apt-get install -y ccache software-properties-common \
+    && add-apt-repository ppa:deadsnakes/ppa \
+    && apt-get update -y \
+    && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv python3-pip \
+    && if [ "${PYTHON_VERSION}" != "3" ]; then update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1; fi \
+    && python3 --version \
+    && python3 -m pip --version

 RUN apt-get update -y \
-    && apt-get install -y python3-pip git
+    && apt-get install -y python3-pip git curl sudo

 # Workaround for https://github.com/openai/triton/issues/2507 and
 # https://github.com/pytorch/pytorch/issues/107960 -- hopefully
 # this won't be needed for future versions of this docker image
 # or future versions of triton.
-RUN ldconfig /usr/local/cuda-12.4/compat/
+RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/

 WORKDIR /workspace

@@ -24,12 +41,11 @@ WORKDIR /workspace
 COPY requirements-common.txt requirements-common.txt
 COPY requirements-cuda.txt requirements-cuda.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install -r requirements-cuda.txt
+    python3 -m pip install -r requirements-cuda.txt

-# install development dependencies
-COPY requirements-dev.txt requirements-dev.txt
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install -r requirements-dev.txt
+COPY requirements-mamba.txt requirements-mamba.txt
+RUN python3 -m pip install packaging
+RUN python3 -m pip install -r requirements-mamba.txt

 # cuda arch list used by torch
 # can be useful for both `dev` and `test`
@@ -39,14 +55,16 @@ ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
 ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
 #################### BASE BUILD IMAGE ####################

-
 #################### WHEEL BUILD IMAGE ####################
-FROM dev AS build
+FROM base AS build
+
+ARG PYTHON_VERSION=3

 # install build dependencies
 COPY requirements-build.txt requirements-build.txt
+
 RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install -r requirements-build.txt
+    python3 -m pip install -r requirements-build.txt

 # install compiler cache to speed up compilation leveraging local or remote caching
 RUN apt-get update -y && apt-get install -y ccache
@@ -70,10 +88,32 @@ ENV NVCC_THREADS=$nvcc_threads
 # make sure punica kernels are built (for LoRA)
 ENV VLLM_INSTALL_PUNICA_KERNELS=1

+ARG buildkite_commit
+ENV BUILDKITE_COMMIT=${buildkite_commit}
+
+ARG USE_SCCACHE
+# if USE_SCCACHE is set, use sccache to speed up compilation
+RUN --mount=type=cache,target=/root/.cache/pip \
+    if [ "$USE_SCCACHE" = "1" ]; then \
+        echo "Installing sccache..." \
+        && curl -L -o sccache.tar.gz https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz \
+        && tar -xzf sccache.tar.gz \
+        && sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \
+        && rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
+        && export SCCACHE_BUCKET=vllm-build-sccache \
+        && export SCCACHE_REGION=us-west-2 \
+        && export CMAKE_BUILD_TYPE=Release \
+        && sccache --show-stats \
+        && python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \
+        && sccache --show-stats; \
+    fi
+
 ENV CCACHE_DIR=/root/.cache/ccache
 RUN --mount=type=cache,target=/root/.cache/ccache \
    --mount=type=cache,target=/root/.cache/pip \
-    python3 setup.py bdist_wheel --dist-dir=dist
+    if [ "$USE_SCCACHE" != "1" ]; then \
+        python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
+    fi

 # check the size of the wheel, we cannot upload wheels larger than 100MB
 COPY .buildkite/check-wheel-size.py check-wheel-size.py
@@ -81,9 +121,36 @@ RUN python3 check-wheel-size.py dist

 #################### EXTENSION Build IMAGE ####################

+#################### DEV IMAGE ####################
+FROM base as dev
+
+COPY requirements-lint.txt requirements-lint.txt
+COPY requirements-test.txt requirements-test.txt
+COPY requirements-dev.txt requirements-dev.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip install -r requirements-dev.txt
+
+#################### DEV IMAGE ####################
+#################### MAMBA Build IMAGE ####################
+FROM dev as mamba-builder
+# max jobs used for build
+ARG max_jobs=2
+ENV MAX_JOBS=${max_jobs}
+
+WORKDIR /usr/src/mamba
+
+COPY requirements-mamba.txt requirements-mamba.txt
+
+# Download the wheel or build it if a pre-compiled release doesn't exist
+RUN pip --verbose wheel -r requirements-mamba.txt \
+    --no-build-isolation --no-deps --no-cache-dir
+
+#################### MAMBA Build IMAGE ####################
+
 #################### vLLM installation IMAGE ####################
 # image with vLLM installed
-FROM nvidia/cuda:12.4.1-base-ubuntu22.04 AS vllm-base
+FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu22.04 AS vllm-base
+ARG CUDA_VERSION=12.4.1
 WORKDIR /vllm-workspace

 RUN apt-get update -y \
@@ -93,12 +160,19 @@ RUN apt-get update -y \
 # https://github.com/pytorch/pytorch/issues/107960 -- hopefully
 # this won't be needed for future versions of this docker image
 # or future versions of triton.
-RUN ldconfig /usr/local/cuda-12.4/compat/
+RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/

 # install vllm wheel first, so that torch etc will be installed
 RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
    --mount=type=cache,target=/root/.cache/pip \
-    pip install dist/*.whl --verbose
+    python3 -m pip install dist/*.whl --verbose
+
+RUN --mount=type=bind,from=mamba-builder,src=/usr/src/mamba,target=/usr/src/mamba \
+    --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip install /usr/src/mamba/*.whl --no-cache-dir
+
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.9/flashinfer-0.0.9+cu121torch2.3-cp310-cp310-linux_x86_64.whl
 #################### vLLM installation IMAGE ####################


@@ -111,7 +185,7 @@ ADD . /vllm-workspace/

 # install development dependencies (for testing)
 RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install -r requirements-dev.txt
+    python3 -m pip install -r requirements-dev.txt

 # doc requires source code
 # we hide them inside `test_docs/` , so that this source code
@@ -128,7 +202,7 @@ FROM vllm-base AS vllm-openai

 # install additional dependencies for openai api server
 RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install accelerate hf_transfer modelscope
+    pip install accelerate hf_transfer 'modelscope!=1.15.0'

 ENV VLLM_USAGE_SOURCE production-docker-image


--- a/Dockerfile.cpu
+++ b/Dockerfile.cpu
@@ -3,9 +3,19 @@
 FROM ubuntu:22.04 AS cpu-test-1

 RUN apt-get update  -y \
-    && apt-get install -y git wget vim numactl gcc-12 g++-12 python3 python3-pip \
+    && apt-get install -y git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 \
    && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12

+# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
+# intel-openmp provides additional performance improvement vs. openmp
+# tcmalloc provides better memory allocation efficiency, e.g, holding memory in caches to speed up access of commonly-used objects.
+RUN pip install intel-openmp
+
+ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so:$LD_PRELOAD"
+
+
+RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.3.100%2Bgit0eb3473-cp310-cp310-linux_x86_64.whl
+
 RUN pip install --upgrade pip \
    && pip install wheel packaging ninja "setuptools>=49.4.0" numpy

@@ -17,10 +27,14 @@ WORKDIR /workspace/vllm

 RUN pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu

+# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
+ARG VLLM_CPU_DISABLE_AVX512
+ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
+
 RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install

 WORKDIR /workspace/

-RUN ln -s /workspace/vllm/tests  && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
+RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks

-CMD ["/bin/bash"]
+ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
--- a/Dockerfile.openvino
+++ b/Dockerfile.openvino
+# The vLLM Dockerfile is used to construct vLLM image that can be directly used
+# to run the OpenAI compatible server.
+
+FROM ubuntu:22.04 AS dev
+
+RUN apt-get update -y && \
+    apt-get install -y python3-pip git
+WORKDIR /workspace
+
+# copy requirements
+COPY requirements-build.txt /workspace/vllm/
+COPY requirements-common.txt /workspace/vllm/
+COPY requirements-openvino.txt /workspace/vllm/
+
+COPY vllm/ /workspace/vllm/vllm
+COPY setup.py /workspace/vllm/
+
+# install build requirements
+RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/vllm/requirements-build.txt
+# build vLLM with OpenVINO backend
+RUN PIP_PRE=1 PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/nightly/" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace/vllm/
+
+COPY examples/ /workspace/vllm/examples
+COPY benchmarks/ /workspace/vllm/benchmarks
+
+CMD ["/bin/bash"]
--- a/Dockerfile.ppc64le
+++ b/Dockerfile.ppc64le
+FROM mambaorg/micromamba
+ARG MAMBA_DOCKERFILE_ACTIVATE=1
+USER root
+
+RUN apt-get update  -y     && apt-get install -y git wget vim numactl gcc-12 g++-12 protobuf-compiler libprotobuf-dev     && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
+
+# Some packages in requirements-cpu are installed here
+# IBM provides optimized packages for ppc64le processors in the open-ce project for mamba
+# Currently these may not be available for venv or pip directly
+RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults     python=3.10     pytorch-cpu=2.1.2     torchvision-cpu=0.16.2    &&     micromamba clean --all --yes
+
+COPY ./ /workspace/vllm
+
+WORKDIR /workspace/vllm
+
+# These packages will be in rocketce eventually
+RUN pip install -v -r requirements-cpu.txt --prefer-binary --extra-index-url https://repo.fury.io/mgiessing
+
+RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install
+
+WORKDIR /vllm-workspace
+ENTRYPOINT ["/opt/conda/bin/python3", "-m", "vllm.entrypoints.openai.api_server"]
--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
-# default base image
-ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1"
-
-FROM $BASE_IMAGE
-
-ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1"
-
-RUN echo "Base image is $BASE_IMAGE"
-
-# BASE_IMAGE for ROCm_5.7: "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1"
-# BASE_IMAGE for ROCm_6.0: "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1"
-
-
+# Default ROCm 6.1 base image
+ARG BASE_IMAGE="rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging"
+
+# Tested and supported base rocm/pytorch images
+ARG ROCm_5_7_BASE="rocm/pytorch:rocm5.7_ubuntu20.04_py3.9_pytorch_2.0.1" \
+    ROCm_6_0_BASE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" \
+    ROCM_6_1_BASE="rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging"
+
+# Default ROCm ARCHes to build vLLM for.
+ARG PYTORCH_ROCM_ARCH="gfx908;gfx90a;gfx942;gfx1100"
+
+# Whether to build CK-based flash-attention
+# If 0, will not build flash attention
+# This is useful for gfx target where flash-attention is not supported
+# (i.e. those that do not appear in `FA_GFX_ARCHS`)
+# Triton FA is used by default on ROCm now so this is unnecessary.
+ARG BUILD_FA="1"
 ARG FA_GFX_ARCHS="gfx90a;gfx942"
-RUN echo "FA_GFX_ARCHS is $FA_GFX_ARCHS"
-
 ARG FA_BRANCH="ae7928c"
-RUN echo "FA_BRANCH is $FA_BRANCH"

-# whether to build flash-attention
-# if 0, will not build flash attention
-# this is useful for gfx target where flash-attention is not supported
-# In that case, we need to use the python reference attention implementation in vllm
-ARG BUILD_FA="1"
-
-# whether to build triton on rocm
+# Whether to build triton on rocm
 ARG BUILD_TRITON="1"
+ARG TRITON_BRANCH="0ef1848"

-# Install some basic utilities
-RUN apt-get update && apt-get install python3 python3-pip -y
+### Base image build stage
+FROM $BASE_IMAGE AS base
+
+# Import arg(s) defined before this build stage
+ARG PYTORCH_ROCM_ARCH

 # Install some basic utilities
+RUN apt-get update && apt-get install python3 python3-pip -y
 RUN apt-get update && apt-get install -y \
    curl \
    ca-certificates \
@@ -40,76 +40,165 @@ RUN apt-get update && apt-get install -y \
    build-essential \
    wget \
    unzip \
-    nvidia-cuda-toolkit \
    tmux \
+    ccache \
 && rm -rf /var/lib/apt/lists/*

-### Mount Point ###
-# When launching the container, mount the code directory to /app
+# When launching the container, mount the code directory to /vllm-workspace
 ARG APP_MOUNT=/vllm-workspace
-VOLUME [ ${APP_MOUNT} ]
 WORKDIR ${APP_MOUNT}

-RUN python3 -m pip install --upgrade pip
-RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
+RUN pip install --upgrade pip
+# Remove sccache so it doesn't interfere with ccache
+# TODO: implement sccache support across components
+RUN apt-get purge -y sccache; pip uninstall -y sccache; rm -f "$(which sccache)"
+# Install torch == 2.5.0 on ROCm
+RUN case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
+        *"rocm-5.7"*) \
+            pip uninstall -y torch torchaudio torchvision \
+            && pip install --no-cache-dir --pre \
+                torch==2.5.0.dev20240710 torchaudio==2.4.0.dev20240710 \
+                torchvision==0.20.0.dev20240710 \
+               --index-url https://download.pytorch.org/whl/nightly/rocm5.7;; \
+        *"rocm-6.0"*) \
+            pip uninstall -y torch torchaudio torchvision \
+            && pip install --no-cache-dir --pre \
+                torch==2.5.0.dev20240710 torchaudio==2.4.0.dev20240710 \
+                torchvision==0.20.0.dev20240710 \
+               --index-url https://download.pytorch.org/whl/nightly/rocm6.0;; \
+        *"rocm-6.1"*) \
+            pip uninstall -y torch torchaudio torchvision \
+            && pip install --no-cache-dir --pre \
+                torch==2.5.0.dev20240710 torchaudio==2.4.0.dev20240710 \
+                torchvision==0.20.0.dev20240710 \
+               --index-url https://download.pytorch.org/whl/nightly/rocm6.1;; \
+        *) ;; esac

 ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer
 ENV PATH=$PATH:/opt/rocm/bin:/libtorch/bin:
 ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/:/libtorch/lib:
 ENV CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/libtorch/include:/libtorch/include/torch/csrc/api/include/:/opt/rocm/include/:

-# Install ROCm flash-attention
-RUN if [ "$BUILD_FA" = "1" ]; then \
-    mkdir libs \
+ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
+ENV CCACHE_DIR=/root/.cache/ccache
+
+
+### AMD-SMI build stage
+FROM base AS build_amdsmi
+# Build amdsmi wheel always
+RUN cd /opt/rocm/share/amd_smi \
+    && pip wheel . --wheel-dir=/install
+
+
+### Flash-Attention wheel build stage
+FROM base AS build_fa
+ARG BUILD_FA
+ARG FA_GFX_ARCHS
+ARG FA_BRANCH
+# Build ROCm flash-attention wheel if `BUILD_FA = 1`
+RUN --mount=type=cache,target=${CCACHE_DIR} \
+    if [ "$BUILD_FA" = "1" ]; then \
+    mkdir -p libs \
    && cd libs \
    && git clone https://github.com/ROCm/flash-attention.git \
    && cd flash-attention \
-    && git checkout ${FA_BRANCH} \
+    && git checkout "${FA_BRANCH}" \
    && git submodule update --init \
-    && export GPU_ARCHS=${FA_GFX_ARCHS} \
-    && if [ "$BASE_IMAGE" = "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" ]; then \
-        patch /opt/conda/envs/py_3.10/lib/python3.10/site-packages/torch/utils/hipify/hipify_python.py hipify_patch.patch; fi \
-    && python3 setup.py install \
-    && cd ..; \
+    && case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
+        *"rocm-5.7"*) \
+            export VLLM_TORCH_PATH="$(python3 -c 'import torch; print(torch.__path__[0])')" \
+            && patch "${VLLM_TORCH_PATH}"/utils/hipify/hipify_python.py hipify_patch.patch;; \
+        *) ;; esac \
+    && GPU_ARCHS="${FA_GFX_ARCHS}" python3 setup.py bdist_wheel --dist-dir=/install; \
+    # Create an empty directory otherwise as later build stages expect one
+    else mkdir -p /install; \
    fi

-# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt.
-# Manually removed it so that later steps of numpy upgrade can continue
-RUN if [ "$BASE_IMAGE" = "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" ]; then \
-    rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/; fi

-# build triton
-RUN if [ "$BUILD_TRITON" = "1" ]; then \
+### Triton wheel build stage
+FROM base AS build_triton
+ARG BUILD_TRITON
+ARG TRITON_BRANCH
+# Build triton wheel if `BUILD_TRITON = 1`
+RUN --mount=type=cache,target=${CCACHE_DIR} \
+    if [ "$BUILD_TRITON" = "1" ]; then \
    mkdir -p libs \
    && cd libs \
-    && pip uninstall -y triton \
-    && git clone https://github.com/ROCm/triton.git \
-    && cd triton/python \
-    && pip3 install . \
-    && cd ../..; \
+    && git clone https://github.com/OpenAI/triton.git \
+    && cd triton \
+    && git checkout "${TRITON_BRANCH}" \
+    && cd python \
+    && python3 setup.py bdist_wheel --dist-dir=/install; \
+    # Create an empty directory otherwise as later build stages expect one
+    else mkdir -p /install; \
    fi

-WORKDIR /vllm-workspace
+
+### Final vLLM build stage
+FROM base AS final
+# Import the vLLM development directory from the build context
 COPY . .

-#RUN python3 -m pip install pynvml # to be removed eventually
-RUN python3 -m pip install --upgrade pip numba
+# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt.
+# Manually remove it so that later steps of numpy upgrade can continue
+RUN case "$(which python3)" in \
+        *"/opt/conda/envs/py_3.9"*) \
+            rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/;; \
+        *) ;; esac
+
+# Package upgrades for useful functionality or to avoid dependency issues
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install --upgrade numba scipy huggingface-hub[cli]

-# make sure punica kernels are built (for LoRA)
+# Make sure punica kernels are built (for LoRA)
 ENV VLLM_INSTALL_PUNICA_KERNELS=1
 # Workaround for ray >= 2.10.0
 ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
+# Silences the HF Tokenizers warning
+ENV TOKENIZERS_PARALLELISM=false

-ENV VLLM_NCCL_SO_PATH=/opt/rocm/lib/librccl.so
-
-RUN --mount=type=cache,target=/root/.cache/pip \
+RUN --mount=type=cache,target=${CCACHE_DIR} \
+    --mount=type=cache,target=/root/.cache/pip \
    pip install -U -r requirements-rocm.txt \
-    && patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h ./rocm_patch/rocm_bf16.patch \
-    && python3 setup.py install \
-    && cp build/lib.linux-x86_64-cpython-39/vllm/_C.abi3.so vllm/ \
-    && cp build/lib.linux-x86_64-cpython-39/vllm/_punica_C.abi3.so vllm/ \
-    && cp build/lib.linux-x86_64-cpython-39/vllm/_moe_C.abi3.so vllm/ \
-    && cd ..
+    && case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
+        *"rocm-6.0"*) \
+            patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h rocm_patch/rocm_bf16.patch;; \
+        *"rocm-6.1"*) \
+            # Bring in upgrades to HIP graph earlier than ROCm 6.2 for vLLM
+            wget -N https://github.com/ROCm/vllm/raw/fa78403/rocm_patch/libamdhip64.so.6 -P rocm_patch \
+            && cp rocm_patch/libamdhip64.so.6 /opt/rocm/lib/libamdhip64.so.6 \
+            # Prevent interference if torch bundles its own HIP runtime
+            && rm -f "$(python3 -c 'import torch; print(torch.__path__[0])')"/lib/libamdhip64.so* || true;; \
+        *) ;; esac \
+    && python3 setup.py clean --all \
+    && python3 setup.py develop
+
+# Copy amdsmi wheel into final image
+RUN --mount=type=bind,from=build_amdsmi,src=/install,target=/install \
+    mkdir -p libs \
+    && cp /install/*.whl libs \
+    # Preemptively uninstall to avoid same-version no-installs
+    && pip uninstall -y amdsmi;
+
+# Copy triton wheel(s) into final image if they were built
+RUN --mount=type=bind,from=build_triton,src=/install,target=/install \
+    mkdir -p libs \
+    && if ls /install/*.whl; then \
+        cp /install/*.whl libs \
+        # Preemptively uninstall to avoid same-version no-installs
+        && pip uninstall -y triton; fi
+
+# Copy flash-attn wheel(s) into final image if they were built
+RUN --mount=type=bind,from=build_fa,src=/install,target=/install \
+    mkdir -p libs \
+    && if ls /install/*.whl; then \
+        cp /install/*.whl libs \
+        # Preemptively uninstall to avoid same-version no-installs
+        && pip uninstall -y flash-attn; fi

+# Install wheels that were built to the final image
+RUN --mount=type=cache,target=/root/.cache/pip \
+    if ls libs/*.whl; then \
+    pip install libs/*.whl; fi

 CMD ["/bin/bash"]
--- a/Dockerfile.tpu
+++ b/Dockerfile.tpu
+ARG NIGHTLY_DATE="20240601"
+ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE"
+
+FROM $BASE_IMAGE
+WORKDIR /workspace
+
+# Install aiohttp separately to avoid build errors.
+RUN pip install aiohttp
+# Install the TPU and Pallas dependencies.
+RUN pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
+RUN pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
+
+# Build vLLM.
+COPY . /workspace/vllm
+ENV VLLM_TARGET_DEVICE="tpu"
+RUN cd /workspace/vllm && python setup.py develop
+
+# Re-install outlines to avoid dependency errors.
+# The outlines version must follow requirements-common.txt.
+RUN pip uninstall outlines -y
+RUN pip install "outlines>=0.0.43"
+
+CMD ["/bin/bash"]
--- a/Dockerfile.xpu
+++ b/Dockerfile.xpu
+FROM intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04
+
+RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
+    echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
+    chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
+    rm /etc/apt/sources.list.d/intel-graphics.list && \
+    wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
+    echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
+    chmod 644 /usr/share/keyrings/intel-graphics.gpg
+
+RUN apt-get update  -y \
+&& apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip
+
+COPY ./ /workspace/vllm
+
+WORKDIR /workspace/vllm
+
+RUN pip install -v -r requirements-xpu.txt
+
+RUN VLLM_TARGET_DEVICE=xpu python3 setup.py install
+
+CMD ["/bin/bash"]
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@ vLLM是一个快速且易于使用的LLM推理和服务库，使用PageAttention

 ## 暂不支持的官方功能
 - **量化推理**：目前支持fp16的推理和gptq推理，awq-int4和mralin的权重量化、kv-cache fp8推理方案暂不支持
- **模块支持**：目前不支持Sliding window attention、 moe kernel和lora模块
+- **模块支持**：目前不支持Sliding window attention、 moe kernel模块


 ## 支持模型结构列表
@@ -74,7 +74,7 @@ VLLM_INSTALL_PUNICA_KERNELS=1 python3 setup.py install
 + 若使用 pip install 下载安装过慢，可添加源：-i https://pypi.tuna.tsinghua.edu.cn/simple/

 ## 验证
- python -c "import vllm; print(vllm.\_\_version__)"，版本号与官方版本同步，查询该软件的版本号，例如0.5.0；
+- python -c "import vllm; print(vllm.\_\_version__)"，版本号与官方版本同步，查询该软件的版本号，例如0.5.2；

 ## Known Issue
 - 无

--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -4,10 +4,13 @@ import sys
 import time
 import traceback
 from dataclasses import dataclass, field
-from typing import List, Optional
+from typing import List, Optional, Union

 import aiohttp
+import huggingface_hub.constants
 from tqdm.asyncio import tqdm
+from transformers import (AutoTokenizer, PreTrainedTokenizer,
+                          PreTrainedTokenizerFast)

 AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)

@@ -68,9 +71,13 @@ async def async_request_tgi(
                        chunk_bytes = chunk_bytes.strip()
                        if not chunk_bytes:
                            continue
+                        chunk_bytes = chunk_bytes.decode("utf-8")

-                        chunk = remove_prefix(chunk_bytes.decode("utf-8"),
-                                              "data:")
+                        #NOTE: Sometimes TGI returns a ping response without
+                        # any data, we should skip it.
+                        if chunk_bytes.startswith(":"):
+                            continue
+                        chunk = remove_prefix(chunk_bytes, "data:")

                        data = json.loads(chunk)
                        timestamp = time.perf_counter()
@@ -218,8 +225,8 @@ async def async_request_openai_completions(
 ) -> RequestFuncOutput:
    api_url = request_func_input.api_url
    assert api_url.endswith(
-        "v1/completions"
-    ), "OpenAI Completions API URL must end with 'v1/completions'."
+        "completions"
+    ), "OpenAI Completions API URL must end with 'completions'."

    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
        assert not request_func_input.use_beam_search
@@ -258,6 +265,9 @@ async def async_request_openai_completions(
                        else:
                            data = json.loads(chunk)

+                            # NOTE: Some completion API might have a last
+                            # usage summary response without a token so we
+                            # want to check a token was generated
                            if data["choices"][0]["text"]:
                                timestamp = time.perf_counter()
                                # First token
@@ -266,12 +276,8 @@ async def async_request_openai_completions(
                                    output.ttft = ttft

                                # Decoding phase
-                                # NOTE: Some completion API might have a last
-                                # usage summary response without a token so we
-                                # do not want to include as inter-token-latency
-                                elif data.get("usage", None) is None:
-                                    output.itl.append(timestamp -
-                                                      most_recent_timestamp)
+                                output.itl.append(timestamp -
+                                                  most_recent_timestamp)

                                most_recent_timestamp = timestamp
                                generated_text += data["choices"][0]["text"]
@@ -298,8 +304,8 @@ async def async_request_openai_chat_completions(
 ) -> RequestFuncOutput:
    api_url = request_func_input.api_url
    assert api_url.endswith(
-        "v1/chat/completions"
-    ), "OpenAI Chat Completions API URL must end with 'v1/chat/completions'."
+        "chat/completions"
+    ), "OpenAI Chat Completions API URL must end with 'chat/completions'."

    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
        assert not request_func_input.use_beam_search
@@ -384,6 +390,30 @@ def remove_prefix(text: str, prefix: str) -> str:
    return text


+def get_model(pretrained_model_name_or_path: str) -> str:
+    if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
+        from modelscope import snapshot_download
+
+        model_path = snapshot_download(
+            model_id=pretrained_model_name_or_path,
+            local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
+            ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])
+
+        return model_path
+    return pretrained_model_name_or_path
+
+
+def get_tokenizer(
+    pretrained_model_name_or_path: str, trust_remote_code: bool
+) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
+    if pretrained_model_name_or_path is not None and not os.path.exists(
+            pretrained_model_name_or_path):
+        pretrained_model_name_or_path = get_model(
+            pretrained_model_name_or_path)
+    return AutoTokenizer.from_pretrained(pretrained_model_name_or_path,
+                                         trust_remote_code=trust_remote_code)
+
+
 ASYNC_REQUEST_FUNCS = {
    "tgi": async_request_tgi,
    "vllm": async_request_openai_completions,
@@ -392,4 +422,5 @@ ASYNC_REQUEST_FUNCS = {
    "openai": async_request_openai_completions,
    "openai-chat": async_request_openai_chat_completions,
    "tensorrt-llm": async_request_trt_llm,
+    "scalellm": async_request_openai_completions,
 }
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -10,8 +10,10 @@ import torch
 from tqdm import tqdm

 from vllm import LLM, SamplingParams
+from vllm.engine.arg_utils import EngineArgs
 from vllm.inputs import PromptStrictInputs
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+from vllm.utils import FlexibleArgumentParser


 def main(args: argparse.Namespace):
@@ -19,25 +21,33 @@ def main(args: argparse.Namespace):

    # NOTE(woosuk): If the request cannot be processed in a single batch,
    # the engine will automatically process the request in multiple batches.
-    llm = LLM(model=args.model,
-              speculative_model=args.speculative_model,
-              num_speculative_tokens=args.num_speculative_tokens,
-              tokenizer=args.tokenizer,
-              quantization=args.quantization,
-              tensor_parallel_size=args.tensor_parallel_size,
-              trust_remote_code=args.trust_remote_code,
-              dtype=args.dtype,
-              enforce_eager=args.enforce_eager,
-              kv_cache_dtype=args.kv_cache_dtype,
-              quantization_param_path=args.quantization_param_path,
-              device=args.device,
-              ray_workers_use_nsight=args.ray_workers_use_nsight,
-              use_v2_block_manager=args.use_v2_block_manager,
-              enable_chunked_prefill=args.enable_chunked_prefill,
-              download_dir=args.download_dir,
-              block_size=args.block_size,
-              gpu_memory_utilization=args.gpu_memory_utilization,
-              distributed_executor_backend=args.distributed_executor_backend)
+    llm = LLM(
+        model=args.model,
+        speculative_model=args.speculative_model,
+        num_speculative_tokens=args.num_speculative_tokens,
+        speculative_draft_tensor_parallel_size=\
+            args.speculative_draft_tensor_parallel_size,
+        tokenizer=args.tokenizer,
+        quantization=args.quantization,
+        tensor_parallel_size=args.tensor_parallel_size,
+        trust_remote_code=args.trust_remote_code,
+        dtype=args.dtype,
+        max_model_len=args.max_model_len,
+        enforce_eager=args.enforce_eager,
+        kv_cache_dtype=args.kv_cache_dtype,
+        quantization_param_path=args.quantization_param_path,
+        device=args.device,
+        ray_workers_use_nsight=args.ray_workers_use_nsight,
+        use_v2_block_manager=args.use_v2_block_manager,
+        enable_chunked_prefill=args.enable_chunked_prefill,
+        download_dir=args.download_dir,
+        block_size=args.block_size,
+        gpu_memory_utilization=args.gpu_memory_utilization,
+        load_format=args.load_format,
+        distributed_executor_backend=args.distributed_executor_backend,
+        otlp_traces_endpoint=args.otlp_traces_endpoint,
+        enable_prefix_caching=args.enable_prefix_caching,
+    )

    sampling_params = SamplingParams(
        n=args.n,
@@ -96,7 +106,7 @@ def main(args: argparse.Namespace):
    for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
        latencies.append(run_to_completion(profile_dir=None))
    latencies = np.array(latencies)
-    percentages = [10, 25, 50, 75, 90]
+    percentages = [10, 25, 50, 75, 90, 99]
    percentiles = np.percentile(latencies, percentages)
    print(f'Avg latency: {np.mean(latencies)} seconds')
    for percentage, percentile in zip(percentages, percentiles):
@@ -114,12 +124,16 @@ def main(args: argparse.Namespace):


 if __name__ == '__main__':
-    parser = argparse.ArgumentParser(
+    parser = FlexibleArgumentParser(
        description='Benchmark the latency of processing a single batch of '
        'requests till completion.')
    parser.add_argument('--model', type=str, default='facebook/opt-125m')
    parser.add_argument('--speculative-model', type=str, default=None)
    parser.add_argument('--num-speculative-tokens', type=int, default=None)
+    parser.add_argument('--speculative-draft-tensor-parallel-size',
+                        '-spec-draft-tp',
+                        type=int,
+                        default=None)
    parser.add_argument('--tokenizer', type=str, default=None)
    parser.add_argument('--quantization',
                        '-q',
@@ -145,6 +159,12 @@ if __name__ == '__main__':
    parser.add_argument('--trust-remote-code',
                        action='store_true',
                        help='trust remote code from huggingface')
+    parser.add_argument(
+        '--max-model-len',
+        type=int,
+        default=None,
+        help='Maximum length of a sequence (including prompt and output). '
+        'If None, will be derived from the model.')
    parser.add_argument(
        '--dtype',
        type=str,
@@ -188,9 +208,10 @@ if __name__ == '__main__':
    parser.add_argument(
        "--device",
        type=str,
-        default="cuda",
-        choices=["cuda", "cpu"],
-        help='device type for vLLM execution, supporting CUDA and CPU.')
+        default="auto",
+        choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"],
+        help='device type for vLLM execution, supporting CUDA, OpenVINO and '
+        'CPU.')
    parser.add_argument('--block-size',
                        type=int,
                        default=16,
@@ -200,6 +221,9 @@ if __name__ == '__main__':
        action='store_true',
        help='If True, the prefill requests can be chunked based on the '
        'max_num_batched_tokens')
+    parser.add_argument("--enable-prefix-caching",
+                        action='store_true',
+                        help="Enable automatic prefix caching")
    parser.add_argument('--use-v2-block-manager', action='store_true')
    parser.add_argument(
        "--ray-workers-use-nsight",
@@ -222,6 +246,29 @@ if __name__ == '__main__':
                        help='the fraction of GPU memory to be used for '
                        'the model executor, which can range from 0 to 1.'
                        'If unspecified, will use the default value of 0.9.')
+    parser.add_argument(
+        '--load-format',
+        type=str,
+        default=EngineArgs.load_format,
+        choices=[
+            'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer',
+            'bitsandbytes'
+        ],
+        help='The format of the model weights to load.\n\n'
+        '* "auto" will try to load the weights in the safetensors format '
+        'and fall back to the pytorch bin format if safetensors format '
+        'is not available.\n'
+        '* "pt" will load the weights in the pytorch bin format.\n'
+        '* "safetensors" will load the weights in the safetensors format.\n'
+        '* "npcache" will load the weights in pytorch format and store '
+        'a numpy cache to speed up the loading.\n'
+        '* "dummy" will initialize the weights with random values, '
+        'which is mainly for profiling.\n'
+        '* "tensorizer" will load the weights using tensorizer from '
+        'CoreWeave. See the Tensorize vLLM Model script in the Examples'
+        'section for more information.\n'
+        '* "bitsandbytes" will load the weights using bitsandbytes '
+        'quantization.\n')
    parser.add_argument(
        '--distributed-executor-backend',
        choices=['ray', 'mp'],
@@ -229,5 +276,10 @@ if __name__ == '__main__':
        help='Backend to use for distributed serving. When more than 1 GPU '
        'is used, will be automatically set to "ray" if installed '
        'or "mp" (multiprocessing) otherwise.')
+    parser.add_argument(
+        '--otlp-traces-endpoint',
+        type=str,
+        default=None,
+        help='Target URL to which OpenTelemetry traces will be sent.')
    args = parser.parse_args()
    main(args)
--- a/benchmarks/benchmark_prefix_caching.py
+++ b/benchmarks/benchmark_prefix_caching.py
-import argparse
 import time

 from vllm import LLM, SamplingParams
+from vllm.utils import FlexibleArgumentParser

 PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as fellows. You need to answer my question about the table.\n# Table\n|Opening|Opening|Sl. No.|Film|Cast|Director|Music Director|Notes|\n|----|----|----|----|----|----|----|----|\n|J A N|9|1|Agni Pushpam|Jayabharathi, Kamalahasan|Jeassy|M. K. Arjunan||\n|J A N|16|2|Priyamvada|Mohan Sharma, Lakshmi, KPAC Lalitha|K. S. Sethumadhavan|V. Dakshinamoorthy||\n|J A N|23|3|Yakshagaanam|Madhu, Sheela|Sheela|M. S. Viswanathan||\n|J A N|30|4|Paalkkadal|Sheela, Sharada|T. K. Prasad|A. T. Ummer||\n|F E B|5|5|Amma|Madhu, Srividya|M. Krishnan Nair|M. K. Arjunan||\n|F E B|13|6|Appooppan|Thikkurissi Sukumaran Nair, Kamal Haasan|P. Bhaskaran|M. S. Baburaj||\n|F E B|20|7|Srishti|Chowalloor Krishnankutty, Ravi Alummoodu|K. T. Muhammad|M. S. Baburaj||\n|F E B|20|8|Vanadevatha|Prem Nazir, Madhubala|Yusufali Kechery|G. Devarajan||\n|F E B|27|9|Samasya|Madhu, Kamalahaasan|K. Thankappan|Shyam||\n|F E B|27|10|Yudhabhoomi|K. P. Ummer, Vidhubala|Crossbelt Mani|R. K. Shekhar||\n|M A R|5|11|Seemantha Puthran|Prem Nazir, Jayabharathi|A. B. Raj|M. K. Arjunan||\n|M A R|12|12|Swapnadanam|Rani Chandra, Dr. Mohandas|K. G. George|Bhaskar Chandavarkar||\n|M A R|19|13|Thulavarsham|Prem Nazir, sreedevi, Sudheer|N. Sankaran Nair|V. Dakshinamoorthy||\n|M A R|20|14|Aruthu|Kaviyoor Ponnamma, Kamalahasan|Ravi|G. Devarajan||\n|M A R|26|15|Swimming Pool|Kamal Haasan, M. G. Soman|J. Sasikumar|M. K. Arjunan||\n\n# Question\nWhat' s the content in the (1,1) cells\n"  # noqa: E501

@@ -44,7 +44,7 @@ def main(args):


 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
+    parser = FlexibleArgumentParser(
        description='Benchmark the performance with or without automatic '
        'prefix caching.')
    parser.add_argument('--model',

--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -2,8 +2,8 @@

 On the server side, run one of the following commands:
    vLLM OpenAI API server
-    python -m vllm.entrypoints.openai.api_server \
-        --model <your_model> --swap-space 16 \
+    vllm serve <your_model> \
+        --swap-space 16 \
        --disable-log-requests

    (TGI backend)
@@ -17,7 +17,7 @@ On the client side, run:
        --dataset-path <path to dataset> \
        --request-rate <request_rate> \ # By default <request_rate> is inf
        --num-prompts <num_prompts> # By default <num_prompts> is 1000
-        
+
    when using tgi backend, add
        --endpoint /generate_stream
    to the end of the command above.
@@ -31,7 +31,7 @@ import time
 import warnings
 from dataclasses import dataclass
 from datetime import datetime
-from typing import AsyncGenerator, List, Optional, Tuple
+from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple

 import numpy as np
 from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
@@ -39,7 +39,15 @@ from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
 from tqdm.asyncio import tqdm
 from transformers import PreTrainedTokenizerBase

-from vllm.transformers_utils.tokenizer import get_tokenizer
+try:
+    from vllm.transformers_utils.tokenizer import get_tokenizer
+except ImportError:
+    from backend_request_func import get_tokenizer
+
+try:
+    from vllm.utils import FlexibleArgumentParser
+except ImportError:
+    from argparse import ArgumentParser as FlexibleArgumentParser


 @dataclass
@@ -52,12 +60,15 @@ class BenchmarkMetrics:
    output_throughput: float
    mean_ttft_ms: float
    median_ttft_ms: float
+    std_ttft_ms: float
    p99_ttft_ms: float
    mean_tpot_ms: float
    median_tpot_ms: float
+    std_tpot_ms: float
    p99_tpot_ms: float
    mean_itl_ms: float
    median_itl_ms: float
+    std_itl_ms: float
    p99_itl_ms: float


@@ -69,7 +80,6 @@ def sample_sharegpt_requests(
 ) -> List[Tuple[str, int, int]]:
    if fixed_output_len is not None and fixed_output_len < 4:
        raise ValueError("output_len too small")
-
    # Load the dataset.
    with open(dataset_path) as f:
        dataset = json.load(f)
@@ -177,6 +187,31 @@ def sample_sonnet_requests(
    return sampled_requests


+def sample_random_requests(
+        input_len: int, output_len: int, num_prompts: int, range_ratio: float,
+        tokenizer: PreTrainedTokenizerBase) -> List[Tuple[str, int, int]]:
+
+    input_lens = np.random.randint(
+        int(input_len * range_ratio),
+        input_len + 1,
+        size=num_prompts,
+    )
+    output_lens = np.random.randint(
+        int(output_len * range_ratio),
+        output_len + 1,
+        size=num_prompts,
+    )
+    offsets = np.random.randint(0, tokenizer.vocab_size, size=num_prompts)
+    input_requests = []
+    for i in range(num_prompts):
+        prompt = tokenizer.decode([(offsets[i] + i + j) % tokenizer.vocab_size
+                                   for j in range(input_lens[i])])
+        input_requests.append(
+            (prompt, int(input_lens[i]), int(output_lens[i])))
+
+    return input_requests
+
+
 async def get_request(
    input_requests: List[Tuple[str, int, int]],
    request_rate: float,
@@ -188,6 +223,7 @@ async def get_request(
        if request_rate == float("inf"):
            # If the request rate is infinity, then we don't need to wait.
            continue
+
        # Sample the request interval from the exponential distribution.
        interval = np.random.exponential(1.0 / request_rate)
        # The next request will be sent after the interval.
@@ -200,18 +236,18 @@ def calculate_metrics(
    dur_s: float,
    tokenizer: PreTrainedTokenizerBase,
 ) -> Tuple[BenchmarkMetrics, List[int]]:
-    actual_output_lens = []
+    actual_output_lens: List[int] = []
    total_input = 0
    completed = 0
-    itls = []
-    tpots = []
-    ttfts = []
+    itls: List[float] = []
+    tpots: List[float] = []
+    ttfts: List[float] = []
    for i in range(len(outputs)):
        if outputs[i].success:
            # We use the tokenizer to count the number of output tokens for all
            # serving backends instead of looking at len(outputs[i].itl) since
            # multiple output tokens may be bundled together
-            # Note: this may inflate the output token count slightly
+            # Note : this may inflate the output token count slightly
            output_len = len(
                tokenizer(outputs[i].generated_text,
                          add_special_tokens=False).input_ids)
@@ -241,12 +277,15 @@ def calculate_metrics(
        mean_ttft_ms=np.mean(ttfts or 0) *
        1000,  # ttfts is empty if streaming is not supported by backend
        median_ttft_ms=np.median(ttfts or 0) * 1000,
+        std_ttft_ms=np.std(ttfts or 0) * 1000,
        p99_ttft_ms=np.percentile(ttfts or 0, 99) * 1000,
        mean_tpot_ms=np.mean(tpots or 0) * 1000,
        median_tpot_ms=np.median(tpots or 0) * 1000,
+        std_tpot_ms=np.std(tpots or 0) * 1000,
        p99_tpot_ms=np.percentile(tpots or 0, 99) * 1000,
        mean_itl_ms=np.mean(itls or 0) * 1000,
        median_itl_ms=np.median(itls or 0) * 1000,
+        std_itl_ms=np.std(itls or 0) * 1000,
        p99_itl_ms=np.percentile(itls or 0, 99) * 1000,
    )

@@ -265,7 +304,7 @@ async def benchmark(
    disable_tqdm: bool,
 ):
    if backend in ASYNC_REQUEST_FUNCS:
-        request_func = ASYNC_REQUEST_FUNCS.get(backend)
+        request_func = ASYNC_REQUEST_FUNCS[backend]
    else:
        raise ValueError(f"Unknown backend: {backend}")

@@ -292,7 +331,7 @@ async def benchmark(
    pbar = None if disable_tqdm else tqdm(total=len(input_requests))

    benchmark_start_time = time.perf_counter()
-    tasks = []
+    tasks: List[asyncio.Task] = []
    async for request in get_request(input_requests, request_rate):
        prompt, prompt_len, output_len = request
        request_func_input = RequestFuncInput(
@@ -310,7 +349,7 @@ async def benchmark(
                             pbar=pbar)))
    outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)

-    if not disable_tqdm:
+    if pbar is not None:
        pbar.close()

    benchmark_duration = time.perf_counter() - benchmark_start_time
@@ -363,12 +402,15 @@ async def benchmark(
        "output_throughput": metrics.output_throughput,
        "mean_ttft_ms": metrics.mean_ttft_ms,
        "median_ttft_ms": metrics.median_ttft_ms,
+        "std_ttft_ms": metrics.std_ttft_ms,
        "p99_ttft_ms": metrics.p99_ttft_ms,
        "mean_tpot_ms": metrics.mean_tpot_ms,
        "median_tpot_ms": metrics.median_tpot_ms,
+        "std_tpot_ms": metrics.std_tpot_ms,
        "p99_tpot_ms": metrics.p99_tpot_ms,
        "mean_itl_ms": metrics.mean_itl_ms,
        "median_itl_ms": metrics.median_itl_ms,
+        "std_itl_ms": metrics.std_itl_ms,
        "p99_itl_ms": metrics.p99_itl_ms,
        "input_lens": [output.prompt_len for output in outputs],
        "output_lens": actual_output_lens,
@@ -448,6 +490,15 @@ def main(args: argparse.Namespace):
                              for prompt, prompt_formatted, prompt_len,
                              output_len in input_requests]

+    elif args.dataset_name == "random":
+        input_requests = sample_random_requests(
+            input_len=args.random_input_len,
+            output_len=args.random_output_len,
+            num_prompts=args.num_prompts,
+            range_ratio=args.random_range_ratio,
+            tokenizer=tokenizer,
+        )
+
    else:
        raise ValueError(f"Unknown dataset: {args.dataset_name}")

@@ -466,7 +517,7 @@ def main(args: argparse.Namespace):

    # Save config and results to json
    if args.save_result:
-        result_json = {}
+        result_json: Dict[str, Any] = {}

        # Setup
        current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
@@ -499,6 +550,8 @@ def main(args: argparse.Namespace):
        # Save to file
        base_model_id = model_id.split("/")[-1]
        file_name = f"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"  #noqa
+        if args.result_filename:
+            file_name = args.result_filename
        if args.result_dir:
            file_name = os.path.join(args.result_dir, file_name)
        with open(file_name, "w") as outfile:
@@ -506,7 +559,7 @@ def main(args: argparse.Namespace):


 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
+    parser = FlexibleArgumentParser(
        description="Benchmark the online serving throughput.")
    parser.add_argument(
        "--backend",
@@ -539,7 +592,7 @@ if __name__ == "__main__":
        "--dataset-name",
        type=str,
        default="sharegpt",
-        choices=["sharegpt", "sonnet"],
+        choices=["sharegpt", "sonnet", "random"],
        help="Name of the dataset to benchmark on.",
    )
    parser.add_argument("--dataset-path",
@@ -556,7 +609,7 @@ if __name__ == "__main__":
        "--tokenizer",
        type=str,
        help=
-        "Name or path of the tokenizer, if not using the default tokenizer.",
+        "Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
    )
    parser.add_argument(
        "--best-of",
@@ -599,6 +652,27 @@ if __name__ == "__main__":
        help=
        "Number of prefix tokens per request, used only for sonnet dataset.",
    )
+    parser.add_argument(
+        "--random-input-len",
+        type=int,
+        default=1024,
+        help=
+        "Number of input tokens per request, used only for random sampling.",
+    )
+    parser.add_argument(
+        "--random-output-len",
+        type=int,
+        default=128,
+        help=
+        "Number of output tokens per request, used only for random sampling.",
+    )
+    parser.add_argument(
+        "--random-range-ratio",
+        type=float,
+        default=1.0,
+        help="Range of sampled ratio of input/output length, "
+        "used only for random sampling.",
+    )
    parser.add_argument(
        "--request-rate",
        type=float,
@@ -639,6 +713,15 @@ if __name__ == "__main__":
        help="Specify directory to save benchmark json results."
        "If not specified, results are saved in the current directory.",
    )
+    parser.add_argument(
+        "--result-filename",
+        type=str,
+        default=None,
+        help="Specify the filename to save benchmark json results."
+        "If not specified, results will be saved in "
+        "{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"
+        " format.",
+    )

    args = parser.parse_args()
    main(args)
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
--- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
+++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
--- a/benchmarks/cutlass_benchmarks/weight_shapes.py
+++ b/benchmarks/cutlass_benchmarks/weight_shapes.py
@@ -22,6 +22,12 @@ WEIGHT_SHAPES = {
        ([4096, 22016], 1),
        ([11008, 4096], 0),
    ],
+    "meta-llama/Llama-3-8b": [
+        ([4096, 6144], 1),
+        ([4096, 4096], 0),
+        ([4096, 28672], 1),
+        ([14336, 4096], 0),
+    ],
    "meta-llama/Llama-2-13b-hf": [
        ([5120, 15360], 1),
        ([5120, 5120], 0),

--- a/benchmarks/kernels/benchmark_aqlm.py
+++ b/benchmarks/kernels/benchmark_aqlm.py
--- a/benchmarks/kernels/benchmark_marlin.py
+++ b/benchmarks/kernels/benchmark_marlin.py
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py