Merge tag 'v0.7.1' into v0.7.1-dev

afd0da21 · zhuwenwen · 1a11f127 · 4f4d427a · afd0da21 · afd0da21
Commit afd0da21 authored Feb 03, 2025 by zhuwenwen
20 changed files
--- a/Dockerfile.openvino
+++ b/Dockerfile.openvino
@@ -14,6 +14,7 @@ ARG GIT_REPO_CHECK=0
 RUN --mount=type=bind,source=.git,target=.git \
    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi

+RUN python3 -m pip install -U pip
 # install build requirements
 RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/requirements-build.txt
 # build vLLM with OpenVINO backend

--- a/Dockerfile.ppc64le
+++ b/Dockerfile.ppc64le
@@ -4,12 +4,12 @@ USER root

 ENV PATH="/usr/local/cargo/bin:$PATH:/opt/conda/bin/"

-RUN apt-get update -y && apt-get install -y git wget curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1 
+RUN apt-get update -y && apt-get install -y git wget kmod curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1 libssl-dev 

 # Some packages in requirements-cpu are installed here
 # IBM provides optimized packages for ppc64le processors in the open-ce project for mamba
 # Currently these may not be available for venv or pip directly
-RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 torchvision-cpu=0.16.2 rust && micromamba clean --all --yes
+RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 rust && micromamba clean --all --yes

 COPY ./ /workspace/vllm

@@ -18,11 +18,9 @@ ARG GIT_REPO_CHECK=0
 RUN --mount=type=bind,source=.git,target=.git \
    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi

-# These packages will be in rocketce eventually
 RUN --mount=type=cache,target=/root/.cache/pip  \
-    pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \
+    RUSTFLAGS='-L /opt/conda/lib' pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \
        'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
-        torch==2.3.1 \
        -r requirements-cpu.txt \
        xformers uvloop==0.20.0


--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
-# Default ROCm 6.2 base image
-ARG BASE_IMAGE="rocm/pytorch:rocm6.2_ubuntu20.04_py3.9_pytorch_release_2.3.0"
+# default base image
+ARG REMOTE_VLLM="0"
+ARG USE_CYTHON="0"
+ARG BUILD_RPD="1"
+ARG COMMON_WORKDIR=/app
+ARG BASE_IMAGE=rocm/vllm-dev:base

-# Default ROCm ARCHes to build vLLM for.
-ARG PYTORCH_ROCM_ARCH="gfx908;gfx90a;gfx942;gfx1100"
+FROM ${BASE_IMAGE} AS base

-# Whether to install CK-based flash-attention
-# If 0, will not install flash-attention
-ARG BUILD_FA="1"
-ARG FA_GFX_ARCHS="gfx90a;gfx942"
-ARG FA_BRANCH="3cea2fb"
-
-# Whether to build triton on rocm
-ARG BUILD_TRITON="1"
-ARG TRITON_BRANCH="e192dba"
-
-### Base image build stage
-FROM $BASE_IMAGE AS base
-
-# Import arg(s) defined before this build stage
-ARG PYTORCH_ROCM_ARCH
+ARG ARG_PYTORCH_ROCM_ARCH
+ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH:-${PYTORCH_ROCM_ARCH}}

 # Install some basic utilities
-RUN apt-get update && apt-get install python3 python3-pip -y
-RUN apt-get update && apt-get install -y \
-    curl \
-    ca-certificates \
-    sudo \
-    git \
-    bzip2 \
-    libx11-6 \
-    build-essential \
-    wget \
-    unzip \
-    tmux \
-    ccache \
- && rm -rf /var/lib/apt/lists/*
-
-# When launching the container, mount the code directory to /vllm-workspace
-ARG APP_MOUNT=/vllm-workspace
-WORKDIR ${APP_MOUNT}
-
-RUN python3 -m pip install --upgrade pip
-# Remove sccache so it doesn't interfere with ccache
-# TODO: implement sccache support across components
+RUN apt-get update -q -y && apt-get install -q -y \
+    sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev
+# Upgrade pip and install setuptools_scm, then remove sccache
+RUN python3 -m pip install --upgrade pip && pip install setuptools_scm
 RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
+ARG COMMON_WORKDIR
+WORKDIR ${COMMON_WORKDIR}
+
+
+# -----------------------
+# vLLM fetch stages
+FROM base AS fetch_vllm_0
+ONBUILD COPY ./ vllm/
+FROM base AS fetch_vllm_1
+ARG VLLM_REPO="https://github.com/vllm-project/vllm.git"
+ARG VLLM_BRANCH="main"
+ONBUILD RUN git clone ${VLLM_REPO} \
+	    && cd vllm \
+	    && git checkout ${VLLM_BRANCH}
+FROM fetch_vllm_${REMOTE_VLLM} AS fetch_vllm
+
+# -----------------------
+# vLLM build stages
+FROM fetch_vllm AS build_vllm
+ARG USE_CYTHON
+# Build the vLLM wheel into vllm/dist.
+# The Cython extension step only runs when USE_CYTHON is exactly "1"; the
+# variable is quoted and compared as a string so that an empty or unset value
+# is a clean "false" instead of a `[` syntax error.
+RUN cd vllm \
+    && python3 -m pip install -r requirements-rocm.txt \
+    && python3 setup.py clean --all  \
+    && if [ "${USE_CYTHON}" = "1" ]; then python3 setup_cython.py build_ext --inplace; fi \
+    && python3 setup.py bdist_wheel --dist-dir=dist
+FROM scratch AS export_vllm
+ARG COMMON_WORKDIR
+COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/dist/*.whl /
+COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/requirements*.txt /
+COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/benchmarks /benchmarks
+COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/tests /tests
+COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/examples /examples
+COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/.buildkite /.buildkite
+
+# -----------------------
+# Test vLLM image
+FROM base AS test
+
+RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/*
+
+# Install vLLM
+RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
+    cd /install \
+    && pip install -U -r requirements-rocm.txt \
+    && pip uninstall -y vllm \
+    && pip install *.whl
+
+WORKDIR /vllm-workspace
+ARG COMMON_WORKDIR
+COPY --from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace
+
+# install development dependencies (for testing)
+RUN cd /vllm-workspace \
+    && rm -rf vllm \
+    && python3 -m pip install -e tests/vllm_test_utils \
+    && python3 -m pip install lm-eval[api]==0.4.4 \
+    && python3 -m pip install pytest-shard
+
+# -----------------------
+# Final vLLM image
+FROM base AS final

-# Install torch == 2.6.0 on ROCm
-RUN --mount=type=cache,target=/root/.cache/pip \
-    case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
-        *"rocm-6.2"*) \
-            python3 -m pip uninstall -y torch torchvision \
-            && python3 -m pip install --pre \
-                torch==2.6.0.dev20241113+rocm6.2 \
-                'setuptools-scm>=8' \
-                torchvision==0.20.0.dev20241113+rocm6.2 \
-                --extra-index-url https://download.pytorch.org/whl/nightly/rocm6.2;; \
+RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/*
+# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt.
+# Manually remove it so that later steps of numpy upgrade can continue
+RUN case "$(which python3)" in \
+        *"/opt/conda/envs/py_3.9"*) \
+            rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/;; \
        *) ;; esac

-ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer
-ENV PATH=$PATH:/opt/rocm/bin:/libtorch/bin:
-ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/:/libtorch/lib:
-ENV CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/libtorch/include:/libtorch/include/torch/csrc/api/include/:/opt/rocm/include/:
-
-ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
-ENV CCACHE_DIR=/root/.cache/ccache
-
-
-### AMD-SMI build stage
-FROM base AS build_amdsmi
-# Build amdsmi wheel always
-RUN cd /opt/rocm/share/amd_smi \
-    && python3 -m pip wheel . --wheel-dir=/install
-
-
-### Flash-Attention wheel build stage
-FROM base AS build_fa
-ARG BUILD_FA
-ARG FA_GFX_ARCHS
-ARG FA_BRANCH
-# Build ROCm flash-attention wheel if `BUILD_FA = 1`
-RUN --mount=type=cache,target=${CCACHE_DIR} \
-    if [ "$BUILD_FA" = "1" ]; then \
-        mkdir -p libs \
-        && cd libs \
-        && git clone https://github.com/ROCm/flash-attention.git \
-        && cd flash-attention \
-        && git checkout "${FA_BRANCH}" \
-        && git submodule update --init \
-        && GPU_ARCHS="${FA_GFX_ARCHS}" python3 setup.py bdist_wheel --dist-dir=/install; \
-    # Create an empty directory otherwise as later build stages expect one
-    else mkdir -p /install; \
-    fi
-
-
-### Triton wheel build stage
-FROM base AS build_triton
-ARG BUILD_TRITON
-ARG TRITON_BRANCH
-# Build triton wheel if `BUILD_TRITON = 1`
-RUN --mount=type=cache,target=${CCACHE_DIR} \
-    if [ "$BUILD_TRITON" = "1" ]; then \
-    mkdir -p libs \
-    && cd libs \
-    && python3 -m pip install ninja cmake wheel pybind11 \
-    && git clone https://github.com/OpenAI/triton.git \
-    && cd triton \
-    && git checkout "${TRITON_BRANCH}" \
-    && cd python \
-    && python3 setup.py bdist_wheel --dist-dir=/install; \
-    # Create an empty directory otherwise as later build stages expect one
-    else mkdir -p /install; \
-    fi
-
-
-### Final vLLM build stage
-FROM base AS final
-# Import the vLLM development directory from the build context
-COPY . .
-ARG GIT_REPO_CHECK=0
-RUN --mount=type=bind,source=.git,target=.git \
-    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
+RUN python3 -m pip install --upgrade huggingface-hub[cli]
+ARG BUILD_RPD
+# Optionally build and install rocmProfileData (rpd_tracer and hipMarker) from
+# its nvtx_enabled branch. Runs only when BUILD_RPD is exactly "1"; the
+# variable is quoted and compared as a string so that an empty value is a
+# clean "false" instead of a `[` syntax error.
+RUN if [ "${BUILD_RPD}" = "1" ]; then \
+    git clone -b nvtx_enabled https://github.com/ROCm/rocmProfileData.git \
+    && cd rocmProfileData/rpd_tracer \
+    && pip install -r requirements.txt && cd ../ \
+    && make && make install \
+    && cd hipMarker && python3 setup.py install ; fi

-RUN python3 -m pip install --upgrade pip
+# Install vLLM
+RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
+    cd /install \
+    && pip install -U -r requirements-rocm.txt \
+    && pip uninstall -y vllm \
+    && pip install *.whl

-# Package upgrades for useful functionality or to avoid dependency issues
-RUN --mount=type=cache,target=/root/.cache/pip \
-    python3 -m pip install --upgrade numba scipy huggingface-hub[cli] pytest-shard
+ARG COMMON_WORKDIR

+# Copy over the benchmark scripts as well
+COPY --from=export_vllm /benchmarks ${COMMON_WORKDIR}/vllm/benchmarks
+COPY --from=export_vllm /examples ${COMMON_WORKDIR}/vllm/examples

-# Workaround for ray >= 2.10.0
 ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
-# Silences the HF Tokenizers warning
 ENV TOKENIZERS_PARALLELISM=false

-RUN --mount=type=cache,target=${CCACHE_DIR} \
-    --mount=type=bind,source=.git,target=.git \
-    --mount=type=cache,target=/root/.cache/pip \
-    python3 -m pip install -Ur requirements-rocm.txt \
-    && python3 setup.py clean --all \
-    && python3 setup.py develop
-
-# Copy amdsmi wheel into final image
-RUN --mount=type=bind,from=build_amdsmi,src=/install,target=/install \
-    mkdir -p libs \
-    && cp /install/*.whl libs \
-    # Preemptively uninstall to avoid same-version no-installs
-    && python3 -m pip uninstall -y amdsmi;
-
-# Copy triton wheel(s) into final image if they were built
-RUN --mount=type=bind,from=build_triton,src=/install,target=/install \
-    mkdir -p libs \
-    && if ls /install/*.whl; then \
-        cp /install/*.whl libs \
-        # Preemptively uninstall to avoid same-version no-installs
-        && python3 -m pip uninstall -y triton; fi
-
-# Copy flash-attn wheel(s) into final image if they were built
-RUN --mount=type=bind,from=build_fa,src=/install,target=/install \
-    mkdir -p libs \
-    && if ls /install/*.whl; then \
-        cp /install/*.whl libs \
-        # Preemptively uninstall to avoid same-version no-installs
-        && python3 -m pip uninstall -y flash-attn; fi
-
-# Install wheels that were built to the final image
-RUN --mount=type=cache,target=/root/.cache/pip \
-    if ls libs/*.whl; then \
-    python3 -m pip install libs/*.whl; fi
-
-# install development dependencies (for testing)
-RUN python3 -m pip install -e tests/vllm_test_utils
+# Performance environment variable.
+ENV HIP_FORCE_DEV_KERNARG=1

 CMD ["/bin/bash"]
+
--- a/Dockerfile.rocm_base
+++ b/Dockerfile.rocm_base
+ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:6.3.1-complete
+ARG HIPBLASLT_BRANCH="4d40e36"
+ARG HIPBLAS_COMMON_BRANCH="7c1566b"
+ARG LEGACY_HIPBLASLT_OPTION=
+ARG RCCL_BRANCH="648a58d"
+ARG RCCL_REPO="https://github.com/ROCm/rccl"
+ARG TRITON_BRANCH="e5be006"
+ARG TRITON_REPO="https://github.com/triton-lang/triton.git"
+ARG PYTORCH_BRANCH="8d4926e"
+ARG PYTORCH_VISION_BRANCH="v0.19.1"
+ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
+ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
+ARG FA_BRANCH="b7d29fb"
+ARG FA_REPO="https://github.com/ROCm/flash-attention.git"
+
+FROM ${BASE_IMAGE} AS base
+
+ENV PATH=/opt/rocm/llvm/bin:$PATH
+ENV ROCM_PATH=/opt/rocm
+ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib:
+ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942
+ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
+
+ARG PYTHON_VERSION=3.12
+
+RUN mkdir -p /app
+WORKDIR /app
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Install Python and other dependencies
+RUN apt-get update -y \
+    && apt-get install -y software-properties-common git curl sudo vim less \
+    && add-apt-repository ppa:deadsnakes/ppa \
+    && apt-get update -y \
+    && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
+       python${PYTHON_VERSION}-lib2to3 python-is-python3  \
+    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
+    && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
+    && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
+    && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
+    && python3 --version && python3 -m pip --version
+
+RUN pip install -U packaging cmake ninja wheel setuptools pybind11 Cython
+
+FROM base AS build_hipblaslt
+ARG HIPBLASLT_BRANCH
+ARG HIPBLAS_COMMON_BRANCH
+# Set to "--legacy_hipblas_direct" for ROCm<=6.2
+ARG LEGACY_HIPBLASLT_OPTION
+RUN git clone https://github.com/ROCm/hipBLAS-common.git
+RUN cd hipBLAS-common \
+    && git checkout ${HIPBLAS_COMMON_BRANCH} \
+    && mkdir build \
+    && cd build \
+    && cmake .. \
+    && make package \
+    && dpkg -i ./*.deb
+RUN git clone https://github.com/ROCm/hipBLASLt
+RUN cd hipBLASLt \
+    && git checkout ${HIPBLASLT_BRANCH} \
+    && ./install.sh -d --architecture ${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} \
+    && cd build/release \
+    && make package
+RUN mkdir -p /app/install && cp /app/hipBLASLt/build/release/*.deb /app/hipBLAS-common/build/*.deb /app/install
+
+FROM base AS build_rccl
+ARG RCCL_BRANCH
+ARG RCCL_REPO
+RUN git clone ${RCCL_REPO}
+RUN cd rccl \
+    && git checkout ${RCCL_BRANCH} \
+    && ./install.sh -p --amdgpu_targets ${PYTORCH_ROCM_ARCH}
+RUN mkdir -p /app/install && cp /app/rccl/build/release/*.deb /app/install
+
+FROM base AS build_triton
+ARG TRITON_BRANCH
+ARG TRITON_REPO
+RUN git clone ${TRITON_REPO}
+RUN cd triton \
+    && git checkout ${TRITON_BRANCH} \
+    && cd python \
+    && python3 setup.py bdist_wheel --dist-dir=dist
+RUN mkdir -p /app/install && cp /app/triton/python/dist/*.whl /app/install
+
+FROM base AS build_amdsmi
+RUN cd /opt/rocm/share/amd_smi \
+    && pip wheel . --wheel-dir=dist
+RUN mkdir -p /app/install && cp /opt/rocm/share/amd_smi/dist/*.whl /app/install
+
+FROM base AS build_pytorch
+ARG PYTORCH_BRANCH
+ARG PYTORCH_VISION_BRANCH
+ARG PYTORCH_REPO
+ARG PYTORCH_VISION_REPO
+ARG FA_BRANCH
+ARG FA_REPO
+RUN git clone ${PYTORCH_REPO} pytorch
+RUN cd pytorch && git checkout ${PYTORCH_BRANCH} && \
+    pip install -r requirements.txt && git submodule update --init --recursive \
+    && python3 tools/amd_build/build_amd.py \
+    && CMAKE_PREFIX_PATH=$(python3 -c 'import sys; print(sys.prefix)') python3 setup.py bdist_wheel --dist-dir=dist \
+    && pip install dist/*.whl
+RUN git clone ${PYTORCH_VISION_REPO} vision
+RUN cd vision && git checkout ${PYTORCH_VISION_BRANCH} \
+    && python3 setup.py bdist_wheel --dist-dir=dist \
+    && pip install dist/*.whl
+RUN git clone ${FA_REPO}
+RUN cd flash-attention \
+    && git checkout ${FA_BRANCH} \
+    && git submodule update --init \
+    && MAX_JOBS=64 GPU_ARCHS=${PYTORCH_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist
+RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \
+    && cp /app/vision/dist/*.whl /app/install \
+    && cp /app/flash-attention/dist/*.whl /app/install
+
+FROM base AS final
+RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \
+    dpkg -i /install/*deb \
+    && sed -i 's/, hipblaslt-dev \(.*\), hipcub-dev/, hipcub-dev/g' /var/lib/dpkg/status \
+    && sed -i 's/, hipblaslt \(.*\), hipfft/, hipfft/g' /var/lib/dpkg/status
+RUN --mount=type=bind,from=build_rccl,src=/app/install/,target=/install \
+    dpkg -i /install/*deb \
+    && sed -i 's/, rccl-dev \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status \
+    && sed -i 's/, rccl \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status
+RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \
+    pip install /install/*.whl
+RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \
+    pip install /install/*.whl
+RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \
+    pip install /install/*.whl
+
+# Re-declare the build args that were defined before the first FROM: such args
+# are not visible inside a build stage until they are declared again there.
+# HIPBLAS_COMMON_BRANCH must be listed too, otherwise it is recorded as empty.
+ARG BASE_IMAGE
+ARG HIPBLAS_COMMON_BRANCH
+ARG HIPBLASLT_BRANCH
+ARG LEGACY_HIPBLASLT_OPTION
+ARG RCCL_BRANCH
+ARG RCCL_REPO
+ARG TRITON_BRANCH
+ARG TRITON_REPO
+ARG PYTORCH_BRANCH
+ARG PYTORCH_VISION_BRANCH
+ARG PYTORCH_REPO
+ARG PYTORCH_VISION_REPO
+ARG FA_BRANCH
+ARG FA_REPO
+# Record the base image and component branches/repos in /app/versions.txt.
+RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
+    && echo "HIPBLAS_COMMON_BRANCH: ${HIPBLAS_COMMON_BRANCH}" >> /app/versions.txt \
+    && echo "HIPBLASLT_BRANCH: ${HIPBLASLT_BRANCH}" >> /app/versions.txt \
+    && echo "LEGACY_HIPBLASLT_OPTION: ${LEGACY_HIPBLASLT_OPTION}" >> /app/versions.txt \
+    && echo "RCCL_BRANCH: ${RCCL_BRANCH}" >> /app/versions.txt \
+    && echo "RCCL_REPO: ${RCCL_REPO}" >> /app/versions.txt \
+    && echo "TRITON_BRANCH: ${TRITON_BRANCH}" >> /app/versions.txt \
+    && echo "TRITON_REPO: ${TRITON_REPO}" >> /app/versions.txt \
+    && echo "PYTORCH_BRANCH: ${PYTORCH_BRANCH}" >> /app/versions.txt \
+    && echo "PYTORCH_VISION_BRANCH: ${PYTORCH_VISION_BRANCH}" >> /app/versions.txt \
+    && echo "PYTORCH_REPO: ${PYTORCH_REPO}" >> /app/versions.txt \
+    && echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \
+    && echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \
+    && echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt
--- a/Dockerfile.tpu
+++ b/Dockerfile.tpu
-ARG NIGHTLY_DATE="20241017"
+ARG NIGHTLY_DATE="20250124"
 ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE"

 FROM $BASE_IMAGE

--- a/README.md
+++ b/README.md
@@ -33,7 +33,6 @@ vLLM是一个快速且易于使用的LLM推理和服务库,使用PageAttention

 ## 安装
 vLLM支持
-+ Python 3.8.
 + Python 3.9.
 + Python 3.10.
 + Python 3.11.
@@ -84,7 +83,7 @@ VLLM_INSTALL_PUNICA_KERNELS=1 python3 setup.py install （若调试，可使用V
 + 若使用 pip install 下载安装过慢,可添加源:-i https://pypi.tuna.tsinghua.edu.cn/simple/

 ## 验证
- python -c "import vllm; print(vllm.\_\_version__)",版本号与官方版本同步,查询该软件的版本号,例如0.6.6.post1;
+- python -c "import vllm; print(vllm.\_\_version__)",版本号与官方版本同步,查询该软件的版本号,例如0.7.1;

 ## Known Issue
 - 无

--- a/README_ORIGIN.md
+++ b/README_ORIGIN.md
@@ -16,6 +16,8 @@ Easy, fast, and cheap LLM serving for everyone
 ---

 *Latest News* 🔥
+- [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
+- [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing).
 - [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone!
 - [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing).
 - [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there!
@@ -34,10 +36,12 @@ Easy, fast, and cheap LLM serving for everyone
 ## About
 vLLM is a fast and easy-to-use library for LLM inference and serving.

+Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has evolved into a community-driven project with contributions from both academia and industry.
+
 vLLM is fast with:

 - State-of-the-art serving throughput
- Efficient management of attention key and value memory with **PagedAttention**
+- Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html)
 - Continuous batching of incoming requests
 - Fast model execution with CUDA/HIP graph
 - Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), INT4, INT8, and FP8.
@@ -68,16 +72,16 @@ Find the full list of supported models [here](https://docs.vllm.ai/en/latest/mod

 ## Getting Started

-Install vLLM with `pip` or [from source](https://vllm.readthedocs.io/en/latest/getting_started/installation.html#build-from-source):
+Install vLLM with `pip` or [from source](https://docs.vllm.ai/en/latest/getting_started/installation/gpu/index.html#build-wheel-from-source):

 ```bash
 pip install vllm
 ```

-Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to learn more.
- [Installation](https://vllm.readthedocs.io/en/latest/getting_started/installation.html)
- [Quickstart](https://vllm.readthedocs.io/en/latest/getting_started/quickstart.html)
- [Supported Models](https://vllm.readthedocs.io/en/latest/models/supported_models.html)
+Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more.
+- [Installation](https://docs.vllm.ai/en/latest/getting_started/installation/index.html)
+- [Quickstart](https://docs.vllm.ai/en/latest/getting_started/quickstart.html)
+- [List of Supported Models](https://docs.vllm.ai/en/latest/models/supported_models.html)

 ## Contributing

@@ -90,28 +94,33 @@ vLLM is a community project. Our compute resources for development and testing a

 <!-- Note: Please sort them in alphabetical order. -->
 <!-- Note: Please keep these consistent with docs/source/community/sponsors.md -->
-
+Cash Donations:
 - a16z
+- Dropbox
+- Sequoia Capital
+- Skywork AI
+- ZhenFund
+
+Compute Resources:
 - AMD
 - Anyscale
 - AWS
 - Crusoe Cloud
 - Databricks
 - DeepInfra
- Dropbox
 - Google Cloud
 - Lambda Lab
 - Nebius
+- Novita AI
 - NVIDIA
 - Replicate
 - Roblox
 - RunPod
- Sequoia Capital
- Skywork AI
 - Trainy
 - UC Berkeley
 - UC San Diego
- ZhenFund
+
+Slack Sponsor: Anyscale

 We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM.


--- a/SECURITY.md
+++ b/SECURITY.md
@@ -4,7 +4,7 @@

 If you believe you have found a security vulnerability in vLLM, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem.

-Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new).
+Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new). Reports will then be triaged by the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html).

 ---


--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -22,6 +22,7 @@ class RequestFuncInput:
    prompt_len: int
    output_len: int
    model: str
+    model_name: Optional[str] = None
    best_of: int = 1
    logprobs: Optional[int] = None
    extra_body: Optional[dict] = None
@@ -34,6 +35,7 @@ class RequestFuncOutput:
    generated_text: str = ""
    success: bool = False
    latency: float = 0.0
+    output_tokens: int = 0
    ttft: float = 0.0  # Time to first token
    itl: List[float] = field(
        default_factory=list)  # List of inter-token latencies
@@ -49,7 +51,8 @@ async def async_request_tgi(
    api_url = request_func_input.api_url
    assert api_url.endswith("generate_stream")

-    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+    async with aiohttp.ClientSession(trust_env=True,
+                                     timeout=AIOHTTP_TIMEOUT) as session:
        params = {
            "best_of": request_func_input.best_of,
            "max_new_tokens": request_func_input.output_len,
@@ -78,7 +81,7 @@ async def async_request_tgi(
                            continue
                        chunk_bytes = chunk_bytes.decode("utf-8")

-                        #NOTE: Sometimes TGI returns a ping response without
+                        # NOTE: Sometimes TGI returns a ping response without
                        # any data, we should skip it.
                        if chunk_bytes.startswith(":"):
                            continue
@@ -121,7 +124,8 @@ async def async_request_trt_llm(
    api_url = request_func_input.api_url
    assert api_url.endswith("generate_stream")

-    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+    async with aiohttp.ClientSession(trust_env=True,
+                                     timeout=AIOHTTP_TIMEOUT) as session:
        assert request_func_input.best_of == 1
        payload = {
            "accumulate_tokens": True,
@@ -155,7 +159,7 @@ async def async_request_trt_llm(
                        timestamp = time.perf_counter()
                        # First token
                        if ttft == 0.0:
-                            ttft = time.perf_counter() - st
+                            ttft = timestamp - st
                            output.ttft = ttft

                        # Decoding phase
@@ -185,7 +189,8 @@ async def async_request_deepspeed_mii(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
 ) -> RequestFuncOutput:
-    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+    async with aiohttp.ClientSession(trust_env=True,
+                                     timeout=AIOHTTP_TIMEOUT) as session:
        assert request_func_input.best_of == 1

        payload = {
@@ -233,17 +238,23 @@ async def async_request_openai_completions(
        ("completions", "profile")
    ), "OpenAI Completions API URL must end with 'completions' or 'profile'."

-    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+    async with aiohttp.ClientSession(trust_env=True,
+                                     timeout=AIOHTTP_TIMEOUT) as session:
        payload = {
-            "model": request_func_input.model,
+            "model": request_func_input.model_name \
+                if request_func_input.model_name else request_func_input.model,
            "prompt": request_func_input.prompt,
            "temperature": 0.0,
            "best_of": request_func_input.best_of,
            "max_tokens": request_func_input.output_len,
            "logprobs": request_func_input.logprobs,
            "stream": True,
-            "ignore_eos": request_func_input.ignore_eos,
+            "stream_options": {
+                "include_usage": True,
+            },
        }
+        if request_func_input.ignore_eos:
+            payload["ignore_eos"] = request_func_input.ignore_eos
        if request_func_input.extra_body:
            payload.update(request_func_input.extra_body)
        headers = {
@@ -254,7 +265,6 @@ async def async_request_openai_completions(
        output.prompt_len = request_func_input.prompt_len

        generated_text = ""
-        ttft = 0.0
        st = time.perf_counter()
        most_recent_timestamp = st
        try:
@@ -269,15 +279,16 @@ async def async_request_openai_completions(

                        chunk = chunk_bytes.decode("utf-8").removeprefix(
                            "data: ")
-                        if chunk == "[DONE]":
-                            latency = time.perf_counter() - st
-                        else:
+                        if chunk != "[DONE]":
                            data = json.loads(chunk)

                            # NOTE: Some completion API might have a last
                            # usage summary response without a token so we
                            # want to check a token was generated
-                            if data["choices"][0]["text"]:
+                            if choices := data.get("choices"):
+                                # Note that text could be empty here
+                                # e.g. for special tokens
+                                text = choices[0].get("text")
                                timestamp = time.perf_counter()
                                # First token
                                if not first_chunk_received:
@@ -291,7 +302,10 @@ async def async_request_openai_completions(
                                                      most_recent_timestamp)

                                most_recent_timestamp = timestamp
-                                generated_text += data["choices"][0]["text"]
+                                generated_text += text or ""
+                            elif usage := data.get("usage"):
+                                output.output_tokens = usage.get(
+                                    "completion_tokens")
                    if first_chunk_received:
                        output.success = True
                    else:
@@ -300,7 +314,7 @@ async def async_request_openai_completions(
                            "Never received a valid chunk to calculate TTFT."
                            "This response will be marked as failed!")
                    output.generated_text = generated_text
-                    output.latency = latency
+                    output.latency = most_recent_timestamp - st
                else:
                    output.error = response.reason or ""
                    output.success = False
@@ -323,12 +337,14 @@ async def async_request_openai_chat_completions(
        "chat/completions"
    ), "OpenAI Chat Completions API URL must end with 'chat/completions'."

-    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+    async with aiohttp.ClientSession(trust_env=True,
+                                     timeout=AIOHTTP_TIMEOUT) as session:
        content = [{"type": "text", "text": request_func_input.prompt}]
        if request_func_input.multi_modal_content:
            content.append(request_func_input.multi_modal_content)
        payload = {
-            "model": request_func_input.model,
+            "model": request_func_input.model_name \
+                if request_func_input.model_name else request_func_input.model,
            "messages": [
                {
                    "role": "user",
@@ -338,8 +354,12 @@ async def async_request_openai_chat_completions(
            "temperature": 0.0,
            "max_completion_tokens": request_func_input.output_len,
            "stream": True,
-            "ignore_eos": request_func_input.ignore_eos,
+            "stream_options": {
+                "include_usage": True,
+            },
        }
+        if request_func_input.ignore_eos:
+            payload["ignore_eos"] = request_func_input.ignore_eos
        if request_func_input.extra_body:
            payload.update(request_func_input.extra_body)
        headers = {
@@ -365,17 +385,15 @@ async def async_request_openai_chat_completions(

                        chunk = chunk_bytes.decode("utf-8").removeprefix(
                            "data: ")
-                        if chunk == "[DONE]":
-                            latency = time.perf_counter() - st
-                        else:
+                        if chunk != "[DONE]":
                            timestamp = time.perf_counter()
                            data = json.loads(chunk)

-                            delta = data["choices"][0]["delta"]
-                            if delta.get("content", None):
+                            if choices := data.get("choices"):
+                                content = choices[0]["delta"].get("content")
                                # First token
                                if ttft == 0.0:
-                                    ttft = time.perf_counter() - st
+                                    ttft = timestamp - st
                                    output.ttft = ttft

                                # Decoding phase
@@ -383,13 +401,16 @@ async def async_request_openai_chat_completions(
                                    output.itl.append(timestamp -
                                                      most_recent_timestamp)

-                                generated_text += delta["content"]
+                                generated_text += content or ""
+                            elif usage := data.get("usage"):
+                                output.output_tokens = usage.get(
+                                    "completion_tokens")

                            most_recent_timestamp = timestamp

                    output.generated_text = generated_text
                    output.success = True
-                    output.latency = latency
+                    output.latency = most_recent_timestamp - st
                else:
                    output.error = response.reason or ""
                    output.success = False
@@ -417,14 +438,35 @@ def get_model(pretrained_model_name_or_path: str) -> str:


 def get_tokenizer(
-    pretrained_model_name_or_path: str, trust_remote_code: bool
+    pretrained_model_name_or_path: str,
+    tokenizer_mode: str = "auto",
+    trust_remote_code: bool = False,
+    **kwargs,
 ) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
    if pretrained_model_name_or_path is not None and not os.path.exists(
            pretrained_model_name_or_path):
        pretrained_model_name_or_path = get_model(
            pretrained_model_name_or_path)
-    return AutoTokenizer.from_pretrained(pretrained_model_name_or_path,
-                                         trust_remote_code=trust_remote_code)
+    if tokenizer_mode == "slow":
+        if kwargs.get("use_fast", False):
+            raise ValueError(
+                "Cannot use the fast tokenizer in slow tokenizer mode.")
+        kwargs["use_fast"] = False
+    if tokenizer_mode == "mistral":
+        try:
+            from vllm.transformers_utils.tokenizer import MistralTokenizer
+        except ImportError as e:
+            raise ImportError("MistralTokenizer requires vllm package.\n"
+                              "Please install it with `pip install vllm` "
+                              "to use mistral tokenizer mode.") from e
+        return MistralTokenizer.from_pretrained(
+            str(pretrained_model_name_or_path))
+    else:
+        return AutoTokenizer.from_pretrained(
+            pretrained_model_name_or_path,
+            trust_remote_code=trust_remote_code,
+            **kwargs,
+        )


 ASYNC_REQUEST_FUNCS = {

--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -13,6 +13,7 @@ from tqdm import tqdm
 from vllm import LLM, SamplingParams
 from vllm.engine.arg_utils import EngineArgs
 from vllm.inputs import PromptType
+from vllm.sampling_params import BeamSearchParams
 from vllm.utils import FlexibleArgumentParser


@@ -40,6 +41,20 @@ def main(args: argparse.Namespace):
        "prompt_token_ids": batch
    } for batch in dummy_prompt_token_ids.tolist()]

+    def llm_generate():
+        if not args.use_beam_search:
+            llm.generate(dummy_prompts,
+                         sampling_params=sampling_params,
+                         use_tqdm=False)
+        else:
+            llm.beam_search(
+                dummy_prompts,
+                BeamSearchParams(
+                    beam_width=args.n,
+                    max_tokens=args.output_len,
+                    ignore_eos=True,
+                ))
+
    def run_to_completion(profile_dir: Optional[str] = None):
        if profile_dir:
            with torch.profiler.profile(
@@ -49,15 +64,11 @@ def main(args: argparse.Namespace):
                    ],
                    on_trace_ready=torch.profiler.tensorboard_trace_handler(
                        str(profile_dir))) as p:
-                llm.generate(dummy_prompts,
-                             sampling_params=sampling_params,
-                             use_tqdm=False)
-            print(p.key_averages())
+                llm_generate()
+            print(p.key_averages().table(sort_by="self_cuda_time_total"))
        else:
            start_time = time.perf_counter()
-            llm.generate(dummy_prompts,
-                         sampling_params=sampling_params,
-                         use_tqdm=False)
+            llm_generate()
            end_time = time.perf_counter()
            latency = end_time - start_time
            return latency

--- a/benchmarks/benchmark_long_document_qa_throughput.py
+++ b/benchmarks/benchmark_long_document_qa_throughput.py
+"""
+Offline benchmark to test the long document QA throughput.
+
+Example usage:
+    # This workload samples 8 different prompts with a default input
+    # length of 20000 tokens, then replicates each prompt 2 times 
+    # in random order.
+    python benchmark_long_document_qa_throughput.py \
+        --model meta-llama/Llama-2-7b-chat-hf \
+        --enable-prefix-caching \
+        --num-documents 8 \
+        --repeat-count 2 
+
+Commandline arguments:
+    --num-documents: The number of documents to sample prompts from.
+                     (Optional, default: 8)
+
+    --document-length: The length of each document in tokens. 
+                       (Optional, default: 20000)
+
+    --output-len: The number of tokens to generate for each prompt.
+                  (Optional, default: 10)
+
+    --repeat-count: The number of times to repeat each prompt.
+                    (Optional, default: 2)
+
+    --repeat-mode: The mode to repeat prompts. The supported modes are:
+        - 'random': shuffle the prompts randomly. (Default)
+        - 'tile': the entire prompt list is repeated in sequence. (Potentially
+                  lowest cache hit)
+        - 'interleave': each prompt is repeated consecutively before 
+                        moving to the next element. (Highest cache hit)
+    
+    --shuffle-seed: Random seed when the repeat mode is "random".
+                    (Optional, default: 0)
+
+In addition, this script accepts all of the vLLM engine arguments used to
+initialize the LLM engine. Refer to `vllm.engine.arg_utils.EngineArgs` for
+more details.
+"""
+
+import dataclasses
+import random
+import time
+
+from vllm import LLM, SamplingParams
+from vllm.engine.arg_utils import EngineArgs
+from vllm.utils import FlexibleArgumentParser
+
+
+def test_long_document_qa(llm=None, sampling_params=None, prompts=None):
+    """
+    Test long document QA with the given prompts and sampling parameters.
+    Print the time spent in processing all the prompts.
+
+    The elapsed time is measured with `time.perf_counter()`, a monotonic
+    clock, so the reported duration cannot be distorted (or become
+    negative) because the system clock was adjusted during the run.
+
+    Args:
+        llm: The language model used for generating responses. The default
+            of None is only a placeholder; `llm.generate` is called
+            unconditionally, so a real engine must be supplied.
+        sampling_params: Sampling parameter used to generate the response.
+        prompts: A list of prompt strings to be processed by the LLM.
+    """
+    start_time = time.perf_counter()
+    llm.generate(prompts, sampling_params=sampling_params)
+    end_time = time.perf_counter()
+    print(f"Time to execute all requests: {end_time - start_time:.4f} secs")
+
+
+def repeat_prompts(prompts, repeat_count, mode: str):
+    """
+    Expand a prompt list so that every prompt occurs `repeat_count` times.
+    The ordering of the expanded list is selected by `mode`.
+
+    Args:
+        prompts: A list of prompts to be repeated.
+        repeat_count: The number of times each prompt is repeated.
+        mode: The ordering strategy. One of:
+            - 'random': Tile the list, then shuffle it in place using the
+              module-level `random` generator.
+            - 'tile': Repeat the entire prompt list in sequence.
+              Example: [1, 2, 3] -> [1, 2, 3, 1, 2, 3].
+            - 'interleave': Emit all copies of a prompt before moving to
+              the next one. Example: [1, 2, 3] -> [1, 1, 2, 2, 3, 3].
+
+    Returns:
+        A new list holding the repeated prompts in the requested order.
+
+    Raises:
+        ValueError: If `mode` is not one of the supported modes.
+    """
+    print("Repeat mode: ", mode)
+
+    # Reject unknown modes up front (after logging the requested mode).
+    if mode not in ('random', 'tile', 'interleave'):
+        raise ValueError(f"Invalid mode: {mode}, only support "
+                         "'random', 'tile', 'interleave'")
+
+    if mode == 'interleave':
+        return [prompt for prompt in prompts for _ in range(repeat_count)]
+
+    # 'tile' and 'random' both start from the tiled list; 'random'
+    # additionally shuffles the freshly created copy.
+    tiled_prompts = prompts * repeat_count
+    if mode == 'random':
+        random.shuffle(tiled_prompts)
+    return tiled_prompts
+
+
+def main(args):
+    """
+    Build the synthetic workload, create the engine, and time two passes:
+    a warm-up pass followed by the measured pass.
+
+    Args:
+        args: Parsed command-line namespace. Reads `shuffle_seed`,
+            `document_length`, `num_documents`, `repeat_count`,
+            `repeat_mode` and `output_len`, and is also handed to
+            `EngineArgs.from_cli_args` to build the engine configuration.
+    """
+    random.seed(args.shuffle_seed)
+
+    # Every document shares the same filler body of repeated words.
+    filler = ' '.join(['hi'] * args.document_length)
+    document_ids = range(args.num_documents)
+
+    # Prepare the prompts:
+    # the document id is prepended to the body to avoid any of the
+    # documents being the prefix of other documents
+    documents = [f"{doc_id}{filler}" for doc_id in document_ids]
+    prompts = repeat_prompts(documents,
+                             args.repeat_count,
+                             mode=args.repeat_mode)
+
+    # Warm-up prompts start with different text than the measured prompts.
+    warmup_prompts = [
+        f"This is warm up request {doc_id}{filler}"
+        for doc_id in document_ids
+    ]
+
+    # Create the LLM engine
+    engine_args = EngineArgs.from_cli_args(args)
+    engine_kwargs = dataclasses.asdict(engine_args)
+    llm = LLM(**engine_kwargs)
+    sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
+
+    print("------warm up------")
+    test_long_document_qa(llm=llm,
+                          prompts=warmup_prompts,
+                          sampling_params=sampling_params)
+
+    print("------start generating------")
+    test_long_document_qa(llm=llm,
+                          prompts=prompts,
+                          sampling_params=sampling_params)
+
+
+if __name__ == "__main__":
+    # Command-line entry point: declare the benchmark-specific options,
+    # add every vLLM engine option on top, then run the benchmark.
+    parser = FlexibleArgumentParser(
+        description=
+        'Benchmark the performance with or without automatic prefix caching.')
+
+    parser.add_argument(
+        '--document-length',
+        type=int,
+        # Roughly the number of tokens for a system paper,
+        # excluding images
+        default=20000,
+        help='The length of each document in tokens (approximate: the '
+        'document body is this many repetitions of a filler word).')
+
+    parser.add_argument('--num-documents',
+                        type=int,
+                        default=8,
+                        help='The number of documents to sample prompts '
+                        'from.')
+
+    parser.add_argument('--output-len',
+                        type=int,
+                        default=10,
+                        help='The number of tokens to generate for each '
+                        'prompt.')
+
+    parser.add_argument('--repeat-count',
+                        type=int,
+                        default=2,
+                        help='Number of times to repeat each prompt')
+
+    # `choices` rejects an unsupported mode during argument parsing rather
+    # than later inside repeat_prompts().
+    parser.add_argument("--repeat-mode",
+                        type=str,
+                        default='random',
+                        choices=['random', 'tile', 'interleave'],
+                        help='The mode to repeat prompts. The supported '
+                        'modes are "random", "tile", and "interleave". '
+                        'See repeat_prompts() in the source code for details.')
+
+    parser.add_argument("--shuffle-seed",
+                        type=int,
+                        default=0,
+                        help='Random seed when the repeat mode is "random"')
+
+    parser = EngineArgs.add_cli_args(parser)
+    args = parser.parse_args()
+    main(args)
--- a/benchmarks/benchmark_prefix_caching.py
+++ b/benchmarks/benchmark_prefix_caching.py
@@ -10,7 +10,8 @@ Fixed example usage:
        --model meta-llama/Llama-2-7b-chat-hf \
        --enable-prefix-caching \
        --num-prompts 1 \
-        --repeat-count 100
+        --repeat-count 100 \
+        --input-length-range 128:256

 ShareGPT example usage:
    # This command samples 20 prompts with input lengths

--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -25,6 +25,7 @@ On the client side, run:
 import argparse
 import asyncio
 import base64
+import gc
 import io
 import json
 import os
@@ -199,7 +200,7 @@ def sample_sonnet_requests(
    return sampled_requests


-def sample_mmmu_pro_vision_requests(
+def sample_vision_arena_requests(
    dataset,
    num_requests: int,
    tokenizer: PreTrainedTokenizerBase,
@@ -211,13 +212,7 @@ def sample_mmmu_pro_vision_requests(
        if len(sampled_requests) == num_requests:
            break

-        # MMMU-Pro vision direct prompt
-        # Ref: https://github.com/MMMU-Benchmark/MMMU/blob/6ce42f4d8f70c1841c67867152648974415b5cac/mmmu-pro/prompts.yaml#L5
-        prompt = (
-            "Answer with the option letter from the given choices directly. "
-            "The last line of your response should be of the following "
-            "format: 'Answer: $LETTER' (without quotes) where LETTER is one of "
-            "options.")
+        prompt = data["turns"][0][0]['content']

        prompt_token_ids = tokenizer(prompt).input_ids
        if fixed_output_len is None:
@@ -229,10 +224,10 @@ def sample_mmmu_pro_vision_requests(
        output_len = fixed_output_len

        assert isinstance(
-            data["image"],
+            data["images"][0],
            Image), ("Input image format must be `PIL.Image.Image`, "
                     f"given {type(data['image'])}.")
-        image: Image = data["image"]
+        image: Image = data["images"][0]
        image = image.convert("RGB")
        image_data = io.BytesIO()
        image.save(image_data, format='JPEG')
@@ -251,7 +246,7 @@ def sample_mmmu_pro_vision_requests(

 def sample_hf_requests(
    dataset_path: str,
-    dataset_subset: str,
+    dataset_subset: Optional[str],
    dataset_split: str,
    num_requests: int,
    tokenizer: PreTrainedTokenizerBase,
@@ -259,19 +254,17 @@ def sample_hf_requests(
    fixed_output_len: Optional[int] = None,
 ) -> List[Tuple[str, str, int, Optional[Dict[str, Collection[str]]]]]:

-    # Special case for MMMU-Pro vision dataset
-    if dataset_path == 'MMMU/MMMU_Pro' and dataset_subset == 'vision':
-        assert dataset_split == "test"
+    # Special case for vision_arena dataset
+    if dataset_path == 'lmarena-ai/vision-arena-bench-v0.1' \
+        and dataset_subset is None:
+        assert dataset_split == "train"
        dataset = load_dataset(dataset_path,
                               name=dataset_subset,
                               split=dataset_split,
                               streaming=True)
-        assert "image" in dataset.features, (
-            "MMMU/MMMU_Pro vision dataset must have 'image' column.")
-        filter_func = lambda x: isinstance(x["image"], Image)
-        dataset = dataset.shuffle(seed=random_seed).filter(filter_func)
-        return sample_mmmu_pro_vision_requests(dataset, num_requests,
-                                               tokenizer, fixed_output_len)
+        dataset = dataset.shuffle(seed=random_seed)
+        return sample_vision_arena_requests(dataset, num_requests, tokenizer,
+                                            fixed_output_len)

    dataset = load_dataset(dataset_path,
                           name=dataset_subset,
@@ -423,7 +416,7 @@ def calculate_metrics(
    tokenizer: PreTrainedTokenizerBase,
    selected_percentile_metrics: List[str],
    selected_percentiles: List[float],
-    gootput_config_dict: Dict[str, float],
+    goodput_config_dict: Dict[str, float],
 ) -> Tuple[BenchmarkMetrics, List[int]]:
    actual_output_lens: List[int] = []
    total_input = 0
@@ -436,19 +429,23 @@ def calculate_metrics(
    e2els: List[float] = []
    for i in range(len(outputs)):
        if outputs[i].success:
-            # We use the tokenizer to count the number of output tokens for all
-            # serving backends instead of looking at len(outputs[i].itl) since
-            # multiple output tokens may be bundled together
-            # Note : this may inflate the output token count slightly
-            output_len = len(
-                tokenizer(outputs[i].generated_text,
-                          add_special_tokens=False).input_ids)
+            output_len = outputs[i].output_tokens
+
+            if output_len is None:
+                # We use the tokenizer to count the number of output tokens
+                # for some serving backends instead of looking at
+                # len(outputs[i].itl) since multiple output tokens may be
+                # bundled together
+                # Note : this may inflate the output token count slightly
+                output_len = len(
+                    tokenizer(outputs[i].generated_text,
+                              add_special_tokens=False).input_ids)
            actual_output_lens.append(output_len)
            total_input += input_requests[i][1]
            tpot = 0
            if output_len > 1:
-                tpot = (outputs[i].latency - outputs[i].ttft) / (output_len -
-                                                                 1)
+                latency_minus_ttft = outputs[i].latency - outputs[i].ttft
+                tpot = latency_minus_ttft / (output_len - 1)
                tpots.append(tpot)
            # Note: if output_len <= 1, we regard tpot as 0 for goodput
            all_tpots.append(tpot)
@@ -459,21 +456,21 @@ def calculate_metrics(
        else:
            actual_output_lens.append(0)

-    if gootput_config_dict:
+    if goodput_config_dict:
        valid_metrics = []
        slo_values = []

-        if "ttft" in gootput_config_dict:
+        if "ttft" in goodput_config_dict:
            valid_metrics.append(ttfts)
-            slo_values.append(gootput_config_dict["ttft"] /
+            slo_values.append(goodput_config_dict["ttft"] /
                              MILLISECONDS_TO_SECONDS_CONVERSION)
-        if "tpot" in gootput_config_dict:
+        if "tpot" in goodput_config_dict:
            valid_metrics.append(all_tpots)
-            slo_values.append(gootput_config_dict["tpot"] /
+            slo_values.append(goodput_config_dict["tpot"] /
                              MILLISECONDS_TO_SECONDS_CONVERSION)
-        if "e2el" in gootput_config_dict:
+        if "e2el" in goodput_config_dict:
            valid_metrics.append(e2els)
-            slo_values.append(gootput_config_dict["e2el"] /
+            slo_values.append(goodput_config_dict["e2el"] /
                              MILLISECONDS_TO_SECONDS_CONVERSION)

        for req_metric in zip(*valid_metrics):
@@ -525,6 +522,7 @@ async def benchmark(
    api_url: str,
    base_url: str,
    model_id: str,
+    model_name: str,
    tokenizer: PreTrainedTokenizerBase,
    input_requests: List[Tuple[str, int, int]],
    logprobs: Optional[int],
@@ -536,7 +534,7 @@ async def benchmark(
    selected_percentile_metrics: List[str],
    selected_percentiles: List[str],
    ignore_eos: bool,
-    gootput_config_dict: Dict[str, float],
+    goodput_config_dict: Dict[str, float],
    max_concurrency: Optional[int],
 ):
    if backend in ASYNC_REQUEST_FUNCS:
@@ -553,6 +551,7 @@ async def benchmark(
            "Multi-modal content is only supported on 'openai-chat' backend.")
    test_input = RequestFuncInput(
        model=model_id,
+        model_name=model_name,
        prompt=test_prompt,
        api_url=api_url,
        prompt_len=test_prompt_len,
@@ -573,6 +572,7 @@ async def benchmark(
    if profile:
        print("Starting profiler...")
        profile_input = RequestFuncInput(model=model_id,
+                                         model_name=model_name,
                                         prompt=test_prompt,
                                         api_url=base_url + "/start_profile",
                                         prompt_len=test_prompt_len,
@@ -616,6 +616,7 @@ async def benchmark(
    async for request in get_request(input_requests, request_rate, burstiness):
        prompt, prompt_len, output_len, mm_content = request
        request_func_input = RequestFuncInput(model=model_id,
+                                              model_name=model_name,
                                              prompt=prompt,
                                              api_url=api_url,
                                              prompt_len=prompt_len,
@@ -657,7 +658,7 @@ async def benchmark(
        tokenizer=tokenizer,
        selected_percentile_metrics=selected_percentile_metrics,
        selected_percentiles=selected_percentiles,
-        gootput_config_dict=gootput_config_dict,
+        goodput_config_dict=goodput_config_dict,
    )

    print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='='))
@@ -669,7 +670,7 @@ async def benchmark(
                                 metrics.total_output))
    print("{:<40} {:<10.2f}".format("Request throughput (req/s):",
                                    metrics.request_throughput))
-    if gootput_config_dict:
+    if goodput_config_dict:
        print("{:<40} {:<10.2f}".format("Request goodput (req/s):",
                                        metrics.request_goodput))
    print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):",
@@ -684,7 +685,7 @@ async def benchmark(
        "total_output_tokens": metrics.total_output,
        "request_throughput": metrics.request_throughput,
        "request_goodput:":
-        metrics.request_goodput if gootput_config_dict else None,
+        metrics.request_goodput if goodput_config_dict else None,
        "output_throughput": metrics.output_throughput,
        "total_token_throughput": metrics.total_token_throughput,
        "input_lens": [output.prompt_len for output in outputs],
@@ -740,11 +741,11 @@ async def benchmark(

 def check_goodput_args(args):
    # Check and parse goodput arguments
-    gootput_config_dict = {}
+    goodput_config_dict = {}
    VALID_NAMES = ["ttft", "tpot", "e2el"]
    if args.goodput:
-        gootput_config_dict = parse_goodput(args.goodput)
-        for slo_name, slo_val in gootput_config_dict.items():
+        goodput_config_dict = parse_goodput(args.goodput)
+        for slo_name, slo_val in goodput_config_dict.items():
            if slo_name not in VALID_NAMES:
                raise ValueError(
                    f"Invalid metric name found, {slo_name}: {slo_val}. "
@@ -755,22 +756,22 @@ def check_goodput_args(args):
                    f"Invalid value found, {slo_name}: {slo_val}. "
                    "The service level objective value should be "
                    "non-negative.")
-    return gootput_config_dict
+    return goodput_config_dict


 def parse_goodput(slo_pairs):
-    gootput_config_dict = {}
+    goodput_config_dict = {}
    try:
        for slo_pair in slo_pairs:
            slo_name, slo_val = slo_pair.split(":")
-            gootput_config_dict[slo_name] = float(slo_val)
+            goodput_config_dict[slo_name] = float(slo_val)
    except ValueError as err:
        raise argparse.ArgumentTypeError(
            "Invalid format found for service level objectives. "
            "Specify service level objectives for goodput as \"KEY:VALUE\" "
            "pairs, where the key is a metric name, and the value is a "
            "number in milliseconds.") from err
-    return gootput_config_dict
+    return goodput_config_dict


 def main(args: argparse.Namespace):
@@ -780,6 +781,7 @@ def main(args: argparse.Namespace):

    backend = args.backend
    model_id = args.model
+    model_name = args.served_model_name
    tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
    tokenizer_mode = args.tokenizer_mode

@@ -869,7 +871,11 @@ def main(args: argparse.Namespace):
    else:
        raise ValueError(f"Unknown dataset: {args.dataset_name}")

-    gootput_config_dict = check_goodput_args(args)
+    goodput_config_dict = check_goodput_args(args)
+
+    # Avoid GC processing "static" data - reduce pause times.
+    gc.collect()
+    gc.freeze()

    benchmark_result = asyncio.run(
        benchmark(
@@ -877,6 +883,7 @@ def main(args: argparse.Namespace):
            api_url=api_url,
            base_url=base_url,
            model_id=model_id,
+            model_name=model_name,
            tokenizer=tokenizer,
            input_requests=input_requests,
            logprobs=args.logprobs,
@@ -890,7 +897,7 @@ def main(args: argparse.Namespace):
                float(p) for p in args.metric_percentiles.split(",")
            ],
            ignore_eos=args.ignore_eos,
-            gootput_config_dict=gootput_config_dict,
+            goodput_config_dict=goodput_config_dict,
            max_concurrency=args.max_concurrency,
        ))

@@ -919,8 +926,8 @@ def main(args: argparse.Namespace):
                    )

        # Traffic
-        result_json["request_rate"] = (
-            args.request_rate if args.request_rate < float("inf") else "inf")
+        result_json["request_rate"] = (args.request_rate if args.request_rate
+                                       < float("inf") else "inf")
        result_json["burstiness"] = args.burstiness
        result_json["max_concurrency"] = args.max_concurrency

@@ -1222,5 +1229,12 @@ if __name__ == "__main__":
        'always use the slow tokenizer. \n* '
        '"mistral" will always use the `mistral_common` tokenizer.')

+    parser.add_argument("--served-model-name",
+                        type=str,
+                        default=None,
+                        help="The model name used in the API. "
+                        "If not specified, the model name will be the "
+                        "same as the ``--model`` argument. ")
+
    args = parser.parse_args()
    main(args)
--- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
+++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
@@ -3,7 +3,7 @@ import copy
 import itertools
 import pickle as pkl
 import time
-from typing import Callable, Iterable, List, Tuple
+from typing import Callable, Iterable, List, Optional, Tuple

 import torch
 import torch.utils.benchmark as TBenchmark
@@ -12,6 +12,8 @@ from utils import make_rand_tensors
 from weight_shapes import WEIGHT_SHAPES

 from vllm import _custom_ops as ops
+from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+    w8a8_block_fp8_matmul)
 from vllm.utils import FlexibleArgumentParser

 DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
@@ -38,8 +40,15 @@ def bench_fn(label: str, sub_label: str, description: str, fn: Callable, *args,
    ).blocked_autorange(min_run_time=min_run_time)


-def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
-               sub_label: str) -> Iterable[TMeasurement]:
+def bench_int8(
+        dtype: torch.dtype,
+        m: int,
+        k: int,
+        n: int,
+        label: str,
+        sub_label: str,
+        bench_kernels: Optional[List[str]] = None) -> Iterable[TMeasurement]:
+    """Benchmark INT8-based kernels."""
    assert dtype == torch.int8
    a, b = make_rand_tensors(torch.int8, m, n, k)
    scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
@@ -48,155 +57,132 @@ def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
    azp = torch.zeros((m, ), device="cuda", dtype=torch.int32)
    azp_adj = torch.zeros((n, ), device="cuda", dtype=torch.int32)

+    bench_fns = {
+        "pytorch_bf16_bf16_bf16_matmul-no-scales":
+        lambda: torch.mm(a.to(dtype=torch.bfloat16), b.to(dtype=torch.bfloat16)
+                         ),
+        "pytorch_fp16_fp16_fp16_matmul-no-scales":
+        lambda: torch.mm(a.to(dtype=torch.float16), b.to(dtype=torch.float16)),
+        "cutlass_i8_i8_bf16_scaled_mm":
+        lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16),
+        "cutlass_i8_i8_bf16_scaled_mm_bias":
+        lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16,
+                                      bias),
+        "cutlass_i8_i8_bf16_scaled_mm_azp":
+        lambda: ops.cutlass_scaled_mm_azp(a, b, scale_a, scale_b, torch.
+                                          bfloat16, azp_adj),
+        "cutlass_i8_i8_bf16_scaled_mm_azp_bias":
+        lambda: ops.cutlass_scaled_mm_azp(a, b, scale_a, scale_b, torch.
+                                          bfloat16, azp_adj, None, bias),
+        "cutlass_i8_i8_bf16_scaled_mm_azp_pt":
+        lambda: ops.cutlass_scaled_mm_azp(a, b, scale_a, scale_b, torch.
+                                          bfloat16, azp_adj, azp),
+        "cutlass_i8_i8_bf16_scaled_mm_azp_pt_bias":
+        lambda: ops.cutlass_scaled_mm_azp(a, b, scale_a, scale_b, torch.
+                                          bfloat16, azp_adj, azp, bias),
+    }
+
    timers = []
-    # pytorch impl - bfloat16
-    timers.append(
-        bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales",
-                 torch.mm, a.to(dtype=torch.bfloat16),
-                 b.to(dtype=torch.bfloat16)))
-
-    # pytorch impl - float16
-    timers.append(
-        bench_fn(label, sub_label,
-                 "pytorch_fp16_fp16_fp16_matmul-no-scales", torch.mm,
-                 a.to(dtype=torch.float16), b.to(dtype=torch.float16)))
-
-    # cutlass impl
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm",
-                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b,
-                 torch.bfloat16))
-
-    # cutlass with bias
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_bias",
-                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.bfloat16,
-                 bias))
-
-    # cutlass with azp per-tensor
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp",
-                 ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b,
-                 torch.bfloat16, azp_adj))
-
-    # cutlass with azp per-tensor + bias
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp_bias",
-                 ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b,
-                 torch.bfloat16, azp_adj, None, bias))
-
-    # cutlass with azp per-token
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp_pt",
-                 ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b,
-                 torch.bfloat16, azp_adj, azp))
-
-    # cutlass with azp per-token + bias
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp_pt_bias",
-                 ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b,
-                 torch.bfloat16, azp_adj, azp, bias))
+    for name, fn in bench_fns.items():
+        # If bench_kernels is None, run all. Otherwise, run only exact matches.
+        if bench_kernels is None or name in bench_kernels:
+            print(f"Running {name}")
+            timers.append(bench_fn(label, sub_label, name, fn))

    return timers


-def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
-              sub_label: str) -> Iterable[TMeasurement]:
+def bench_fp8(
+        dtype: torch.dtype,
+        m: int,
+        k: int,
+        n: int,
+        label: str,
+        sub_label: str,
+        bench_kernels: Optional[List[str]] = None) -> Iterable[TMeasurement]:
+    """Benchmark FP8-based kernels."""
    assert dtype == torch.float8_e4m3fn
    a, b = make_rand_tensors(torch.float8_e4m3fn, m, n, k)
+    a_cont = a.contiguous()
    scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
    scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
+    block_scale_a = torch.rand((m, k // 128),
+                               device="cuda",
+                               dtype=torch.float32)
+    block_scale_b = torch.rand((k // 128, n // 128),
+                               device="cuda",
+                               dtype=torch.float32)
+    block_scale_a_M_major = block_scale_a.t().contiguous().t()
+    block_scale_b_K_major = block_scale_b.t().contiguous().t()
    bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)

-    timers = []
+    print(m, k, n)
+
+    bench_fns = {
+        "pytorch_bf16_bf16_bf16_matmul-no-scales":
+        lambda: torch.mm(a.to(dtype=torch.bfloat16), b.to(dtype=torch.bfloat16)
+                         ),
+        "pytorch_fp16_fp16_fp16_matmul-no-scales":
+        lambda: torch.mm(a.to(dtype=torch.float16), b.to(dtype=torch.float16)),
+        "pytorch_fp8_fp8_fp16_scaled_mm":
+        lambda: torch._scaled_mm(
+            a, b, scale_a, scale_b, out_dtype=torch.float16),
+        "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum":
+        lambda: torch._scaled_mm(a,
+                                 b,
+                                 scale_a,
+                                 scale_b,
+                                 out_dtype=torch.float16,
+                                 use_fast_accum=True),
+        "pytorch_fp8_fp8_bf16_scaled_mm":
+        lambda: torch._scaled_mm(
+            a, b, scale_a, scale_b, out_dtype=torch.bfloat16),
+        "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum":
+        lambda: torch._scaled_mm(a,
+                                 b,
+                                 scale_a,
+                                 scale_b,
+                                 out_dtype=torch.bfloat16,
+                                 use_fast_accum=True),
+        "cutlass_fp8_fp8_bf16_scaled_mm":
+        lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16),
+        "cutlass_fp8_fp8_fp16_scaled_mm":
+        lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.float16),
+        "cutlass_fp8_fp8_bf16_scaled_mm_bias":
+        lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16,
+                                      bias),
+        "cutlass_fp8_fp8_fp16_scaled_mm_bias":
+        lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.float16,
+                                      bias.to(dtype=torch.float16)),
+        "triton_fp8_fp8_fp16_scaled_mm_blockwise":
+        lambda: w8a8_block_fp8_matmul(a_cont, b.t(), block_scale_a,
+                                      block_scale_b.t(), (128, 128)),
+        "cutlass_fp8_fp8_fp16_scaled_mm_blockwise":
+        lambda: ops.cutlass_scaled_mm(a, b, block_scale_a_M_major,
+                                      block_scale_b_K_major, torch.float16),
+    }

-    # pytorch impl w. bf16
-    timers.append(
-        bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales",
-                 torch.mm, a.to(dtype=torch.bfloat16, device="cuda"),
-                 b.to(dtype=torch.bfloat16, device="cuda")))
-
-    # pytorch impl: bf16 output, without fp8 fast accum
-    timers.append(
-        bench_fn(label,
-                 sub_label,
-                 "pytorch_fp8_fp8_bf16_scaled_mm",
-                 torch._scaled_mm,
-                 a,
-                 b,
-                 scale_a=scale_a,
-                 scale_b=scale_b,
-                 out_dtype=torch.bfloat16))
-
-    # pytorch impl: bf16 output, with fp8 fast accum
-    timers.append(
-        bench_fn(label,
-                 sub_label,
-                 "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum",
-                 torch._scaled_mm,
-                 a,
-                 b,
-                 scale_a=scale_a,
-                 scale_b=scale_b,
-                 out_dtype=torch.bfloat16,
-                 use_fast_accum=True))
-
-    # pytorch impl: fp16 output, without fp8 fast accum
-    timers.append(
-        bench_fn(label,
-                 sub_label,
-                 "pytorch_fp8_fp8_fp16_scaled_mm",
-                 torch._scaled_mm,
-                 a,
-                 b,
-                 scale_a=scale_a,
-                 scale_b=scale_b,
-                 out_dtype=torch.float16))
-
-    # pytorch impl: fp16 output, with fp8 fast accum
-    timers.append(
-        bench_fn(label,
-                 sub_label,
-                 "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum",
-                 torch._scaled_mm,
-                 a,
-                 b,
-                 scale_a=scale_a,
-                 scale_b=scale_b,
-                 out_dtype=torch.float16,
-                 use_fast_accum=True))
-
-    # cutlass impl: bf16 output
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_mm",
-                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b,
-                 torch.bfloat16))
-    # cutlass impl: fp16 output
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_fp8_fp8_fp16_scaled_mm",
-                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.float16))
-
-    # cutlass impl: bf16 output, with bias
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_mm_bias",
-                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.bfloat16,
-                 bias))
-
-    # cutlass impl: fp16 output, with bias
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_fp8_fp8_fp16_scaled_mm_bias",
-                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.float16,
-                 bias.to(dtype=torch.float16)))
+    timers = []
+    for name, fn in bench_fns.items():
+        # If bench_kernels is None, run all. Otherwise, run only exact matches.
+        if bench_kernels is None or name in bench_kernels:
+            print(f"Running {name}")
+            timers.append(bench_fn(label, sub_label, name, fn))

    return timers


-def bench(dtype: torch.dtype, m: int, k: int, n: int, label: str,
-          sub_label: str) -> Iterable[TMeasurement]:
+def bench(dtype: torch.dtype,
+          m: int,
+          k: int,
+          n: int,
+          label: str,
+          sub_label: str,
+          bench_kernels: Optional[List[str]] = None) -> Iterable[TMeasurement]:
    if dtype == torch.int8:
-        return bench_int8(dtype, m, k, n, label, sub_label)
+        return bench_int8(dtype, m, k, n, label, sub_label, bench_kernels)
    if dtype == torch.float8_e4m3fn:
-        return bench_fp8(dtype, m, k, n, label, sub_label)
+        return bench_fp8(dtype, m, k, n, label, sub_label, bench_kernels)
    raise ValueError("unsupported type")


@@ -207,18 +193,22 @@ def print_timers(timers: Iterable[TMeasurement]):


 def run(dtype: torch.dtype,
-        MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]:
+        MKNs: Iterable[Tuple[int, int, int]],
+        bench_kernels: Optional[List[str]] = None) -> Iterable[TMeasurement]:
    results = []
    for m, k, n in MKNs:
-        timers = bench(dtype, m, k, n, f"scaled-{dtype}-gemm",
-                       f"MKN=({m}x{k}x{n})")
+        timers = bench(dtype,
+                       m,
+                       k,
+                       n,
+                       f"scaled-{dtype}-gemm",
+                       f"MKN=({m}x{k}x{n})",
+                       bench_kernels=bench_kernels)
        print_timers(timers)
        results.extend(timers)
-
    return results


-# output makers
 def make_output(data: Iterable[TMeasurement],
                MKNs: Iterable[Tuple[int, int, int]],
                base_description: str,
@@ -232,15 +222,11 @@ def make_output(data: Iterable[TMeasurement],
        pkl.dump(data, f)


-# argparse runners
-
-
 def run_square_bench(args):
    dim_sizes = list(
        range(args.dim_start, args.dim_end + 1, args.dim_increment))
    MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes))
-    data = run(args.dtype, MKNs)
-
+    data = run(args.dtype, MKNs, bench_kernels=args.kernels)
    make_output(data, MKNs, f"square_bench-{args.dtype}")


@@ -251,8 +237,7 @@ def run_range_bench(args):
    Ks = [args.k_constant] * n if args.k_constant is not None else dim_sizes
    Ns = [args.n_constant] * n if args.n_constant is not None else dim_sizes
    MKNs = list(zip(Ms, Ks, Ns))
-    data = run(args.dtype, MKNs)
-
+    data = run(args.dtype, MKNs, bench_kernels=args.kernels)
    make_output(data, MKNs, f"range_bench-{args.dtype}")


@@ -278,7 +263,7 @@ def run_model_bench(args):
            for k, n in KNs:
                MKNs.append((m, k, n))

-        data = run(args.dtype, MKNs)
+        data = run(args.dtype, MKNs, bench_kernels=args.kernels)
        model_bench_data.append(data)

    # Print all results
@@ -328,6 +313,15 @@ Benchmark Cutlass GEMM.
                        type=to_torch_dtype,
                        required=True,
                        help="Available options are ['int8', 'fp8']")
+    parser.add_argument(
+        "--kernels",
+        nargs="+",
+        type=str,
+        default=None,
+        help=
+        "Exact names of the kernels to benchmark. If not set, runs all kernels."
+    )
+
    subparsers = parser.add_subparsers(dest="cmd")

    square_parser = subparsers.add_parser("square_bench")
@@ -362,4 +356,4 @@ Benchmark Cutlass GEMM.
    model_parser.set_defaults(func=run_model_bench)

    args = parser.parse_args()
-    args.func(args)
\ No newline at end of file
+    args.func(args)
--- a/benchmarks/kernels/benchmark_lora.py
+++ b/benchmarks/kernels/benchmark_lora.py
+import argparse
+import copy
+import json
+import pickle
+import time
+from dataclasses import dataclass
+from enum import Enum, auto
+from itertools import product
+from pathlib import Path
+from typing import Any, Callable, Dict, List, Optional, Tuple
+
+import torch
+import torch.utils.benchmark as TBenchmark
+from torch.utils.benchmark import Measurement as TMeasurement
+from utils import ArgPool, Bench, CudaGraphBenchParams
+from weight_shapes import WEIGHT_SHAPES
+
+from vllm.lora.ops.triton_ops.bgmv_expand import bgmv_expand
+from vllm.lora.ops.triton_ops.bgmv_expand_slice import bgmv_expand_slice
+from vllm.lora.ops.triton_ops.bgmv_shrink import bgmv_shrink
+from vllm.lora.ops.triton_ops.sgmv_expand import sgmv_expand
+from vllm.lora.ops.triton_ops.sgmv_shrink import sgmv_shrink
+from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
+from vllm.utils import FlexibleArgumentParser
+
+# Default benchmark parameter values.
+# NOTE(review): the code that consumes these defaults is not visible in this
+# part of the file; presumably they seed the command-line defaults - confirm.
+# Every model name that has an entry in WEIGHT_SHAPES.
+DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
+DEFAULT_TP_SIZES = [1]
+DEFAULT_BATCH_SIZES = [
+    1, 16, 32, 64, 128, 192, 256, 320, 384, 448, 512, 640, 768, 896, 1024,
+    2048, 3072, 4096, 5120, 6144, 7168, 8192
+]
+DEFAULT_HIDDEN_SIZES = [1024, 2048, 4096, 8192, 16384]
+DEFAULT_LORA_RANKS = [16]
+DEFAULT_NUM_LORAS = [1, 2, 3, 4]
+DEFAULT_SORT_BY_LORA_IDS = [False, True]
+DEFAULT_SEQ_LENGTHS = [1]
+DEFAULT_EXPAND_FN_ADD_INPUTS = [True, False]
+
+
+# Utilities
+def dtype_to_str(dtype: torch.dtype):
+    """
+    Return the short tag used for a torch dtype: "f16", "bf16" or "f32".
+
+    Raises ValueError for any other dtype.
+    """
+    known_tags = (
+        (torch.float16, "f16"),
+        (torch.bfloat16, "bf16"),
+        (torch.float32, "f32"),
+    )
+    for candidate, tag in known_tags:
+        if dtype == candidate:
+            return tag
+    raise ValueError(f"Unsupported dtype {dtype}")
+
+
+def make_rand_lora_weight_tensor(k: int,
+                                 n: int,
+                                 num_loras: int,
+                                 dtype: torch.dtype,
+                                 device: str = "cuda") -> torch.Tensor:
+    """
+    Build a random LoRA weight tensor of shape (num_loras, n, k).
+
+    The values are drawn with torch.rand and the result is then moved to
+    `device`.
+    """
+    # LoRA weights column major
+    weight_shape = (num_loras, n, k)
+    weights = torch.rand(weight_shape, dtype=dtype)
+    return weights.to(device)
+
+
+def make_rand_tensors(
+    a_shape: Tuple[int],
+    b_shape: Tuple[int],
+    c_shape: Tuple[int],
+    a_dtype: torch.dtype,
+    b_dtype: torch.dtype,
+    c_dtype: torch.dtype,
+    num_slices: int,
+    device: str = "cuda",
+) -> Tuple[torch.Tensor, List[torch.Tensor], torch.Tensor]:
+    """
+    Create the LoRA matmul operands.
+
+    Returns a tuple of: a random tensor of shape `a_shape`, a list of
+    `num_slices` random tensors of shape `b_shape`, and a zero tensor of
+    shape `c_shape`. Each tensor is created first and then moved to `device`.
+    """
+    input_mat = torch.rand(a_shape, dtype=a_dtype).to(device)
+
+    # LoRA weights column major
+    weight_slices = []
+    for _ in range(num_slices):
+        weight_slices.append(torch.rand(b_shape, dtype=b_dtype).to(device))
+
+    output_mat = torch.zeros(c_shape, dtype=c_dtype).to(device)
+    return input_mat, weight_slices, output_mat
+
+
+def make_prompt_lora_mapping(num_prompts: int, num_active_loras: int,
+                             sort_by_lora_id: bool,
+                             device: str) -> torch.Tensor:
+    """
+    All prompts are mapped to a Lora ID in range [0, num_active_loras).
+    where 0 refers to first lora, 1 refers to second lora and so on.
+
+    When sort_by_lora_id is False, the IDs are drawn uniformly at random.
+    Otherwise the prompts are split into consecutive runs of
+    max(num_prompts // num_active_loras, 1) prompts, one run per LoRA ID in
+    increasing order; any remaining prompts get the last LoRA ID.
+
+    Returns a torch.long tensor of shape (num_prompts, ) on `device`.
+    """
+    assert num_active_loras > 0
+
+    if not sort_by_lora_id:
+        # Sample first and then move, so that the result lands on `device`
+        # just like the sorted branch below.
+        return torch.randint(0,
+                             num_active_loras, (num_prompts, ),
+                             dtype=torch.long).to(device)
+
+    # Divide LoRAs equally and in order.
+    part_size = max(num_prompts // num_active_loras, 1)
+    last_lora_id = num_active_loras - 1
+    prompt_lora_mapping = [
+        min(prompt_idx // part_size, last_lora_id)
+        for prompt_idx in range(num_prompts)
+    ]
+    return torch.tensor(prompt_lora_mapping,
+                        dtype=torch.long,
+                        device=device)
+
+
+def make_token_lora_mapping(num_tokens: int, num_prompts: int,
+                            prompt_lora_mapping: torch.Tensor,
+                            seq_len_tensor: torch.Tensor, device: str):
+    """
+    Expand the per-prompt LoRA IDs in prompt_lora_mapping to per-token LoRA
+    IDs, repeating each prompt's ID seq_len_tensor[prompt] times.
+
+    Returns a torch.long tensor on `device`. Positions not covered by any
+    prompt keep the initial value 0.
+    """
+    assert prompt_lora_mapping.shape[0] == num_prompts
+
+    # token to lora index mapping
+    per_token_lora = [0] * num_tokens
+    start = 0
+    for prompt_idx in range(num_prompts):
+        prompt_len = seq_len_tensor[prompt_idx].item()
+        stop = start + prompt_len
+        lora_id = prompt_lora_mapping[prompt_idx].item()
+        per_token_lora[start:stop] = [lora_id] * prompt_len
+        start = stop
+
+    return torch.tensor(per_token_lora, dtype=torch.long, device=device)
+
+
+def ref_group_gemm(ref_out: torch.Tensor, input: torch.Tensor,
+                   lora_weights: List[torch.Tensor],
+                   seq_lens_cpu: torch.Tensor,
+                   prompt_lora_mapping_cpu: torch.Tensor, scaling: float,
+                   add_inputs: Optional[bool]):
+    """
+    Torch group gemm reference implementation to test correctness of
+    benchmarking operations.
+
+    The rows of `input` are split into consecutive groups whose lengths are
+    given by `seq_lens_cpu`. Group i is passed through
+    torch.nn.functional.linear with the weight
+    lora_weights[prompt_lora_mapping_cpu[i]] and multiplied by `scaling`.
+    The concatenated result is added to `ref_out` when `add_inputs` is
+    truthy and copied into `ref_out` otherwise. Nothing is returned.
+    """
+    out_list = []
+    current_offset = 0
+    for batch_idx, b_length in enumerate(seq_lens_cpu.tolist()):
+        x = input[current_offset:current_offset + b_length, :]
+        current_offset += b_length
+        w = lora_weights[prompt_lora_mapping_cpu[batch_idx]]
+        result = torch.nn.functional.linear(x, w)
+        result *= scaling
+        out_list.append(result)
+
+    # Concatenate once; the result is written into ref_out below.
+    cat_result = torch.cat(out_list, dim=0)
+
+    if add_inputs:
+        ref_out += cat_result
+    else:
+        ref_out.copy_(cat_result)
+
+
+class OpType(Enum):
+    """
+    LoRA Ops to benchmark and its properties.
+    """
+    SGMV_SHRINK = auto()
+    BGMV_SHRINK = auto()
+    SGMV_EXPAND = auto()
+    BGMV_EXPAND = auto()
+    BGMV_EXPAND_SLICE = auto()
+
+    @staticmethod
+    def from_str(s: str) -> "OpType":
+        """
+        Convert an op name such as "sgmv_shrink" (case-insensitive) into the
+        matching OpType. Raises ValueError for an unknown name.
+        """
+        wanted = s.lower()
+        for op_type in OpType:
+            if op_type.name.lower() == wanted:
+                return op_type
+        raise ValueError(f"Unrecognized str {s} to convert to OpType")
+
+    def is_shrink_fn(self) -> bool:
+        """True for SGMV_SHRINK and BGMV_SHRINK."""
+        return self in [OpType.SGMV_SHRINK, OpType.BGMV_SHRINK]
+
+    def is_expand_fn(self) -> bool:
+        """
+        True for SGMV_EXPAND and BGMV_EXPAND. BGMV_EXPAND_SLICE is reported
+        by is_expand_slice_fn() instead.
+        """
+        return self in [OpType.SGMV_EXPAND, OpType.BGMV_EXPAND]
+
+    def is_prefill_op(self) -> bool:
+        """True for the SGMV ops."""
+        return self in [OpType.SGMV_SHRINK, OpType.SGMV_EXPAND]
+
+    def is_decode_op(self) -> bool:
+        """True for the BGMV ops."""
+        return self in [
+            OpType.BGMV_SHRINK, OpType.BGMV_EXPAND, OpType.BGMV_EXPAND_SLICE
+        ]
+
+    def is_expand_slice_fn(self) -> bool:
+        """True only for BGMV_EXPAND_SLICE."""
+        return self in [OpType.BGMV_EXPAND_SLICE]
+
+    def num_slices(self) -> List[int]:
+        """
+        Return the list of num_slices values associated with this op.
+        """
+        if self in [OpType.SGMV_EXPAND, OpType.SGMV_SHRINK]:
+            # SGMV kernels supports slices
+            return [1, 2, 3]
+        if self in [OpType.BGMV_SHRINK, OpType.BGMV_EXPAND]:
+            return [1]
+        if self in [OpType.BGMV_EXPAND_SLICE]:
+            return [2, 3]
+        raise ValueError(f"Unrecognized OpType {self}")
+
+    def mkn(self, batch_size: int, seq_length: int, hidden_size: int,
+            lora_rank: int) -> Tuple[int, int, int]:
+        """
+        Return (m, k, n) for this op, with m = batch_size * seq_length.
+        Shrink ops use k = hidden_size and n = lora_rank; expand and
+        expand-slice ops use k = lora_rank and n = hidden_size.
+        """
+        num_tokens = batch_size * seq_length
+        if self.is_shrink_fn():
+            m = num_tokens
+            k = hidden_size
+            n = lora_rank
+        else:
+            assert self.is_expand_fn() or self.is_expand_slice_fn()
+            m = num_tokens
+            k = lora_rank
+            n = hidden_size
+        return m, k, n
+
+    def matmul_dtypes(
+            self, op_dtype: torch.dtype
+    ) -> Tuple[torch.dtype, torch.dtype, torch.dtype]:
+        """
+        return a type, b type and c type for A x B = C
+        """
+        if self.is_shrink_fn():
+            return op_dtype, op_dtype, torch.float32
+        else:
+            assert self.is_expand_fn() or self.is_expand_slice_fn()
+            return torch.float32, op_dtype, op_dtype
+
+    def matmul_shapes(
+            self, batch_size: int, seq_length: int, hidden_size: int,
+            lora_rank: int, num_loras: int,
+            num_slices: int) -> Tuple[Tuple[int], Tuple[int], Tuple[int]]:
+        """
+        Given num_slices, return the shapes of the A, B, and C matrices
+        in A x B = C, for the op_type
+        """
+        m, k, n = self.mkn(batch_size, seq_length, hidden_size, lora_rank)
+
+        b_shape = (num_loras, n, k)  # col-major
+        if self == OpType.SGMV_SHRINK:
+            # SGMV shrink supports num_slices inherently in the kernel
+            return ((m, k), b_shape, (num_slices, m, n))
+        if self == OpType.SGMV_EXPAND:
+            # SGMV expand supports num_slices inherently in the kernel
+            return ((num_slices, m, k), b_shape, (m, n * num_slices))
+        if self == OpType.BGMV_SHRINK:
+            return ((m, k), b_shape, (m, n))
+        if self == OpType.BGMV_EXPAND:
+            return ((m, k), b_shape, (m, n))
+        if self == OpType.BGMV_EXPAND_SLICE:
+            return ((num_slices, m, k), b_shape, (m, n * num_slices))
+
+        raise ValueError(f"Unrecognized op_type {self}")
+
+    def bench_fn(self) -> Callable:
+        """
+        Return the callable to benchmark for this op. For BGMV_EXPAND_SLICE
+        this is a wrapper that calls bgmv_expand_slice once per kwargs dict
+        in the list it is given.
+        """
+
+        def emulate_bgmv_expand_slice(kwargs_list: List[Dict[str, Any]]):
+            for x in kwargs_list:
+                bgmv_expand_slice(**x)
+
+        if self == OpType.SGMV_SHRINK:
+            return sgmv_shrink
+        if self == OpType.SGMV_EXPAND:
+            return sgmv_expand
+        if self == OpType.BGMV_SHRINK:
+            return bgmv_shrink
+        if self == OpType.BGMV_EXPAND:
+            return bgmv_expand
+        if self == OpType.BGMV_EXPAND_SLICE:
+            return emulate_bgmv_expand_slice
+        raise ValueError(f"Unrecognized optype {self}")
+
+    def run_ref_group_gemm(self, output: torch.Tensor, input: torch.Tensor,
+                           lora_weights: List[torch.Tensor],
+                           **kwargs) -> None:
+        """Each benchmark operation expected the input, lora_weights and outputs
+           in a slightly different format. Refer to self.matmul_shapes().
+           run_ref_group_gemm accounts for those differences in executing a
+           reference group gemm for correctness testing.
+
+           The result is written into `output`; nothing is returned.
+           ValueError is raised only when the op type is not handled.
+        """
+        w_dtype = lora_weights[0].dtype
+        num_slices = len(lora_weights)
+        if self == OpType.SGMV_SHRINK:
+            for slice_idx in range(num_slices):
+                ref_group_gemm(ref_out=output[slice_idx, :],
+                               input=input,
+                               lora_weights=lora_weights[slice_idx],
+                               **kwargs)
+        elif self in [OpType.SGMV_EXPAND, OpType.BGMV_EXPAND_SLICE]:
+            # Both ops take one input per slice and write each slice into
+            # its own column range of the output.
+            hidden_size = lora_weights[0].shape[1]
+            for slice_idx in range(num_slices):
+                slice_offset = slice_idx * hidden_size
+                ref_group_gemm(
+                    ref_out=output[:, slice_offset:slice_offset + hidden_size],
+                    input=input[slice_idx].clone().to(dtype=w_dtype),
+                    lora_weights=lora_weights[slice_idx],
+                    **kwargs)
+        elif self == OpType.BGMV_SHRINK:
+            assert num_slices == 1
+            ref_group_gemm(ref_out=output,
+                           input=input,
+                           lora_weights=lora_weights[0],
+                           **kwargs)
+        elif self == OpType.BGMV_EXPAND:
+            assert num_slices == 1
+            ref_group_gemm(ref_out=output,
+                           input=input.clone().to(dtype=w_dtype),
+                           lora_weights=lora_weights[0],
+                           **kwargs)
+        else:
+            raise ValueError(f"Unrecognized optype {self}")
+
+
+@dataclass
+class BenchmarkContext:
+    """
+    Parameters describing one LoRA benchmark configuration.
+    """
+    batch_size: int
+    hidden_size: int
+    num_loras: int
+    num_active_loras: int
+    lora_rank: int
+    sort_by_lora_id: bool
+    dtype: torch.dtype
+    seq_length: Optional[int] = None
+    num_slices: Optional[int] = None  # num_slices for slice based ops
+
+    def with_seq_length(self, seq_length: int) -> "BenchmarkContext":
+        """Return a shallow copy of this context with seq_length replaced."""
+        updated = copy.copy(self)
+        updated.seq_length = seq_length
+        return updated
+
+    def with_num_slices(self, num_slices: int) -> "BenchmarkContext":
+        """Return a shallow copy of this context with num_slices replaced."""
+        updated = copy.copy(self)
+        updated.num_slices = num_slices
+        return updated
+
+    def bench_label(self) -> str:
+        """Return the benchmark label, which embeds the dtype."""
+        return f"lora-{self.dtype}"
+
+    def bench_sublabel(self, op_type: OpType) -> str:
+        """
+        Return a JSON string describing this configuration, including the
+        m, k, n that op_type.mkn() reports for it.
+        """
+        m, k, n = op_type.mkn(self.batch_size, self.seq_length,
+                              self.hidden_size, self.lora_rank)
+        # Keyword order is the key order of the emitted JSON.
+        desc = dict(
+            bs=self.batch_size,
+            sl=self.seq_length,
+            m=m,
+            k=k,
+            n=n,
+            num_loras=self.num_loras,
+            sort_by_lora=self.sort_by_lora_id,
+            num_slices=self.num_slices,
+        )
+        return json.dumps(desc)
+
+
+@dataclass
+class BenchmarkTensors:
+    """
+    Input/Output tensors used for benchmarks
+    """
+    # matmul tensors
+    input: torch.Tensor
+    lora_weights_lst: List[torch.Tensor]
+    output: torch.Tensor
+    # metadata tensors
+    seq_lens: torch.Tensor
+    seq_start_loc: torch.Tensor
+    prompt_lora_mapping: torch.Tensor
+    token_lora_mapping: torch.Tensor
+
+    def io_types(self) -> str:
+        """
+        Return "<input>x<weight>=><output>" built from the dtype_to_str tags
+        of the input, the first LoRA weight tensor and the output.
+        """
+        in_tag = dtype_to_str(self.input.dtype)
+        weight_tag = dtype_to_str(self.lora_weights_lst[0].dtype)
+        out_tag = dtype_to_str(self.output.dtype)
+        return f"{in_tag}x{weight_tag}=>{out_tag}"
+
+    @staticmethod
+    def make(ctx: BenchmarkContext,
+             op_type: OpType,
+             device: str = "cuda") -> "BenchmarkTensors":
+        """
+        Build the benchmark tensors for `op_type` under the configuration
+        `ctx`.
+
+        The matmul tensors (input, LoRA weights, output) are created on
+        `device`. The metadata tensors (seq_lens, seq_start_loc and the
+        prompt/token LoRA mappings) are created on the CPU.
+        """
+
+        # Make input / output matmul tensors.
+        a_shape, b_shape, c_shape = op_type.matmul_shapes(
+            ctx.batch_size, ctx.seq_length, ctx.hidden_size, ctx.lora_rank,
+            ctx.num_loras, ctx.num_slices)
+        a_type, b_type, c_type = op_type.matmul_dtypes(ctx.dtype)
+        # Forward `device` so the caller's choice is honored.
+        input_tensor, lora_weights, output_tensor = \
+            make_rand_tensors(a_shape, b_shape, c_shape, a_type, b_type, c_type,
+                              num_slices = ctx.num_slices, device = device)
+
+        # Make metadata tensors.
+        # Keep the metadata tensors in the CPU for further processing if needed.
+        # The tensors get moved to the GPU before benchmarking.
+        assert ctx.num_active_loras <= ctx.num_loras
+        total_tokens = ctx.batch_size * ctx.seq_length
+
+        # Prepare seq lens tensor
+        # randint over [seq_length, seq_length + 1) yields seq_length for
+        # every entry.
+        seq_len_tensor = torch.randint(ctx.seq_length, ctx.seq_length + 1,
+                                       (ctx.batch_size, ))
+        # Prepare seq_start_loc tensor
+        seq_start_loc_tensor = torch.cumsum(torch.tensor(
+            [0] + seq_len_tensor[:-1].tolist(), dtype=torch.long),
+                                            dim=0)
+        assert total_tokens == seq_len_tensor.sum()
+        # Prepare prompt lora indices tensor
+        prompt_lora_indices_tensor = make_prompt_lora_mapping(
+            ctx.batch_size, ctx.num_active_loras, ctx.sort_by_lora_id, "cpu")
+        # Prepare token lora indices tensor
+        token_lora_indices_tensor = make_token_lora_mapping(
+            total_tokens, ctx.batch_size, prompt_lora_indices_tensor,
+            seq_len_tensor, "cpu")
+
+        return BenchmarkTensors(input_tensor, lora_weights, output_tensor,
+                                seq_len_tensor, seq_start_loc_tensor,
+                                prompt_lora_indices_tensor,
+                                token_lora_indices_tensor)
+
+    def sanity_check(self) -> None:
+        """
+        Assert that the metadata tensors agree with the input tensor: the
+        sequence lengths sum to the token count, seq_start_loc and
+        prompt_lora_mapping have one entry per sequence, and
+        token_lora_mapping has one entry per token.
+        """
+        # The token count is the second-to-last dimension of the input.
+        expected_tokens = self.input.shape[-2]
+        # check metadata tensors
+        assert torch.sum(self.seq_lens) == expected_tokens
+        expected_seqs = self.seq_lens.shape[0]
+        assert self.seq_start_loc.shape[0] == expected_seqs
+        assert self.prompt_lora_mapping.shape[0] == expected_seqs
+        assert self.token_lora_mapping.shape[0] == expected_tokens
+
+    def to_device(self, device: str):
+        """
+        Transfer tensors to device if the tensors aren't already on the device
+
+        `device` may be a device string or a torch.device. Every tensor
+        attribute and every entry of lora_weights_lst is replaced by the
+        result of Tensor.to(device=device).
+        """
+
+        def to_device(tensor: torch.Tensor):
+            # No explicit "already there" check: comparing tensor.device with
+            # a str never reports equality, and Tensor.to() already returns
+            # the tensor itself when it is on the requested device.
+            return tensor.to(device=device)
+
+        self.input = to_device(self.input)
+        self.output = to_device(self.output)
+        self.seq_lens = to_device(self.seq_lens)
+        self.seq_start_loc = to_device(self.seq_start_loc)
+        self.prompt_lora_mapping = to_device(self.prompt_lora_mapping)
+        self.token_lora_mapping = to_device(self.token_lora_mapping)
+        for i in range(len(self.lora_weights_lst)):
+            self.lora_weights_lst[i] = to_device(self.lora_weights_lst[i])
+
+    def metadata(self) -> Tuple[int, int, int, int]:
+        """
+        Return num_seqs, num_tokens, max_seq_len and num_slices
+        """
+        num_seqs = self.seq_lens.shape[0]
+        num_tokens = self.token_lora_mapping.shape[0]
+        max_seq_len = torch.max(self.seq_lens).item()
+        # One LoRA weight tensor is held per slice.
+        num_slices = len(self.lora_weights_lst)
+        return num_seqs, num_tokens, max_seq_len, num_slices
+
+    def convert_to_sgmv_benchmark_tensors(self):
+        """
+        For sgmv punica kernels, when consecutive sequences have the
+        same LoRA ID, we just merge them together.
+        This happens in punica.py::compute_metadata
+        """
+
+        # Collapse seq_lens and seq_start_loc
+        _, seq_lens = torch.unique_consecutive(self.token_lora_mapping,
+                                               return_counts=True)
+        cum_result = torch.cumsum(seq_lens, dim=0)
+        seq_start_loc = torch.zeros_like(seq_lens)
+        seq_start_loc[1:].copy_(cum_result[:-1])
+
+        # Collapse prompt mapping
+        prompt_lora_mapping = torch.unique_consecutive(
+            self.prompt_lora_mapping)
+
+        assert torch.sum(seq_lens) == torch.sum(self.seq_lens), \
+         f"dont match - new {torch.sum(seq_lens)} vs {torch.sum(self.seq_lens)}"
+
+        self.prompt_lora_mapping = prompt_lora_mapping.to(
+            dtype=self.prompt_lora_mapping.dtype)
+        self.seq_lens = seq_lens.to(dtype=self.seq_lens.dtype)
+        self.seq_start_loc = seq_start_loc.to(dtype=self.seq_start_loc.dtype)
+
+    def as_sgmv_shrink_kwargs(self) -> Dict[str, Any]:
+        self.convert_to_sgmv_benchmark_tensors()
+        self.sanity_check()
+        self.to_device(self.input.device)
+
+        num_seqs, num_tokens, max_seq_len, num_slices = self.metadata()
+
+        # Sanity check matrix shapes.
+        i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[
+            0].shape, self.output.shape
+        # Expected input shape [num_tokens, hidden_size]
+        assert len(i_shape) == 2
+        assert i_shape[0] == num_tokens
+        hidden_size = i_shape[1]
+        # Expected lora weight shape [num_loras, lora_rank, hidden_size]
+        assert len(lw_shape) == 3
+        assert lw_shape[2] == hidden_size
+        lora_rank = lw_shape[1]
+        # Expected output shape [num_slices, num_tokens, lora_rank]
+        assert len(o_shape) == 3
+        assert o_shape == (num_slices, num_tokens, lora_rank)
+
+        return {
+            'inputs': self.input,
+            'lora_a_weights': self.lora_weights_lst,
+            'output_tensor': self.output,
+            'b_seq_start_loc': self.seq_start_loc,
+            'seq_len_tensor': self.seq_lens,
+            'lora_indices_tensor': self.prompt_lora_mapping,
+            'batches': num_seqs,
+            'max_seq_length': max_seq_len,
+            'token_nums': num_tokens,
+            'scaling': 1.0,
+        }
+
+    def as_sgmv_expand_kwargs(self, add_inputs: bool) -> Dict[str, Any]:
+
+        self.convert_to_sgmv_benchmark_tensors()
+        self.sanity_check()
+        self.to_device(self.input.device)
+
+        num_seqs, num_tokens, max_seq_len, num_slices = self.metadata()
+
+        # Sanity check matrix shapes.
+        i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[
+            0].shape, self.output.shape
+        # Expected input shape : [num_slices, num_tokens, lora_rank]
+        assert len(i_shape) == 3
+        assert i_shape[0] == num_slices
+        assert i_shape[1] == num_tokens
+        lora_rank = i_shape[2]
+        # Expected lora weight shape : [num_lora, hidden_size, lora_rank]
+        assert len(lw_shape) == 3
+        assert lw_shape[2] == lora_rank
+        hidden_size = lw_shape[1]
+        # Expected output shape : [num_tokens, hidden_size * num_slices]
+        assert len(o_shape) == 2
+        assert o_shape == (num_tokens, hidden_size * num_slices)
+
+        return {
+            'inputs': self.input,
+            'lora_b_weights': self.lora_weights_lst,
+            'output_tensor': self.output,
+            'b_seq_start_loc': self.seq_start_loc,
+            'seq_len_tensor': self.seq_lens,
+            'lora_indices_tensor': self.prompt_lora_mapping,
+            'batches': num_seqs,
+            'max_seq_length': max_seq_len,
+            'token_nums': num_tokens,
+            'offset_start': 0,
+            'add_inputs': add_inputs,
+        }
+
+    def as_bgmv_shrink_kwargs(self) -> Dict[str, Any]:
+        assert len(self.lora_weights_lst) == 1
+        self.to_device(self.input.device)
+
+        _, num_tokens, _, _ = self.metadata()
+        # Sanity check shapes
+        i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[
+            0].shape, self.output.shape
+        # Expected input shape [num_tokens, hidden_size]
+        assert len(i_shape) == 2
+        assert i_shape[0] == num_tokens
+        hidden_size = i_shape[1]
+        # Expected lora weight shape [num_loras, lora_rank, hidden_size]
+        assert len(lw_shape) == 3
+        assert lw_shape[2] == hidden_size
+        lora_rank = lw_shape[1]
+        # Expected output shape [num_tokens, lora_rank]
+        assert len(o_shape) == 2
+        assert o_shape == (num_tokens, lora_rank)
+
+        return {
+            'inputs': self.input,
+            'lora_a_weights': self.lora_weights_lst[0],
+            'output_tensor': self.output,
+            'lora_indices_tensor': self.token_lora_mapping,
+            'scaling': 1.0
+        }
+
+    def as_bgmv_expand_kwargs(self, add_inputs: bool):
+        assert len(self.lora_weights_lst) == 1
+        self.to_device(self.input.device)
+
+        _, num_tokens, _, _ = self.metadata()
+        # Sanity check shapes
+        i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[
+            0].shape, self.output.shape
+        # Expected input shape [num_tokens, lora_rank]
+        assert len(i_shape) == 2
+        assert i_shape[0] == num_tokens
+        lora_rank = i_shape[1]
+        # Expected lora weight shape [num_loras, hidden_size, lora_rank]
+        assert len(lw_shape) == 3
+        assert lw_shape[2] == lora_rank
+        hidden_size = lw_shape[1]
+        # Expected output shape [num_tokens, hidden_size]
+        assert len(o_shape) == 2
+        assert o_shape == (num_tokens, hidden_size)
+
+        return {
+            'inputs': self.input,
+            'lora_b_weights': self.lora_weights_lst[0],
+            'output_tensor': self.output,
+            'lora_indices_tensor': self.token_lora_mapping,
+            'add_inputs': add_inputs
+        }
+
+    def as_bgmv_expand_slice_kwargs(self, add_inputs: bool) -> Dict[str, Any]:
+
+        _, num_tokens, _, num_slices = self.metadata()
+        # Sanity check shapes
+        i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[
+            0].shape, self.output.shape
+        # Expected input shape [num_slices, num_tokens, lora_rank]
+        assert len(i_shape) == 3
+        assert i_shape[0] == num_slices
+        assert i_shape[1] == num_tokens
+        lora_rank = i_shape[2]
+        # Expected lora weight shape [num_loras, hidden_size, lora_rank]
+        assert len(lw_shape) == 3
+        assert lw_shape[2] == lora_rank
+        hidden_size = lw_shape[1]
+        # Expected output shape [num_tokens, hidden_size * num_slices]
+        assert len(o_shape) == 2
+        assert o_shape == (num_tokens, hidden_size * num_slices)
+
+        self.to_device(self.input.device)
+
+        kwargs_list = []
+        for i in range(num_slices):
+            kwargs_list.append({
+                'inputs': self.input[i],
+                'lora_b_weights': self.lora_weights_lst[i],
+                'output_tensor': self.output,
+                'lora_indices_tensor': self.token_lora_mapping,
+                'slice_offset': i * hidden_size,
+                'slice_size': hidden_size,
+                'add_inputs': add_inputs,
+            })
+        return {'kwargs_list': kwargs_list}
+
+    def bench_fn_kwargs(self,
+                        op_type: OpType,
+                        add_inputs: Optional[bool] = None) -> Dict[str, Any]:
+        if op_type.is_shrink_fn():
+            assert add_inputs is None
+        else:
+            assert add_inputs is not None
+
+        if op_type == OpType.SGMV_SHRINK:
+            return self.as_sgmv_shrink_kwargs()
+        if op_type == OpType.SGMV_EXPAND:
+            return self.as_sgmv_expand_kwargs(add_inputs)
+        if op_type == OpType.BGMV_SHRINK:
+            return self.as_bgmv_shrink_kwargs()
+        if op_type == OpType.BGMV_EXPAND:
+            return self.as_bgmv_expand_kwargs(add_inputs)
+        if op_type == OpType.BGMV_EXPAND_SLICE:
+            return self.as_bgmv_expand_slice_kwargs(add_inputs)
+        raise ValueError(f"Unrecognized optype {self}")
+
+    def test_correctness(self, op_type: OpType,
+                         expand_fn_add_inputs: Optional[bool]) -> bool:
+        """
+        Test correctness of op_type implementation against a grouped gemm
+        reference implementation.
+
+        Runs the benchmark function of op_type on this object's tensors,
+        runs op_type.run_ref_group_gemm on a copy of the original output
+        and returns True when both outputs match within a dtype-dependent
+        tolerance.
+
+        Note that self.output is zeroed and then overwritten by this call.
+        """
+        # Snapshot the metadata and the output first: bench_fn_kwargs() below
+        # may rewrite seq_lens / prompt_lora_mapping (the sgmv builders
+        # collapse them), and self.output is cleared on the next statement.
+        seq_lens_cpu = self.seq_lens.to(device="cpu")
+        prompt_lora_mapping_cpu = self.prompt_lora_mapping.to(device="cpu")
+        ref_output = self.output.clone()
+
+        # Run the implementation under test on a zeroed output.
+        self.output.zero_()
+        op_type.bench_fn()(
+            **self.bench_fn_kwargs(op_type, expand_fn_add_inputs))
+
+        # Reference run; presumably writes its result into ref_output
+        # (TODO confirm against OpType.run_ref_group_gemm).
+        op_type.run_ref_group_gemm(
+            ref_output,
+            self.input,
+            self.lora_weights_lst,
+            seq_lens_cpu=seq_lens_cpu,
+            prompt_lora_mapping_cpu=prompt_lora_mapping_cpu,
+            scaling=1.0,
+            add_inputs=expand_fn_add_inputs)
+
+        # Comparison tolerances per output dtype; any other dtype raises
+        # a KeyError here.
+        rtol, atol = {
+            torch.float16: (6e-2, 6e-2),
+            torch.bfloat16: (6e-2, 6e-2),
+            torch.float32: (1e-2, 1e-2),
+        }[self.output.dtype]
+
+        return torch.allclose(ref_output, self.output, rtol=rtol, atol=atol)
+
+
+def bench_optype(ctx: BenchmarkContext,
+                 arg_pool_size: int,
+                 op_type: OpType,
+                 cuda_graph_nops: Optional[int] = None,
+                 expand_fn_add_inputs: Optional[bool] = None,
+                 test_correctness: bool = False) -> TMeasurement:
+    """
+    Benchmark the kernel behind op_type for the problem described by ctx.
+
+    Args:
+        ctx: Benchmark problem description.
+        arg_pool_size: Number of independent BenchmarkTensors sets to create;
+            must be at least 1.
+        op_type: Operation to benchmark.
+        cuda_graph_nops: When set (and non-zero), CudaGraphBenchParams are
+            created from it and handed to Bench.
+        expand_fn_add_inputs: Must be None for shrink ops and set for all
+            other ops.
+        test_correctness: When True, every tensor set is checked with
+            BenchmarkTensors.test_correctness before benchmarking.
+
+    Returns:
+        The measurement returned by Bench.run().
+    """
+
+    assert arg_pool_size >= 1
+    if op_type.is_shrink_fn():
+        assert expand_fn_add_inputs is None
+    else:
+        assert expand_fn_add_inputs is not None
+
+    # BenchmarkContext -> BenchmarkTensors
+    bench_tensors : List[BenchmarkTensors] = \
+        [BenchmarkTensors.make(ctx, op_type) for _ in range(arg_pool_size)]
+    for bt in bench_tensors:
+        bt.sanity_check()
+
+    # Test correctness of our implementation.
+    if test_correctness:
+        # The list (rather than a generator) makes every tensor set run its
+        # check, even after an earlier one has failed.
+        assert all([
+            bt.test_correctness(op_type, expand_fn_add_inputs)
+            for bt in bench_tensors
+        ])
+
+    # BenchmarkTensors -> Dict (kwargs)
+    kwargs_list = [
+        bt.bench_fn_kwargs(op_type, add_inputs=expand_fn_add_inputs)
+        for bt in bench_tensors
+    ]
+
+    # Clear LoRA optimization hash-maps.
+    _LORA_A_PTR_DICT.clear()
+    _LORA_B_PTR_DICT.clear()
+    # Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are setup
+    for kwargs in kwargs_list:
+        op_type.bench_fn()(**kwargs)
+    torch.cuda.synchronize()
+
+    # Merge into a single kwargs and qualify arguments as ArgPool
+    # (one ArgPool per keyword, holding that keyword's value from every
+    # tensor set, in order).
+    kwargs = {k: ArgPool([]) for k in kwargs_list[0]}
+    for _kwargs in kwargs_list:
+        for k, v in _kwargs.items():
+            kwargs[k].values.append(v)
+
+    describe_args = (f"add_inputs={expand_fn_add_inputs}"
+                     if expand_fn_add_inputs is not None else "")
+    description = (
+        f"{op_type.name}({describe_args}) ({bench_tensors[0].io_types()})")
+
+    cuda_graph_params = None
+    if cuda_graph_nops:
+        cuda_graph_params = CudaGraphBenchParams(cuda_graph_nops)
+    timer = None
+    with Bench(cuda_graph_params,
+               ctx.bench_label(), ctx.bench_sublabel(op_type), description,
+               op_type.bench_fn(), **kwargs) as bench:
+        timer = bench.run()
+    return timer
+
+
+def bench_torch_mm(ctx: BenchmarkContext,
+                   arg_pool_size: int,
+                   op_type: OpType,
+                   cuda_graph_nops: Optional[int] = None) -> TMeasurement:
+    """
+    Benchmark basic torch.mm as a roofline.
+
+    When all the input tokens have the same LoRA ID, the LoRA kernels are just
+    a matmul. This torch.mm benchmark serves as a roofline for that case.
+
+    input op_type is used in determining the m, k, n dimensions for the matmul.
+
+    Args:
+        ctx: Benchmark problem description (sizes, dtype, number of slices).
+        arg_pool_size: Number of independent (A, B, C) tensor triples.
+        op_type: Operation whose mkn() defines the matmul dimensions.
+        cuda_graph_nops: When set (and non-zero), CudaGraphBenchParams are
+            created from it and handed to Bench.
+
+    Returns:
+        The measurement returned by Bench.run().
+    """
+
+    batch_size, hidden_size, lora_rank, seq_length, dtype = (ctx.batch_size,
+                                                             ctx.hidden_size,
+                                                             ctx.lora_rank,
+                                                             ctx.seq_length,
+                                                             ctx.dtype)
+
+    m, k, n = op_type.mkn(batch_size, seq_length, hidden_size, lora_rank)
+    # For a fairer comparison.
+    # (n is scaled by the number of slices so one matmul covers all slices.)
+    n = n * ctx.num_slices
+
+    # Get matmul input and output tensors for A x B = C
+    As, Bs, Cs = [], [], []
+    for _ in range(arg_pool_size):
+        As.append(torch.rand((m, k), dtype=dtype).to("cuda"))
+        # B is created as (n, k) and transposed, giving a (k, n) view.
+        Bs.append(torch.rand((n, k), dtype=dtype).to("cuda").t())
+        Cs.append(torch.rand((m, n), dtype=dtype).to("cuda"))
+
+    # Make torch.mm kwargs
+    mm_kwargs = {'input': ArgPool(As), 'mat2': ArgPool(Bs), 'out': ArgPool(Cs)}
+
+    description = (
+        f"single-lora roofline using torch.mm ({dtype_to_str(dtype)}"
+        f"x{dtype_to_str(dtype)}"
+        f"=>{dtype_to_str(dtype)})")
+    cuda_graph_params = None
+    if cuda_graph_nops:
+        cuda_graph_params = CudaGraphBenchParams(cuda_graph_nops)
+    with Bench(cuda_graph_params, ctx.bench_label(),
+               ctx.bench_sublabel(op_type), description, torch.mm,
+               **mm_kwargs) as bench:
+        return bench.run()
+
+
+# runner
+def use_cuda_graph_recommendation() -> str:
+    return """
+            Triton kernels have a significant launch overhead with
+            launched directly via python. This overhead is more noticeable
+            for small the problem sizes. For these cases, it is recommended
+            to use the script with `--cuda-graph-nops N` to benchmark N
+            consecutive invocations of the benchmarking operations from 
+            inside a CUDA Graph. Note that the returned measurement is for N 
+            invocations of the operation.
+            """
+
+
+def print_timers(timers: List[TMeasurement],
+                 args: Optional[argparse.Namespace] = None):
+    compare = TBenchmark.Compare(timers)
+    compare.print()
+
+    if args and args.cuda_graph_nops:
+        print(
+            f"Note : The timings reported above is for {args.cuda_graph_nops} "
+            "consecutive invocations of the benchmarking functions. "
+            f"Please divide by {args.cuda_graph_nops} for single invocation "
+            "timings.")
+
+    print("Note on Comparison with torch.mm : The torch.mm numbers are "
+          "benchmark numbers of a simple matmul emulating the single lora "
+          "case. It is provided as a roofline for comparing our LoRA Kernel "
+          "implementations. It is expected that the LoRA kernels will be "
+          "slower than torch.mm in cases where num_loras is big. But for "
+          "small num_loras the goal should be to match the torch.mm numbers.")
+
+
+def run(args: argparse.Namespace, bench_ctxs: List[BenchmarkContext]):
+
+    if args.cuda_graph_nops is not None:
+        assert args.cuda_graph_nops > 0
+        print(f"Benchmarking {args.cuda_graph_nops} invocations inside a CUDA "
+              "Graph")
+    else:
+        print(f"CUDA Graphs not enabled.\n{use_cuda_graph_recommendation()}")
+
+    timers = []
+    for bench_ctx in bench_ctxs:
+        for seq_len in args.seq_lengths:
+            bench_ops: List[OpType] = []
+            if seq_len == 1:
+                # bench all decode ops
+                bench_ops = [op for op in args.op_types if op.is_decode_op()]
+            else:
+                # bench all prefill ops
+                bench_ops = [op for op in args.op_types if op.is_prefill_op()]
+
+            seq_len_timers = []
+            for bench_op in bench_ops:
+                for num_slices in bench_op.num_slices():
+                    _ctx = bench_ctx.with_seq_length(seq_len).with_num_slices(
+                        num_slices)
+                    # Benchmark torch.mm as a roofline
+                    seq_len_timers.append(
+                        bench_torch_mm(_ctx, args.arg_pool_size, bench_op,
+                                       args.cuda_graph_nops))
+
+                    # Benchmark bench_op
+                    expand_fn_add_inputs = [
+                        None
+                    ] if bench_op.is_shrink_fn() else args.expand_fn_add_inputs
+                    for add_input_arg in expand_fn_add_inputs:
+                        seq_len_timers.append(
+                            bench_optype(_ctx, args.arg_pool_size, bench_op,
+                                         args.cuda_graph_nops, add_input_arg,
+                                         args.test_correctness))
+
+            print_timers(seq_len_timers)
+            timers.extend(seq_len_timers)
+
+    # Result stdout dump
+    print("== All Results ====")
+    print_timers(timers, args)
+
+    if args.output_directory:
+        # Result file dump
+        od = Path(args.output_directory)
+        if not od.exists():
+            od.mkdir()
+
+        timestamp = int(time.time())
+        pkl_file = od / f"lora_bench-{timestamp}.pkl"
+        print(f"Writing benchmarks to {pkl_file}")
+        with open(pkl_file, "wb") as f:
+            pickle.dump(timers, f)
+
+
+def as_benchmark_contexts(hidden_sizes: List[int], lora_ranks: List[int],
+                          args: argparse.Namespace) -> List[BenchmarkContext]:
+
+    ctxs: List[BenchmarkContext] = []
+    for batch_size, hidden_size, lora_rank, num_loras, sort_by_lora_id in product(  # noqa
+            args.batch_sizes, list(hidden_sizes), lora_ranks, args.num_loras,
+            args.sort_by_lora_id):
+        ctxs.append(
+            BenchmarkContext(
+                batch_size=batch_size,
+                hidden_size=hidden_size,
+                lora_rank=lora_rank,
+                num_loras=num_loras,
+                num_active_loras=args.num_active_loras
+                if args.num_active_loras else num_loras,
+                # To be filled based on the OpType to benchmark
+                seq_length=None,
+                sort_by_lora_id=sort_by_lora_id,
+                dtype=args.dtype,
+                # To be filled based on the OpType to benchmark
+                num_slices=None))
+
+    return ctxs
+
+
+def run_list_bench(args: argparse.Namespace):
+    print(args)
+
+    print("List bench :\n"
+          f"  Hidden Sizes {args.hidden_sizes}"
+          f"  LoRA Ranks {args.lora_ranks}")
+
+    # Get all benchmarking contexts
+    bench_contexts: List[BenchmarkContext] = as_benchmark_contexts(
+        hidden_sizes=args.hidden_sizes, lora_ranks=args.lora_ranks, args=args)
+
+    run(args, bench_contexts)
+
+
+def run_range_bench(args: argparse.Namespace):
+    print(args)
+
+    hidden_sizes = list(
+        range(args.hidden_sizes_start, args.hidden_sizes_end + 1,
+              args.hidden_sizes_increment))
+    lora_ranks = list(
+        range(args.lora_ranks_start, args.lora_ranks_end + 1,
+              args.lora_ranks_increment))
+
+    print("Range bench :\n"
+          f" Hidden Sizes {hidden_sizes}"
+          f" LoRA Ranks {lora_ranks}")
+
+    # Get all benchmarking contexts
+    bench_contexts: List[BenchmarkContext] = as_benchmark_contexts(
+        hidden_sizes=hidden_sizes, lora_ranks=lora_ranks, args=args)
+
+    run(args, bench_contexts)
+
+
+def run_model_bench(args: argparse.Namespace):
+    print(args)
+
+    def hidden_sizes_from_model(model: str, tp_size: int) -> set[int]:
+        hidden_sizes = set()
+        for KN, tp_split_dim in WEIGHT_SHAPES[model]:
+            KN[tp_split_dim] = KN[tp_split_dim] // tp_size
+            hidden_sizes.add(KN[1])
+        return hidden_sizes
+
+    # Get all hidden sizes
+    hidden_sizes: set[int] = set()
+    for model_name, tp_size in product(args.models, args.tp_sizes):
+        hidden_sizes = hidden_sizes.union(
+            hidden_sizes_from_model(model_name, tp_size))
+
+    print("Model bench :\n"
+          f" Hidden Sizes {hidden_sizes}"
+          f" LoRA Ranks {args.lora_ranks}")
+
+    # Get all benchmarking contexts
+    bench_contexts: List[BenchmarkContext] = as_benchmark_contexts(
+        hidden_sizes=hidden_sizes, lora_ranks=args.lora_ranks, args=args)
+
+    run(args, bench_contexts)
+
+
+if __name__ == '__main__':
+
+    def to_torch_dtype(dt):
+        if dt == "torch.float16":
+            return torch.float16
+        if dt == "torch.bfloat16":
+            return torch.bfloat16
+        raise ValueError("unsupported dtype")
+
+    def get_bool(s: str) -> bool:
+        return s.lower() in ['true', '1']
+
+    def add_common_command_args(p: argparse.ArgumentParser):
+        p.add_argument(
+            "--dtype",
+            type=to_torch_dtype,
+            required=True,
+            help="Available options are ['torch.float16', 'torch.bfloat16']")
+
+        p.add_argument(
+            "--arg-pool-size",
+            type=int,
+            default=32,
+            help="Run profiles with a pool of input/output/meta tensors instead"
+            "of simply reusing the same tensors for all runs. A bigger arg-pool"
+            "mitigates hardware caching effects during benchmarking.")
+
+        p.add_argument(
+            "--cuda-graph-nops",
+            type=int,
+            help=("when set profiling is done using cudagraph, "
+                  "with the given number of operations in a graph."
+                  "Note that the measurement returned is the time "
+                  "taken for N consecutive executions of the benchmarking "
+                  "functions, where N is the value of this argument."))
+        p.add_argument("--num-loras",
+                       nargs="+",
+                       type=int,
+                       default=DEFAULT_NUM_LORAS)
+        p.add_argument("--num-active-loras",
+                       type=int,
+                       default=None,
+                       help="Active LoRAs. When None, all LoRAs are active")
+        p.add_argument("--sort-by-lora-id",
+                       nargs="+",
+                       type=get_bool,
+                       default=DEFAULT_SORT_BY_LORA_IDS)
+        p.add_argument("--op-types",
+                       nargs="+",
+                       type=OpType.from_str,
+                       default=list(OpType))
+        p.add_argument('--seq-lengths',
+                       nargs="+",
+                       type=int,
+                       default=DEFAULT_SEQ_LENGTHS)
+        p.add_argument("--batch-sizes",
+                       nargs="+",
+                       type=int,
+                       default=DEFAULT_BATCH_SIZES)
+        p.add_argument("--expand-fn-add-inputs",
+                       nargs="+",
+                       type=get_bool,
+                       default=DEFAULT_EXPAND_FN_ADD_INPUTS)
+        p.add_argument(
+            '-o',
+            '--output-directory',
+            type=str,
+            help=("Output directory to store a the list of benchmarking"
+                  "TMeasurement objects as a pickle file"))
+
+        p.add_argument(
+            "--test-correctness",
+            action='store_true',
+            help=("When enabled, the benchmarking functions are tested"
+                  "for correctness before the actual benchmarking"))
+
+    # Top-level parser. The description embeds the CUDA graph recommendation
+    # and one example invocation per sub-command; RawTextHelpFormatter is
+    # passed so the description's line breaks are kept in the help output.
+    parser = FlexibleArgumentParser(
+        description=f"""
+Benchmark LoRA kernels:
+    {use_cuda_graph_recommendation()}
+
+    list_bench example:
+        python3 benchmarks/kernels/benchmark_lora.py list_bench --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16 --hidden-sizes 2048 --lora-ranks 16 --num-loras 1 4 --op-types bgmv_shrink bgmv_expand sgmv_shrink sgmv_expand bgmv_expand_slice --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32
+
+    model_bench example:
+        python3 benchmarks/kernels/benchmark_lora.py model_bench --models meta-llama/Llama-3-8b  --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16  --lora-ranks 16 --num-loras 1 4 --op-types bgmv_shrink bgmv_expand sgmv_shrink sgmv_expand bgmv_expand_slice --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32 
+
+    range_bench example:
+        python3 benchmarks/kernels/benchmark_lora.py range_bench  --arg-pool-size 32 --batch-sizes 1 16 32 --dtype torch.float16   --num-loras 1 4 --op-types bgmv_shrink bgmv_expand sgmv_shrink sgmv_expand bgmv_expand_slice --seq-lengths 1 16 --sort-by-lora-id 1 --cuda-graph-nops 32 --hidden-sizes-start 1024 --hidden-sizes-end 4096 --hidden-sizes-increment 1024 --lora-ranks-start 8 --lora-ranks-end 24 --lora-ranks-increment 8 
+            """,  # noqa: E501
+        formatter_class=argparse.RawTextHelpFormatter)
+
+    # One sub-command per way of choosing the hidden sizes / LoRA ranks.
+    # Each sub-parser stores its handler in `func`, which is called below.
+    subparsers = parser.add_subparsers(dest="cmd", required=True)
+
+    # list_bench: explicit lists of hidden sizes and LoRA ranks.
+    list_parser = subparsers.add_parser("list_bench")
+    list_parser.add_argument("--hidden-sizes",
+                             nargs="+",
+                             type=int,
+                             default=DEFAULT_HIDDEN_SIZES)
+    list_parser.add_argument("--lora-ranks",
+                             nargs="+",
+                             type=int,
+                             default=DEFAULT_LORA_RANKS)
+    add_common_command_args(list_parser)
+    list_parser.set_defaults(func=run_list_bench)
+
+    # range_bench: hidden sizes and LoRA ranks given as start/end/increment.
+    range_parser = subparsers.add_parser("range_bench")
+    range_parser.add_argument("--hidden-sizes-start", type=int, required=True)
+    range_parser.add_argument("--hidden-sizes-end", type=int, required=True)
+    range_parser.add_argument("--hidden-sizes-increment",
+                              type=int,
+                              required=True)
+    range_parser.add_argument("--lora-ranks-start", type=int, required=True)
+    range_parser.add_argument("--lora-ranks-end", type=int, required=True)
+    range_parser.add_argument("--lora-ranks-increment",
+                              type=int,
+                              required=True)
+    add_common_command_args(range_parser)
+    range_parser.set_defaults(func=run_range_bench)
+
+    # model_bench: hidden sizes derived from the weight shapes of the
+    # selected models and tensor-parallel sizes.
+    model_parser = subparsers.add_parser("model_bench")
+    model_parser.add_argument("--models",
+                              nargs="+",
+                              type=str,
+                              default=DEFAULT_MODELS,
+                              choices=WEIGHT_SHAPES.keys())
+    model_parser.add_argument("--tp-sizes",
+                              nargs="+",
+                              type=int,
+                              default=DEFAULT_TP_SIZES)
+    model_parser.add_argument("--lora-ranks",
+                              nargs="+",
+                              type=int,
+                              default=DEFAULT_LORA_RANKS)
+    add_common_command_args(model_parser)
+    model_parser.set_defaults(func=run_model_bench)
+
+    # Parse the command line and dispatch to the selected sub-command.
+    args = parser.parse_args()
+    args.func(args)
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
 import argparse
 import time
 from datetime import datetime
+from itertools import product
 from typing import Any, Dict, List, Tuple, TypedDict

 import ray
@@ -13,6 +14,9 @@ from vllm.model_executor.layers.fused_moe.fused_moe import *
 from vllm.platforms import current_platform
 from vllm.utils import FlexibleArgumentParser

+FP8_DTYPE = torch.float8_e4m3fnuz if current_platform.is_rocm(
+) else torch.float8_e4m3fn
+

 class BenchmarkConfig(TypedDict):
    BLOCK_SIZE_M: int
@@ -80,8 +84,8 @@ def benchmark_config(
        a1_scale = torch.randn(1, dtype=torch.float32)
        a2_scale = torch.randn(1, dtype=torch.float32)

-        w1 = w1.to(torch.float8_e4m3fn)
-        w2 = w2.to(torch.float8_e4m3fn)
+        w1 = w1.to(FP8_DTYPE)
+        w2 = w2.to(FP8_DTYPE)

    input_gating = torch.empty(num_tokens, num_experts, dtype=torch.float32)

@@ -141,28 +145,172 @@ def benchmark_config(
    return avg


-def get_configs_compute_bound() -> List[Dict[str, int]]:
-    # Reduced search space for faster tuning.
-    # TODO(woosuk): Increase the search space and use a performance model to
-    # prune the search space.
+def get_rocm_tuning_space(use_fp16):
+    """Return the ROCm tuning grid as a dict of parameter name -> candidates.
+
+    Args:
+        use_fp16: truthy when tuning the fp16 path. In that case
+            BLOCK_SIZE_K may also be 16 and the ``matrix_instr_nonkdim`` and
+            ``kpack`` parameters are added to the grid.
+
+    Returns:
+        A dict whose insertion order is the order in which callers expand
+        the parameters.
+    """
+    tile_mn_choices = [16, 32, 64, 128, 256]
+    # BLOCK_K=16 not supported for fp8
+    if use_fp16:
+        tile_k_choices = [16, 32, 64, 128, 256]
+    else:
+        tile_k_choices = [32, 64, 128, 256]
+
+    space = {
+        "BLOCK_SIZE_M": tile_mn_choices,
+        "BLOCK_SIZE_N": tile_mn_choices,
+        "BLOCK_SIZE_K": tile_k_choices,
+        "GROUP_SIZE_M": [1, 4, 8, 16, 32],
+        "num_warps": [1, 2, 4, 8],
+        "num_stages": [2],
+        "waves_per_eu": [0],
+    }
+    if use_fp16:
+        # These two knobs are only tuned on the fp16 path.
+        space.update(matrix_instr_nonkdim=[16, 32], kpack=[1, 2])
+
+    return space
+
+
+def get_configs_compute_bound(use_fp16) -> List[Dict[str, int]]:
    configs: List[BenchmarkConfig] = []
-    for num_stages in [2, 3, 4, 5]:
-        for block_m in [16, 32, 64, 128, 256]:
-            for block_k in [64, 128, 256]:
-                for block_n in [32, 64, 128, 256]:
-                    for num_warps in [4, 8]:
-                        for group_size in [1, 16, 32, 64]:
-                            configs.append({
-                                "BLOCK_SIZE_M": block_m,
-                                "BLOCK_SIZE_N": block_n,
-                                "BLOCK_SIZE_K": block_k,
-                                "GROUP_SIZE_M": group_size,
-                                "num_warps": num_warps,
-                                "num_stages": num_stages,
-                            })
+
+    # Choose the per-parameter candidate lists: ROCm has its own tuning
+    # space, every other platform uses the fixed ranges below.
+    if current_platform.is_rocm():
+        param_ranges = get_rocm_tuning_space(use_fp16)
+    else:
+        # Reduced search space for faster tuning.
+        # TODO(woosuk): Increase the search space and use a performance model to
+        # prune the search space.
+        block_m_range = [16, 32, 64, 128, 256]
+        block_n_range = [32, 64, 128, 256]
+        block_k_range = [64, 128, 256]
+        num_warps_range = [4, 8]
+        group_m_range = [1, 16, 32, 64]
+        num_stage_range = [2, 3, 4, 5]
+
+        param_ranges = {
+            "BLOCK_SIZE_M": block_m_range,
+            "BLOCK_SIZE_N": block_n_range,
+            "BLOCK_SIZE_K": block_k_range,
+            "GROUP_SIZE_M": group_m_range,
+            "num_warps": num_warps_range,
+            "num_stages": num_stage_range,
+        }
+
+    # Expand the ranges into their full Cartesian product: one config dict
+    # per combination, keyed by the parameter names.
+    keys, values = zip(*param_ranges.items())
+    for config_values in product(*values):
+        config = dict(zip(keys, config_values))
+        configs.append(config)
    return configs


+def prune_rocm_search_space(num_tokens, shard_intermediate_size, hidden_size,
+                            search_space, is_fp16):
+    """Prune ``search_space`` against two GEMM shapes and merge the survivors.
+
+    The configs are filtered twice with ``prune_rocm_configs``, once with
+    (N, K) = (shard_intermediate_size, hidden_size) and once with
+    (N, K) = (hidden_size, shard_intermediate_size // 2); both passes use
+    M = num_tokens * 2. A config accepted by either pass appears once in the
+    returned list.
+    """
+    m_dim = num_tokens * 2
+    gemm_shapes = (
+        (shard_intermediate_size, hidden_size),
+        (hidden_size, shard_intermediate_size // 2),
+    )
+    first_pass, second_pass = (prune_rocm_configs(m_dim, n_dim, k_dim,
+                                                  search_space, is_fp16)
+                               for n_dim, k_dim in gemm_shapes)
+    return merge_unique_dicts(first_pass, second_pass)
+
+
+# The following code is inspired by ROCm/Triton GEMM tuning script:
+# https://github.com/ROCm/triton/blob/triton-mlir/scripts/amd/gemm/tune_gemm.py#L89
+def prune_rocm_configs(M, N, K, configs, is_fp16=True):
+    """Filter ``configs`` down to those that pass every check for an MxNxK GEMM.
+
+    Args:
+        M, N, K: GEMM problem dimensions.
+        configs: iterable of config dicts; each is read with ``.get`` for
+            BLOCK_SIZE_M/N/K, num_warps, GROUP_SIZE_M, optionally SPLIT_K,
+            and (fp16 only) matrix_instr_nonkdim.
+        is_fp16: selects 2-byte elements and enables the
+            matrix_instr_nonkdim checks; otherwise elements are 1 byte.
+
+    Returns:
+        A new list with the configs that were not rejected, in input order.
+    """
+    pruned_configs = []
+    elemBytes_a = 2 if is_fp16 else 1
+    elemBytes_b = 2 if is_fp16 else 1
+
+    # Largest matrix_instr_nonkdim value accepted for this problem size.
+    mfma = 16 if M < 32 or N < 32 else 32
+
+    # TODO (zhanglx): figure out the boundary between large and small gemms
+    large_gemm = False
+    if M >= 2048 and N >= 2048:
+        large_gemm = True
+
+    for config in configs:
+        BLOCK_SIZE_M = config.get("BLOCK_SIZE_M")
+        BLOCK_SIZE_N = config.get("BLOCK_SIZE_N")
+        BLOCK_SIZE_K = config.get("BLOCK_SIZE_K")
+        num_warps = config.get("num_warps")
+
+        if is_fp16:
+            # Only bound on the fp16 path; the later uses are guarded by the
+            # same is_fp16 condition.
+            matrix_instr_nonkdim = config.get("matrix_instr_nonkdim")
+            if matrix_instr_nonkdim > mfma:
+                continue
+        # NOTE(review): mfma is only ever 16 or 32 here, so this check
+        # cannot trigger in this function as written.
+        if mfma == 4 and BLOCK_SIZE_K < 64:
+            continue
+        # some layouts could not work properly in case
+        # the number of elements per thread is less than 1
+        if BLOCK_SIZE_M * BLOCK_SIZE_N < 64:
+            continue
+        SPLIT_K = config.get("SPLIT_K", 1)
+        GROUP_M = config.get("GROUP_SIZE_M")
+        if is_fp16:
+            if (matrix_instr_nonkdim > BLOCK_SIZE_M
+                    or matrix_instr_nonkdim > BLOCK_SIZE_N):
+                continue
+            if (matrix_instr_nonkdim >= M
+                    and matrix_instr_nonkdim != BLOCK_SIZE_M):
+                continue
+            if (matrix_instr_nonkdim >= N
+                    and matrix_instr_nonkdim != BLOCK_SIZE_N):
+                continue
+        # Skip BLOCK_SIZE that is too large compared to M/N
+        # unless BLOCK_SIZE is already small enough
+        if M * 2 < BLOCK_SIZE_M and BLOCK_SIZE_M != 16:
+            continue
+        if N * 2 < BLOCK_SIZE_N and BLOCK_SIZE_N != 16:
+            continue
+        # skip large split_k when not necessary
+        if SPLIT_K != 1 and not need_split_k(M, N, K):
+            continue
+        # skip split_k that leads to EVEN_K = false
+        leap = SPLIT_K * BLOCK_SIZE_K
+        modv = K % leap
+        if modv != 0:
+            continue
+        # skip large GROUP_M
+        if GROUP_M * BLOCK_SIZE_M > M and GROUP_M != 1:
+            continue
+        # out of shared memory resource
+        # TODO (zhanglx): This does not consider the LDS usage in the epilogue
+        LDS = (BLOCK_SIZE_K * BLOCK_SIZE_M * elemBytes_a +
+               BLOCK_SIZE_K * BLOCK_SIZE_N * elemBytes_b)
+        if LDS > 65536:
+            continue
+        # Skip small block sizes and num_warps for large gemm
+        # For fp16 and f8, we want to only use BLOCK_SIZE >= 64
+        if large_gemm:
+            if BLOCK_SIZE_M < 64 or BLOCK_SIZE_N < 64:
+                continue
+            if BLOCK_SIZE_K < 64:
+                continue
+            if num_warps < 4:
+                continue
+
+        pruned_configs.append(config)
+
+    return pruned_configs
+
+
+def need_split_k(SIZE_M, SIZE_N, SIZE_K):
+    """Return True when M or N is below 64 and K is above 1024."""
+    has_small_dim = SIZE_M < 64 or SIZE_N < 64
+    return has_small_dim and SIZE_K > 1024
+
+
+def merge_unique_dicts(list1, list2):
+    """Concatenate two lists of dicts, keeping the first copy of each dict.
+
+    Entries are compared by equality, so two dicts with the same items count
+    as duplicates. Order follows ``list1`` and then ``list2``; neither input
+    is modified.
+    """
+    unique = []
+    for candidate in [*list1, *list2]:
+        if candidate in unique:
+            continue
+        unique.append(candidate)
+    return unique
+
+
 @ray.remote(num_gpus=1)
 class BenchmarkWorker:

@@ -170,6 +318,10 @@ class BenchmarkWorker:
        torch.set_default_device("cuda")
        current_platform.seed_everything(seed)
        self.seed = seed
+        # Get the device ID to allocate tensors and kernels
+        # on the respective GPU. This is required for Ray to work
+        # correctly with multi-GPU tuning on the ROCm platform.
+        self.device_id = int(ray.get_gpu_ids()[0])

    def benchmark(
        self,
@@ -191,9 +343,13 @@ class BenchmarkWorker:
        op_config = get_moe_configs(num_experts, shard_intermediate_size // 2,
                                    dtype_str)
        if op_config is None:
-            config = get_default_config(num_tokens, num_experts,
-                                        shard_intermediate_size, hidden_size,
-                                        topk, dtype_str)
+            config = get_default_config(num_tokens,
+                                        num_experts,
+                                        shard_intermediate_size,
+                                        hidden_size,
+                                        topk,
+                                        dtype_str,
+                                        is_marlin=False)
        else:
            config = op_config[min(op_config.keys(),
                                   key=lambda x: abs(x - num_tokens))]
@@ -217,25 +373,33 @@ class BenchmarkWorker:
    ) -> Dict[str, int]:
        best_config = None
        best_time = float("inf")
-        for config in tqdm(search_space):
-            try:
-                kernel_time = benchmark_config(config,
-                                               num_tokens,
-                                               num_experts,
-                                               shard_intermediate_size,
-                                               hidden_size,
-                                               topk,
-                                               dtype,
-                                               use_fp8_w8a8,
-                                               use_int8_w8a16,
-                                               num_iters=10)
-            except triton.runtime.autotuner.OutOfResources:
-                # Some configurations may be invalid and fail to compile.
-                continue
-
-            if kernel_time < best_time:
-                best_time = kernel_time
-                best_config = config
+        if current_platform.is_rocm():
+            is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16)
+            search_space = prune_rocm_search_space(num_tokens,
+                                                   shard_intermediate_size,
+                                                   hidden_size, search_space,
+                                                   is_fp16)
+
+        with torch.cuda.device(self.device_id):
+            for config in tqdm(search_space):
+                try:
+                    kernel_time = benchmark_config(config,
+                                                   num_tokens,
+                                                   num_experts,
+                                                   shard_intermediate_size,
+                                                   hidden_size,
+                                                   topk,
+                                                   dtype,
+                                                   use_fp8_w8a8,
+                                                   use_int8_w8a16,
+                                                   num_iters=20)
+                except triton.runtime.autotuner.OutOfResources:
+                    # Some configurations may be invalid and fail to compile.
+                    continue
+
+                if kernel_time < best_time:
+                    best_time = kernel_time
+                    best_config = config
        now = datetime.now()
        print(f"{now.ctime()}] Completed tuning for batch_size={num_tokens}")
        assert best_config is not None
@@ -244,12 +408,27 @@ class BenchmarkWorker:

 def sort_config(config: BenchmarkConfig) -> BenchmarkConfig:
    return {
-        "BLOCK_SIZE_M": config["BLOCK_SIZE_M"],
-        "BLOCK_SIZE_N": config["BLOCK_SIZE_N"],
-        "BLOCK_SIZE_K": config["BLOCK_SIZE_K"],
-        "GROUP_SIZE_M": config["GROUP_SIZE_M"],
-        "num_warps": config["num_warps"],
-        "num_stages": config["num_stages"],
+        "BLOCK_SIZE_M": config["BLOCK_SIZE_M"],
+        "BLOCK_SIZE_N": config["BLOCK_SIZE_N"],
+        "BLOCK_SIZE_K": config["BLOCK_SIZE_K"],
+        "GROUP_SIZE_M": config["GROUP_SIZE_M"],
+        "num_warps": config["num_warps"],
+        "num_stages": config["num_stages"],
+        # Optional tuning keys are copied, in this order, only when the
+        # incoming config actually carries them.
+        **{
+            key: config[key]
+            for key in ("waves_per_eu", "matrix_instr_nonkdim", "kpack")
+            if key in config
+        },
    }


@@ -275,7 +454,8 @@ def save_configs(configs: Dict[int, BenchmarkConfig], num_experts: int,
 def main(args: argparse.Namespace):
    print(args)

-    config = AutoConfig.from_pretrained(args.model)
+    config = AutoConfig.from_pretrained(
+        args.model, trust_remote_code=args.trust_remote_code)
    if config.architectures[0] == "DbrxForCausalLM":
        E = config.ffn_config.moe_num_experts
        topk = config.ffn_config.moe_top_k
@@ -286,7 +466,7 @@ def main(args: argparse.Namespace):
        topk = config.num_experts_per_tok
        intermediate_size = config.intermediate_size
        shard_intermediate_size = 2 * intermediate_size // args.tp_size
-    elif config.architectures[0] == "DeepseekV2ForCausalLM":
+    elif config.architectures[0] == "DeepseekV3ForCausalLM":
        E = config.n_routed_experts
        topk = config.num_experts_per_tok
        intermediate_size = config.moe_intermediate_size
@@ -299,7 +479,7 @@ def main(args: argparse.Namespace):
        shard_intermediate_size = 2 * intermediate_size // args.tp_size

    hidden_size = config.hidden_size
-    dtype = config.torch_dtype
+    dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
    use_fp8_w8a8 = args.dtype == "fp8_w8a8"
    use_int8_w8a16 = args.dtype == "int8_w8a16"

@@ -329,7 +509,8 @@ def main(args: argparse.Namespace):
        return ray.get(outputs)

    if args.tune:
-        search_space = get_configs_compute_bound()
+        is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16)
+        search_space = get_configs_compute_bound(is_fp16)
        print(f"Start tuning over {len(search_space)} configurations...")

        start = time.time()
@@ -361,7 +542,11 @@ if __name__ == "__main__":
    parser.add_argument("--model",
                        type=str,
                        default="mistralai/Mixtral-8x7B-Instruct-v0.1")
-    parser.add_argument("--tp-size", "-tp", type=int, default=2)
+    parser.add_argument("--tp-size",
+                        "-tp",
+                        "--tensor-parallel-size",
+                        type=int,
+                        default=2)
    parser.add_argument("--dtype",
                        type=str,
                        choices=["auto", "fp8_w8a8", "int8_w8a16"],
@@ -369,6 +554,7 @@ if __name__ == "__main__":
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--batch-size", type=int, required=False)
    parser.add_argument("--tune", action="store_true")
+    parser.add_argument("--trust-remote-code", action="store_true")
    args = parser.parse_args()

    main(args)
--- a/benchmarks/kernels/benchmark_paged_attention.py
+++ b/benchmarks/kernels/benchmark_paged_attention.py
@@ -100,7 +100,9 @@ def main(
        start_time = time.perf_counter()

        # Using default kv_scale
-        k_scale = v_scale = 1.0
+        k_scale = v_scale = torch.tensor(1.0,
+                                         dtype=torch.float32,
+                                         device=device)

        for _ in range(num_iters):
            if version == "v1":

--- a/benchmarks/kernels/utils.py
+++ b/benchmarks/kernels/utils.py
+import dataclasses
+from typing import Any, Callable, Iterable, Optional
+
+import torch
+import torch.utils.benchmark as TBenchmark
+from torch.utils.benchmark import Measurement as TMeasurement
+
+
+@dataclasses.dataclass
+class CudaGraphBenchParams:
+    """Settings for benchmarking through a captured CUDA graph."""
+    # Number of invocations of the benchmarked function that are recorded
+    # into a single CUDA graph.
+    num_ops_in_cuda_graph: int
+
+
+@dataclasses.dataclass
+class ArgPool:
+    """
+    When an argument of the benchmarked function is passed as an ArgPool,
+    the benchmarking class (Bench) collapses it to a single value picked
+    from the given values for each function invocation.
+    Successive invocations during a benchmarking run cycle through the
+    values in order.
+    """
+    # Bench indexes this with [] and measures it with len(), so in practice
+    # it must be a sequence rather than an arbitrary iterable.
+    values: Iterable[Any]
+
+    def __getitem__(self, index):
+        """Return the pool value stored at position ``index``."""
+        return self.values[index]
+
+
+class Bench:
+    """Times ``fn(*args, **kwargs)`` with torch.utils.benchmark.
+
+    Any ArgPool among the arguments is expanded into one concrete
+    (args, kwargs) pair per pool entry, and successive invocations cycle
+    through those pairs. When ``cuda_graph_params`` is given, the calls are
+    captured into a CUDA graph and the graph replay is what gets timed.
+    Usable as a context manager.
+    """
+
+    class ArgsIterator:
+        """Endlessly cycles over parallel lists of args tuples and kwargs dicts."""
+
+        def __init__(self, args_list, kwargs_list):
+            assert len(args_list) == len(kwargs_list)
+            self.args_list = args_list
+            self.kwargs_list = kwargs_list
+            self.n = len(self.args_list)
+            self.idx = 0
+
+        def __next__(self):
+            # NOTE: because of the ``yield`` this is a generator function.
+            # Calling ``__next__()`` returns a generator; callers then use
+            # ``next()`` on that generator to get (args, kwargs) pairs. The
+            # position lives in ``self.idx``, so ``reset()`` rewinds it.
+            while True:
+                yield (self.args_list[self.idx], self.kwargs_list[self.idx])
+                self.idx += 1
+                self.idx = self.idx % self.n
+
+        def reset(self):
+            """Rewind to the first (args, kwargs) pair."""
+            self.idx = 0
+
+        @property
+        def n_args(self):
+            """Number of distinct (args, kwargs) pairs."""
+            return self.n
+
+    def __init__(self, cuda_graph_params: Optional[CudaGraphBenchParams],
+                 label: str, sub_label: str, description: str, fn: Callable,
+                 *args, **kwargs):
+        """Store the benchmark settings and prepare the argument iterator.
+
+        Args:
+            cuda_graph_params: if not None, benchmark via CUDA graph replay;
+                the graph is captured here, during construction.
+            label, sub_label, description: forwarded to the benchmark Timer.
+            fn: the callable to benchmark.
+            *args, **kwargs: arguments for ``fn``; may contain ArgPool values.
+        """
+
+        self.cuda_graph_params = cuda_graph_params
+        self.use_cuda_graph = self.cuda_graph_params is not None
+        self.label = label
+        self.sub_label = sub_label
+        self.description = description
+        self.fn = fn
+
+        # Process args
+        self._args = args
+        self._kwargs = kwargs
+        self.args_list, self.kwargs_list = self.collapse_argpool(
+            *args, **kwargs)
+        self.args_iterator = self.ArgsIterator(self.args_list,
+                                               self.kwargs_list)
+
+        # Cudagraph runner
+        self.g = None
+        if self.use_cuda_graph:
+            self.g = self.get_cuda_graph_runner()
+
+        # benchmark run params
+        self.min_run_time = 1
+
+    def collapse_argpool(self, *args, **kwargs):
+        """Expand ArgPool arguments into concrete per-invocation arguments.
+
+        Returns:
+            (args_list, kwargs_list): parallel lists with one entry per pool
+            index, where every ArgPool is replaced by its value at that
+            index. Without any ArgPool, both lists have a single entry
+            holding the arguments unchanged. All ArgPools must have the same
+            number of values (asserted).
+        """
+        argpool_args = [arg for arg in args if isinstance(arg, ArgPool)] + [
+            arg for arg in kwargs.values() if isinstance(arg, ArgPool)
+        ]
+        if len(argpool_args) == 0:
+            return [args], [kwargs]
+
+        # Make sure all argpools are of the same size
+        argpool_size = len(argpool_args[0].values)
+        assert all([argpool_size == len(arg.values) for arg in argpool_args])
+
+        # create copies of the args
+        args_list = []
+        kwargs_list = []
+        for _ in range(argpool_size):
+            args_list.append(args)
+            kwargs_list.append(kwargs.copy())
+
+        for i in range(argpool_size):
+            # collapse args; Just pick the ith value
+            args_list[i] = tuple([
+                arg[i] if isinstance(arg, ArgPool) else arg
+                for arg in args_list[i]
+            ])
+
+            # collapse kwargs
+            kwargs_i = kwargs_list[i]
+            arg_pool_keys = [
+                k for k, v in kwargs_i.items() if isinstance(v, ArgPool)
+            ]
+            for k in arg_pool_keys:
+                # again just pick the ith value
+                kwargs_i[k] = kwargs_i[k][i]
+            kwargs_list[i] = kwargs_i
+
+        return args_list, kwargs_list
+
+    def get_cuda_graph_runner(self):
+        """Capture ``num_ops_in_cuda_graph`` calls of ``fn`` into a CUDA graph.
+
+        Runs two warm-up calls first, rewinds the argument iterator, then
+        records the calls on a new stream and returns the captured graph.
+        """
+        assert self.use_cuda_graph
+        assert self.args_iterator is not None
+
+        num_graph_ops = self.cuda_graph_params.num_ops_in_cuda_graph
+
+        # warmup
+        args_it = self.args_iterator.__next__()
+        for _ in range(2):
+            args, kwargs = next(args_it)
+            self.fn(*args, **kwargs)
+
+        # Rewind so that capture starts from the first argument set again.
+        self.args_iterator.reset()
+        args_it = self.args_iterator.__next__()
+        stream = torch.cuda.Stream()
+        with torch.cuda.stream(stream):
+            g = torch.cuda.CUDAGraph()
+            with torch.cuda.graph(g):
+                for _ in range(num_graph_ops):
+                    args, kwargs = next(args_it)
+                    self.fn(*args, **kwargs)
+        return g
+
+    def run_cudagrah(self) -> TMeasurement:
+        """Time replays of the captured CUDA graph and return the measurement."""
+        assert self.use_cuda_graph
+        globals = {'g': self.g}
+
+        return TBenchmark.Timer(
+            stmt="g.replay()",
+            globals=globals,
+            label=(
+                f"{self.label}"
+                f" | cugraph {self.cuda_graph_params.num_ops_in_cuda_graph} ops"
+            ),
+            sub_label=self.sub_label,
+            description=self.description,
+        ).blocked_autorange(min_run_time=self.min_run_time)
+
+    def run_eager(self) -> TMeasurement:
+        """Time direct calls of ``fn`` (no CUDA graph) and return the measurement.
+
+        With more than one argument set, the timed statement pulls the next
+        (args, kwargs) pair from the iterator on every call; otherwise the
+        single pair is bound once and reused.
+        """
+        setup = None
+        stmt = None
+        globals = None
+
+        has_arg_pool = self.args_iterator.n_args > 1
+        if has_arg_pool:
+            setup = '''
+                    args_iterator.reset()
+                    args_it = args_iterator.__next__()
+                    '''
+            stmt = '''
+                    args, kwargs = next(args_it)
+                    fn(*args, **kwargs)
+                    '''
+            globals = {'fn': self.fn, 'args_iterator': self.args_iterator}
+        else:
+            # no arg pool. Just use the args and kwargs directly
+            self.args_iterator.reset()
+            args_it = self.args_iterator.__next__()
+            args, kwargs = next(args_it)
+
+            setup = ""
+            stmt = '''
+                    fn(*args, **kwargs)
+                   '''
+            globals = {'fn': self.fn, 'args': args, 'kwargs': kwargs}
+
+        return TBenchmark.Timer(
+            stmt=stmt,
+            setup=setup,
+            globals=globals,
+            label=self.label,
+            sub_label=self.sub_label,
+            description=self.description,
+        ).blocked_autorange(min_run_time=self.min_run_time)
+
+    def run(self) -> TMeasurement:
+        """Run the benchmark in the configured mode and return the measurement.
+
+        If the measurement does not meet confidence or carries warnings, the
+        whole benchmark is re-run (recursively, with no retry limit).
+        """
+        timer = None
+        if self.use_cuda_graph:  # noqa SIM108
+            timer = self.run_cudagrah()
+        else:
+            timer = self.run_eager()
+        if not timer.meets_confidence() or timer.has_warnings:
+            print("Doesn't meet confidence - re-running bench ...")
+            return self.run()
+        return timer
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        # Only reports the exception; nothing truthy is returned, so the
+        # exception still propagates to the caller.
+        if exc_type:
+            print(f"exc type {exc_type}")
+            print(f"exc value {exc_value}")
+            print(f"exc traceback {traceback}")
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -4,6 +4,11 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_EXTENSIONS ON)
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

+# Detect macOS hosts. The variable is named directly instead of being
+# expanded with ${}: an unquoted expansion is dereferenced a second time by
+# if() when its value names another variable, and is a syntax error when
+# the value is empty.
+if (CMAKE_SYSTEM_NAME STREQUAL "Darwin")
+    set(MACOSX_FOUND TRUE)
+endif()
+
+
 #
 # Define environment variables for special configurations
 #
@@ -13,6 +18,9 @@ endif()

 include_directories("${CMAKE_SOURCE_DIR}/csrc")

+
+# NUMA support is linked in by default; the Apple Silicon branch further
+# down switches this off.
+set (ENABLE_NUMA TRUE)
+
 #
 # Check the compile flags
 #
@@ -22,18 +30,28 @@ if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")
        "-mf16c"
    )
 endif()
-list(APPEND CXX_COMPILE_FLAGS
-    "-fopenmp"
-    "-DVLLM_CPU_EXTENSION")

-execute_process(COMMAND cat /proc/cpuinfo
-                RESULT_VARIABLE CPUINFO_RET
-                OUTPUT_VARIABLE CPUINFO)
+# On macOS, -fopenmp is preceded by -Xpreprocessor; the remaining flags are
+# the same on every platform.
+if(MACOSX_FOUND)
+    list(APPEND CXX_COMPILE_FLAGS "-Xpreprocessor")
+endif()
+list(APPEND CXX_COMPILE_FLAGS
+    "-fopenmp"
+    "-DVLLM_CPU_EXTENSION")

-if (NOT CPUINFO_RET EQUAL 0)
-    message(FATAL_ERROR "Failed to check CPU features via /proc/cpuinfo")
+# CPU features are read from /proc/cpuinfo at configure time. The probe is
+# skipped on macOS, so CPUINFO stays unset there.
+if (NOT MACOSX_FOUND)
+    execute_process(COMMAND cat /proc/cpuinfo
+                    RESULT_VARIABLE CPUINFO_RET
+                    OUTPUT_VARIABLE CPUINFO)
+    if (NOT CPUINFO_RET EQUAL 0)
+        message(FATAL_ERROR "Failed to check CPU features via /proc/cpuinfo")
+    endif()
 endif()

+
 function (find_isa CPUINFO TARGET OUT)
    string(FIND ${CPUINFO} ${TARGET} ISA_FOUND)
    if(NOT ISA_FOUND EQUAL -1)
@@ -54,12 +72,17 @@ endfunction()

 is_avx512_disabled(AVX512_DISABLED)

-find_isa(${CPUINFO} "avx2" AVX2_FOUND)
-find_isa(${CPUINFO} "avx512f" AVX512_FOUND)
-find_isa(${CPUINFO} "POWER10" POWER10_FOUND)
-find_isa(${CPUINFO} "POWER9" POWER9_FOUND)
-find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support
-find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support
+# Apple Silicon is identified from the platform variables alone; every other
+# host is classified by searching the CPUINFO text for ISA markers.
+# NOTE(review): on macOS with a non-arm64 processor this falls into the else
+# branch although CPUINFO was never populated (the /proc/cpuinfo probe is
+# skipped on macOS), so ${CPUINFO} expands to nothing here - confirm whether
+# that configuration is meant to be supported.
+if (MACOSX_FOUND AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
+    set(APPLE_SILICON_FOUND TRUE)
+else()
+    find_isa(${CPUINFO} "avx2" AVX2_FOUND)
+    find_isa(${CPUINFO} "avx512f" AVX512_FOUND)
+    find_isa(${CPUINFO} "POWER10" POWER10_FOUND)
+    find_isa(${CPUINFO} "POWER9" POWER9_FOUND)
+    find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support
+    find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support
+endif()
+

 if (AVX512_FOUND AND NOT AVX512_DISABLED)
    list(APPEND CXX_COMPILE_FLAGS
@@ -103,6 +126,9 @@ elseif (ASIMD_FOUND)
        set(MARCH_FLAGS "-march=armv8.2-a+dotprod+fp16")  
    endif()
    list(APPEND CXX_COMPILE_FLAGS ${MARCH_FLAGS})     
+elseif(APPLE_SILICON_FOUND)
+    message(STATUS "Apple Silicon Detected")
+    set(ENABLE_NUMA OFF)
 else()
    message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA or ARMv8 support.")
 endif()
@@ -139,7 +165,12 @@ endif()

 message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")

-list(APPEND LIBS numa)
+# Link libnuma when NUMA support is enabled; otherwise define
+# VLLM_NUMA_DISABLED for the sources compiled in this directory.
+if(ENABLE_NUMA)
+    list(APPEND LIBS numa)
+else()
+    message(STATUS "NUMA is disabled")
+    # add_compile_definitions() takes the bare macro name; a leading -D is
+    # only stripped by CMake 3.26 and newer.
+    add_compile_definitions(VLLM_NUMA_DISABLED)
+endif()

 #
 # _C extension

--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -58,8 +58,8 @@ function (hipify_sources_target OUT_SRCS NAME ORIG_SRCS)
  #
  set(SRCS ${ORIG_SRCS})
  set(CXX_SRCS ${ORIG_SRCS})
-  list(FILTER SRCS EXCLUDE REGEX "\.(cc)|(cpp)$")
-  list(FILTER CXX_SRCS INCLUDE REGEX "\.(cc)|(cpp)$")
+  # Match on the file extension only. The backslash is doubled so the regex
+  # engine sees a literal dot ("\." in a quoted argument collapses to "."),
+  # and the alternatives are grouped so that the leading dot and the
+  # end-of-string anchor apply to every extension.
+  list(FILTER SRCS EXCLUDE REGEX "\\.(cc|cpp|hip)$")
+  list(FILTER CXX_SRCS INCLUDE REGEX "\\.(cc|cpp|hip)$")

  #
  # Generate ROCm/HIP source file names from CUDA file names.
@@ -264,7 +264,7 @@ endmacro()
 #  in `SRC_CUDA_ARCHS` that is less or equal to the version in `TGT_CUDA_ARCHS`.
 # We have special handling for 9.0a, if 9.0a is in `SRC_CUDA_ARCHS` and 9.0 is
 #  in `TGT_CUDA_ARCHS` then we should remove 9.0a from `SRC_CUDA_ARCHS` and add
-#  9.0a to the result. 
+#  9.0a to the result (and remove 9.0 from TGT_CUDA_ARCHS). 
 # The result is stored in `OUT_CUDA_ARCHS`.
 #
 # Example:
@@ -275,34 +275,47 @@ endmacro()
 #
 function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_ARCHS)
  list(REMOVE_DUPLICATES SRC_CUDA_ARCHS)
+  set(TGT_CUDA_ARCHS_ ${TGT_CUDA_ARCHS})

  # if 9.0a is in SRC_CUDA_ARCHS and 9.0 is in CUDA_ARCHS then we should
  # remove 9.0a from SRC_CUDA_ARCHS and add 9.0a to _CUDA_ARCHS
  set(_CUDA_ARCHS)
  if ("9.0a" IN_LIST SRC_CUDA_ARCHS)
    list(REMOVE_ITEM SRC_CUDA_ARCHS "9.0a")
-    if ("9.0" IN_LIST TGT_CUDA_ARCHS)
+    if ("9.0" IN_LIST TGT_CUDA_ARCHS_)
+      list(REMOVE_ITEM TGT_CUDA_ARCHS_ "9.0")
      set(_CUDA_ARCHS "9.0a")
    endif()
  endif()

  list(SORT SRC_CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING)

-  # for each ARCH in CUDA_ARCHS find the highest arch in SRC_CUDA_ARCHS that is 
-  # less or eqault to ARCH
-  foreach(_ARCH ${CUDA_ARCHS})
-  set(_TMP_ARCH)
-  foreach(_SRC_ARCH ${SRC_CUDA_ARCHS})
-    if (_SRC_ARCH VERSION_LESS_EQUAL _ARCH)
-      set(_TMP_ARCH ${_SRC_ARCH})
-    else()
-      break()
+  # for each ARCH in TGT_CUDA_ARCHS find the highest arch in SRC_CUDA_ARCHS that
+  # is less or equal to ARCH (but has the same major version since SASS binary
+  # compatibility is only forward compatible within the same major version).
+  foreach(_ARCH ${TGT_CUDA_ARCHS_})
+    set(_TMP_ARCH)
+    # Extract the major version of the target arch
+    string(REGEX REPLACE "^([0-9]+)\\..*$" "\\1" TGT_ARCH_MAJOR "${_ARCH}")
+    foreach(_SRC_ARCH ${SRC_CUDA_ARCHS})
+      # Extract the major version of the source arch
+      string(REGEX REPLACE "^([0-9]+)\\..*$" "\\1" SRC_ARCH_MAJOR "${_SRC_ARCH}")
+      # Check major-version match AND version-less-or-equal
+      if (_SRC_ARCH VERSION_LESS_EQUAL _ARCH)
+        if (SRC_ARCH_MAJOR STREQUAL TGT_ARCH_MAJOR)
+          set(_TMP_ARCH "${_SRC_ARCH}")
+        endif()
+      else()
+        # If we hit a version greater than the target, we can break
+        break()
+      endif()
+    endforeach()
+
+    # If we found a matching _TMP_ARCH, append it to _CUDA_ARCHS
+    if (_TMP_ARCH)
+      list(APPEND _CUDA_ARCHS "${_TMP_ARCH}")
    endif()
  endforeach()
-  if (_TMP_ARCH)
-    list(APPEND _CUDA_ARCHS ${_TMP_ARCH})
-  endif()
-  endforeach()

  list(REMOVE_DUPLICATES _CUDA_ARCHS)
  set(${OUT_CUDA_ARCHS} ${_CUDA_ARCHS} PARENT_SCOPE)