Commit e00b0a19 authored by zhuwenwen

merge v0.3.3

parents ead94d93 3f1166ab
# This script is run by buildkite to run the benchmarks and upload the results to buildkite
set -ex
set -o pipefail
# cd into parent directory of this file
cd "$(dirname "${BASH_SOURCE[0]}")/.."
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
# run python-based benchmarks and upload the result to buildkite
python3 benchmarks/benchmark_latency.py 2>&1 | tee benchmark_latency.txt
bench_latency_exit_code=$?
python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 2>&1 | tee benchmark_throughput.txt
bench_throughput_exit_code=$?
# run server-based benchmarks and upload the result to buildkite
python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf &
server_pid=$!
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
# wait for server to start, timeout after 600 seconds
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
python3 benchmarks/benchmark_serving.py \
--backend openai \
--dataset ./ShareGPT_V3_unfiltered_cleaned_split.json \
--model meta-llama/Llama-2-7b-chat-hf \
--num-prompts 20 \
--endpoint /v1/completions \
--tokenizer meta-llama/Llama-2-7b-chat-hf \
--save-result \
2>&1 | tee benchmark_serving.txt
bench_serving_exit_code=$?
kill $server_pid
# write the results into a markdown file
echo "### Latency Benchmarks" >> benchmark_results.md
sed -n '1p' benchmark_latency.txt >> benchmark_results.md # first line
echo "" >> benchmark_results.md
sed -n '$p' benchmark_latency.txt >> benchmark_results.md # last line
echo "### Throughput Benchmarks" >> benchmark_results.md
sed -n '1p' benchmark_throughput.txt >> benchmark_results.md # first line
echo "" >> benchmark_results.md
sed -n '$p' benchmark_throughput.txt >> benchmark_results.md # last line
echo "### Serving Benchmarks" >> benchmark_results.md
sed -n '1p' benchmark_serving.txt >> benchmark_results.md # first line
echo "" >> benchmark_results.md echo "" >> benchmark_results.md
sed -n '$p' benchmark_throughput.txt >> benchmark_results.md tail -n 13 benchmark_serving.txt >> benchmark_results.md # last 13 lines
# upload the results to buildkite # upload the results to buildkite
/workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
# exit with the exit code of the benchmarks
if [ $bench_latency_exit_code -ne 0 ]; then
exit $bench_latency_exit_code
fi
if [ $bench_throughput_exit_code -ne 0 ]; then
exit $bench_throughput_exit_code
fi
if [ $bench_serving_exit_code -ne 0 ]; then
exit $bench_serving_exit_code
fi
/workspace/buildkite-agent artifact upload openai-*.json
@@ -11,14 +11,25 @@ steps:
- label: AsyncEngine Test
  command: pytest -v -s async_engine
- label: Basic Correctness Test
  command: pytest -v -s --forked basic_correctness
- label: Distributed Comm Ops Test
command: pytest -v -s --forked test_comm_ops.py
working_dir: "/vllm-workspace/tests/distributed"
num_gpus: 2 # only support 1 or 2 for now.
- label: Distributed Correctness Test
command: pytest -v -s --forked test_basic_distributed_correctness.py
working_dir: "/vllm-workspace/tests/distributed" working_dir: "/vllm-workspace/tests/distributed"
num_gpus: 2 # only support 1 or 2 for now. num_gpus: 2 # only support 1 or 2 for now.
- label: Engine Test - label: Engine Test
command: pytest -v -s engine command: pytest -v -s engine
- label: Entrypoints Test
command: pytest -v -s entrypoints
- label: Kernels Test
  command: pytest -v -s kernels
  soft_fail: true
@@ -28,14 +39,31 @@ steps:
  - pytest -v -s models --forked
  soft_fail: true
- label: Prefix Caching Test
commands:
- pytest -v -s prefix_caching
- label: Samplers Test
  command: pytest -v -s samplers --forked
- label: Worker Test
  command: pytest -v -s worker
- label: LoRA Test
command: pytest -v -s lora --forked
- label: Metrics Test
command: pytest -v -s metrics
- label: Benchmarks
  working_dir: "/vllm-workspace/.buildkite"
  commands:
  - pip install aiohttp
  - bash run-benchmarks.sh
- label: Documentation Build
working_dir: "/vllm-workspace/docs"
no_gpu: True
commands:
- pip install -r requirements-docs.txt
- SPHINXOPTS=\"-W\" make html
@@ -5,10 +5,14 @@
steps:
  - label: ":docker: build image"
    commands:
      - "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ."
      - "docker push {{ docker_image }}"
    env:
      DOCKER_BUILDKIT: "1"
retry:
automatic:
- exit_status: -1 # Agent was lost
limit: 5
- wait

{% for step in steps %}
@@ -31,13 +35,15 @@ steps:
      - image: "{{ docker_image }}"
        command: ["bash"]
        args:
        - '-c'
        - "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'"
{% if not step.no_gpu %}
        resources:
          requests:
            nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
          limits:
            nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
{% endif %}
        env:
        - name: HF_TOKEN
          valueFrom:
...
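The `no_gpu` flag added above is what lets the documentation step run without requesting GPUs. As a rough illustration (a stand-alone sketch, not the project's actual rendering code; the template string is a trimmed stand-in for the real file), this is how the `{% if not step.no_gpu %}` guard behaves when rendered with Jinja2:

```python
# Trimmed, hypothetical stand-in for the resources section of the Buildkite
# Kubernetes template; it only demonstrates the no_gpu / num_gpus logic.
from jinja2 import Template

snippet = Template("""
{%- if not step.no_gpu %}
resources:
  requests:
    nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
  limits:
    nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
{%- endif %}
""")

# A docs-style step with no_gpu: True renders no resources block at all.
print(repr(snippet.render(step={"no_gpu": True}, default_num_gpu=1)))
# A distributed-test step with num_gpus: 2 requests two GPUs.
print(snippet.render(step={"num_gpus": 2}, default_num_gpu=1))
```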
@@ -25,7 +25,10 @@ jobs:
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install ruff==0.1.5 codespell==2.2.6 tomli==2.0.1
    - name: Analysing the code with ruff
      run: |
        ruff vllm tests
- name: Spelling check with codespell
run: |
codespell --toml pyproject.toml
\ No newline at end of file
@@ -13,6 +13,8 @@ $python_executable -m pip install -r requirements.txt

# Limit the number of parallel jobs to avoid OOM
export MAX_JOBS=1
# Make sure punica is built for the release (for LoRA)
export VLLM_INSTALL_PUNICA_KERNELS=1
# Build
$python_executable setup.py bdist_wheel --dist-dir=dist
@@ -28,4 +28,4 @@ jobs:
        pip install toml==0.10.2
    - name: Running yapf
      run: |
        yapf --diff --recursive .
@@ -181,3 +181,6 @@ _build/

# hip files generated by PyTorch
*.hip
*_hip*
# Benchmark dataset
*.json
@@ -7,6 +7,12 @@ FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev
RUN apt-get update -y \
    && apt-get install -y python3-pip git
# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-12.1/compat/
WORKDIR /workspace

# install build and runtime dependencies
@@ -45,6 +51,8 @@ ENV MAX_JOBS=${max_jobs}
# number of threads used by nvcc
ARG nvcc_threads=8
ENV NVCC_THREADS=$nvcc_threads
# make sure punica kernels are built (for LoRA)
ENV VLLM_INSTALL_PUNICA_KERNELS=1
RUN python3 setup.py build_ext --inplace
#################### EXTENSION Build IMAGE ####################
@@ -67,8 +75,10 @@ RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip instal
#################### RUNTIME BASE IMAGE ####################
# We used base cuda image because pytorch installs its own cuda libraries.
# However cupy depends on cuda libraries so we had to switch to the runtime image
# In the future it would be nice to get a container with pytorch and cuda without duplicating cuda
FROM nvidia/cuda:12.1.0-runtime-ubuntu22.04 AS vllm-base
# libnccl required for ray
RUN apt-get update -y \
...
# default base image
ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1"
FROM $BASE_IMAGE
ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1"
RUN echo "Base image is $BASE_IMAGE"
# BASE_IMAGE for ROCm_5.7: "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1"
# BASE_IMAGE for ROCm_6.0: "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1"
ARG FA_GFX_ARCHS="gfx90a;gfx942"
RUN echo "FA_GFX_ARCHS is $FA_GFX_ARCHS"
ARG FA_BRANCH="3d2b6f5"
RUN echo "FA_BRANCH is $FA_BRANCH"
# whether to build flash-attention
# if 0, will not build flash attention
# this is useful for gfx target where flash-attention is not supported
# In that case, we need to use the python reference attention implementation in vllm
ARG BUILD_FA="1"
# Install some basic utilities
RUN apt-get update && apt-get install python3 python3-pip -y
@@ -33,26 +56,36 @@ ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/:/libtorch/lib:
ENV CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/libtorch/include:/libtorch/include/torch/csrc/api/include/:/opt/rocm/include/:
# Install ROCm flash-attention
RUN if [ "$BUILD_FA" = "1" ]; then \
    mkdir libs \
    && cd libs \
    && git clone https://github.com/ROCm/flash-attention.git \
    && cd flash-attention \
    && git checkout ${FA_BRANCH} \
    && git submodule update --init \
    && export GPU_ARCHS=${FA_GFX_ARCHS} \
    && if [ "$BASE_IMAGE" = "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" ]; then \
        patch /opt/conda/envs/py_3.10/lib/python3.10/site-packages/torch/utils/hipify/hipify_python.py hipify_patch.patch; fi \
    && python3 setup.py install \
    && cd ..; \
    fi
COPY ./ /app/vllm
RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install xformers==0.0.23 --no-deps
# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt.
# Manually removed it so that later steps of numpy upgrade can continue
RUN if [ "$BASE_IMAGE" = "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" ]; then \
rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/; fi
RUN cd /app \
    && cd vllm \
    && pip install -U -r requirements-rocm.txt \
    && if [ "$BUILD_FA" = "1" ]; then \
    bash patch_xformers.rocm.sh; fi \
    && patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h /app/vllm/rocm_patch/rocm_bf16.patch \
    && python3 setup.py install \
    && cd ..
...
# <div align="center"><strong>vLLM</strong></div> # <div align="center"><strong>vLLM</strong></div>
## 简介 ## 简介
vLLM是一个快速且易于使用的LLM推理和服务库,使用PageAttention高效管理kv内存,Continuous batching传入请求,支持很多Hugging Face模型,如LLaMA & LLaMA-2、Qwen、Baichuan & Baichuan2等。 vLLM是一个快速且易于使用的LLM推理和服务库,使用PageAttention高效管理kv内存,Continuous batching传入请求,支持很多Hugging Face模型,如LLaMA & LLaMA-2、Qwen、Chatglm2 & Chatglm3等。
## 安装 ## 安装
vLLM支持 vLLM支持
...@@ -31,7 +31,8 @@ git clone https://developer.hpccube.com/codes/aicomponent/vllm # 根据需要的 ...@@ -31,7 +31,8 @@ git clone https://developer.hpccube.com/codes/aicomponent/vllm # 根据需要的
``` ```
1. 编译whl包并安装 1. 编译whl包并安装
python setup.py bdist_wheel python setup.py bdist_wheel
pip install dist/vllm* cd dist
pip install vllm*
2. 源码编译安装 2. 源码编译安装
python3 setup.py install python3 setup.py install
...@@ -41,7 +42,7 @@ python3 setup.py install ...@@ -41,7 +42,7 @@ python3 setup.py install
+ 若使用 pip install 下载安装过慢,可添加源:-i https://pypi.tuna.tsinghua.edu.cn/simple/ + 若使用 pip install 下载安装过慢,可添加源:-i https://pypi.tuna.tsinghua.edu.cn/simple/
## 验证 ## 验证
- python -c "import vllm; print(vllm.\_\_version__)",版本号与官方版本同步,查询该软件的版本号,例如0.2.7 - python -c "import vllm; print(vllm.\_\_version__)",版本号与官方版本同步,查询该软件的版本号,例如0.3.3
## Known Issue ## Known Issue
- -
......
@@ -26,7 +26,9 @@ Please register [here](https://lu.ma/ygxbpzhl) and join us!
---
*Latest News* 🔥
- [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing).
- [2024/01] Added ROCm 6.0 support to vLLM.
- [2023/12] Added ROCm 5.7 support to vLLM.
- [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing).
- [2023/09] We created our [Discord server](https://discord.gg/jz7wjKhh6g)! Join us to discuss vLLM and LLM serving! We will also post the latest announcements and updates there.
- [2023/09] We released our [PagedAttention paper](https://arxiv.org/abs/2309.06180) on arXiv!
@@ -45,7 +47,7 @@ vLLM is fast with:
- Efficient management of attention key and value memory with **PagedAttention**
- Continuous batching of incoming requests
- Fast model execution with CUDA/HIP graph
- Quantization: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [SqueezeLLM](https://arxiv.org/abs/2306.07629), FP8 KV Cache
- Optimized CUDA kernels
vLLM is flexible and easy to use with:
@@ -56,6 +58,8 @@ vLLM is flexible and easy to use with:
- Streaming outputs
- OpenAI-compatible API server
- Support NVIDIA GPUs and AMD GPUs
- (Experimental) Prefix caching support
- (Experimental) Multi-lora support
vLLM seamlessly supports many Hugging Face models, including the following architectures:
@@ -65,18 +69,25 @@ vLLM seamlessly supports many Hugging Face models, including the following architectures:
- ChatGLM (`THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, etc.)
- DeciLM (`Deci/DeciLM-7B`, `Deci/DeciLM-7B-instruct`, etc.)
- Falcon (`tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc.)
- Gemma (`google/gemma-2b`, `google/gemma-7b`, etc.)
- GPT-2 (`gpt2`, `gpt2-xl`, etc.)
- GPT BigCode (`bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, etc.)
- GPT-J (`EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc.)
- GPT-NeoX (`EleutherAI/gpt-neox-20b`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc.)
- InternLM (`internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc.)
- InternLM2 (`internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc.)
- LLaMA & LLaMA-2 (`meta-llama/Llama-2-70b-hf`, `lmsys/vicuna-13b-v1.3`, `young-geng/koala`, `openlm-research/open_llama_13b`, etc.)
- Mistral (`mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc.)
- Mixtral (`mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, etc.)
- MPT (`mosaicml/mpt-7b`, `mosaicml/mpt-30b`, etc.)
- OLMo (`allenai/OLMo-1B`, `allenai/OLMo-7B`, etc.)
- OPT (`facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc.)
- Orion (`OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc.)
- Phi (`microsoft/phi-1_5`, `microsoft/phi-2`, etc.)
- Qwen (`Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc.)
- Qwen2 (`Qwen/Qwen2-7B-beta`, `Qwen/Qwen-7B-Chat-beta`, etc.)
- StableLM(`stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc.)
- Starcoder2(`bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc.)
- Yi (`01-ai/Yi-6B`, `01-ai/Yi-34B`, etc.)
Install vLLM with pip or [from source](https://vllm.readthedocs.io/en/latest/getting_started/installation.html#build-from-source):
...
import json
import os
import time
from dataclasses import dataclass
from typing import Optional
import aiohttp
from tqdm.asyncio import tqdm
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
@dataclass
class RequestFuncInput:
prompt: str
api_url: str
prompt_len: int
output_len: int
model: str
best_of: int = 1
use_beam_search: bool = False
@dataclass
class RequestFuncOutput:
generated_text: str = ""
success: bool = False
latency: float = 0
ttft: float = 0
prompt_len: int = 0
async def async_request_tgi(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
api_url = request_func_input.api_url
assert api_url.endswith("generate_stream")
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
assert not request_func_input.use_beam_search
params = {
"best_of": request_func_input.best_of,
"max_new_tokens": request_func_input.output_len,
"do_sample": True,
"temperature": 0.01, # TGI does not accept 0.0 temperature.
"top_p": 0.99, # TGI does not accept 1.0 top_p.
}
payload = {
"inputs": request_func_input.prompt,
"parameters": params,
}
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
ttft = 0
st = time.perf_counter()
try:
async with session.post(url=api_url, json=payload) as response:
if response.status == 200:
async for data in response.content.iter_any():
if ttft == 0:
ttft = time.perf_counter() - st
output.ttft = ttft
output.latency = time.perf_counter() - st
body = data.decode("utf-8").lstrip("data:")
output.generated_text = json.loads(body)["generated_text"]
output.success = True
else:
output.success = False
except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError):
output.success = False
if pbar:
pbar.update(1)
return output
async def async_request_vllm(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
api_url = request_func_input.api_url
assert api_url.endswith("generate")
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
payload = {
"prompt": request_func_input.prompt,
"n": 1,
"best_of": request_func_input.best_of,
"use_beam_search": request_func_input.use_beam_search,
"temperature": 0.0 if request_func_input.use_beam_search else 1.0,
"top_p": 1.0,
"max_tokens": request_func_input.output_len,
"ignore_eos": True,
"stream": True,
}
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
ttft = 0
st = time.perf_counter()
try:
async with session.post(url=api_url, json=payload) as response:
if response.status == 200:
async for data in response.content.iter_any():
if ttft == 0:
ttft = time.perf_counter() - st
output.ttft = ttft
output.latency = time.perf_counter() - st
# When streaming, '\0' is appended to the end of the response.
body = data.decode("utf-8").strip("\0")
output.generated_text = json.loads(
body)["text"][0][len(request_func_input.prompt):]
output.success = True
else:
output.success = False
except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError):
output.success = False
if pbar:
pbar.update(1)
return output
async def async_request_trt_llm(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
api_url = request_func_input.api_url
assert api_url.endswith("generate_stream")
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
assert not request_func_input.use_beam_search
assert request_func_input.best_of == 1
payload = {
"accumulate_tokens": True,
"text_input": request_func_input.prompt,
"temperature": 0.0,
"top_p": 1.0,
"max_tokens": request_func_input.output_len,
"stream": True,
}
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
ttft = 0
st = time.perf_counter()
try:
async with session.post(url=api_url, json=payload) as resp:
if resp.status == 200:
async for data in resp.content.iter_any():
if ttft == 0:
ttft = time.perf_counter() - st
output.ttft = ttft
output.latency = time.perf_counter() - st
body = data.decode("utf-8").lstrip("data:")
output.generated_text = json.loads(body)["text_output"]
output.success = True
else:
output.success = False
except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError):
output.success = False
if pbar:
pbar.update(1)
return output
async def async_request_deepspeed_mii(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
assert request_func_input.best_of == 1
assert not request_func_input.use_beam_search
payload = {
"prompts": request_func_input.prompt,
"max_new_tokens": request_func_input.output_len,
"ignore_eos": True,
"do_sample": True,
"temperature":
0.01, # deepspeed-mii does not accept 0.0 temperature.
"top_p": 1.0,
}
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
# DeepSpeed-MII doesn't support streaming as of Jan 28 2024, will use 0 as placeholder.
# https://github.com/microsoft/DeepSpeed-MII/pull/311
output.ttft = 0
st = time.perf_counter()
try:
async with session.post(url=request_func_input.api_url,
json=payload) as resp:
if resp.status == 200:
parsed_resp = await resp.json()
output.latency = time.perf_counter() - st
output.generated_text = parsed_resp[0]["generated_text"]
output.success = True
else:
output.success = False
except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError):
output.success = False
if pbar:
pbar.update(1)
return output
async def async_request_openai_completions(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
api_url = request_func_input.api_url
assert api_url.endswith("v1/completions")
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
assert not request_func_input.use_beam_search
payload = {
"model": request_func_input.model,
"prompt": request_func_input.prompt,
"temperature": 0.0,
"best_of": request_func_input.best_of,
"max_tokens": request_func_input.output_len,
"stream": True,
}
headers = {
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
}
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
generated_text = ""
ttft = 0
st = time.perf_counter()
try:
async with session.post(url=api_url, json=payload,
headers=headers) as response:
if response.status == 200:
async for chunk in response.content:
if ttft == 0:
ttft = time.perf_counter() - st
output.ttft = ttft
chunk = chunk.strip()
if not chunk:
continue
chunk = chunk.decode("utf-8").lstrip("data: ")
if chunk == "[DONE]":
latency = time.perf_counter() - st
else:
body = json.loads(chunk)
generated_text += body["choices"][0]["text"]
output.generated_text = generated_text
output.success = True
output.latency = latency
else:
output.success = False
except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError):
output.success = False
if pbar:
pbar.update(1)
return output
ASYNC_REQUEST_FUNCS = {
"tgi": async_request_tgi,
"vllm": async_request_vllm,
"deepspeed-mii": async_request_deepspeed_mii,
"openai": async_request_openai_completions,
"tensorrt-llm": async_request_trt_llm,
}
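For orientation, the registry above is what `benchmarks/benchmark_serving.py` (further down in this diff) dispatches on: it builds a `RequestFuncInput` per prompt, looks the backend up in `ASYNC_REQUEST_FUNCS`, and awaits the coroutine. A minimal stand-alone sketch of that flow (the URL and model name are placeholders, and it assumes an OpenAI-compatible server is already running):

```python
# Hypothetical driver for the request functions defined above; values are examples only.
import asyncio


async def demo() -> None:
    request_func = ASYNC_REQUEST_FUNCS["openai"]
    result = await request_func(
        request_func_input=RequestFuncInput(
            prompt="Hello, world!",
            api_url="http://localhost:8000/v1/completions",  # must end with v1/completions
            prompt_len=4,
            output_len=16,
            model="meta-llama/Llama-2-7b-chat-hf",
        ))
    print(result.success, f"ttft={result.ttft:.3f}s", f"latency={result.latency:.3f}s")


# asyncio.run(demo())  # uncomment with a live server
```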
@@ -24,6 +24,8 @@ def main(args: argparse.Namespace):
        trust_remote_code=args.trust_remote_code,
        dtype=args.dtype,
        enforce_eager=args.enforce_eager,
kv_cache_dtype=args.kv_cache_dtype,
device=args.device,
    )

    sampling_params = SamplingParams(
@@ -35,7 +37,10 @@ def main(args: argparse.Namespace):
        max_tokens=args.output_len,
    )
    print(sampling_params)
    dummy_prompt_token_ids = np.random.randint(10000,
size=(args.batch_size,
args.input_len))
dummy_prompt_token_ids = dummy_prompt_token_ids.tolist()
    def run_to_completion(profile_dir: Optional[str] = None):
        if profile_dir:
@@ -65,9 +70,11 @@ def main(args: argparse.Namespace):
    if args.profile:
        profile_dir = args.profile_result_dir
        if not profile_dir:
            profile_dir = Path(
                "."
            ) / "vllm_benchmark_result" / f"latency_result_{time.time()}"
        print(f"Profiling (results will be saved to '{profile_dir}')...")
        run_to_completion(profile_dir=profile_dir)
        return

    # Benchmark.
@@ -115,6 +122,13 @@ if __name__ == '__main__':
    parser.add_argument('--enforce-eager',
                        action='store_true',
                        help='enforce eager mode and disable CUDA graph')
parser.add_argument(
"--kv-cache-dtype",
type=str,
choices=['auto', 'fp8_e5m2'],
default='auto',
help=
'Data type for kv cache storage. If "auto", will use model data type.')
    parser.add_argument(
        '--profile',
        action='store_true',
@@ -123,9 +137,13 @@ if __name__ == '__main__':
        '--profile-result-dir',
        type=str,
        default=None,
        help=('path to save the pytorch profiler output. Can be visualized '
              'with ui.perfetto.dev or Tensorboard.'))
    parser.add_argument(
        "--device",
type=str,
default="cuda",
choices=["cuda"],
help='device type for vLLM execution, supporting CUDA only currently.')
    args = parser.parse_args()
    main(args)
@@ -7,7 +7,7 @@ On the server side, run one of the following commands:
    --disable-log-requests
(TGI backend)
    ./launch_tgi_server.sh <your_model> <max_batch_total_tokens>
On the client side, run:
    python benchmarks/benchmark_serving.py \
@@ -20,15 +20,36 @@ import asyncio
import json
import random
import time
from dataclasses import dataclass
from datetime import datetime
from typing import AsyncGenerator, List, Tuple

import numpy as np
from tqdm.asyncio import tqdm
from transformers import PreTrainedTokenizerBase

from vllm.transformers_utils.tokenizer import get_tokenizer

from backend_request_func import (
    ASYNC_REQUEST_FUNCS,
    RequestFuncInput,
    RequestFuncOutput,
)
@dataclass
class BenchmarkMetrics:
completed: int
total_input: int
total_output: int
request_throughput: float
input_throughput: float
output_throughput: float
mean_ttft_ms: float
median_ttft_ms: float
p99_ttft_ms: float
mean_tpot_ms: float
median_tpot_ms: float
p99_tpot_ms: float
def sample_requests(
@@ -40,15 +61,15 @@ def sample_requests(
    with open(dataset_path) as f:
        dataset = json.load(f)
    # Filter out the conversations with less than 2 turns.
    dataset = [data for data in dataset if len(data["conversations"]) >= 2]
    # Only keep the first two turns of each conversation.
    dataset = [(data["conversations"][0]["value"],
                data["conversations"][1]["value"]) for data in dataset]

    # some of these will be filtered out, so sample more than we need
    sampled_indices = random.sample(range(len(dataset)),
                                    int(num_requests * 1.2))
    dataset = [dataset[i] for i in sampled_indices]

    # Tokenize the prompts and completions.
    prompts = [prompt for prompt, _ in dataset]
@@ -96,79 +117,125 @@ async def get_request(
        await asyncio.sleep(interval)
def calculate_metrics(
    input_requests: List[Tuple[str, int, int]],
    outputs: List[RequestFuncOutput],
    dur_s: float,
    tokenizer: PreTrainedTokenizerBase,
) -> BenchmarkMetrics:
    total_output = 0
    total_input = 0
    completed = 0
    per_token_latencies = []
    ttfts = []
    for i in range(len(outputs)):
        if outputs[i].success:
            output_len = len(tokenizer.encode(outputs[i].generated_text))
            total_output += output_len
            total_input += input_requests[i][1]
            per_token_latencies.append(outputs[i].latency / output_len)
            ttfts.append(outputs[i].ttft)
            completed += 1

    metrics = BenchmarkMetrics(
        completed=completed,
        total_input=total_input,
        total_output=total_output,
        request_throughput=completed / dur_s,
        input_throughput=total_input / dur_s,
        output_throughput=total_output / dur_s,
        mean_ttft_ms=np.mean(ttfts) * 1000,
        median_ttft_ms=np.median(ttfts) * 1000,
        p99_ttft_ms=np.percentile(ttfts, 99) * 1000,
        mean_tpot_ms=np.mean(per_token_latencies) * 1000,
        median_tpot_ms=np.median(per_token_latencies) * 1000,
        p99_tpot_ms=np.percentile(per_token_latencies, 99) * 1000,
    )

    return metrics
async def benchmark(
    backend: str,
    api_url: str,
    model_id: str,
    tokenizer: PreTrainedTokenizerBase,
    input_requests: List[Tuple[str, int, int]],
    best_of: int,
    use_beam_search: bool,
    request_rate: float,
    disable_tqdm: bool,
):
    if backend in ASYNC_REQUEST_FUNCS:
        request_func = ASYNC_REQUEST_FUNCS.get(backend)
    else:
        raise ValueError(f"Unknown backend: {backend}")

    pbar = None if disable_tqdm else tqdm(total=len(input_requests))

    print(f"Traffic request rate: {request_rate}")

    benchmark_start_time = time.perf_counter()
    tasks = []
    async for request in get_request(input_requests, request_rate):
        prompt, prompt_len, output_len = request
        request_func_input = RequestFuncInput(
            model=model_id,
            prompt=prompt,
            api_url=api_url,
            prompt_len=prompt_len,
output_len=output_len,
best_of=best_of,
use_beam_search=use_beam_search,
)
tasks.append(
asyncio.create_task(
request_func(request_func_input=request_func_input,
pbar=pbar)))
outputs = await asyncio.gather(*tasks)
if not disable_tqdm:
pbar.close()
benchmark_duration = time.perf_counter() - benchmark_start_time
metrics = calculate_metrics(
input_requests=input_requests,
outputs=outputs,
dur_s=benchmark_duration,
tokenizer=tokenizer,
)
print(f"Successful requests: {metrics.completed}")
print(f"Benchmark duration: {benchmark_duration:2f} s")
print(f"Total input tokens: {metrics.total_input}")
print(f"Total generated tokens: {metrics.total_output}")
print(f"Request throughput: {metrics.request_throughput:.2f} requests/s")
print(f"Input token throughput: {metrics.input_throughput:.2f} tokens/s")
print(f"Output token throughput: {metrics.output_throughput:.2f} tokens/s")
print(f"Mean TTFT: {metrics.mean_ttft_ms:.2f} ms")
print(f"Median TTFT: {metrics.median_ttft_ms:.2f} ms")
print(f"P99 TTFT: {metrics.p99_ttft_ms:.2f} ms")
print(f"Mean TPOT: {metrics.mean_tpot_ms:.2f} ms")
print(f"Median TPOT: {metrics.median_tpot_ms:.2f} ms")
print(f"P99 TPOT: {metrics.p99_tpot_ms:.2f} ms")
result = {
"duration": benchmark_duration,
"completed": metrics.completed,
"total_input_tokens": metrics.total_input,
"total_output_tokens": metrics.total_output,
"request_inthroughput": metrics.request_throughput,
"input_throughput": metrics.input_throughput,
"output_throughput": metrics.output_throughput,
"mean_ttft_ms": metrics.mean_ttft_ms,
"median_ttft_ms": metrics.median_ttft_ms,
"p99_ttft_ms": metrics.p99_ttft_ms,
"mean_tpot_ms": metrics.mean_tpot_ms,
"median_tpot_ms": metrics.median_tpot_ms,
"p99_tpot_ms": metrics.p99_tpot_ms
}
return result
def main(args: argparse.Namespace):
@@ -176,58 +243,145 @@ def main(args: argparse.Namespace):
    random.seed(args.seed)
    np.random.seed(args.seed)

    backend = args.backend
    model_id = args.model
tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
if args.base_url is not None:
api_url = f"{args.base_url}{args.endpoint}"
else:
api_url = f"http://{args.host}:{args.port}{args.endpoint}"
tokenizer = get_tokenizer(tokenizer_id,
trust_remote_code=args.trust_remote_code)
    input_requests = sample_requests(args.dataset, args.num_prompts, tokenizer)

    benchmark_result = asyncio.run(
        benchmark(
            backend=backend,
            api_url=api_url,
            model_id=model_id,
            tokenizer=tokenizer,
            input_requests=input_requests,
            best_of=args.best_of,
            use_beam_search=args.use_beam_search,
            request_rate=args.request_rate,
            disable_tqdm=args.disable_tqdm,
        ))

    # Save config and results to json
    if args.save_result:
        result_json = {}

        # Setup
        current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
        result_json["date"] = current_dt
        result_json["backend"] = backend
        result_json["version"] = args.version
result_json["model_id"] = model_id
result_json["tokenizer_id"] = tokenizer_id
result_json["best_of"] = args.best_of
result_json["use_beam_search"] = args.use_beam_search
result_json["num_prompts"] = args.num_prompts
# Traffic
result_json["request_rate"] = (
args.request_rate if args.request_rate < float("inf") else "inf")
# Merge with benchmark result
result_json = {**result_json, **benchmark_result}
# Save to file
base_model_id = model_id.split("/")[-1]
file_name = f"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"
with open(file_name, "w") as outfile:
json.dump(result_json, outfile)
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Benchmark the online serving throughput.")
    parser.add_argument(
        "--backend",
type=str,
default="vllm",
choices=list(ASYNC_REQUEST_FUNCS.keys()),
)
parser.add_argument(
"--version",
type=str,
default="N/A",
help="Version of the serving backend/engine.",
)
parser.add_argument(
"--base-url",
type=str,
default=None,
help="Server or API base url if not using http host and port.",
)
parser.add_argument("--host", type=str, default="localhost") parser.add_argument("--host", type=str, default="localhost")
parser.add_argument("--port", type=int, default=8000) parser.add_argument("--port", type=int, default=8000)
parser.add_argument("--dataset", type=str, required=True, parser.add_argument(
"--endpoint",
type=str,
default="/generate",
help="API endpoint.",
)
parser.add_argument("--dataset",
type=str,
required=True,
help="Path to the dataset.") help="Path to the dataset.")
parser.add_argument("--tokenizer", type=str, required=True, parser.add_argument(
help="Name or path of the tokenizer.") "--model",
parser.add_argument("--best-of", type=int, default=1, type=str,
help="Generates `best_of` sequences per prompt and " required=True,
"returns the best one.") help="Name of the model.",
)
parser.add_argument(
"--tokenizer",
type=str,
help=
"Name or path of the tokenizer, if not using the default model tokenizer.",
)
parser.add_argument(
"--best-of",
type=int,
default=1,
help="Generates `best_of` sequences per prompt and "
"returns the best one.",
)
parser.add_argument("--use-beam-search", action="store_true") parser.add_argument("--use-beam-search", action="store_true")
parser.add_argument("--num-prompts", type=int, default=1000, parser.add_argument(
help="Number of prompts to process.") "--num-prompts",
parser.add_argument("--request-rate", type=float, default=float("inf"), type=int,
help="Number of requests per second. If this is inf, " default=1000,
"then all the requests are sent at time 0. " help="Number of prompts to process.",
"Otherwise, we use Poisson process to synthesize " )
"the request arrival times.") parser.add_argument(
"--request-rate",
type=float,
default=float("inf"),
help="Number of requests per second. If this is inf, "
"then all the requests are sent at time 0. "
"Otherwise, we use Poisson process to synthesize "
"the request arrival times.",
)
parser.add_argument("--seed", type=int, default=0) parser.add_argument("--seed", type=int, default=0)
parser.add_argument('--trust-remote-code', action='store_true', parser.add_argument(
help='trust remote code from huggingface') "--trust-remote-code",
action="store_true",
help="Trust remote code from huggingface",
)
parser.add_argument(
"--disable-tqdm",
action="store_true",
help="Specify to disable tqdm progress bar.",
)
parser.add_argument(
"--save-result",
action="store_true",
help="Specify to save benchmark results to a json file",
)
    args = parser.parse_args()
    main(args)
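When `--save-result` is passed, the script above writes one JSON file per run, named after the backend, request rate, model, and timestamp, containing both the run configuration and the computed metrics. A small sketch of reading such a file back (the filename shown is only an example of the naming pattern):

```python
# Hypothetical post-processing of a --save-result output file.
import json


def summarize_result(path: str) -> None:
    with open(path) as f:
        result = json.load(f)
    print(f"completed requests : {result['completed']}")
    print(f"mean TTFT          : {result['mean_ttft_ms']:.2f} ms")
    print(f"output throughput  : {result['output_throughput']:.2f} tokens/s")


# summarize_result("openai-infqps-Llama-2-7b-chat-hf-20240101-000000.json")
```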
@@ -71,6 +71,8 @@ def run_vllm(
    dtype: str,
    max_model_len: Optional[int],
    enforce_eager: bool,
kv_cache_dtype: str,
device: str,
) -> float:
    from vllm import LLM, SamplingParams
    llm = LLM(
@@ -83,6 +85,8 @@ def run_vllm(
        dtype=dtype,
        max_model_len=max_model_len,
        enforce_eager=enforce_eager,
kv_cache_dtype=kv_cache_dtype,
device=device,
    )

    # Add the requests to the engine.
@@ -206,7 +210,8 @@ def main(args: argparse.Namespace):
                              args.quantization, args.tensor_parallel_size,
                              args.seed, args.n, args.use_beam_search,
                              args.trust_remote_code, args.dtype,
                              args.max_model_len, args.enforce_eager,
args.kv_cache_dtype, args.device)
elif args.backend == "hf": elif args.backend == "hf":
assert args.tensor_parallel_size == 1 assert args.tensor_parallel_size == 1
elapsed_time = run_hf(requests, args.model, tokenizer, args.n, elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
...@@ -284,6 +289,19 @@ if __name__ == "__main__": ...@@ -284,6 +289,19 @@ if __name__ == "__main__":
parser.add_argument("--enforce-eager", parser.add_argument("--enforce-eager",
action="store_true", action="store_true",
help="enforce eager execution") help="enforce eager execution")
parser.add_argument(
"--kv-cache-dtype",
type=str,
choices=["auto", "fp8_e5m2"],
default="auto",
help=
'Data type for kv cache storage. If "auto", will use model data type.')
parser.add_argument(
"--device",
type=str,
default="cuda",
choices=["cuda"],
help='device type for vLLM execution, supporting CUDA only currently.')
    args = parser.parse_args()
    if args.tokenizer is None:
        args.tokenizer = args.model
...
import json
import os
import sys
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
from vllm.model_executor.layers.fused_moe import fused_moe
import torch
import torch.nn.functional as F
import triton
def main():
method = fused_moe
for bs in [
1, 2, 4, 8, 16, 24, 32, 48, 64, 96, 128, 256, 512, 1024, 1536,
2048, 3072, 4096
]:
run_grid(bs, method=method)
def run_grid(bs, method):
d_model = 4096
num_total_experts = 8
top_k = 2
tp_size = 2
model_intermediate_size = 14336
num_layers = 32
num_calls = 100
num_warmup_trials = 1
num_trials = 1
configs = []
if bs <= 16:
BLOCK_SIZES_M = [16]
elif bs <= 32:
BLOCK_SIZES_M = [16, 32]
elif bs <= 64:
BLOCK_SIZES_M = [16, 32, 64]
elif bs <= 128:
BLOCK_SIZES_M = [16, 32, 64, 128]
else:
BLOCK_SIZES_M = [16, 32, 64, 128, 256]
for block_size_n in [32, 64, 128, 256]:
for block_size_m in BLOCK_SIZES_M:
for block_size_k in [64, 128, 256]:
for group_size_m in [1, 16, 32, 64]:
for num_warps in [4, 8]:
configs.append({
"BLOCK_SIZE_M": block_size_m,
"BLOCK_SIZE_N": block_size_n,
"BLOCK_SIZE_K": block_size_k,
"GROUP_SIZE_M": group_size_m,
"num_warps": num_warps,
"num_stages": 4,
})
best_config = None
best_time_us = 1e20
for config in configs:
print(f'{tp_size=} {bs=}')
print(f'{config}')
# warmup
print(f'warming up')
try:
for _ in range(num_warmup_trials):
run_timing(
num_calls=num_calls,
bs=bs,
d_model=d_model,
num_total_experts=num_total_experts,
top_k=top_k,
tp_size=tp_size,
model_intermediate_size=model_intermediate_size,
method=method,
config=config,
)
except triton.runtime.autotuner.OutOfResources:
continue
# trial
print(f'benchmarking')
for _ in range(num_trials):
kernel_dur_ms = run_timing(
num_calls=num_calls,
bs=bs,
d_model=d_model,
num_total_experts=num_total_experts,
top_k=top_k,
tp_size=tp_size,
model_intermediate_size=model_intermediate_size,
method=method,
config=config,
)
kernel_dur_us = 1000 * kernel_dur_ms
model_dur_ms = kernel_dur_ms * num_layers
if kernel_dur_us < best_time_us:
best_config = config
best_time_us = kernel_dur_us
print(
f'{kernel_dur_us=:.1f} {model_dur_ms=:.1f} {bs=} {tp_size=} {top_k=} {num_total_experts=} {d_model=} {model_intermediate_size=} {num_layers=}'
)
print("best_time_us", best_time_us)
print("best_config", best_config)
filename = "/tmp/config.jsonl"
print(f"writing config to file {filename}")
with open(filename, "a") as f:
f.write(json.dumps({str(bs): best_config}) + "\n")
def run_timing(num_calls: int, bs: int, d_model: int, num_total_experts: int,
top_k: int, tp_size: int, model_intermediate_size: int, method,
config) -> float:
shard_intermediate_size = model_intermediate_size // tp_size
hidden_states = torch.rand(
(bs, d_model),
device="cuda:0",
dtype=torch.bfloat16,
)
ws = torch.rand(
(num_total_experts, 2 * shard_intermediate_size, d_model),
device=hidden_states.device,
dtype=hidden_states.dtype,
)
w2s = torch.rand(
(num_total_experts, d_model, shard_intermediate_size),
device=hidden_states.device,
dtype=hidden_states.dtype,
)
gating_output = F.softmax(torch.rand(
(num_calls, bs, num_total_experts),
device=hidden_states.device,
dtype=torch.float32,
),
dim=-1)
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
start_event.record()
for i in range(num_calls):
hidden_states = method(
hidden_states=hidden_states,
w1=ws,
w2=w2s,
gating_output=gating_output[i],
topk=2,
renormalize=True,
inplace=True,
override_config=config,
)
end_event.record()
end_event.synchronize()
dur_ms = start_event.elapsed_time(end_event) / num_calls
return dur_ms
if __name__ == "__main__":
sys.exit(main())
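The tuner above appends one JSON object per batch size to `/tmp/config.jsonl` (each line maps `str(bs)` to the best config found). A minimal sketch of collecting those lines back into a single dictionary keyed by batch size:

```python
# Reads the JSON-lines output produced by run_grid() above.
import json


def load_tuned_configs(path: str = "/tmp/config.jsonl") -> dict:
    configs = {}
    with open(path) as f:
        for line in f:
            for bs, cfg in json.loads(line).items():
                configs[int(bs)] = cfg  # later lines overwrite earlier ones for the same bs
    return configs


# print(load_tuned_configs().get(64))
```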
from typing import Optional
import argparse
import random
import time

import torch

from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, create_kv_caches_with_random
from vllm._C import ops

NUM_BLOCKS = 1024
@@ -23,17 +25,20 @@ def main(
    dtype: torch.dtype,
    seed: int,
    do_profile: bool,
device: str = "cuda",
kv_cache_dtype: Optional[str] = None,
) -> None:
    random.seed(seed)
    torch.random.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)

    scale = float(1.0 / (head_size**0.5))
    query = torch.empty(num_seqs,
                        num_query_heads,
                        head_size,
                        dtype=dtype,
                        device=device)
    query.uniform_(-scale, scale)

    assert num_query_heads % num_kv_heads == 0
@@ -41,11 +46,11 @@ def main(
    if use_alibi:
        alibi_slopes = torch.randn(num_query_heads,
                                   dtype=torch.float,
                                   device=device)

    context_lens = [context_len for _ in range(num_seqs)]
    max_context_len = max(context_lens)
    context_lens = torch.tensor(context_lens, dtype=torch.int, device=device)
    # Create the block tables.
    max_num_blocks_per_seq = (max_context_len + block_size - 1) // block_size
@@ -56,18 +61,18 @@ def main(
            for _ in range(max_num_blocks_per_seq)
        ]
        block_tables.append(block_table)
    block_tables = torch.tensor(block_tables, dtype=torch.int, device=device)

    # Create the KV cache.
    key_caches, value_caches = create_kv_caches_with_random(NUM_BLOCKS,
                                                            block_size,
                                                            1,
                                                            num_kv_heads,
                                                            head_size,
                                                            kv_cache_dtype,
                                                            dtype,
                                                            device=device)
    key_cache, value_cache = key_caches[0], value_caches[0]
    # Prepare for the paged attention kernel.
    output = torch.empty_like(query)
@@ -86,7 +91,7 @@ def main(
    )
    max_logits = torch.empty_like(exp_sums)

    def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
        torch.cuda.synchronize()
        if profile:
            torch.cuda.cudart().cudaProfilerStart()
@@ -106,6 +111,7 @@ def main(
                block_size,
                max_context_len,
                alibi_slopes,
kv_cache_dtype,
            )
        elif version == "v2":
            ops.paged_attention_v2(
@@ -123,6 +129,7 @@ def main(
                block_size,
                max_context_len,
                alibi_slopes,
                kv_cache_dtype,
            )
        else:
            raise ValueError(f"Invalid version: {version}")
@@ -135,6 +142,7 @@ def main(
    # Warmup.
    print("Warming up...")
    run_benchmark = run_cuda_benchmark
    run_benchmark(num_iters=3, profile=False)

    # Benchmark.
@@ -168,16 +176,19 @@ if __name__ == '__main__':
                        default="half")
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--profile", action="store_true")
parser.add_argument(
"--kv-cache-dtype",
type=str,
choices=["auto", "fp8_e5m2"],
default="auto",
help=
'Data type for kv cache storage. If "auto", will use model data type.')
parser.add_argument("--device", type=str, choices=["cuda"], default="cuda")
    args = parser.parse_args()
    print(args)

    if args.num_query_heads % args.num_kv_heads != 0:
        raise ValueError("num_query_heads must be divisible by num_kv_heads")
    main(
        version=args.version,
        num_seqs=args.batch_size,
@@ -187,7 +198,8 @@ if __name__ == '__main__':
        head_size=args.head_size,
        block_size=args.block_size,
        use_alibi=args.use_alibi,
        dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype],
        seed=args.seed,
        do_profile=args.profile,
        kv_cache_dtype=args.kv_cache_dtype,
    )
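For reference, the hand-rolled cache allocation that this diff removes encoded the paged KV-cache layout explicitly: keys are stored in 16-byte vectors along the head dimension, while values are stored per block. A small sketch of those shapes (an illustration only; the new `create_kv_caches_with_random` helper is assumed to produce compatible caches for the `auto` dtype):

```python
# Shapes of the paged KV cache used by the benchmark, reconstructed from the removed code.
import torch


def paged_kv_cache_shapes(num_blocks: int, num_kv_heads: int, head_size: int,
                          block_size: int, dtype: torch.dtype = torch.float16):
    x = 16 // torch.tensor([], dtype=dtype).element_size()  # elements per 16-byte vector
    key_shape = (num_blocks, num_kv_heads, head_size // x, block_size, x)
    value_shape = (num_blocks, num_kv_heads, head_size, block_size)
    return key_shape, value_shape


print(paged_kv_cache_shapes(1024, 12, 128, 16))
# ((1024, 12, 16, 16, 8), (1024, 12, 128, 16))
```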
@@ -6,7 +6,7 @@ TOKENS=$2

docker run --gpus all --shm-size 1g -p $PORT:80 \
           -v $PWD/data:/data \
           ghcr.io/huggingface/text-generation-inference:1.4.0 \
           --model-id $MODEL \
           --sharded false \
           --max-input-length 1024 \
...
@@ -2,19 +2,16 @@
#include <torch/extension.h>
#include <c10/cuda/CUDAGuard.h>
#include <cmath>

#include "cuda_compat.h"
#include "dispatch_utils.h"

namespace vllm {

// Activation and gating kernel template.
template<typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&)>
__global__ void act_and_mul_kernel(
  scalar_t* __restrict__ out,               // [..., d]
  const scalar_t* __restrict__ input,       // [..., 2, d]
  const int d) {
@@ -22,32 +19,58 @@ __global__ void silu_and_mul_kernel(
  for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
    const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]);
    const scalar_t y = VLLM_LDG(&input[token_idx * 2 * d + d + idx]);
    out[token_idx * d + idx] = ACT_FN(x) * y;
  }
}
template<typename T>
__device__ __forceinline__ T silu_kernel(const T& x) {
// x * sigmoid(x)
return (T) (((float) x) / (1.0f + expf((float) -x)));
}
template<typename T>
__device__ __forceinline__ T gelu_kernel(const T& x) {
// Equivalent to PyTorch GELU with 'none' approximation.
// Refer to:
// https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L38
const float f = (float) x;
constexpr float ALPHA = M_SQRT1_2;
return (T) (f * 0.5f * (1.0f + ::erf(f * ALPHA)));
}
} // namespace vllm
// Launch activation and gating kernel.
#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL) \
int d = input.size(-1) / 2; \
int64_t num_tokens = input.numel() / input.size(-1); \
dim3 grid(num_tokens); \
dim3 block(std::min(d, 1024)); \
const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
VLLM_DISPATCH_FLOATING_TYPES( \
input.scalar_type(), \
"act_and_mul_kernel", \
[&] { \
vllm::act_and_mul_kernel<scalar_t, KERNEL<scalar_t>><<<grid, block, 0, stream>>>( \
out.data_ptr<scalar_t>(), \
input.data_ptr<scalar_t>(), \
d); \
});
void silu_and_mul(
  torch::Tensor& out,      // [..., d]
  torch::Tensor& input)    // [..., 2 * d]
{
  LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel);
}

void gelu_and_mul(
  torch::Tensor& out,      // [..., d]
  torch::Tensor& input)    // [..., 2 * d]
{
  LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_kernel);
}
namespace vllm {
...
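To make the refactor above concrete: both entry points operate on a tensor whose last dimension holds the gate and up projections back to back, and the kernel writes `ACT_FN(x) * y` for each element. A minimal PyTorch reference of that semantics (an assumption for illustration, not the project's test code); the GELU uses the exact erf form, matching the `'none'` approximation referenced in `gelu_kernel`:

```python
# Reference semantics of the fused silu_and_mul / gelu_and_mul kernels.
import torch
import torch.nn.functional as F


def silu_and_mul_ref(inp: torch.Tensor) -> torch.Tensor:
    x, y = inp.chunk(2, dim=-1)   # input is [..., 2 * d]
    return F.silu(x) * y          # output is [..., d]


def gelu_and_mul_ref(inp: torch.Tensor) -> torch.Tensor:
    x, y = inp.chunk(2, dim=-1)
    return F.gelu(x, approximate="none") * y


out = gelu_and_mul_ref(torch.randn(4, 2 * 128))
print(out.shape)  # torch.Size([4, 128])
```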
@@ -3,4 +3,5 @@
#include "attention_generic.cuh"
#include "dtype_float16.cuh"
#include "dtype_float32.cuh"
#include "dtype_bfloat16.cuh"
// #include "dtype_fp8_e5m2.cuh"