merge v0.4.0

7c4f76e3 · zhuwenwen · 2da0dd3e · 51c31bc1 · 7c4f76e3 · 7c4f76e3
Commit 7c4f76e3 authored Apr 15, 2024 by zhuwenwen
20 changed files
--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
@@ -70,16 +70,16 @@ RUN if [ "$BUILD_FA" = "1" ]; then \
    && cd ..; \
    fi
-COPY ./ /app/vllm
-RUN python3 -m pip install --upgrade pip
-RUN python3 -m pip install xformers==0.0.23 --no-deps
 # Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt.
 # Manually removed it so that later steps of numpy upgrade can continue
 RUN if [ "$BASE_IMAGE" = "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" ]; then \
    rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/; fi
+COPY ./ /app/vllm
+RUN python3 -m pip install --upgrade pip
+RUN python3 -m pip install xformers==0.0.23 --no-deps
 RUN cd /app \
    && cd vllm \
    && pip install -U -r requirements-rocm.txt \
@@ -90,6 +90,6 @@ RUN cd /app \
    && cd ..
 RUN python3 -m pip install --upgrade pip
-RUN python3 -m pip install --no-cache-dir ray[all]
+RUN python3 -m pip install --no-cache-dir ray[all]==2.9.3
 CMD ["/bin/bash"]
--- a/MANIFEST.in
+++ b/MANIFEST.in
 include LICENSE
 include requirements.txt
+include CMakeLists.txt
+recursive-include cmake *
 recursive-include csrc *
--- a/README.md
+++ b/README.md
@@ -42,7 +42,7 @@ python3 setup.py install
 + 若使用 pip install 下载安装过慢，可添加源：-i https://pypi.tuna.tsinghua.edu.cn/simple/
 ## 验证
- python -c "import vllm; print(vllm.\_\_version__)"，版本号与官方版本同步，查询该软件的版本号，例如0.3.3；
+- python -c "import vllm; print(vllm.\_\_version__)"，版本号与官方版本同步，查询该软件的版本号，例如0.4.0；
 ## Known Issue
 - 无

--- a/README_ORIGIN.md
+++ b/README_ORIGIN.md
@@ -16,12 +16,12 @@ Easy, fast, and cheap LLM serving for everyone
 ---
-**The Second vLLM Bay Area Meetup (Jan 31st 5pm-7:30pm PT)**
+**The Third vLLM Bay Area Meetup (April 2nd 6pm-8:30pm PT)**
-We are thrilled to announce our second vLLM Meetup!
+We are thrilled to announce our third vLLM Meetup!
 The vLLM team will share recent updates and roadmap.
-We will also have vLLM collaborators from IBM coming up to the stage to discuss their insights on LLM optimizations.
+We will also have vLLM collaborators from Roblox coming up to the stage to discuss their experience in deploying LLMs with vLLM.
-Please register [here](https://lu.ma/ygxbpzhl) and join us!
+Please register [here](https://robloxandvllmmeetup2024.splashthat.com/) and join us!
 ---
@@ -67,6 +67,8 @@ vLLM seamlessly supports many Hugging Face models, including the following archi
 - Baichuan & Baichuan2 (`baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc.)
 - BLOOM (`bigscience/bloom`, `bigscience/bloomz`, etc.)
 - ChatGLM (`THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, etc.)
+- Command-R (`CohereForAI/c4ai-command-r-v01`, etc.)
+- DBRX (`databricks/dbrx-base`, `databricks/dbrx-instruct` etc.)
 - DeciLM (`Deci/DeciLM-7B`, `Deci/DeciLM-7B-instruct`, etc.)
 - Falcon (`tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc.)
 - Gemma (`google/gemma-2b`, `google/gemma-7b`, etc.)
@@ -76,6 +78,7 @@ vLLM seamlessly supports many Hugging Face models, including the following archi
 - GPT-NeoX (`EleutherAI/gpt-neox-20b`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc.)
 - InternLM (`internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc.)
 - InternLM2 (`internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc.)
+- Jais (`core42/jais-13b`, `core42/jais-13b-chat`, `core42/jais-30b-v3`, `core42/jais-30b-chat-v3`, etc.)
 - LLaMA & LLaMA-2 (`meta-llama/Llama-2-70b-hf`, `lmsys/vicuna-13b-v1.3`, `young-geng/koala`, `openlm-research/open_llama_13b`, etc.)
 - Mistral (`mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc.)
 - Mixtral (`mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, etc.)
@@ -86,8 +89,10 @@ vLLM seamlessly supports many Hugging Face models, including the following archi
 - Phi (`microsoft/phi-1_5`, `microsoft/phi-2`, etc.)
 - Qwen (`Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc.)
 - Qwen2 (`Qwen/Qwen2-7B-beta`, `Qwen/Qwen-7B-Chat-beta`, etc.)
+- Qwen2MoE (`Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc.)
 - StableLM(`stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc.)
 - Starcoder2(`bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc.)
+- Xverse (`xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc.)
 - Yi (`01-ai/Yi-6B`, `01-ai/Yi-34B`, etc.)
 Install vLLM with pip or [from source](https://vllm.readthedocs.io/en/latest/getting_started/installation.html#build-from-source):

--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
 import json
 import os
+import sys
 import time
-from dataclasses import dataclass
+import traceback
-from typing import Optional
+from dataclasses import dataclass, field
+from typing import List, Optional
 import aiohttp
 from tqdm.asyncio import tqdm
@@ -26,8 +28,11 @@ class RequestFuncOutput:
    generated_text: str = ""
    success: bool = False
    latency: float = 0
-    ttft: float = 0
+    ttft: float = 0  # Time to first token
+    itl: List[float] = field(
+        default_factory=list)  # List of inter-token latencies
    prompt_len: int = 0
+    error: str = ""
 async def async_request_tgi(
@@ -55,71 +60,38 @@ async def async_request_tgi(
        ttft = 0
        st = time.perf_counter()
+        most_recent_timestamp = st
        try:
            async with session.post(url=api_url, json=payload) as response:
                if response.status == 200:
-                    async for data in response.content.iter_any():
+                    async for chunk in response.content:
-                        if ttft == 0:
+                        chunk = chunk.strip()
-                            ttft = time.perf_counter() - st
+                        if not chunk:
-                            output.ttft = ttft
+                            continue
-                    output.latency = time.perf_counter() - st
-                    body = data.decode("utf-8").lstrip("data:")
-                    output.generated_text = json.loads(body)["generated_text"]
-                    output.success = True
-                else:
-                    output.success = False
-        except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError):
-            output.success = False
-        if pbar:
-            pbar.update(1)
-        return output
-async def async_request_vllm(
+                        chunk = remove_prefix(chunk.decode("utf-8"), "data:")
-    request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
-) -> RequestFuncOutput:
-    api_url = request_func_input.api_url
-    assert api_url.endswith("generate")
-    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+                        data = json.loads(chunk)
-        payload = {
+                        timestamp = time.perf_counter()
-            "prompt": request_func_input.prompt,
+                        # First token
-            "n": 1,
-            "best_of": request_func_input.best_of,
-            "use_beam_search": request_func_input.use_beam_search,
-            "temperature": 0.0 if request_func_input.use_beam_search else 1.0,
-            "top_p": 1.0,
-            "max_tokens": request_func_input.output_len,
-            "ignore_eos": True,
-            "stream": True,
-        }
-        output = RequestFuncOutput()
-        output.prompt_len = request_func_input.prompt_len
-        ttft = 0
-        st = time.perf_counter()
-        try:
-            async with session.post(url=api_url, json=payload) as response:
-                if response.status == 200:
-                    async for data in response.content.iter_any():
                        if ttft == 0:
                            ttft = time.perf_counter() - st
                            output.ttft = ttft
-                    output.latency = time.perf_counter() - st
-                    # When streaming, '\0' is appended to the end of the response.
+                        # Decoding phase
-                    body = data.decode("utf-8").strip("\0")
+                        else:
-                    output.generated_text = json.loads(
+                            output.itl.append(timestamp -
-                        body)["text"][0][len(request_func_input.prompt):]
+                                              most_recent_timestamp)
-                    output.success = True
-                else:
+                        most_recent_timestamp = timestamp
-                    output.success = False
-        except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError):
+                    output.latency = most_recent_timestamp - st
+                    output.success = True
+                    output.generated_text = data["generated_text"]
+        except Exception:
            output.success = False
+            exc_info = sys.exc_info()
+            output.error = "".join(traceback.format_exception(*exc_info))
        if pbar:
            pbar.update(1)
@@ -146,26 +118,45 @@ async def async_request_trt_llm(
        }
        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len
-        ttft = 0
+        ttft = 0
        st = time.perf_counter()
+        most_recent_timestamp = st
        try:
-            async with session.post(url=api_url, json=payload) as resp:
+            async with session.post(url=api_url, json=payload) as response:
-                if resp.status == 200:
+                if response.status == 200:
-                    async for data in resp.content.iter_any():
+                    async for chunk in response.content:
+                        chunk = chunk.strip()
+                        if not chunk:
+                            continue
+                        chunk = remove_prefix(chunk.decode("utf-8"), "data:")
+                        data = json.loads(chunk)
+                        timestamp = time.perf_counter()
+                        # First token
                        if ttft == 0:
                            ttft = time.perf_counter() - st
                            output.ttft = ttft
-                    output.latency = time.perf_counter() - st
-                    body = data.decode("utf-8").lstrip("data:")
+                        # Decoding phase
-                    output.generated_text = json.loads(body)["text_output"]
+                        else:
+                            output.itl.append(timestamp -
+                                              most_recent_timestamp)
+                        most_recent_timestamp = timestamp
+                    output.latency = most_recent_timestamp - st
+                    output.generated_text = json.loads(data)["text_output"]
                    output.success = True
                else:
+                    output.error = response.reason
                    output.success = False
-        except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError):
+        except Exception:
            output.success = False
+            exc_info = sys.exc_info()
+            output.error = "".join(traceback.format_exception(*exc_info))
        if pbar:
            pbar.update(1)
@@ -181,34 +172,35 @@ async def async_request_deepspeed_mii(
        assert not request_func_input.use_beam_search
        payload = {
-            "prompts": request_func_input.prompt,
+            "prompt": request_func_input.prompt,
-            "max_new_tokens": request_func_input.output_len,
+            "max_tokens": request_func_input.output_len,
-            "ignore_eos": True,
+            "temperature": 0.01,  # deepspeed-mii does not accept 0.0 temp.
-            "do_sample": True,
-            "temperature":
-            0.01,  # deepspeed-mii does not accept 0.0 temperature.
            "top_p": 1.0,
        }
        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len
-        # DeepSpeed-MII doesn't support streaming as of Jan 28 2024, will use 0 as placeholder.
+        # NOTE: DeepSpeed-MII doesn't support streaming as of Jan 28 2024,
-        # https://github.com/microsoft/DeepSpeed-MII/pull/311
+        # will use 0 as placeholder.
+        # See https://github.com/microsoft/DeepSpeed-MII/pull/311
        output.ttft = 0
        st = time.perf_counter()
        try:
            async with session.post(url=request_func_input.api_url,
-                                    json=payload) as resp:
+                                    json=payload) as response:
-                if resp.status == 200:
+                if response.status == 200:
-                    parsed_resp = await resp.json()
+                    parsed_resp = await response.json()
                    output.latency = time.perf_counter() - st
-                    output.generated_text = parsed_resp[0]["generated_text"]
+                    output.generated_text = parsed_resp["text"][0]
                    output.success = True
                else:
+                    output.error = response.reason
                    output.success = False
-        except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError):
+        except Exception:
            output.success = False
+            exc_info = sys.exc_info()
+            output.error = "".join(traceback.format_exception(*exc_info))
        if pbar:
            pbar.update(1)
@@ -220,7 +212,9 @@ async def async_request_openai_completions(
    pbar: Optional[tqdm] = None,
 ) -> RequestFuncOutput:
    api_url = request_func_input.api_url
-    assert api_url.endswith("v1/completions")
+    assert api_url.endswith(
+        "v1/completions"
+    ), "OpenAI Completions API URL must end with 'v1/completions'."
    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
        assert not request_func_input.use_beam_search
@@ -242,43 +236,150 @@ async def async_request_openai_completions(
        generated_text = ""
        ttft = 0
        st = time.perf_counter()
+        most_recent_timestamp = st
        try:
            async with session.post(url=api_url, json=payload,
                                    headers=headers) as response:
                if response.status == 200:
                    async for chunk in response.content:
-                        if ttft == 0:
+                        chunk = chunk.strip()
-                            ttft = time.perf_counter() - st
+                        if not chunk:
-                            output.ttft = ttft
+                            continue
+                        chunk = remove_prefix(chunk.decode("utf-8"), "data: ")
+                        if chunk == "[DONE]":
+                            latency = time.perf_counter() - st
+                        else:
+                            data = json.loads(chunk)
+                            if data["choices"][0]["text"]:
+                                timestamp = time.perf_counter()
+                                # First token
+                                if ttft == 0:
+                                    ttft = time.perf_counter() - st
+                                    output.ttft = ttft
+                                # Decoding phase
+                                # NOTE: Some completion API might have a last
+                                # usage summary response without a token so we
+                                # do not want to include as inter-token-latency
+                                elif data.get("usage", None) is None:
+                                    output.itl.append(timestamp -
+                                                      most_recent_timestamp)
+                                most_recent_timestamp = timestamp
+                                generated_text += data["choices"][0]["text"]
+                    output.generated_text = generated_text
+                    output.success = True
+                    output.latency = latency
+        except Exception:
+            output.success = False
+            exc_info = sys.exc_info()
+            output.error = "".join(traceback.format_exception(*exc_info))
+    if pbar:
+        pbar.update(1)
+    return output
+async def async_request_openai_chat_completions(
+    request_func_input: RequestFuncInput,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    api_url = request_func_input.api_url
+    assert api_url.endswith(
+        "v1/chat/completions"
+    ), "OpenAI Chat Completions API URL must end with 'v1/chat/completions'."
+    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+        assert not request_func_input.use_beam_search
+        payload = {
+            "model": request_func_input.model,
+            "messages": [
+                {
+                    "role": "user",
+                    "content": request_func_input.prompt,
+                },
+            ],
+            "temperature": 0.0,
+            "max_tokens": request_func_input.output_len,
+            "stream": True,
+        }
+        headers = {
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
+        }
+        output = RequestFuncOutput()
+        output.prompt_len = request_func_input.prompt_len
+        generated_text = ""
+        ttft = 0
+        st = time.perf_counter()
+        most_recent_timestamp = st
+        try:
+            async with session.post(url=api_url, json=payload,
+                                    headers=headers) as response:
+                if response.status == 200:
+                    async for chunk in response.content:
                        chunk = chunk.strip()
                        if not chunk:
                            continue
-                        chunk = chunk.decode("utf-8").lstrip("data: ")
+                        chunk = remove_prefix(chunk.decode("utf-8"), "data: ")
                        if chunk == "[DONE]":
                            latency = time.perf_counter() - st
                        else:
-                            body = json.loads(chunk)
+                            timestamp = time.perf_counter()
-                            generated_text += body["choices"][0]["text"]
+                            data = json.loads(chunk)
+                            if "content" in data["choices"][0]["delta"]:
+                                # First token
+                                if ttft == 0:
+                                    ttft = time.perf_counter() - st
+                                    output.ttft = ttft
+                                # Decoding phase
+                                else:
+                                    output.itl.append(timestamp -
+                                                      most_recent_timestamp)
+                                generated_text += data["choices"][0]["delta"][
+                                    "content"]
+                            most_recent_timestamp = timestamp
                    output.generated_text = generated_text
                    output.success = True
                    output.latency = latency
                else:
+                    output.error = response.reason
                    output.success = False
-        except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError):
+        except Exception:
            output.success = False
+            exc_info = sys.exc_info()
+            output.error = "".join(traceback.format_exception(*exc_info))
    if pbar:
        pbar.update(1)
    return output
+# Since vllm must support Python 3.8, we can't use str.removeprefix(prefix)
+# introduced in Python 3.9
+def remove_prefix(text: str, prefix: str) -> str:
+    if text.startswith(prefix):
+        return text[len(prefix):]
+    return text
 ASYNC_REQUEST_FUNCS = {
    "tgi": async_request_tgi,
-    "vllm": async_request_vllm,
+    "vllm": async_request_openai_completions,
+    "lmdeploy": async_request_openai_completions,
    "deepspeed-mii": async_request_deepspeed_mii,
    "openai": async_request_openai_completions,
+    "openai-chat": async_request_openai_chat_completions,
    "tensorrt-llm": async_request_trt_llm,
 }
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -16,17 +16,19 @@ def main(args: argparse.Namespace):
    # NOTE(woosuk): If the request cannot be processed in a single batch,
    # the engine will automatically process the request in multiple batches.
-    llm = LLM(
+    llm = LLM(model=args.model,
-        model=args.model,
+              tokenizer=args.tokenizer,
-        tokenizer=args.tokenizer,
+              quantization=args.quantization,
-        quantization=args.quantization,
+              tensor_parallel_size=args.tensor_parallel_size,
-        tensor_parallel_size=args.tensor_parallel_size,
+              trust_remote_code=args.trust_remote_code,
-        trust_remote_code=args.trust_remote_code,
+              dtype=args.dtype,
-        dtype=args.dtype,
+              enforce_eager=args.enforce_eager,
-        enforce_eager=args.enforce_eager,
+              kv_cache_dtype=args.kv_cache_dtype,
-        kv_cache_dtype=args.kv_cache_dtype,
+              device=args.device,
-        device=args.device,
+              ray_workers_use_nsight=args.ray_workers_use_nsight,
-    )
+              enable_chunked_prefill=args.enable_chunked_prefill,
+              download_dir=args.download_dir,
+              block_size=args.block_size)
    sampling_params = SamplingParams(
        n=args.n,
@@ -145,5 +147,25 @@ if __name__ == '__main__':
        default="cuda",
        choices=["cuda"],
        help='device type for vLLM execution, supporting CUDA only currently.')
+    parser.add_argument('--block-size',
+                        type=int,
+                        default=16,
+                        help='block size of key/value cache')
+    parser.add_argument(
+        '--enable-chunked-prefill',
+        type=bool,
+        default=False,
+        help='If True, the prefill requests can be chunked based on the '
+        'max_num_batched_tokens')
+    parser.add_argument(
+        "--ray-workers-use-nsight",
+        action='store_true',
+        help="If specified, use nsight to profile ray workers",
+    )
+    parser.add_argument('--download-dir',
+                        type=str,
+                        default=None,
+                        help='directory to download and load the weights, '
+                        'default to the default cache dir of huggingface')
    args = parser.parse_args()
    main(args)
--- a/benchmarks/benchmark_prefix_caching.py
+++ b/benchmarks/benchmark_prefix_caching.py
+import argparse
+import time
+from vllm import LLM, SamplingParams
+PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as fellows. You need to answer my question about the table.\n# Table\n|Opening|Opening|Sl. No.|Film|Cast|Director|Music Director|Notes|\n|----|----|----|----|----|----|----|----|\n|J A N|9|1|Agni Pushpam|Jayabharathi, Kamalahasan|Jeassy|M. K. Arjunan||\n|J A N|16|2|Priyamvada|Mohan Sharma, Lakshmi, KPAC Lalitha|K. S. Sethumadhavan|V. Dakshinamoorthy||\n|J A N|23|3|Yakshagaanam|Madhu, Sheela|Sheela|M. S. Viswanathan||\n|J A N|30|4|Paalkkadal|Sheela, Sharada|T. K. Prasad|A. T. Ummer||\n|F E B|5|5|Amma|Madhu, Srividya|M. Krishnan Nair|M. K. Arjunan||\n|F E B|13|6|Appooppan|Thikkurissi Sukumaran Nair, Kamal Haasan|P. Bhaskaran|M. S. Baburaj||\n|F E B|20|7|Srishti|Chowalloor Krishnankutty, Ravi Alummoodu|K. T. Muhammad|M. S. Baburaj||\n|F E B|20|8|Vanadevatha|Prem Nazir, Madhubala|Yusufali Kechery|G. Devarajan||\n|F E B|27|9|Samasya|Madhu, Kamalahaasan|K. Thankappan|Shyam||\n|F E B|27|10|Yudhabhoomi|K. P. Ummer, Vidhubala|Crossbelt Mani|R. K. Shekhar||\n|M A R|5|11|Seemantha Puthran|Prem Nazir, Jayabharathi|A. B. Raj|M. K. Arjunan||\n|M A R|12|12|Swapnadanam|Rani Chandra, Dr. Mohandas|K. G. George|Bhaskar Chandavarkar||\n|M A R|19|13|Thulavarsham|Prem Nazir, sreedevi, Sudheer|N. Sankaran Nair|V. Dakshinamoorthy||\n|M A R|20|14|Aruthu|Kaviyoor Ponnamma, Kamalahasan|Ravi|G. Devarajan||\n|M A R|26|15|Swimming Pool|Kamal Haasan, M. G. Soman|J. Sasikumar|M. K. Arjunan||\n\n# Question\nWhat' s the content in the (1,1) cells\n"  # noqa: E501
+def test_prefix(llm=None, sampling_params=None, prompts=None):
+    start_time = time.time()
+    llm.generate(prompts, sampling_params=sampling_params)
+    end_time = time.time()
+    print(f"cost time {end_time - start_time}")
+def main(args):
+    llm = LLM(model="baichuan-inc/Baichuan2-13B-Chat",
+              tokenizer_mode='auto',
+              trust_remote_code=True,
+              enforce_eager=True,
+              enable_prefix_caching=args.enable_prefix_caching)
+    num_prompts = 100
+    prompts = [PROMPT] * num_prompts
+    sampling_params = SamplingParams(temperature=0, max_tokens=100)
+    print("------warm up------")
+    test_prefix(
+        llm=llm,
+        prompts=prompts[:1],
+        sampling_params=sampling_params,
+    )
+    print("------start generating------")
+    test_prefix(
+        llm=llm,
+        prompts=prompts,
+        sampling_params=sampling_params,
+    )
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description='Benchmark the performance with or without automatic '
+        'prefix caching.')
+    parser.add_argument('--enable-prefix-caching',
+                        action='store_true',
+                        help='enable prefix caching')
+    args = parser.parse_args()
+    main(args)
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
 """Benchmark online serving throughput.
 On the server side, run one of the following commands:
-    (vLLM backend)
+    vLLM OpenAI API server
-    python -m vllm.entrypoints.api_server \
+    python -m vllm.entrypoints.openai.api_server \
        --model <your_model> --swap-space 16 \
        --disable-log-requests
@@ -12,28 +12,30 @@ On the server side, run one of the following commands:
 On the client side, run:
    python benchmarks/benchmark_serving.py \
        --backend <backend> \
-        --tokenizer <your_model> --dataset <target_dataset> \
+        --model <your_model> \
-        --request-rate <request_rate>
+        --dataset-name sharegpt \
+        --dataset-path <path to dataset> \
+        --request-rate <request_rate> \ # By default <request_rate> is inf
+        --num-prompts <num_prompts> # By default <num_prompts> is 1000
 """
 import argparse
 import asyncio
 import json
+import os
 import random
 import time
+import warnings
 from dataclasses import dataclass
 from datetime import datetime
 from typing import AsyncGenerator, List, Tuple
 import numpy as np
+from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
+                                  RequestFuncOutput)
 from tqdm.asyncio import tqdm
 from transformers import PreTrainedTokenizerBase
-from vllm.transformers_utils.tokenizer import get_tokenizer
-from backend_request_func import (
+from vllm.transformers_utils.tokenizer import get_tokenizer
-    ASYNC_REQUEST_FUNCS,
-    RequestFuncInput,
-    RequestFuncOutput,
-)
 @dataclass
@@ -52,7 +54,7 @@ class BenchmarkMetrics:
    p99_tpot_ms: float
-def sample_requests(
+def sample_sharegpt_requests(
    dataset_path: str,
    num_requests: int,
    tokenizer: PreTrainedTokenizerBase,
@@ -100,6 +102,73 @@ def sample_requests(
    return sampled_requests
+def sample_sonnet_requests(
+    dataset_path: str,
+    num_requests: int,
+    input_len: int,
+    output_len: int,
+    prefix_len: int,
+    tokenizer: PreTrainedTokenizerBase,
+) -> List[Tuple[str, str, int, int]]:
+    assert input_len > prefix_len, "input_len must be greater than prefix_len."
+    # Load the dataset.
+    with open(dataset_path) as f:
+        poem_lines = f.readlines()
+    # Tokenize the poem lines.
+    poem_token_ids = tokenizer(poem_lines).input_ids
+    average_poem_len = sum(
+        len(token_ids) for token_ids in poem_token_ids) / len(poem_token_ids)
+    # Base prefix for all requests.
+    base_prompt = "Pick as many lines as you can from these poem lines:\n"
+    base_message = [{
+        "role": "user",
+        "content": base_prompt,
+    }]
+    base_prompt_formatted = tokenizer.apply_chat_template(
+        base_message, add_generation_prompt=True, tokenize=False)
+    base_prompt_offset = len(tokenizer(base_prompt_formatted).input_ids)
+    assert (input_len > base_prompt_offset
+            ), f"Please set 'args.input-len' higher than {base_prompt_offset}."
+    num_input_lines = round(
+        (input_len - base_prompt_offset) / average_poem_len)
+    # First approximately `prefix_len` number of tokens in the
+    # prompt are fixed poem lines.
+    assert (
+        prefix_len > base_prompt_offset
+    ), f"Please set 'args.prefix-len' higher than {base_prompt_offset}."
+    num_prefix_lines = round(
+        (prefix_len - base_prompt_offset) / average_poem_len)
+    prefix_lines = poem_lines[:num_prefix_lines]
+    # Sample the rest of lines per request.
+    sampled_requests: List[Tuple[str, int, int]] = []
+    for _ in range(num_requests):
+        sampled_lines = "".join(
+            prefix_lines +
+            random.sample(poem_lines, num_input_lines - num_prefix_lines))
+        prompt = f"{base_prompt}{sampled_lines}"
+        message = [
+            {
+                "role": "user",
+                "content": prompt,
+            },
+        ]
+        prompt_formatted = tokenizer.apply_chat_template(
+            message, add_generation_prompt=True, tokenize=False)
+        prompt_len = len(tokenizer(prompt_formatted).input_ids)
+        sampled_requests.append(
+            (prompt, prompt_formatted, prompt_len, output_len))
+    return sampled_requests
 async def get_request(
    input_requests: List[Tuple[str, int, int]],
    request_rate: float,
@@ -122,37 +191,42 @@ def calculate_metrics(
    outputs: List[RequestFuncOutput],
    dur_s: float,
    tokenizer: PreTrainedTokenizerBase,
-) -> BenchmarkMetrics:
+) -> Tuple[BenchmarkMetrics, List[int]]:
-    total_output = 0
+    actual_output_lens = []
    total_input = 0
    completed = 0
-    per_token_latencies = []
+    tpots = []
    ttfts = []
    for i in range(len(outputs)):
        if outputs[i].success:
-            output_len = len(tokenizer.encode(outputs[i].generated_text))
+            output_len = len(tokenizer(outputs[i].generated_text).input_ids)
-            total_output += output_len
+            actual_output_lens.append(output_len)
            total_input += input_requests[i][1]
-            per_token_latencies.append(outputs[i].latency / output_len)
+            if output_len > 1:
+                tpots.append(
+                    (outputs[i].latency - outputs[i].ttft) / (output_len - 1))
            ttfts.append(outputs[i].ttft)
            completed += 1
+        else:
+            actual_output_lens.append(0)
    metrics = BenchmarkMetrics(
        completed=completed,
        total_input=total_input,
-        total_output=total_output,
+        total_output=sum(actual_output_lens),
        request_throughput=completed / dur_s,
        input_throughput=total_input / dur_s,
-        output_throughput=total_output / dur_s,
+        output_throughput=sum(actual_output_lens) / dur_s,
-        mean_ttft_ms=np.mean(ttfts) * 1000,
+        mean_ttft_ms=np.mean(ttfts or 0) *
-        median_ttft_ms=np.median(ttfts) * 1000,
+        1000,  # ttfts is empty if streaming is not supported by backend
-        p99_ttft_ms=np.percentile(ttfts, 99) * 1000,
+        median_ttft_ms=np.median(ttfts or 0) * 1000,
-        mean_tpot_ms=np.mean(per_token_latencies) * 1000,
+        p99_ttft_ms=np.percentile(ttfts or 0, 99) * 1000,
-        median_tpot_ms=np.median(per_token_latencies) * 1000,
+        mean_tpot_ms=np.mean(tpots) * 1000,
-        p99_tpot_ms=np.percentile(per_token_latencies, 99) * 1000,
+        median_tpot_ms=np.median(tpots) * 1000,
+        p99_tpot_ms=np.percentile(tpots, 99) * 1000,
    )
-    return metrics
+    return metrics, actual_output_lens
 async def benchmark(
@@ -171,10 +245,10 @@ async def benchmark(
    else:
        raise ValueError(f"Unknown backend: {backend}")
-    pbar = None if disable_tqdm else tqdm(total=len(input_requests))
    print(f"Traffic request rate: {request_rate}")
+    pbar = None if disable_tqdm else tqdm(total=len(input_requests))
    benchmark_start_time = time.perf_counter()
    tasks = []
    async for request in get_request(input_requests, request_rate):
@@ -192,40 +266,53 @@ async def benchmark(
            asyncio.create_task(
                request_func(request_func_input=request_func_input,
                             pbar=pbar)))
-    outputs = await asyncio.gather(*tasks)
+    outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
    if not disable_tqdm:
        pbar.close()
    benchmark_duration = time.perf_counter() - benchmark_start_time
-    metrics = calculate_metrics(
+    metrics, actual_output_lens = calculate_metrics(
        input_requests=input_requests,
        outputs=outputs,
        dur_s=benchmark_duration,
        tokenizer=tokenizer,
    )
-    print(f"Successful requests: {metrics.completed}")
+    print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='='))
-    print(f"Benchmark duration: {benchmark_duration:2f} s")
+    print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
-    print(f"Total input tokens: {metrics.total_input}")
+    print("{:<40} {:<10.2f}".format("Benchmark duration (s):",
-    print(f"Total generated tokens: {metrics.total_output}")
+                                    benchmark_duration))
-    print(f"Request throughput: {metrics.request_throughput:.2f} requests/s")
+    print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
-    print(f"Input token throughput: {metrics.input_throughput:.2f} tokens/s")
+    print("{:<40} {:<10}".format("Total generated tokens:",
-    print(f"Output token throughput: {metrics.output_throughput:.2f} tokens/s")
+                                 metrics.total_output))
-    print(f"Mean TTFT: {metrics.mean_ttft_ms:.2f} ms")
+    print("{:<40} {:<10.2f}".format("Request throughput (req/s):",
-    print(f"Median TTFT: {metrics.median_ttft_ms:.2f} ms")
+                                    metrics.request_throughput))
-    print(f"P99 TTFT: {metrics.p99_ttft_ms:.2f} ms")
+    print("{:<40} {:<10.2f}".format("Input token throughput (tok/s):",
-    print(f"Mean TPOT: {metrics.mean_tpot_ms:.2f} ms")
+                                    metrics.input_throughput))
-    print(f"Median TPOT: {metrics.median_tpot_ms:.2f} ms")
+    print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):",
-    print(f"P99 TPOT: {metrics.p99_tpot_ms:.2f} ms")
+                                    metrics.output_throughput))
+    print("{s:{c}^{n}}".format(s='Time to First Token', n=50, c='-'))
+    print("{:<40} {:<10.2f}".format("Mean TTFT (ms):", metrics.mean_ttft_ms))
+    print("{:<40} {:<10.2f}".format("Median TTFT (ms):",
+                                    metrics.median_ttft_ms))
+    print("{:<40} {:<10.2f}".format("P99 TTFT (ms):", metrics.p99_ttft_ms))
+    print("{s:{c}^{n}}".format(s='Time per Output Token (excl. 1st token)',
+                               n=50,
+                               c='-'))
+    print("{:<40} {:<10.2f}".format("Mean TPOT (ms):", metrics.mean_tpot_ms))
+    print("{:<40} {:<10.2f}".format("Median TPOT (ms):",
+                                    metrics.median_tpot_ms))
+    print("{:<40} {:<10.2f}".format("P99 TPOT (ms):", metrics.p99_tpot_ms))
+    print("=" * 50)
    result = {
        "duration": benchmark_duration,
        "completed": metrics.completed,
        "total_input_tokens": metrics.total_input,
        "total_output_tokens": metrics.total_output,
-        "request_inthroughput": metrics.request_throughput,
+        "request_throughput": metrics.request_throughput,
        "input_throughput": metrics.input_throughput,
        "output_throughput": metrics.output_throughput,
        "mean_ttft_ms": metrics.mean_ttft_ms,
@@ -233,7 +320,13 @@ async def benchmark(
        "p99_ttft_ms": metrics.p99_ttft_ms,
        "mean_tpot_ms": metrics.mean_tpot_ms,
        "median_tpot_ms": metrics.median_tpot_ms,
-        "p99_tpot_ms": metrics.p99_tpot_ms
+        "p99_tpot_ms": metrics.p99_tpot_ms,
+        "input_lens": [output.prompt_len for output in outputs],
+        "output_lens": actual_output_lens,
+        "ttfts": [output.ttft for output in outputs],
+        "itls": [output.itl for output in outputs],
+        "generated_texts": [output.generated_text for output in outputs],
+        "errors": [output.error for output in outputs],
    }
    return result
@@ -254,7 +347,58 @@ def main(args: argparse.Namespace):
    tokenizer = get_tokenizer(tokenizer_id,
                              trust_remote_code=args.trust_remote_code)
-    input_requests = sample_requests(args.dataset, args.num_prompts, tokenizer)
+    if args.dataset is not None:
+        warnings.warn(
+            "The '--dataset' argument will be deprecated in the next "
+            "release. Please use '--dataset-name' and "
+            "'--dataset-path' in the future runs.",
+            stacklevel=2)
+        input_requests = sample_sharegpt_requests(
+            dataset_path=args.dataset,
+            num_requests=args.num_prompts,
+            tokenizer=tokenizer,
+        )
+    elif args.dataset_name == "sharegpt":
+        input_requests = sample_sharegpt_requests(
+            dataset_path=args.dataset_path,
+            num_requests=args.num_prompts,
+            tokenizer=tokenizer,
+        )
+    elif args.dataset_name == "sonnet":
+        # Do not format the prompt, pass to message directly
+        if args.backend == "openai-chat":
+            input_requests = sample_sonnet_requests(
+                dataset_path=args.dataset_path,
+                num_requests=args.num_prompts,
+                input_len=args.input_len,
+                output_len=args.output_len,
+                prefix_len=args.prefix_len,
+                tokenizer=tokenizer,
+            )
+            input_requests = [(prompt, prompt_len, output_len)
+                              for prompt, prompt_formatted, prompt_len,
+                              output_len in input_requests]
+        else:
+            assert (
+                tokenizer.chat_template or tokenizer.default_chat_template
+            ), "Tokenizer/model must have chat template for sonnet dataset."
+            input_requests = sample_sonnet_requests(
+                dataset_path=args.dataset_path,
+                num_requests=args.num_prompts,
+                input_len=args.input_len,
+                output_len=args.output_len,
+                prefix_len=args.prefix_len,
+                tokenizer=tokenizer,
+            )
+            input_requests = [(prompt_formatted, prompt_len, output_len)
+                              for prompt, prompt_formatted, prompt_len,
+                              output_len in input_requests]
+    else:
+        raise ValueError(f"Unknown dataset: {args.dataset_name}")
    benchmark_result = asyncio.run(
        benchmark(
@@ -277,13 +421,23 @@ def main(args: argparse.Namespace):
        current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
        result_json["date"] = current_dt
        result_json["backend"] = backend
-        result_json["version"] = args.version
        result_json["model_id"] = model_id
        result_json["tokenizer_id"] = tokenizer_id
        result_json["best_of"] = args.best_of
        result_json["use_beam_search"] = args.use_beam_search
        result_json["num_prompts"] = args.num_prompts
+        # Metadata
+        if args.metadata:
+            for item in args.metadata:
+                if "=" in item:
+                    kvstring = item.split("=")
+                    result_json[kvstring[0].strip()] = kvstring[1].strip()
+                else:
+                    raise ValueError(
+                        "Invalid metadata format. Please use KEY=VALUE format."
+                    )
        # Traffic
        result_json["request_rate"] = (
            args.request_rate if args.request_rate < float("inf") else "inf")
@@ -293,7 +447,9 @@ def main(args: argparse.Namespace):
        # Save to file
        base_model_id = model_id.split("/")[-1]
-        file_name = f"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"
+        file_name = f"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"  #noqa
+        if args.result_dir:
+            file_name = os.path.join(args.result_dir, file_name)
        with open(file_name, "w") as outfile:
            json.dump(result_json, outfile)
@@ -307,12 +463,6 @@ if __name__ == "__main__":
        default="vllm",
        choices=list(ASYNC_REQUEST_FUNCS.keys()),
    )
-    parser.add_argument(
-        "--version",
-        type=str,
-        default="N/A",
-        help="Version of the serving backend/engine.",
-    )
    parser.add_argument(
        "--base-url",
        type=str,
@@ -324,12 +474,26 @@ if __name__ == "__main__":
    parser.add_argument(
        "--endpoint",
        type=str,
-        default="/generate",
+        default="/v1/completions",
        help="API endpoint.",
    )
-    parser.add_argument("--dataset",
+    parser.add_argument(
+        "--dataset",
+        type=str,
+        default=None,
+        help="Path to the ShareGPT dataset, will be deprecated in the "
+        "next release.",
+    )
+    parser.add_argument(
+        "--dataset-name",
+        type=str,
+        default="sharegpt",
+        choices=["sharegpt", "sonnet"],
+        help="Name of the dataset to benchmark on.",
+    )
+    parser.add_argument("--dataset-path",
                        type=str,
-                        required=True,
+                        default=None,
                        help="Path to the dataset.")
    parser.add_argument(
        "--model",
@@ -341,7 +505,7 @@ if __name__ == "__main__":
        "--tokenizer",
        type=str,
        help=
-        "Name or path of the tokenizer, if not using the default model tokenizer.",
+        "Name or path of the tokenizer, if not using the default tokenizer.",
    )
    parser.add_argument(
        "--best-of",
@@ -357,6 +521,27 @@ if __name__ == "__main__":
        default=1000,
        help="Number of prompts to process.",
    )
+    parser.add_argument(
+        "--sonnet-input-len",
+        type=int,
+        default=550,
+        help=
+        "Number of input tokens per request, used only for sonnet dataset.",
+    )
+    parser.add_argument(
+        "--sonnet-output-len",
+        type=int,
+        default=150,
+        help=
+        "Number of output tokens per request, used only for sonnet dataset.",
+    )
+    parser.add_argument(
+        "--sonnet-prefix-len",
+        type=int,
+        default=200,
+        help=
+        "Number of prefix tokens per request, used only for sonnet dataset.",
+    )
    parser.add_argument(
        "--request-rate",
        type=float,
@@ -382,6 +567,21 @@ if __name__ == "__main__":
        action="store_true",
        help="Specify to save benchmark results to a json file",
    )
+    parser.add_argument(
+        "--metadata",
+        metavar="KEY=VALUE",
+        nargs="*",
+        help="Key-value pairs (e.g, --metadata version=0.3.3 tp=1) "
+        "for metadata of this run to be saved in the result JSON file "
+        "for record keeping purposes.",
+    )
+    parser.add_argument(
+        "--result-dir",
+        type=str,
+        default=None,
+        help="Specify directory to save benchmark json results."
+        "If not specified, results are saved in the current directory.",
+    )
    args = parser.parse_args()
    main(args)
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -6,9 +6,9 @@ import time
 from typing import List, Optional, Tuple
 import torch
+from tqdm import tqdm
 from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          PreTrainedTokenizerBase)
-from tqdm import tqdm
 def sample_requests(
@@ -73,21 +73,25 @@ def run_vllm(
    enforce_eager: bool,
    kv_cache_dtype: str,
    device: str,
+    enable_prefix_caching: bool,
+    gpu_memory_utilization: float = 0.9,
+    download_dir: Optional[str] = None,
 ) -> float:
    from vllm import LLM, SamplingParams
-    llm = LLM(
+    llm = LLM(model=model,
-        model=model,
+              tokenizer=tokenizer,
-        tokenizer=tokenizer,
+              quantization=quantization,
-        quantization=quantization,
+              tensor_parallel_size=tensor_parallel_size,
-        tensor_parallel_size=tensor_parallel_size,
+              seed=seed,
-        seed=seed,
+              trust_remote_code=trust_remote_code,
-        trust_remote_code=trust_remote_code,
+              dtype=dtype,
-        dtype=dtype,
+              max_model_len=max_model_len,
-        max_model_len=max_model_len,
+              gpu_memory_utilization=gpu_memory_utilization,
-        enforce_eager=enforce_eager,
+              enforce_eager=enforce_eager,
-        kv_cache_dtype=kv_cache_dtype,
+              kv_cache_dtype=kv_cache_dtype,
-        device=device,
+              device=device,
-    )
+              enable_prefix_caching=enable_prefix_caching,
+              download_dir=download_dir)
    # Add the requests to the engine.
    for prompt, _, output_len in requests:
@@ -179,13 +183,15 @@ def run_mii(
    tensor_parallel_size: int,
    output_len: int,
 ) -> float:
-    from mii import pipeline
+    from mii import client, serve
-    llm = pipeline(model, tensor_parallel=tensor_parallel_size)
+    llm = serve(model, tensor_parallel=tensor_parallel_size)
    prompts = [prompt for prompt, _, _ in requests]
    start = time.perf_counter()
-    llm(prompts, max_new_tokens=output_len)
+    llm.generate(prompts, max_new_tokens=output_len)
    end = time.perf_counter()
+    client = client(model)
+    client.terminate_server()
    return end - start
@@ -211,7 +217,9 @@ def main(args: argparse.Namespace):
                                args.seed, args.n, args.use_beam_search,
                                args.trust_remote_code, args.dtype,
                                args.max_model_len, args.enforce_eager,
-                                args.kv_cache_dtype, args.device)
+                                args.kv_cache_dtype, args.device,
+                                args.enable_prefix_caching,
+                                args.gpu_memory_utilization, args.download_dir)
    elif args.backend == "hf":
        assert args.tensor_parallel_size == 1
        elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
@@ -286,6 +294,12 @@ if __name__ == "__main__":
        'The "auto" option will use FP16 precision '
        'for FP32 and FP16 models, and BF16 precision '
        'for BF16 models.')
+    parser.add_argument('--gpu-memory-utilization',
+                        type=float,
+                        default=0.9,
+                        help='the fraction of GPU memory to be used for '
+                        'the model executor, which can range from 0 to 1.'
+                        'If unspecified, will use the default value of 0.9.')
    parser.add_argument("--enforce-eager",
                        action="store_true",
                        help="enforce eager execution")
@@ -302,6 +316,15 @@ if __name__ == "__main__":
        default="cuda",
        choices=["cuda"],
        help='device type for vLLM execution, supporting CUDA only currently.')
+    parser.add_argument(
+        "--enable-prefix-caching",
+        action='store_true',
+        help="enable automatic prefix caching for vLLM backend.")
+    parser.add_argument('--download-dir',
+                        type=str,
+                        default=None,
+                        help='directory to download and load the weights, '
+                        'default to the default cache dir of huggingface')
    args = parser.parse_args()
    if args.tokenizer is None:
        args.tokenizer = args.model

--- a/benchmarks/kernels/benchmark_mixtral_moe.py
+++ b/benchmarks/kernels/benchmark_mixtral_moe.py
@@ -2,13 +2,15 @@ import json
 import os
 import sys
-os.environ['CUDA_VISIBLE_DEVICES'] = '0'
-from vllm.model_executor.layers.fused_moe import fused_moe
 import torch
 import torch.nn.functional as F
 import triton
+from vllm.model_executor.layers.fused_moe import (fused_moe,
+                                                  get_config_file_name)
+os.environ['CUDA_VISIBLE_DEVICES'] = '0'
 def main():
    method = fused_moe
@@ -64,7 +66,7 @@ def run_grid(bs, method):
        print(f'{tp_size=} {bs=}')
        print(f'{config}')
        # warmup
-        print(f'warming up')
+        print('warming up')
        try:
            for _ in range(num_warmup_trials):
                run_timing(
@@ -82,7 +84,7 @@ def run_grid(bs, method):
            continue
        # trial
-        print(f'benchmarking')
+        print('benchmarking')
        for _ in range(num_trials):
            kernel_dur_ms = run_timing(
                num_calls=num_calls,
@@ -103,17 +105,25 @@ def run_grid(bs, method):
                best_config = config
                best_time_us = kernel_dur_us
-            print(
+            print(f'{kernel_dur_us=:.1f} {model_dur_ms=:.1f}'
-                f'{kernel_dur_us=:.1f} {model_dur_ms=:.1f} {bs=} {tp_size=} {top_k=} {num_total_experts=} {d_model=} {model_intermediate_size=} {num_layers=}'
+                  f' {bs=} {tp_size=} {top_k=} {num_total_experts=} '
-            )
+                  f'{d_model=} {model_intermediate_size=} {num_layers=}')
    print("best_time_us", best_time_us)
    print("best_config", best_config)
-    filename = "/tmp/config.jsonl"
+    # holds Dict[str, Dict[str, int]]
+    filename = get_config_file_name(num_total_experts,
+                                    model_intermediate_size // tp_size)
    print(f"writing config to file {filename}")
-    with open(filename, "a") as f:
+    existing_content = {}
-        f.write(json.dumps({str(bs): best_config}) + "\n")
+    if os.path.exists(filename):
+        with open(filename, "r") as f:
+            existing_content = json.load(f)
+    existing_content[str(bs)] = best_config
+    with open(filename, "w") as f:
+        json.dump(existing_content, f, indent=4)
+        f.write("\n")
 def run_timing(num_calls: int, bs: int, d_model: int, num_total_experts: int,

--- a/benchmarks/kernels/benchmark_paged_attention.py
+++ b/benchmarks/kernels/benchmark_paged_attention.py
-from typing import Optional
 import argparse
 import random
 import time
+from typing import Optional
 import torch
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, create_kv_caches_with_random
 from vllm._C import ops
+from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, create_kv_caches_with_random
 NUM_BLOCKS = 1024
 PARTITION_SIZE = 512

--- a/benchmarks/kernels/benchmark_rope.py
+++ b/benchmarks/kernels/benchmark_rope.py
+import argparse
+from itertools import accumulate
+from typing import Optional
+import nvtx
+import torch
+from vllm.model_executor.layers.rotary_embedding import get_rope
+def benchmark_rope_kernels_multi_lora(
+    is_neox_style: bool,
+    batch_size: int,
+    seq_len: int,
+    num_heads: int,
+    head_size: int,
+    rotary_dim: Optional[int],
+    dtype: torch.dtype,
+    seed: int,
+    device: str,
+    max_position: int = 8192,
+    base: int = 10000,
+) -> None:
+    torch.random.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+    torch.set_default_device(device)
+    if rotary_dim is None:
+        rotary_dim = head_size
+    # silulating serving 4 LoRAs
+    scaling_factors = [1, 2, 4, 8]
+    # batched RoPE can take multiple scaling factors
+    batched_rope = get_rope(head_size, rotary_dim, max_position, base,
+                            is_neox_style, {
+                                "type": "linear",
+                                "factor": tuple(scaling_factors)
+                            })
+    # non-batched RoPE takes only one scaling factor, we create multiple
+    # instances to simulate the same behavior
+    non_batched_ropes = []
+    for scaling_factor in scaling_factors:
+        non_batched_ropes.append(
+            get_rope(head_size, rotary_dim, max_position, base, is_neox_style,
+                     {
+                         "type": "linear",
+                         "factor": (scaling_factor, )
+                     }))
+    positions = torch.randint(0, max_position, (batch_size, seq_len))
+    query = torch.randn(batch_size,
+                        seq_len,
+                        num_heads * head_size,
+                        dtype=dtype)
+    key = torch.randn_like(query)
+    # create query offsets for batched RoPE, we concat multiple kv cache
+    # together and each query needs to find the right kv cache of its type
+    offset_map = torch.tensor(
+        list(
+            accumulate([0] + [
+                max_position * scaling_factor * 2
+                for scaling_factor in scaling_factors[:-1]
+            ])))
+    query_types = torch.randint(0,
+                                len(scaling_factors), (batch_size, seq_len),
+                                device=device)
+    # map query types to offsets
+    query_offsets = offset_map[query_types]
+    # the kernel takes flattened offsets
+    flatten_offsets = query_offsets.flatten()
+    # batched queries of the same type together for non-batched RoPE
+    queries = [query[query_types == i] for i in range(len(scaling_factors))]
+    keys = [key[query_types == i] for i in range(len(scaling_factors))]
+    packed_qkr = zip(queries, keys, non_batched_ropes)
+    # synchronize before start timing
+    torch.cuda.synchronize()
+    with nvtx.annotate("non-batched", color="yellow"):
+        for q, k, r in packed_qkr:
+            r.forward(positions, q, k)
+    torch.cuda.synchronize()
+    with nvtx.annotate("batched", color="green"):
+        batched_rope.forward(positions, query, key, flatten_offsets)
+    torch.cuda.synchronize()
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description="Benchmark the rotary embedding kernels.")
+    parser.add_argument("--is-neox-style", type=bool, default=True)
+    parser.add_argument("--batch-size", type=int, default=16)
+    parser.add_argument("--seq-len", type=int, default=512)
+    parser.add_argument("--num-heads", type=int, default=8)
+    parser.add_argument("--head-size",
+                        type=int,
+                        choices=[64, 80, 96, 112, 128, 256],
+                        default=128)
+    parser.add_argument("--rotary-dim", type=int, choices=[16, 32], default=32)
+    parser.add_argument("--dtype",
+                        type=str,
+                        choices=["bfloat16", "float"],
+                        default="float")
+    parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument("--device",
+                        type=str,
+                        choices=["cuda:0", "cuda:1"],
+                        default="cuda:0")
+    args = parser.parse_args()
+    print(args)
+    benchmark_rope_kernels_multi_lora(
+        is_neox_style=args.is_neox_style,
+        batch_size=args.batch_size,
+        seq_len=args.seq_len,
+        num_heads=args.num_heads,
+        head_size=args.head_size,
+        rotary_dim=args.rotary_dim,
+        dtype=getattr(torch, args.dtype),
+        seed=args.seed,
+        device=args.device,
+    )
--- a/benchmarks/sonnet.txt
+++ b/benchmarks/sonnet.txt
--- a/cmake/hipify.py
+++ b/cmake/hipify.py
+#!/usr/bin/env python3
+#
+# A command line tool for running pytorch's hipify preprocessor on CUDA
+# source files.
+#
+# See https://github.com/ROCm/hipify_torch
+# and <torch install dir>/utils/hipify/hipify_python.py
+#
+import argparse
+import os
+import shutil
+from torch.utils.hipify.hipify_python import hipify
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    # Project directory where all the source + include files live.
+    parser.add_argument(
+        "-p",
+        "--project_dir",
+        help="The project directory.",
+    )
+    # Directory where hipified files are written.
+    parser.add_argument(
+        "-o",
+        "--output_dir",
+        help="The output directory.",
+    )
+    # Source files to convert.
+    parser.add_argument("sources",
+                        help="Source files to hipify.",
+                        nargs="*",
+                        default=[])
+    args = parser.parse_args()
+    # Limit include scope to project_dir only
+    includes = [os.path.join(args.project_dir, '*')]
+    # Get absolute path for all source files.
+    extra_files = [os.path.abspath(s) for s in args.sources]
+    # Copy sources from project directory to output directory.
+    # The directory might already exist to hold object files so we ignore that.
+    shutil.copytree(args.project_dir, args.output_dir, dirs_exist_ok=True)
+    hipify_result = hipify(project_directory=args.project_dir,
+                           output_directory=args.output_dir,
+                           header_include_dirs=[],
+                           includes=includes,
+                           extra_files=extra_files,
+                           show_detailed=True,
+                           is_pytorch_extension=True,
+                           hipify_extra_files_only=True)
+    hipified_sources = []
+    for source in args.sources:
+        s_abs = os.path.abspath(source)
+        hipified_s_abs = (hipify_result[s_abs].hipified_path if
+                          (s_abs in hipify_result
+                           and hipify_result[s_abs].hipified_path is not None)
+                          else s_abs)
+        hipified_sources.append(hipified_s_abs)
+    assert (len(hipified_sources) == len(args.sources))
+    # Print hipified source files.
+    print("\n".join(hipified_sources))
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
+#
+# Attempt to find the python package that uses the same python executable as
+# `EXECUTABLE` and is one of the `SUPPORTED_VERSIONS`.
+#
+macro (find_python_from_executable EXECUTABLE SUPPORTED_VERSIONS)
+  file(REAL_PATH ${EXECUTABLE} EXECUTABLE)
+  set(Python_EXECUTABLE ${EXECUTABLE})
+  find_package(Python COMPONENTS Interpreter Development.Module)
+  if (NOT Python_FOUND)
+    message(FATAL_ERROR "Unable to find python matching: ${EXECUTABLE}.")
+  endif()
+  set(_VER "${Python_VERSION_MAJOR}.${Python_VERSION_MINOR}")
+  set(_SUPPORTED_VERSIONS_LIST ${SUPPORTED_VERSIONS} ${ARGN})
+  if (NOT _VER IN_LIST _SUPPORTED_VERSIONS_LIST)
+    message(FATAL_ERROR
+      "Python version (${_VER}) is not one of the supported versions: "
+      "${_SUPPORTED_VERSIONS_LIST}.")
+  endif()
+  message(STATUS "Found python matching: ${EXECUTABLE}.")
+endmacro()
+#
+# Run `EXPR` in python.  The standard output of python is stored in `OUT` and
+# has trailing whitespace stripped.  If an error is encountered when running
+# python, a fatal message `ERR_MSG` is issued.
+#
+function (run_python OUT EXPR ERR_MSG)
+  execute_process(
+    COMMAND
+    "${Python_EXECUTABLE}" "-c" "${EXPR}"
+    OUTPUT_VARIABLE PYTHON_OUT
+    RESULT_VARIABLE PYTHON_ERROR_CODE
+    ERROR_VARIABLE PYTHON_STDERR
+    OUTPUT_STRIP_TRAILING_WHITESPACE)
+  if(NOT PYTHON_ERROR_CODE EQUAL 0)
+    message(FATAL_ERROR "${ERR_MSG}: ${PYTHON_STDERR}")
+  endif()
+  set(${OUT} ${PYTHON_OUT} PARENT_SCOPE)
+endfunction()
+# Run `EXPR` in python after importing `PKG`. Use the result of this to extend
+# `CMAKE_PREFIX_PATH` so the torch cmake configuration can be imported.
+macro (append_cmake_prefix_path PKG EXPR)
+  run_python(_PREFIX_PATH
+    "import ${PKG}; print(${EXPR})" "Failed to locate ${PKG} path")
+  list(APPEND CMAKE_PREFIX_PATH ${_PREFIX_PATH})
+endmacro()
+#
+# Add a target named `hipify${NAME}` that runs the hipify preprocessor on a set
+# of CUDA source files. The names of the corresponding "hipified" sources are
+# stored in `OUT_SRCS`.
+#
+function (hipify_sources_target OUT_SRCS NAME ORIG_SRCS)
+  #
+  # Split into C++ and non-C++ (i.e. CUDA) sources.
+  #
+  set(SRCS ${ORIG_SRCS})
+  set(CXX_SRCS ${ORIG_SRCS})
+  list(FILTER SRCS EXCLUDE REGEX "\.(cc)|(cpp)$")
+  list(FILTER CXX_SRCS INCLUDE REGEX "\.(cc)|(cpp)$")
+  #
+  # Generate ROCm/HIP source file names from CUDA file names.
+  # Since HIP files are generated code, they will appear in the build area
+  # `CMAKE_CURRENT_BINARY_DIR` directory rather than the original csrc dir.
+  #
+  set(HIP_SRCS)
+  foreach (SRC ${SRCS})
+    string(REGEX REPLACE "\.cu$" "\.hip" SRC ${SRC})
+    string(REGEX REPLACE "cuda" "hip" SRC ${SRC})
+    list(APPEND HIP_SRCS "${CMAKE_CURRENT_BINARY_DIR}/${SRC}")
+  endforeach()
+  set(CSRC_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/csrc)
+  add_custom_target(
+    hipify${NAME}
+    COMMAND ${CMAKE_SOURCE_DIR}/cmake/hipify.py -p ${CMAKE_SOURCE_DIR}/csrc -o ${CSRC_BUILD_DIR} ${SRCS}
+    DEPENDS ${CMAKE_SOURCE_DIR}/cmake/hipify.py ${SRCS}
+    BYPRODUCTS ${HIP_SRCS}
+    COMMENT "Running hipify on ${NAME} extension source files.")
+  # Swap out original extension sources with hipified sources.
+  list(APPEND HIP_SRCS ${CXX_SRCS})
+  set(${OUT_SRCS} ${HIP_SRCS} PARENT_SCOPE)
+endfunction()
+#
+# Get additional GPU compiler flags from torch.
+#
+function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG)
+  if (${GPU_LANG} STREQUAL "CUDA")
+    #
+    # Get common NVCC flags from torch.
+    #
+    run_python(GPU_FLAGS
+      "from torch.utils.cpp_extension import COMMON_NVCC_FLAGS; print(';'.join(COMMON_NVCC_FLAGS))"
+      "Failed to determine torch nvcc compiler flags")
+    if (CUDA_VERSION VERSION_GREATER_EQUAL 11.8)
+      list(APPEND GPU_FLAGS "-DENABLE_FP8_E5M2")
+    endif()
+  elseif(${GPU_LANG} STREQUAL "HIP")
+    #
+    # Get common HIP/HIPCC flags from torch.
+    #
+    run_python(GPU_FLAGS
+      "import torch.utils.cpp_extension as t; print(';'.join(t.COMMON_HIP_FLAGS + t.COMMON_HIPCC_FLAGS))"
+      "Failed to determine torch nvcc compiler flags")
+    list(APPEND GPU_FLAGS
+      "-DUSE_ROCM"
+      "-U__HIP_NO_HALF_CONVERSIONS__"
+      "-U__HIP_NO_HALF_OPERATORS__"
+      "-fno-gpu-rdc")
+  endif()
+  set(${OUT_GPU_FLAGS} ${GPU_FLAGS} PARENT_SCOPE)
+endfunction()
+# Macro for converting a `gencode` version number to a cmake version number.
+macro(string_to_ver OUT_VER IN_STR)
+  string(REGEX REPLACE "\([0-9]+\)\([0-9]\)" "\\1.\\2" ${OUT_VER} ${IN_STR})
+endmacro()
+#
+# Override the GPU architectures detected by cmake/torch and filter them by
+# `GPU_SUPPORTED_ARCHES`. Sets the final set of architectures in
+# `GPU_ARCHES`.
+#
+# Note: this is defined as a macro since it updates `CMAKE_CUDA_FLAGS`.
+#
+macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES)
+  set(_GPU_SUPPORTED_ARCHES_LIST ${GPU_SUPPORTED_ARCHES} ${ARGN})
+  message(STATUS "${GPU_LANG} supported arches: ${_GPU_SUPPORTED_ARCHES_LIST}")
+  if (${GPU_LANG} STREQUAL "HIP")
+    #
+    # `GPU_ARCHES` controls the `--offload-arch` flags.
+    # `CMAKE_HIP_ARCHITECTURES` is set up by torch and can be controlled
+    # via the `PYTORCH_ROCM_ARCH` env variable.
+    #
+    #
+    # Find the intersection of the supported + detected architectures to
+    # set the module architecture flags.
+    #
+    set(${GPU_ARCHES})
+    foreach (_ARCH ${CMAKE_HIP_ARCHITECTURES})
+      if (_ARCH IN_LIST _GPU_SUPPORTED_ARCHES_LIST)
+        list(APPEND ${GPU_ARCHES} ${_ARCH})
+      endif()
+    endforeach()
+    if(NOT ${GPU_ARCHES})
+      message(FATAL_ERROR
+        "None of the detected ROCm architectures: ${CMAKE_HIP_ARCHITECTURES} is"
+        " supported. Supported ROCm architectures are: ${_GPU_SUPPORTED_ARCHES_LIST}.")
+    endif()
+  elseif(${GPU_LANG} STREQUAL "CUDA")
+    #
+    # Setup/process CUDA arch flags.
+    #
+    # The torch cmake setup hardcodes the detected architecture flags in
+    # `CMAKE_CUDA_FLAGS`.  Since `CMAKE_CUDA_FLAGS` is a "global" variable, it
+    # can't modified on a per-target basis, e.g. for the `punica` extension.
+    # So, all the `-gencode` flags need to be extracted and removed from
+    # `CMAKE_CUDA_FLAGS` for processing so they can be passed by another method.
+    # Since it's not possible to use `target_compiler_options` for adding target
+    # specific `-gencode` arguments, the target's `CUDA_ARCHITECTURES` property
+    # must be used instead.  This requires repackaging the architecture flags
+    # into a format that cmake expects for `CUDA_ARCHITECTURES`.
+    #
+    # This is a bit fragile in that it depends on torch using `-gencode` as opposed
+    # to one of the other nvcc options to specify architectures.
+    #
+    # Note: torch uses the `TORCH_CUDA_ARCH_LIST` environment variable to override
+    # detected architectures.
+    #
+    message(DEBUG "initial CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}")
+    # Extract all `-gencode` flags from `CMAKE_CUDA_FLAGS`
+    string(REGEX MATCHALL "-gencode arch=[^ ]+" _CUDA_ARCH_FLAGS
+      ${CMAKE_CUDA_FLAGS})
+    # Remove all `-gencode` flags from `CMAKE_CUDA_FLAGS` since they will be modified
+    # and passed back via the `CUDA_ARCHITECTURES` property.
+    string(REGEX REPLACE "-gencode arch=[^ ]+ *" "" CMAKE_CUDA_FLAGS
+      ${CMAKE_CUDA_FLAGS})
+    # If this error is triggered, it might mean that torch has changed how it sets
+    # up nvcc architecture code generation flags.
+    if (NOT _CUDA_ARCH_FLAGS)
+      message(FATAL_ERROR
+        "Could not find any architecture related code generation flags in "
+        "CMAKE_CUDA_FLAGS. (${CMAKE_CUDA_FLAGS})")
+    endif()
+    message(DEBUG "final CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}")
+    message(DEBUG "arch flags: ${_CUDA_ARCH_FLAGS}")
+    # Initialize the architecture lists to empty.
+    set(${GPU_ARCHES})
+    # Process each `gencode` flag.
+    foreach(_ARCH ${_CUDA_ARCH_FLAGS})
+      # For each flag, extract the version number and whether it refers to PTX
+      # or native code.
+      # Note: if a regex matches then `CMAKE_MATCH_1` holds the binding
+      # for that match.
+      string(REGEX MATCH "arch=compute_\([0-9]+a?\)" _COMPUTE ${_ARCH})
+      if (_COMPUTE)
+        set(_COMPUTE ${CMAKE_MATCH_1})
+      endif()
+      string(REGEX MATCH "code=sm_\([0-9]+a?\)" _SM ${_ARCH})
+      if (_SM)
+        set(_SM ${CMAKE_MATCH_1})
+      endif()
+      string(REGEX MATCH "code=compute_\([0-9]+a?\)" _CODE ${_ARCH})
+      if (_CODE)
+        set(_CODE ${CMAKE_MATCH_1})
+      endif()
+      # Make sure the virtual architecture can be matched.
+      if (NOT _COMPUTE)
+        message(FATAL_ERROR
+          "Could not determine virtual architecture from: ${_ARCH}.")
+      endif()
+      # One of sm_ or compute_ must exist.
+      if ((NOT _SM) AND (NOT _CODE))
+        message(FATAL_ERROR
+          "Could not determine a codegen architecture from: ${_ARCH}.")
+      endif()
+      if (_SM)
+        # -real suffix let CMake to only generate elf code for the kernels.
+        # we want this, otherwise the added ptx (default) will increase binary size.
+        set(_VIRT "-real")
+        set(_CODE_ARCH ${_SM})
+      else()
+        # -virtual suffix let CMake to generate ptx code for the kernels.
+        set(_VIRT "-virtual")
+        set(_CODE_ARCH ${_CODE})
+      endif()
+      # Check if the current version is in the supported arch list.
+      string_to_ver(_CODE_VER ${_CODE_ARCH})
+      if (NOT _CODE_VER IN_LIST _GPU_SUPPORTED_ARCHES_LIST)
+        message(STATUS "discarding unsupported CUDA arch ${_VER}.")
+        continue()
+      endif()
+      # Add it to the arch list.
+      list(APPEND ${GPU_ARCHES} "${_CODE_ARCH}${_VIRT}")
+    endforeach()
+  endif()
+  message(STATUS "${GPU_LANG} target arches: ${${GPU_ARCHES}}")
+endmacro()
+#
+# Define a target named `GPU_MOD_NAME` for a single extension. The
+# arguments are:
+#
+# DESTINATION <dest>         - Module destination directory.
+# LANGUAGE <lang>            - The GPU language for this module, e.g CUDA, HIP,
+#                              etc.
+# SOURCES <sources>          - List of source files relative to CMakeLists.txt
+#                              directory.
+#
+# Optional arguments:
+#
+# ARCHITECTURES <arches>     - A list of target GPU architectures in cmake
+#                              format.
+#                              Refer `CMAKE_CUDA_ARCHITECTURES` documentation
+#                              and `CMAKE_HIP_ARCHITECTURES` for more info.
+#                              ARCHITECTURES will use cmake's defaults if
+#                              not provided.
+# COMPILE_FLAGS <flags>      - Extra compiler flags passed to NVCC/hip.
+# INCLUDE_DIRECTORIES <dirs> - Extra include directories.
+# LIBRARIES <libraries>      - Extra link libraries.
+# WITH_SOABI                 - Generate library with python SOABI suffix name.
+#
+# Note: optimization level/debug info is set via cmake build type.
+#
+function (define_gpu_extension_target GPU_MOD_NAME)
+  cmake_parse_arguments(PARSE_ARGV 1
+    GPU
+    "WITH_SOABI"
+    "DESTINATION;LANGUAGE"
+    "SOURCES;ARCHITECTURES;COMPILE_FLAGS;INCLUDE_DIRECTORIES;LIBRARIES")
+  # Add hipify preprocessing step when building with HIP/ROCm.
+  if (GPU_LANGUAGE STREQUAL "HIP")
+    hipify_sources_target(GPU_SOURCES ${GPU_MOD_NAME} "${GPU_SOURCES}")
+  endif()
+  if (GPU_WITH_SOABI)
+    set(GPU_WITH_SOABI WITH_SOABI)
+  else()
+    set(GPU_WITH_SOABI)
+  endif()
+  Python_add_library(${GPU_MOD_NAME} MODULE "${GPU_SOURCES}" ${GPU_WITH_SOABI})
+  if (GPU_LANGUAGE STREQUAL "HIP")
+    # Make this target dependent on the hipify preprocessor step.
+    add_dependencies(${GPU_MOD_NAME} hipify${GPU_MOD_NAME})
+  endif()
+  if (GPU_ARCHITECTURES)
+    set_target_properties(${GPU_MOD_NAME} PROPERTIES
+      ${GPU_LANGUAGE}_ARCHITECTURES "${GPU_ARCHITECTURES}")
+  endif()
+  set_property(TARGET ${GPU_MOD_NAME} PROPERTY CXX_STANDARD 17)
+  target_compile_options(${GPU_MOD_NAME} PRIVATE
+    $<$<COMPILE_LANGUAGE:${GPU_LANGUAGE}>:${GPU_COMPILE_FLAGS}>)
+  target_compile_definitions(${GPU_MOD_NAME} PRIVATE
+    "-DTORCH_EXTENSION_NAME=${GPU_MOD_NAME}")
+  target_include_directories(${GPU_MOD_NAME} PRIVATE csrc
+    ${GPU_INCLUDE_DIRECTORIES})
+  target_link_libraries(${GPU_MOD_NAME} PRIVATE torch ${torch_python_LIBRARY}
+    ${GPU_LIBRARIES})
+  # Don't use `TORCH_LIBRARIES` for CUDA since it pulls in a bunch of
+  # dependencies that are not necessary and may not be installed.
+  if (GPU_LANGUAGE STREQUAL "CUDA")
+    target_link_libraries(${GPU_MOD_NAME} PRIVATE ${CUDA_CUDA_LIB}
+      ${CUDA_LIBRARIES})
+  else()
+    target_link_libraries(${GPU_MOD_NAME} PRIVATE ${TORCH_LIBRARIES})
+  endif()
+  install(TARGETS ${GPU_MOD_NAME} LIBRARY DESTINATION ${GPU_DESTINATION})
+endfunction()
--- a/collect_env.py
+++ b/collect_env.py
--- a/csrc/activation_kernels.cu
+++ b/csrc/activation_kernels.cu
@@ -33,12 +33,25 @@ template<typename T>
 __device__ __forceinline__ T gelu_kernel(const T& x) {
  // Equivalent to PyTorch GELU with 'none' approximation.
  // Refer to:
-  // https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L38
+  // https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L36-L38
  const float f = (float) x;
  constexpr float ALPHA = M_SQRT1_2;
  return (T) (f * 0.5f * (1.0f + ::erf(f * ALPHA)));
 }
+template<typename T>
+__device__ __forceinline__ T gelu_tanh_kernel(const T& x) {
+  // Equivalent to PyTorch GELU with 'tanh' approximation.
+  // Refer to:
+  // https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L25-L30
+  const float f = (float) x;
+  constexpr float BETA = M_SQRT2 * M_2_SQRTPI * 0.5f;
+  constexpr float KAPPA = 0.044715;
+  float x_cube = f * f * f;
+  float inner = BETA * (f + KAPPA * x_cube);
+  return (T) (0.5f * f * (1.0f + ::tanhf(inner)));
+}
 } // namespace vllm
 // Launch activation and gating kernel.
@@ -73,6 +86,13 @@ void gelu_and_mul(
  LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_kernel);
 }
+void gelu_tanh_and_mul(
+  torch::Tensor& out,      // [..., d]
+  torch::Tensor& input)    // [..., 2 * d]
+{
+  LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_tanh_kernel);
+}
 namespace vllm {
 // Element-wise activation kernel template.

--- a/csrc/attention/attention_kernels.cu
+++ b/csrc/attention/attention_kernels.cu
@@ -15,9 +15,6 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
-#ifdef USE_ROCM
-#include <hip/hip_runtime.h>
-#endif
 #include <torch/extension.h>
 #include <ATen/cuda/CUDAContext.h>
@@ -31,11 +28,6 @@
 #include <algorithm>
-#ifndef USE_ROCM
-#define WARP_SIZE 32
-#else
-#define WARP_SIZE warpSize
-#endif
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b))

--- a/csrc/cuda_compat.h
+++ b/csrc/cuda_compat.h
 #pragma once
+#ifdef USE_ROCM
+#include <hip/hip_runtime.h>
+#endif
+#ifndef USE_ROCM
+  #define WARP_SIZE 32
+#else
+  #define WARP_SIZE warpSize
+#endif
 #ifndef USE_ROCM
  #define VLLM_LDG(arg) __ldg(arg)
 #else

--- a/csrc/custom_all_reduce.cu
+++ b/csrc/custom_all_reduce.cu
@@ -29,7 +29,7 @@ fptr_t init_custom_ar(torch::Tensor &meta, torch::Tensor &rank_data,
    std::memcpy(&ipc_handles[i], handles[i].data(), sizeof(cudaIpcMemHandle_t));
  }
  return (fptr_t) new vllm::CustomAllreduce(
-      reinterpret_cast<vllm::Metadata *>(meta.data_ptr()), rank_data.data_ptr(),
+      reinterpret_cast<vllm::Signal *>(meta.data_ptr()), rank_data.data_ptr(),
      rank_data.numel(), ipc_handles, offsets, rank, full_nvlink);
 }
@@ -62,9 +62,9 @@ bool should_custom_ar(torch::Tensor &inp, int max_size, int world_size,
  if (inp_size % 16 != 0) return false;
  if (!_is_weak_contiguous(inp)) return false;
  if (world_size == 2 || full_nvlink) return inp_size <= max_size;
-  // 4 PCIE GPUs use 2 stage allreduce, and is only faster than NCCL when size
+  // for 4 or more non NVLink-capable GPUs, custom allreduce provides little
-  // <= 512k
+  // performance improvement over NCCL.
-  return world_size <= 4 && inp_size <= 512 * 1024;
+  return false;
 }
 void _all_reduce(fptr_t _fa, torch::Tensor &inp, torch::Tensor &out,
@@ -126,7 +126,7 @@ void dispose(fptr_t _fa) {
  delete fa;
 }
-int meta_size() { return sizeof(vllm::Metadata); }
+int meta_size() { return sizeof(vllm::Signal); }
 void register_buffer(fptr_t _fa, torch::Tensor &t,
                     const std::vector<std::string> &handles,