update v0.6.2

b7374ad4 · zhuwenwen · 57d61ec2 · b7374ad4 · b7374ad4 · 57d61ec2
Commit b7374ad4 authored Dec 11, 2024 by zhuwenwen
20 changed files
--- a/Dockerfile
+++ b/Dockerfile
-FROM image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-ubuntu20.04-dtk24.04.2-py3.10
+FROM image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.3.0-py3.10-dtk24.04.3-ubuntu20.04-vllm0.6
 ENV LANG C.UTF-8
\ No newline at end of file
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 * @Author: zhuww
 * @email: zhuww@sugon.com
 * @Date: 2024-04-25 10:38:07
- * @LastEditTime: 2024-09-29 17:50:01
+ * @LastEditTime: 2024-12-11 17:18:01
 -->
 # LLAMA

@@ -28,7 +28,7 @@ LLama是一个基础语言模型的集合,参数范围从7B到65B。在数万亿
 提供[光源](https://www.sourcefind.cn/#/image/dcu/custom)拉取推理的docker镜像：

 ```
-docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-ubuntu20.04-dtk24.04.2-py3.10
+docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.3.0-py3.10-dtk24.04.3-ubuntu20.04-vllm0.6
 # <Image ID>用上面拉取docker镜像的ID替换
 # <Host Path>主机端路径
 # <Container Path>容器映射路径
@@ -50,13 +50,12 @@ docker run -it --name llama_vllm --privileged --shm-size=64G  --device=/dev/kfd
 conda create -n llama_vllm python=3.10
 ```
 关于本项目DCU显卡所需的特殊深度学习库可从[光合](https://developer.hpccube.com/tool/)开发者社区下载安装。
-* DTK驱动：dtk24.04.2
-* Pytorch: 2.1.0
-* triton:2.1.0
-* lmslim: 0.1.0
-* xformers: 0.0.25
-* flash_attn: 2.0.4
-* vllm: 0.5.0
+* DTK驱动：dtk24.04.3
+* Pytorch: 2.3.0
+* triton: 2.1.0
+* lmslim: 0.1.2
+* flash_attn: 2.6.1
+* vllm: 0.6.2
 * python: python3.10

 `Tips：需先安装相关依赖，最后安装vllm包`
@@ -121,7 +120,7 @@ python benchmarks/benchmark_serving.py --model meta-llama/Llama-2-7b-chat-hf --d
 ### OpenAI兼容服务
 启动服务：
 ```bash
-python -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf --enforce-eager --dtype float16 --trust-remote-code
+vllm serve meta-llama/Llama-2-7b-chat-hf --enforce-eager --dtype float16 --trust-remote-code
 ```
 这里模型路径直接作为`vllm serve`的位置参数传入（旧命令中对应`--model`），`--dtype`为数据类型：float16，默认情况使用tokenizer中的预定义聊天模板，`--chat-template`可以添加新模板覆盖默认模板，`-q gptq`为使用gptq量化模型进行推理。

@@ -189,7 +188,7 @@ ssh -L 8000:计算节点IP:8000 -L 8001:计算节点IP:8001 用户名@登录节
 3.启动OpenAI兼容服务

 ```
-python -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf --enforce-eager --dtype float16 --trust-remote-code --port 8000 --host "0.0.0.0"
+vllm serve meta-llama/Llama-2-7b-chat-hf --enforce-eager --dtype float16 --trust-remote-code --port 8000 --host "0.0.0.0"
 ```

 4.启动gradio服务

--- a/benchmarks/README.md
+++ b/benchmarks/README.md
-# Benchmarking vLLM
-
-## Downloading the ShareGPT dataset
-
-You can download the dataset by running:
-```bash
-wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-```
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
-import json
-import os
-import sys
-import time
-import traceback
-from dataclasses import dataclass, field
-from typing import List, Optional
-
-import aiohttp
-from tqdm.asyncio import tqdm
-
-AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
-
-
-@dataclass
-class RequestFuncInput:
-    prompt: str
-    api_url: str
-    prompt_len: int
-    output_len: int
-    model: str
-    best_of: int = 1
-    use_beam_search: bool = False
-
-
-@dataclass
-class RequestFuncOutput:
-    generated_text: str = ""
-    success: bool = False
-    latency: float = 0.0
-    ttft: float = 0.0  # Time to first token
-    itl: List[float] = field(
-        default_factory=list)  # List of inter-token latencies
-    prompt_len: int = 0
-    error: str = ""
-
-
-async def async_request_tgi(
-    request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
-) -> RequestFuncOutput:
-    api_url = request_func_input.api_url
-    assert api_url.endswith("generate_stream")
-
-    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
-        assert not request_func_input.use_beam_search
-        params = {
-            "best_of": request_func_input.best_of,
-            "max_new_tokens": request_func_input.output_len,
-            "do_sample": True,
-            "temperature": 0.01,  # TGI does not accept 0.0 temperature.
-            "top_p": 0.99,  # TGI does not accept 1.0 top_p.
-        }
-        payload = {
-            "inputs": request_func_input.prompt,
-            "parameters": params,
-        }
-        output = RequestFuncOutput()
-        output.prompt_len = request_func_input.prompt_len
-
-        ttft = 0.0
-        st = time.perf_counter()
-        most_recent_timestamp = st
-        try:
-            async with session.post(url=api_url, json=payload) as response:
-                if response.status == 200:
-                    async for chunk_bytes in response.content:
-                        chunk_bytes = chunk_bytes.strip()
-                        if not chunk_bytes:
-                            continue
-
-                        chunk = remove_prefix(chunk_bytes.decode("utf-8"),
-                                              "data:")
-
-                        data = json.loads(chunk)
-                        timestamp = time.perf_counter()
-                        # First token
-                        if ttft == 0.0:
-                            ttft = time.perf_counter() - st
-                            output.ttft = ttft
-
-                        # Decoding phase
-                        else:
-                            output.itl.append(timestamp -
-                                              most_recent_timestamp)
-
-                        most_recent_timestamp = timestamp
-
-                    output.latency = most_recent_timestamp - st
-                    output.success = True
-                    output.generated_text = data["generated_text"]
-                else:
-                    output.error = response.reason or ""
-                    output.success = False
-        except Exception:
-            output.success = False
-            exc_info = sys.exc_info()
-            output.error = "".join(traceback.format_exception(*exc_info))
-
-        if pbar:
-            pbar.update(1)
-        return output
-
-
-async def async_request_trt_llm(
-    request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
-) -> RequestFuncOutput:
-    api_url = request_func_input.api_url
-    assert api_url.endswith("generate_stream")
-
-    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
-        assert not request_func_input.use_beam_search
-        assert request_func_input.best_of == 1
-        payload = {
-            "accumulate_tokens": True,
-            "text_input": request_func_input.prompt,
-            "temperature": 0.0,
-            "top_p": 1.0,
-            "max_tokens": request_func_input.output_len,
-            "stream": True,
-        }
-        output = RequestFuncOutput()
-        output.prompt_len = request_func_input.prompt_len
-
-        ttft = 0.0
-        st = time.perf_counter()
-        most_recent_timestamp = st
-        try:
-            async with session.post(url=api_url, json=payload) as response:
-                if response.status == 200:
-                    async for chunk_bytes in response.content:
-                        chunk_bytes = chunk_bytes.strip()
-                        if not chunk_bytes:
-                            continue
-
-                        chunk = remove_prefix(chunk_bytes.decode("utf-8"),
-                                              "data:")
-
-                        data = json.loads(chunk)
-                        output.generated_text += data["text_output"]
-                        timestamp = time.perf_counter()
-                        # First token
-                        if ttft == 0.0:
-                            ttft = time.perf_counter() - st
-                            output.ttft = ttft
-
-                        # Decoding phase
-                        else:
-                            output.itl.append(timestamp -
-                                              most_recent_timestamp)
-
-                        most_recent_timestamp = timestamp
-
-                    output.latency = most_recent_timestamp - st
-                    output.success = True
-
-                else:
-                    output.error = response.reason or ""
-                    output.success = False
-        except Exception:
-            output.success = False
-            exc_info = sys.exc_info()
-            output.error = "".join(traceback.format_exception(*exc_info))
-
-        if pbar:
-            pbar.update(1)
-        return output
-
-
-async def async_request_deepspeed_mii(
-    request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
-) -> RequestFuncOutput:
-    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
-        assert request_func_input.best_of == 1
-        assert not request_func_input.use_beam_search
-
-        payload = {
-            "prompt": request_func_input.prompt,
-            "max_tokens": request_func_input.output_len,
-            "temperature": 0.01,  # deepspeed-mii does not accept 0.0 temp.
-            "top_p": 1.0,
-        }
-        output = RequestFuncOutput()
-        output.prompt_len = request_func_input.prompt_len
-
-        # NOTE: DeepSpeed-MII doesn't support streaming as of Jan 28 2024,
-        # will use 0 as placeholder.
-        # See https://github.com/microsoft/DeepSpeed-MII/pull/311
-        output.ttft = 0
-
-        st = time.perf_counter()
-        try:
-            async with session.post(url=request_func_input.api_url,
-                                    json=payload) as response:
-                if response.status == 200:
-                    parsed_resp = await response.json()
-                    output.latency = time.perf_counter() - st
-                    output.generated_text = parsed_resp["text"][0]
-                    output.success = True
-                else:
-                    output.error = response.reason or ""
-                    output.success = False
-        except Exception:
-            output.success = False
-            exc_info = sys.exc_info()
-            output.error = "".join(traceback.format_exception(*exc_info))
-
-        if pbar:
-            pbar.update(1)
-        return output
-
-
-async def async_request_openai_completions(
-    request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
-) -> RequestFuncOutput:
-    api_url = request_func_input.api_url
-    assert api_url.endswith(
-        "v1/completions"
-    ), "OpenAI Completions API URL must end with 'v1/completions'."
-
-    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
-        assert not request_func_input.use_beam_search
-        payload = {
-            "model": request_func_input.model,
-            "prompt": request_func_input.prompt,
-            "temperature": 0.0,
-            "best_of": request_func_input.best_of,
-            "max_tokens": request_func_input.output_len,
-            "stream": True,
-        }
-        headers = {
-            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
-        }
-
-        output = RequestFuncOutput()
-        output.prompt_len = request_func_input.prompt_len
-
-        generated_text = ""
-        ttft = 0.0
-        st = time.perf_counter()
-        most_recent_timestamp = st
-        try:
-            async with session.post(url=api_url, json=payload,
-                                    headers=headers) as response:
-                if response.status == 200:
-                    async for chunk_bytes in response.content:
-                        chunk_bytes = chunk_bytes.strip()
-                        if not chunk_bytes:
-                            continue
-
-                        chunk = remove_prefix(chunk_bytes.decode("utf-8"),
-                                              "data: ")
-                        if chunk == "[DONE]":
-                            latency = time.perf_counter() - st
-                        else:
-                            data = json.loads(chunk)
-
-                            if data["choices"][0]["text"]:
-                                timestamp = time.perf_counter()
-                                # First token
-                                if ttft == 0.0:
-                                    ttft = time.perf_counter() - st
-                                    output.ttft = ttft
-
-                                # Decoding phase
-                                # NOTE: Some completion API might have a last
-                                # usage summary response without a token so we
-                                # do not want to include as inter-token-latency
-                                elif data.get("usage", None) is None:
-                                    output.itl.append(timestamp -
-                                                      most_recent_timestamp)
-
-                                most_recent_timestamp = timestamp
-                                generated_text += data["choices"][0]["text"]
-
-                    output.generated_text = generated_text
-                    output.success = True
-                    output.latency = latency
-                else:
-                    output.error = response.reason or ""
-                    output.success = False
-        except Exception:
-            output.success = False
-            exc_info = sys.exc_info()
-            output.error = "".join(traceback.format_exception(*exc_info))
-
-    if pbar:
-        pbar.update(1)
-    return output
-
-
-async def async_request_openai_chat_completions(
-    request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
-) -> RequestFuncOutput:
-    api_url = request_func_input.api_url
-    assert api_url.endswith(
-        "v1/chat/completions"
-    ), "OpenAI Chat Completions API URL must end with 'v1/chat/completions'."
-
-    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
-        assert not request_func_input.use_beam_search
-        payload = {
-            "model": request_func_input.model,
-            "messages": [
-                {
-                    "role": "user",
-                    "content": request_func_input.prompt,
-                },
-            ],
-            "temperature": 0.0,
-            "max_tokens": request_func_input.output_len,
-            "stream": True,
-        }
-        headers = {
-            "Content-Type": "application/json",
-            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
-        }
-
-        output = RequestFuncOutput()
-        output.prompt_len = request_func_input.prompt_len
-
-        generated_text = ""
-        ttft = 0.0
-        st = time.perf_counter()
-        most_recent_timestamp = st
-        try:
-            async with session.post(url=api_url, json=payload,
-                                    headers=headers) as response:
-                if response.status == 200:
-                    async for chunk_bytes in response.content:
-                        chunk_bytes = chunk_bytes.strip()
-                        if not chunk_bytes:
-                            continue
-
-                        chunk = remove_prefix(chunk_bytes.decode("utf-8"),
-                                              "data: ")
-                        if chunk == "[DONE]":
-                            latency = time.perf_counter() - st
-                        else:
-                            timestamp = time.perf_counter()
-                            data = json.loads(chunk)
-
-                            delta = data["choices"][0]["delta"]
-                            if delta.get("content", None):
-                                # First token
-                                if ttft == 0.0:
-                                    ttft = time.perf_counter() - st
-                                    output.ttft = ttft
-
-                                # Decoding phase
-                                else:
-                                    output.itl.append(timestamp -
-                                                      most_recent_timestamp)
-
-                                generated_text += delta["content"]
-
-                            most_recent_timestamp = timestamp
-
-                    output.generated_text = generated_text
-                    output.success = True
-                    output.latency = latency
-                else:
-                    output.error = response.reason or ""
-                    output.success = False
-        except Exception:
-            output.success = False
-            exc_info = sys.exc_info()
-            output.error = "".join(traceback.format_exception(*exc_info))
-
-    if pbar:
-        pbar.update(1)
-    return output
-
-
-# Since vllm must support Python 3.8, we can't use str.removeprefix(prefix)
-# introduced in Python 3.9
-def remove_prefix(text: str, prefix: str) -> str:
-    if text.startswith(prefix):
-        return text[len(prefix):]
-    return text
-
-
-ASYNC_REQUEST_FUNCS = {
-    "tgi": async_request_tgi,
-    "vllm": async_request_openai_completions,
-    "lmdeploy": async_request_openai_completions,
-    "deepspeed-mii": async_request_deepspeed_mii,
-    "openai": async_request_openai_completions,
-    "openai-chat": async_request_openai_chat_completions,
-    "tensorrt-llm": async_request_trt_llm,
-}
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
-"""Benchmark the latency of processing a single batch of requests."""
-import argparse
-import json
-import time
-from pathlib import Path
-from typing import List, Optional
-
-import numpy as np
-import torch
-from tqdm import tqdm
-
-from vllm import LLM, SamplingParams
-from vllm.inputs import PromptStrictInputs
-from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
-
-
-def main(args: argparse.Namespace):
-    print(args)
-
-    # NOTE(woosuk): If the request cannot be processed in a single batch,
-    # the engine will automatically process the request in multiple batches.
-    llm = LLM(model=args.model,
-              speculative_model=args.speculative_model,
-              num_speculative_tokens=args.num_speculative_tokens,
-              tokenizer=args.tokenizer,
-              quantization=args.quantization,
-              tensor_parallel_size=args.tensor_parallel_size,
-              trust_remote_code=args.trust_remote_code,
-              dtype=args.dtype,
-              enforce_eager=args.enforce_eager,
-              kv_cache_dtype=args.kv_cache_dtype,
-              quantization_param_path=args.quantization_param_path,
-              device=args.device,
-              ray_workers_use_nsight=args.ray_workers_use_nsight,
-              use_v2_block_manager=args.use_v2_block_manager,
-              enable_chunked_prefill=args.enable_chunked_prefill,
-              download_dir=args.download_dir,
-              block_size=args.block_size,
-              gpu_memory_utilization=args.gpu_memory_utilization,
-              distributed_executor_backend=args.distributed_executor_backend)
-
-    sampling_params = SamplingParams(
-        n=args.n,
-        temperature=0.0 if args.use_beam_search else 1.0,
-        top_p=1.0,
-        use_beam_search=args.use_beam_search,
-        ignore_eos=True,
-        max_tokens=args.output_len,
-    )
-    print(sampling_params)
-    dummy_prompt_token_ids = np.random.randint(10000,
-                                               size=(args.batch_size,
-                                                     args.input_len))
-    dummy_inputs: List[PromptStrictInputs] = [{
-        "prompt_token_ids": batch
-    } for batch in dummy_prompt_token_ids.tolist()]
-
-    def run_to_completion(profile_dir: Optional[str] = None):
-        if profile_dir:
-            with torch.profiler.profile(
-                    activities=[
-                        torch.profiler.ProfilerActivity.CPU,
-                        torch.profiler.ProfilerActivity.CUDA,
-                    ],
-                    on_trace_ready=torch.profiler.tensorboard_trace_handler(
-                        str(profile_dir))) as p:
-                llm.generate(dummy_inputs,
-                             sampling_params=sampling_params,
-                             use_tqdm=False)
-            print(p.key_averages())
-        else:
-            start_time = time.perf_counter()
-            llm.generate(dummy_inputs,
-                         sampling_params=sampling_params,
-                         use_tqdm=False)
-            end_time = time.perf_counter()
-            latency = end_time - start_time
-            return latency
-
-    print("Warming up...")
-    for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
-        run_to_completion(profile_dir=None)
-
-    if args.profile:
-        profile_dir = args.profile_result_dir
-        if not profile_dir:
-            profile_dir = Path(
-                "."
-            ) / "vllm_benchmark_result" / f"latency_result_{time.time()}"
-        print(f"Profiling (results will be saved to '{profile_dir}')...")
-        run_to_completion(profile_dir=profile_dir)
-        return
-
-    # Benchmark.
-    latencies = []
-    for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
-        latencies.append(run_to_completion(profile_dir=None))
-    latencies = np.array(latencies)
-    percentages = [10, 25, 50, 75, 90]
-    percentiles = np.percentile(latencies, percentages)
-    print(f'Avg latency: {np.mean(latencies)} seconds')
-    for percentage, percentile in zip(percentages, percentiles):
-        print(f'{percentage}% percentile latency: {percentile} seconds')
-
-    # Output JSON results if specified
-    if args.output_json:
-        results = {
-            "avg_latency": np.mean(latencies),
-            "latencies": latencies.tolist(),
-            "percentiles": dict(zip(percentages, percentiles.tolist())),
-        }
-        with open(args.output_json, "w") as f:
-            json.dump(results, f, indent=4)
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(
-        description='Benchmark the latency of processing a single batch of '
-        'requests till completion.')
-    parser.add_argument('--model', type=str, default='facebook/opt-125m')
-    parser.add_argument('--speculative-model', type=str, default=None)
-    parser.add_argument('--num-speculative-tokens', type=int, default=None)
-    parser.add_argument('--tokenizer', type=str, default=None)
-    parser.add_argument('--quantization',
-                        '-q',
-                        choices=[*QUANTIZATION_METHODS, None],
-                        default=None)
-    parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
-    parser.add_argument('--input-len', type=int, default=32)
-    parser.add_argument('--output-len', type=int, default=128)
-    parser.add_argument('--batch-size', type=int, default=8)
-    parser.add_argument('--n',
-                        type=int,
-                        default=1,
-                        help='Number of generated sequences per prompt.')
-    parser.add_argument('--use-beam-search', action='store_true')
-    parser.add_argument('--num-iters-warmup',
-                        type=int,
-                        default=10,
-                        help='Number of iterations to run for warmup.')
-    parser.add_argument('--num-iters',
-                        type=int,
-                        default=30,
-                        help='Number of iterations to run.')
-    parser.add_argument('--trust-remote-code',
-                        action='store_true',
-                        help='trust remote code from huggingface')
-    parser.add_argument(
-        '--dtype',
-        type=str,
-        default='auto',
-        choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
-        help='data type for model weights and activations. '
-        'The "auto" option will use FP16 precision '
-        'for FP32 and FP16 models, and BF16 precision '
-        'for BF16 models.')
-    parser.add_argument('--enforce-eager',
-                        action='store_true',
-                        help='enforce eager mode and disable CUDA graph')
-    parser.add_argument(
-        '--kv-cache-dtype',
-        type=str,
-        choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'],
-        default="auto",
-        help='Data type for kv cache storage. If "auto", will use model '
-        'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
-        'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)')
-    parser.add_argument(
-        '--quantization-param-path',
-        type=str,
-        default=None,
-        help='Path to the JSON file containing the KV cache scaling factors. '
-        'This should generally be supplied, when KV cache dtype is FP8. '
-        'Otherwise, KV cache scaling factors default to 1.0, which may cause '
-        'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
-        'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
-        'instead supported for common inference criteria.')
-    parser.add_argument(
-        '--profile',
-        action='store_true',
-        help='profile the generation process of a single batch')
-    parser.add_argument(
-        '--profile-result-dir',
-        type=str,
-        default=None,
-        help=('path to save the pytorch profiler output. Can be visualized '
-              'with ui.perfetto.dev or Tensorboard.'))
-    parser.add_argument(
-        "--device",
-        type=str,
-        default="cuda",
-        choices=["cuda", "cpu"],
-        help='device type for vLLM execution, supporting CUDA and CPU.')
-    parser.add_argument('--block-size',
-                        type=int,
-                        default=16,
-                        help='block size of key/value cache')
-    parser.add_argument(
-        '--enable-chunked-prefill',
-        action='store_true',
-        help='If True, the prefill requests can be chunked based on the '
-        'max_num_batched_tokens')
-    parser.add_argument('--use-v2-block-manager', action='store_true')
-    parser.add_argument(
-        "--ray-workers-use-nsight",
-        action='store_true',
-        help="If specified, use nsight to profile ray workers",
-    )
-    parser.add_argument('--download-dir',
-                        type=str,
-                        default=None,
-                        help='directory to download and load the weights, '
-                        'default to the default cache dir of huggingface')
-    parser.add_argument(
-        '--output-json',
-        type=str,
-        default=None,
-        help='Path to save the latency results in JSON format.')
-    parser.add_argument('--gpu-memory-utilization',
-                        type=float,
-                        default=0.9,
-                        help='the fraction of GPU memory to be used for '
-                        'the model executor, which can range from 0 to 1.'
-                        'If unspecified, will use the default value of 0.9.')
-    parser.add_argument(
-        '--distributed-executor-backend',
-        choices=['ray', 'mp'],
-        default=None,
-        help='Backend to use for distributed serving. When more than 1 GPU '
-        'is used, will be automatically set to "ray" if installed '
-        'or "mp" (multiprocessing) otherwise.')
-    args = parser.parse_args()
-    main(args)
--- a/benchmarks/benchmark_prefix_caching.py
+++ b/benchmarks/benchmark_prefix_caching.py
-import argparse
-import time
-
-from vllm import LLM, SamplingParams
-
-PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as fellows. You need to answer my question about the table.\n# Table\n|Opening|Opening|Sl. No.|Film|Cast|Director|Music Director|Notes|\n|----|----|----|----|----|----|----|----|\n|J A N|9|1|Agni Pushpam|Jayabharathi, Kamalahasan|Jeassy|M. K. Arjunan||\n|J A N|16|2|Priyamvada|Mohan Sharma, Lakshmi, KPAC Lalitha|K. S. Sethumadhavan|V. Dakshinamoorthy||\n|J A N|23|3|Yakshagaanam|Madhu, Sheela|Sheela|M. S. Viswanathan||\n|J A N|30|4|Paalkkadal|Sheela, Sharada|T. K. Prasad|A. T. Ummer||\n|F E B|5|5|Amma|Madhu, Srividya|M. Krishnan Nair|M. K. Arjunan||\n|F E B|13|6|Appooppan|Thikkurissi Sukumaran Nair, Kamal Haasan|P. Bhaskaran|M. S. Baburaj||\n|F E B|20|7|Srishti|Chowalloor Krishnankutty, Ravi Alummoodu|K. T. Muhammad|M. S. Baburaj||\n|F E B|20|8|Vanadevatha|Prem Nazir, Madhubala|Yusufali Kechery|G. Devarajan||\n|F E B|27|9|Samasya|Madhu, Kamalahaasan|K. Thankappan|Shyam||\n|F E B|27|10|Yudhabhoomi|K. P. Ummer, Vidhubala|Crossbelt Mani|R. K. Shekhar||\n|M A R|5|11|Seemantha Puthran|Prem Nazir, Jayabharathi|A. B. Raj|M. K. Arjunan||\n|M A R|12|12|Swapnadanam|Rani Chandra, Dr. Mohandas|K. G. George|Bhaskar Chandavarkar||\n|M A R|19|13|Thulavarsham|Prem Nazir, sreedevi, Sudheer|N. Sankaran Nair|V. Dakshinamoorthy||\n|M A R|20|14|Aruthu|Kaviyoor Ponnamma, Kamalahasan|Ravi|G. Devarajan||\n|M A R|26|15|Swimming Pool|Kamal Haasan, M. G. Soman|J. Sasikumar|M. K. Arjunan||\n\n# Question\nWhat' s the content in the (1,1) cells\n"  # noqa: E501
-
-
-def test_prefix(llm=None, sampling_params=None, prompts=None):
-    start_time = time.time()
-
-    llm.generate(prompts, sampling_params=sampling_params)
-
-    end_time = time.time()
-    print(f"cost time {end_time - start_time}")
-
-
-def main(args):
-    llm = LLM(model=args.model,
-              tokenizer_mode='auto',
-              trust_remote_code=True,
-              enforce_eager=True,
-              use_v2_block_manager=args.use_v2_block_manager,
-              tensor_parallel_size=args.tensor_parallel_size,
-              enable_prefix_caching=args.enable_prefix_caching)
-
-    num_prompts = 100
-    prompts = [PROMPT] * num_prompts
-    sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
-
-    print("------warm up------")
-    test_prefix(
-        llm=llm,
-        prompts=prompts,
-        sampling_params=sampling_params,
-    )
-
-    print("------start generating------")
-    test_prefix(
-        llm=llm,
-        prompts=prompts,
-        sampling_params=sampling_params,
-    )
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        description='Benchmark the performance with or without automatic '
-        'prefix caching.')
-    parser.add_argument('--model',
-                        type=str,
-                        default='baichuan-inc/Baichuan2-13B-Chat')
-    parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
-    parser.add_argument('--output-len', type=int, default=10)
-    parser.add_argument('--enable-prefix-caching',
-                        action='store_true',
-                        help='enable prefix caching')
-    parser.add_argument('--use-v2-block-manager',
-                        action='store_true',
-                        help='Use BlockSpaceMangerV2')
-    args = parser.parse_args()
-    main(args)
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
-"""Benchmark offline inference throughput."""
+"""Benchmark offline prioritization."""
 import argparse
 import json
 import random
 import time
 from typing import List, Optional, Tuple

-import numpy as np
-import torch
-from tqdm import tqdm
-from transformers import (AutoModelForCausalLM, AutoTokenizer,
-                          PreTrainedTokenizerBase)
+from transformers import AutoTokenizer, PreTrainedTokenizerBase

-from vllm.inputs import PromptStrictInputs
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS


@@ -56,13 +51,16 @@ def sample_requests(
        if prompt_len > 1024 or prompt_len + output_len > 2048:
            # Prune too long sequences.
            continue
-        filtered_dataset.append((prompt, prompt_len, output_len))
+
+        # Select a priority of 0 or 1 with equal probability.
+        priority = 0 if random.random() < 0.5 else 1
+
+        filtered_dataset.append((prompt, prompt_len, output_len, priority))

    return filtered_dataset


 def run_vllm(
-    warmup_requests: List[Tuple[str, int, int]],
    requests: List[Tuple[str, int, int]],
    model: str,
    tokenizer: str,
@@ -81,7 +79,6 @@ def run_vllm(
    enable_prefix_caching: bool,
    enable_chunked_prefill: bool,
    max_num_batched_tokens: int,
-    distributed_executor_backend: Optional[str],
    gpu_memory_utilization: float = 0.9,
    download_dir: Optional[str] = None,
 ) -> float:
@@ -104,14 +101,16 @@ def run_vllm(
        download_dir=download_dir,
        enable_chunked_prefill=enable_chunked_prefill,
        max_num_batched_tokens=max_num_batched_tokens,
-        distributed_executor_backend=distributed_executor_backend,
+        disable_log_stats=False,
    )

    # Add the requests to the engine.
    prompts = []
    sampling_params = []
-    for prompt, _, output_len in requests:
+    priority = []
+    for prompt, _, output_len, _priority in requests:
        prompts.append(prompt)
+        priority.append(_priority)
        sampling_params.append(
            SamplingParams(
                n=n,
@@ -122,122 +121,9 @@ def run_vllm(
                max_tokens=output_len,
            ))

-    # warmup
-    warmup_prompts = []
-    warmup_sampling_params = []
-    for prompt, _, output_len in warmup_requests:
-        warmup_prompts.append(prompt)
-        warmup_sampling_params.append(
-            SamplingParams(
-                n=n,
-                temperature=0.0 if use_beam_search else 1.0,
-                top_p=1.0,
-                use_beam_search=use_beam_search,
-                ignore_eos=True,
-                max_tokens=output_len,
-            ))
-        
-    print("Warming up...")
-    for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
-        llm.generate(warmup_prompts, warmup_sampling_params, use_tqdm=True)
-    
-    # dummy_prompt_token_ids = np.random.randint(10000,
-    #                                            size=(args.num_prompts,
-    #                                                  args.input_len))
-    # dummy_inputs: List[PromptStrictInputs] = [{
-    #     "prompt_token_ids": batch
-    # } for batch in dummy_prompt_token_ids.tolist()]
-
-    # def run_to_completion():
-    #     llm.generate(dummy_inputs,
-    #                     sampling_params=sampling_params,
-    #                     use_tqdm=False)
-
-    # print("Warming up...")
-    # for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
-    #     run_to_completion()
-    
-    start = time.perf_counter()
-    llm.generate(prompts, sampling_params, use_tqdm=True)
-    end = time.perf_counter()
-    return end - start
-
-
-def run_hf(
-    requests: List[Tuple[str, int, int]],
-    model: str,
-    tokenizer: PreTrainedTokenizerBase,
-    n: int,
-    use_beam_search: bool,
-    max_batch_size: int,
-    trust_remote_code: bool,
-) -> float:
-    assert not use_beam_search
-    llm = AutoModelForCausalLM.from_pretrained(
-        model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code)
-    if llm.config.model_type == "llama":
-        # To enable padding in the HF backend.
-        tokenizer.pad_token = tokenizer.eos_token
-    llm = llm.cuda()
-
-    pbar = tqdm(total=len(requests))
-    start = time.perf_counter()
-    batch: List[str] = []
-    max_prompt_len = 0
-    max_output_len = 0
-    for i in range(len(requests)):
-        prompt, prompt_len, output_len = requests[i]
-        # Add the prompt to the batch.
-        batch.append(prompt)
-        max_prompt_len = max(max_prompt_len, prompt_len)
-        max_output_len = max(max_output_len, output_len)
-        if len(batch) < max_batch_size and i != len(requests) - 1:
-            # Check if we can add more requests to the batch.
-            _, next_prompt_len, next_output_len = requests[i + 1]
-            if (max(max_prompt_len, next_prompt_len) +
-                    max(max_output_len, next_output_len)) <= 2048:
-                # We can add more requests to the batch.
-                continue
-
-        # Generate the sequences.
-        input_ids = tokenizer(batch, return_tensors="pt",
-                              padding=True).input_ids
-        llm_outputs = llm.generate(
-            input_ids=input_ids.cuda(),
-            do_sample=not use_beam_search,
-            num_return_sequences=n,
-            temperature=1.0,
-            top_p=1.0,
-            use_cache=True,
-            max_new_tokens=max_output_len,
-        )
-        # Include the decoding time.
-        tokenizer.batch_decode(llm_outputs, skip_special_tokens=True)
-        pbar.update(len(batch))
-
-        # Clear the batch.
-        batch = []
-        max_prompt_len = 0
-        max_output_len = 0
-    end = time.perf_counter()
-    return end - start
-
-
-def run_mii(
-    requests: List[Tuple[str, int, int]],
-    model: str,
-    tensor_parallel_size: int,
-    output_len: int,
-) -> float:
-    from mii import client, serve
-    llm = serve(model, tensor_parallel=tensor_parallel_size)
-    prompts = [prompt for prompt, _, _ in requests]
-
    start = time.perf_counter()
-    llm.generate(prompts, max_new_tokens=output_len)
+    llm.generate(prompts, sampling_params, priority=priority, use_tqdm=True)
    end = time.perf_counter()
-    client = client(model)
-    client.terminate_server()
    return end - start


@@ -248,9 +134,6 @@ def main(args: argparse.Namespace):
    # Sample the requests.
    tokenizer = AutoTokenizer.from_pretrained(
        args.tokenizer, trust_remote_code=args.trust_remote_code)
-    warmup_prompt = "hi" * 10
-    warmup_requests = [(warmup_prompt, 10, 10)
-                for _ in range(1)]
    if args.dataset is None:
        # Synthesize a prompt with the given input length.
        prompt = "hi" * (args.input_len - 1)
@@ -262,35 +145,20 @@ def main(args: argparse.Namespace):

    if args.backend == "vllm":
        elapsed_time = run_vllm(
-            warmup_requests, requests, args.model, args.tokenizer, args.quantization,
+            requests, args.model, args.tokenizer, args.quantization,
            args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
            args.trust_remote_code, args.dtype, args.max_model_len,
            args.enforce_eager, args.kv_cache_dtype,
            args.quantization_param_path, args.device,
            args.enable_prefix_caching, args.enable_chunked_prefill,
-            args.max_num_batched_tokens, args.distributed_executor_backend,
-            args.gpu_memory_utilization, args.download_dir)
-    elif args.backend == "hf":
-        assert args.tensor_parallel_size == 1
-        elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
-                              args.use_beam_search, args.hf_max_batch_size,
-                              args.trust_remote_code)
-    elif args.backend == "mii":
-        elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size,
-                               args.output_len)
+            args.max_num_batched_tokens, args.gpu_memory_utilization,
+            args.download_dir)
    else:
        raise ValueError(f"Unknown backend: {args.backend}")
    total_num_tokens = sum(prompt_len + output_len
-                           for _, prompt_len, output_len in requests)
-    if args.dataset is None:
-        total_out_tokens = args.output_len * args.num_prompts
-    else:
-        total_out_tokens = sum(output_len for _, _, output_len in requests) 
-    print(f"Latency: {elapsed_time:.2f} s")
-    print(f"All Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
+                           for _, prompt_len, output_len, priority in requests)
+    print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
          f"{total_num_tokens / elapsed_time:.2f} tokens/s")
-    print(f"Generate Throughput: {total_out_tokens / elapsed_time:.2f} tokens/s")
-

    # Output JSON results if specified
    if args.output_json:
@@ -336,19 +204,11 @@ if __name__ == "__main__":
                        default=1,
                        help="Number of generated sequences per prompt.")
    parser.add_argument("--use-beam-search", action="store_true")
-    parser.add_argument('--num-iters-warmup',
-                        type=int,
-                        default=1,
-                        help='Number of iterations to run for warmup.')
    parser.add_argument("--num-prompts",
                        type=int,
-                        default=1000,
+                        default=200,
                        help="Number of prompts to process.")
    parser.add_argument("--seed", type=int, default=0)
-    parser.add_argument("--hf-max-batch-size",
-                        type=int,
-                        default=None,
-                        help="Maximum batch size for HF backend.")
    parser.add_argument('--trust-remote-code',
                        action='store_true',
                        help='trust remote code from huggingface')
@@ -383,7 +243,7 @@ if __name__ == "__main__":
        default="auto",
        help='Data type for kv cache storage. If "auto", will use model '
        'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
-        'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)')
+        'ROCm (hcu) supports fp8 (=fp8_e4m3)')
    parser.add_argument(
        '--quantization-param-path',
        type=str,
@@ -392,7 +252,7 @@ if __name__ == "__main__":
        'This should generally be supplied, when KV cache dtype is FP8. '
        'Otherwise, KV cache scaling factors default to 1.0, which may cause '
        'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
-        'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
+        'cuda version greater than 11.8. On ROCm (hcu), FP8_E4M3 is '
        'instead supported for common inference criteria.')
    parser.add_argument(
        "--device",
@@ -422,13 +282,7 @@ if __name__ == "__main__":
        type=str,
        default=None,
        help='Path to save the throughput results in JSON format.')
-    parser.add_argument(
-        '--distributed-executor-backend',
-        choices=['ray', 'mp'],
-        default=None,
-        help='Backend to use for distributed serving. When more than 1 GPU '
-        'is used, will be automatically set to "ray" if installed '
-        'or "mp" (multiprocessing) otherwise.')
+
    args = parser.parse_args()
    if args.tokenizer is None:
        args.tokenizer = args.model
@@ -438,26 +292,4 @@ if __name__ == "__main__":
    else:
        assert args.input_len is None

-    if args.backend == "vllm":
-        if args.hf_max_batch_size is not None:
-            raise ValueError("HF max batch size is only for HF backend.")
-    elif args.backend == "hf":
-        if args.hf_max_batch_size is None:
-            raise ValueError("HF max batch size is required for HF backend.")
-        if args.quantization is not None:
-            raise ValueError("Quantization is only for vLLM backend.")
-    elif args.backend == "mii":
-        if args.dtype != "auto":
-            raise ValueError("dtype must be auto for MII backend.")
-        if args.n != 1:
-            raise ValueError("n must be 1 for MII backend.")
-        if args.use_beam_search:
-            raise ValueError("Beam search is not supported for MII backend.")
-        if args.quantization is not None:
-            raise ValueError("Quantization is only for vLLM backend.")
-        if args.hf_max_batch_size is not None:
-            raise ValueError("HF max batch size is only for HF backend.")
-        if args.tokenizer != args.model:
-            raise ValueError("Tokenizer must be the same as the model for MII "
-                             "backend.")
    main(args)
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
--- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
+++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
-import argparse
-import copy
-import itertools
-import pickle as pkl
-import time
-from typing import Callable, Iterable, List, Tuple
-
-import torch
-import torch.utils.benchmark as TBenchmark
-from torch.utils.benchmark import Measurement as TMeasurement
-from weight_shapes import WEIGHT_SHAPES
-
-from vllm import _custom_ops as ops
-
-DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())[1:]
-DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
-DEFAULT_TP_SIZES = [1]
-
-# helpers
-
-
-def to_fp8(tensor: torch.tensor) -> torch.tensor:
-    finfo = torch.finfo(torch.float8_e4m3fn)
-    return torch.round(tensor.clamp(
-        min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn)
-
-
-def to_int8(tensor: torch.tensor) -> torch.tensor:
-    return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8)
-
-
-def make_rand_tensors(dtype: torch.dtype, m: int, n: int,
-                      k: int) -> Tuple[torch.tensor, torch.tensor]:
-
-    a = torch.randn((m, k), device='cuda') * 5
-    b = torch.randn((n, k), device='cuda').t() * 5
-
-    if dtype == torch.int8:
-        return to_int8(a), to_int8(b)
-    if dtype == torch.float8_e4m3fn:
-        return to_fp8(a), to_fp8(b)
-
-    raise ValueError("unsupported dtype")
-
-
-# impl
-
-
-def pytorch_i8_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
-                    scale_b: torch.tensor,
-                    out_dtype: torch.dtype) -> torch.tensor:
-    return torch.mm(a, b)
-
-
-def pytorch_fp8_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
-                     scale_b: torch.tensor,
-                     out_dtype: torch.dtype) -> torch.tensor:
-    return torch._scaled_mm(a,
-                            b,
-                            scale_a=scale_a,
-                            scale_b=scale_b,
-                            out_dtype=out_dtype)
-
-
-def pytorch_fp8_impl_fast_accum(a: torch.tensor, b: torch.tensor,
-                                scale_a: torch.tensor, scale_b: torch.tensor,
-                                out_dtype: torch.dtype) -> torch.tensor:
-    return torch._scaled_mm(a,
-                            b,
-                            scale_a=scale_a,
-                            scale_b=scale_b,
-                            out_dtype=out_dtype,
-                            use_fast_accum=True)
-
-
-def cutlass_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
-                 scale_b: torch.tensor,
-                 out_dtype: torch.dtype) -> torch.tensor:
-    return ops.cutlass_scaled_mm_dq(a,
-                                    b,
-                                    scale_a,
-                                    scale_b,
-                                    out_dtype=out_dtype)
-
-
-# bench
-def bench_fn(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
-             scale_b: torch.tensor, out_dtype: torch.dtype, label: str,
-             sub_label: str, fn: Callable, description: str) -> TMeasurement:
-
-    min_run_time = 1
-
-    globals = {
-        "a": a,
-        "b": b,
-        "scale_a": scale_a,
-        "scale_b": scale_b,
-        "out_dtype": out_dtype,
-        "fn": fn,
-    }
-    return TBenchmark.Timer(
-        stmt="fn(a, b, scale_a, scale_b, out_dtype)",
-        globals=globals,
-        label=label,
-        sub_label=sub_label,
-        description=description,
-    ).blocked_autorange(min_run_time=min_run_time)
-
-
-def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
-               sub_label: str) -> Iterable[TMeasurement]:
-    assert dtype == torch.int8
-    a, b = make_rand_tensors(torch.int8, m, n, k)
-    scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
-    scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
-
-    timers = []
-    # pytorch impl
-    timers.append(
-        bench_fn(a.to(dtype=torch.bfloat16, device="cuda"),
-                 b.to(dtype=torch.bfloat16, device="cuda"), scale_a, scale_b,
-                 torch.bfloat16, label, sub_label, pytorch_i8_impl,
-                 "pytorch_bf16_bf16_bf16_matmul-no-scales"))
-
-    # cutlass impl
-    timers.append(
-        bench_fn(a, b, scale_a.to(device="cpu"), scale_b.to(device="cpu"),
-                 torch.bfloat16, label, sub_label, cutlass_impl,
-                 "cutlass_i8_i8_bf16_scaled_mm"))
-
-    return timers
-
-
-def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
-              sub_label: str) -> Iterable[TMeasurement]:
-    assert dtype == torch.float8_e4m3fn
-    a, b = make_rand_tensors(torch.float8_e4m3fn, m, n, k)
-    scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
-    scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
-
-    timers = []
-
-    # pytorch impl: bf16 output, without fp8 fast accum
-    timers.append(
-        bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
-                 pytorch_fp8_impl, "pytorch_fp8_fp8_bf16_scaled_mm"))
-
-    # pytorch impl: bf16 output, with fp8 fast accum
-    timers.append(
-        bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
-                 pytorch_fp8_impl_fast_accum,
-                 "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum"))
-
-    # pytorch impl: fp16 output, without fp8 fast accum
-    timers.append(
-        bench_fn(a, b, scale_a, scale_b, torch.float16, label, sub_label,
-                 pytorch_fp8_impl, "pytorch_fp8_fp8_fp16_scaled_mm"))
-
-    # pytorch impl: fp16 output, with fp8 fast accum
-    timers.append(
-        bench_fn(a, b, scale_a, scale_b, torch.float16, label, sub_label,
-                 pytorch_fp8_impl_fast_accum,
-                 "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum"))
-
-    # cutlass impl: bf16 output
-    timers.append(
-        bench_fn(a, b, scale_a.to(device="cpu"), scale_b.to(device="cpu"),
-                 torch.bfloat16, label, sub_label, cutlass_impl,
-                 "cutlass_fp8_fp8_bf16_scaled_mm"))
-    # cutlass impl: fp16 output
-    timers.append(
-        bench_fn(a, b, scale_a.to(device="cpu"), scale_b.to(device="cpu"),
-                 torch.float16, label, sub_label, cutlass_impl,
-                 "cutlass_fp8_fp8_fp16_scaled_mm"))
-    return timers
-
-
-def bench(dtype: torch.dtype, m: int, k: int, n: int, label: str,
-          sub_label: str) -> Iterable[TMeasurement]:
-    if dtype == torch.int8:
-        return bench_int8(dtype, m, k, n, label, sub_label)
-    if dtype == torch.float8_e4m3fn:
-        return bench_fp8(dtype, m, k, n, label, sub_label)
-    raise ValueError("unsupported type")
-
-
-# runner
-def print_timers(timers: Iterable[TMeasurement]):
-    compare = TBenchmark.Compare(timers)
-    compare.print()
-
-
-def run(dtype: torch.dtype,
-        MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]:
-
-    results = []
-    for m, k, n in MKNs:
-        timers = bench(dtype, m, k, n, f"scaled-{dtype}-gemm",
-                       f"MKN=({m}x{k}x{n})")
-        print_timers(timers)
-        results.extend(timers)
-
-    return results
-
-
-# output makers
-def make_output(data: Iterable[TMeasurement],
-                MKNs: Iterable[Tuple[int, int, int]],
-                base_description: str,
-                timestamp=None):
-
-    print(f"== All Results {base_description} ====")
-    print_timers(data)
-
-    # pickle all the results
-    timestamp = int(time.time()) if timestamp is None else timestamp
-    with open(f"{base_description}-{timestamp}.pkl", "wb") as f:
-        pkl.dump(data, f)
-
-
-# argparse runners
-
-
-def run_square_bench(args):
-    dim_sizes = list(
-        range(args.dim_start, args.dim_end + 1, args.dim_increment))
-    MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes))
-    data = run(args.dtype, MKNs)
-
-    make_output(data, MKNs, f"square_bench-{args.dtype}")
-
-
-def run_range_bench(args):
-    dim_sizes = list(range(args.dim_start, args.dim_end, args.dim_increment))
-    n = len(dim_sizes)
-    Ms = [args.m_constant] * n if args.m_constant is not None else dim_sizes
-    Ks = [args.k_constant] * n if args.k_constant is not None else dim_sizes
-    Ns = [args.n_constant] * n if args.n_constant is not None else dim_sizes
-    MKNs = list(zip(Ms, Ks, Ns))
-    data = run(args.dtype, MKNs)
-
-    make_output(data, MKNs, f"range_bench-{args.dtype}")
-
-
-def run_model_bench(args):
-
-    print("Benchmarking models:")
-    for i, model in enumerate(args.models):
-        print(f"[{i}]  {model}")
-
-    def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]:
-        KNs = []
-        for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]):
-            KN[tp_split_dim] = KN[tp_split_dim] // tp_size
-            KNs.append(KN)
-        return KNs
-
-    model_bench_data = []
-    models_tps = list(itertools.product(args.models, args.tp_sizes))
-    for model, tp_size in models_tps:
-        Ms = args.batch_sizes
-        KNs = model_shapes(model, tp_size)
-        MKNs = []
-        for m in Ms:
-            for k, n in KNs:
-                MKNs.append((m, k, n))
-
-        data = run(args.dtype, MKNs)
-        model_bench_data.append(data)
-
-    # Print all results
-    for data, model_tp in zip(model_bench_data, models_tps):
-        model, tp_size = model_tp
-        print(f"== Results {args.dtype} {model}-TP{tp_size} ====")
-        print_timers(data)
-
-    timestamp = int(time.time())
-
-    all_data = []
-    for d in model_bench_data:
-        all_data.extend(d)
-    # pickle all data
-    with open(f"model_bench-{args.dtype}-{timestamp}.pkl", "wb") as f:
-        pkl.dump(all_data, f)
-
-
-if __name__ == '__main__':
-
-    def to_torch_dtype(dt):
-        if dt == "int8":
-            return torch.int8
-        if dt == "fp8":
-            return torch.float8_e4m3fn
-        raise ValueError("unsupported dtype")
-
-    parser = argparse.ArgumentParser(
-        description="""
-Benchmark Cutlass GEMM.
-
-    To run square GEMMs:
-        python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 square_bench --dim-start 128 --dim-end 512 --dim-increment 64
-    
-    To run constant N and K and sweep M:
-        python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 range_bench --dim-start 128 --dim-end 512 --dim-increment 64 --n-constant 16384 --k-constant 16384
-    
-    To run dimensions from a model:
-        python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 model_bench --models meta-llama/Llama-2-7b-hf --batch-sizes 16 --tp-sizes 1
-    
-    Output:
-        - a .pkl file, that is a list of raw torch.benchmark.utils.Measurements for the pytorch and cutlass implementations for the various GEMMs.
-            """,  # noqa: E501
-        formatter_class=argparse.RawTextHelpFormatter)
-
-    parser.add_argument("--dtype",
-                        type=to_torch_dtype,
-                        required=True,
-                        help="Available options are ['int8', 'fp8']")
-    subparsers = parser.add_subparsers(dest="cmd")
-
-    square_parser = subparsers.add_parser("square_bench")
-    square_parser.add_argument("--dim-start", type=int, required=True)
-    square_parser.add_argument("--dim-end", type=int, required=True)
-    square_parser.add_argument("--dim-increment", type=int, required=True)
-    square_parser.set_defaults(func=run_square_bench)
-
-    range_parser = subparsers.add_parser("range_bench")
-    range_parser.add_argument("--dim-start", type=int, required=True)
-    range_parser.add_argument("--dim-end", type=int, required=True)
-    range_parser.add_argument("--dim-increment", type=int, required=True)
-    range_parser.add_argument("--m-constant", type=int, default=None)
-    range_parser.add_argument("--n-constant", type=int, default=None)
-    range_parser.add_argument("--k-constant", type=int, default=None)
-    range_parser.set_defaults(func=run_range_bench)
-
-    model_parser = subparsers.add_parser("model_bench")
-    model_parser.add_argument("--models",
-                              nargs="+",
-                              type=str,
-                              default=DEFAULT_MODELS,
-                              choices=WEIGHT_SHAPES.keys())
-    model_parser.add_argument("--tp-sizes",
-                              nargs="+",
-                              type=int,
-                              default=DEFAULT_TP_SIZES)
-    model_parser.add_argument("--batch-sizes",
-                              nargs="+",
-                              type=int,
-                              default=DEFAULT_BATCH_SIZES)
-    model_parser.set_defaults(func=run_model_bench)
-
-    args = parser.parse_args()
-    args.func(args)
--- a/benchmarks/kernels/benchmark_aqlm.py
+++ b/benchmarks/kernels/benchmark_aqlm.py
-import argparse
-import os
-import sys
-from typing import Optional
-
-import torch
-import torch.nn.functional as F
-
-from vllm import _custom_ops as ops
-from vllm.model_executor.layers.quantization.aqlm import (
-    dequantize_weight, generic_dequantize_gemm, get_int_dtype,
-    optimized_dequantize_gemm)
-
-os.environ['CUDA_VISIBLE_DEVICES'] = '0'
-
-
-def torch_mult(
-        input: torch.Tensor,  #  [..., in_features]
-        weights: torch.Tensor,
-        scales: torch.Tensor,  #  [num_out_groups, 1, 1, 1]
-) -> torch.Tensor:
-    output = F.linear(input, weights)
-    return output
-
-
-def dequant_out_scale(
-    input: torch.Tensor,  #  [..., in_features]
-    codes: torch.IntTensor,  #  [num_out_groups, num_in_groups, num_codebooks]
-    codebooks: torch.
-    Tensor,  #  [num_codebooks, codebook_size, out_group_size, in_group_size]
-    scales: torch.Tensor,  #  [num_out_groups, 1, 1, 1]
-    output_partition_sizes: torch.IntTensor,
-    bias: Optional[torch.Tensor],
-) -> torch.Tensor:
-
-    weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
-
-    if bias is None:
-        output = F.linear(input, weights, bias)
-        orig_shape = output.shape
-        flattened_output = output.view(-1, output.size(-1))
-        f_scales = scales.view(-1, scales.shape[0])
-        b_scales = f_scales.expand(flattened_output.shape[0], -1)
-        flattened_output *= b_scales
-        return flattened_output.view(orig_shape)
-    else:
-        b_scales = scales.view(scales.shape[:-3] + (-1, )).expand(
-            -1, weights.shape[1])
-        weights *= b_scales
-        return F.linear(input, weights, bias)
-
-
-def dequant_weight_scale(
-    input: torch.Tensor,  #  [..., in_features]
-    codes: torch.IntTensor,  #  [num_out_groups, num_in_groups, num_codebooks]
-    codebooks: torch.
-    Tensor,  #  [num_codebooks, codebook_size, out_group_size, in_group_size]
-    scales: torch.Tensor,  #  [num_out_groups, 1, 1, 1]
-    output_partition_sizes: torch.IntTensor,
-    bias: Optional[torch.Tensor],
-) -> torch.Tensor:
-
-    weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
-
-    b_scales = scales.view(scales.shape[:-3] + (-1, )).expand(
-        -1, weights.shape[1])
-    weights *= b_scales
-    return F.linear(input, weights, bias)
-
-
-def dequant_no_scale(
-    input: torch.Tensor,  #  [..., in_features]
-    codes: torch.IntTensor,  #  [num_out_groups, num_in_groups, num_codebooks]
-    codebooks: torch.
-    Tensor,  #  [num_codebooks, codebook_size, out_group_size, in_group_size]
-    scales: torch.Tensor,  #  [num_out_groups, 1, 1, 1]
-    output_partition_sizes: torch.IntTensor,
-    bias: Optional[torch.Tensor],
-) -> torch.Tensor:
-
-    weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
-
-    return F.linear(input, weights, bias)
-
-
-# Compare the optimized 1x16 and 2x8 cuda decompression/dequant kernels against
-# the generic pytorch version.
-# Just visual comparison.
-def dequant_test(k: int, parts: torch.tensor, nbooks: int, bits: int) -> None:
-
-    n = parts.sum().item()
-
-    device = torch.device('cuda:0')
-
-    code_range = (1 << bits) // 2
-    ingroups = 8
-
-    codes = torch.randint(-code_range,
-                          code_range,
-                          size=(n, k // ingroups, nbooks),
-                          dtype=get_int_dtype(bits),
-                          device=device)
-
-    codebooks = torch.randn(size=(parts.shape[0] * nbooks, 1 << bits, 1, 8),
-                            dtype=torch.float16,
-                            device=device)
-
-    count = 0
-    for index in range(16):
-        for i in range(8):
-            for book in range(nbooks):
-                codebooks[book, index, 0, i] = count * (10**book)
-            count += 1
-
-    print("codes shape", codes.shape)
-
-    for i in range(16):
-        for book in range(nbooks):
-            codes[0, i, book] = i
-            codes[0, -i, book] = i
-
-    weights = dequantize_weight(codes, codebooks, None)
-    weights2 = ops.aqlm_dequant(codes, codebooks, parts)
-
-    print("weights shape:", weights.shape)
-    print("weights2 shape:", weights2.shape)
-
-    print("weights are:", weights)
-    print("weights2 are:", weights2)
-
-    print("first 128 weights are", weights[0, 0:128].to(torch.int32))
-    print("first 128 weights2 are:", weights2[0, 0:128].to(torch.int32))
-
-    print("last 128 weights are", weights[0, -128:])
-    print("last 128 weights2 are:", weights2[0, -128:])
-
-
-def main():
-
-    parser = argparse.ArgumentParser(description="Benchmark aqlm performance.")
-
-    # Add arguments
-    parser.add_argument("--nbooks",
-                        type=int,
-                        default=1,
-                        help="Number of codebooks (default: 1)")
-    parser.add_argument("--bits",
-                        type=int,
-                        default=16,
-                        help="Number of bits per code element (default: 16)")
-    parser.add_argument(
-        "--test",
-        type=bool,
-        default=False,
-        help="Run the decompression/dequant tester rather than benchmarking "
-        "(default: False)")
-
-    # Parse the arguments
-    args = parser.parse_args()
-
-    # Extract values
-    nbooks = args.nbooks
-    bits = args.bits
-
-    if args.test:
-        dequant_test(4096, torch.tensor((4096, )), nbooks, bits)
-        return
-
-    # Otherwise, benchmark.
-    methods = [
-        ops.aqlm_gemm,
-        dequant_out_scale,
-        generic_dequantize_gemm,
-        optimized_dequantize_gemm,
-        dequant_weight_scale,
-        torch_mult,
-        dequant_no_scale,
-    ]
-
-    filename = f"./aqlm_benchmark_{nbooks}x{bits}.csv"
-    print(f"writing benchmarks to file {filename}")
-    with open(filename, "w") as f:
-        sys.stdout = f
-
-        print('m | k | n | n parts', end='')
-        for method in methods:
-            print(f" | {method.__name__.replace('_', ' ')} (µs)", end='')
-        print('')
-
-        # These are reasonable prefill sizes.
-        ksandpartions = ((4096, (4096, 4096, 4096)), (4096, (4096, )),
-                         (4096, (11008, 11008)), (11008, (4096, )))
-
-        # reasonable ranges for m.
-        for m in [
-                1, 2, 4, 8, 10, 12, 14, 16, 24, 32, 48, 52, 56, 64, 96, 112,
-                128, 256, 512, 1024, 1536, 2048, 3072, 4096
-        ]:
-            print(f'{m}', file=sys.__stdout__)
-            for ksp in ksandpartions:
-                run_grid(m, ksp[0], torch.tensor(ksp[1]), nbooks, bits,
-                         methods)
-
-        sys.stdout = sys.__stdout__
-
-
-def run_grid(m: int, k: int, parts: torch.tensor, nbooks: int, bits: int,
-             methods):
-
-    # I didn't see visible improvements from increasing these, but feel free :)
-    num_warmup_trials = 1
-    num_trials = 1
-
-    num_calls = 100
-
-    # warmup.
-    for method in methods:
-        for _ in range(num_warmup_trials):
-            run_timing(
-                num_calls=num_calls,
-                m=m,
-                k=k,
-                parts=parts,
-                nbooks=nbooks,
-                bits=bits,
-                method=method,
-            )
-
-    n = parts.sum().item()
-    print(f'{m} | {k} | {n} | {parts.tolist()}', end='')
-
-    for method in methods:
-        best_time_us = 1e20
-        for _ in range(num_trials):
-            kernel_dur_ms = run_timing(
-                num_calls=num_calls,
-                m=m,
-                k=k,
-                parts=parts,
-                nbooks=nbooks,
-                bits=bits,
-                method=method,
-            )
-
-            kernel_dur_us = 1000 * kernel_dur_ms
-
-            if kernel_dur_us < best_time_us:
-                best_time_us = kernel_dur_us
-
-        print(f' | {kernel_dur_us:.0f}', end='')
-
-    print('')
-
-
-def run_timing(num_calls: int, m: int, k: int, parts: torch.tensor,
-               nbooks: int, bits: int, method) -> float:
-
-    n = parts.sum().item()
-
-    device = torch.device('cuda:0')
-
-    input = torch.randn((1, m, k), dtype=torch.float16, device=device)
-
-    code_range = (1 << bits) // 2
-    ingroups = 8
-
-    codes = torch.randint(-code_range,
-                          code_range,
-                          size=(n, k // ingroups, nbooks),
-                          dtype=get_int_dtype(bits),
-                          device=device)
-
-    codebooks = torch.randn(size=(parts.shape[0] * nbooks, 1 << bits, 1, 8),
-                            dtype=torch.float16,
-                            device=device)
-
-    scales = torch.randn(size=(n, 1, 1, 1), dtype=torch.float16, device=device)
-
-    # for comparison to just a pytorch mult.
-    weights = torch.randn((n, k), dtype=torch.float16, device=device)
-
-    start_event = torch.cuda.Event(enable_timing=True)
-    end_event = torch.cuda.Event(enable_timing=True)
-
-    start_event.record()
-
-    if method is torch_mult:
-        for i in range(num_calls):
-            torch_mult(input, weights, scales)
-    else:
-        for i in range(num_calls):
-            method(input, codes, codebooks, scales, parts, None)
-
-    end_event.record()
-    end_event.synchronize()
-
-    dur_ms = start_event.elapsed_time(end_event) / num_calls
-    return dur_ms
-
-
-if __name__ == "__main__":
-    sys.exit(main())
--- a/benchmarks/kernels/benchmark_layernorm.py
+++ b/benchmarks/kernels/benchmark_layernorm.py
+import time
+
+import torch
+
+from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
+                        seed_everything)
+
+
+@torch.inference_mode()
+def main(num_tokens: int,
+         hidden_size: int,
+         add_residual: bool,
+         dtype: torch.dtype,
+         seed: int = 0,
+         do_profile: bool = False,
+         num_warmup_iters: int = 5,
+         num_iters: int = 100) -> None:
+    """Benchmark the RMSNorm layer and print its mean latency per call.
+
+    Args:
+        num_tokens: Number of rows of the random input tensor.
+        hidden_size: Number of columns of the input; also passed to the
+            ``RMSNorm`` constructor.
+        add_residual: When True, a random residual tensor is passed to the
+            layer together with the input.
+        dtype: Data type of the layer and of the generated tensors.
+        seed: Value handed to ``seed_everything``.
+        do_profile: When True, a single iteration is run between CUDA
+            profiler start/stop calls and ``num_iters`` is ignored.
+        num_warmup_iters: Untimed iterations executed before measuring.
+        num_iters: Timed iterations when not profiling.
+    """
+    seed_everything(seed)
+    torch.set_default_device("cuda")
+
+    layer = RMSNorm(hidden_size).to(dtype=dtype)
+    layer.weight.data.normal_(mean=1.0, std=0.1)
+    scale = 1 / (2 * hidden_size)
+    x = torch.randn(num_tokens, hidden_size, dtype=dtype)
+    x *= scale
+    residual = torch.randn_like(x) * scale if add_residual else None
+
+    def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
+        """Return the mean wall-clock seconds per layer call."""
+        torch.cuda.synchronize()
+        if profile:
+            torch.cuda.cudart().cudaProfilerStart()
+        start_time = time.perf_counter()
+
+        for _ in range(num_iters):
+            layer(x, residual)
+        # Wait for the queued GPU work so the timer covers its execution.
+        torch.cuda.synchronize()
+
+        end_time = time.perf_counter()
+        if profile:
+            # Close the profiling range opened above; this previously
+            # called cudaProfilerStart a second time.
+            torch.cuda.cudart().cudaProfilerStop()
+        return (end_time - start_time) / num_iters
+
+    # Warmup.
+    print("Warming up...")
+    run_benchmark = run_cuda_benchmark
+    run_benchmark(num_iters=num_warmup_iters, profile=False)
+
+    # Benchmark.
+    if do_profile:
+        latency = run_benchmark(num_iters=1, profile=True)
+    else:
+        latency = run_benchmark(num_iters=num_iters, profile=False)
+    print(f"Kernel running time: {latency * 1000000:.3f} us")
+
+
+if __name__ == '__main__':
+    # Script entry point: read the options, echo them, run the benchmark.
+    parser = FlexibleArgumentParser(
+        description="Benchmark the layernorm kernel.")
+    parser.add_argument("--num-tokens", default=4096, type=int)
+    parser.add_argument("--hidden-size", default=8192, type=int)
+    parser.add_argument("--add-residual", action="store_true")
+    parser.add_argument(
+        "--dtype",
+        default="half",
+        choices=["half", "bfloat16", "float"],
+        type=str,
+    )
+    parser.add_argument("--seed", default=0, type=int)
+    parser.add_argument("--profile", action="store_true")
+    parser.add_argument("--num-warmup-iters", default=5, type=int)
+    parser.add_argument(
+        "--num-iters",
+        default=100,
+        type=int,
+        help="Number of benchmark iterations. "
+        "If --profile is set, this number is ignored",
+    )
+
+    args = parser.parse_args()
+    print(args)
+
+    # The dtype arrives as a string and is mapped to a torch dtype here.
+    main(
+        num_tokens=args.num_tokens,
+        hidden_size=args.hidden_size,
+        add_residual=args.add_residual,
+        dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype],
+        seed=args.seed,
+        do_profile=args.profile,
+        num_warmup_iters=args.num_warmup_iters,
+        num_iters=args.num_iters,
+    )
--- a/benchmarks/kernels/benchmark_machete.py
+++ b/benchmarks/kernels/benchmark_machete.py
+import argparse
+import copy
+import itertools
+import math
+import pickle as pkl
+import time
+from itertools import product
+from typing import Callable, Iterable, List, Optional, Tuple
+
+import pandas as pd
+import torch
+import torch.utils.benchmark as TBenchmark
+from torch.utils.benchmark import Measurement as TMeasurement
+from weight_shapes import WEIGHT_SHAPES
+
+from vllm import _custom_ops as ops
+from vllm.model_executor.layers.quantization.utils.marlin_utils import (
+    GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N, marlin_permute_scales)
+from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
+    MarlinWorkspace)
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    gptq_pack, pack_rows, quantize_weights)
+from vllm.scalar_type import ScalarType, scalar_types
+from vllm.utils import FlexibleArgumentParser
+
+# Fallback values for the model_bench sub-command's --models, --batch-sizes
+# and --tp-sizes options when they are not given on the command line.
+DEFAULT_MODELS = ["meta-llama/Llama-3-8b", "meta-llama/Llama-2-70b-hf"]
+DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512, 1024]
+DEFAULT_TP_SIZES = [1]
+
+
+def machete_pack_weights(w_q: torch.tensor, wtype: ScalarType) -> torch.tensor:
+    """Row-pack ``w_q`` and pass it to ``ops.machete_prepack_B``."""
+    packed = pack_rows(w_q, wtype.size_bits, *w_q.shape)
+    # Transpose, force a contiguous copy, transpose back: the logical shape
+    # is unchanged but the storage becomes column-major.
+    col_major = packed.t().contiguous().t()
+    return ops.machete_prepack_B(col_major, wtype)
+
+
+def make_bench_tensors(
+    atype: torch.dtype, wtype: ScalarType, group_size: int, m: int, n: int,
+    k: int
+) -> Tuple[torch.tensor, List[Tuple[torch.tensor, torch.tensor, torch.tensor,
+                                    torch.tensor]]]:
+    """Create a random (m, k) activation and a list of quantized weights.
+
+    Each weight starts as a random (k, n) tensor of dtype ``atype`` on the
+    CUDA device and is passed through ``quantize_weights``; the returned
+    list holds whatever that helper returns for each weight.
+    """
+    assert wtype.is_integer(), "TODO: support floating point weights"
+
+    # Weights must not stay resident in L2 cache between runs, so enough of
+    # them are built to exceed it. L2 is 50mb on a H100, hence the target of
+    # a total weight size > 2*50mb (expressed in bits below).
+    target_bits = 2 * 50 * 1024**2 * 8
+    num_weights = math.ceil(target_bits / (k * n * wtype.size_bits))
+
+    a = torch.randn((m, k), device="cuda", dtype=atype) * 5
+
+    # Generate every raw weight first, then quantize them in a second pass.
+    raw_weights = []
+    for _ in range(num_weights):
+        raw_weights.append(torch.randn((k, n), device="cuda", dtype=atype))
+    quantized_weights = [
+        quantize_weights(raw, wtype, group_size) for raw in raw_weights
+    ]
+
+    return a, quantized_weights
+
+
+# impl
+
+
+# bench
+def bench_fn(label: str, sub_label: str, description: str,
+             fn: Callable) -> TMeasurement:
+    """Time ``fn()`` with a torch benchmark Timer and return the measurement.
+
+    The three strings are attached to the Timer unchanged; the measurement
+    comes from ``blocked_autorange`` with a minimum run time of 1.
+    """
+    timer = TBenchmark.Timer(
+        stmt="fn()",
+        globals={"fn": fn},
+        label=label,
+        sub_label=sub_label,
+        description=description,
+    )
+    return timer.blocked_autorange(min_run_time=1)
+
+
+def loop_over_weights(
+    a: torch.tensor, weights: List[Tuple[torch.tensor, torch.tensor,
+                                         torch.tensor, torch.tensor]],
+    fn: Callable[[torch.tensor, torch.tensor, torch.tensor, torch.tensor],
+                 None]):
+    """Call ``fn(a, w_ref, w_q, w_s)`` once per 4-tuple in ``weights``.
+
+    The fourth element of every tuple is unpacked but not forwarded.
+    """
+    for entry in weights:
+        w_ref, w_q, w_s, _unused = entry
+        fn(a, w_ref, w_q, w_s)
+
+
+# One row per schedule timed by bench() during a schedule sweep; remains
+# None until the first row is recorded.
+_SWEEP_SCHEDULES_RESULTS: Optional[pd.DataFrame] = None
+# CSV path for the sweep results; assigned from --sweep-csv-out by the
+# script entry point.
+_SWEEP_SCHEDULES_RESULTS_CSV: Optional[str] = None
+
+
+def bench(atype: torch.dtype,
+          wtype: ScalarType,
+          group_size: int,
+          m: int,
+          k: int,
+          n: int,
+          label: str,
+          sub_label: str,
+          benchmark_marlinv1: bool = True,
+          sweep_schedules: bool = True) -> Iterable[TMeasurement]:
+    """Time torch.matmul, optionally Marlin, and Machete for one GEMM shape.
+
+    Args:
+        atype: dtype of the activation and reference weight tensors.
+        wtype: Quantized weight type passed to the packing and GEMM ops.
+        group_size: Quantization group size.
+        m: GEMM M dimension, forwarded to ``make_bench_tensors``.
+        k: GEMM K dimension, forwarded to ``make_bench_tensors``.
+        n: GEMM N dimension, forwarded to ``make_bench_tensors``.
+        label: Label attached to every measurement.
+        sub_label: Sub-label; the number of weight tensors is appended.
+        benchmark_marlinv1: Also time ``ops.gptq_marlin_gemm`` when True.
+        sweep_schedules: Also time each non-pruned schedule returned by
+            ``ops.machete_supported_schedules`` and record one row per
+            schedule in ``_SWEEP_SCHEDULES_RESULTS``.
+
+    Returns:
+        The list of measurements. The fastest swept schedule is included
+        only when at least one schedule was actually timed.
+    """
+    global _SWEEP_SCHEDULES_RESULTS
+
+    a, weights = make_bench_tensors(atype, wtype, group_size, m, n, k)
+    sub_label += f", L={len(weights)}"
+
+    weights_machete = [(w_ref, machete_pack_weights(w_q, wtype), w_s, w_zp)
+                       for w_ref, w_q, w_s, w_zp in weights]
+
+    timers = []
+    # pytorch impl
+    timers.append(
+        bench_fn(
+            label, sub_label, "torch.matmul", lambda: loop_over_weights(
+                a,
+                weights,
+                lambda a, w_ref, w_q, w_s: torch.matmul(a, w_ref),
+            )))
+
+    if benchmark_marlinv1:
+        # All weights share one shape, so the first reference weight
+        # supplies the dimensions and device for the Marlin helpers.
+        w_ref = weights[0][0]
+
+        w_zp_empty = torch.empty(0, dtype=torch.int, device=w_ref.device)
+        sort_indices = torch.empty(0, dtype=torch.int, device=w_ref.device)
+        g_idx = torch.empty(0, dtype=torch.int, device=w_ref.device)
+
+        def marlinv1_pack_weights(w_q: torch.tensor) -> torch.tensor:
+            w_q_gptq = gptq_pack(w_q, wtype.size_bits, *w_ref.shape)
+            return ops.gptq_marlin_repack(w_q_gptq, sort_indices, *w_ref.shape,
+                                          wtype.size_bits)
+
+        def marlinv1_permute_scales(w_s: torch.tensor) -> torch.tensor:
+            return marlin_permute_scales(w_s, *w_ref.shape, group_size)
+
+        weights_marlinv1 = [(w_ref, marlinv1_pack_weights(w_q),
+                             marlinv1_permute_scales(w_s), w_zp)
+                            for w_ref, w_q, w_s, w_zp in weights]
+
+        workspace = MarlinWorkspace(w_ref.shape[1], GPTQ_MARLIN_MIN_THREAD_N,
+                                    GPTQ_MARLIN_MAX_PARALLEL)
+
+        # marlinv1
+        timers.append(
+            bench_fn(
+                label, sub_label, "marlin_orig", lambda: loop_over_weights(
+                    a, weights_marlinv1, lambda a, w_ref, w_q, w_s: ops.
+                    gptq_marlin_gemm(a,
+                                     w_q,
+                                     w_s,
+                                     w_zp_empty,
+                                     g_idx,
+                                     sort_indices,
+                                     workspace.scratch,
+                                     wtype,
+                                     size_m=a.shape[0],
+                                     size_n=w_ref.shape[1],
+                                     size_k=w_ref.shape[0],
+                                     is_k_full=True))))
+
+    # machete
+    timers.append(
+        bench_fn(
+            label, sub_label, "machete_heuristic", lambda: loop_over_weights(
+                a, weights_machete, lambda a, _, w_q, w_s: ops.machete_gemm(
+                    a, w_q, wtype, b_scales=w_s, b_group_size=group_size))))
+
+    if sweep_schedules:
+        print("Finding best schedule for machete")
+        best = None
+        best_schedule = None
+        schedules = ops.machete_supported_schedules(wtype)
+        for schedule in reversed(schedules):
+            # The schedule name starts with "<a>x<b>_"; <b> is read as the
+            # schedule's M value.
+            schedule_M = int(schedule.split("_")[0].split("x")[1])
+
+            # Prune known bad schedules
+            if schedule_M >= 2 * max(m, 16) or schedule_M < m // 4:
+                continue
+
+            # schedule is bound as a default so each closure keeps its own.
+            def run(a, _, w_q, w_s, schedule=schedule):
+                ops.machete_gemm(a,
+                                 w_q,
+                                 wtype,
+                                 w_s,
+                                 b_group_size=group_size,
+                                 schedule=schedule)
+
+            res = bench_fn(label, sub_label, "machete_best",
+                           lambda: loop_over_weights(a, weights_machete, run))
+
+            results_row = {
+                "M": m,
+                "K": k,
+                "N": n,
+                "group_size": group_size,
+                "schedule": schedule,
+                "median": res.median,
+            }
+            if _SWEEP_SCHEDULES_RESULTS is None:
+                _SWEEP_SCHEDULES_RESULTS = pd.DataFrame(
+                    columns=results_row.keys())
+            _SWEEP_SCHEDULES_RESULTS.\
+                loc[len(_SWEEP_SCHEDULES_RESULTS)] = results_row
+
+            print(f"  {res.median:5.5} ", schedule)
+            if best is None or res.median < best.median:
+                best = res
+                best_schedule = schedule
+        print("Best schedule:", best_schedule)
+        # Every schedule may have been pruned (or none reported); a None
+        # entry would break TBenchmark.Compare in the callers.
+        if best is not None:
+            timers.append(best)
+
+    return timers
+
+
+# runner
+def print_timers(timers: Iterable[TMeasurement]):
+    """Print ``timers`` through a torch benchmark ``Compare`` object."""
+    TBenchmark.Compare(timers).print()
+
+
+def run(dtype: torch.dtype, sweep_schedules: bool,
+        MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]:
+    """Benchmark every (m, k, n) shape and return all measurements.
+
+    Each shape is benchmarked with ``scalar_types.uint4b8`` weights and a
+    group size of 128; its measurements are printed as soon as they exist.
+    """
+    collected = []
+    gemm_label = f"{dtype}-gemm"
+    for m, k, n in MKNs:
+        shape_timers = bench(
+            dtype,
+            scalar_types.uint4b8,
+            128,
+            m,
+            k,
+            n,
+            gemm_label,
+            f"MKN=({m}x{k}x{n})",
+            sweep_schedules=sweep_schedules,
+        )
+        print_timers(shape_timers)
+        collected += shape_timers
+
+    return collected
+
+
+# output makers
+def make_output(
+    data: Iterable[TMeasurement],
+    MKNs: Iterable[Tuple[int, int, int]],
+    base_description: str,
+    timestamp=None,
+):
+    """Print all measurements and pickle them to a timestamped file.
+
+    The file is named ``<base_description>-<timestamp>.pkl``; the current
+    time in whole seconds is used when ``timestamp`` is None. ``MKNs`` is
+    accepted but not used here.
+    """
+    print(f"== All Results {base_description} ====")
+    print_timers(data)
+
+    # pickle all the results
+    if timestamp is None:
+        timestamp = int(time.time())
+    out_path = f"{base_description}-{timestamp}.pkl"
+    with open(out_path, "wb") as f:
+        pkl.dump(data, f)
+
+
+# argparse runners
+
+
+def run_square_bench(args):
+    """Benchmark square GEMMs (M == K == N) over an inclusive size range."""
+    sizes = range(args.dim_start, args.dim_end + 1, args.dim_increment)
+    MKNs = [(dim, dim, dim) for dim in sizes]
+
+    data = run(args.dtype, args.sweep_schedules, MKNs)
+
+    make_output(data, MKNs, f"square_bench-{args.dtype}")
+
+
+def run_range_bench(args):
+    """Benchmark the cartesian product of independent M, K and N ranges.
+
+    ``args.dim_start``, ``args.dim_end`` and ``args.dim_increment`` are
+    each a string of three comma-separated integers in M,K,N order; the
+    end values are inclusive.
+    """
+
+    def parse_ints(text):
+        return [int(part) for part in text.split(",")]
+
+    m_start, k_start, n_start = parse_ints(args.dim_start)
+    m_end, k_end, n_end = parse_ints(args.dim_end)
+    m_increment, k_increment, n_increment = parse_ints(args.dim_increment)
+
+    Ms = list(range(m_start, m_end + 1, m_increment))
+    Ks = list(range(k_start, k_end + 1, k_increment))
+    Ns = list(range(n_start, n_end + 1, n_increment))
+    MKNs = [(m, k, n) for m in Ms for k in Ks for n in Ns]
+
+    data = run(args.dtype, args.sweep_schedules, MKNs)
+
+    make_output(data, MKNs, f"range_bench-{args.dtype}")
+
+
+def run_model_bench(args):
+    """Benchmark the weight shapes of each model for each TP size.
+
+    Results are printed per (model, tp_size) pair and all measurements are
+    pickled together into ``model_bench-<dtype>-<timestamp>.pkl``.
+    """
+
+    print("Benchmarking models:")
+    for i, model in enumerate(args.models):
+        print(f"[{i}]  {model}")
+
+    def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]:
+        # Work on a deep copy so WEIGHT_SHAPES itself is never modified.
+        shapes = []
+        for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]):
+            KN[tp_split_dim] //= tp_size
+            shapes.append(KN)
+        return shapes
+
+    model_bench_data = []
+    models_tps = list(itertools.product(args.models, args.tp_sizes))
+    for model, tp_size in models_tps:
+        KNs = model_shapes(model, tp_size)
+        MKNs = [(m, k, n) for m in args.batch_sizes for k, n in KNs]
+        model_bench_data.append(run(args.dtype, args.sweep_schedules, MKNs))
+
+    # Print all results
+    for data, (model, tp_size) in zip(model_bench_data, models_tps):
+        print(f"== Results {args.dtype} {model}-TP{tp_size} ====")
+        print_timers(data)
+
+    timestamp = int(time.time())
+
+    # pickle all data
+    all_data = list(itertools.chain.from_iterable(model_bench_data))
+    with open(f"model_bench-{args.dtype}-{timestamp}.pkl", "wb") as f:
+        pkl.dump(all_data, f)
+
+
+if __name__ == "__main__":
+    # Script entry point: build the CLI, dispatch to the chosen sub-command,
+    # then write any schedule-sweep results to CSV.
+
+    def to_torch_dtype(dt):
+        """Map a dtype name from the command line to a torch dtype."""
+        if dt == "bfloat16":
+            return torch.bfloat16
+        if dt == "float16":
+            return torch.float16
+        raise ValueError("unsupported dtype")
+
+    parser = FlexibleArgumentParser(
+        description="""
+Benchmark Machete GEMM.
+
+    To run square GEMMs:
+        python3 ./benchmarks/kernels/benchmark_machete.py --dtype float16 square_bench --dim-start 128 --dim-end 512 --dim-increment 64
+    
+    To run constant N and K and sweep M:
+        python3 ./benchmarks/kernels/benchmark_machete.py --dtype float16 range_bench --dim-start 128,16384,16384 --dim-end 512,16384,16384 --dim-increment 64,1,1
+    
+    To run dimensions from a model:
+        python3 ./benchmarks/kernels/benchmark_machete.py --dtype float16 model_bench --models meta-llama/Llama-2-7b-hf --batch-sizes 16 --tp-sizes 1
+    
+    Output:
+        - a .pkl file, that is a list of raw torch.benchmark.utils.Measurements for the pytorch and cutlass implementations for the various GEMMs.
+            """,  # noqa: E501
+        formatter_class=argparse.RawTextHelpFormatter,
+    )
+
+    parser.add_argument(
+        "--dtype",
+        type=to_torch_dtype,
+        required=True,
+        help="Available options are ['bfloat16', 'float16']",
+    )
+    parser.add_argument(
+        "--sweep-schedules",
+        action="store_true",
+        help="Run a sweep over all supported schedules",
+    )
+    parser.add_argument("--sweep-csv-out",
+                        help="CSV to store sweep results",
+                        default="sch_sweep_results.csv")
+    subparsers = parser.add_subparsers(dest="cmd", required=True)
+
+    # square_bench: a single integer range shared by M, K and N.
+    square_parser = subparsers.add_parser("square_bench")
+    square_parser.add_argument("--dim-start", type=int, required=True)
+    square_parser.add_argument("--dim-end", type=int, required=True)
+    square_parser.add_argument("--dim-increment", type=int, required=True)
+    square_parser.set_defaults(func=run_square_bench)
+
+    # range_bench: each option is an "M,K,N" triple parsed by
+    # run_range_bench, so it is kept as a string here.
+    range_parser = subparsers.add_parser("range_bench")
+    range_parser.add_argument(
+        "--dim-start",
+        type=str,
+        required=True,
+        help="Start value for M,K,N as comma separated list")
+    range_parser.add_argument(
+        "--dim-end",
+        type=str,
+        required=True,
+        help="End value (inclusive) for M,K,N as comma separated list")
+    range_parser.add_argument(
+        "--dim-increment",
+        type=str,
+        required=True,
+        help="Increment value for M,K,N as comma separated list")
+    range_parser.set_defaults(func=run_range_bench)
+
+    # model_bench: shapes come from WEIGHT_SHAPES for the chosen models.
+    model_parser = subparsers.add_parser("model_bench")
+    model_parser.add_argument(
+        "--models",
+        nargs="+",
+        type=str,
+        default=DEFAULT_MODELS,
+        choices=WEIGHT_SHAPES.keys(),
+    )
+    model_parser.add_argument("--tp-sizes",
+                              nargs="+",
+                              type=int,
+                              default=DEFAULT_TP_SIZES)
+    model_parser.add_argument("--batch-sizes",
+                              nargs="+",
+                              type=int,
+                              default=DEFAULT_BATCH_SIZES)
+    model_parser.set_defaults(func=run_model_bench)
+
+    args = parser.parse_args()
+
+    _SWEEP_SCHEDULES_RESULTS_CSV = args.sweep_csv_out
+    args.func(args)
+
+    # The DataFrame exists only if bench() recorded at least one sweep row.
+    if _SWEEP_SCHEDULES_RESULTS is not None:
+        _SWEEP_SCHEDULES_RESULTS.to_csv(_SWEEP_SCHEDULES_RESULTS_CSV)
--- a/benchmarks/kernels/benchmark_marlin.py
+++ b/benchmarks/kernels/benchmark_marlin.py
-import argparse
-
-import torch
-import torch.utils.benchmark as benchmark
-from benchmark_shapes import WEIGHT_SHAPES
-
-from vllm import _custom_ops as ops
-from vllm.model_executor.layers.quantization.gptq_marlin import (
-    GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N,
-    GPTQ_MARLIN_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_SUPPORTED_NUM_BITS)
-from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
-    GPTQ_MARLIN_24_MAX_PARALLEL, GPTQ_MARLIN_24_MIN_THREAD_N,
-    GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_24_SUPPORTED_NUM_BITS)
-from vllm.model_executor.layers.quantization.utils.marlin_utils import (
-    MarlinWorkspace, marlin_24_quantize, marlin_quantize)
-from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    gptq_pack, quantize_weights, sort_weights)
-
-DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"]
-DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
-
-ACT_ORDER_OPTS = [False, True]
-K_FULL_OPTS = [False, True]
-
-
-def bench_run(results, model, act_order, is_k_full, num_bits, group_size,
-              size_m, size_k, size_n):
-    label = "Quant Matmul"
-
-    sub_label = ("{}, act={} k_full={}, b={}, g={}, "
-                 "MKN=({}x{}x{})".format(model, act_order, is_k_full, num_bits,
-                                         group_size, size_m, size_k, size_n))
-
-    print(f"Testing: {sub_label}")
-
-    a = torch.randn(size_m, size_k).to(torch.half).cuda()
-    b = torch.rand(size_k, size_n).to(torch.half).cuda()
-
-    a_tmp = (torch.zeros(size_m, size_k).to(torch.half).cuda())
-
-    # Marlin quant
-    (
-        marlin_w_ref,
-        marlin_q_w,
-        marlin_s,
-        marlin_g_idx,
-        marlin_sort_indices,
-        marlin_rand_perm,
-    ) = marlin_quantize(b, num_bits, group_size, act_order)
-
-    # Marlin_24 quant
-    (marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta,
-     marlin_24_s) = marlin_24_quantize(b, num_bits, group_size)
-
-    # GPTQ quant
-    (w_ref, q_w, s, g_idx,
-     rand_perm) = quantize_weights(b, num_bits, group_size, act_order)
-    q_w_gptq = gptq_pack(q_w, num_bits, size_k, size_n)
-
-    # For act_order, sort the "weights" and "g_idx"
-    # so that group ids are increasing
-    repack_sort_indices = torch.empty(0, dtype=torch.int, device=b.device)
-    if act_order:
-        (q_w, g_idx, repack_sort_indices) = sort_weights(q_w, g_idx)
-
-    # Prepare
-    marlin_workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_MIN_THREAD_N,
-                                       GPTQ_MARLIN_MAX_PARALLEL)
-
-    marlin_24_workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_24_MIN_THREAD_N,
-                                          GPTQ_MARLIN_24_MAX_PARALLEL)
-
-    globals = {
-        # Gen params
-        "num_bits": num_bits,
-        "group_size": group_size,
-        "size_m": size_m,
-        "size_n": size_n,
-        "size_k": size_k,
-        "a": a,
-        "a_tmp": a_tmp,
-        # Marlin params
-        "marlin_w_ref": marlin_w_ref,
-        "marlin_q_w": marlin_q_w,
-        "marlin_s": marlin_s,
-        "marlin_g_idx": marlin_g_idx,
-        "marlin_sort_indices": marlin_sort_indices,
-        "marlin_rand_perm": marlin_rand_perm,
-        "marlin_workspace": marlin_workspace,
-        "is_k_full": is_k_full,
-        # Marlin_24 params
-        "marlin_24_w_ref": marlin_24_w_ref,
-        "marlin_24_q_w_comp": marlin_24_q_w_comp,
-        "marlin_24_meta": marlin_24_meta,
-        "marlin_24_s": marlin_24_s,
-        "marlin_24_workspace": marlin_24_workspace,
-        # GPTQ params
-        "q_w_gptq": q_w_gptq,
-        "repack_sort_indices": repack_sort_indices,
-        # Kernels
-        "gptq_marlin_gemm": ops.gptq_marlin_gemm,
-        "gptq_marlin_24_gemm": ops.gptq_marlin_24_gemm,
-        "gptq_marlin_repack": ops.gptq_marlin_repack,
-    }
-
-    min_run_time = 1
-
-    # Warmup pytorch
-    for i in range(5):
-        torch.matmul(a, marlin_w_ref)
-
-    results.append(
-        benchmark.Timer(
-            stmt="torch.matmul(a, marlin_w_ref)",
-            globals=globals,
-            label=label,
-            sub_label=sub_label,
-            description="pytorch_gemm",
-        ).blocked_autorange(min_run_time=min_run_time))
-
-    results.append(
-        benchmark.Timer(
-            stmt=
-            "output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, num_bits, size_m, size_n, size_k, is_k_full)",  # noqa: E501
-            globals=globals,
-            label=label,
-            sub_label=sub_label,
-            description="gptq_marlin_gemm",
-        ).blocked_autorange(min_run_time=min_run_time))
-
-    if (num_bits in GPTQ_MARLIN_24_SUPPORTED_NUM_BITS
-            and group_size in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES):
-        results.append(
-            benchmark.Timer(
-                stmt=
-                "output = gptq_marlin_24_gemm(a, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s, marlin_24_workspace.scratch, num_bits, size_m, size_n, size_k)",  # noqa: E501
-                globals=globals,
-                label=label,
-                sub_label=sub_label,
-                description="gptq_marlin_24_gemm",
-            ).blocked_autorange(min_run_time=min_run_time))
-
-    results.append(
-        benchmark.Timer(
-            stmt=
-            "q_res = gptq_marlin_repack(q_w_gptq, repack_sort_indices, size_k, size_n, num_bits)",  # noqa: E501
-            globals=globals,
-            label=label,
-            sub_label=sub_label,
-            description="gptq_marlin_repack",
-        ).blocked_autorange(min_run_time=min_run_time))
-
-
-def main(args):
-    print("Benchmarking models:")
-    for i, model in enumerate(args.models):
-        print(f"[{i}]  {model}")
-
-    results = []
-
-    for model in args.models:
-        for layer in WEIGHT_SHAPES[model]:
-            size_k = layer[0]
-            size_n = layer[1]
-
-            if len(args.limit_k) > 0 and size_k not in args.limit_k:
-                continue
-
-            if len(args.limit_n) > 0 and size_n not in args.limit_n:
-                continue
-
-            for act_order in ACT_ORDER_OPTS:
-                if len(args.limit_act_order
-                       ) > 0 and act_order not in args.limit_act_order:
-                    continue
-
-                for is_k_full in K_FULL_OPTS:
-                    if len(args.limit_k_full
-                           ) > 0 and is_k_full not in args.limit_k_full:
-                        continue
-
-                    for num_bits in GPTQ_MARLIN_SUPPORTED_NUM_BITS:
-                        if len(args.limit_num_bits
-                               ) > 0 and num_bits not in args.limit_num_bits:
-                            continue
-
-                        for group_size in GPTQ_MARLIN_SUPPORTED_GROUP_SIZES:
-                            if len(
-                                    args.limit_group_size
-                            ) > 0 and group_size not in args.limit_group_size:
-                                continue
-
-                            # For act_order, the group_size must be less than
-                            # size_k
-                            if act_order and (group_size == size_k
-                                              or group_size == -1):
-                                continue
-
-                            for size_m in args.batch_sizes:
-                                bench_run(results, model, act_order, is_k_full,
-                                          num_bits, group_size, size_m, size_k,
-                                          size_n)
-
-    compare = benchmark.Compare(results)
-    compare.print()
-
-
-# For quick benchmarking use:
-#   python benchmark_marlin.py --batch-sizes 1 16 32 --limit-k 4096 --limit-n 4096 --limit-group-size 128 --limit-num-bits 4 --limit-act-order 0 --limit-k-full 1 # noqa E501
-#
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        description="Benchmark Marlin across specified models/shapes/batches")
-    parser.add_argument(
-        "--models",
-        nargs="+",
-        type=str,
-        default=DEFAULT_MODELS,
-        choices=WEIGHT_SHAPES.keys(),
-    )
-    parser.add_argument("--batch-sizes",
-                        nargs="+",
-                        type=int,
-                        default=DEFAULT_BATCH_SIZES)
-    parser.add_argument("--limit-k", nargs="+", type=int, default=[])
-    parser.add_argument("--limit-n", nargs="+", type=int, default=[])
-    parser.add_argument("--limit-group-size", nargs="+", type=int, default=[])
-    parser.add_argument("--limit-num-bits", nargs="+", type=int, default=[])
-    parser.add_argument("--limit-act-order", nargs="+", type=int, default=[])
-    parser.add_argument("--limit-k-full", nargs="+", type=int, default=[])
-
-    args = parser.parse_args()
-    main(args)
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
-import argparse
-import time
-from datetime import datetime
-from typing import Any, Dict, List, Tuple
-
-import ray
-import torch
-import triton
-from ray.experimental.tqdm_ray import tqdm
-from transformers import AutoConfig
-
-from vllm.model_executor.layers.fused_moe.fused_moe import *
-
-
-def benchmark_config(
-    config: Dict[str, int],
-    num_tokens: int,
-    num_experts: int,
-    shard_intermediate_size: int,
-    hidden_size: int,
-    topk: int,
-    dtype: torch.dtype,
-    use_fp8: bool,
-    num_iters: int = 100,
-) -> float:
-    """Time `fused_moe` under one kernel config and return the mean latency.
-
-    Random activations, expert weights and gating logits are generated, ten
-    `fused_moe` calls are captured into a CUDA graph, and the graph is
-    replayed `num_iters` times with a different gating matrix each time.
-
-    Args:
-        config: Kernel launch config forwarded as `override_config`.
-        num_tokens: Number of rows in the activation tensor.
-        num_experts: Number of experts (leading dim of both weights).
-        shard_intermediate_size: Second dim of `w1`; `w2` uses half of it.
-        hidden_size: Hidden dimension of the activations.
-        topk: Number of experts selected per token.
-        dtype: Activation dtype (also the weight dtype unless `use_fp8`).
-        use_fp8: Cast the weights to `float8_e4m3fn` and pass random scales.
-        num_iters: Number of timed graph replays.
-
-    Returns:
-        Average time of a single `fused_moe` call, in microseconds.
-    """
-    # fp8 weights are created in float16 first and cast further below.
-    init_dtype = torch.float16 if use_fp8 else dtype
-    x = torch.randn(num_tokens, hidden_size, dtype=dtype)
-    w1 = torch.randn(num_experts,
-                     shard_intermediate_size,
-                     hidden_size,
-                     dtype=init_dtype)
-    w2 = torch.randn(num_experts,
-                     hidden_size,
-                     shard_intermediate_size // 2,
-                     dtype=init_dtype)
-    # One independent gating matrix per timed iteration.
-    gating_output = torch.randn(num_iters,
-                                num_tokens,
-                                num_experts,
-                                dtype=torch.float32)
-
-    w1_scale = None
-    w2_scale = None
-    a1_scale = None
-    a2_scale = None
-    if use_fp8:
-        w1_scale = torch.randn(num_experts, dtype=torch.float32)
-        w2_scale = torch.randn(num_experts, dtype=torch.float32)
-        a1_scale = torch.randn(1, dtype=torch.float32)
-        a2_scale = torch.randn(1, dtype=torch.float32)
-
-        w1 = w1.to(torch.float8_e4m3fn)
-        w2 = w2.to(torch.float8_e4m3fn)
-
-    # Fixed buffer read by the captured graph; refreshed in place by prepare().
-    input_gating = torch.empty(num_tokens, num_experts, dtype=torch.float32)
-
-    def prepare(i: int):
-        # Load the i-th gating matrix into the buffer the graph was captured on.
-        input_gating.copy_(gating_output[i])
-
-    def run():
-        fused_moe(
-            x,
-            w1,
-            w2,
-            input_gating,
-            topk,
-            renormalize=True,
-            inplace=True,
-            override_config=config,
-            use_fp8=use_fp8,
-            w1_scale=w1_scale,
-            w2_scale=w2_scale,
-            a1_scale=a1_scale,
-            a2_scale=a2_scale,
-        )
-
-    # JIT compilation & warmup
-    run()
-    torch.cuda.synchronize()
-
-    # Capture 10 invocations with CUDA graph
-    graph = torch.cuda.CUDAGraph()
-    with torch.cuda.graph(graph):
-        for _ in range(10):
-            run()
-    torch.cuda.synchronize()
-
-    # Warmup
-    for _ in range(5):
-        graph.replay()
-    torch.cuda.synchronize()
-
-    start_event = torch.cuda.Event(enable_timing=True)
-    end_event = torch.cuda.Event(enable_timing=True)
-
-    latencies = []
-    for i in range(num_iters):
-        prepare(i)
-        # Keep the gating copy out of the timed region.
-        torch.cuda.synchronize()
-
-        start_event.record()
-        graph.replay()
-        end_event.record()
-        end_event.synchronize()
-        latencies.append(start_event.elapsed_time(end_event))
-    # Each replay covers 10 calls; elapsed_time is in ms, hence the * 1000.
-    avg = sum(latencies) / (num_iters * 10) * 1000  # us
-    graph.reset()
-    return avg
-
-
-def get_configs_compute_bound() -> List[Dict[str, int]]:
-    """Enumerate the Triton kernel configs searched during tuning.
-
-    The result is the cartesian product of the candidate values below, with
-    `num_stages` varying slowest and `GROUP_SIZE_M` varying fastest.
-    """
-    # Reduced search space for faster tuning.
-    # TODO(woosuk): Increase the search space and use a performance model to
-    # prune the search space.
-    return [
-        {
-            "BLOCK_SIZE_M": m_block,
-            "BLOCK_SIZE_N": n_block,
-            "BLOCK_SIZE_K": k_block,
-            "GROUP_SIZE_M": m_group,
-            "num_warps": warps,
-            "num_stages": stages,
-        }
-        for stages in (2, 3, 4, 5)
-        for m_block in (16, 32, 64, 128, 256)
-        for k_block in (64, 128, 256)
-        for n_block in (32, 64, 128, 256)
-        for warps in (4, 8)
-        for m_group in (1, 16, 32, 64)
-    ]
-
-
-@ray.remote(num_gpus=1)
-class BenchmarkWorker:
-    """Ray actor pinned to one GPU that benchmarks or tunes `fused_moe`."""
-
-    def __init__(self, seed: int) -> None:
-        # Tensors created without an explicit device land on the GPU.
-        torch.set_default_device("cuda")
-        torch.cuda.manual_seed_all(seed)
-        self.seed = seed
-
-    def benchmark(
-        self,
-        num_tokens: int,
-        num_experts: int,
-        shard_intermediate_size: int,
-        hidden_size: int,
-        topk: int,
-        dtype: torch.dtype,
-        use_fp8: bool,
-    ) -> Tuple[Dict[str, int], float]:
-        """Time the config that would currently be picked for this shape.
-
-        Uses the stored config whose batch-size key is closest to
-        `num_tokens` when one exists, otherwise the default config.
-
-        Returns:
-            The config that was used and its kernel time in microseconds.
-        """
-        # Re-seed so every call draws the same random inputs.
-        torch.cuda.manual_seed_all(self.seed)
-
-        dtype_str = "float8" if use_fp8 else None
-        # NOTE(woosuk): The current naming convention uses w2.shape[2], which
-        # is the intermediate size after silu_and_mul.
-        op_config = get_moe_configs(num_experts, shard_intermediate_size // 2,
-                                    dtype_str)
-        if op_config is None:
-            config = get_default_config(num_tokens, num_experts,
-                                        shard_intermediate_size, hidden_size,
-                                        topk, dtype_str)
-        else:
-            config = op_config[min(op_config.keys(),
-                                   key=lambda x: abs(x - num_tokens))]
-        kernel_time = benchmark_config(config, num_tokens, num_experts,
-                                       shard_intermediate_size, hidden_size,
-                                       topk, dtype, use_fp8)
-        return config, kernel_time
-
-    def tune(
-        self,
-        num_tokens: int,
-        num_experts: int,
-        shard_intermediate_size: int,
-        hidden_size: int,
-        topk: int,
-        dtype: torch.dtype,
-        use_fp8: bool,
-        search_space: List[Dict[str, int]],
-    ) -> Dict[str, int]:
-        """Return the fastest config in `search_space` for this shape.
-
-        Configs that raise Triton's `OutOfResources` are skipped.
-
-        Raises:
-            RuntimeError: If no config in `search_space` could be benchmarked.
-        """
-        best_config = None
-        best_time = float("inf")
-        for config in tqdm(search_space):
-            try:
-                kernel_time = benchmark_config(config,
-                                               num_tokens,
-                                               num_experts,
-                                               shard_intermediate_size,
-                                               hidden_size,
-                                               topk,
-                                               dtype,
-                                               use_fp8,
-                                               num_iters=10)
-            except triton.runtime.autotuner.OutOfResources:
-                # Some configurations may be invalid and fail to compile.
-                continue
-
-            if kernel_time < best_time:
-                best_time = kernel_time
-                best_config = config
-        if best_config is None:
-            # Fail here with a clear message instead of handing None to the
-            # caller, which would index into it.
-            raise RuntimeError(
-                "No valid kernel config found for "
-                f"batch_size={num_tokens}: every candidate failed to run")
-        now = datetime.now()
-        print(f"[{now.ctime()}] Completed tuning for batch_size={num_tokens}")
-        return best_config
-
-
-def sort_config(config: Dict[str, int]) -> Dict[str, int]:
-    """Return a copy of `config` holding only the known keys, in fixed order.
-
-    Raises:
-        KeyError: If `config` lacks one of the expected keys.
-    """
-    ordered_keys = (
-        "BLOCK_SIZE_M",
-        "BLOCK_SIZE_N",
-        "BLOCK_SIZE_K",
-        "GROUP_SIZE_M",
-        "num_warps",
-        "num_stages",
-    )
-    return {key: config[key] for key in ordered_keys}
-
-
-def save_configs(
-    configs: Dict[int, Dict[str, int]],
-    num_experts: int,
-    shard_intermediate_size: int,
-    hidden_size: int,
-    topk: int,
-    dtype: torch.dtype,
-    use_fp8: bool,
-) -> None:
-    """Write the tuned configs as indented JSON to the config file.
-
-    The file name comes from `get_config_file_name`. `hidden_size`, `topk`
-    and `dtype` are accepted but not used here.
-    """
-    if use_fp8:
-        dtype_str = "float8"
-    else:
-        dtype_str = None
-    # NOTE(woosuk): The current naming convention uses w2.shape[2], which
-    # is the intermediate size after silu_and_mul.
-    post_activation_size = shard_intermediate_size // 2
-    filename = get_config_file_name(num_experts, post_activation_size,
-                                    dtype_str)
-    print(f"Writing best config to {filename}...")
-    with open(filename, "w") as out_file:
-        json.dump(configs, out_file, indent=4)
-        # Finish the file with a trailing newline.
-        out_file.write("\n")
-
-
-def main(args: argparse.Namespace):
-    """Benchmark or tune `fused_moe` for the MoE shape of `args.model`.
-
-    One `BenchmarkWorker` is created per available GPU and batch sizes are
-    handed out round-robin. With `--tune`, the best config per batch size is
-    saved; otherwise the current config and its kernel time are printed.
-    """
-    print(args)
-
-    config = AutoConfig.from_pretrained(args.model)
-    if config.architectures[0] == "DbrxForCausalLM":
-        ffn_config = config.ffn_config
-        E = ffn_config.moe_num_experts
-        topk = ffn_config.moe_top_k
-        intermediate_size = ffn_config.ffn_hidden_size
-    else:
-        # Default: Mixtral.
-        E = config.num_local_experts
-        topk = config.num_experts_per_tok
-        intermediate_size = config.intermediate_size
-    # Both architectures shard the doubled intermediate size the same way.
-    shard_intermediate_size = 2 * intermediate_size // args.tp_size
-
-    hidden_size = config.hidden_size
-    dtype = config.torch_dtype
-    use_fp8 = args.dtype == "fp8"
-
-    batch_sizes = ([args.batch_size] if args.batch_size is not None else [
-        1, 2, 4, 8, 16, 24, 32, 48, 64, 96, 128, 256, 512, 1024, 1536, 2048,
-        3072, 4096
-    ])
-
-    ray.init()
-    num_gpus = int(ray.available_resources()["GPU"])
-    workers = [BenchmarkWorker.remote(args.seed) for _ in range(num_gpus)]
-
-    def _distribute(method: str, inputs: List[Any]) -> List[Any]:
-        # Submit one remote call per input, cycling through the workers, then
-        # block until every result is available.
-        futures = []
-        next_worker = 0
-        for call_args in inputs:
-            remote_method = getattr(workers[next_worker], method)
-            futures.append(remote_method.remote(*call_args))
-            next_worker = (next_worker + 1) % num_gpus
-        return ray.get(futures)
-
-    # Arguments shared by every remote call, after the batch size.
-    shape_args = (E, shard_intermediate_size, hidden_size, topk, dtype,
-                  use_fp8)
-
-    if not args.tune:
-        outputs = _distribute("benchmark", [(batch_size, ) + shape_args
-                                            for batch_size in batch_sizes])
-
-        for batch_size, (config, kernel_time) in zip(batch_sizes, outputs):
-            print(f"Batch size: {batch_size}, config: {config}")
-            print(f"Kernel time: {kernel_time:.2f} us")
-        return
-
-    search_space = get_configs_compute_bound()
-    print(f"Start tuning over {len(search_space)} configurations...")
-
-    start = time.time()
-    configs = _distribute("tune",
-                          [(batch_size, ) + shape_args + (search_space, )
-                           for batch_size in batch_sizes])
-    best_configs = dict(zip(batch_sizes, map(sort_config, configs)))
-    save_configs(best_configs, E, shard_intermediate_size, hidden_size, topk,
-                 dtype, use_fp8)
-    end = time.time()
-    print(f"Tuning took {end - start:.2f} seconds")
-
-
-if __name__ == "__main__":
-    # Command-line entry point: collect the options and hand them to main().
-    cli = argparse.ArgumentParser()
-    cli.add_argument(
-        "--model",
-        type=str,
-        default="mistralai/Mixtral-8x7B-Instruct-v0.1",
-    )
-    cli.add_argument("--tp-size", "-tp", type=int, default=2)
-    cli.add_argument(
-        "--dtype",
-        type=str,
-        choices=["auto", "fp8"],
-        default="auto",
-    )
-    cli.add_argument("--seed", type=int, default=0)
-    # Left unset, main() sweeps its built-in list of batch sizes.
-    cli.add_argument("--batch-size", type=int, required=False)
-    cli.add_argument("--tune", action="store_true")
-
-    main(cli.parse_args())
--- a/benchmarks/kernels/benchmark_paged_attention.py
+++ b/benchmarks/kernels/benchmark_paged_attention.py
-import argparse
-import random
-import time
-from typing import Optional
-
-import torch
-
-from vllm import _custom_ops as ops
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, create_kv_caches_with_random
-
-NUM_BLOCKS = 1024
-PARTITION_SIZE = 512
-
-
-@torch.inference_mode()
-def main(
-    version: str,
-    num_seqs: int,
-    seq_len: int,
-    num_query_heads: int,
-    num_kv_heads: int,
-    head_size: int,
-    use_alibi: bool,
-    block_size: int,
-    dtype: torch.dtype,
-    seed: int,
-    do_profile: bool,
-    device: str = "cuda",
-    kv_cache_dtype: Optional[str] = None,
-) -> None:
-    """Benchmark the paged attention kernel and print its mean latency.
-
-    Args:
-        version: Kernel variant to run, "v1" or "v2".
-        num_seqs: Number of sequences (rows of the query tensor).
-        seq_len: Context length, identical for every sequence.
-        num_query_heads: Number of query heads; must be a multiple of
-            `num_kv_heads`.
-        num_kv_heads: Number of key/value heads.
-        head_size: Size of each attention head.
-        use_alibi: Pass random ALiBi slopes to the kernel.
-        block_size: Number of tokens per KV-cache block.
-        dtype: Dtype of the query and output tensors.
-        seed: Seed for the Python and torch random generators.
-        do_profile: Run a single iteration between CUDA profiler start/stop
-            instead of the 100-iteration timing loop.
-        device: Device on which all tensors are created.
-        kv_cache_dtype: KV-cache dtype name forwarded to the cache factory
-            and the kernel.
-
-    Raises:
-        ValueError: If `version` is neither "v1" nor "v2".
-    """
-    random.seed(seed)
-    torch.random.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed(seed)
-
-    scale = float(1.0 / (head_size**0.5))
-    query = torch.empty(num_seqs,
-                        num_query_heads,
-                        head_size,
-                        dtype=dtype,
-                        device=device)
-    query.uniform_(-scale, scale)
-
-    assert num_query_heads % num_kv_heads == 0
-    alibi_slopes = None
-    if use_alibi:
-        alibi_slopes = torch.randn(num_query_heads,
-                                   dtype=torch.float,
-                                   device=device)
-
-    seq_lens = [seq_len for _ in range(num_seqs)]
-    max_seq_len = max(seq_lens)
-    seq_lens = torch.tensor(seq_lens, dtype=torch.int, device=device)
-
-    # Create the block tables.
-    max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
-    block_tables = []
-    for _ in range(num_seqs):
-        block_table = [
-            random.randint(0, NUM_BLOCKS - 1)
-            for _ in range(max_num_blocks_per_seq)
-        ]
-        block_tables.append(block_table)
-    block_tables = torch.tensor(block_tables, dtype=torch.int, device=device)
-
-    # Create the KV cache.
-    key_caches, value_caches = create_kv_caches_with_random(NUM_BLOCKS,
-                                                            block_size,
-                                                            1,
-                                                            num_kv_heads,
-                                                            head_size,
-                                                            kv_cache_dtype,
-                                                            dtype,
-                                                            device=device)
-    key_cache, value_cache = key_caches[0], value_caches[0]
-
-    # Prepare for the paged attention kernel.
-    output = torch.empty_like(query)
-    if version == "v2":
-        # v2 needs per-partition scratch buffers.
-        num_partitions = ((max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE)
-        tmp_output = torch.empty(
-            size=(num_seqs, num_query_heads, num_partitions, head_size),
-            dtype=output.dtype,
-            device=output.device,
-        )
-        exp_sums = torch.empty(
-            size=(num_seqs, num_query_heads, num_partitions),
-            dtype=torch.float32,
-            device=output.device,
-        )
-        max_logits = torch.empty_like(exp_sums)
-
-    def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
-        """Run the kernel `num_iters` times; return seconds per iteration."""
-        torch.cuda.synchronize()
-        if profile:
-            torch.cuda.cudart().cudaProfilerStart()
-        start_time = time.perf_counter()
-
-        # Using default kv_scale
-        kv_scale = 1.0
-
-        for _ in range(num_iters):
-            if version == "v1":
-                ops.paged_attention_v1(
-                    output,
-                    query,
-                    key_cache,
-                    value_cache,
-                    num_kv_heads,
-                    scale,
-                    block_tables,
-                    seq_lens,
-                    block_size,
-                    max_seq_len,
-                    alibi_slopes,
-                    kv_cache_dtype,
-                    kv_scale,
-                )
-            elif version == "v2":
-                ops.paged_attention_v2(
-                    output,
-                    exp_sums,
-                    max_logits,
-                    tmp_output,
-                    query,
-                    key_cache,
-                    value_cache,
-                    num_kv_heads,
-                    scale,
-                    block_tables,
-                    seq_lens,
-                    block_size,
-                    max_seq_len,
-                    alibi_slopes,
-                    kv_cache_dtype,
-                    kv_scale,
-                )
-            else:
-                raise ValueError(f"Invalid version: {version}")
-        # Wait for the queued kernels before reading the clock.
-        torch.cuda.synchronize()
-
-        end_time = time.perf_counter()
-        if profile:
-            # Close the profiling range opened above (the original called
-            # cudaProfilerStart a second time here).
-            torch.cuda.cudart().cudaProfilerStop()
-        return (end_time - start_time) / num_iters
-
-    # Warmup.
-    print("Warming up...")
-    run_benchmark = run_cuda_benchmark
-    run_benchmark(num_iters=3, profile=False)
-
-    # Benchmark.
-    if do_profile:
-        latency = run_benchmark(num_iters=1, profile=True)
-    else:
-        latency = run_benchmark(num_iters=100, profile=False)
-    print(f"Kernel running time: {latency * 1000000:.3f} us")
-
-
-if __name__ == '__main__':
-    # Command-line entry point for the paged attention benchmark.
-    parser = argparse.ArgumentParser(
-        description="Benchmark the paged attention kernel.")
-    parser.add_argument("--version",
-                        type=str,
-                        choices=["v1", "v2"],
-                        default="v2")
-    parser.add_argument("--batch-size", type=int, default=8)
-    # "--seq-len" matches the hyphenated style of the other flags; the old
-    # "--seq_len" spelling is kept as an alias. Both store into args.seq_len.
-    parser.add_argument("--seq-len", "--seq_len", type=int, default=4096)
-    parser.add_argument("--num-query-heads", type=int, default=64)
-    parser.add_argument("--num-kv-heads", type=int, default=8)
-    parser.add_argument("--head-size",
-                        type=int,
-                        choices=[64, 80, 96, 112, 128, 192, 256],
-                        default=128)
-    parser.add_argument("--block-size", type=int, choices=[16, 32], default=16)
-    parser.add_argument("--use-alibi", action="store_true")
-    parser.add_argument("--dtype",
-                        type=str,
-                        choices=["half", "bfloat16", "float"],
-                        default="half")
-    parser.add_argument("--seed", type=int, default=0)
-    parser.add_argument("--profile", action="store_true")
-    parser.add_argument(
-        "--kv-cache-dtype",
-        type=str,
-        choices=["auto", "fp8", "fp8_e5m2", "fp8_e4m3"],
-        default="auto",
-        help="Data type for kv cache storage. If 'auto', will use model "
-        "data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. "
-        "ROCm (AMD GPU) supports fp8 (=fp8_e4m3)")
-    args = parser.parse_args()
-    print(args)
-
-    # Reject bad head counts up front with a readable message.
-    if args.num_query_heads % args.num_kv_heads != 0:
-        raise ValueError("num_query_heads must be divisible by num_kv_heads")
-    main(
-        version=args.version,
-        num_seqs=args.batch_size,
-        seq_len=args.seq_len,
-        num_query_heads=args.num_query_heads,
-        num_kv_heads=args.num_kv_heads,
-        head_size=args.head_size,
-        block_size=args.block_size,
-        use_alibi=args.use_alibi,
-        dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype],
-        seed=args.seed,
-        do_profile=args.profile,
-        kv_cache_dtype=args.kv_cache_dtype,
-    )
--- a/benchmarks/kernels/benchmark_quant.py
+++ b/benchmarks/kernels/benchmark_quant.py
+import time
+
+import torch
+
+from vllm import _custom_ops as ops
+from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
+                        seed_everything)
+
+
+@torch.inference_mode()
+def main(num_tokens: int,
+         hidden_size: int,
+         static_scale: bool,
+         quant_dtype: torch.dtype,
+         dtype: torch.dtype,
+         seed: int = 0,
+         do_profile: bool = False,
+         num_warmup_iters: int = 5,
+         num_iters: int = 100) -> None:
+    """Benchmark the int8 or fp8 quantization kernel and print its latency.
+
+    Args:
+        num_tokens: Number of rows of the random input tensor.
+        hidden_size: Number of columns of the random input tensor.
+        static_scale: Pass a random (1, 1) float32 scale; otherwise pass
+            `None` as the scale.
+        quant_dtype: `torch.int8` selects `scaled_int8_quant`; anything else
+            selects `scaled_fp8_quant`.
+        dtype: Dtype of the input tensor.
+        seed: Seed passed to `seed_everything`.
+        do_profile: Run a single iteration between CUDA profiler start/stop
+            instead of the timed loop.
+        num_warmup_iters: Number of untimed warmup iterations.
+        num_iters: Number of timed iterations (ignored when profiling).
+    """
+    seed_everything(seed)
+    torch.set_default_device("cuda")
+
+    x = torch.randn(num_tokens, hidden_size, dtype=dtype)
+    scale = torch.randn(1, 1, dtype=torch.float32) if static_scale else None
+
+    def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
+        """Run the kernel `num_iters` times; return seconds per iteration."""
+        torch.cuda.synchronize()
+        if profile:
+            torch.cuda.cudart().cudaProfilerStart()
+        start_time = time.perf_counter()
+
+        for _ in range(num_iters):
+            if quant_dtype == torch.int8:
+                ops.scaled_int8_quant(x, scale)
+            else:
+                ops.scaled_fp8_quant(x, scale)
+        # Wait for the queued kernels before reading the clock.
+        torch.cuda.synchronize()
+
+        end_time = time.perf_counter()
+        if profile:
+            # Close the profiling range opened above (the original called
+            # cudaProfilerStart a second time here).
+            torch.cuda.cudart().cudaProfilerStop()
+        return (end_time - start_time) / num_iters
+
+    # Warmup.
+    print("Warming up...")
+    run_benchmark = run_cuda_benchmark
+    run_benchmark(num_iters=num_warmup_iters, profile=False)
+
+    # Benchmark.
+    if do_profile:
+        latency = run_benchmark(num_iters=1, profile=True)
+    else:
+        latency = run_benchmark(num_iters=num_iters, profile=False)
+    print(f"Kernel running time: {latency * 1000000:.3f} us")
+
+
+if __name__ == '__main__':
+
+    def to_torch_dtype(dt):
+        """Translate the --quant-dtype choice into a torch dtype."""
+        if dt == "int8":
+            resolved = torch.int8
+        elif dt == "fp8":
+            resolved = torch.float8_e4m3fn
+        else:
+            raise ValueError(f"Unsupported dtype: {dt}")
+        return resolved
+
+    cli = FlexibleArgumentParser(
+        description="Benchmark the quantization (fp8 or int8) kernel.")
+
+    # Problem size and quantization mode.
+    cli.add_argument("--num-tokens", type=int, default=4096)
+    cli.add_argument("--hidden-size", type=int, default=8192)
+    cli.add_argument("--static-scale", action="store_true")
+    cli.add_argument(
+        "--quant-dtype",
+        type=str,
+        choices=["fp8", "int8"],
+        default="int8",
+    )
+    cli.add_argument(
+        "--dtype",
+        type=str,
+        choices=["half", "bfloat16", "float"],
+        default="half",
+    )
+
+    # Run control.
+    cli.add_argument("--seed", type=int, default=0)
+    cli.add_argument("--profile", action="store_true")
+    cli.add_argument("--num-warmup-iters", type=int, default=5)
+    cli.add_argument(
+        "--num-iters",
+        type=int,
+        default=100,
+        help="Number of benchmark iterations. "
+        "If --profile is set, this number is ignored",
+    )
+
+    options = cli.parse_args()
+    print(options)
+
+    main(
+        num_tokens=options.num_tokens,
+        hidden_size=options.hidden_size,
+        static_scale=options.static_scale,
+        quant_dtype=to_torch_dtype(options.quant_dtype),
+        dtype=STR_DTYPE_TO_TORCH_DTYPE[options.dtype],
+        seed=options.seed,
+        do_profile=options.profile,
+        num_warmup_iters=options.num_warmup_iters,
+        num_iters=options.num_iters,
+    )
--- a/benchmarks/kernels/benchmark_rope.py
+++ b/benchmarks/kernels/benchmark_rope.py
-import argparse
-from itertools import accumulate
-from typing import Optional
-
-import nvtx
-import torch
-
-from vllm.model_executor.layers.rotary_embedding import get_rope
-
-
-def benchmark_rope_kernels_multi_lora(
-    is_neox_style: bool,
-    batch_size: int,
-    seq_len: int,
-    num_heads: int,
-    head_size: int,
-    rotary_dim: Optional[int],
-    dtype: torch.dtype,
-    seed: int,
-    device: str,
-    max_position: int = 8192,
-    base: int = 10000,
-) -> None:
-    """Run batched and non-batched linear-scaling RoPE under NVTX ranges.
-
-    One batched rotary embedding covering four scaling factors is run next
-    to four single-factor embeddings on the same random query/key data. No
-    timing is computed or returned here; the two phases are only wrapped in
-    `nvtx.annotate` ranges ("non-batched" and "batched"), presumably for
-    inspection in an external profiler.
-
-    Args:
-        is_neox_style: Forwarded to `get_rope`.
-        batch_size: Leading dimension of positions, query and key.
-        seq_len: Second dimension of positions, query and key.
-        num_heads: Number of heads; the last dim is `num_heads * head_size`.
-        head_size: Size of each head.
-        rotary_dim: Rotary dimension; falls back to `head_size` when `None`.
-        dtype: Dtype of the query and key tensors.
-        seed: Seed for the torch random generators.
-        device: Set as the torch default device for the whole run.
-        max_position: Exclusive upper bound of the random positions.
-        base: RoPE base forwarded to `get_rope`.
-    """
-    torch.random.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed(seed)
-    torch.set_default_device(device)
-    if rotary_dim is None:
-        rotary_dim = head_size
-    # simulating serving 4 LoRAs
-    scaling_factors = [1, 2, 4, 8]
-    # batched RoPE can take multiple scaling factors
-    batched_rope = get_rope(head_size, rotary_dim, max_position, base,
-                            is_neox_style, {
-                                "type": "linear",
-                                "factor": tuple(scaling_factors)
-                            })
-    # non-batched RoPE takes only one scaling factor, we create multiple
-    # instances to simulate the same behavior
-    non_batched_ropes = []
-    for scaling_factor in scaling_factors:
-        non_batched_ropes.append(
-            get_rope(head_size, rotary_dim, max_position, base, is_neox_style,
-                     {
-                         "type": "linear",
-                         "factor": (scaling_factor, )
-                     }))
-
-    positions = torch.randint(0, max_position, (batch_size, seq_len))
-    query = torch.randn(batch_size,
-                        seq_len,
-                        num_heads * head_size,
-                        dtype=dtype)
-    key = torch.randn_like(query)
-
-    # create query offsets for batched RoPE, we concat multiple kv cache
-    # together and each query needs to find the right kv cache of its type
-    offset_map = torch.tensor(
-        list(
-            accumulate([0] + [
-                max_position * scaling_factor * 2
-                for scaling_factor in scaling_factors[:-1]
-            ])))
-    # Randomly assign each (batch, position) entry to one scaling factor.
-    query_types = torch.randint(0,
-                                len(scaling_factors), (batch_size, seq_len),
-                                device=device)
-    # map query types to offsets
-    query_offsets = offset_map[query_types]
-    # the kernel takes flattened offsets
-    flatten_offsets = query_offsets.flatten()
-
-    # batched queries of the same type together for non-batched RoPE
-    queries = [query[query_types == i] for i in range(len(scaling_factors))]
-    keys = [key[query_types == i] for i in range(len(scaling_factors))]
-    packed_qkr = zip(queries, keys, non_batched_ropes)
-    # synchronize before start timing
-    torch.cuda.synchronize()
-    with nvtx.annotate("non-batched", color="yellow"):
-        for q, k, r in packed_qkr:
-            r.forward(positions, q, k)
-    torch.cuda.synchronize()
-    with nvtx.annotate("batched", color="green"):
-        batched_rope.forward(positions, query, key, flatten_offsets)
-    torch.cuda.synchronize()
-
-
-if __name__ == '__main__':
-
-    def parse_bool(value):
-        """Parse a command-line boolean such as "true", "False", "1" or "no".
-
-        `type=bool` cannot be used for this: argparse would call
-        `bool("False")`, which is True for every non-empty string.
-        """
-        lowered = str(value).strip().lower()
-        if lowered in ("true", "t", "yes", "y", "1"):
-            return True
-        if lowered in ("false", "f", "no", "n", "0"):
-            return False
-        raise argparse.ArgumentTypeError(
-            f"expected a boolean value, got {value!r}")
-
-    parser = argparse.ArgumentParser(
-        description="Benchmark the rotary embedding kernels.")
-    parser.add_argument("--is-neox-style", type=parse_bool, default=True)
-    parser.add_argument("--batch-size", type=int, default=16)
-    parser.add_argument("--seq-len", type=int, default=512)
-    parser.add_argument("--num-heads", type=int, default=8)
-    parser.add_argument("--head-size",
-                        type=int,
-                        choices=[64, 80, 96, 112, 128, 192, 256],
-                        default=128)
-    parser.add_argument("--rotary-dim", type=int, choices=[16, 32], default=32)
-    parser.add_argument("--dtype",
-                        type=str,
-                        choices=["bfloat16", "float"],
-                        default="float")
-    parser.add_argument("--seed", type=int, default=0)
-    parser.add_argument("--device",
-                        type=str,
-                        choices=["cuda:0", "cuda:1"],
-                        default="cuda:0")
-    args = parser.parse_args()
-    print(args)
-
-    benchmark_rope_kernels_multi_lora(
-        is_neox_style=args.is_neox_style,
-        batch_size=args.batch_size,
-        seq_len=args.seq_len,
-        num_heads=args.num_heads,
-        head_size=args.head_size,
-        rotary_dim=args.rotary_dim,
-        # The --dtype choices are attribute names on the torch module.
-        dtype=getattr(torch, args.dtype),
-        seed=args.seed,
-        device=args.device,
-    )
--- a/benchmarks/kernels/benchmark_shapes.py
+++ b/benchmarks/kernels/benchmark_shapes.py
-# Weight shapes to benchmark, keyed by "<model id>/TP<n>" (plus one
-# synthetic "ideal" entry). Each value is a list of two-element
-# [dim0, dim1] shapes.
-# NOTE(review): the TP<n> suffix presumably is the tensor-parallel degree
-# and each pair one linear layer's weight after sharding - confirm the
-# axis order against the consumer of this table.
-WEIGHT_SHAPES = {
-    "ideal": [[4 * 256 * 32, 256 * 32]],
-    "mistralai/Mistral-7B-v0.1/TP1": [
-        [4096, 6144],
-        [4096, 4096],
-        [4096, 28672],
-        [14336, 4096],
-    ],
-    "mistralai/Mistral-7B-v0.1/TP2": [
-        [4096, 3072],
-        [2048, 4096],
-        [4096, 14336],
-        [7168, 4096],
-    ],
-    "mistralai/Mistral-7B-v0.1/TP4": [
-        [4096, 1536],
-        [1024, 4096],
-        [4096, 7168],
-        [3584, 4096],
-    ],
-    "meta-llama/Llama-2-7b-hf/TP1": [
-        [4096, 12288],
-        [4096, 4096],
-        [4096, 22016],
-        [11008, 4096],
-    ],
-    "meta-llama/Llama-2-7b-hf/TP2": [
-        [4096, 6144],
-        [2048, 4096],
-        [4096, 11008],
-        [5504, 4096],
-    ],
-    "meta-llama/Llama-2-7b-hf/TP4": [
-        [4096, 3072],
-        [1024, 4096],
-        [4096, 5504],
-        [2752, 4096],
-    ],
-    "meta-llama/Llama-2-13b-hf/TP1": [
-        [5120, 15360],
-        [5120, 5120],
-        [5120, 27648],
-        [13824, 5120],
-    ],
-    "meta-llama/Llama-2-13b-hf/TP2": [
-        [5120, 7680],
-        [2560, 5120],
-        [5120, 13824],
-        [6912, 5120],
-    ],
-    "meta-llama/Llama-2-13b-hf/TP4": [
-        [5120, 3840],
-        [1280, 5120],
-        [5120, 6912],
-        [3456, 5120],
-    ],
-    "meta-llama/Llama-2-70b-hf/TP1": [
-        [8192, 10240],
-        [8192, 8192],
-        [8192, 57344],
-        [28672, 8192],
-    ],
-    "meta-llama/Llama-2-70b-hf/TP2": [
-        [8192, 5120],
-        [4096, 8192],
-        [8192, 28672],
-        [14336, 8192],
-    ],
-    "meta-llama/Llama-2-70b-hf/TP4": [
-        [8192, 2560],
-        [2048, 8192],
-        [8192, 14336],
-        [7168, 8192],
-    ],
-}
--- a/benchmarks/kernels/graph_machete_bench.py
+++ b/benchmarks/kernels/graph_machete_bench.py
+import math
+import pickle
+import re
+from collections import defaultdict
+from typing import List
+
+import matplotlib.pyplot as plt
+import pandas as pd
+import seaborn as sns
+from torch.utils.benchmark import Measurement as TMeasurement
+
+from vllm.utils import FlexibleArgumentParser
+
+if __name__ == "__main__":
+    # Plot pickled benchmark measurements: one subplot per KxN shape, median
+    # time against batch size M, one line per kernel. The figure is written
+    # to graph_machete_bench.pdf in the current directory.
+    parser = FlexibleArgumentParser(
+        description='Plot pickled torch.utils.benchmark measurements as '
+        'per-shape line charts of median time vs. batch size.')
+    parser.add_argument('filename', type=str)
+
+    args = parser.parse_args()
+
+    # NOTE(review): pickle.load can execute arbitrary code from the file.
+    # Only open result files produced by a trusted benchmark run.
+    with open(args.filename, 'rb') as f:
+        measurements: List[TMeasurement] = pickle.load(f)
+
+    # sub_label carries "MKN=(<M>x<K>x<N>)": M is the batch size and "KxN"
+    # identifies the subplot.
+    mkn_pattern = re.compile(r"MKN=\((\d+)x(\d+x\d+)\)")
+
+    results = defaultdict(list)
+    for v in measurements:
+        match = mkn_pattern.search(v.task_spec.sub_label)
+        if match is None:
+            raise Exception("MKN not found")
+        M, KN = match.groups()
+
+        kernel = v.task_spec.description
+        results[KN].append({
+            "kernel": kernel,
+            "batch_size": M,
+            "median": v.median
+        })
+
+    if not results:
+        # plt.subplots(0, 2) would fail with a far less helpful message.
+        raise ValueError(f"No measurements found in {args.filename}")
+
+    rows = int(math.ceil(len(results) / 2))
+    # squeeze=False always yields a 2-D axes array, so flatten() is safe.
+    fig, axs = plt.subplots(rows, 2, figsize=(12, 5 * rows), squeeze=False)
+    axs = axs.flatten()
+    for ax, (shape, shape_rows) in zip(axs, results.items()):
+        plt.sca(ax)
+        df = pd.DataFrame(shape_rows)
+        sns.lineplot(data=df,
+                     x="batch_size",
+                     y="median",
+                     hue="kernel",
+                     style="kernel",
+                     markers=True,
+                     dashes=False,
+                     palette="Dark2")
+        plt.title(f"Shape: {shape}")
+        plt.ylabel("time (median, s)")
+    plt.tight_layout()
+    plt.savefig("graph_machete_bench.pdf")
--- a/benchmarks/kernels/requirements.txt
+++ b/benchmarks/kernels/requirements.txt
+pandas
\ No newline at end of file