# Adapted from vLLM's benchmark_mrope.py

# This script benchmarks the mrope kernel (mainly for Qwen2-VL and Qwen2.5-VL
# models). For each (model, tp_size, num_tokens) combination it generates test
# data, times the PyTorch-native and Triton implementations, and prints
# mean/median/p99/min/max latencies together with the Triton speedup over the
# native path.
#
# == Usage Examples ==
#
# Single model benchmark:
# python3 bench_mrope.py --model-name Qwen/Qwen2.5-VL-7B-Instruct --tp-size 8 \
#   --warmup-iter 10 --benchmark-iter 100 --dtype bfloat16 --seed 0 --num-tokens 1024
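#
# Full sweep (omit --model-name to run the built-in Qwen2-VL/Qwen2.5-VL model
# and tp-size grid over num_tokens = 1, 2, 4, ..., 2**17):
# python3 bench_mrope.py --warmup-iter 10 --benchmark-iter 100 --dtype bfloat16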

import argparse
import time
from typing import Any, Optional

import numpy as np
import torch
from transformers import AutoConfig

from sglang.srt.layers.rotary_embedding import get_rope

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def get_model_config(model_name: str, trust_remote_code: bool = True):
    """Fetch the model configuration from Hugging Face."""
    config = AutoConfig.from_pretrained(
        model_name, trust_remote_code=trust_remote_code
    )
    return config


def generate_test_data(
    num_tokens: int,
    num_q_heads: int,
    num_kv_heads: int,
    head_size: int,
    max_position_embeddings: int,
    dtype: torch.dtype,
    device: torch.device,
):
    """Generate test data for given configuration."""
    # Create multimodal positions of shape (3, num_tokens): one row per mrope
    # position component (temporal, height, width)
    positions = torch.randint(
        0, max_position_embeddings // 4, (3, num_tokens), device=device
    )

    # Create flattened query/key tensors of shape [num_tokens, num_heads * head_size]
    query = torch.randn(num_tokens, num_q_heads * head_size, dtype=dtype, device=device)
    key = torch.randn(num_tokens, num_kv_heads * head_size, dtype=dtype, device=device)

    return positions, query, key


def calculate_stats(times: list[float]) -> dict[str, float]:
    """Calculate statistics from a list of times."""
    times_array = np.array(times)
    return {
        "mean": np.mean(times_array),
        "median": np.median(times_array),
        "p99": np.percentile(times_array, 99),
        "min": np.min(times_array),
        "max": np.max(times_array),
    }


def benchmark_mrope(
    model_name: str,
    num_tokens: int,
    head_dim: int,
    tp_size: int,
    num_heads: int,
    num_kv_heads: int,
    max_position: int = 8192,
    rope_theta: float = 10000,
    is_neox_style: bool = True,
    rope_scaling: Optional[dict[str, Any]] = None,
    dtype: torch.dtype = torch.bfloat16,
    seed: int = 0,
    warmup_iter: int = 10,
    benchmark_iter: int = 100,
):
    torch.manual_seed(seed)
    torch.set_default_device(device)
    # Build the rotary embedding helper; num_heads/num_kv_heads are already
    # per-TP-rank values computed by the caller
    mrope_helper = get_rope(
        head_size=head_dim,
        rotary_dim=head_dim,
        max_position=max_position,
        base=rope_theta,
        is_neox_style=is_neox_style,
        rope_scaling=rope_scaling,
        dtype=dtype,
    ).to(device=device)
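    # Note: for Qwen2-VL-style models, rope_scaling from the HF config carries
    # an "mrope_section" entry; get_rope is expected to return the multimodal
    # (mrope) rotary embedding in that case.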

    print(80 * "=")
    print(
        f"Evaluating model: {model_name} "
        f"with tp_size: {tp_size} "
        f"and num_tokens: {num_tokens}, "
        f"dtype: {dtype}"
    )

    # create q k v input tensors
    # create rotary pos emb input tensors
    positions, query, key = generate_test_data(
        num_tokens, num_heads, num_kv_heads, head_dim, max_position, dtype, device
    )

    # Warm up
    for _ in range(warmup_iter):
        mrope_helper.forward_native(
            positions,
            query.clone(),
            key.clone(),
        )

        mrope_helper.forward(
            positions,
            query.clone(),
            key.clone(),
        )

    torch.cuda.synchronize()
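
    # Note: the forward passes may update query/key in place, so each timed
    # iteration below runs on fresh clones, created outside the timed region.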

    # Time reference implementation
    torch_times = []
    for _ in range(benchmark_iter):
        query_clone = query.clone()
        key_clone = key.clone()
        torch.cuda.synchronize()
        start_time = time.perf_counter()

        mrope_helper.forward_native(
            positions,
            query_clone,
            key_clone,
        )

        torch.cuda.synchronize()
        torch_times.append(time.perf_counter() - start_time)

    # Time triton kernel implementation
    triton_times = []
    for _ in range(benchmark_iter):
        query_clone = query.clone()
        key_clone = key.clone()
        torch.cuda.synchronize()
        start_time = time.perf_counter()
        mrope_helper.forward(
            positions,
            query_clone,
            key_clone,
        )
        torch.cuda.synchronize()
        triton_times.append(time.perf_counter() - start_time)

    # Calculate statistics
    torch_stats = calculate_stats(torch_times)
    triton_stats = calculate_stats(triton_times)
    print(f"\nPerformance for config ({num_tokens}, {num_heads}, {num_kv_heads}):")

    print(
        f"Torch implementation: "
        f"mean={torch_stats['mean']:.8f}s, "
        f"median={torch_stats['median']:.8f}s, "
        f"p99={torch_stats['p99']:.8f}s"
    )

    print(
        f"Triton implementation: "
        f"mean={triton_stats['mean']:.8f}s, "
        f"median={triton_stats['median']:.8f}s, "
        f"p99={triton_stats['p99']:.8f}s"
    )

    print(
        f"Triton Speedup over Torch: {torch_stats['mean'] / triton_stats['mean']:.8f}x"
    )

    return torch_stats, triton_stats
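

# Example of calling benchmark_mrope() directly. The values below are
# illustrative of a Qwen2.5-VL-7B-style configuration rather than read from the
# checkpoint; the __main__ path below derives the real values from the HF config.
#
# benchmark_mrope(
#     model_name="Qwen/Qwen2.5-VL-7B-Instruct",
#     num_tokens=1024,
#     head_dim=128,
#     tp_size=1,
#     num_heads=28,
#     num_kv_heads=4,
#     rope_theta=1e6,
#     is_neox_style=True,
#     rope_scaling={"rope_type": "default", "mrope_section": [16, 24, 24]},
#     dtype=torch.bfloat16,
# )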


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Benchmark the rotary embedding kernels."
    )
    parser.add_argument("--model-name", type=str, default="")
    parser.add_argument("--tp-size", type=int, default=1)
    parser.add_argument("--warmup-iter", type=int, default=10)
    parser.add_argument("--benchmark-iter", type=int, default=100)
    parser.add_argument("--dtype", type=str, choices=["bfloat16"], default="bfloat16")
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--num-tokens", type=int, nargs="+", required=False)
    parser.add_argument("--trust-remote-code", action="store_true")
    args = parser.parse_args()
    print(args)

    model_tp_dict = {}
    if args.model_name == "":
        model_tp_dict = {
            "Qwen/Qwen2-VL-2B-Instruct": [1],
            "Qwen/Qwen2-VL-7B-Instruct": [1],
            "Qwen/Qwen2-VL-72B-Instruct": [2, 4, 8],
            "Qwen/Qwen2.5-VL-3B-Instruct": [1, 2, 4, 8],
            "Qwen/Qwen2.5-VL-7B-Instruct": [1, 2, 4, 8],
            "Qwen/Qwen2.5-VL-72B-Instruct": [2, 4, 8],
        }
    else:
        model_tp_dict[args.model_name] = [args.tp_size]

    if args.num_tokens is None:
        num_tokens_list = [2**i for i in range(0, 18)]
    else:
        num_tokens_list = args.num_tokens

    for model_name, tp_list in model_tp_dict.items():
        for tp_size in tp_list:
            config = get_model_config(
                model_name, trust_remote_code=args.trust_remote_code
            )
            # Derive per-TP-rank head counts from the model config
            total_num_kv_heads = config.num_key_value_heads
            total_num_heads = config.num_attention_heads
            num_heads = total_num_heads // tp_size
            num_kv_heads = max(1, total_num_kv_heads // tp_size)
            head_dim = config.hidden_size // total_num_heads
            is_neox_style = True
            rope_theta = config.rope_theta
            max_position = config.max_position_embeddings

            for num_tokens in num_tokens_list:
                benchmark_mrope(
                    model_name=model_name,
                    num_tokens=num_tokens,
                    head_dim=head_dim,
                    tp_size=tp_size,
                    num_heads=num_heads,
                    num_kv_heads=num_kv_heads,
                    max_position=max_position,
                    rope_theta=rope_theta,
                    is_neox_style=is_neox_style,
                    rope_scaling=config.rope_scaling,
                    dtype=getattr(torch, args.dtype),
                    seed=args.seed,
                    warmup_iter=args.warmup_iter,
                    benchmark_iter=args.benchmark_iter,
                )