# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.

import os, sys, time
import subprocess
import pandas as pd
import numpy as np
import torch
import nvtx
import transformer_engine
from tests.pytorch.fused_attn.test_fused_attn import (
    ModelConfig,
    _is_flash_attention_supported,
    _is_fused_attention_supported,
    _is_unfused_attention_supported,
    _run_dot_product_attention,
)

pd.set_option("display.precision", 4)

# data type
dtype = torch.bfloat16
# number of iterations after 3 warmup iterations
num_iters = 3
# checkpointing
ckpt_attn = False
# workspace optimization path for cuDNN attention
workspace_opt = True
# QKV memory layout
qkv_layout = "bshd_bshd_bshd"
# sliding window attention
swa = False
# padding between sequences for qkv_format=thd
pad_between_seqs = False
# training mode
is_training = True

model_configs = {
    #   test:             b,  h, hg,   d,   sq,  skv,   p,     mask,              bias
    "test_0": ModelConfig(2, 16, 16, 64, 512, 512, 0.0, "no_mask", "no_bias"),  # short seq
    "test_1": ModelConfig(2, 16, 16, 128, 2048, 2048, 0.0, "causal", "no_bias"),  # longer seq, mask
    "test_2": ModelConfig(2, 16, 16, 128, 2048, 2048, 0.0, "causal", "post_scale_bias"),  # bias
    "test_3": ModelConfig(2, 32, 4, 128, 8192, 8192, 0.0, "causal", "no_bias"),  # GQA
}


def benchmark_dot_product_attention(model, fused_attn_supported, flash_attn_supported):
    config = model_configs[model]
    if dtype == torch.bfloat16:
        tols = dict(atol=2.5e-2, rtol=2.5e-2)
    else:
        tols = dict(atol=5e-3, rtol=5e-3)

    cudnn_times = []
    flash_times = []
    warmup_iters = 3
    for i in range(warmup_iters):
        if fused_attn_supported:
            fused_attn_fwd, fused_attn_bwd = _run_dot_product_attention(
                dtype,
                config,
                "FusedAttention",
                ckpt_attn,
                qkv_layout,
                workspace_opt,
                swa,
                pad_between_seqs,
                is_training,
            )
        if flash_attn_supported:
            flash_attn_fwd, flash_attn_bwd = _run_dot_product_attention(
                dtype,
                config,
                "FlashAttention",
                ckpt_attn,
                qkv_layout,
                workspace_opt,
                swa,
                pad_between_seqs,
                is_training,
            )
        if fused_attn_supported and flash_attn_supported:
            torch.testing.assert_close(fused_attn_fwd, flash_attn_fwd, **tols)
            for i, _ in enumerate(flash_attn_bwd):
                torch.testing.assert_close(fused_attn_bwd[i], flash_attn_bwd[i], **tols)

    torch.cuda.cudart().cudaProfilerStart()
    torch.cuda.synchronize()
    fused_attn_start = time.time()
    if fused_attn_supported:
        for i in range(num_iters):
            fused_attn_fwd, fused_attn_bwd = _run_dot_product_attention(
                dtype,
                config,
                "FusedAttention",
                ckpt_attn,
                qkv_layout,
                workspace_opt,
                swa,
                pad_between_seqs,
                is_training,
            )
    torch.cuda.synchronize()
    fused_attn_time = time.time() - fused_attn_start if fused_attn_supported else 0

    torch.cuda.synchronize()
    flash_attn_start = time.time()
    if flash_attn_supported:
        for i in range(num_iters):
            flash_attn_fwd, flash_attn_bwd = _run_dot_product_attention(
                dtype,
                config,
                "FlashAttention",
                ckpt_attn,
                qkv_layout,
                workspace_opt,
                swa,
                pad_between_seqs,
                is_training,
            )
    torch.cuda.synchronize()
    flash_attn_time = time.time() - flash_attn_start if flash_attn_supported else 0

    df = pd.read_csv("times.csv")
    df = pd.concat(
        [
            df,
            pd.DataFrame(
                [
                    [
                        fused_attn_time * 1e3 / num_iters,
                        0,
                        0,
                        0,
                        flash_attn_time * 1e3 / num_iters,
                        0,
                        0,
                        0,
                        0,
                    ]
                ],
                columns=df.columns,
            ),
        ],
        ignore_index=True,
    )
    df.to_csv("times.csv", index=False)
    torch.cuda.cudart().cudaProfilerStop()


def parse_results(per_cudnn, per_flash, model):
    filename = f"prof_{model}_cuda_gpu_trace.csv"
    df = pd.read_csv(os.path.join("./", filename))
    df_times = pd.read_csv("times.csv")
    row = len(df_times.index) - 1

    if per_cudnn > 0:
        t_cudnn_all = df[df["Name"].str.contains("cudnn")]["Duration (ns)"].to_numpy()
        t_cudnn_all = t_cudnn_all.reshape(-1, per_cudnn)
        t_cudnn_avg = np.average(t_cudnn_all, axis=0)
        df_times.loc[row, "FusedAttention Kernels (fwd)"] = t_cudnn_avg[0] / 1e6
        df_times.loc[row, "FusedAttention Kernels (bwd)"] = t_cudnn_avg[1:4].sum() / 1e6
        df_times.loc[row, "FusedAttention Kernels (fwd+bwd)"] = t_cudnn_avg.sum() / 1e6

    if per_flash > 0:
        t_flash_all = df[df["Name"].str.contains("void flash")]["Duration (ns)"].to_numpy()
        t_flash_all = t_flash_all.reshape(-1, per_flash)
        t_flash_avg = np.average(t_flash_all, axis=0)
        df_times.loc[row, "FlashAttention Kernels (fwd)"] = t_flash_avg[0] / 1e6
        df_times.loc[row, "FlashAttention Kernels (bwd)"] = t_flash_avg[1:4].sum() / 1e6
        df_times.loc[row, "FlashAttention Kernels (fwd+bwd)"] = t_flash_avg.sum() / 1e6

    if per_cudnn > 0 and per_flash > 0:
        df_times.loc[row, "Fused vs Flash Kernels Speedup (fwd+bwd)"] = (
            df_times.loc[row, "FlashAttention Kernels (fwd+bwd)"]
            / df_times.loc[row, "FusedAttention Kernels (fwd+bwd)"]
        )
    df_times.to_csv("times.csv", index=False)


def main():
    times = pd.DataFrame(
        columns=[
            "FusedAttention Module",
            "FusedAttention Kernels (fwd)",
            "FusedAttention Kernels (bwd)",
            "FusedAttention Kernels (fwd+bwd)",
            "FlashAttention Module",
            "FlashAttention Kernels (fwd)",
            "FlashAttention Kernels (bwd)",
            "FlashAttention Kernels (fwd+bwd)",
            "Fused vs Flash Kernels Speedup (fwd+bwd)",
        ]
    )
    times.to_csv("times.csv", index=False)

    device_id = torch.cuda.current_device()
    device_properties = torch.cuda.get_device_properties(device_id)
    print(
        f"Device {device_id}: "
        f"{device_properties.name} GPU, "
        f"sm{device_properties.major}{device_properties.minor} compute capability, "
        f"{device_properties.total_memory/1024**3:.1f}GB memory"
    )
    for model in model_configs.keys():
        config = model_configs[model]
        fused_attn_supported, fused_attn_backend = _is_fused_attention_supported(
            config,
            dtype,
            qkv_layout=qkv_layout,
        )
        fused_attn_supported = fused_attn_supported and not swa
        flash_attn_supported = _is_flash_attention_supported(config)
        print(
            f'Running {model} with {"cuDNN attention" if fused_attn_supported else ""}'
            f'{" and flash-attention" if flash_attn_supported else ""}...'
        )

        prof_cmd = [
            "nsys",
            "profile",
            "--capture-range=cudaProfilerApi",
            "--capture-range-end=stop-shutdown",
            "--force-overwrite=true",
            f"--output=prof_{model}",
            "python",
            "-c",
            f""" "import benchmark_attention;""",
            f"""benchmark_attention.benchmark_dot_product_attention("""
            f"""'{model}', {fused_attn_supported}, {flash_attn_supported})" """,
        ]
        prof_cmd = " ".join(prof_cmd)
        subprocess.call(prof_cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, shell=True)
        stats_cmd = [
            "nsys",
            "stats",
            "-q",
            "-r",
            "cuda_gpu_trace",
            "--format",
            "csv,column",
            "--force-overwrite=true",
            "--force-export=true",
            f"--output=prof_{model}",
            f"prof_{model}.nsys-rep",
        ]
        if fused_attn_supported:
            num_kernels_cudnn = 4
            if config.attn_bias_type == "post_scale_bias":
                num_kernels_cudnn = num_kernels_cudnn + 1
            if config.num_heads != config.num_gqa_groups:
                num_kernels_cudnn = num_kernels_cudnn + 2
        else:
            num_kernels_cudnn = 0
        num_kernels_flash = 4 if flash_attn_supported else 0
        stats_cmd = " ".join(stats_cmd)
        subprocess.call(stats_cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, shell=True)
        parse_cmd = [
            "python",
            "-c",
            f""" "import benchmark_attention;""",
            f"""benchmark_attention.parse_results("""
            f"""{num_kernels_cudnn}, {num_kernels_flash}, '{model}')" """,
        ]
        parse_cmd = " ".join(parse_cmd)
        subprocess.call(parse_cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, shell=True)

    df_times = pd.read_csv("times.csv")
    df_times.index = list(model_configs.keys())
    a = df_times[
        [
            "FusedAttention Kernels (fwd+bwd)",
            "FlashAttention Kernels (fwd+bwd)",
            "Fused vs Flash Kernels Speedup (fwd+bwd)",
        ]
    ]
    a.columns = ["cuDNN fwd+bwd (ms)", "flash-attn fwd+bwd (ms)", "cuDNN vs flash speedup"]
    print()
    print(a)


if __name__ == "__main__":
    main()