# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Benchmark for ViT (Vision Transformer) with AITemplate."""

import os

import click
import numpy as np
import torch

from aitemplate.compiler import compile_model, Model
from aitemplate.frontend import Tensor
from aitemplate.testing import detect_target

from modeling.vision_transformer import VisionTransformer
from weight_utils import export_to_torch_tensor

# flake8: noqa


def mark_output(y):
    """Mark the tensors in y as graph outputs and print their static shapes."""
    if type(y) is not tuple:
        y = (y,)
    for i in range(len(y)):
        y[i]._attrs["is_output"] = True
        y[i]._attrs["name"] = "output_%d" % (i)
        y_shape = [d._attrs["values"][0] for d in y[i]._attrs["shape"]]
        print("output_{} shape: {}".format(i, y_shape))


USE_CUDA = detect_target().name() == "cuda"


def compile_vit(
    model_name,
    batch_size,
    class_token=False,
    global_pool="avg",
    use_fp16_acc=True,
):
    if model_name == "vit_base_patch16_224":
        img_size = 224
        patch_size = 16
        embed_dim = 768
        num_heads = 12
        depth = 12
    elif model_name == "vit_large_patch16_384":
        img_size = 384
        patch_size = 16
        embed_dim = 1024
        num_heads = 16
        depth = 24
    else:
        raise NotImplementedError
    seqlen = (img_size // patch_size) ** 2 + (1 if class_token else 0)
    ait_model = VisionTransformer(
        batch_size=batch_size,
        img_size=img_size,
        class_token=class_token,
        global_pool=global_pool,
        num_heads=num_heads,
        embed_dim=embed_dim,
        patch_size=patch_size,
        depth=depth,
        act_layer="GELU",
    )
    ait_model.name_parameter_tensor()
    # AITemplate uses NHWC layout, hence [batch, height, width, channels].
    inputs_ait = Tensor(
        [batch_size, img_size, img_size, 3], name="input0", is_input=True
    )
    Y = ait_model(inputs_ait)
    mark_output(Y)
    target = detect_target(use_fp16_acc=use_fp16_acc)
    exe_module = compile_model(
        Y,
        target,
        "./tmp",
        "vision_transformer_bs%d_seq%d" % (batch_size, seqlen),
        profile_devs=[0, 1, 2, 3],
    )
    return exe_module
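

# Note: compile_model() above writes the compiled module to
# ./tmp/vision_transformer_bs{batch_size}_seq{seqlen}/test.so, which is the
# path benchmark() below loads when no pre-built module is passed in, so the
# two functions must agree on batch_size and seqlen. benchmark() always
# counts the class token in seqlen, so modules should be compiled with
# class_token=True (as main() does) for the directory names to match.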
def benchmark(model_name, batch_size, params_ait, mod=None, graph_mode=True):
    # load mod
    if model_name == "vit_base_patch16_224":
        img_size = 224
        patch_size = 16
        embed_dim = 768
        num_heads = 12
        depth = 12
    elif model_name == "vit_large_patch16_384":
        img_size = 384
        patch_size = 16
        embed_dim = 1024
        num_heads = 16
        depth = 24
    else:
        raise NotImplementedError
    seqlen = (img_size // patch_size) ** 2 + 1
    if mod is None:
        model_dir = f"vision_transformer_bs{batch_size}_seq{seqlen}"
        mod = Model(os.path.join("./tmp", model_dir, "test.so"))

    # prepare params
    params_ait["cls_token_mask"] = (
        torch.zeros((batch_size, 1, embed_dim)).cuda().half()
    )
    params_ait["fc_norm_weight"] = params_ait["norm_weight"]
    params_ait["fc_norm_bias"] = params_ait["norm_bias"]
    if detect_target().name() == "cuda":
        ait_key = "attn_cu_length"
        for i in range(depth):
            prefix = "blocks_%d" % (i)
            cu_len = np.cumsum([0] + [seqlen] * batch_size).astype("int32")
            params_ait[f"{prefix}_{ait_key}"] = torch.from_numpy(cu_len).cuda()

    # set weights
    mod.set_many_constants_with_tensors(params_ait)
    mod.fold_constants(sync=True)

    # prepare input/output tensor
    inputs = [torch.randn([batch_size, img_size, img_size, 3]).cuda().half()]
    ys = []
    num_outputs = len(mod.get_output_name_to_index_map())
    for i in range(num_outputs):
        shape = mod.get_output_maximum_shape(i)
        ys.append(torch.empty(shape).cuda().half())
    # warm up
    t, _, __ = mod.benchmark_with_tensors(
        inputs,
        ys,
        count=100,
        repeat=4,
        graph_mode=graph_mode,
    )
    # benchmark
    t, _, __ = mod.benchmark_with_tensors(
        inputs,
        ys,
        count=100,
        repeat=4,
        graph_mode=graph_mode,
    )
    print(f"batch_size: {batch_size}, latency: {t}")
    dev_flag = os.environ.get("HIP_VISIBLE_DEVICES", "-1")
    dev_flag = dev_flag.replace(",", "_")
    with open(f"{model_name}_ait_benchmark_dev_{dev_flag}.txt", "a") as f:
        f.write(f"batch_size: {batch_size}, latency: {t}\n")


@click.command()
@click.option("--model-name", type=str, default="vit_base_patch16_224")
@click.option(
    "--use-fp16-acc",
    type=bool,
    default=True,
    help="Whether to use FP16 for accumulation (similar to TensorRT)",
)
@click.option("--use-graph", type=bool, default=True, help="Whether to use CUDA graph")
@click.option("--batch-size", type=int, default=0, help="Batch size")
def main(
    model_name="vit_base_patch16_224", use_fp16_acc=True, use_graph=True, batch_size=0
):
    if detect_target().name() == "rocm":
        use_graph = False
    if model_name == "vit_base_patch16_224":
        pretrained_path = (
            "./vit_base_patch16_224.augreg2_in21k_ft_in1k/pytorch_model.bin"
        )
    elif model_name == "vit_large_patch16_384":
        pretrained_path = (
            "./vit_large_patch16_384.augreg_in21k_ft_in1k/pytorch_model.bin"
        )
    else:
        raise NotImplementedError
    params_ait = export_to_torch_tensor(
        model_name, model_path=pretrained_path, pretrained=True
    )
    if batch_size < 1:
        for bs in (1, 2, 4, 8, 16, 32, 64, 128, 256):
            compile_vit(model_name, bs, class_token=True, use_fp16_acc=use_fp16_acc)
            benchmark(model_name, bs, params_ait, graph_mode=use_graph)
    else:
        benchmark(model_name, batch_size, params_ait, graph_mode=use_graph)


if __name__ == "__main__":
    main()
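

# Example invocations (assuming this script is saved as benchmark_ait.py and
# the timm checkpoints referenced in main() have been downloaded next to it):
#
#   # compile and benchmark the full batch-size sweep (1..256)
#   python benchmark_ait.py --model-name vit_base_patch16_224
#
#   # benchmark a single, previously compiled batch size
#   python benchmark_ait.py --model-name vit_base_patch16_224 --batch-size 8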