# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

"""Module of the cuBLASLt GEMM benchmark."""

import os

from superbench.common.utils import logger
from superbench.benchmarks import BenchmarkRegistry, Platform, ReturnCode
from superbench.benchmarks.micro_benchmarks import BlasLtBaseBenchmark


class CublasLtBenchmark(BlasLtBaseBenchmark):
    """The cuBLASLt GEMM benchmark class."""
    def __init__(self, name, parameters=''):
        """Constructor.

        Args:
            name (str): benchmark name.
            parameters (str): benchmark parameters.
        """
        super().__init__(name, parameters)

        self._bin_name = 'cublaslt_gemm'
        self._in_types = ['fp64', 'fp32', 'fp16', 'bf16', 'fp8e4m3', 'fp8e5m2', 'fp4e2m1', 'int8']

    def add_parser_arguments(self):
        """Add the specified arguments."""
        super().add_parser_arguments()

        self._parser.add_argument(
            '--in_types',
            type=str,
            nargs='+',
            default=['fp8e4m3'],
            required=False,
            help='List of input data types, supported types: {}.'.format(' '.join(self._in_types)),
        )
        self._parser.add_argument(
            '--enable_autotune',
            action='store_true',
            required=False,
            help='Enable exhaustive autotune mode to find the best algorithm.',
        )
        self._parser.add_argument(
            '--num_warmup_autotune',
            type=int,
            default=20,
            required=False,
            help='Number of warm-up steps for autotune.',
        )
        self._parser.add_argument(
            '--num_steps_autotune',
            type=int,
            default=50,
            required=False,
            help='Number of steps to measure for autotune.',
        )
        self._parser.add_argument(
            '--enable_ncu_profiling',
            action='store_true',
            required=False,
            help='Enable ncu profiling for each run.',
        )
        self._parser.add_argument(
            '--profiling_metrics',
            type=str,
            nargs='+',
            default=None,
            required=False,
            help='List of ncu profiling metrics to report, all ncu metrics are supported.',
        )

    def _preprocess(self):
        """Preprocess/preparation operations before the benchmarking.

        Return:
            True if _preprocess() succeeds.
        """
        if not super()._preprocess():
            return False

        self.__bin_path = os.path.join(self._args.bin_dir, self._bin_name)

        self._commands = []
        for _m, _n, _k, _b, _in_type in self._shapes_to_run:
            # Exhaustive autotune flags (-a) with dedicated warm-up (-W) and measurement (-I) step counts,
            # appended only when autotune is enabled.
            autotune_args = (
                f' -a -W {self._args.num_warmup_autotune} -I {self._args.num_steps_autotune}'
            ) if self._args.enable_autotune else ''
            # Keep '-t {in_type}' as the last argument since _process_raw_result derives the metric
            # prefix from the command's last token.
            command = (
                f'{self.__bin_path} -m {_m} -n {_n} -k {_k} -b {_b} '
                f'-w {self._args.num_warmup} -i {self._args.num_steps}{autotune_args} -t {_in_type}'
            )
            if self._args.enable_ncu_profiling:
                # Profile a single launch with ncu, skipping the earlier warm-up launches.
                skip_num = self._args.num_warmup - 1 if self._args.num_warmup > 1 else 0
                command = f'ncu --set full --launch-skip {skip_num} --launch-count 1 --csv ' + command
            self._commands.append(command)

        return True

    def _process_raw_result(self, cmd_idx, raw_output):    # noqa: C901
        """Function to parse raw results and save the summarized results.

        self._result.add_raw_data() and self._result.add_result() need to be called to save the results.

        Args:
            cmd_idx (int): the index of command corresponding with the raw_output.
            raw_output (str): raw output string of the micro-benchmark.

        Return:
            True if the raw output string is valid and the result can be extracted.
""" self._result.add_raw_data(f'raw_output_{cmd_idx}', raw_output, self._args.log_raw_data) try: if not self._args.enable_ncu_profiling: fields = raw_output.strip().split() if len(fields) != 6 or not all(x.isdigit() for x in fields[:4]): raise ValueError('Invalid result.') self._result.add_result( f'{self._commands[cmd_idx].split()[-1]}_{fields[3]}_{"_".join(fields[:3])}_flops', float(fields[-1]) ) else: lines = raw_output.strip().split('\n') # find line index of the line that starts with "ID","Process ID" start_idx = next(i for i, line in enumerate(lines) if 'Metric Name' in line) if start_idx == 0 or start_idx == len(lines) - 1: raise ValueError('Invalid result.') result_lines = lines[0:start_idx - 1] result = False size = '' for line in result_lines: fields = line.strip().split() if len(fields) == 6 and all(x.isdigit() for x in fields[:4]): result = True size = f'{fields[3]}_{"_".join(fields[:3])}' self._result.add_result( f'{self._commands[cmd_idx].split()[-1]}_{fields[3]}_{"_".join(fields[:3])}_flops', float(fields[-1]) ) if not result: raise ValueError('Invalid result.') metric_name_index = lines[start_idx].strip().split(',').index('"Metric Name"') metric_value_index = lines[start_idx].strip().split(',').index('"Metric Value"') if metric_name_index < 0 or metric_value_index < 0: raise ValueError('Can not find Metric Name and Value.') for line in lines[start_idx + 1:]: fields = line.strip().split('","') metric_name = fields[metric_name_index].strip('"').replace(' ', '_') if len(fields) < 15: continue if not self._args.profiling_metrics or metric_name in self._args.profiling_metrics: value = fields[metric_value_index].strip(',').strip('"') try: float_value = float(value) self._result.add_result( f'{self._commands[cmd_idx].split()[-1]}_{size}_{metric_name}', float_value ) except ValueError: pass except BaseException as e: self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE) logger.error( 'The result format is invalid - round: {}, benchmark: {}, raw output: {}, message: {}.'.format( self._curr_run_index, self._name, raw_output, str(e) ) ) return False return True BenchmarkRegistry.register_benchmark('cublaslt-gemm', CublasLtBenchmark, platform=Platform.CUDA)