// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

/**
 * @file cublas_helper.cpp
 * @brief  Cpp file for some functions related to cublas
 */

#include "cublas_benchmark.h"

/**
 * @brief check cuda function running status and throw error str
 */
void check_cuda(cudaError_t result, char const *const func, const char *const file, int const line) {
    if (result != cudaSuccess) {
        const char *msg = cudaGetErrorString(result);
        std::stringstream safe_call_ss;
        safe_call_ss << func << " failed with error"
                     << "\nfile: " << file << "\nline: " << line << "\nmsg: " << msg;
        // Make sure we call CUDA Device Reset before exiting
        throw std::runtime_error(safe_call_ss.str());
    }
}

/**
 * @brief check cublas function running status and throw error str
 */
void check_cublas(cublasStatus_t result, char const *const func, const char *const file, int const line) {
    if (result != CUBLAS_STATUS_SUCCESS) {

        std::stringstream safe_call_ss;
        safe_call_ss << func << " failed with error"
                     << "\nfile: " << file << "\nline: " << line << "\nmsg: " << result;
        // Make sure we call CUDA Device Reset before exiting
        throw std::runtime_error(safe_call_ss.str());
    }
}

/**
 * @brief Cuda context init
 */
void cuda_init(cublasHandle_t *cublas_handle) {
    CUDA_SAFE_CALL(cudaDeviceReset());
    CUDA_SAFE_CALL(cudaSetDevice(0));
    // create streams/handles
    CUBLAS_SAFE_CALL(cublasCreate(cublas_handle));
}

/**
 * @brief Cuda context free
 */
void cuda_free(cublasHandle_t *cublas_handle) {
    CUBLAS_SAFE_CALL(cublasDestroy(*cublas_handle));
    CUDA_SAFE_CALL(cudaSetDevice(0));
}

/**
 * @brief                   cublas function of gemm, wrapper of cublasSgemm
 * @param  handle           cublas handle
 * @param  transa           whether matrixA transpose
 * @param  transb           whether matrixB transpose
 * @param  m                m of matrix m*n,n*k
 * @param  n                n of matrix m*n,n*k
 * @param  k                k of matrix m*n,n*k
 * @param  a                input matrixA
 * @param  b                input matrixB
 * @param  c                output matrix
 */
void sgemm(cublasHandle_t handle, int transa, int transb, int m, int n, int k, const float *a, const float *b,
           float *c) {
    float alpha = 1.0f;
    float beta = 1.0f;
    CUBLAS_SAFE_CALL(cublasSgemm(handle, (transa ? CUBLAS_OP_T : CUBLAS_OP_N), (transb ? CUBLAS_OP_T : CUBLAS_OP_N), m,
                                 n, k, &alpha, a, (transa ? k : m), b, (transb ? n : k), &beta, c, m));
}

/**
 * @brief                   cublas function of gemm, wrapper of cublasCgemm
 * @param  handle           cublas handle
 * @param  transa           whether matrixA transpose
 * @param  transb           whether matrixB transpose
 * @param  m                m of matrix m*n,n*k
 * @param  n                n of matrix m*n,n*k
 * @param  k                k of matrix m*n,n*k
 * @param  a                input matrixA
 * @param  b                input matrixB
 * @param  c                output matrix
 */
void cgemm(cublasHandle_t handle, int transa, int transb, int m, int n, int k, const cuComplex *a, const cuComplex *b,
           cuComplex *c) {
    cuComplex alpha = make_cuComplex(1.0f, 0.0f);
    cuComplex beta = make_cuComplex(0.0f, 0.0f);
    CUBLAS_SAFE_CALL(cublasCgemm(handle, (transa ? CUBLAS_OP_T : CUBLAS_OP_N), (transb ? CUBLAS_OP_T : CUBLAS_OP_N), m,
                                 n, k, &alpha, a, (transa ? k : m), b, (transb ? n : k), &beta, c, m));
}

/**
 * @brief                   cublas function of GemmEx, wrapper of cublasGemmEx
 * @param  handle           cublas handle
 * @param  transa           whether matrixA transpose
 * @param  transb           whether matrixB transpose
 * @param  m                m of matrix m*n,n*k
 * @param  n                n of matrix m*n,n*k
 * @param  k                k of matrix m*n,n*k
 * @param  a                input matrixA
 * @param  b                input matrixB
 * @param  c                output matrix
 * @param  type             matrix type, 'float' or 'half'
 * @param  use_tensor_core  whether use tensor core
 */
void gemmEx(cublasHandle_t handle, int transa, int transb, int m, int n, int k, const void *a, const void *b, void *c,
            std::string type, bool use_tensor_core) {
    float alpha = 1.0f;
    float beta = 0.0f;
    cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F;
    cudaDataType_t matrix_type;
    cublasGemmAlgo_t algo;
    algo = (use_tensor_core ? CUBLAS_GEMM_DFALT_TENSOR_OP : CUBLAS_GEMM_DFALT);
    if (type.compare("float")) {
        matrix_type = CUDA_R_32F;
    } else {
        if (type.compare("half")) {
            matrix_type = CUDA_R_16F;
        } else {
            throw "invalid datatype";
        }
    }
    CUBLAS_SAFE_CALL(cublasGemmEx(handle, (transa ? CUBLAS_OP_T : CUBLAS_OP_N), (transb ? CUBLAS_OP_T : CUBLAS_OP_N), m,
                                  n, k, &alpha, a, matrix_type, (transa ? k : m), b, matrix_type, (transb ? n : k),
                                  &beta, c, matrix_type, m, compute_type, algo));
}

/**
 * @brief                   cublas function of gemmStridedBatchedEx, wrapper of cublasGemmStridedBatchedEx
 * @param  handle           cublas handle
 * @param  transa           whether matrixA transpose
 * @param  transb           whether matrixB transpose
 * @param  m                m of matrix m*n,n*k
 * @param  n                n of matrix m*n,n*k
 * @param  k                k of matrix m*n,n*k
 * @param  a                input matrixA
 * @param  b                input matrixB
 * @param  c                output matrix
 * @param  type             matrix type, 'float' or 'half'
 * @param  use_tensor_core  whether use tensor core
 * @param  batchCount       My Param doc
 */
void gemmStridedBatchedEx(cublasHandle_t handle, int transa, int transb, int m, int n, int k, const void *a,
                          const void *b, void *c, std::string type, bool use_tensor_core, int batchCount) {
    float alpha = 1.0f;
    float beta = 1.0f;
    cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F;
    cudaDataType_t matrix_type;
    cublasGemmAlgo_t algo;
    algo = (use_tensor_core ? CUBLAS_GEMM_DFALT_TENSOR_OP : CUBLAS_GEMM_DFALT);
    if (type.compare("float")) {
        matrix_type = CUDA_R_32F;
    } else {
        if (type.compare("half")) {
            matrix_type = CUDA_R_16F;
        } else {
            throw "invalid datatype";
        }
    }
    CUBLAS_SAFE_CALL(cublasGemmStridedBatchedEx(handle, (transa ? CUBLAS_OP_T : CUBLAS_OP_N),
                                                (transb ? CUBLAS_OP_T : CUBLAS_OP_N), m, n, k, &alpha, a, matrix_type,
                                                (transa ? k : m), m * k, b, matrix_type, (transb ? n : k), n * k, &beta,
                                                c, matrix_type, m, m * n, batchCount, compute_type, algo));
}

/**
 * @brief                   cublas function of gemmStridedBatchedEx, wrapper of cublasGemmStridedBatchedEx
 * @param  handle           cublas handle
 * @param  transa           whether matrixA transpose
 * @param  transb           whether matrixB transpose
 * @param  m                m of matrix m*n,n*k
 * @param  n                n of matrix m*n,n*k
 * @param  k                k of matrix m*n,n*k
 * @param  a                input matrixA
 * @param  b                input matrixB
 * @param  c                output matrix
 * @param  batchCount       the count of batch used to compute
 */
void sgemmStridedBatched(cublasHandle_t handle, int transa, int transb, int m, int n, int k, const float *a,
                         const float *b, float *c, int batchCount) {
    float alpha = 1.0f;
    float beta = 1.0f;
    CUBLAS_SAFE_CALL(cublasSgemmStridedBatched(
        handle, (transa ? CUBLAS_OP_T : CUBLAS_OP_N), (transb ? CUBLAS_OP_T : CUBLAS_OP_N), m, n, k, &alpha, a,
        (transa ? k : m), m * k, b, (transb ? n : k), n * k, &beta, c, m, m * n, batchCount));
}

/**
 * @brief
 * @brief                   cublas function of sgemmStridedBatched, wrapper of cublasSgemmStridedBatched
 * @param  handle           cublas handle
 * @param  transa           whether matrixA transpose
 * @param  transb           whether matrixB transpose
 * @param  m                m of matrix m*n,n*k
 * @param  n                n of matrix m*n,n*k
 * @param  k                k of matrix m*n,n*k
 * @param  a                input matrixA
 * @param  b                input matrixB
 * @param  c                output matrix
 * @param  batchCount       the count of batch used to compute
 */
void cgemm3mStridedBatched(cublasHandle_t handle, int transa, int transb, int m, int n, int k, const cuComplex *a,
                           const cuComplex *b, cuComplex *c, int batchCount) {
    cuComplex alpha = make_cuComplex(1.0f, 0.0f);
    cuComplex beta = make_cuComplex(0.0f, 0.0f);
    CUBLAS_SAFE_CALL(cublasCgemm3mStridedBatched(
        handle, (transa ? CUBLAS_OP_T : CUBLAS_OP_N), (transb ? CUBLAS_OP_T : CUBLAS_OP_N), m, n, k, &alpha, a,
        (transa ? k : m), m * k, b, (transb ? n : k), n * k, &beta, c, m, m * n, batchCount));
}