// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
// Copyright (c) Facebook, Inc. and its affiliates.
//
// This source code is licensed under the MIT license found in the
// LICENSE file in the root directory of this source tree.

// NOTE: the conversion that produced this copy dropped everything between
// angle brackets (include targets and template argument lists). The headers
// and template parameters below are restored from the symbols this file uses
// and should be checked against the upstream source.
#include <ops_hip.cuh>
#include <kernels_hip.cuh>
#include <limits>
#include <cassert>
#include <common.h>
#ifndef NO_HIPBLASLT
#include <hipblaslt/hipblaslt.h>
#endif
#include <hipsparse/hipsparse.h>
#include <hip/hip_fp16.h>
#include <hip/hip_bfloat16.h>
#include <BinSearch.h>

#define ERR_NOT_IMPLEMENTED 100

// gfx9 (CDNA) wavefronts are 64 lanes wide; other targets default to 32.
#if defined(__GFX9__)
#define WARP_SIZE 64
#else
#define WARP_SIZE 32
#endif

using namespace BinSearch;
using std::cout;
using std::endl;

void quantize(float *code, float *A, unsigned char *out, int n)
{
    // One thread per element; round the block count up so a partial block
    // covers the tail.
    int num_blocks = n/1024;
    num_blocks = n % 1024 == 0 ? num_blocks : num_blocks + 1;
    hipLaunchKernelGGL((kQuantize), dim3(num_blocks), dim3(1024), 0, 0, code, A, out, n);
    CUDA_CHECK_RETURN(hipPeekAtLastError());
}

void dequantize(float *code, unsigned char *A, float *out, int n, hipStream_t stream)
{
    int num_blocks = n/1024;
    num_blocks = n % 1024 == 0 ? num_blocks : num_blocks + 1;
    hipLaunchKernelGGL((kDequantize), dim3(num_blocks), dim3(1024), 0, stream, code, A, out, n);
    CUDA_CHECK_RETURN(hipPeekAtLastError());
}

template <typename T, int STOCHASTIC, int DATA_TYPE>
void quantizeBlockwise(float *code, T *A, float *absmax, unsigned char *out, float *rand, int rand_offset, int blocksize, const int n)
{
    int num_blocks = n/blocksize;
    num_blocks = n % blocksize == 0 ? num_blocks : num_blocks + 1;

    // Each block quantizes one `blocksize` chunk; the thread count is chosen
    // so every thread handles 4 values (blocksize >= 1024) or 2 values.
    if(blocksize == 4096)
        hipLaunchKernelGGL((kQuantizeBlockwise<T, 4096, 4, STOCHASTIC, DATA_TYPE>), dim3(num_blocks), dim3(1024), 0, 0, code, A, absmax, out, rand, rand_offset, n);
    else if(blocksize == 2048)
        hipLaunchKernelGGL((kQuantizeBlockwise<T, 2048, 4, 0, DATA_TYPE>), dim3(num_blocks), dim3(512), 0, 0, code, A, absmax, out, rand, rand_offset, n);
    else if(blocksize == 1024)
        hipLaunchKernelGGL((kQuantizeBlockwise<T, 1024, 4, 0, DATA_TYPE>), dim3(num_blocks), dim3(256), 0, 0, code, A, absmax, out, rand, rand_offset, n);
    else if(blocksize == 512)
        hipLaunchKernelGGL((kQuantizeBlockwise<T, 512, 2, 0, DATA_TYPE>), dim3(num_blocks), dim3(256), 0, 0, code, A, absmax, out, rand, rand_offset, n);
    else if(blocksize == 256)
        hipLaunchKernelGGL((kQuantizeBlockwise<T, 256, 2, 0, DATA_TYPE>), dim3(num_blocks), dim3(128), 0, 0, code, A, absmax, out, rand, rand_offset, n);
    else if(blocksize == 128)
        hipLaunchKernelGGL((kQuantizeBlockwise<T, 128, 2, 0, DATA_TYPE>), dim3(num_blocks), dim3(64), 0, 0, code, A, absmax, out, rand, rand_offset, n);
    //else if(blocksize == 64)
    //    hipLaunchKernelGGL((kQuantizeBlockwise<T, 64, 2, 0, DATA_TYPE>), dim3(num_blocks), dim3(32), 0, 0, code, A, absmax, out, rand, rand_offset, n);

    CUDA_CHECK_RETURN(hipPeekAtLastError());
}

template <typename T, int DATA_TYPE>
void dequantizeBlockwise(float *code, unsigned char *A, float *absmax, T *out, int blocksize, const int n, hipStream_t stream)
{
    int num_blocks = n/blocksize;
    num_blocks = n % blocksize == 0 ? num_blocks : num_blocks + 1;
    int tile_size = (DATA_TYPE > 0) ? 1024 : 512;

    // 4-bit data (DATA_TYPE > 0) packs two values per byte, so the kernel
    // reads blocksize/2 bytes per quantization block and each 512-byte tile
    // produces 1024 outputs.
    if(DATA_TYPE > 0)
        hipLaunchKernelGGL((kDequantizeBlockwise<T, 512, 64, 8, DATA_TYPE>), dim3((n+tile_size-1)/tile_size), dim3(64), 0, stream, code, A, absmax, out, blocksize/2, n);
    else
        hipLaunchKernelGGL((kDequantizeBlockwise<T, 512, 64, 8, DATA_TYPE>), dim3((n+tile_size-1)/tile_size), dim3(64), 0, stream, code, A, absmax, out, blocksize, n);

    CUDA_CHECK_RETURN(hipPeekAtLastError());
}
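// 32-bit optimizer entry point. OPTIMIZER selects the update rule at compile
// time; two-state optimizers (Adam, AdEMAMix) use state1/state2, one-state
// optimizers (Momentum, RMSprop, Adagrad, Lion) only state1. When
// max_unorm > 0, a preconditioning pass first accumulates the update norm
// into `unorm` so the main kernel can clip the update against it.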
template <typename T, int OPTIMIZER>
void optimizer32bit(T* g, T* p,
        float* state1, float* state2, float *unorm, float max_unorm, float param_norm,
        const float beta1, const float beta2, const float beta3, const float alpha,
        const float eps, const float weight_decay,
        const int step, const float lr, const float gnorm_scale, bool skip_zeros, const int n)
{
    int num_blocks = n/4096;
    num_blocks = n % 4096 == 0 ? num_blocks : num_blocks + 1;

    switch(OPTIMIZER)
    {
        case ADAM:
        case ADEMAMIX:
            if(max_unorm > 0.0f)
            {
                CUDA_CHECK_RETURN(hipMemset(unorm, 0, 1*sizeof(float)));
                hipLaunchKernelGGL((kPreconditionOptimizer32bit2State<T, OPTIMIZER, 4096, 8>), dim3(num_blocks), dim3(512), 0, 0, g, p, state1, state2, unorm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, n);
                CUDA_CHECK_RETURN(hipPeekAtLastError());
            }
            hipLaunchKernelGGL((kOptimizer32bit2State<T, OPTIMIZER>), dim3(num_blocks), dim3(1024), 0, 0, g, p, state1, state2, unorm, max_unorm, param_norm, beta1, beta2, beta3, alpha, eps, weight_decay, step, lr, gnorm_scale, skip_zeros, n);
            CUDA_CHECK_RETURN(hipPeekAtLastError());
            break;
        case MOMENTUM:
        case RMSPROP:
        case ADAGRAD:
            if(max_unorm > 0.0f)
            {
                CUDA_CHECK_RETURN(hipMemset(unorm, 0, 1*sizeof(float)));
                hipLaunchKernelGGL((kPreconditionOptimizer32bit1State<T, OPTIMIZER, 4096, 8>), dim3(num_blocks), dim3(512), 0, 0, g, p, state1, unorm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, n);
                CUDA_CHECK_RETURN(hipPeekAtLastError());
            }
            hipLaunchKernelGGL((kOptimizer32bit1State<T, OPTIMIZER>), dim3(num_blocks), dim3(1024), 0, 0, g, p, state1, unorm, max_unorm, param_norm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, skip_zeros, n);
            CUDA_CHECK_RETURN(hipPeekAtLastError());
            break;
        case LION:
            // In Lion, the momentum update happens after the parameter update.
            hipLaunchKernelGGL((kOptimizer32bit1State<T, OPTIMIZER>), dim3(num_blocks), dim3(1024), 0, 0, g, p, state1, unorm, max_unorm, param_norm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, skip_zeros, n);
            CUDA_CHECK_RETURN(hipPeekAtLastError());

            if(max_unorm > 0.0f)
            {
                CUDA_CHECK_RETURN(hipMemset(unorm, 0, 1*sizeof(float)));
                hipLaunchKernelGGL((kPreconditionOptimizer32bit1State<T, OPTIMIZER, 4096, 8>), dim3(num_blocks), dim3(512), 0, 0, g, p, state1, unorm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, n);
                CUDA_CHECK_RETURN(hipPeekAtLastError());
            }
            break;
    }
}
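// 8-bit optimizer with dynamic quantization. The state tensors are stored as
// 8-bit indices into the `quantiles` tables; each step re-estimates the
// running maxima (max1/max2 -> new_max1/new_max2), so the precondition kernel
// zeroes and refills them before the update kernel dequantizes, updates, and
// re-quantizes the state.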
template <typename T, int OPTIMIZER>
void optimizerStatic8bit(T* p, T* g, unsigned char* state1, unsigned char* state2,
        float *unorm, float max_unorm, float param_norm,
        float beta1, float beta2, float eps, int step, float lr,
        float* quantiles1, float* quantiles2,
        float* max1, float* max2, float* new_max1, float* new_max2,
        float weight_decay, const float gnorm_scale, int n)
{
    int num_blocks = n/4096;
    num_blocks = n % 4096 == 0 ? num_blocks : num_blocks + 1;

    if(max_unorm > 0.0f){ CUDA_CHECK_RETURN(hipMemset(unorm, 0, 1*sizeof(float))); }

    switch(OPTIMIZER)
    {
        case ADAM:
            CUDA_CHECK_RETURN(hipMemset(new_max1, 0, 1*sizeof(float)));
            CUDA_CHECK_RETURN(hipMemset(new_max2, 0, 1*sizeof(float)));
            hipLaunchKernelGGL((kPreconditionOptimizerStatic8bit2State<T, OPTIMIZER>), dim3(num_blocks), dim3(256), 0, 0, p, g, state1, state2, unorm, beta1, beta2, eps, step, quantiles1, quantiles2, max1, max2, new_max1, new_max2, gnorm_scale, n);
            CUDA_CHECK_RETURN(hipPeekAtLastError());
            hipLaunchKernelGGL((kOptimizerStatic8bit2State<T, OPTIMIZER>), dim3(num_blocks), dim3(1024), 0, 0, p, g, state1, state2, unorm, max_unorm, param_norm, beta1, beta2, eps, step, lr, quantiles1, quantiles2, max1, max2, new_max1, new_max2, weight_decay, gnorm_scale, n);
            CUDA_CHECK_RETURN(hipPeekAtLastError());
            break;
        case MOMENTUM:
        case RMSPROP:
        case ADAGRAD:
            CUDA_CHECK_RETURN(hipMemset(new_max1, 0, 1*sizeof(float)));
            hipLaunchKernelGGL((kPreconditionOptimizerStatic8bit1State<T, OPTIMIZER>), dim3(num_blocks), dim3(256), 0, 0, p, g, state1, unorm, beta1, beta2, eps, step, quantiles1, max1, new_max1, weight_decay, gnorm_scale, n);
            CUDA_CHECK_RETURN(hipPeekAtLastError());
            hipLaunchKernelGGL((kOptimizerStatic8bit1State<T, OPTIMIZER>), dim3(num_blocks), dim3(1024), 0, 0, p, g, state1, unorm, max_unorm, param_norm, beta1, beta2, eps, step, lr, quantiles1, max1, new_max1, weight_decay, gnorm_scale, n);
            CUDA_CHECK_RETURN(hipPeekAtLastError());
            break;
        case LION:
            // In Lion, the momentum update happens after the parameter update.
            hipLaunchKernelGGL((kOptimizerStatic8bit1State<T, OPTIMIZER>), dim3(num_blocks), dim3(1024), 0, 0, p, g, state1, unorm, max_unorm, param_norm, beta1, beta2, eps, step, lr, quantiles1, max1, new_max1, weight_decay, gnorm_scale, n);
            CUDA_CHECK_RETURN(hipPeekAtLastError());

            CUDA_CHECK_RETURN(hipMemset(new_max1, 0, 1*sizeof(float)));
            hipLaunchKernelGGL((kPreconditionOptimizerStatic8bit1State<T, OPTIMIZER>), dim3(num_blocks), dim3(256), 0, 0, p, g, state1, unorm, beta1, beta2, eps, step, quantiles1, max1, new_max1, weight_decay, gnorm_scale, n);
            CUDA_CHECK_RETURN(hipPeekAtLastError());
            break;
        default:
            break;
    }
}
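// Blockwise 8-bit optimizer: instead of one global max per state tensor, each
// BLOCKSIZE_*STATE chunk keeps its own absmax (absmax1/absmax2), which removes
// the separate precondition pass and keeps the quantization error local.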
#define BLOCKSIZE_2STATE 256
#define NUM_2STATE 1
#define BLOCKSIZE_1STATE 256
#define NUM_1STATE 1

template <typename T, int OPTIMIZER>
void optimizerStatic8bitBlockwise(
    T* p, T* g, unsigned char* state1, unsigned char* state2,
    float beta1, float beta2, float beta3, float alpha,
    float eps, int step, float lr,
    float* quantiles1, float* quantiles2, float* absmax1, float* absmax2,
    float weight_decay, const float gnorm_scale, bool skip_zeros, int n
) {
    int num_blocks = 0;
    switch(OPTIMIZER)
    {
        case ADAM:
        case ADEMAMIX:
            num_blocks = n/BLOCKSIZE_2STATE;
            num_blocks = n % BLOCKSIZE_2STATE == 0 ? num_blocks : num_blocks + 1;
            hipLaunchKernelGGL((kOptimizerStatic8bit2StateBlockwise<T, OPTIMIZER, BLOCKSIZE_2STATE, NUM_2STATE>), dim3(num_blocks), dim3(BLOCKSIZE_2STATE/NUM_2STATE), 0, 0, p, g, state1, state2, beta1, beta2, beta3, alpha, eps, step, lr, quantiles1, quantiles2, absmax1, absmax2, weight_decay, gnorm_scale, skip_zeros, n);
            CUDA_CHECK_RETURN(hipPeekAtLastError());
            break;
        case MOMENTUM:
        case RMSPROP:
        case ADAGRAD:
        case LION:
            num_blocks = n/BLOCKSIZE_1STATE;
            num_blocks = n % BLOCKSIZE_1STATE == 0 ? num_blocks : num_blocks + 1;
            hipLaunchKernelGGL((kOptimizerStatic8bit1StateBlockwise<T, OPTIMIZER, BLOCKSIZE_1STATE, NUM_1STATE>), dim3(num_blocks), dim3(BLOCKSIZE_1STATE/NUM_1STATE), 0, 0, p, g, state1, beta1, beta2, eps, step, lr, quantiles1, absmax1, weight_decay, gnorm_scale, skip_zeros, n);
            CUDA_CHECK_RETURN(hipPeekAtLastError());
            break;
    }
}

template <typename T>
void percentileClipping(T * g, float *gnorm_vec, int step, const int n)
{
    int num_blocks = n/2048;
    num_blocks = n % 2048 == 0 ? num_blocks : num_blocks + 1;
    // gnorm_vec is a 100-entry ring buffer of gradient-norm history; clear
    // the slot for the current step before accumulating into it.
    CUDA_CHECK_RETURN(hipMemset(&gnorm_vec[step % 100], 0, 1*sizeof(float)));
    hipLaunchKernelGGL((kPercentileClipping<T, 2048, 4>), dim3(num_blocks), dim3(512), 0, 0, g, gnorm_vec, step, n);
    CUDA_CHECK_RETURN(hipPeekAtLastError());
}

// int8 x int8 -> int32 GEMM through the hipBLAS GemmEx interface.
void gemmex(Context *context, bool transposeA, bool transposeB, int m, int n, int k, void *A, void *B, void *C, int lda, int ldb, int ldc)
{
    const int falpha = 1;
    const int fbeta = 0;
    const void * alpha = &falpha;
    const void * beta = &fbeta;
    hipblasStatus_t status;

#if hipblasVersionMajor >= 3
    status = hipblasGemmEx(context->m_handle,
            transposeA ? HIPBLAS_OP_T : HIPBLAS_OP_N,
            transposeB ? HIPBLAS_OP_T : HIPBLAS_OP_N,
            m, n, k,
            alpha, A, HIP_R_8I, lda, B, HIP_R_8I, ldb, beta,
            C, HIP_R_32I, ldc,
            HIPBLAS_COMPUTE_32I, HIPBLAS_GEMM_DEFAULT);
#else
    status = hipblasGemmEx(context->m_handle,
            transposeA ? HIPBLAS_OP_T : HIPBLAS_OP_N,
            transposeB ? HIPBLAS_OP_T : HIPBLAS_OP_N,
            m, n, k,
            alpha, A, HIPBLAS_R_8I, lda, B, HIPBLAS_R_8I, ldb, beta,
            C, HIPBLAS_R_32I, ldc,
            HIPBLAS_R_32I, HIPBLAS_GEMM_DEFAULT);
#endif

    if (status != HIPBLAS_STATUS_SUCCESS)
    {
        std::cout << "HIPBLAS ERROR: Status " << status << std::endl;
    }
}

void strided_gemmex(Context *context, bool transposeA, bool transposeB, int m, int n, int k,
        void *A, void *B, void *C, int lda, int ldb, int ldc,
        long long int strideA, long long int strideB, long long int strideC, int batchCount)
{
    const int falpha = 1;
    const int fbeta = 0;
    const void * alpha = &falpha;
    const void * beta = &fbeta;
    hipblasStatus_t status;

    //cout << transposeA << transposeB << endl;
    //printf("%i %i %i\n", m,n,k);
    //printf("%i %i %i\n", lda,ldb,ldc);
    //printf("%i %i %i\n", strideA, strideB, strideC);
    //printf("%i\n", batchCount);

#if hipblasVersionMajor >= 3
    status = hipblasGemmStridedBatchedEx(context->m_handle,
            transposeA ? HIPBLAS_OP_T : HIPBLAS_OP_N,
            transposeB ? HIPBLAS_OP_T : HIPBLAS_OP_N,
            m, n, k,
            alpha, A, HIP_R_8I, lda, (long long int)strideA,
            B, HIP_R_8I, ldb, (long long int)strideB, beta,
            C, HIP_R_32I, ldc, (long long int)strideC, batchCount,
            HIPBLAS_COMPUTE_32I, HIPBLAS_GEMM_DEFAULT);
#else
    status = hipblasGemmStridedBatchedEx(context->m_handle,
            transposeA ? HIPBLAS_OP_T : HIPBLAS_OP_N,
            transposeB ? HIPBLAS_OP_T : HIPBLAS_OP_N,
            m, n, k,
            alpha, A, HIPBLAS_R_8I, lda, (long long int)strideA,
            B, HIPBLAS_R_8I, ldb, (long long int)strideB, beta,
            C, HIPBLAS_R_32I, ldc, (long long int)strideC, batchCount,
            HIPBLAS_R_32I, HIPBLAS_GEMM_DEFAULT);
#endif

    if (status != HIPBLAS_STATUS_SUCCESS)
    {
        std::cout << "HIPBLAS ERROR: Status " << status << std::endl;
    }
}
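// roundoff() rounds v up to the next multiple of d, e.g. roundoff(13, 8) == 16.
// It is kept for the tiled layouts referenced below even though the hipBLASLt
// path currently maps all of them to plain column-major order.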
int roundoff(int v, int d)
{
    return (v + d - 1) / d * d;
}

#ifndef NO_HIPBLASLT
template <int ORDER> hipblasLtOrder_t get_order()
{
    switch(ORDER)
    {
        case ROW:
            return HIPBLASLT_ORDER_ROW;
            break;
        case COL:
            return HIPBLASLT_ORDER_COL;
            break;
        case COL32:
            //return HIPBLASLT_ORDER_COL32;
            return HIPBLASLT_ORDER_COL;
            break;
        case COL_TURING:
            //return HIPBLASLT_ORDER_COL4_4R2_8C;
            return HIPBLASLT_ORDER_COL;
            break;
        case COL_AMPERE:
            //return HIPBLASLT_ORDER_COL32_2R_4R4;
            return HIPBLASLT_ORDER_COL;
            break;
        default:
            break;
    }
    return HIPBLASLT_ORDER_ROW;
}

template hipblasLtOrder_t get_order<ROW>();
template hipblasLtOrder_t get_order<COL>();
template hipblasLtOrder_t get_order<COL32>();
//template hipblasLtOrder_t get_order<COL_TURING>();
//template hipblasLtOrder_t get_order<COL_AMPERE>();
#endif

template <int ORDER> int get_leading_dim(int dim1, int dim2)
{
    switch(ORDER)
    {
        case ROW:
            return dim2;
            break;
        case COL:
            return dim1;
            break;
        default:
            return dim1;
            break;
        /*
        case COL32:
            // 32*row tiles
            return dim1*32;
            break;
        case COL_TURING:
            return 32*roundoff(dim1, 8);
            break;
        case COL_AMPERE:
            // 32*32 tiles
            return 32*roundoff(dim1, 32);
            break;
        default:
            return 0;
            break;
        */
    }
}

static std::string hipError_to_string(const hipError_t ret)
{
    switch(ret)
    {
        case hipSuccess: return "hipSuccess";
        case hipErrorInvalidContext: return "hipErrorInvalidContext";
        case hipErrorInvalidKernelFile: return "hipErrorInvalidKernelFile";
        case hipErrorMemoryAllocation: return "hipErrorMemoryAllocation";
        case hipErrorInitializationError: return "hipErrorInitializationError";
        case hipErrorLaunchFailure: return "hipErrorLaunchFailure";
        case hipErrorLaunchOutOfResources: return "hipErrorLaunchOutOfResources";
        case hipErrorInvalidDevice: return "hipErrorInvalidDevice";
        case hipErrorInvalidValue: return "hipErrorInvalidValue";
        case hipErrorInvalidDevicePointer: return "hipErrorInvalidDevicePointer";
        case hipErrorInvalidMemcpyDirection: return "hipErrorInvalidMemcpyDirection";
        case hipErrorUnknown: return "hipErrorUnknown";
        case hipErrorInvalidResourceHandle: return "hipErrorInvalidResourceHandle";
        case hipErrorNotReady: return "hipErrorNotReady";
        case hipErrorNoDevice: return "hipErrorNoDevice";
        case hipErrorPeerAccessAlreadyEnabled: return "hipErrorPeerAccessAlreadyEnabled";
        case hipErrorPeerAccessNotEnabled: return "hipErrorPeerAccessNotEnabled";
        case hipErrorRuntimeMemory: return "hipErrorRuntimeMemory";
        case hipErrorRuntimeOther: return "hipErrorRuntimeOther";
        case hipErrorHostMemoryAlreadyRegistered: return "hipErrorHostMemoryAlreadyRegistered";
        case hipErrorHostMemoryNotRegistered: return "hipErrorHostMemoryNotRegistered";
        case hipErrorMapBufferObjectFailed: return "hipErrorMapBufferObjectFailed";
        case hipErrorTbd: return "hipErrorTbd";
        default: throw std::runtime_error("unknown hipError");
    }
}
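// igemmlt runs the int8 GEMM C = A^T B through hipBLASLt. DTYPE_OUT selects
// int32 (32) or int8 (8) output; SCALE_ROWS switches alpha from a host scalar
// to a per-row device vector (row_scale) on the int8-output path.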
template <int DTYPE_OUT, int SCALE_ROWS>
int igemmlt(
    hipblasLtHandle_t ltHandle, int m, int n, int k,
    const int8_t *A, const int8_t *B, void *C,
    float *row_scale, int lda, int ldb, int ldc, hipStream_t stream
) {
#ifdef NO_HIPBLASLT
    return ERR_NOT_IMPLEMENTED;
#else
    // Calculate C = A^T @ B, in col-major layout.
    //
    // Using the IMMA kernels requires:
    //  * A must be transposed and B must be non-transposed.
    //  * Dimensions m and k must be multiples of 4.
    //  * All pointers must be 4-byte aligned; 16-byte alignment preferred.

    int has_error = 0;
    const int64_t max_workspace_size = 0; // set to 0 to avoid choosing a GSU kernel

    hipblasLtMatmulDesc_t matmulDesc;
    hipblasLtMatrixLayout_t aDesc, bDesc, cDesc;
    hipblasOperation_t opT = HIPBLAS_OP_T;

    hipDataType outType = DTYPE_OUT == 32 ? HIP_R_32I : HIP_R_8I;
    hipDataType scaleType = DTYPE_OUT == 32 ? HIP_R_32I : HIP_R_32F;

    hipblasLtPointerMode_t pointerMode = HIPBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_HOST;

    has_error |= checkHipblasStatus(hipblasLtMatrixLayoutCreate(&aDesc, HIP_R_8I, m, k, lda));
    has_error |= checkHipblasStatus(hipblasLtMatrixLayoutCreate(&bDesc, HIP_R_8I, m, n, ldb));
    has_error |= checkHipblasStatus(hipblasLtMatrixLayoutCreate(&cDesc, outType, k, n, ldc));

    // Default layout order is col major

    has_error |= checkHipblasStatus(hipblasLtMatmulDescCreate(&matmulDesc, HIPBLAS_COMPUTE_32I, scaleType));
    has_error |= checkHipblasStatus(hipblasLtMatmulDescSetAttribute(matmulDesc, HIPBLASLT_MATMUL_DESC_TRANSA, &opT, sizeof(opT)));

    if (DTYPE_OUT == 32)
    {
        /* Algo and workspace TODO: need to rework to not be duplicated */
        // Set User Preference attributes
        hipblasLtMatmulPreference_t pref;
        checkHipblasStatus(hipblasLtMatmulPreferenceCreate(&pref));
        checkHipblasStatus(hipblasLtMatmulPreferenceSetAttribute(
            pref, HIPBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &max_workspace_size, sizeof(max_workspace_size)));

        const int request_solutions = 1;
        hipblasLtMatmulHeuristicResult_t heuristicResult[request_solutions];
        int returnedAlgoCount = 0;
        checkHipblasStatus(hipblasLtMatmulAlgoGetHeuristic(
            ltHandle, matmulDesc, aDesc, bDesc, cDesc, cDesc,
            pref, request_solutions, heuristicResult, &returnedAlgoCount));

        if (returnedAlgoCount == 0)
        {
            has_error = 1;
            fprintf(stderr, "Error: Matmul Algo Heuristic didn't return algorithms\n");
        }
        else
        {
            int alpha = 1, beta = 0;
            has_error |= checkHipblasStatus(hipblasLtMatmul(
                ltHandle, matmulDesc, &alpha, A, aDesc, B, bDesc, &beta,
                (int32_t*)C, cDesc, (int32_t*)C, cDesc,
                &heuristicResult[0].algo, NULL, 0, stream));
        }
    }
    else
    {
        // This path is unlikely to be used, as 8-bit accumulation would likely overflow.
        if (!SCALE_ROWS)
        {
            float alpha = 1.0f, beta = 0.0f;
            has_error |= checkHipblasStatus(hipblasLtMatmul(
                ltHandle, matmulDesc, &alpha, A, aDesc, B, bDesc, &beta,
                (int8_t*)C, cDesc, (int8_t*)C, cDesc,
                NULL, NULL, 0, stream));
        }
        else
        {
            float beta = 0.0f;
            // alpha comes from the row_scale device vector, one scale per output row.
            has_error |= checkHipblasStatus(hipblasLtMatmulDescSetAttribute(
                matmulDesc, HIPBLASLT_MATMUL_DESC_POINTER_MODE, &pointerMode, sizeof(pointerMode)));
            has_error |= checkHipblasStatus(hipblasLtMatmul(
                ltHandle, matmulDesc, row_scale, A, aDesc, B, bDesc, &beta,
                (int8_t*)C, cDesc, (int8_t*)C, cDesc,
                NULL, NULL, 0, stream));
        }
    }

    has_error |= checkHipblasStatus(hipblasLtMatrixLayoutDestroy(cDesc));
    has_error |= checkHipblasStatus(hipblasLtMatrixLayoutDestroy(bDesc));
    has_error |= checkHipblasStatus(hipblasLtMatrixLayoutDestroy(aDesc));
    has_error |= checkHipblasStatus(hipblasLtMatmulDescDestroy(matmulDesc));
    if (has_error == 1)
        printf("error detected");

    return has_error;
#endif // NO_HIPBLASLT
}

// Round `value` up to the nearest multiple of `multiple`,
// e.g. fill_up_to_nearest_multiple(10, 8) == 16.
int fill_up_to_nearest_multiple(int value, int multiple)
{
    return value + (value % multiple == 0 ? 0 : (multiple - (value % multiple)));
}
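// Dequantizes the int32 output of the 8-bit matmul back to fp16 using the
// per-row and per-column statistics gathered before quantization, optionally
// adding a bias. The launch covers n = numRows*numCols elements with four
// elements per thread.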
void dequant_mm_int32_fp16(int *A, float *rowStats, float *colStats, half *out, half *bias, int numRows, int numCols, hipStream_t stream)
{
    const int threads = 512;
    const int num_per_thread = 4;
    const int num_per_block = threads * num_per_thread;
    const int n = numRows*numCols;
    const int num_blocks = (n + num_per_block - 1) / num_per_block;

    hipLaunchKernelGGL((kdequant_mm_int32_fp16<num_per_thread, threads>), dim3(num_blocks), dim3(threads), 0, stream, A, rowStats, colStats, out, bias, numRows, numCols, n);
    CUDA_CHECK_RETURN(hipPeekAtLastError());
}

void int8VectorQuant(half * __restrict__ A, int8_t *out, float *rowStats, float threshold, int rows, int cols, hipStream_t stream)
{
    // One block per row; the last template argument selects whether outliers
    // above `threshold` are excluded from the row statistics.
    if (threshold == 0.0) {
        kInt8VectorQuant<half, 1024, 0><<<rows, 1024, 0, stream>>>(A, out, rowStats, threshold, rows, cols);
    } else {
        kInt8VectorQuant<half, 1024, 1><<<rows, 1024, 0, stream>>>(A, out, rowStats, threshold, rows, cols);
    }
    CUDA_CHECK_RETURN(hipPeekAtLastError());
}

void getRowStats(half *A, float *rowStats, float threshold, int rows, int cols, hipStream_t stream)
{
    if (threshold == 0.0)
        kgetRowStats<half, 1024, 0><<<rows, 1024, 0, stream>>>(A, rowStats, threshold, rows, cols);
    else
        kgetRowStats<half, 1024, 1><<<rows, 1024, 0, stream>>>(A, rowStats, threshold, rows, cols);
    CUDA_CHECK_RETURN(hipPeekAtLastError());
}

void spmm_coo(hipsparseHandle_t handle, int *A_rowidx, int *A_colidx, half *A_vals, int A_nnz, int A_rows, int A_cols, int B_cols, int ldb, half *B, int ldc, half* C, bool transposed_B)
{
#ifndef NO_HIPBLASLT
    hipsparseSpMatDescr_t descA;
    hipsparseDnMatDescr_t descB, descC;

    float alpha = 1.0f;
    float beta = 0.0f;
    void *dBuffer = NULL;
    size_t bufferSize = 0;

    CHECK_HIPSPARSE( hipsparseCreateCoo(&descA, A_rows, A_cols, A_nnz,
                                        A_rowidx, A_colidx, A_vals,
                                        HIPSPARSE_INDEX_32I,
                                        HIPSPARSE_INDEX_BASE_ZERO, HIP_R_16F) );
    // Create dense matrix C
    CHECK_HIPSPARSE( hipsparseCreateDnMat(&descC, A_rows, B_cols, ldc, C,
                                          HIP_R_16F, HIPSPARSE_ORDER_ROW) );
    // Create dense matrix B
    if(transposed_B)
    {
        int tmp = A_cols;
        A_cols = B_cols;
        B_cols = tmp;
    }

    CHECK_HIPSPARSE( hipsparseCreateDnMat(&descB, A_cols, B_cols, ldb, B,
                                          HIP_R_16F, HIPSPARSE_ORDER_ROW) );
    // allocate an external buffer if needed
    CHECK_HIPSPARSE( hipsparseSpMM_bufferSize(
                     handle,
                     HIPSPARSE_OPERATION_NON_TRANSPOSE,
                     transposed_B ? HIPSPARSE_OPERATION_TRANSPOSE : HIPSPARSE_OPERATION_NON_TRANSPOSE,
                     &alpha, descA, descB, &beta, descC, HIP_R_32F,
                     HIPSPARSE_SPMM_ALG_DEFAULT, &bufferSize) );
    CUDA_CHECK_RETURN( hipMalloc(&dBuffer, bufferSize) );

    // execute SpMM
    CHECK_HIPSPARSE( hipsparseSpMM(handle,
                     HIPSPARSE_OPERATION_NON_TRANSPOSE,
                     transposed_B ? HIPSPARSE_OPERATION_TRANSPOSE : HIPSPARSE_OPERATION_NON_TRANSPOSE,
                     &alpha, descA, descB, &beta, descC, HIP_R_32F,
                     HIPSPARSE_SPMM_ALG_DEFAULT, dBuffer));

    // destroy matrix/vector descriptors
    CHECK_HIPSPARSE( hipsparseDestroySpMat(descA) );
    CHECK_HIPSPARSE( hipsparseDestroyDnMat(descB) );
    CHECK_HIPSPARSE( hipsparseDestroyDnMat(descC) );
    CUDA_CHECK_RETURN( hipFree(dBuffer) );
#endif
}
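// Naive SpMM for extremely sparse COO matrices (e.g. the outlier columns that
// LLM.int8() splits off): one thread block per nonzero row, with B either in
// half (BITS=16) or int8 (BITS=8) plus dequantization statistics.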
template <typename T, int BITS>
void spmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, T *B, half *out, float *dequant_stats, int nnz_rows, int nnz, int rowsA, int rowsB, int colsB)
{
    hipLaunchKernelGGL((kspmm_coo_very_sparse_naive<T, 8, BITS>), dim3(nnz_rows), dim3(256), 0, 0, max_count, max_idx, offset_rowidx, rowidx, colidx, values, B, out, dequant_stats, nnz, rowsA, rowsB, colsB);
    CUDA_CHECK_RETURN(hipPeekAtLastError());
}

template <typename T>
void gemm_host(int m, int n, int k, T * A, T* B, T * out, int lda, int ldb, int ldc, int bits)
{
    int num_blocks = (m+31)/32;

    if(bits == 32)
        hipLaunchKernelGGL((gemm_device<T, 32, 32>), dim3(num_blocks), dim3(32), 0, 0, m, n, k, A, B, out, lda, ldb, ldc);
    if(bits == 16)
        hipLaunchKernelGGL((gemm_device<T, 16, 160>), dim3(num_blocks), dim3(160), 0, 0, m, n, k, A, B, out, lda, ldb, ldc);
}

template <typename T>
void gemm_4bit_inference(int m, int n, int k, T * A, unsigned char* B, float *absmax, T * out, int lda, int ldb, int ldc, int blocksize)
{
    int num_blocks = (m+31)/32;
    hipLaunchKernelGGL((kgemm_4bit_inference<T, 96>), dim3(num_blocks), dim3(96), 0, 0, m, n, k, A, B, absmax, out, lda, ldb, ldc, blocksize);
}

template <typename T, int BITS>
void gemm_4bit_inference_naive(int m, int n, int k, T * A, unsigned char* B, float *absmax, float *datatype, T * out, int lda, int ldb, int ldc, int blocksize, hipStream_t stream)
{
    // Each block runs 128 threads: four warps on wave32 hardware, two waves
    // on wave64 (gfx9). Size the grid so each warp/wave gets one output row.
    int num_blocks = (m+3)/4;  // warp size 32
    if (WARP_SIZE == 64) {
        num_blocks = (m+1)/2;  // warp size 64
    }

    hipLaunchKernelGGL((kgemm_4bit_inference_naive<T, 128, BITS>), dim3(num_blocks), dim3(128), 0, stream, m, n, k, A, B, absmax, datatype, out, lda, ldb, ldc, blocksize);
    CUDA_CHECK_RETURN(hipPeekAtLastError());
}

template <typename T, int FUNC>
void func(T *A, T *B, T value, long n)
{
    int threads = 512;
    int blocks = n/threads;
    blocks = n % threads == 0 ? blocks : blocks + 1;
    // Cap the grid at 65535 blocks; the kernel grid-strides over the rest.
    blocks = blocks > 65535 ? 65535 : blocks;
    hipLaunchKernelGGL((kfunc<T, FUNC>), dim3(blocks), dim3(512), 0, 0, A, B, value, n);
    CUDA_CHECK_RETURN(hipPeekAtLastError());
}
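// The explicit instantiations below pin down every (element type, optimizer,
// quantization format) combination exposed to the Python bindings; anything
// not listed here would fail to link.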
//==============================================================
//                   TEMPLATE DEFINITIONS
//==============================================================

template void func<float, FILL>(float *A, float *B, float value, long n);
template void func<unsigned char, FILL>(unsigned char *A, unsigned char *B, unsigned char value, long n);
template void func<float, ARANGE>(float *A, float *B, float value, long n);
template void func<float, _MUL>(float *A, float *B, float value, long n);

template void gemm_4bit_inference<half>(int m, int n, int k, half * A, unsigned char* B, float *absmax, half * out, int lda, int ldb, int ldc, int blocksize);
template void gemm_4bit_inference_naive<half, 16>(int m, int n, int k, half * A, unsigned char* B, float *absmax, float *datatype, half * out, int lda, int ldb, int ldc, int blocksize, hipStream_t stream);
template void gemm_4bit_inference_naive<hip_bfloat16, 16>(int m, int n, int k, hip_bfloat16 * A, unsigned char* B, float *absmax, float *datatype, hip_bfloat16 * out, int lda, int ldb, int ldc, int blocksize, hipStream_t stream);
template void gemm_4bit_inference_naive<float, 32>(int m, int n, int k, float * A, unsigned char* B, float *absmax, float *datatype, float * out, int lda, int ldb, int ldc, int blocksize, hipStream_t stream);

//template void gemm_host<float>(int m, int n, int k, float * A, float* B, float * out, int lda, int ldb, int ldc, int bits);
template void gemm_host<half>(int m, int n, int k, half * A, half* B, half * out, int lda, int ldb, int ldc, int bits);

template void spmm_coo_very_sparse_naive<half, 16>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float *dequant_stats, int nnz_rows, int nnz, int rowsA, int rowsB, int colsB);
template void spmm_coo_very_sparse_naive<signed char, 8>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float *dequant_stats, int nnz_rows, int nnz, int rowsA, int rowsB, int colsB);

template int igemmlt<32, 0>(hipblasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc, hipStream_t stream);
template int igemmlt<8, 0>(hipblasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc, hipStream_t stream);
template int igemmlt<8, 1>(hipblasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc, hipStream_t stream);

template void quantizeBlockwise<half, 1, General8bit>(float * code, half *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n);
template void quantizeBlockwise<half, 0, General8bit>(float * code, half *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n);
template void quantizeBlockwise<half, 0, FP4>(float * code, half *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n);
template void quantizeBlockwise<half, 0, NF4>(float * code, half *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n);
template void quantizeBlockwise<float, 1, General8bit>(float * code, float *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n);
template void quantizeBlockwise<float, 0, General8bit>(float * code, float *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n);
template void quantizeBlockwise<float, 0, FP4>(float * code, float *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n);
template void quantizeBlockwise<float, 0, NF4>(float * code, float *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n);
template void quantizeBlockwise<hip_bfloat16, 1, General8bit>(float * code, hip_bfloat16 *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n);
template void quantizeBlockwise<hip_bfloat16, 0, General8bit>(float * code, hip_bfloat16 *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n);
template void quantizeBlockwise<hip_bfloat16, 0, FP4>(float * code, hip_bfloat16 *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n);
template void quantizeBlockwise<hip_bfloat16, 0, NF4>(float * code, hip_bfloat16 *A, float *absmax, unsigned char *out, float* rand, int rand_offset, int blocksize, const int n);

template void dequantizeBlockwise<float, General8bit>(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n, hipStream_t stream);
template void dequantizeBlockwise<float, FP4>(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n, hipStream_t stream);
template void dequantizeBlockwise<float, NF4>(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n, hipStream_t stream);
template void dequantizeBlockwise<half, General8bit>(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n, hipStream_t stream);
template void dequantizeBlockwise<half, FP4>(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n, hipStream_t stream);
template void dequantizeBlockwise<half, NF4>(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n, hipStream_t stream);
template void dequantizeBlockwise<hip_bfloat16, General8bit>(float *code, unsigned char *A, float *absmax, hip_bfloat16 *out, int blocksize, const int n, hipStream_t stream);
template void dequantizeBlockwise<hip_bfloat16, FP4>(float *code, unsigned char *A, float *absmax, hip_bfloat16 *out, int blocksize, const int n, hipStream_t stream);
template void dequantizeBlockwise<hip_bfloat16, NF4>(float *code, unsigned char *A, float *absmax, hip_bfloat16 *out, int blocksize, const int n, hipStream_t stream);

#define MAKE_optimizer32bit(name, gtype) \
template void optimizer32bit<gtype, name>(gtype* g, gtype* p, \
        float* state1, float* state2, float* unorm, float max_unorm, float param_norm, \
        const float beta1, const float beta2, const float beta3, const float alpha, const float eps, const float weight_decay, \
        const int step, const float lr, const float gnorm_scale, const bool skip_zeros, const int n);

MAKE_optimizer32bit(ADAM, half)
MAKE_optimizer32bit(ADAM, float)
MAKE_optimizer32bit(ADAM, hip_bfloat16)
MAKE_optimizer32bit(MOMENTUM, half)
MAKE_optimizer32bit(MOMENTUM, float)
MAKE_optimizer32bit(MOMENTUM, hip_bfloat16)
MAKE_optimizer32bit(RMSPROP, half)
MAKE_optimizer32bit(RMSPROP, float)
MAKE_optimizer32bit(RMSPROP, hip_bfloat16)
MAKE_optimizer32bit(LION, half)
MAKE_optimizer32bit(LION, float)
MAKE_optimizer32bit(LION, hip_bfloat16)
MAKE_optimizer32bit(ADAGRAD, half)
MAKE_optimizer32bit(ADAGRAD, float)
MAKE_optimizer32bit(ADAGRAD, hip_bfloat16)
MAKE_optimizer32bit(ADEMAMIX, half)
MAKE_optimizer32bit(ADEMAMIX, hip_bfloat16)
MAKE_optimizer32bit(ADEMAMIX, float)

#define MAKE_optimizerStatic8bit(name, gtype) \
template void optimizerStatic8bit<gtype, name>(gtype* p, gtype* g, unsigned char* state1, unsigned char* state2, \
        float *unorm, float max_unorm, float param_norm, \
        float beta1, float beta2, \
        float eps, int step, float lr, \
        float* quantiles1, float* quantiles2, \
        float* max1, float* max2, float* new_max1, float* new_max2, \
        float weight_decay, \
        const float gnorm_scale, int n);
MAKE_optimizerStatic8bit(ADAM, half)
MAKE_optimizerStatic8bit(ADAM, float)
MAKE_optimizerStatic8bit(MOMENTUM, half)
MAKE_optimizerStatic8bit(MOMENTUM, float)
MAKE_optimizerStatic8bit(RMSPROP, half)
MAKE_optimizerStatic8bit(RMSPROP, float)
MAKE_optimizerStatic8bit(LION, half)
MAKE_optimizerStatic8bit(LION, float)
MAKE_optimizerStatic8bit(ADAGRAD, half)
MAKE_optimizerStatic8bit(ADAGRAD, float)

#define MAKE_optimizerStatic8bitBlockwise(gtype, optim_name) \
template void optimizerStatic8bitBlockwise<gtype, optim_name>(gtype* p, gtype* g, \
        unsigned char* state1, unsigned char* state2, float beta1, float beta2, float beta3, float alpha, float eps, int step, float lr, \
        float* quantiles1, float* quantiles2, float* absmax1, float* absmax2, float weight_decay, const float gnorm_scale, bool skip_zeros, int n);

MAKE_optimizerStatic8bitBlockwise(half, ADAM);
MAKE_optimizerStatic8bitBlockwise(float, ADAM);
MAKE_optimizerStatic8bitBlockwise(hip_bfloat16, ADAM);
MAKE_optimizerStatic8bitBlockwise(half, MOMENTUM);
MAKE_optimizerStatic8bitBlockwise(float, MOMENTUM);
MAKE_optimizerStatic8bitBlockwise(hip_bfloat16, MOMENTUM);
MAKE_optimizerStatic8bitBlockwise(half, RMSPROP);
MAKE_optimizerStatic8bitBlockwise(float, RMSPROP);
MAKE_optimizerStatic8bitBlockwise(hip_bfloat16, RMSPROP);
MAKE_optimizerStatic8bitBlockwise(half, LION);
MAKE_optimizerStatic8bitBlockwise(float, LION);
MAKE_optimizerStatic8bitBlockwise(hip_bfloat16, LION);
MAKE_optimizerStatic8bitBlockwise(half, ADAGRAD);
MAKE_optimizerStatic8bitBlockwise(float, ADAGRAD);
MAKE_optimizerStatic8bitBlockwise(hip_bfloat16, ADAGRAD);
MAKE_optimizerStatic8bitBlockwise(half, ADEMAMIX);
MAKE_optimizerStatic8bitBlockwise(hip_bfloat16, ADEMAMIX);
MAKE_optimizerStatic8bitBlockwise(float, ADEMAMIX);

template void percentileClipping<float>(float * g, float *gnorm_vec, int step, const int n);
template void percentileClipping<half>(half * g, float *gnorm_vec, int step, const int n);

template int get_leading_dim<ROW>(int dim1, int dim2);
template int get_leading_dim<COL>(int dim1, int dim2);
template int get_leading_dim<COL32>(int dim1, int dim2);