#ifndef CK_CONFIG_HPP
#define CK_CONFIG_HPP

#include <cstdint>

#cmakedefine01 DEVICE_BACKEND_HIP
#cmakedefine01 DEVICE_BACKEND_CUDA

#if DEVICE_BACKEND_HIP
#include "hip/hip_runtime.h"
#include "hip/hip_fp16.h"
#define CK_USE_AMD_INLINE_ASM 1
#elif DEVICE_BACKEND_CUDA
#include "cuda_runtime.h"
#include "cuda_fp16.h"
#include "nvToolsExt.h"
#include "helper_cuda.h"
#define CK_USE_AMD_INLINE_ASM 0
#endif

namespace ck {

#if DEVICE_BACKEND_HIP
// For some reason, the HIP compiler needs these ext_vector_type definitions
// to generate optimal load and store instructions
typedef float float2_t __attribute__((ext_vector_type(2)));
typedef float float4_t __attribute__((ext_vector_type(4)));
#else
// For some reason, CUDA needs these definitions; otherwise the compiler
// won't generate optimal load and store instructions, and the kernel
// produces wrong results, indicating the compiler fails to generate
// correct instructions
using float2_t = float2;
using float4_t = float4;
#endif

using index_t = uint32_t;

__device__ void fused_multiply_accumulate(float& d, const float& s0, const float& s1)
{
    d += s0 * s1;
}

#if 0
__device__ void fused_multiply_accumulate(half& d, const half& s0, const half& s1)
{
    d += s0 * s1;
}

__device__ void fused_multiply_accumulate(half& d, const half2& s0, const half2& s1)
{
    d += s0.x * s1.x;
    d += s0.y * s1.y;
}

__device__ void fused_multiply_accumulate(float& d, const half2& s0, const half2& s1)
{
    d += s0.x * s1.x + s0.y * s1.y;
}

__device__ void fused_multiply_accumulate(char& d, const char& s0, const char& s1)
{
    d += s0 * s1;
}

// TODO: this interface is misleading; s0 and s1 are actually int8x4 values
// packed into int32_t. Need to make a better interface.
__device__ void fused_multiply_accumulate(int32_t& d, const int32_t& s0, const int32_t& s1)
{
#if DEVICE_BACKEND_CUDA
    d = __dp4a(s0, s1, d);
#endif
}
#endif

} // namespace ck

#endif
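
// Usage sketch (illustrative only, not part of this header): a device-side
// dot product that uses float4_t for vectorized loads and accumulates through
// fused_multiply_accumulate. The kernel name, the atomicAdd reduction, and
// the assumption that n is a multiple of 4 are all hypothetical; it is kept
// in a comment so the header's compilation behavior is unchanged.
//
// __global__ void dot_product(const float* a, const float* b, float* out, ck::index_t n)
// {
//     float acc = 0;
//
//     // each thread strides over the arrays four floats at a time,
//     // reading through float4_t so the compiler emits vector loads
//     for(ck::index_t i = 4 * threadIdx.x; i < n; i += 4 * blockDim.x)
//     {
//         const ck::float4_t va = *reinterpret_cast<const ck::float4_t*>(a + i);
//         const ck::float4_t vb = *reinterpret_cast<const ck::float4_t*>(b + i);
//
//         ck::fused_multiply_accumulate(acc, va.x, vb.x);
//         ck::fused_multiply_accumulate(acc, va.y, vb.y);
//         ck::fused_multiply_accumulate(acc, va.z, vb.z);
//         ck::fused_multiply_accumulate(acc, va.w, vb.w);
//     }
//
//     atomicAdd(out, acc);
// }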