/*******************************************************************************************
 * This file contains declarations of generic helper functions and implementations of 
 * helper macros.
 ******************************************************************************************/

#ifndef _COMMON_H
#define _COMMON_H

#include <sys/time.h>
#include <hip/hip_runtime.h>
#include "blockdim_set.h"
#include "opencc.h"


/* Portability shim for HIP_SYMBOL, selected by whether __NVCC__ is defined.
 * Both branches only take effect when HIP_SYMBOL has not been defined already
 * (for example by a header included above). */
#ifndef __NVCC__

// __NVCC__ is not defined: HIP_SYMBOL(var) expands to the address of var.
#ifndef HIP_SYMBOL
#define HIP_SYMBOL( var ) (&var)
#endif

#else

// __NVCC__ is defined: HIP_SYMBOL(var) expands to var itself and warpSize is
// replaced by the literal 32.
// NOTE(review): the warpSize define sits inside the HIP_SYMBOL guard, so it is
// skipped whenever HIP_SYMBOL is already defined -- confirm this is intended.
#ifndef HIP_SYMBOL
#define HIP_SYMBOL( var ) (var)
#define warpSize 32
#endif

#endif

/** Gas constant used in the ideal-gas equation of state, J/(kmol·K).
 *  NOTE(review): the CODATA value is 8314.4626 J/(kmol·K); confirm that
 *  8313.8462 is intentional (p_atm_d_R below is derived from this value).
 */
#define R 8313.8462 

/** NOTE(review): presumably the standard-state reference temperature in K;
 *  its use is not visible in this header -- confirm.
 */
#define Tsd 298

/** Standard atmospheric pressure, Pa.
 */
#define p_atm 101325

/** Pre-computed quotient, numerically equal to p_atm / R
 *  (101325 / 8313.8462). Must be updated by hand if either changes.
 */
#define p_atm_d_R 12.18749993234178

/** Small threshold constant; its use is not visible in this header.
 */
#define SMALL 1e-6

/** Very small threshold constant; its use is not visible in this header.
 */
#define VSMALL 1e-35

/** Natural logarithm of 10.
 */
#define log_ten 2.302585092994046

/** Sizes the n_seq_d (k_max+1 entries) and coeff_d ((k_max+1)^2 entries)
 *  constant arrays declared in this header.
 */
#define k_max 12

/** NOTE(review): presumably the absolute tolerance of the integrator -- confirm.
 *  This macro shares its name with the C library function atol(); any call to
 *  that function after this header is included will be rewritten by the
 *  preprocessor.
 */
#define atol 1e-12

/** NOTE(review): presumably the relative tolerance of the integrator -- confirm.
 */
#define rtol 1e-2

/** NOTE(review): presumably a threshold that decides when the Jacobian is
 *  recomputed; its use is not visible in this header -- confirm.
 */
#define jac_redo 1e-4

/** Large threshold constant; its use is not visible in this header.
 */
#define GREAT 1e6


/* Host-side pitch values, defined in another translation unit.
 * NOTE(review): presumably filled in by the pitched allocations made through
 * DeviceDataset2D and then mirrored into the *_d constants below -- confirm
 * against the definitions. */
extern size_t pitch_J;
extern size_t pitch_sp_num;
extern size_t pitch_react_num;
extern size_t pitch_n_vars;
extern size_t pitch_n_vars2;
extern size_t pitch_seulex_tmp_ptr;
extern size_t pitch_table;

/* Device constant-memory pitch values. The access_* macros in this header
 * read pitch_J_d, pitch_sp_num_d, pitch_n_vars_d, pitch_n_vars2_d,
 * pitch_seulex_tmp_ptr_d and pitch_table_d as strides in pointer arithmetic;
 * pitch_react_num_d is only referenced by a commented-out macro here. */
extern __device__ __constant__ size_t pitch_J_d;
extern __device__ __constant__ size_t pitch_sp_num_d;
extern __device__ __constant__ size_t pitch_react_num_d;
extern __device__ __constant__ size_t pitch_n_vars_d;
extern __device__ __constant__ size_t pitch_n_vars2_d;
extern __device__ __constant__ size_t pitch_seulex_tmp_ptr_d;
extern __device__ __constant__ size_t pitch_table_d;

/* Device constant-memory problem dimensions. sp_num_d and react_num_d are
 * used by the access_* macros as row lengths; size_d is not used in this
 * header.
 * NOTE(review): presumably the cell count, species count and reaction count
 * respectively -- confirm. */
extern __device__ __constant__ size_t size_d;
extern __device__ __constant__ size_t sp_num_d;
extern __device__ __constant__ size_t react_num_d;

/* Device constant-memory tables sized from k_max: k_max+1 entries and
 * (k_max+1)*(k_max+1) entries.
 * NOTE(review): presumably the step-number sequence and extrapolation
 * coefficients of the integrator -- confirm. */
extern __device__ __constant__ size_t n_seq_d[k_max+1];
extern __device__ __constant__ REAL   coeff_d[(k_max+1)*(k_max+1)];

/* Element accessors for the flat work buffers.
 *
 * Each macro expands to an lvalue (or, for the *_atomic forms, to an
 * atomicAdd call) computed with plain pointer arithmetic on ptr, so every
 * offset is counted in elements of the pointed-to type. The macros read
 * sp_num_d, react_num_d and the pitch_*_d values at the point of use.
 *
 * All macro arguments and every lvalue expansion are parenthesised, so
 * compound arguments such as `i + 1` or `c ? a : b` index correctly.
 * Arguments may still be evaluated more than once by future edits; pass
 * side-effect-free expressions.
 *
 * NOTE(review): the pitch_*_d values are used as element strides here, while
 * hipMallocPitch reports its pitch in bytes -- confirm where the conversion
 * takes place.
 */

/** Element (x, y) of block `size`: rows are sp_num_d+2 elements long (x is
 *  the fastest-varying index) and blocks are pitch_J_d elements apart. */
#define access_J(ptr, x, y, size)\
(*((ptr) + (x) + (y)*(sp_num_d+2) + (size)*pitch_J_d))

/** atomicAdd of value onto the element addressed like access_J. */
#define access_J_atomic(ptr, x, y, size, value)\
atomicAdd(((ptr) + (x) + (y)*(sp_num_d+2) + (size)*pitch_J_d), (value))
//__builtin_amdgcn_global_atomic_fadd_f64((ptr + (x) + (y)*(sp_num_d+2) + size*pitch_J_d), value)

/** atomicAdd of value onto the element addressed like access_vars_data. */
#define access_vars_atomic(ptr, x, y, value)\
atomicAdd(((ptr) + (x) + (y)*pitch_n_vars_d), (value))
//__builtin_amdgcn_global_atomic_fadd_f64((ptr + (x) + (y)*pitch_n_vars_d), value)

/** Element y of a one-dimensional buffer. */
#define access_data(ptr, y)\
(*((ptr) + (y)))

/** Element x of row y, rows pitch_n_vars_d elements apart. */
#define access_vars_data(ptr, x, y)\
(*((ptr) + (x) + (y)*pitch_n_vars_d))

/** Element x of row y, rows pitch_n_vars2_d elements apart. */
#define access_vars2_data(ptr, x, y)\
(*((ptr) + (x) + (y)*pitch_n_vars2_d))

/** Element x of row y, rows pitch_sp_num_d elements apart. */
#define access_sp_num_data(ptr, x, y)\
(*((ptr) + (x) + (y)*pitch_sp_num_d))

/* Former pitched variant, kept for reference:
 *   #define access_react_num_data(ptr, x, y)  *(ptr + (x) + (y)*pitch_react_num_d)
 */

/** Element x of row y, rows react_num_d elements apart (densely packed; this
 *  accessor does not use pitch_react_num_d). */
#define access_react_num_data(ptr, x, y)\
(*((ptr) + (x) + (y)*react_num_d))

/* The three accessors below address one shared buffer whose blocks are
 * pitch_seulex_tmp_ptr_d elements apart. With n = sp_num_d+2, access_P starts
 * at offset 0 of a block, access_L at offset n and access_U at offset
 * n*(n+1). */

/** Element x of block y of the shared buffer. */
#define access_P(ptr, x, y)\
(*((ptr) + (x) + (y)*pitch_seulex_tmp_ptr_d))

/** Element (x, y) of the region starting sp_num_d+2 elements into block `size`. */
#define access_L(ptr, x, y, size)\
(*((ptr) + sp_num_d + 2 + (x) + (y)*(sp_num_d+2) + (size)*pitch_seulex_tmp_ptr_d))

/** Element (x, y) of the region starting (sp_num_d+2)*(sp_num_d+3) elements
 *  into block `size`. */
#define access_U(ptr, x, y, size)\
(*((ptr) + (sp_num_d + 2)*(sp_num_d + 3) + (x) + (y)*(sp_num_d+2) + (size)*pitch_seulex_tmp_ptr_d))

/** Element (x, y) of block `size`: rows are sp_num_d+2 elements long and
 *  blocks are pitch_table_d elements apart. */
#define access_table(ptr, x, y, size)\
(*((ptr) + (x) + (y)*(sp_num_d+2) + (size)*pitch_table_d))

/* Host-side globals declared here and defined in another translation unit.
 * NOTE(review): their meaning is not visible in this header; the names suggest
 * an end time (t_end_h), a device array of accumulated time steps (dt_sum_d)
 * and index/count arrays for the cells that are actually integrated
 * (real_num_total, real_index, real_num) -- confirm against the definitions. */
extern REAL t_end_h;

extern REAL *dt_sum_d;

extern int *real_num_total;

extern int *real_index;

extern int *real_num;

/* Number of entries in the Stream_opencc and Event arrays below. */
#define Stream_num 2

/* Streams and events shared across the project. The DeviceDataset and
 * DeviceDataget macros in this header issue their asynchronous copies on
 * Stream_opencc[0]. */
extern hipStream_t Stream_opencc[Stream_num];
extern hipEvent_t  Event[Stream_num];

/** Evaluate a HIP runtime call once and terminate the process on failure.
 *
 *  \param[in] cmd  Expression yielding a hipError_t.
 *
 *  If the result is not hipSuccess, the file, line and hipGetErrorString()
 *  text are printed and exit(EXIT_FAILURE) is called.
 *
 *  The expansion is a plain brace block rather than do { } while (0) because
 *  other macros in this header invoke CUDACHECK(...) without a trailing
 *  semicolon. The status local has a macro-specific name so that cmd may
 *  itself refer to a caller variable called `e` without being captured.
 */
#define CUDACHECK(cmd)                                                                       \
{                                                                                            \
    hipError_t cudacheck_status_ = (cmd);                                                    \
    if (cudacheck_status_ != hipSuccess) {                                                   \
      printf("Failed: Cuda error %s:%d '%s'\n", __FILE__, __LINE__, hipGetErrorString(cudacheck_status_));  \
      exit(EXIT_FAILURE);                                                                    \
    }                                                                                        \
}

/** printf wrapper that, in MPI builds, prints on rank 0 only.
 *
 *  With __USE_MPI__ defined the macro queries MPI_Initialized(); if MPI is
 *  initialised it prints the arguments only on rank 0 of MPI_COMM_WORLD,
 *  otherwise it prints a red warning (and resets the terminal colour).
 *  Without __USE_MPI__ it is exactly printf(__VA_ARGS__).
 *
 *  The MPI variant is a do { } while (0) statement, so it must be followed by
 *  a semicolon -- as the plain printf variant already requires. Its locals
 *  carry macro-specific names so that arguments named `flag` or `rank` at the
 *  call site are not shadowed.
 *
 *  This header does not include an MPI header; the MPI declarations must be
 *  visible wherever the macro is expanded in an MPI build.
 */
#ifdef __USE_MPI__
#define MPI_PRINTF(...)\
do {\
  int mpi_printf_initialized_ = 0;\
  MPI_Initialized(&mpi_printf_initialized_);\
  if (mpi_printf_initialized_) {\
      int mpi_printf_rank_ = 0;\
      MPI_Comm_rank(MPI_COMM_WORLD, &mpi_printf_rank_);\
      if (mpi_printf_rank_ == 0) printf(__VA_ARGS__);\
  } else {\
      printf("\033[31mWARNING!!! MPI IS NOT INITIALIZED. CAN'T USE MPI_PRINTF\033[0m\n");\
  }\
} while (0)
#else
#define MPI_PRINTF(...) printf(__VA_ARGS__)
#endif //__USE_MPI__

/** Allocate host side page lock memory.
 *  \details Forwards the address of ptr_h, the byte count and the call site
 *  (__FUNCTION__, __FILE__, __LINE__) to malloc_Host_(). ptr_h must be an
 *  lvalue because the macro takes its address.
 *
 *  \param[in, out] ptr_h    Host memory pointer. (host)
 *  \param[in]      size     Number of bytes. (host)
 */
#define malloc_Host(ptr_h, size) malloc_Host_((void**)&ptr_h, size,  __FUNCTION__, __FILE__, __LINE__)
/** Backend of malloc_Host(); declared here, defined elsewhere.
 *  \param[out] p        Address of the pointer passed to malloc_Host().
 *  \param[in]  size     Number of bytes.
 *  \param[in]  funname  Calling function name, as supplied by the macro.
 *  \param[in]  file     Calling source file, as supplied by the macro.
 *  \param[in]  line     Calling source line, as supplied by the macro.
 *
 *  NOTE(review): size is an int, so requests above INT_MAX bytes cannot be
 *  represented, while macros in this header pass expressions such as
 *  width*height -- confirm that the sizes in use stay below that limit.
 */
void malloc_Host_(void **p, int size, const char *funname, const char *file, int line);

/** Ensure a destination host buffer exists, then fill it from a source host
 *  buffer.
 *  \details If ptr_d is nullptr it is first allocated with malloc_Host().
 *  Then, if ptr_h is not nullptr, size bytes are copied from ptr_h to ptr_d
 *  with memcpy; both pointers must therefore be host-accessible despite the
 *  _d suffix. Nothing is copied when ptr_h is nullptr.
 *
 *  The expansion is a single brace block (like HostDataget), so
 *  `if (cond) HostDataset(a, b, n)` guards the whole operation. Arguments are
 *  evaluated more than once; pass side-effect-free expressions.
 *
 *  \param[in, out] ptr_d    DES memory pointer; must be an lvalue. (host)
 *  \param[in]      ptr_h    SRC memory pointer. (host)
 *  \param[in]      size     Number of bytes. (host)
 */
#define HostDataset(ptr_d, ptr_h, size)\
{\
    if ((ptr_d) == nullptr) {\
        malloc_Host(ptr_d, (size));\
    }\
    if ((ptr_h) != nullptr) {\
        memcpy((ptr_d), (ptr_h), (size));\
    }\
}

//#define HostDataset(ptr_d, ptr_h, size)\
//if (ptr_d == nullptr) {\
//    malloc_Host(ptr_d, size);\
//}\
//if (ptr_h != nullptr) {\
//    memcpy(ptr_d, ptr_h, size);\
//} else {\
//        MPI_PRINTF("\033[31mWORRY!!! WHEN SETTING HOST, NEITHER SRC MEMORY NOR\
// DES MEMORY IS ALLOCATED.\033[0m\n");\
//}

/** Copy a block of bytes between two host buffers.
 *  \details When both pointers are non-null, size bytes are copied from ptr_d
 *  to ptr_h with memcpy. If either pointer is nullptr nothing is copied and a
 *  warning is emitted through MPI_PRINTF.
 *
 *  \param[in] ptr_d    Src memory pointer. (host)
 *  \param[in] ptr_h    Des memory pointer. (host)
 *  \param[in] size     Number of bytes. (host)
 */
#define HostDataget(ptr_d, ptr_h, size)\
{\
    if (ptr_h == nullptr || ptr_d == nullptr) {\
        MPI_PRINTF("\033[31mWORRY!!! WHEN GETTING HOST DATA, NEITHER SRC SIDE MEMORY NOR\
 DES SIDE MEMORY IS ALLOCATED.\033[0m\n");\
    } else {\
        memcpy(ptr_h, ptr_d, size);\
    }\
}

/** Ensure a device buffer exists, then upload host data into it.
 *  \details If ptr_d is nullptr it is allocated with hipMalloc. Then, if
 *  ptr_h is not nullptr, size bytes are copied host-to-device with
 *  hipMemcpyAsync on Stream_opencc[0]. Every HIP call is wrapped in
 *  CUDACHECK, so a failure terminates the process. Nothing is copied when
 *  ptr_h is nullptr.
 *
 *  The copy is asynchronous: the macro does not synchronise the stream, so
 *  the caller must do so before reusing or freeing the host buffer.
 *
 *  The expansion is a single brace block (like DeviceDataget), so
 *  `if (cond) DeviceDataset(a, b, n)` guards the whole operation. Arguments
 *  are evaluated more than once; pass side-effect-free expressions.
 *
 *  \param[in, out] ptr_d    Device memory pointer; must be an lvalue. (device)
 *  \param[in]      ptr_h    Host memory pointer. (host)
 *  \param[in]      size     Number of bytes. (host)
 */
#define DeviceDataset(ptr_d, ptr_h, size)\
{\
    if ((ptr_d) == nullptr) {\
        CUDACHECK(hipMalloc((void**)&(ptr_d), (size)))\
    }\
    if ((ptr_h) != nullptr) {\
        CUDACHECK(hipMemcpyAsync((ptr_d), (ptr_h), (size), hipMemcpyHostToDevice, Stream_opencc[0]))\
    }\
}

//#define DeviceDataset(ptr_d, ptr_h, size)\
//if (ptr_d == nullptr) {\
//    CUDACHECK(hipMalloc((void**)&ptr_d, size))\
//}\
//if (ptr_h != nullptr) {\
//    CUDACHECK(hipMemcpy(ptr_d, ptr_h, size, hipMemcpyHostToDevice))\
//} else {\
//    MPI_PRINTF("\033[31mWORRY!!! WHEN SETTING DEVICE DATA, HOST SIDE MEMORY\
// IS NOT ALLOCATED.\033[0m\n");\
//}

/** Download a device buffer into host memory.
 *  \details When ptr_d is nullptr nothing is copied and a warning is emitted
 *  through MPI_PRINTF. Otherwise the host buffer is allocated with
 *  malloc_Host() if ptr_h is nullptr, and size bytes are copied
 *  device-to-host with hipMemcpyAsync on Stream_opencc[0] under CUDACHECK.
 *  The macro does not synchronise the stream.
 *
 *  \param[in]      ptr_d    Device memory pointer. (device)
 *  \param[in, out] ptr_h    Host memory pointer. (host)
 *  \param[in]      size     Number of bytes. (host)
 */
#define DeviceDataget(ptr_d, ptr_h, size)\
{\
    if (ptr_d == nullptr) {\
        MPI_PRINTF("\033[31mWORRY!!! WHEN GETTING DEVICE DATA, NEITHER HOST SIDE MEMORY NOR\
 DEVICE SIDE MEMORY IS ALLOCATED.\033[0m\n");\
    } else {\
        if (ptr_h == nullptr) {\
            malloc_Host(ptr_h, size);\
        }\
        CUDACHECK(hipMemcpyAsync(ptr_h, ptr_d, size, hipMemcpyDeviceToHost, Stream_opencc[0]))\
    }\
}

/** Ensure a pitched 2D device buffer exists, then upload host data into it.
 *  \details If ptr_d is nullptr it is allocated with hipMallocPitch, which
 *  writes the row pitch into `pitch`. Then, if ptr_h is not nullptr, the host
 *  data (rows packed `width` bytes apart) is copied with a blocking
 *  hipMemcpy2D using `pitch` as the device row stride. Every HIP call is
 *  wrapped in CUDACHECK. Nothing is copied when ptr_h is nullptr.
 *
 *  When ptr_d is already allocated `pitch` is not modified, so the caller
 *  must pass the pitch obtained when the buffer was created.
 *
 *  The expansion is a single brace block (like DeviceDataget2D), so
 *  `if (cond) DeviceDataset2D(...)` guards the whole operation. Arguments are
 *  evaluated more than once; pass side-effect-free expressions.
 *
 *  \param[in, out] ptr_d    Device memory pointer; must be an lvalue. (device)
 *  \param[in]      ptr_h    Host memory pointer. (host)
 *  \param[in]      width    Row width in bytes. (host)
 *  \param[in]      height   Number of rows. (host)
 *  \param[in, out] pitch    Device row pitch in bytes; must be a size_t lvalue. (host)
 */
#define DeviceDataset2D(ptr_d, ptr_h, width, height, pitch)\
{\
    if ((ptr_d) == nullptr) {\
        CUDACHECK(hipMallocPitch((void**)&(ptr_d), &(pitch), (width), (height)))\
    }\
    if ((ptr_h) != nullptr) {\
        CUDACHECK(hipMemcpy2D((ptr_d), (pitch), (ptr_h), (width), (width), (height), hipMemcpyHostToDevice))\
    }\
}

//#define DeviceDataset2D(ptr_d, ptr_h, width, height, pitch)\
//if (ptr_d == nullptr) {\
//    CUDACHECK(hipMallocPitch((void**)&ptr_d, &pitch, width, height))\
//}\
//if (ptr_h != nullptr) {\
//    CUDACHECK(hipMemcpy2D(ptr_d, pitch, ptr_h, width, width, height, hipMemcpyHostToDevice))\
//} else {\
//    MPI_PRINTF("\033[31mWORRY!!! WHEN SETTING DEVICE 2D DATA, HOST SIDE MEMORY\
// MEMORY IS NOT ALLOCATED.\033[0m\n");\
//}

/** Download a pitched 2D device buffer into packed host memory.
 *  \details When ptr_d is nullptr nothing is copied and a warning is emitted
 *  through MPI_PRINTF. Otherwise the host buffer is allocated with
 *  malloc_Host() ((width)*(height) bytes) if ptr_h is nullptr, and the data
 *  is copied with a blocking hipMemcpy2D under CUDACHECK: device rows are
 *  `pitch` bytes apart, host rows are packed `width` bytes apart.
 *
 *  Arguments are parenthesised so compound expressions such as `n + 2` give
 *  the intended allocation size; they are evaluated more than once, so pass
 *  side-effect-free expressions.
 *
 *  \param[in]      ptr_d    Device memory pointer. (device)
 *  \param[in, out] ptr_h    Host memory pointer; must be an lvalue. (host)
 *  \param[in]      width    Row width in bytes. (host)
 *  \param[in]      height   Number of rows. (host)
 *  \param[in]      pitch    Device row pitch in bytes. (host)
 */
#define DeviceDataget2D(ptr_d, ptr_h, width, height, pitch)\
{\
    if ((ptr_d) == nullptr) {\
        MPI_PRINTF("\033[31mWORRY!!! WHEN GETTING DEVICE 2D DATA, DEVICE SIDE MEMORY IS NOT ALLOCATED.\033[0m\n");\
    } else {\
        if ((ptr_h) == nullptr) {\
            malloc_Host(ptr_h, (width)*(height));\
        }\
        CUDACHECK(hipMemcpy2D((ptr_h), (width), (ptr_d), (pitch), (width), (height), hipMemcpyDeviceToHost))\
    }\
}

/** Configure a 1D launch: set blockdim.x and the matching ceil-divided
 *  griddim.x.
 *
 *  Only the .x members are written; the other members of griddim and
 *  blockdim keep their current values. Arguments are parenthesised so
 *  compound expressions are handled correctly; block_set is evaluated more
 *  than once and must be non-zero.
 *
 *  \param[in]  size        Number of work items along x.
 *  \param[in]  block_set   Threads per block along x. (int)
 *  \param[out] griddim     Object whose .x receives ceil(size / block_set).
 *  \param[out] blockdim    Object whose .x receives block_set.
 */
#define set_block_grid(size, block_set, griddim, blockdim)\
{\
  (blockdim).x = (block_set);\
  (griddim).x = ((size) + (block_set) - 1)/(block_set);\
}

/** Configure a 2D launch: copy block_set.x/.y into blockdim and set the
 *  matching ceil-divided griddim.x/.y.
 *
 *  Only the .x and .y members are written. Arguments are parenthesised so
 *  compound expressions (for example `n << 1` or `*ptr`) are handled
 *  correctly; block_set is evaluated more than once and its .x/.y must be
 *  non-zero.
 *
 *  \param[in]  width       Number of work items along x.
 *  \param[in]  height      Number of work items along y.
 *  \param[in]  block_set   Object with .x and .y threads per block.
 *  \param[out] griddim     Object whose .x/.y receive the block counts.
 *  \param[out] blockdim    Object whose .x/.y receive block_set.x/.y.
 */
#define set_block_grid2d(width, height, block_set, griddim, blockdim)\
{\
  (blockdim).x = (block_set).x;\
  (blockdim).y = (block_set).y;\
  (griddim).x = ((width) + (block_set).x - 1)/(block_set).x;\
  (griddim).y = ((height) + (block_set).y - 1)/(block_set).y;\
}

/** Configure a 3D launch: copy block_set.x/.y/.z into blockdim and set the
 *  matching ceil-divided griddim.x/.y/.z.
 *
 *  Arguments are parenthesised so compound expressions (for example `n << 1`
 *  or `*ptr`) are handled correctly; block_set is evaluated more than once
 *  and its members must be non-zero.
 *
 *  \param[in]  width       Number of work items along x.
 *  \param[in]  height      Number of work items along y.
 *  \param[in]  size        Number of work items along z.
 *  \param[in]  block_set   Object with .x, .y and .z threads per block.
 *  \param[out] griddim     Object whose members receive the block counts.
 *  \param[out] blockdim    Object whose members receive block_set's members.
 */
#define set_block_grid3d(width, height, size, block_set, griddim, blockdim)\
{\
  (blockdim).x = (block_set).x;\
  (blockdim).y = (block_set).y;\
  (blockdim).z = (block_set).z;\
  (griddim).x = ((width) + (block_set).x - 1)/(block_set).x;\
  (griddim).y = ((height) + (block_set).y - 1)/(block_set).y;\
  (griddim).z = ((size) + (block_set).z - 1)/(block_set).z;\
}

/** Initialize device-side memory to a value.
 *  Declared here; defined elsewhere.
 *
 *  \param[in] value     The value to be initialized.
 *  \param[in] ptr       The device pointer to be initialized.
 *  \param[in] width     NOTE(review): presumably the logical row width -- confirm.
 *  \param[in] pitch     NOTE(review): presumably the row stride of ptr; units
 *                       (bytes or elements) are not visible here -- confirm.
 *  \param[in] height    NOTE(review): presumably the number of rows -- confirm.
 *  \param[in] blockset  Block dimensions, passed as a dim3.
 */
void cuda_mem_value_init(REAL value, REAL *ptr, unsigned int width, unsigned int pitch, unsigned int height, dim3 blockset);

/** Declared here; defined elsewhere.
 *  NOTE(review): presumably copies the width x height region of src into ptr
 *  -- confirm the direction and the pitch units against the definition.
 */
void cuda_copy(REAL *src, REAL *ptr, unsigned int width, unsigned int pitch, unsigned int height, dim3 blockset);

/** Declared here; defined elsewhere.
 *  NOTE(review): presumably an element-wise minimum involving src and ptr
 *  -- confirm which argument receives the result.
 */
void cuda_min(REAL *src, REAL *ptr, unsigned int width, unsigned int pitch, unsigned int height, dim3 blockset);

/** Declared here; defined elsewhere.
 *  NOTE(review): presumably an element-wise maximum involving src and ptr
 *  -- confirm which argument receives the result.
 */
void cuda_max(REAL *src, REAL *ptr, unsigned int width, unsigned int pitch, unsigned int height, dim3 blockset);

/** XOR-lane shuffle of a double, built from two 32-bit shuffles.
 *
 *  The macro reinterprets the lvalue val as an int2 and forwards it to
 *  __shfl_xor_double_(), which shuffles the two 32-bit halves separately and
 *  reassembles them into a double. val must be an addressable 64-bit object.
 *
 *  When __NVCC__ is defined, __shfl_xor_sync is used with the full mask
 *  0xffffffff; otherwise __shfl_xor is used.
 *
 *  \param[in] val      Value to exchange, viewed as two 32-bit halves.
 *  \param[in] srcLane  Passed as the second shuffle operand (the XOR lane mask).
 *  \param[in] width    Passed as the shuffle width operand.
 *  \return The double assembled from the two shuffled halves.
 */
#define __shfl_xor_double(val , delta , width) __shfl_xor_double_( *( (int2*)(&val) ) , delta , width)
__device__ __forceinline__ double __shfl_xor_double_(int2 & val , unsigned char srcLane , unsigned char width){
    // Work on a private copy so the caller's value is left untouched.
    int2 halves = val;
#ifdef __NVCC__
    halves.x = __shfl_xor_sync(0xffffffff, halves.x , srcLane , width);
    halves.y = __shfl_xor_sync(0xffffffff, halves.y , srcLane , width);
#else
    halves.x = __shfl_xor(halves.x , srcLane , width);
    halves.y = __shfl_xor(halves.y , srcLane , width);
#endif
    // Reinterpret the two shuffled 32-bit words as one double.
    double *as_double = (double*)(&halves);
    return *as_double;
}

/** Declared here; defined elsewhere.
 *  NOTE(review): presumably reduces the device array P_d into the host value
 *  result_h; the difference between size and size_all is not visible in this
 *  header -- confirm against the definition.
 */
void sum(size_t size, size_t size_all, REAL *P_d, REAL *result_h);

/** Simple wall-clock stopwatch based on gettimeofday().
 *
 *  Call start(), run the code to be timed, then call stop(); the elapsed time
 *  in seconds is then available in time_use. All members are
 *  zero-initialised, so time_use reads 0.0 until stop() has been called.
 *
 *  gettimeofday() reports wall-clock time, so the result can be affected by
 *  system clock adjustments made between start() and stop().
 */
struct my_timer_opencc
{
    struct timeval start_time{}, end_time{};  ///< Timestamps taken by start() and stop().
    double time_use = 0.0;                    ///< Seconds between the last start() and stop().

    /** Record the start timestamp. */
    void start()
    {
        gettimeofday(&start_time, NULL);
    }

    /** Record the end timestamp and store the elapsed seconds in time_use. */
    void stop()
    {
        gettimeofday(&end_time, NULL);
        // Whole seconds and the microsecond remainder are combined in double
        // precision; the microsecond difference may be negative.
        const double micro_part = (double)(end_time.tv_usec - start_time.tv_usec)/1000000.0;
        time_use = (end_time.tv_sec - start_time.tv_sec) + micro_part;
    }
};

/** Declared here; defined elsewhere.
 *  NOTE(review): presumably selects the cells that need integration from the
 *  *_origin arrays, packs their T, P and Y values, and reports the selection
 *  through real_index_ref, real_num_ref and real_num_total_ref -- confirm the
 *  direction of each argument against the definition.
 */
void get_id_index(int size, int sp_num, REAL *T, REAL *T_origin, REAL *P, REAL *P_origin, REAL *Y, REAL *Y_origin, int *real_index_ref, int *real_num_ref, int *real_num_total_ref);

/** Declared here; defined elsewhere.
 *  NOTE(review): presumably the inverse of get_id_index for the Y array,
 *  using real_index_ref and real_num_ref to map packed entries back into
 *  Y_origin -- confirm against the definition.
 */
void reconstructY(int size, int sp_num, REAL *Y, REAL *Y_origin, int *real_index_ref, int *real_num_ref);

#endif //_COMMON_H
