// this file can either be read and passed to an OpenCL compiler directly,
// or included in a C++11 source file as a string literal
#ifndef __OPENCL_VERSION__
// If we are including this file in C++,
// the entire source file following (except the last #endif) will become
// a raw string literal. The extra ")" is just for mathcing parentheses
// to make the editor happy. The extra ")" and extra endif will be skipped.
// DO NOT add anything between here and the next #ifdef, otherwise you need
// to modify the skip count at the end of this file.
R""()
#endif

#ifndef _HISTOGRAM_256_KERNEL_
#define _HISTOGRAM_256_KERNEL_

#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable

// use double precision or not
#ifndef USE_DP_FLOAT
#define USE_DP_FLOAT 0
#endif
// ignore hessian, and use the local memory for hessian as an additional bank for gradient
#ifndef CONST_HESSIAN
#define CONST_HESSIAN 0
#endif

#define LOCAL_SIZE_0 256
#define NUM_BINS 256
#if USE_DP_FLOAT == 1
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
typedef double acc_type;
typedef ulong acc_int_type;
#define as_acc_type as_double
#define as_acc_int_type as_ulong
#else
typedef float acc_type;
typedef uint acc_int_type;
#define as_acc_type as_float
#define as_acc_int_type as_uint
#endif
#define LOCAL_MEM_SIZE (4 * (sizeof(uint) + 2 * sizeof(acc_type)) * NUM_BINS)

// unroll the atomic operation for a few times. Takes more code space, 
// but compiler can generate better code for faster atomics.
#define UNROLL_ATOMIC 1

// Options passed by compiler at run time:
// IGNORE_INDICES will be set when the kernel does not 
// #define IGNORE_INDICES
// #define POWER_FEATURE_WORKGROUPS 10

// detect Nvidia platforms
#ifdef cl_nv_pragma_unroll
#define NVIDIA 1
#endif

// use all features and do not use feature mask
#ifndef ENABLE_ALL_FEATURES
#define ENABLE_ALL_FEATURES 1
#endif

// use binary patching for AMD GCN 1.2 or newer
#ifndef AMD_USE_DS_ADD_F32
#define AMD_USE_DS_ADD_F32 0
#endif

typedef uint data_size_t;
typedef float score_t;


#define ATOMIC_FADD_SUB1 { \
    expected.f_val = current.f_val; \
    next.f_val = expected.f_val + val; \
    current.u_val = atom_cmpxchg((volatile __local acc_int_type *)addr, expected.u_val, next.u_val); \
    if (current.u_val == expected.u_val) \
        goto end; \
    }
#define ATOMIC_FADD_SUB2  ATOMIC_FADD_SUB1 \
                          ATOMIC_FADD_SUB1
#define ATOMIC_FADD_SUB4  ATOMIC_FADD_SUB2 \
                          ATOMIC_FADD_SUB2
#define ATOMIC_FADD_SUB8  ATOMIC_FADD_SUB4 \
                          ATOMIC_FADD_SUB4
#define ATOMIC_FADD_SUB16 ATOMIC_FADD_SUB8 \
                          ATOMIC_FADD_SUB8
#define ATOMIC_FADD_SUB32 ATOMIC_FADD_SUB16\
                          ATOMIC_FADD_SUB16
#define ATOMIC_FADD_SUB64 ATOMIC_FADD_SUB32\
                          ATOMIC_FADD_SUB32


// atomic add for float number in local memory
inline void atomic_local_add_f(__local acc_type *addr, const float val)
{
    union{
        acc_int_type u_val;
        acc_type f_val;
    } next, expected, current;
#if (NVIDIA == 1 && USE_DP_FLOAT == 0)
    float res = 0;
    asm volatile ("atom.shared.add.f32 %0, [%1], %2;" : "=f"(res) : "l"(addr), "f"(val));
#elif (AMD_USE_DS_ADD_F32 == 1 && USE_DP_FLAT == 0)
    // this instruction (DS_AND_U32) will be patched into a DS_ADD_F32
    // we need to hack here because DS_ADD_F32 is not exposed via OpenCL
    atom_and((__local acc_int_type *)addr, as_acc_int_type(val));
#else
    current.f_val = *addr;
    #if UNROLL_ATOMIC == 1
    // provide a fast path
    // then do the complete loop
    // this should work on all devices
    ATOMIC_FADD_SUB8
    ATOMIC_FADD_SUB4
    ATOMIC_FADD_SUB2
    #endif
    do {
        expected.f_val = current.f_val;
        next.f_val = expected.f_val + val;
        current.u_val = atom_cmpxchg((volatile __local acc_int_type *)addr, expected.u_val, next.u_val);
    } while (current.u_val != expected.u_val);
    end:
        ;
#endif
}

/* Makes MSVC happy with long string literal
)""
R""()
*/
// this function will be called by histogram256
// we have one sub-histogram of one feature in local memory, and need to read others
void within_kernel_reduction256x4(uchar4 feature_mask,
                           __global const acc_type* restrict feature4_sub_hist, 
                           const uint skip_id,
                           const uint old_val_f0_cont_bin0,
                           const ushort num_sub_hist,
                           __global acc_type* restrict output_buf,
                           __local acc_type* restrict local_hist) {
    const ushort ltid = get_local_id(0);
    const ushort lsize = LOCAL_SIZE_0;
    // initialize register counters from our local memory
    // TODO: try to avoid bank conflict here
    acc_type f0_grad_bin = local_hist[ltid * 8];
    acc_type f1_grad_bin = local_hist[ltid * 8 + 1];
    acc_type f2_grad_bin = local_hist[ltid * 8 + 2];
    acc_type f3_grad_bin = local_hist[ltid * 8 + 3];
    acc_type f0_hess_bin = local_hist[ltid * 8 + 4];
    acc_type f1_hess_bin = local_hist[ltid * 8 + 5];
    acc_type f2_hess_bin = local_hist[ltid * 8 + 6];
    acc_type f3_hess_bin = local_hist[ltid * 8 + 7];
    __local uint* restrict local_cnt = (__local uint *)(local_hist + 4 * 2 * NUM_BINS);
    #if POWER_FEATURE_WORKGROUPS != 0
    uint  f0_cont_bin = ltid ? local_cnt[ltid * 4] : old_val_f0_cont_bin0;
    #else
    uint  f0_cont_bin = local_cnt[ltid * 4];
    #endif
    uint  f1_cont_bin = local_cnt[ltid * 4 + 1];
    uint  f2_cont_bin = local_cnt[ltid * 4 + 2];
    uint  f3_cont_bin = local_cnt[ltid * 4 + 3];
    ushort i;
    // printf("%d-pre(skip %d): %f %f %f %f %f %f %f %f %d %d %d %d", ltid, skip_id, f0_grad_bin, f1_grad_bin, f2_grad_bin, f3_grad_bin, f0_hess_bin, f1_hess_bin, f2_hess_bin, f3_hess_bin, f0_cont_bin, f1_cont_bin, f2_cont_bin, f3_cont_bin);
#if POWER_FEATURE_WORKGROUPS != 0
    // add all sub-histograms for 4 features
    __global const acc_type* restrict p = feature4_sub_hist + ltid;
    for (i = 0; i < skip_id; ++i) {
        if (feature_mask.s3) {
            f0_grad_bin += *p;          p += NUM_BINS;
            f0_hess_bin += *p;          p += NUM_BINS;
            f0_cont_bin += as_acc_int_type(*p); p += NUM_BINS;
        }
        else {
            p += 3 * NUM_BINS;
        }
        if (feature_mask.s2) {
            f1_grad_bin += *p;          p += NUM_BINS;
            f1_hess_bin += *p;          p += NUM_BINS;
            f1_cont_bin += as_acc_int_type(*p); p += NUM_BINS;
        }
        else {
            p += 3 * NUM_BINS;
        }
        if (feature_mask.s1) {
            f2_grad_bin += *p;          p += NUM_BINS;
            f2_hess_bin += *p;          p += NUM_BINS;
            f2_cont_bin += as_acc_int_type(*p); p += NUM_BINS;
        }
        else {
            p += 3 * NUM_BINS;
        }
        if (feature_mask.s0) {
            f3_grad_bin += *p;          p += NUM_BINS;
            f3_hess_bin += *p;          p += NUM_BINS;
            f3_cont_bin += as_acc_int_type(*p); p += NUM_BINS;
        }
        else {
            p += 3 * NUM_BINS;
        }
    }
    // skip the counters we already have
    p += 3 * 4 * NUM_BINS;
    for (i = i + 1; i < num_sub_hist; ++i) {
        if (feature_mask.s3) {
            f0_grad_bin += *p;          p += NUM_BINS;
            f0_hess_bin += *p;          p += NUM_BINS;
            f0_cont_bin += as_acc_int_type(*p); p += NUM_BINS;
        }
        else {
            p += 3 * NUM_BINS;
        }
        if (feature_mask.s2) {
            f1_grad_bin += *p;          p += NUM_BINS;
            f1_hess_bin += *p;          p += NUM_BINS;
            f1_cont_bin += as_acc_int_type(*p); p += NUM_BINS;
        }
        else {
            p += 3 * NUM_BINS;
        }
        if (feature_mask.s1) {
            f2_grad_bin += *p;          p += NUM_BINS;
            f2_hess_bin += *p;          p += NUM_BINS;
            f2_cont_bin += as_acc_int_type(*p); p += NUM_BINS;
        }
        else {
            p += 3 * NUM_BINS;
        }
        if (feature_mask.s0) {
            f3_grad_bin += *p;          p += NUM_BINS;
            f3_hess_bin += *p;          p += NUM_BINS;
            f3_cont_bin += as_acc_int_type(*p); p += NUM_BINS;
        }
        else {
            p += 3 * NUM_BINS;
        }
    }
    // printf("%d-aft: %f %f %f %f %f %f %f %f %d %d %d %d", ltid, f0_grad_bin, f1_grad_bin, f2_grad_bin, f3_grad_bin, f0_hess_bin, f1_hess_bin, f2_hess_bin, f3_hess_bin, f0_cont_bin, f1_cont_bin, f2_cont_bin, f3_cont_bin);
    #endif
    // now overwrite the local_hist for final reduction and output
    barrier(CLK_LOCAL_MEM_FENCE);
    #if USE_DP_FLOAT == 0
    // reverse the f3...f0 order to match the real order
    local_hist[0 * 3 * NUM_BINS + ltid * 3 + 0] = f3_grad_bin;
    local_hist[0 * 3 * NUM_BINS + ltid * 3 + 1] = f3_hess_bin;
    local_hist[0 * 3 * NUM_BINS + ltid * 3 + 2] = as_acc_type((acc_int_type)f3_cont_bin);
    local_hist[1 * 3 * NUM_BINS + ltid * 3 + 0] = f2_grad_bin;
    local_hist[1 * 3 * NUM_BINS + ltid * 3 + 1] = f2_hess_bin;
    local_hist[1 * 3 * NUM_BINS + ltid * 3 + 2] = as_acc_type((acc_int_type)f2_cont_bin);
    local_hist[2 * 3 * NUM_BINS + ltid * 3 + 0] = f1_grad_bin;
    local_hist[2 * 3 * NUM_BINS + ltid * 3 + 1] = f1_hess_bin;
    local_hist[2 * 3 * NUM_BINS + ltid * 3 + 2] = as_acc_type((acc_int_type)f1_cont_bin);
    local_hist[3 * 3 * NUM_BINS + ltid * 3 + 0] = f0_grad_bin;
    local_hist[3 * 3 * NUM_BINS + ltid * 3 + 1] = f0_hess_bin;
    local_hist[3 * 3 * NUM_BINS + ltid * 3 + 2] = as_acc_type((acc_int_type)f0_cont_bin);
    barrier(CLK_LOCAL_MEM_FENCE);
    /*
    for (ushort i = ltid; i < 4 * 3 * NUM_BINS; i += lsize) {
        output_buf[i] = local_hist[i];
    }
    */
    i = ltid;
    if (feature_mask.s0) {
        output_buf[i] = local_hist[i];
        output_buf[i + NUM_BINS] = local_hist[i + NUM_BINS];
        output_buf[i + 2 * NUM_BINS] = local_hist[i + 2 * NUM_BINS];
    }
    i += 1 * 3 * NUM_BINS;
    if (feature_mask.s1) {
        output_buf[i] = local_hist[i];
        output_buf[i + NUM_BINS] = local_hist[i + NUM_BINS];
        output_buf[i + 2 * NUM_BINS] = local_hist[i + 2 * NUM_BINS];
    }
    i += 1 * 3 * NUM_BINS;
    if (feature_mask.s2) {
        output_buf[i] = local_hist[i];
        output_buf[i + NUM_BINS] = local_hist[i + NUM_BINS];
        output_buf[i + 2 * NUM_BINS] = local_hist[i + 2 * NUM_BINS];
    }
    i += 1 * 3 * NUM_BINS;
    if (feature_mask.s3 && i < 4 * 3 * NUM_BINS) {
        output_buf[i] = local_hist[i];
        output_buf[i + NUM_BINS] = local_hist[i + NUM_BINS];
        output_buf[i + 2 * NUM_BINS] = local_hist[i + 2 * NUM_BINS];
    }
    #else
    // when double precision is used, we need to write twice, because local memory size is not enough
    local_hist[0 * 3 * NUM_BINS + ltid * 3 + 0] = f3_grad_bin;
    local_hist[0 * 3 * NUM_BINS + ltid * 3 + 1] = f3_hess_bin;
    local_hist[0 * 3 * NUM_BINS + ltid * 3 + 2] = as_acc_type((acc_int_type)f3_cont_bin);
    local_hist[1 * 3 * NUM_BINS + ltid * 3 + 0] = f2_grad_bin;
    local_hist[1 * 3 * NUM_BINS + ltid * 3 + 1] = f2_hess_bin;
    local_hist[1 * 3 * NUM_BINS + ltid * 3 + 2] = as_acc_type((acc_int_type)f2_cont_bin);
    barrier(CLK_LOCAL_MEM_FENCE);
    /*
    for (ushort i = ltid; i < 2 * 3 * NUM_BINS; i += lsize) {
        output_buf[i] = local_hist[i];
    }
    */
    i = ltid;
    if (feature_mask.s0) {
        output_buf[i] = local_hist[i];
        output_buf[i + NUM_BINS] = local_hist[i + NUM_BINS];
        output_buf[i + 2 * NUM_BINS] = local_hist[i + 2 * NUM_BINS];
    }
    i += 1 * 3 * NUM_BINS;
    if (feature_mask.s1) {
        output_buf[i] = local_hist[i];
        output_buf[i + NUM_BINS] = local_hist[i + NUM_BINS];
        output_buf[i + 2 * NUM_BINS] = local_hist[i + 2 * NUM_BINS];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    local_hist[0 * 3 * NUM_BINS + ltid * 3 + 0] = f1_grad_bin;
    local_hist[0 * 3 * NUM_BINS + ltid * 3 + 1] = f1_hess_bin;
    local_hist[0 * 3 * NUM_BINS + ltid * 3 + 2] = as_acc_type((acc_int_type)f1_cont_bin);
    local_hist[1 * 3 * NUM_BINS + ltid * 3 + 0] = f0_grad_bin;
    local_hist[1 * 3 * NUM_BINS + ltid * 3 + 1] = f0_hess_bin;
    local_hist[1 * 3 * NUM_BINS + ltid * 3 + 2] = as_acc_type((acc_int_type)f0_cont_bin);
    barrier(CLK_LOCAL_MEM_FENCE);
    /*
    for (ushort i = ltid; i < 2 * 3 * NUM_BINS; i += lsize) {
        output_buf[i + 2 * 3 * NUM_BINS] = local_hist[i];
    }
    */
    i = ltid;
    if (feature_mask.s2) {
        output_buf[i + 2 * 3 * NUM_BINS] = local_hist[i];
        output_buf[i + 2 * 3 * NUM_BINS + NUM_BINS] = local_hist[i + NUM_BINS];
        output_buf[i + 2 * 3 * NUM_BINS + 2 * NUM_BINS] = local_hist[i + 2 * NUM_BINS];
    }
    i += 1 * 3 * NUM_BINS;
    if (feature_mask.s3) {
        output_buf[i + 2 * 3 * NUM_BINS] = local_hist[i];
        output_buf[i + 2 * 3 * NUM_BINS + NUM_BINS] = local_hist[i + NUM_BINS];
        output_buf[i + 2 * 3 * NUM_BINS + 2 * NUM_BINS] = local_hist[i + 2 * NUM_BINS];
    }
    #endif
}

/* Makes MSVC happy with long string literal
)""
R""()
*/
__attribute__((reqd_work_group_size(LOCAL_SIZE_0, 1, 1)))
#if USE_CONSTANT_BUF == 1
__kernel void histogram256(__global const uchar4* restrict feature_data_base, 
                      __constant const uchar4* restrict feature_masks __attribute__((max_constant_size(65536))),
                      const data_size_t feature_size,
                      __constant const data_size_t* restrict data_indices __attribute__((max_constant_size(65536))), 
                      const data_size_t num_data, 
                      __constant const score_t* restrict ordered_gradients __attribute__((max_constant_size(65536))), 
#if CONST_HESSIAN == 0
                      __constant const score_t* restrict ordered_hessians __attribute__((max_constant_size(65536))),
#else
                      const score_t const_hessian,
#endif
                      __global char* restrict output_buf,
                      __global volatile int * sync_counters,
                      __global acc_type* restrict hist_buf_base) {
#else
__kernel void histogram256(__global const uchar4* feature_data_base, 
                      __constant const uchar4* restrict feature_masks __attribute__((max_constant_size(65536))),
                      const data_size_t feature_size,
                      __global const data_size_t* data_indices, 
                      const data_size_t num_data, 
                      __global const score_t*  ordered_gradients, 
#if CONST_HESSIAN == 0
                      __global const score_t*  ordered_hessians,
#else
                      const score_t const_hessian,
#endif
                      __global char* restrict output_buf, 
                      __global volatile int * sync_counters,
                      __global acc_type* restrict hist_buf_base) {
#endif
    // allocate the local memory array aligned with float2, to guarantee correct alignment on NVIDIA platforms
    // otherwise a "Misaligned Address" exception may occur
    __local float2 shared_array[LOCAL_MEM_SIZE/sizeof(float2)];
    const uint gtid = get_global_id(0);
    const uint gsize = get_global_size(0);
    const ushort ltid = get_local_id(0);
    const ushort lsize = LOCAL_SIZE_0; // get_local_size(0);
    const ushort group_id = get_group_id(0);

    // local memory per workgroup is 12 KB
    // clear local memory
    __local uint * ptr = (__local uint *) shared_array;
    for (int i = ltid; i < LOCAL_MEM_SIZE/sizeof(uint); i += lsize) {
        ptr[i] = 0;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    // gradient/hessian histograms
    // assume this starts at 32 * 4 = 128-byte boundary
    // total size: 2 * 4 * 256 * size_of(float) = 8 KB
    // organization: each feature/grad/hessian is at a different bank, 
    //               as indepedent of the feature value as possible
    __local acc_type * gh_hist = (__local acc_type *)shared_array;
    // counter histogram
    // total size: 4 * 256 * size_of(uint) = 4 KB
    __local uint * cnt_hist = (__local uint *)(gh_hist + 2 * 4 * NUM_BINS);

    // thread 0, 1, 2, 3 compute histograms for gradients first
    // thread 4, 5, 6, 7 compute histograms for hessians  first
    // etc.
    uchar is_hessian_first = (ltid >> 2) & 1;
    
    ushort group_feature = group_id >> POWER_FEATURE_WORKGROUPS;
    // each 2^POWER_FEATURE_WORKGROUPS workgroups process on one feature (compile-time constant)
    // feature_size is the number of examples per feature
    __global const uchar4* feature_data = feature_data_base + group_feature * feature_size;
    // size of threads that process this feature4
    const uint subglobal_size = lsize * (1 << POWER_FEATURE_WORKGROUPS);
    // equavalent thread ID in this subgroup for this feature4
    const uint subglobal_tid  = gtid - group_feature * subglobal_size;
    // extract feature mask, when a byte is set to 0, that feature is disabled
    #if ENABLE_ALL_FEATURES == 1
    // hopefully the compiler will propogate the constants and eliminate all branches
    uchar4 feature_mask = (uchar4)(0xff, 0xff, 0xff, 0xff);
    #else
    uchar4 feature_mask = feature_masks[group_feature];
    #endif
    // exit if all features are masked
    if (!as_uint(feature_mask)) {
        return;
    }

    // STAGE 1: read feature data, and gradient and hessian
    // first half of the threads read feature data from global memory
    // 4 features stored in a tuple MSB...(0, 1, 2, 3)...LSB
    // We will prefetch data into the "next" variable at the beginning of each iteration
    uchar4 feature4;
    uchar4 feature4_next;
    uchar4 feature4_prev;
    // offset used to rotate feature4 vector
    ushort offset = (ltid & 0x3);
    // store gradient and hessian
    float stat1, stat2;
    float stat1_next, stat2_next;
    ushort bin, addr, addr2;
    data_size_t ind;
    data_size_t ind_next;
    stat1 = ordered_gradients[subglobal_tid];
    #if CONST_HESSIAN == 0
    stat2 = ordered_hessians[subglobal_tid];
    #endif
    #ifdef IGNORE_INDICES
    ind = subglobal_tid;
    #else
    ind = data_indices[subglobal_tid];
    #endif
    feature4 = feature_data[ind];
    feature4_prev = feature4;
    feature4_prev = as_uchar4(rotate(as_uint(feature4_prev), (uint)offset*8));
    #if ENABLE_ALL_FEATURES == 0
    // rotate feature_mask to match the feature order of each thread
    feature_mask = as_uchar4(rotate(as_uint(feature_mask), (uint)offset*8));
    #endif
    acc_type s3_stat1 = 0.0f, s3_stat2 = 0.0f;
    acc_type s2_stat1 = 0.0f, s2_stat2 = 0.0f;
    acc_type s1_stat1 = 0.0f, s1_stat2 = 0.0f;
    acc_type s0_stat1 = 0.0f, s0_stat2 = 0.0f;


/* Makes MSVC happy with long string literal
)""
R""()
*/
    // there are 2^POWER_FEATURE_WORKGROUPS workgroups processing each feature4
    for (uint i = subglobal_tid; i < num_data; i += subglobal_size) {
        // prefetch the next iteration variables
        // we don't need bondary check because we have made the buffer larger
        stat1_next = ordered_gradients[i + subglobal_size];
        #if CONST_HESSIAN == 0
        stat2_next = ordered_hessians[i + subglobal_size];
        #endif
        #ifdef IGNORE_INDICES
        // we need to check to bounds here
        ind_next = i + subglobal_size < num_data ? i + subglobal_size : i;
        // start load next feature as early as possible
        feature4_next = feature_data[ind_next];
        #else
        ind_next = data_indices[i + subglobal_size];
        #endif
        #if CONST_HESSIAN == 0
        // swap gradient and hessian for threads 4, 5, 6, 7
        float tmp = stat1;
        stat1 = is_hessian_first ? stat2 : stat1;
        stat2 = is_hessian_first ? tmp   : stat2;
        // stat1 = select(stat1, stat2, is_hessian_first);
        // stat2 = select(stat2, tmp, is_hessian_first);
        #endif

        // STAGE 2: accumulate gradient and hessian
        offset = (ltid & 0x3);
        feature4 = as_uchar4(rotate(as_uint(feature4), (uint)offset*8));
        bin = feature4.s3;
        if ((bin != feature4_prev.s3) && feature_mask.s3) {
            // printf("%3d (%4d): writing s3 %d %d offset %d", ltid, i, bin, feature4_prev.s3, offset);
            bin = feature4_prev.s3;
            feature4_prev.s3 = feature4.s3;
            addr = bin * 8 + is_hessian_first * 4 + offset;
            addr2 = addr + 4 - 8 * is_hessian_first;
            atomic_local_add_f(gh_hist + addr, s3_stat1);
            // thread 0, 1, 2, 3 now process feature 0, 1, 2, 3's gradients for example 0, 1, 2, 3
            // thread 4, 5, 6, 7 now process feature 0, 1, 2, 3's hessians  for example 4, 5, 6, 7
            #if CONST_HESSIAN == 0
            atomic_local_add_f(gh_hist + addr2, s3_stat2);
            #endif
            // thread 0, 1, 2, 3 now process feature 0, 1, 2, 3's hessians  for example 0, 1, 2, 3
            // thread 4, 5, 6, 7 now process feature 0, 1, 2, 3's gradients for example 4, 5, 6, 7
            s3_stat1 = stat1;
            s3_stat2 = stat2;
        }
        else {
            // printf("%3d (%4d): acc s3 %d", ltid, i, bin);
            s3_stat1 += stat1;
            s3_stat2 += stat2;
        }

        bin = feature4.s2;
        offset = (offset + 1) & 0x3;
        if ((bin != feature4_prev.s2) && feature_mask.s2) {
            // printf("%3d (%4d): writing s2 %d %d feature %d", ltid, i, bin, feature4_prev.s2, offset);
            bin = feature4_prev.s2;
            feature4_prev.s2 = feature4.s2;
            addr = bin * 8 + is_hessian_first * 4 + offset;
            addr2 = addr + 4 - 8 * is_hessian_first;
            atomic_local_add_f(gh_hist + addr, s2_stat1);
            // thread 0, 1, 2, 3 now process feature 1, 2, 3, 0's gradients for example 0, 1, 2, 3
            // thread 4, 5, 6, 7 now process feature 1, 2, 3, 0's hessians  for example 4, 5, 6, 7
            #if CONST_HESSIAN == 0
            atomic_local_add_f(gh_hist + addr2, s2_stat2);
            #endif
            // thread 0, 1, 2, 3 now process feature 1, 2, 3, 0's hessians  for example 0, 1, 2, 3
            // thread 4, 5, 6, 7 now process feature 1, 2, 3, 0's gradients for example 4, 5, 6, 7
            s2_stat1 = stat1;
            s2_stat2 = stat2;
        }
        else {
            // printf("%3d (%4d): acc s2 %d", ltid, i, bin);
            s2_stat1 += stat1;
            s2_stat2 += stat2;
        }


        // prefetch the next iteration variables
        // we don't need bondary check because if it is out of boundary, ind_next = 0
        #ifndef IGNORE_INDICES
        feature4_next = feature_data[ind_next];
        #endif

        bin = feature4.s1;
        offset = (offset + 1) & 0x3;
        if ((bin != feature4_prev.s1) && feature_mask.s1) {
            // printf("%3d (%4d): writing s1 %d %d feature %d", ltid, i, bin, feature4_prev.s1, offset);
            bin = feature4_prev.s1;
            feature4_prev.s1 = feature4.s1;
            addr = bin * 8 + is_hessian_first * 4 + offset;
            addr2 = addr + 4 - 8 * is_hessian_first;
            atomic_local_add_f(gh_hist + addr, s1_stat1);
            // thread 0, 1, 2, 3 now process feature 2, 3, 0, 1's gradients for example 0, 1, 2, 3
            // thread 4, 5, 6, 7 now process feature 2, 3, 0, 1's hessians  for example 4, 5, 6, 7
            #if CONST_HESSIAN == 0
            atomic_local_add_f(gh_hist + addr2, s1_stat2);
            #endif
            // thread 0, 1, 2, 3 now process feature 2, 3, 0, 1's hessians  for example 0, 1, 2, 3
            // thread 4, 5, 6, 7 now process feature 2, 3, 0, 1's gradients for example 4, 5, 6, 7
            s1_stat1 = stat1;
            s1_stat2 = stat2;
        }
        else {
            // printf("%3d (%4d): acc s1 %d", ltid, i, bin);
            s1_stat1 += stat1;
            s1_stat2 += stat2;
        }

        bin = feature4.s0;
        offset = (offset + 1) & 0x3;
        if ((bin != feature4_prev.s0) && feature_mask.s0) {
            // printf("%3d (%4d): writing s0 %d %d feature %d", ltid, i, bin, feature4_prev.s0, offset);
            bin = feature4_prev.s0;
            feature4_prev.s0 = feature4.s0;
            addr = bin * 8 + is_hessian_first * 4 + offset;
            addr2 = addr + 4 - 8 * is_hessian_first;
            atomic_local_add_f(gh_hist + addr, s0_stat1);
            // thread 0, 1, 2, 3 now process feature 3, 0, 1, 2's gradients for example 0, 1, 2, 3
            // thread 4, 5, 6, 7 now process feature 3, 0, 1, 2's hessians  for example 4, 5, 6, 7
            #if CONST_HESSIAN == 0
            atomic_local_add_f(gh_hist + addr2, s0_stat2);
            #endif
            // thread 0, 1, 2, 3 now process feature 3, 0, 1, 2's hessians  for example 0, 1, 2, 3
            // thread 4, 5, 6, 7 now process feature 3, 0, 1, 2's gradients for example 4, 5, 6, 7
            s0_stat1 = stat1;
            s0_stat2 = stat2;
        }
        else {
            // printf("%3d (%4d): acc s0 %d", ltid, i, bin);
            s0_stat1 += stat1;
            s0_stat2 += stat2;
        }

        // STAGE 3: accumulate counter
        // there are 4 counters for 4 features
        // thread 0, 1, 2, 3 now process feature 0, 1, 2, 3's counts for example 0, 1, 2, 3
        offset = (ltid & 0x3);
        if (feature_mask.s3) {
            bin = feature4.s3;
            addr = bin * 4 + offset;
            atom_inc(cnt_hist + addr);
        }
        // thread 0, 1, 2, 3 now process feature 1, 2, 3, 0's counts for example 0, 1, 2, 3
        offset = (offset + 1) & 0x3;
        if (feature_mask.s2) {
            bin = feature4.s2;
            addr = bin * 4 + offset;
            atom_inc(cnt_hist + addr);
        }
        // thread 0, 1, 2, 3 now process feature 2, 3, 0, 1's counts for example 0, 1, 2, 3
        offset = (offset + 1) & 0x3;
        if (feature_mask.s1) {
            bin = feature4.s1;
            addr = bin * 4 + offset;
            atom_inc(cnt_hist + addr);
        }
        // thread 0, 1, 2, 3 now process feature 3, 0, 1, 2's counts for example 0, 1, 2, 3
        offset = (offset + 1) & 0x3;
        if (feature_mask.s0) {
            bin = feature4.s0;
            addr = bin * 4 + offset;
            atom_inc(cnt_hist + addr);
        }
        stat1 = stat1_next;
        stat2 = stat2_next;
        feature4 = feature4_next;
    }

    bin = feature4_prev.s3;
    offset = (ltid & 0x3);
    addr = bin * 8 + is_hessian_first * 4 + offset;
    addr2 = addr + 4 - 8 * is_hessian_first;
    atomic_local_add_f(gh_hist + addr, s3_stat1);
    #if CONST_HESSIAN == 0
    atomic_local_add_f(gh_hist + addr2, s3_stat2);
    #endif

    bin = feature4_prev.s2;
    offset = (offset + 1) & 0x3;
    addr = bin * 8 + is_hessian_first * 4 + offset;
    addr2 = addr + 4 - 8 * is_hessian_first;
    atomic_local_add_f(gh_hist + addr, s2_stat1);
    #if CONST_HESSIAN == 0
    atomic_local_add_f(gh_hist + addr2, s2_stat2);
    #endif

    bin = feature4_prev.s1;
    offset = (offset + 1) & 0x3;
    addr = bin * 8 + is_hessian_first * 4 + offset;
    addr2 = addr + 4 - 8 * is_hessian_first;
    atomic_local_add_f(gh_hist + addr, s1_stat1);
    #if CONST_HESSIAN == 0
    atomic_local_add_f(gh_hist + addr2, s1_stat2);
    #endif

    bin = feature4_prev.s0;
    offset = (offset + 1) & 0x3;
    addr = bin * 8 + is_hessian_first * 4 + offset;
    addr2 = addr + 4 - 8 * is_hessian_first;
    atomic_local_add_f(gh_hist + addr, s0_stat1);
    #if CONST_HESSIAN == 0
    atomic_local_add_f(gh_hist + addr2, s0_stat2);
    #endif
    barrier(CLK_LOCAL_MEM_FENCE);

/* Makes MSVC happy with long string literal
)""
R""()
*/
    #if ENABLE_ALL_FEATURES == 0
    // restore feature_mask
    feature_mask = feature_masks[group_feature];
    #endif

    #if CONST_HESSIAN == 1
    barrier(CLK_LOCAL_MEM_FENCE);
    // make a final reduction
    offset = ltid & 0x3; // helps avoid LDS bank conflicts
    gh_hist[ltid * 8 + offset] += gh_hist[ltid * 8 + offset + 4];
    gh_hist[ltid * 8 + offset + 4] = const_hessian * cnt_hist[ltid * 4 + offset];
    offset = (offset + 1) & 0x3;
    gh_hist[ltid * 8 + offset] += gh_hist[ltid * 8 + offset + 4];
    gh_hist[ltid * 8 + offset + 4] = const_hessian * cnt_hist[ltid * 4 + offset];
    offset = (offset + 1) & 0x3;
    gh_hist[ltid * 8 + offset] += gh_hist[ltid * 8 + offset + 4];
    gh_hist[ltid * 8 + offset + 4] = const_hessian * cnt_hist[ltid * 4 + offset];
    offset = (offset + 1) & 0x3;
    gh_hist[ltid * 8 + offset] += gh_hist[ltid * 8 + offset + 4];
    gh_hist[ltid * 8 + offset + 4] = const_hessian * cnt_hist[ltid * 4 + offset];
    barrier(CLK_LOCAL_MEM_FENCE);
    #endif

    // write to output
    // write gradients and hessians histogram for all 4 features
    /* memory layout in gh_hist (total 2 * 4 * 256 * sizeof(float) = 8 KB):
       -----------------------------------------------------------------------------------------------
       g_f0_bin0   g_f1_bin0   g_f2_bin0   g_f3_bin0   h_f0_bin0   h_f1_bin0   h_f2_bin0   h_f3_bin0
       g_f0_bin1   g_f1_bin1   g_f2_bin1   g_f3_bin1   h_f0_bin1   h_f1_bin1   h_f2_bin1   h_f3_bin1
       ...
       g_f0_bin255 g_f1_bin255 g_f2_bin255 g_f3_bin255 h_f0_bin255 h_f1_bin255 h_f2_bin255 h_f3_bin255
       -----------------------------------------------------------------------------------------------
    */
    /* memory layout in cnt_hist (total 4 * 256 * sizeof(uint) = 4 KB):
       -----------------------------------------------
       c_f0_bin0   c_f1_bin0   c_f2_bin0   c_f3_bin0
       c_f0_bin1   c_f1_bin1   c_f2_bin1   c_f3_bin1
       ...
       c_f0_bin255 c_f1_bin255 c_f2_bin255 c_f3_bin255
       -----------------------------------------------
    */
    // output data in linear order for further reduction
    // output size = 4 (features) * 3 (counters) * 256 (bins) * sizeof(float)
    /* memory layout of output:
       --------------------------------------------
       g_f0_bin0   g_f0_bin1   ...   g_f0_bin255   \
       h_f0_bin0   h_f0_bin1   ...   h_f0_bin255    |
       c_f0_bin0   c_f0_bin1   ...   c_f0_bin255    |
       g_f1_bin0   g_f1_bin1   ...   g_f1_bin255    |
       h_f1_bin0   h_f1_bin1   ...   h_f1_bin255    |
       c_f1_bin0   c_f1_bin1   ...   c_f1_bin255    |--- 1 sub-histogram block
       g_f2_bin0   g_f2_bin1   ...   g_f2_bin255    |
       h_f2_bin0   h_f2_bin1   ...   h_f2_bin255    |
       c_f2_bin0   c_f2_bin1   ...   c_f2_bin255    |
       g_f3_bin0   g_f3_bin1   ...   g_f3_bin255    |
       h_f3_bin0   h_f3_bin1   ...   h_f3_bin255    |
       c_f3_bin0   c_f3_bin1   ...   c_f3_bin255   /
       --------------------------------------------
    */
    uint feature4_id = (group_id >> POWER_FEATURE_WORKGROUPS);
    // if there is only one workgroup processing this feature4, don't even need to write
    #if POWER_FEATURE_WORKGROUPS != 0
    __global acc_type * restrict output = (__global acc_type * restrict)output_buf + group_id * 4 * 3 * NUM_BINS;
    // write gradients and hessians
    __global acc_type * restrict ptr_f = output;
    for (ushort j = 0; j < 4; ++j) {
        for (ushort i = ltid; i < 2 * NUM_BINS; i += lsize) {
            // even threads read gradients, odd threads read hessians
            // FIXME: 2-way bank conflict
            acc_type value = gh_hist[i * 4 + j];
            ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value;
        }
        ptr_f += 3 * NUM_BINS;
    }
    // write counts
    __global acc_int_type * restrict ptr_i = (__global acc_int_type * restrict)(output + 2 * NUM_BINS);
    for (ushort j = 0; j < 4; ++j) {
        for (ushort i = ltid; i < NUM_BINS; i += lsize) {
            // FIXME: 2-way bank conflict
            uint value = cnt_hist[i * 4 + j];
            ptr_i[i] = value;
        }
        ptr_i += 3 * NUM_BINS;
    }
    barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
    mem_fence(CLK_GLOBAL_MEM_FENCE);
    // To avoid the cost of an extra reducting kernel, we have to deal with some 
    // gray area in OpenCL. We want the last work group that process this feature to
    // make the final reduction, and other threads will just quit.
    // This requires that the results written by other workgroups available to the
    // last workgroup (memory consistency)
    #if NVIDIA == 1
    // this is equavalent to CUDA __threadfence();
    // ensure the writes above goes to main memory and other workgroups can see it
    asm volatile("{\n\tmembar.gl;\n\t}\n\t" :::"memory");
    #else
    // FIXME: how to do the above on AMD GPUs??
    // GCN ISA says that the all writes will bypass L1 cache (write through),
    // however when the last thread is reading sub-histogram data we have to
    // make sure that no part of data is modified in local L1 cache of other workgroups.
    // Otherwise reading can be a problem (atomic operations to get consistency).
    // But in our case, the sub-histogram of this workgroup cannot be in the cache
    // of another workgroup, so the following trick will work just fine.
    #endif
    // Now, we want one workgroup to do the final reduction.
    // Other workgroups processing the same feature quit.
    // The is done by using an global atomic counter.
    // On AMD GPUs ideally this should be done in GDS,
    // but currently there is no easy way to access it via OpenCL.
    __local uint * counter_val = cnt_hist;
    // backup the old value
    uint old_val = *counter_val;
    if (ltid == 0) {
        // all workgroups processing the same feature add this counter
        *counter_val = atom_inc(sync_counters + feature4_id);
    }
    // make sure everyone in this workgroup is here
    barrier(CLK_LOCAL_MEM_FENCE);
    // everyone in this wrokgroup: if we are the last workgroup, then do reduction!
    if (*counter_val == (1 << POWER_FEATURE_WORKGROUPS) - 1) {
        if (ltid == 0) {
            // printf("workgroup %d: %g %g %g %g %g %g %g %g\n", group_id, gh_hist[0], gh_hist[1], gh_hist[2], gh_hist[3], gh_hist[4], gh_hist[5], gh_hist[6], gh_hist[7]);
            // printf("feature_data[0] = %d %d %d %d", feature_data[0].s0, feature_data[0].s1, feature_data[0].s2, feature_data[0].s3);
            // clear the sync counter for using it next time
            sync_counters[feature4_id] = 0;
        }
    #else
    // only 1 work group, no need to increase counter
    // the reduction will become a simple copy
    if (1) {
        uint old_val; // dummy
    #endif
        // locate our feature4's block in output memory
        uint output_offset = (feature4_id << POWER_FEATURE_WORKGROUPS);
        __global acc_type const * restrict feature4_subhists = 
                 (__global acc_type *)output_buf + output_offset * 4 * 3 * NUM_BINS;
        // skip reading the data already in local memory
        uint skip_id = group_id ^ output_offset;
        // locate output histogram location for this feature4
        __global acc_type* restrict hist_buf = hist_buf_base + feature4_id * 4 * 3 * NUM_BINS;
        within_kernel_reduction256x4(feature_mask, feature4_subhists, skip_id, old_val, 1 << POWER_FEATURE_WORKGROUPS, 
                                     hist_buf, (__local acc_type *)shared_array);
        // if (ltid == 0) 
        //    printf("workgroup %d reduction done, %g %g %g %g %g %g %g %g\n", group_id, hist_buf[0], hist_buf[3*NUM_BINS], hist_buf[2*3*NUM_BINS], hist_buf[3*3*NUM_BINS], hist_buf[1], hist_buf[3*NUM_BINS+1], hist_buf[2*3*NUM_BINS+1], hist_buf[3*3*NUM_BINS+1]);
    }
}

// The following line ends the string literal, adds an extra #endif at the end
// the +9 skips extra characters ")", newline, "#endif" and newline at the beginning
// )"" "\n#endif" + 9
#endif