/*!
 * Copyright (c) 2017 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 *
 * \brief This file can either be read and passed to an OpenCL compiler directly,
 *        or included in a C++11 source file as a string literal.
 */
#ifndef __OPENCL_VERSION__
// If we are including this file in C++,
// the entire source file following (except the last #endif) will become
// a raw string literal. The extra ")" is just for mathcing parentheses
// to make the editor happy. The extra ")" and extra endif will be skipped.
// DO NOT add anything between here and the next #ifdef, otherwise you need
// to modify the skip count at the end of this file.
R""()
#endif

#ifndef _HISTOGRAM_64_KERNEL_
#define _HISTOGRAM_64_KERNEL_

#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable

// Configurable options:
// NUM_BANKS should be a power of 2
#ifndef NUM_BANKS
#define NUM_BANKS 4
#endif
// how many bits in thread ID represent the bank = log2(NUM_BANKS)
#ifndef BANK_BITS
#define BANK_BITS 2
#endif
// use double precision or not
#ifndef USE_DP_FLOAT
#define USE_DP_FLOAT 0
#endif
// ignore hessian, and use the local memory for hessian as an additional bank for gradient
#ifndef CONST_HESSIAN
#define CONST_HESSIAN 0
#endif
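// These options can be overridden when the program is built, e.g. (host side,
// with illustrative handle names):
//   clBuildProgram(program, 1, &device, "-D NUM_BANKS=8 -D BANK_BITS=3 -D USE_DP_FLOAT=1", NULL, NULL);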


#define LOCAL_SIZE_0 256
#define NUM_BINS 64
// if USE_DP_FLOAT is set to 1, we will use double precision for the accumulator
#if USE_DP_FLOAT == 1
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
typedef double acc_type;
typedef ulong acc_int_type;
#define as_acc_type as_double
#define as_acc_int_type as_ulong
#else
typedef float acc_type;
typedef uint acc_int_type;
#define as_acc_type as_float
#define as_acc_int_type as_uint
#endif
// mask for getting the bank ID
#define BANK_MASK (NUM_BANKS - 1)
// 4 features, each has a gradient and a hessian
#define HG_BIN_MULT (NUM_BANKS * 4 * 2)
// 4 features, each has a counter
#define CNT_BIN_MULT (NUM_BANKS * 4)
// local memory size in bytes
#define LOCAL_MEM_SIZE (4 * (sizeof(uint) + 2 * sizeof(acc_type)) * NUM_BINS * NUM_BANKS)
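// with the defaults above (4 banks, 64 bins, 4-byte float accumulators):
// 4 * (4 + 2 * 4) * 64 * 4 = 12288 bytes = 12 KB per workgroup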

// unroll the atomic operation a few times. This takes more code space,
// but the compiler can generate better code for faster atomics.
#define UNROLL_ATOMIC 1

// Options passed by the compiler at run time:
// IGNORE_INDICES will be set when the kernel does not use the data_indices
// array (i.e., all data rows are processed in order)
// #define IGNORE_INDICES
// #define POWER_FEATURE_WORKGROUPS 10

// use all features and do not use feature mask
#ifndef ENABLE_ALL_FEATURES
#define ENABLE_ALL_FEATURES 1
#endif

// detect Nvidia platforms
#ifdef cl_nv_pragma_unroll
#define NVIDIA 1
#endif

// use binary patching for AMD GCN 1.2 or newer
#ifndef AMD_USE_DS_ADD_F32
#define AMD_USE_DS_ADD_F32 0
#endif

typedef uint data_size_t;
typedef float score_t;

#define ATOMIC_FADD_SUB1 { \
    expected.f_val = current.f_val; \
    next.f_val = expected.f_val + val; \
    current.u_val = atom_cmpxchg((volatile __local acc_int_type *)addr, expected.u_val, next.u_val); \
    if (current.u_val == expected.u_val) \
        goto end; \
    }
#define ATOMIC_FADD_SUB2  ATOMIC_FADD_SUB1 \
                          ATOMIC_FADD_SUB1
#define ATOMIC_FADD_SUB4  ATOMIC_FADD_SUB2 \
                          ATOMIC_FADD_SUB2
#define ATOMIC_FADD_SUB8  ATOMIC_FADD_SUB4 \
                          ATOMIC_FADD_SUB4
#define ATOMIC_FADD_SUB16 ATOMIC_FADD_SUB8 \
                          ATOMIC_FADD_SUB8
#define ATOMIC_FADD_SUB32 ATOMIC_FADD_SUB16\
                          ATOMIC_FADD_SUB16
#define ATOMIC_FADD_SUB64 ATOMIC_FADD_SUB32\
                          ATOMIC_FADD_SUB32
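// Each ATOMIC_FADD_SUBn expands to n inlined compare-and-swap attempts;
// atomic_local_add_f below issues 8 + 4 + 2 of them as a fast path before
// falling back to the guaranteed do/while CAS loop.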


// atomic add for float number in local memory
inline void atomic_local_add_f(__local acc_type *addr, const float val)
{
    union{
        acc_int_type u_val;
        acc_type f_val;
    } next, expected, current;
#if (NVIDIA == 1 && USE_DP_FLOAT == 0)
    float res = 0;
    asm volatile ("atom.shared.add.f32 %0, [%1], %2;" : "=f"(res) : "l"(addr), "f"(val));
#elif (AMD_USE_DS_ADD_F32 == 1 && USE_DP_FLOAT == 0)
    // this instruction (DS_AND_U32) will be patched into a DS_ADD_F32
    // we need to hack here because DS_ADD_F32 is not exposed via OpenCL
    atom_and((__local acc_int_type *)addr, as_acc_int_type(val));
#else
    current.f_val = *addr;
    #if UNROLL_ATOMIC == 1
    // provide a fast path
    // then do the complete loop
    // this should work on all devices
    ATOMIC_FADD_SUB8
    ATOMIC_FADD_SUB4
    ATOMIC_FADD_SUB2
    #endif
    do {
        expected.f_val = current.f_val;
        next.f_val = expected.f_val + val;
        current.u_val = atom_cmpxchg((volatile __local acc_int_type *)addr, expected.u_val, next.u_val);
    } while (current.u_val != expected.u_val);
    end:
        ;
#endif
}


/* Makes MSVC happy with long string literal
)""
R""()
*/
// this function will be called by histogram64
// we have one sub-histogram of one feature in registers, and need to read others
void within_kernel_reduction64x4(uchar4 feature_mask,
                           __global const acc_type* restrict feature4_sub_hist, 
                           const uint skip_id,
                           acc_type g_val, acc_type h_val,
                           const ushort num_sub_hist,
                           __global acc_type* restrict output_buf,
                           __local acc_type * restrict local_hist) {
    const ushort ltid = get_local_id(0); // range 0 - 255
    const ushort lsize = LOCAL_SIZE_0;
    ushort feature_id = ltid & 3; // range 0 - 3
    const ushort bin_id = ltid >> 2; // range 0 - 63
    ushort i;
    #if POWER_FEATURE_WORKGROUPS != 0 
    // if there is only 1 work group, no need to do the reduction
    // add all sub-histograms for 4 features
    __global const acc_type* restrict p = feature4_sub_hist + ltid;
    for (i = 0; i < skip_id; ++i) {
            g_val += *p;            p += NUM_BINS * 4; // 256 threads working on 4 features' 64 bins
            h_val += *p;            p += NUM_BINS * 4;
    }
    // skip the sub-histogram we already hold in registers
    p += 2 * 4 * NUM_BINS;
    for (i = i + 1; i < num_sub_hist; ++i) {
            g_val += *p;            p += NUM_BINS * 4;
            h_val += *p;            p += NUM_BINS * 4;
    }
    #endif
    // printf("thread %d: g_val=%f, h_val=%f cnt=%d", ltid, g_val, h_val, cnt_val);
    // now overwrite the local_hist for final reduction and output
    // reverse the f3...f0 order to match the real order
    feature_id = 3 - feature_id;
    local_hist[feature_id * 2 * NUM_BINS + bin_id * 2 + 0] = g_val;
    local_hist[feature_id * 2 * NUM_BINS + bin_id * 2 + 1] = h_val;
    barrier(CLK_LOCAL_MEM_FENCE);
    i = ltid;
    if (feature_mask.s0 && i < 1 * 2 * NUM_BINS) {
        output_buf[i] = local_hist[i];
    }
    i += 1 * 2 * NUM_BINS;
    if (feature_mask.s1 && i < 2 * 2 * NUM_BINS) {
        output_buf[i] = local_hist[i];
    }
    i += 1 * 2 * NUM_BINS;
    if (feature_mask.s2 && i < 3 * 2 * NUM_BINS) {
        output_buf[i] = local_hist[i];
    }
    i += 1 * 2 * NUM_BINS;
    if (feature_mask.s3 && i < 4 * 2 * NUM_BINS) {
        output_buf[i] = local_hist[i];
    }
}

/* Makes MSVC happy with long string literal
)""
R""()
*/
__attribute__((reqd_work_group_size(LOCAL_SIZE_0, 1, 1)))
#if USE_CONSTANT_BUF == 1
__kernel void histogram64(__global const uchar4* restrict feature_data_base, 
                      __constant const uchar4* restrict feature_masks __attribute__((max_constant_size(65536))),
                      const data_size_t feature_size,
                      __constant const data_size_t* restrict data_indices __attribute__((max_constant_size(65536))), 
                      const data_size_t num_data, 
                      __constant const score_t* restrict ordered_gradients __attribute__((max_constant_size(65536))), 
#if CONST_HESSIAN == 0
                      __constant const score_t* restrict ordered_hessians __attribute__((max_constant_size(65536))),
#else
                      const score_t const_hessian,
#endif
                      __global char* restrict output_buf,
                      __global volatile int * sync_counters,
                      __global acc_type* restrict hist_buf_base) {
#else
__kernel void histogram64(__global const uchar4* feature_data_base, 
                      __constant const uchar4* restrict feature_masks __attribute__((max_constant_size(65536))),
                      const data_size_t feature_size,
                      __global const data_size_t* data_indices, 
                      const data_size_t num_data, 
                      __global const score_t*  ordered_gradients, 
#if CONST_HESSIAN == 0
                      __global const score_t*  ordered_hessians,
#else
                      const score_t const_hessian,
#endif
                      __global char* restrict output_buf, 
                      __global volatile int * sync_counters,
                      __global acc_type* restrict hist_buf_base) {
#endif
    // allocate the local memory array aligned with float2, to guarantee correct alignment on NVIDIA platforms
    // otherwise a "Misaligned Address" exception may occur
    __local float2 shared_array[LOCAL_MEM_SIZE/sizeof(float2)];
    const uint gtid = get_global_id(0);
    const uint gsize = get_global_size(0);
    const ushort ltid = get_local_id(0);
    const ushort lsize = LOCAL_SIZE_0; // get_local_size(0);
    const ushort group_id = get_group_id(0);

    // local memory per workgroup is 12 KB
    // clear local memory
    __local uint * ptr = (__local uint *) shared_array;
    for (int i = ltid; i < LOCAL_MEM_SIZE/sizeof(uint); i += lsize) {
        ptr[i] = 0;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    // gradient/hessian histograms
    // assume this starts at a 32 * 4 = 128-byte boundary
    // each bank: 2 * 4 * 64 * sizeof(float) = 2 KB
    // there are 4 banks (sub-histograms) used by 256 threads, 8 KB in total
    /* memory layout of gh_hist:
       -----------------------------------------------------------------------------------------------
       bk0_g_f0_bin0   bk0_g_f1_bin0   bk0_g_f2_bin0   bk0_g_f3_bin0   bk0_h_f0_bin0   bk0_h_f1_bin0   bk0_h_f2_bin0   bk0_h_f3_bin0
       bk1_g_f0_bin0   bk1_g_f1_bin0   bk1_g_f2_bin0   bk1_g_f3_bin0   bk1_h_f0_bin0   bk1_h_f1_bin0   bk1_h_f2_bin0   bk1_h_f3_bin0
       bk2_g_f0_bin0   bk2_g_f1_bin0   bk2_g_f2_bin0   bk2_g_f3_bin0   bk2_h_f0_bin0   bk2_h_f1_bin0   bk2_h_f2_bin0   bk2_h_f3_bin0
       bk3_g_f0_bin0   bk3_g_f1_bin0   bk3_g_f2_bin0   bk3_g_f3_bin0   bk3_h_f0_bin0   bk3_h_f1_bin0   bk3_h_f2_bin0   bk3_h_f3_bin0
       bk0_g_f0_bin1   bk0_g_f1_bin1   bk0_g_f2_bin1   bk0_g_f3_bin1   bk0_h_f0_bin1   bk0_h_f1_bin1   bk0_h_f2_bin1   bk0_h_f3_bin1
       bk1_g_f0_bin1   bk1_g_f1_bin1   bk1_g_f2_bin1   bk1_g_f3_bin1   bk1_h_f0_bin1   bk1_h_f1_bin1   bk1_h_f2_bin1   bk1_h_f3_bin1
       bk2_g_f0_bin1   bk2_g_f1_bin1   bk2_g_f2_bin1   bk2_g_f3_bin1   bk2_h_f0_bin1   bk2_h_f1_bin1   bk2_h_f2_bin1   bk2_h_f3_bin1
       bk3_g_f0_bin1   bk3_g_f1_bin1   bk3_g_f2_bin1   bk3_g_f3_bin1   bk3_h_f0_bin1   bk3_h_f1_bin1   bk3_h_f2_bin1   bk3_h_f3_bin1
       ...
       bk0_g_f0_bin63 bk0_g_f1_bin63 bk0_g_f2_bin63 bk0_g_f3_bin63 bk0_h_f0_bin63 bk0_h_f1_bin63 bk0_h_f2_bin63 bk0_h_f3_bin63
       bk1_g_f0_bin63 bk1_g_f1_bin63 bk1_g_f2_bin63 bk1_g_f3_bin63 bk1_h_f0_bin63 bk1_h_f1_bin63 bk1_h_f2_bin63 bk1_h_f3_bin63
       bk2_g_f0_bin63 bk2_g_f1_bin63 bk2_g_f2_bin63 bk2_g_f3_bin63 bk2_h_f0_bin63 bk2_h_f1_bin63 bk2_h_f2_bin63 bk2_h_f3_bin63
       bk3_g_f0_bin63 bk3_g_f1_bin63 bk3_g_f2_bin63 bk3_g_f3_bin63 bk3_h_f0_bin63 bk3_h_f1_bin63 bk3_h_f2_bin63 bk3_h_f3_bin63
       -----------------------------------------------------------------------------------------------
    */
    // with this organization, the LDS/shared memory bank is independent of the bin value
    // all threads within a quarter-wavefront (half-warp) will not have any bank conflict
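    // flat index of a gh_hist entry:
    //   bin * HG_BIN_MULT + bank * 8 + (is_hessian ? 4 : 0) + feature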

    __local acc_type * gh_hist = (__local acc_type *)shared_array;
    // counter histogram
    // each bank: 4 * 64 * sizeof(uint) = 1 KB
    // there are 4 banks used by 256 threads, 4 KB in total
    /* memory layout in cnt_hist:
       -----------------------------------------------
       bk0_c_f0_bin0   bk0_c_f1_bin0   bk0_c_f2_bin0   bk0_c_f3_bin0
       bk1_c_f0_bin0   bk1_c_f1_bin0   bk1_c_f2_bin0   bk1_c_f3_bin0
       bk2_c_f0_bin0   bk2_c_f1_bin0   bk2_c_f2_bin0   bk2_c_f3_bin0
       bk3_c_f0_bin0   bk3_c_f1_bin0   bk3_c_f2_bin0   bk3_c_f3_bin0
       bk0_c_f0_bin1   bk0_c_f1_bin1   bk0_c_f2_bin1   bk0_c_f3_bin1
       bk1_c_f0_bin1   bk1_c_f1_bin1   bk1_c_f2_bin1   bk1_c_f3_bin1
       bk2_c_f0_bin1   bk2_c_f1_bin1   bk2_c_f2_bin1   bk2_c_f3_bin1
       bk3_c_f0_bin1   bk3_c_f1_bin1   bk3_c_f2_bin1   bk3_c_f3_bin1
       ...
       bk0_c_f0_bin63 bk0_c_f1_bin63 bk0_c_f2_bin63 bk0_c_f3_bin63
       bk1_c_f0_bin63 bk1_c_f1_bin63 bk1_c_f2_bin63 bk1_c_f3_bin63
       bk2_c_f0_bin63 bk2_c_f1_bin63 bk2_c_f2_bin63 bk2_c_f3_bin63
       bk3_c_f0_bin63 bk3_c_f1_bin63 bk3_c_f2_bin63 bk3_c_f3_bin63
       -----------------------------------------------
    */
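    // flat index of a cnt_hist entry:
    //   bin * CNT_BIN_MULT + bank * 4 + feature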
    #if CONST_HESSIAN == 1
    __local uint * cnt_hist = (__local uint *)(gh_hist + 2 * 4 * NUM_BINS * NUM_BANKS);
    #endif

    // thread 0, 1, 2, 3 compute histograms for gradients first
    // thread 4, 5, 6, 7 compute histograms for hessians  first
    // etc.
    uchar is_hessian_first = (ltid >> 2) & 1;
    // thread 0-7 write result to bank0, 8-15 to bank1, 16-23 to bank2, 24-31 to bank3
    ushort bank = (ltid >> 3) & BANK_MASK;
    
    ushort group_feature = group_id >> POWER_FEATURE_WORKGROUPS;
    // every 2^POWER_FEATURE_WORKGROUPS workgroups process one feature4 (POWER_FEATURE_WORKGROUPS is a compile-time constant)
    // feature_size is the number of examples per feature
    __global const uchar4* feature_data = feature_data_base + group_feature * feature_size;
    // number of threads that process this feature4
    const uint subglobal_size = lsize * (1 << POWER_FEATURE_WORKGROUPS);
    // equivalent thread ID in this subgroup for this feature4
    const uint subglobal_tid  = gtid - group_feature * subglobal_size;
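    // for example, with POWER_FEATURE_WORKGROUPS = 2 and lsize = 256, every
    // 4 workgroups (1024 threads) cooperate on one feature4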
    // extract feature mask, when a byte is set to 0, that feature is disabled
    #if ENABLE_ALL_FEATURES == 1
    // hopefully the compiler will propagate the constants and eliminate all branches
    uchar4 feature_mask = (uchar4)(0xff, 0xff, 0xff, 0xff);
    #else
    uchar4 feature_mask = feature_masks[group_feature];
    #endif
    // exit if all features are masked
    if (!as_uint(feature_mask)) {
        return;
    }

    // STAGE 1: read feature data, and gradient and hessian
    // first half of the threads read feature data from global memory
    // 4 features stored in a tuple MSB...(0, 1, 2, 3)...LSB
    // We will prefetch data into the "next" variable at the beginning of each iteration
    uchar4 feature4;
    uchar4 feature4_next;
    uchar4 feature4_prev;
    // offset used to rotate feature4 vector
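    // (rotating by offset * 8 bits staggers which feature each thread handles
    // first, so the 4 threads of a quad touch 4 different feature slots at any
    // given step)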
    ushort offset = (ltid & 0x3);
    // store gradient and hessian
    float stat1, stat2;
    float stat1_next, stat2_next;
    ushort bin, addr, addr2;
    data_size_t ind;
    data_size_t ind_next;
    stat1 = ordered_gradients[subglobal_tid];
    #if CONST_HESSIAN == 0
    stat2 = ordered_hessians[subglobal_tid];
    #endif
    #ifdef IGNORE_INDICES
    ind = subglobal_tid;
    #else
    ind = data_indices[subglobal_tid];
    #endif
    feature4 = feature_data[ind];
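    // keep only the low 6 bits of each byte (0x3f = 63), since bin values range over 0..NUM_BINS-1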
    feature4 = as_uchar4(as_uint(feature4) & 0x3f3f3f3f);
    feature4_prev = feature4;
    feature4_prev = as_uchar4(rotate(as_uint(feature4_prev), (uint)offset*8));
    #if ENABLE_ALL_FEATURES == 0
    // rotate feature_mask to match the feature order of each thread
    feature_mask = as_uchar4(rotate(as_uint(feature_mask), (uint)offset*8));
    #endif
    acc_type s3_stat1 = 0.0f, s3_stat2 = 0.0f;
    acc_type s2_stat1 = 0.0f, s2_stat2 = 0.0f;
    acc_type s1_stat1 = 0.0f, s1_stat2 = 0.0f;
    acc_type s0_stat1 = 0.0f, s0_stat2 = 0.0f;

/* Makes MSVC happy with long string literal
)""
R""()
*/
    // there are 2^POWER_FEATURE_WORKGROUPS workgroups processing each feature4
    for (uint i = subglobal_tid; i < num_data; i += subglobal_size) {
        // prefetch the next iteration variables
        // we don't need a boundary check because we have made the buffer larger
        stat1_next = ordered_gradients[i + subglobal_size];
        #if CONST_HESSIAN == 0
        stat2_next = ordered_hessians[i + subglobal_size];
        #endif
        #ifdef IGNORE_INDICES
        // we need to check the bounds here
        ind_next = i + subglobal_size < num_data ? i + subglobal_size : i;
        // start load next feature as early as possible
        feature4_next = feature_data[ind_next];
        #else
        ind_next = data_indices[i + subglobal_size];
        #endif
        #if CONST_HESSIAN == 0
        // swap gradient and hessian for threads 4, 5, 6, 7
        float tmp = stat1;
        stat1 = is_hessian_first ? stat2 : stat1;
        stat2 = is_hessian_first ? tmp   : stat2;
        // stat1 = select(stat1, stat2, is_hessian_first);
        // stat2 = select(stat2, tmp, is_hessian_first);
        #endif

        // STAGE 2: accumulate gradient and hessian
        offset = (ltid & 0x3);
        feature4 = as_uchar4(rotate(as_uint(feature4), (uint)offset*8));
        bin = feature4.s3;
        if ((bin != feature4_prev.s3) && feature_mask.s3) {
            // printf("%3d (%4d): writing s3 %d %d offset %d", ltid, i, bin, feature4_prev.s3, offset);
            bin = feature4_prev.s3;
            feature4_prev.s3 = feature4.s3;
            addr = bin * HG_BIN_MULT + bank * 8 + is_hessian_first * 4 + offset;
            addr2 = addr + 4 - 8 * is_hessian_first;
            // thread 0, 1, 2, 3 now process feature 0, 1, 2, 3's gradients for example 0, 1, 2, 3
            // thread 4, 5, 6, 7 now process feature 0, 1, 2, 3's hessians  for example 4, 5, 6, 7
            atomic_local_add_f(gh_hist + addr, s3_stat1);
            // thread 0, 1, 2, 3 now process feature 0, 1, 2, 3's hessians  for example 0, 1, 2, 3
            // thread 4, 5, 6, 7 now process feature 0, 1, 2, 3's gradients for example 4, 5, 6, 7
            #if CONST_HESSIAN == 0
            atomic_local_add_f(gh_hist + addr2, s3_stat2);
            #endif
            s3_stat1 = stat1;
            s3_stat2 = stat2;
        }
        else {
            // printf("%3d (%4d): acc s3 %d", ltid, i, bin);
            s3_stat1 += stat1;
            s3_stat2 += stat2;
        }

        bin = feature4.s2;
        offset = (offset + 1) & 0x3;
        if ((bin != feature4_prev.s2) && feature_mask.s2) {
            // printf("%3d (%4d): writing s2 %d %d feature %d", ltid, i, bin, feature4_prev.s2, offset);
            bin = feature4_prev.s2;
            feature4_prev.s2 = feature4.s2;
            addr = bin * HG_BIN_MULT + bank * 8 + is_hessian_first * 4 + offset;
            addr2 = addr + 4 - 8 * is_hessian_first;
            // thread 0, 1, 2, 3 now process feature 1, 2, 3, 0's gradients for example 0, 1, 2, 3
            // thread 4, 5, 6, 7 now process feature 1, 2, 3, 0's hessians  for example 4, 5, 6, 7
            atomic_local_add_f(gh_hist + addr, s2_stat1);
            // thread 0, 1, 2, 3 now process feature 1, 2, 3, 0's hessians  for example 0, 1, 2, 3
            // thread 4, 5, 6, 7 now process feature 1, 2, 3, 0's gradients for example 4, 5, 6, 7
            #if CONST_HESSIAN == 0
            atomic_local_add_f(gh_hist + addr2, s2_stat2);
            #endif
            s2_stat1 = stat1;
            s2_stat2 = stat2;
        }
        else {
            // printf("%3d (%4d): acc s2 %d", ltid, i, bin);
            s2_stat1 += stat1;
            s2_stat2 += stat2;
        }


        // prefetch the next iteration variables
        // we don't need a boundary check because if it is out of bounds, ind_next = 0
        #ifndef IGNORE_INDICES
        feature4_next = feature_data[ind_next];
        #endif

        bin = feature4.s1 & 0x3f;
        offset = (offset + 1) & 0x3;
        if ((bin != feature4_prev.s1) && feature_mask.s1) {
            // printf("%3d (%4d): writing s1 %d %d feature %d", ltid, i, bin, feature4_prev.s1, offset);
            bin = feature4_prev.s1;
            feature4_prev.s1 = feature4.s1;
            addr = bin * HG_BIN_MULT + bank * 8 + is_hessian_first * 4 + offset;
            addr2 = addr + 4 - 8 * is_hessian_first;
            // thread 0, 1, 2, 3 now process feature 2, 3, 0, 1's gradients for example 0, 1, 2, 3
            // thread 4, 5, 6, 7 now process feature 2, 3, 0, 1's hessians  for example 4, 5, 6, 7
            atomic_local_add_f(gh_hist + addr, s1_stat1);
            // thread 0, 1, 2, 3 now process feature 2, 3, 0, 1's hessians  for example 0, 1, 2, 3
            // thread 4, 5, 6, 7 now process feature 2, 3, 0, 1's gradients for example 4, 5, 6, 7
            #if CONST_HESSIAN == 0
            atomic_local_add_f(gh_hist + addr2, s1_stat2);
            #endif
            s1_stat1 = stat1;
            s1_stat2 = stat2;
        }
        else {
            // printf("%3d (%4d): acc s1 %d", ltid, i, bin);
            s1_stat1 += stat1;
            s1_stat2 += stat2;
        }

        bin = feature4.s0;
        offset = (offset + 1) & 0x3;
        if ((bin != feature4_prev.s0) && feature_mask.s0) {
            // printf("%3d (%4d): writing s0 %d %d feature %d", ltid, i, bin, feature4_prev.s0, offset);
            bin = feature4_prev.s0;
            feature4_prev.s0 = feature4.s0;
            addr = bin * HG_BIN_MULT + bank * 8 + is_hessian_first * 4 + offset;
            addr2 = addr + 4 - 8 * is_hessian_first;
            // thread 0, 1, 2, 3 now process feature 3, 0, 1, 2's gradients for example 0, 1, 2, 3
            // thread 4, 5, 6, 7 now process feature 3, 0, 1, 2's hessians  for example 4, 5, 6, 7
            atomic_local_add_f(gh_hist + addr, s0_stat1);
            // thread 0, 1, 2, 3 now process feature 3, 0, 1, 2's hessians  for example 0, 1, 2, 3
            // thread 4, 5, 6, 7 now process feature 3, 0, 1, 2's gradients for example 4, 5, 6, 7
            #if CONST_HESSIAN == 0
            atomic_local_add_f(gh_hist + addr2, s0_stat2);
            #endif
            s0_stat1 = stat1;
            s0_stat2 = stat2;
        }
        else {
            // printf("%3d (%4d): acc s0 %d", ltid, i, bin);
            s0_stat1 += stat1;
            s0_stat2 += stat2;
        }
        #if CONST_HESSIAN == 1
        // STAGE 3: accumulate counter
        // there are 4 counters for 4 features
        // thread 0, 1, 2, 3 now process feature 0, 1, 2, 3's counts for example 0, 1, 2, 3
        offset = (ltid & 0x3);
        if (feature_mask.s3) {
            bin = feature4.s3;
            addr = bin * CNT_BIN_MULT + bank * 4 + offset;
            atom_inc(cnt_hist + addr);
        }
        // thread 0, 1, 2, 3 now process feature 1, 2, 3, 0's counts for example 0, 1, 2, 3
        offset = (offset + 1) & 0x3;
        if (feature_mask.s2) {
            bin = feature4.s2;
            addr = bin * CNT_BIN_MULT + bank * 4 + offset;
            atom_inc(cnt_hist + addr);
        }
        // thread 0, 1, 2, 3 now process feature 2, 3, 0, 1's counts for example 0, 1, 2, 3
        offset = (offset + 1) & 0x3;
        if (feature_mask.s1) {
            bin = feature4.s1;
            addr = bin * CNT_BIN_MULT + bank * 4 + offset;
            atom_inc(cnt_hist + addr);
        }
        // thread 0, 1, 2, 3 now process feature 3, 0, 1, 2's counts for example 0, 1, 2, 3
        offset = (offset + 1) & 0x3;
        if (feature_mask.s0) {
            bin = feature4.s0;
            addr = bin * CNT_BIN_MULT + bank * 4 + offset;
            atom_inc(cnt_hist + addr);
        }
        #endif
        stat1 = stat1_next;
        stat2 = stat2_next;
        feature4 = feature4_next;
        feature4 = as_uchar4(as_uint(feature4) & 0x3f3f3f3f);
    }

    bin = feature4_prev.s3;
    offset = (ltid & 0x3);
    addr = bin * HG_BIN_MULT + bank * 8 + is_hessian_first * 4 + offset;
    addr2 = addr + 4 - 8 * is_hessian_first;
    atomic_local_add_f(gh_hist + addr, s3_stat1);
    #if CONST_HESSIAN == 0
    atomic_local_add_f(gh_hist + addr2, s3_stat2);
    #endif

    bin = feature4_prev.s2;
    offset = (offset + 1) & 0x3;
    addr = bin * HG_BIN_MULT + bank * 8 + is_hessian_first * 4 + offset;
    addr2 = addr + 4 - 8 * is_hessian_first;
    atomic_local_add_f(gh_hist + addr, s2_stat1);
    #if CONST_HESSIAN == 0
    atomic_local_add_f(gh_hist + addr2, s2_stat2);
    #endif

    bin = feature4_prev.s1;
    offset = (offset + 1) & 0x3;
    addr = bin * HG_BIN_MULT + bank * 8 + is_hessian_first * 4 + offset;
    addr2 = addr + 4 - 8 * is_hessian_first;
    atomic_local_add_f(gh_hist + addr, s1_stat1);
    #if CONST_HESSIAN == 0
    atomic_local_add_f(gh_hist + addr2, s1_stat2);
    #endif

    bin = feature4_prev.s0;
    offset = (offset + 1) & 0x3;
    addr = bin * HG_BIN_MULT + bank * 8 + is_hessian_first * 4 + offset;
    addr2 = addr + 4 - 8 * is_hessian_first;
    atomic_local_add_f(gh_hist + addr, s0_stat1);
    #if CONST_HESSIAN == 0
    atomic_local_add_f(gh_hist + addr2, s0_stat2);
    #endif
    barrier(CLK_LOCAL_MEM_FENCE);
    
/* Makes MSVC happy with long string literal
)""
R""()
*/
    #if ENABLE_ALL_FEATURES == 0
    // restore feature_mask
    feature_mask = feature_masks[group_feature];
    #endif
    
    // now reduce the 4 banks of subhistograms into 1
    /* memory layout of gh_hist:
       -----------------------------------------------------------------------------------------------
       bk0_g_f0_bin0   bk0_g_f1_bin0   bk0_g_f2_bin0   bk0_g_f3_bin0   bk0_h_f0_bin0   bk0_h_f1_bin0   bk0_h_f2_bin0   bk0_h_f3_bin0
       bk1_g_f0_bin0   bk1_g_f1_bin0   bk1_g_f2_bin0   bk1_g_f3_bin0   bk1_h_f0_bin0   bk1_h_f1_bin0   bk1_h_f2_bin0   bk1_h_f3_bin0
       bk2_g_f0_bin0   bk2_g_f1_bin0   bk2_g_f2_bin0   bk2_g_f3_bin0   bk2_h_f0_bin0   bk2_h_f1_bin0   bk2_h_f2_bin0   bk2_h_f3_bin0
       bk3_g_f0_bin0   bk3_g_f1_bin0   bk3_g_f2_bin0   bk3_g_f3_bin0   bk3_h_f0_bin0   bk3_h_f1_bin0   bk3_h_f2_bin0   bk3_h_f3_bin0
       bk0_g_f0_bin1   bk0_g_f1_bin1   bk0_g_f2_bin1   bk0_g_f3_bin1   bk0_h_f0_bin1   bk0_h_f1_bin1   bk0_h_f2_bin1   bk0_h_f3_bin1
       bk1_g_f0_bin1   bk1_g_f1_bin1   bk1_g_f2_bin1   bk1_g_f3_bin1   bk1_h_f0_bin1   bk1_h_f1_bin1   bk1_h_f2_bin1   bk1_h_f3_bin1
       bk2_g_f0_bin1   bk2_g_f1_bin1   bk2_g_f2_bin1   bk2_g_f3_bin1   bk2_h_f0_bin1   bk2_h_f1_bin1   bk2_h_f2_bin1   bk2_h_f3_bin1
       bk3_g_f0_bin1   bk3_g_f1_bin1   bk3_g_f2_bin1   bk3_g_f3_bin1   bk3_h_f0_bin1   bk3_h_f1_bin1   bk3_h_f2_bin1   bk3_h_f3_bin1
       ...
       bk0_g_f0_bin63 bk0_g_f1_bin63 bk0_g_f2_bin63 bk0_g_f3_bin63 bk0_h_f0_bin63 bk0_h_f1_bin63 bk0_h_f2_bin63 bk0_h_f3_bin63
       bk1_g_f0_bin63 bk1_g_f1_bin63 bk1_g_f2_bin63 bk1_g_f3_bin63 bk1_h_f0_bin63 bk1_h_f1_bin63 bk1_h_f2_bin63 bk1_h_f3_bin63
       bk2_g_f0_bin63 bk2_g_f1_bin63 bk2_g_f2_bin63 bk2_g_f3_bin63 bk2_h_f0_bin63 bk2_h_f1_bin63 bk2_h_f2_bin63 bk2_h_f3_bin63
       bk3_g_f0_bin63 bk3_g_f1_bin63 bk3_g_f2_bin63 bk3_g_f3_bin63 bk3_h_f0_bin63 bk3_h_f1_bin63 bk3_h_f2_bin63 bk3_h_f3_bin63
       -----------------------------------------------------------------------------------------------
    */
    /* memory layout in cnt_hist:
       -----------------------------------------------
       bk0_c_f0_bin0   bk0_c_f1_bin0   bk0_c_f2_bin0   bk0_c_f3_bin0
       bk1_c_f0_bin0   bk1_c_f1_bin0   bk1_c_f2_bin0   bk1_c_f3_bin0
       bk2_c_f0_bin0   bk2_c_f1_bin0   bk2_c_f2_bin0   bk2_c_f3_bin0
       bk3_c_f0_bin0   bk3_c_f1_bin0   bk3_c_f2_bin0   bk3_c_f3_bin0
       bk0_c_f0_bin1   bk0_c_f1_bin1   bk0_c_f2_bin1   bk0_c_f3_bin1
       bk1_c_f0_bin1   bk1_c_f1_bin1   bk1_c_f2_bin1   bk1_c_f3_bin1
       bk2_c_f0_bin1   bk2_c_f1_bin1   bk2_c_f2_bin1   bk2_c_f3_bin1
       bk3_c_f0_bin1   bk3_c_f1_bin1   bk3_c_f2_bin1   bk3_c_f3_bin1
       ...
       bk0_c_f0_bin63 bk0_c_f1_bin63 bk0_c_f2_bin63 bk0_c_f3_bin63
       bk1_c_f0_bin63 bk1_c_f1_bin63 bk1_c_f2_bin63 bk1_c_f3_bin63
       bk2_c_f0_bin63 bk2_c_f1_bin63 bk2_c_f2_bin63 bk2_c_f3_bin63
       bk3_c_f0_bin63 bk3_c_f1_bin63 bk3_c_f2_bin63 bk3_c_f3_bin63
       -----------------------------------------------
    */
    acc_type g_val = 0.0f;
    acc_type h_val = 0.0f;
    uint cnt_val = 0;
    // 256 threads, working on 4 features and 64 bins,
    // so each thread has an independent feature/bin to work on.
    const ushort feature_id = ltid & 3; // range 0 - 3
    const ushort bin_id = ltid >> 2; // range 0 - 63
    offset = (ltid >> 2) & BANK_MASK; // helps avoid LDS bank conflicts
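    // consecutive quads start their sweep at different banks, spreading each
    // iteration's reads across the 4 sub-histogram banks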
    for (int i = 0; i < NUM_BANKS; ++i) {
        ushort bank_id = (i + offset) & BANK_MASK;
        g_val += gh_hist[bin_id * HG_BIN_MULT + bank_id * 8 + feature_id];
        h_val += gh_hist[bin_id * HG_BIN_MULT + bank_id * 8 + feature_id + 4];
        #if CONST_HESSIAN == 1
        cnt_val += cnt_hist[bin_id * CNT_BIN_MULT + bank_id * 4 + feature_id];
        #endif
    }
    // now threads 0 - 3 hold feature 0, 1, 2, 3's gradient, hessian and count for bin 0
    // now threads 4 - 7 hold feature 0, 1, 2, 3's gradient, hessian and count for bin 1
    // etc.

    #if CONST_HESSIAN == 1
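    // with a constant hessian the "hessian" half of gh_hist was used as an
    // extra gradient bank (see CONST_HESSIAN above), so fold it into the
    // gradient sum and reconstruct the hessian sum from the per-bin counter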
    g_val += h_val;
    h_val = cnt_val * const_hessian;
    #endif
    // write to output
    // write gradients and hessians histogram for all 4 features
    // output data in linear order for further reduction
    // output size = 4 (features) * 2 (gradient and hessian) * 64 (bins) * sizeof(acc_type)
    // (with CONST_HESSIAN == 1 the per-bin counts are already folded into h_val above)
    /* memory layout of output:
       g_f0_bin0   g_f1_bin0   g_f2_bin0   g_f3_bin0
       g_f0_bin1   g_f1_bin1   g_f2_bin1   g_f3_bin1
       ...
       g_f0_bin63  g_f1_bin63  g_f2_bin63  g_f3_bin63
       h_f0_bin0   h_f1_bin0   h_f2_bin0   h_f3_bin0
       h_f0_bin1   h_f1_bin1   h_f2_bin1   h_f3_bin1
       ...
       h_f0_bin63  h_f1_bin63  h_f2_bin63  h_f3_bin63
    */
    // if there is only one workgroup processing this feature4, we don't even need to write
    uint feature4_id = (group_id >> POWER_FEATURE_WORKGROUPS);
    #if POWER_FEATURE_WORKGROUPS != 0
    __global acc_type * restrict output = (__global acc_type * restrict)output_buf + group_id * 4 * 2 * NUM_BINS;
    // if g_val and h_val are double, they are converted to float here
    // write gradients for 4 features
    output[0 * 4 * NUM_BINS + ltid] = g_val;
    // write hessians for 4 features
    output[1 * 4 * NUM_BINS + ltid] = h_val;
    barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
    mem_fence(CLK_GLOBAL_MEM_FENCE);
    // To avoid the cost of an extra reduction kernel, we have to deal with some
    // gray areas in OpenCL. We want the last workgroup that processes this feature to
    // do the final reduction, and all other workgroups will just quit.
    // This requires that the results written by other workgroups are visible to the
    // last workgroup (memory consistency)
    #if NVIDIA == 1
    // this is equivalent to CUDA __threadfence();
    // ensure the writes above go to main memory so other workgroups can see them
    asm volatile("{\n\tmembar.gl;\n\t}\n\t" :::"memory");
    #else
    // FIXME: how to do the above on AMD GPUs??
    // The GCN ISA says that all writes will bypass the L1 cache (write-through);
    // however, when the last workgroup is reading sub-histogram data we have to
    // make sure that no part of the data is still modified in another workgroup's
    // local L1 cache, otherwise reading could be a problem (we would need atomic
    // operations to get consistency).
    // But in our case, the sub-histogram of this workgroup cannot be in the cache
    // of another workgroup, so the following trick will work just fine.
    #endif
    // Now, we want one workgroup to do the final reduction.
    // Other workgroups processing the same feature quit.
    // This is done using a global atomic counter.
    // On AMD GPUs ideally this should be done in GDS,
    // but currently there is no easy way to access it via OpenCL.
    __local uint * counter_val = (__local uint *)(gh_hist + 2 * 4 * NUM_BINS * NUM_BANKS);
    if (ltid == 0) {
        // all workgroups processing the same feature add this counter
        *counter_val = atom_inc(sync_counters + feature4_id);
    }
    // make sure everyone in this workgroup is here
    barrier(CLK_LOCAL_MEM_FENCE);
    // everyone in this workgroup: if we are the last workgroup, then do the reduction!
    if (*counter_val == (1 << POWER_FEATURE_WORKGROUPS) - 1) {
        if (ltid == 0) {
            // printf("workgroup %d start reduction!\n", group_id);
            // printf("feature_data[0] = %d %d %d %d", feature_data[0].s0, feature_data[0].s1, feature_data[0].s2, feature_data[0].s3);
            // clear the sync counter for using it next time
            sync_counters[feature4_id] = 0;
        }
    #else
    // only 1 work group, no need to increase counter
    // the reduction will become a simple copy
    if (1) {
        barrier(CLK_LOCAL_MEM_FENCE);
    #endif
        // locate our feature4's block in output memory
        uint output_offset = (feature4_id << POWER_FEATURE_WORKGROUPS);
        __global acc_type const * restrict feature4_subhists = 
                 (__global acc_type *)output_buf + output_offset * 4 * 2 * NUM_BINS;
        // skip reading the data already in local memory
        uint skip_id = group_id ^ output_offset;
        // locate output histogram location for this feature4
        __global acc_type* restrict hist_buf = hist_buf_base + feature4_id * 4 * 2 * NUM_BINS;
        within_kernel_reduction64x4(feature_mask, feature4_subhists, skip_id, g_val, h_val, 
                                    1 << POWER_FEATURE_WORKGROUPS, hist_buf, (__local acc_type *)shared_array);
    }
}
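
/* A minimal host-side launch sketch for the CONST_HESSIAN == 0 signature
   (illustrative only: the program/kernel/buffer handle names are hypothetical
   and error checking is omitted):

     size_t local_size  = 256;   // must match LOCAL_SIZE_0
     size_t global_size = local_size * (num_feature4 << POWER_FEATURE_WORKGROUPS);
     clSetKernelArg(histogram64_kernel, 0, sizeof(cl_mem),  &feature_data_buf);
     clSetKernelArg(histogram64_kernel, 1, sizeof(cl_mem),  &feature_masks_buf);
     clSetKernelArg(histogram64_kernel, 2, sizeof(cl_uint), &feature_size);
     clSetKernelArg(histogram64_kernel, 3, sizeof(cl_mem),  &data_indices_buf);
     clSetKernelArg(histogram64_kernel, 4, sizeof(cl_uint), &num_data);
     clSetKernelArg(histogram64_kernel, 5, sizeof(cl_mem),  &ordered_gradients_buf);
     clSetKernelArg(histogram64_kernel, 6, sizeof(cl_mem),  &ordered_hessians_buf);
     clSetKernelArg(histogram64_kernel, 7, sizeof(cl_mem),  &output_buf);
     clSetKernelArg(histogram64_kernel, 8, sizeof(cl_mem),  &sync_counters_buf);
     clSetKernelArg(histogram64_kernel, 9, sizeof(cl_mem),  &hist_buf);
     clEnqueueNDRangeKernel(queue, histogram64_kernel, 1, NULL,
                            &global_size, &local_size, 0, NULL, NULL);
*/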

// The following line ends the string literal, adds an extra #endif at the end
// )"" "\n#endif"
#endif