Unverified Commit f3bd64a1 authored by shiyu1994, committed by GitHub

[CUDA] remove src/treelearner/kernels (#6766)

* remove src/treelearner/kernels

* Update CMakeLists.txt

* clean up
parent 60b0155a
@@ -252,54 +252,6 @@ if(USE_CUDA)
set(CMAKE_CUDA_STANDARD 11)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
endif()
set(
BASE_DEFINES
-DPOWER_FEATURE_WORKGROUPS=12
-DUSE_CONSTANT_BUF=0
)
set(
ALLFEATS_DEFINES
${BASE_DEFINES}
-DENABLE_ALL_FEATURES
)
set(
FULLDATA_DEFINES
${ALLFEATS_DEFINES}
-DIGNORE_INDICES
)
message(STATUS "ALLFEATS_DEFINES: ${ALLFEATS_DEFINES}")
message(STATUS "FULLDATA_DEFINES: ${FULLDATA_DEFINES}")
function(add_histogram hsize hname hadd hconst hdir)
add_library(histo${hsize}${hname} OBJECT src/treelearner/kernels/histogram${hsize}.cu)
set_target_properties(
histo${hsize}${hname}
PROPERTIES
CUDA_SEPARABLE_COMPILATION ON
CUDA_ARCHITECTURES ${CUDA_ARCHS}
)
if(hadd)
list(APPEND histograms histo${hsize}${hname})
set(histograms ${histograms} PARENT_SCOPE)
endif()
target_compile_definitions(
histo${hsize}${hname}
PRIVATE
-DCONST_HESSIAN=${hconst}
${hdir}
)
endfunction()
foreach(hsize _16_64_256)
add_histogram("${hsize}" "_sp_const" "True" "1" "${BASE_DEFINES}")
add_histogram("${hsize}" "_sp" "True" "0" "${BASE_DEFINES}")
add_histogram("${hsize}" "-allfeats_sp_const" "False" "1" "${ALLFEATS_DEFINES}")
add_histogram("${hsize}" "-allfeats_sp" "False" "0" "${ALLFEATS_DEFINES}")
add_histogram("${hsize}" "-fulldata_sp_const" "True" "1" "${FULLDATA_DEFINES}")
add_histogram("${hsize}" "-fulldata_sp" "True" "0" "${FULLDATA_DEFINES}")
endforeach()
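# Illustrative expansion (sketch, not part of the original file): with hsize set to
# "_16_64_256", a call such as
#     add_histogram("_16_64_256" "_sp" "True" "0" "${BASE_DEFINES}")
# was roughly equivalent to
#     add_library(histo_16_64_256_sp OBJECT src/treelearner/kernels/histogram_16_64_256.cu)
#     set_target_properties(histo_16_64_256_sp PROPERTIES
#         CUDA_SEPARABLE_COMPILATION ON CUDA_ARCHITECTURES ${CUDA_ARCHS})
#     target_compile_definitions(histo_16_64_256_sp PRIVATE -DCONST_HESSIAN=0 ${BASE_DEFINES})
#     list(APPEND histograms histo_16_64_256_sp)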
endif()
include(CheckCXXSourceCompiles)
@@ -634,14 +586,6 @@ if(USE_CUDA)
CUDA_RESOLVE_DEVICE_SYMBOLS ON
)
endif()
# histograms are list of object libraries. Linking object library to other
# object libraries only gets usage requirements, the linked objects won't be
# used. Thus we have to call target_link_libraries on final targets here.
if(BUILD_CLI)
target_link_libraries(lightgbm PRIVATE ${histograms})
endif()
target_link_libraries(_lightgbm PRIVATE ${histograms})
endif()
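# Minimal illustration of the OBJECT-library behavior described above (generic names, sketch only):
#     add_library(histo_objs OBJECT kernels.cu)
#     add_library(middle OBJECT middle.cpp)
#     target_link_libraries(middle PRIVATE histo_objs)      # propagates usage requirements only;
#                                                           # histo_objs' object files do not travel with middle
#     add_executable(app main.cpp)
#     target_link_libraries(app PRIVATE middle histo_objs)  # the final target must link the objects directly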
if(WIN32)
src/treelearner/kernels/histogram_16_64_256.cu deleted
/*!
* Copyright (c) 2020 IBM Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for license information.
*/
#include <LightGBM/meta.h>
#include <cstdint>
#include <cstdio>
#include "histogram_16_64_256.hu"
namespace LightGBM {
// atomic add for acc_type values in shared (local) memory
inline __device__ void atomic_local_add_f(acc_type *addr, const acc_type val) {
atomicAdd(addr, static_cast<acc_type>(val));
}
// histogram16 stuff
#ifdef ENABLE_ALL_FEATURES
#ifdef IGNORE_INDICES
#define KERNEL_NAME histogram16_fulldata
#else // IGNORE_INDICES
#define KERNEL_NAME histogram16
#endif // IGNORE_INDICES
#else // ENABLE_ALL_FEATURES
#error "ENABLE_ALL_FEATURES should always be 1"
#define KERNEL_NAME histogram16
#endif // ENABLE_ALL_FEATURES
#define NUM_BINS 16
#define LOCAL_MEM_SIZE ((sizeof(unsigned int) + 2 * sizeof(acc_type)) * NUM_BINS)
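// shared-memory footprint (illustrative, assuming the default double-precision acc_type):
// LOCAL_MEM_SIZE = (sizeof(unsigned int) + 2 * sizeof(double)) * 16 = (4 + 16) * 16 = 320 bytes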
// this function will be called by histogram16
// we have one sub-histogram of one feature in local memory, and need to read others
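// layout assumption (derived from the write-out loops in the kernel below): each workgroup's
// sub-histogram occupies 3 * NUM_BINS acc_type slots of the global buffer, in the order
// [gradients | Hessians | counters]; the pointer p therefore advances by NUM_BINS between the
// gradient, Hessian and counter reads, and skips 3 * NUM_BINS for the sub-histogram this
// workgroup already holds in local memory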
inline void __device__ within_kernel_reduction16x4(const acc_type* __restrict__ feature_sub_hist,
const unsigned int skip_id,
const unsigned int old_val_cont_bin0,
const uint16_t num_sub_hist,
acc_type* __restrict__ output_buf,
acc_type* __restrict__ local_hist,
const size_t power_feature_workgroups) {
const uint16_t ltid = threadIdx.x;
acc_type grad_bin = local_hist[ltid * 2];
acc_type hess_bin = local_hist[ltid * 2 + 1];
unsigned int* __restrict__ local_cnt = reinterpret_cast<unsigned int *>(local_hist + 2 * NUM_BINS);
unsigned int cont_bin;
if (power_feature_workgroups != 0) {
cont_bin = ltid ? local_cnt[ltid] : old_val_cont_bin0;
} else {
cont_bin = local_cnt[ltid];
}
uint16_t i;
if (power_feature_workgroups != 0) {
// add all sub-histograms for feature
const acc_type* __restrict__ p = feature_sub_hist + ltid;
for (i = 0; i < skip_id; ++i) {
grad_bin += *p; p += NUM_BINS;
hess_bin += *p; p += NUM_BINS;
cont_bin += as_acc_int_type(*p); p += NUM_BINS;
}
// skip the counters we already have
p += 3 * NUM_BINS;
for (i = i + 1; i < num_sub_hist; ++i) {
grad_bin += *p; p += NUM_BINS;
hess_bin += *p; p += NUM_BINS;
cont_bin += as_acc_int_type(*p); p += NUM_BINS;
}
}
__syncthreads();
output_buf[ltid * 2 + 0] = grad_bin;
output_buf[ltid * 2 + 1] = hess_bin;
}
#if USE_CONSTANT_BUF == 1
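// note (informational): this branch keeps the OpenCL-style signature from the original GPU
// implementation; the CUDA build passes -DUSE_CONSTANT_BUF=0 (see BASE_DEFINES in CMakeLists.txt),
// so only the __global__ definition below is compiled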
__kernel void KERNEL_NAME(__global const uchar* restrict feature_data_base,
__constant const uchar* restrict feature_masks __attribute__((max_constant_size(65536))),
const data_size_t feature_size,
__constant const data_size_t* restrict data_indices __attribute__((max_constant_size(65536))),
const data_size_t num_data,
__constant const score_t* restrict ordered_gradients __attribute__((max_constant_size(65536))),
#if CONST_HESSIAN == 0
__constant const score_t* restrict ordered_hessians __attribute__((max_constant_size(65536))),
#else
const score_t const_hessian,
#endif
char* __restrict__ output_buf,
volatile int * sync_counters,
acc_type* __restrict__ hist_buf_base,
const size_t power_feature_workgroups) {
#else
__global__ void KERNEL_NAME(const uchar* feature_data_base,
const uchar* __restrict__ feature_masks,
const data_size_t feature_size,
const data_size_t* data_indices,
const data_size_t num_data,
const score_t* ordered_gradients,
#if CONST_HESSIAN == 0
const score_t* ordered_hessians,
#else
const score_t const_hessian,
#endif
char* __restrict__ output_buf,
volatile int * sync_counters,
acc_type* __restrict__ hist_buf_base,
const size_t power_feature_workgroups) {
#endif
// allocate the local memory array aligned with float2, to guarantee correct alignment on NVIDIA platforms
// otherwise a "Misaligned Address" exception may occur
__shared__ float2 shared_array[LOCAL_MEM_SIZE/sizeof(float2)];
const unsigned int gtid = blockIdx.x * blockDim.x + threadIdx.x;
const uint16_t ltid = threadIdx.x;
const uint16_t lsize = NUM_BINS; // get_local_size(0);
const uint16_t group_id = blockIdx.x;
// clear the shared (local) memory, LOCAL_MEM_SIZE bytes per workgroup
unsigned int *ptr = reinterpret_cast<unsigned int *>(shared_array);
for (int i = ltid; i < LOCAL_MEM_SIZE/sizeof(unsigned int); i += lsize) {
ptr[i] = 0;
}
__syncthreads();
// gradient/hessian histograms
// assume this starts at a 32 * 4 = 128-byte boundary
// total size: 2 * NUM_BINS * sizeof(acc_type) bytes
// organization: each feature/grad/hessian is at a different bank,
// as independent of the feature value as possible
acc_type *gh_hist = reinterpret_cast<acc_type *>(shared_array);
// counter histogram
// total size: NUM_BINS * sizeof(unsigned int) bytes
unsigned int *cnt_hist = reinterpret_cast<unsigned int *>(gh_hist + 2 * NUM_BINS);
// odd threads (1, 3, ...) compute histograms for Hessians first
// even threads (0, 2, ...) compute histograms for gradients first
uchar is_hessian_first = ltid & 1;
uint16_t feature_id = group_id >> power_feature_workgroups;
// each 2^POWER_FEATURE_WORKGROUPS workgroups process one feature (compile-time constant)
// feature_size is the number of examples per feature
const uchar *feature_data = feature_data_base + feature_id * feature_size;
// number of threads that process this feature
const unsigned int subglobal_size = lsize * (1 << power_feature_workgroups);
// equivalent thread ID within the subgroup for this feature
const unsigned int subglobal_tid = gtid - feature_id * subglobal_size;
data_size_t ind;
data_size_t ind_next;
#ifdef IGNORE_INDICES
ind = subglobal_tid;
#else
ind = data_indices[subglobal_tid];
#endif
// extract feature mask, when a byte is set to 0, that feature is disabled
uchar feature_mask = feature_masks[feature_id];
// exit if the feature is masked
if (!feature_mask) {
return;
} else {
feature_mask = feature_mask - 1; // after the decrement, feature_mask selects the storage format (1: 4-bit packed, 0: 8-bit)
}
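// example (illustrative): a stored mask value of 2 becomes feature_mask = 1 here, meaning two
// 4-bit feature values are packed per byte: ind >> 1 selects the byte and ((ind & 1) << 2)
// selects the low or high nibble; a stored mask value of 1 becomes 0, i.e. one 8-bit value per byte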
// STAGE 1: read feature data, and gradient and hessian
// first half of the threads read feature data from global memory
// We will prefetch data into the "next" variable at the beginning of each iteration
uchar feature;
uchar feature_next;
uint16_t bin;
feature = feature_data[ind >> feature_mask];
if (feature_mask) {
feature = (feature >> ((ind & 1) << 2)) & 0xf;
}
bin = feature;
acc_type grad_bin = 0.0f, hess_bin = 0.0f;
acc_type *addr_bin;
// store gradient and hessian
score_t grad, hess;
score_t grad_next, hess_next;
grad = ordered_gradients[ind];
#if CONST_HESSIAN == 0
hess = ordered_hessians[ind];
#endif
// there are 2^POWER_FEATURE_WORKGROUPS workgroups processing each feature4
for (unsigned int i = subglobal_tid; i < num_data; i += subglobal_size) {
// prefetch the next iteration variables
// we don't need boundary check because we have made the buffer large
int i_next = i + subglobal_size;
#ifdef IGNORE_INDICES
// we need to check the bounds here
ind_next = i_next < num_data ? i_next : i;
#else
ind_next = data_indices[i_next];
#endif
grad_next = ordered_gradients[ind_next];
#if CONST_HESSIAN == 0
hess_next = ordered_hessians[ind_next];
#endif
// STAGE 2: accumulate gradient and hessian
if (bin != feature) {
addr_bin = gh_hist + bin * 2 + is_hessian_first;
#if CONST_HESSIAN == 0
acc_type acc_bin = is_hessian_first ? hess_bin : grad_bin;
atomic_local_add_f(addr_bin, acc_bin);
addr_bin = addr_bin + 1 - 2 * is_hessian_first;
acc_bin = is_hessian_first ? grad_bin : hess_bin;
atomic_local_add_f(addr_bin, acc_bin);
#elif CONST_HESSIAN == 1
atomic_local_add_f(addr_bin, grad_bin);
#endif
bin = feature;
grad_bin = grad;
hess_bin = hess;
} else {
grad_bin += grad;
hess_bin += hess;
}
// prefetch the next iteration variables
feature_next = feature_data[ind_next >> feature_mask];
// STAGE 3: accumulate counter
atomicAdd(cnt_hist + feature, 1);
// STAGE 4: update next stat
grad = grad_next;
hess = hess_next;
if (!feature_mask) {
feature = feature_next;
} else {
feature = (feature_next >> ((ind_next & 1) << 2)) & 0xf;
}
}
addr_bin = gh_hist + bin * 2 + is_hessian_first;
#if CONST_HESSIAN == 0
acc_type acc_bin = is_hessian_first ? hess_bin : grad_bin;
atomic_local_add_f(addr_bin, acc_bin);
addr_bin = addr_bin + 1 - 2 * is_hessian_first;
acc_bin = is_hessian_first ? grad_bin : hess_bin;
atomic_local_add_f(addr_bin, acc_bin);
#elif CONST_HESSIAN == 1
atomic_local_add_f(addr_bin, grad_bin);
#endif
__syncthreads();
#if CONST_HESSIAN == 1
// make a final reduction
gh_hist[ltid * 2] += gh_hist[ltid * 2 + 1];
gh_hist[ltid * 2 + 1] = const_hessian * cnt_hist[ltid]; // the Hessian sum is const_hessian times the bin count
__syncthreads();
#endif
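// inter-workgroup write-out and synchronization (summary of the code below): when several
// workgroups share a feature, each one copies its local histogram into its own 3 * NUM_BINS
// block of output_buf and atomically increments sync_counters[feature_id]; only the last
// workgroup to arrive (counter value 2^power_feature_workgroups - 1) runs the final reduction
// over all sub-histograms via within_kernel_reduction16x4()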
#if POWER_FEATURE_WORKGROUPS != 0
acc_type *__restrict__ output = reinterpret_cast<acc_type *>(output_buf) + group_id * 3 * NUM_BINS;
// write gradients and Hessians
acc_type *__restrict__ ptr_f = output;
for (uint16_t i = ltid; i < 2 * NUM_BINS; i += lsize) {
// even threads read gradients, odd threads read Hessians
acc_type value = gh_hist[i];
ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value;
}
// write counts
acc_int_type *__restrict__ ptr_i = reinterpret_cast<acc_int_type *>(output + 2 * NUM_BINS);
for (uint16_t i = ltid; i < NUM_BINS; i += lsize) {
unsigned int value = cnt_hist[i];
ptr_i[i] = value;
}
__syncthreads();
__threadfence();
unsigned int * counter_val = cnt_hist;
// backup the old value
unsigned int old_val = *counter_val;
if (ltid == 0) {
// all workgroups processing the same feature add this counter
*counter_val = atomicAdd(const_cast<int*>(sync_counters + feature_id), 1);
}
// make sure everyone in this workgroup is here
__syncthreads();
// everyone in this workgroup: if we are the last workgroup, then do reduction!
if (*counter_val == (1 << power_feature_workgroups) - 1) {
if (ltid == 0) {
sync_counters[feature_id] = 0;
}
#else
}
// only 1 work group, no need to increase counter
// the reduction will become a simple copy
{
unsigned int old_val; // dummy
#endif
// locate our feature's block in output memory
unsigned int output_offset = (feature_id << power_feature_workgroups);
acc_type const * __restrict__ feature_subhists =
reinterpret_cast<acc_type *>(output_buf) + output_offset * 3 * NUM_BINS;
// skip reading the data already in local memory
unsigned int skip_id = group_id - output_offset;
// locate output histogram location for this feature4
acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 2 * NUM_BINS;
within_kernel_reduction16x4(feature_subhists, skip_id, old_val, 1 << power_feature_workgroups, hist_buf, reinterpret_cast<acc_type *>(shared_array), power_feature_workgroups);
}
}
// end of histogram16 stuff
// histogram64 stuff
#undef KERNEL_NAME
#undef NUM_BINS
#undef LOCAL_MEM_SIZE
#ifdef ENABLE_ALL_FEATURES
#ifdef IGNORE_INDICES
#define KERNEL_NAME histogram64_fulldata
#else // IGNORE_INDICES
#define KERNEL_NAME histogram64 // ENABLE_ALL_FEATURES is defined to 1 in the header if it is not already set
// #define KERNEL_NAME histogram64_allfeats
#endif // IGNORE_INDICES
#else // ENABLE_ALL_FEATURES
#error "ENABLE_ALL_FEATURES should always be 1"
#define KERNEL_NAME histogram64
#endif // ENABLE_ALL_FEATURES
#define NUM_BINS 64
#define LOCAL_MEM_SIZE ((sizeof(unsigned int) + 2 * sizeof(acc_type)) * NUM_BINS)
// this function will be called by histogram64
// we have one sub-histogram of one feature in local memory, and need to read others
inline void __device__ within_kernel_reduction64x4(const acc_type* __restrict__ feature_sub_hist,
const unsigned int skip_id,
const unsigned int old_val_cont_bin0,
const uint16_t num_sub_hist,
acc_type* __restrict__ output_buf,
acc_type* __restrict__ local_hist,
const size_t power_feature_workgroups) {
const uint16_t ltid = threadIdx.x;
acc_type grad_bin = local_hist[ltid * 2];
acc_type hess_bin = local_hist[ltid * 2 + 1];
unsigned int* __restrict__ local_cnt = reinterpret_cast<unsigned int *>(local_hist + 2 * NUM_BINS);
unsigned int cont_bin;
if (power_feature_workgroups != 0) {
cont_bin = ltid ? local_cnt[ltid] : old_val_cont_bin0;
} else {
cont_bin = local_cnt[ltid];
}
uint16_t i;
if (power_feature_workgroups != 0) {
// add all sub-histograms for feature
const acc_type* __restrict__ p = feature_sub_hist + ltid;
for (i = 0; i < skip_id; ++i) {
grad_bin += *p; p += NUM_BINS;
hess_bin += *p; p += NUM_BINS;
cont_bin += as_acc_int_type(*p); p += NUM_BINS;
}
// skip the counters we already have
p += 3 * NUM_BINS;
for (i = i + 1; i < num_sub_hist; ++i) {
grad_bin += *p; p += NUM_BINS;
hess_bin += *p; p += NUM_BINS;
cont_bin += as_acc_int_type(*p); p += NUM_BINS;
}
}
__syncthreads();
output_buf[ltid * 2 + 0] = grad_bin;
output_buf[ltid * 2 + 1] = hess_bin;
}
#if USE_CONSTANT_BUF == 1
__kernel void KERNEL_NAME(__global const uchar* restrict feature_data_base,
__constant const uchar* restrict feature_masks __attribute__((max_constant_size(65536))),
const data_size_t feature_size,
__constant const data_size_t* restrict data_indices __attribute__((max_constant_size(65536))),
const data_size_t num_data,
__constant const score_t* restrict ordered_gradients __attribute__((max_constant_size(65536))),
#if CONST_HESSIAN == 0
__constant const score_t* restrict ordered_hessians __attribute__((max_constant_size(65536))),
#else
const score_t const_hessian,
#endif
char* __restrict__ output_buf,
volatile int * sync_counters,
acc_type* __restrict__ hist_buf_base,
const size_t power_feature_workgroups) {
#else
__global__ void KERNEL_NAME(const uchar* feature_data_base,
const uchar* __restrict__ feature_masks,
const data_size_t feature_size,
const data_size_t* data_indices,
const data_size_t num_data,
const score_t* ordered_gradients,
#if CONST_HESSIAN == 0
const score_t* ordered_hessians,
#else
const score_t const_hessian,
#endif
char* __restrict__ output_buf,
volatile int * sync_counters,
acc_type* __restrict__ hist_buf_base,
const size_t power_feature_workgroups) {
#endif
// allocate the local memory array aligned with float2, to guarantee correct alignment on NVIDIA platforms
// otherwise a "Misaligned Address" exception may occur
__shared__ float2 shared_array[LOCAL_MEM_SIZE/sizeof(float2)];
const unsigned int gtid = blockIdx.x * blockDim.x + threadIdx.x;
const uint16_t ltid = threadIdx.x;
const uint16_t lsize = NUM_BINS; // get_local_size(0);
const uint16_t group_id = blockIdx.x;
// clear the shared (local) memory, LOCAL_MEM_SIZE bytes per workgroup
unsigned int *ptr = reinterpret_cast<unsigned int *>(shared_array);
for (int i = ltid; i < LOCAL_MEM_SIZE/sizeof(unsigned int); i += lsize) {
ptr[i] = 0;
}
__syncthreads();
// gradient/hessian histograms
// assume this starts at a 32 * 4 = 128-byte boundary
// total size: 2 * NUM_BINS * sizeof(acc_type) bytes
// organization: each feature/grad/hessian is at a different bank,
// as independent of the feature value as possible
acc_type *gh_hist = reinterpret_cast<acc_type *>(shared_array);
// counter histogram
// total size: NUM_BINS * sizeof(unsigned int) bytes
unsigned int *cnt_hist = reinterpret_cast<unsigned int *>(gh_hist + 2 * NUM_BINS);
// odd threads (1, 3, ...) compute histograms for Hessians first
// even threads (0, 2, ...) compute histograms for gradients first
uchar is_hessian_first = ltid & 1;
uint16_t feature_id = group_id >> power_feature_workgroups;
// each 2^POWER_FEATURE_WORKGROUPS workgroups process one feature (compile-time constant)
// feature_size is the number of examples per feature
const uchar *feature_data = feature_data_base + feature_id * feature_size;
// number of threads that process this feature
const unsigned int subglobal_size = lsize * (1 << power_feature_workgroups);
// equivalent thread ID within the subgroup for this feature
const unsigned int subglobal_tid = gtid - feature_id * subglobal_size;
data_size_t ind;
data_size_t ind_next;
#ifdef IGNORE_INDICES
ind = subglobal_tid;
#else
ind = data_indices[subglobal_tid];
#endif
// extract feature mask, when a byte is set to 0, that feature is disabled
uchar feature_mask = feature_masks[feature_id];
// exit if the feature is masked
if (!feature_mask) {
return;
} else {
feature_mask = feature_mask - 1; // after the decrement, feature_mask selects the storage format (1: 4-bit packed, 0: 8-bit)
}
// STAGE 1: read feature data, and gradient and hessian
// first half of the threads read feature data from global memory
// We will prefetch data into the "next" variable at the beginning of each iteration
uchar feature;
uchar feature_next;
uint16_t bin;
feature = feature_data[ind >> feature_mask];
if (feature_mask) {
feature = (feature >> ((ind & 1) << 2)) & 0xf;
}
bin = feature;
acc_type grad_bin = 0.0f, hess_bin = 0.0f;
acc_type *addr_bin;
// store gradient and hessian
score_t grad, hess;
score_t grad_next, hess_next;
grad = ordered_gradients[ind];
#if CONST_HESSIAN == 0
hess = ordered_hessians[ind];
#endif
// there are 2^POWER_FEATURE_WORKGROUPS workgroups processing each feature4
for (unsigned int i = subglobal_tid; i < num_data; i += subglobal_size) {
// prefetch the next iteration variables
// we don't need boundary check because we have made the buffer large
int i_next = i + subglobal_size;
#ifdef IGNORE_INDICES
// we need to check the bounds here
ind_next = i_next < num_data ? i_next : i;
#else
ind_next = data_indices[i_next];
#endif
grad_next = ordered_gradients[ind_next];
#if CONST_HESSIAN == 0
hess_next = ordered_hessians[ind_next];
#endif
// STAGE 2: accumulate gradient and hessian
if (bin != feature) {
addr_bin = gh_hist + bin * 2 + is_hessian_first;
#if CONST_HESSIAN == 0
acc_type acc_bin = is_hessian_first ? hess_bin : grad_bin;
atomic_local_add_f(addr_bin, acc_bin);
addr_bin = addr_bin + 1 - 2 * is_hessian_first;
acc_bin = is_hessian_first ? grad_bin : hess_bin;
atomic_local_add_f(addr_bin, acc_bin);
#elif CONST_HESSIAN == 1
atomic_local_add_f(addr_bin, grad_bin);
#endif
bin = feature;
grad_bin = grad;
hess_bin = hess;
} else {
grad_bin += grad;
hess_bin += hess;
}
// prefetch the next iteration variables
feature_next = feature_data[ind_next >> feature_mask];
// STAGE 3: accumulate counter
atomicAdd(cnt_hist + feature, 1);
// STAGE 4: update next stat
grad = grad_next;
hess = hess_next;
if (!feature_mask) {
feature = feature_next;
} else {
feature = (feature_next >> ((ind_next & 1) << 2)) & 0xf;
}
}
addr_bin = gh_hist + bin * 2 + is_hessian_first;
#if CONST_HESSIAN == 0
acc_type acc_bin = is_hessian_first ? hess_bin : grad_bin;
atomic_local_add_f(addr_bin, acc_bin);
addr_bin = addr_bin + 1 - 2 * is_hessian_first;
acc_bin = is_hessian_first ? grad_bin : hess_bin;
atomic_local_add_f(addr_bin, acc_bin);
#elif CONST_HESSIAN == 1
atomic_local_add_f(addr_bin, grad_bin);
#endif
__syncthreads();
#if CONST_HESSIAN == 1
// make a final reduction
gh_hist[ltid * 2] += gh_hist[ltid * 2 + 1];
gh_hist[ltid * 2 + 1] = const_hessian * cnt_hist[ltid]; // the Hessian sum is const_hessian times the bin count
__syncthreads();
#endif
#if POWER_FEATURE_WORKGROUPS != 0
acc_type *__restrict__ output = reinterpret_cast<acc_type *>(output_buf) + group_id * 3 * NUM_BINS;
// write gradients and Hessians
acc_type *__restrict__ ptr_f = output;
for (uint16_t i = ltid; i < 2 * NUM_BINS; i += lsize) {
// even threads read gradients, odd threads read Hessians
acc_type value = gh_hist[i];
ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value;
}
// write counts
acc_int_type *__restrict__ ptr_i = reinterpret_cast<acc_int_type *>(output + 2 * NUM_BINS);
for (uint16_t i = ltid; i < NUM_BINS; i += lsize) {
unsigned int value = cnt_hist[i];
ptr_i[i] = value;
}
__syncthreads();
__threadfence();
unsigned int * counter_val = cnt_hist;
// backup the old value
unsigned int old_val = *counter_val;
if (ltid == 0) {
// all workgroups processing the same feature add this counter
*counter_val = atomicAdd(const_cast<int*>(sync_counters + feature_id), 1);
}
// make sure everyone in this workgroup is here
__syncthreads();
// everyone in this workgroup: if we are the last workgroup, then do reduction!
if (*counter_val == (1 << power_feature_workgroups) - 1) {
if (ltid == 0) {
sync_counters[feature_id] = 0;
}
#else
}
// only 1 work group, no need to increase counter
// the reduction will become a simple copy
{
unsigned int old_val; // dummy
#endif
// locate our feature's block in output memory
unsigned int output_offset = (feature_id << power_feature_workgroups);
acc_type const * __restrict__ feature_subhists =
reinterpret_cast<acc_type *>(output_buf) + output_offset * 3 * NUM_BINS;
// skip reading the data already in local memory
unsigned int skip_id = group_id - output_offset;
// locate output histogram location for this feature4
acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 2 * NUM_BINS;
within_kernel_reduction64x4(feature_subhists, skip_id, old_val, 1 << power_feature_workgroups, hist_buf, reinterpret_cast<acc_type *>(shared_array), power_feature_workgroups);
}
}
// end of histogram64 stuff
// histogram256 stuff
#undef KERNEL_NAME
#undef NUM_BINS
#undef LOCAL_MEM_SIZE
#ifdef ENABLE_ALL_FEATURES
#ifdef IGNORE_INDICES
#define KERNEL_NAME histogram256_fulldata
#else // IGNORE_INDICES
#define KERNEL_NAME histogram256 // ENABLE_ALL_FEATURES is defined to 1 in the header if it is not already set
// #define KERNEL_NAME histogram256_allfeats
#endif // IGNORE_INDICES
#else // ENABLE_ALL_FEATURES
#error "ENABLE_ALL_FEATURES should always be 1"
#define KERNEL_NAME histogram256
#endif // ENABLE_ALL_FEATURES
#define NUM_BINS 256
#define LOCAL_MEM_SIZE ((sizeof(unsigned int) + 2 * sizeof(acc_type)) * NUM_BINS)
// this function will be called by histogram256
// we have one sub-histogram of one feature in local memory, and need to read others
inline void __device__ within_kernel_reduction256x4(const acc_type* __restrict__ feature_sub_hist,
const unsigned int skip_id,
const unsigned int old_val_cont_bin0,
const uint16_t num_sub_hist,
acc_type* __restrict__ output_buf,
acc_type* __restrict__ local_hist,
const size_t power_feature_workgroups) {
const uint16_t ltid = threadIdx.x;
acc_type grad_bin = local_hist[ltid * 2];
acc_type hess_bin = local_hist[ltid * 2 + 1];
unsigned int* __restrict__ local_cnt = reinterpret_cast<unsigned int *>(local_hist + 2 * NUM_BINS);
unsigned int cont_bin;
if (power_feature_workgroups != 0) {
cont_bin = ltid ? local_cnt[ltid] : old_val_cont_bin0;
} else {
cont_bin = local_cnt[ltid];
}
uint16_t i;
if (power_feature_workgroups != 0) {
// add all sub-histograms for feature
const acc_type* __restrict__ p = feature_sub_hist + ltid;
for (i = 0; i < skip_id; ++i) {
grad_bin += *p; p += NUM_BINS;
hess_bin += *p; p += NUM_BINS;
cont_bin += as_acc_int_type(*p); p += NUM_BINS;
}
// skip the counters we already have
p += 3 * NUM_BINS;
for (i = i + 1; i < num_sub_hist; ++i) {
grad_bin += *p; p += NUM_BINS;
hess_bin += *p; p += NUM_BINS;
cont_bin += as_acc_int_type(*p); p += NUM_BINS;
}
}
__syncthreads();
output_buf[ltid * 2 + 0] = grad_bin;
output_buf[ltid * 2 + 1] = hess_bin;
}
#if USE_CONSTANT_BUF == 1
__kernel void KERNEL_NAME(__global const uchar* restrict feature_data_base,
__constant const uchar* restrict feature_masks __attribute__((max_constant_size(65536))),
const data_size_t feature_size,
__constant const data_size_t* restrict data_indices __attribute__((max_constant_size(65536))),
const data_size_t num_data,
__constant const score_t* restrict ordered_gradients __attribute__((max_constant_size(65536))),
#if CONST_HESSIAN == 0
__constant const score_t* restrict ordered_hessians __attribute__((max_constant_size(65536))),
#else
const score_t const_hessian,
#endif
char* __restrict__ output_buf,
volatile int * sync_counters,
acc_type* __restrict__ hist_buf_base,
const size_t power_feature_workgroups) {
#else
__global__ void KERNEL_NAME(const uchar* feature_data_base,
const uchar* __restrict__ feature_masks,
const data_size_t feature_size,
const data_size_t* data_indices,
const data_size_t num_data,
const score_t* ordered_gradients,
#if CONST_HESSIAN == 0
const score_t* ordered_hessians,
#else
const score_t const_hessian,
#endif
char* __restrict__ output_buf,
volatile int * sync_counters,
acc_type* __restrict__ hist_buf_base,
const size_t power_feature_workgroups) {
#endif
// allocate the local memory array aligned with float2, to guarantee correct alignment on NVIDIA platforms
// otherwise a "Misaligned Address" exception may occur
__shared__ float2 shared_array[LOCAL_MEM_SIZE/sizeof(float2)];
const unsigned int gtid = blockIdx.x * blockDim.x + threadIdx.x;
const uint16_t ltid = threadIdx.x;
const uint16_t lsize = NUM_BINS; // get_local_size(0);
const uint16_t group_id = blockIdx.x;
// clear the shared (local) memory, LOCAL_MEM_SIZE bytes per workgroup
unsigned int *ptr = reinterpret_cast<unsigned int *>(shared_array);
for (int i = ltid; i < LOCAL_MEM_SIZE/sizeof(unsigned int); i += lsize) {
ptr[i] = 0;
}
__syncthreads();
// gradient/hessian histograms
// assume this starts at a 32 * 4 = 128-byte boundary
// total size: 2 * NUM_BINS * sizeof(acc_type) bytes
// organization: each feature/grad/hessian is at a different bank,
// as independent of the feature value as possible
acc_type *gh_hist = reinterpret_cast<acc_type *>(shared_array);
// counter histogram
// total size: NUM_BINS * sizeof(unsigned int) bytes
unsigned int *cnt_hist = reinterpret_cast<unsigned int *>(gh_hist + 2 * NUM_BINS);
// odd threads (1, 3, ...) compute histograms for Hessians first
// even threads (0, 2, ...) compute histograms for gradients first
uchar is_hessian_first = ltid & 1;
uint16_t feature_id = group_id >> power_feature_workgroups;
// each 2^POWER_FEATURE_WORKGROUPS workgroups process one feature (compile-time constant)
// feature_size is the number of examples per feature
const uchar *feature_data = feature_data_base + feature_id * feature_size;
// number of threads that process this feature
const unsigned int subglobal_size = lsize * (1 << power_feature_workgroups);
// equivalent thread ID within the subgroup for this feature
const unsigned int subglobal_tid = gtid - feature_id * subglobal_size;
data_size_t ind;
data_size_t ind_next;
#ifdef IGNORE_INDICES
ind = subglobal_tid;
#else
ind = data_indices[subglobal_tid];
#endif
// extract feature mask, when a byte is set to 0, that feature is disabled
uchar feature_mask = feature_masks[feature_id];
// exit if the feature is masked
if (!feature_mask) {
return;
} else {
feature_mask = feature_mask - 1; // after the decrement, feature_mask selects the storage format (1: 4-bit packed, 0: 8-bit)
}
// STAGE 1: read feature data, and gradient and hessian
// first half of the threads read feature data from global memory
// We will prefetch data into the "next" variable at the beginning of each iteration
uchar feature;
uchar feature_next;
uint16_t bin;
feature = feature_data[ind >> feature_mask];
if (feature_mask) {
feature = (feature >> ((ind & 1) << 2)) & 0xf;
}
bin = feature;
acc_type grad_bin = 0.0f, hess_bin = 0.0f;
acc_type *addr_bin;
// store gradient and hessian
score_t grad, hess;
score_t grad_next, hess_next;
grad = ordered_gradients[ind];
#if CONST_HESSIAN == 0
hess = ordered_hessians[ind];
#endif
// there are 2^POWER_FEATURE_WORKGROUPS workgroups processing each feature4
for (unsigned int i = subglobal_tid; i < num_data; i += subglobal_size) {
// prefetch the next iteration variables
// we don't need boundary check because we have made the buffer large
int i_next = i + subglobal_size;
#ifdef IGNORE_INDICES
// we need to check the bounds here
ind_next = i_next < num_data ? i_next : i;
#else
ind_next = data_indices[i_next];
#endif
grad_next = ordered_gradients[ind_next];
#if CONST_HESSIAN == 0
hess_next = ordered_hessians[ind_next];
#endif
// STAGE 2: accumulate gradient and hessian
if (bin != feature) {
addr_bin = gh_hist + bin * 2 + is_hessian_first;
#if CONST_HESSIAN == 0
acc_type acc_bin = is_hessian_first ? hess_bin : grad_bin;
atomic_local_add_f(addr_bin, acc_bin);
addr_bin = addr_bin + 1 - 2 * is_hessian_first;
acc_bin = is_hessian_first ? grad_bin : hess_bin;
atomic_local_add_f(addr_bin, acc_bin);
#elif CONST_HESSIAN == 1
atomic_local_add_f(addr_bin, grad_bin);
#endif
bin = feature;
grad_bin = grad;
hess_bin = hess;
} else {
grad_bin += grad;
hess_bin += hess;
}
// prefetch the next iteration variables
feature_next = feature_data[ind_next >> feature_mask];
// STAGE 3: accumulate counter
atomicAdd(cnt_hist + feature, 1);
// STAGE 4: update next stat
grad = grad_next;
hess = hess_next;
if (!feature_mask) {
feature = feature_next;
} else {
feature = (feature_next >> ((ind_next & 1) << 2)) & 0xf;
}
}
addr_bin = gh_hist + bin * 2 + is_hessian_first;
#if CONST_HESSIAN == 0
acc_type acc_bin = is_hessian_first ? hess_bin : grad_bin;
atomic_local_add_f(addr_bin, acc_bin);
addr_bin = addr_bin + 1 - 2 * is_hessian_first;
acc_bin = is_hessian_first ? grad_bin : hess_bin;
atomic_local_add_f(addr_bin, acc_bin);
#elif CONST_HESSIAN == 1
atomic_local_add_f(addr_bin, grad_bin);
#endif
__syncthreads();
#if CONST_HESSIAN == 1
// make a final reduction
gh_hist[ltid * 2] += gh_hist[ltid * 2 + 1];
gh_hist[ltid * 2 + 1] = const_hessian * cnt_hist[ltid]; // the Hessian sum is const_hessian times the bin count
__syncthreads();
#endif
#if POWER_FEATURE_WORKGROUPS != 0
acc_type *__restrict__ output = reinterpret_cast<acc_type *>(output_buf) + group_id * 3 * NUM_BINS;
// write gradients and Hessians
acc_type *__restrict__ ptr_f = output;
for (uint16_t i = ltid; i < 2 * NUM_BINS; i += lsize) {
// even threads read gradients, odd threads read Hessians
acc_type value = gh_hist[i];
ptr_f[(i & 1) * NUM_BINS + (i >> 1)] = value;
}
// write counts
acc_int_type *__restrict__ ptr_i = reinterpret_cast<acc_int_type *>(output + 2 * NUM_BINS);
for (uint16_t i = ltid; i < NUM_BINS; i += lsize) {
unsigned int value = cnt_hist[i];
ptr_i[i] = value;
}
__syncthreads();
__threadfence();
unsigned int * counter_val = cnt_hist;
// backup the old value
unsigned int old_val = *counter_val;
if (ltid == 0) {
// all workgroups processing the same feature add this counter
*counter_val = atomicAdd(const_cast<int*>(sync_counters + feature_id), 1);
}
// make sure everyone in this workgroup is here
__syncthreads();
// everyone in this workgroup: if we are the last workgroup, then do reduction!
if (*counter_val == (1 << power_feature_workgroups) - 1) {
if (ltid == 0) {
sync_counters[feature_id] = 0;
}
#else
}
// only 1 work group, no need to increase counter
// the reduction will become a simple copy
{
unsigned int old_val; // dummy
#endif
// locate our feature's block in output memory
unsigned int output_offset = (feature_id << power_feature_workgroups);
acc_type const * __restrict__ feature_subhists =
reinterpret_cast<acc_type *>(output_buf) + output_offset * 3 * NUM_BINS;
// skip reading the data already in local memory
unsigned int skip_id = group_id - output_offset;
// locate output histogram location for this feature4
acc_type *__restrict__ hist_buf = hist_buf_base + feature_id * 2 * NUM_BINS;
within_kernel_reduction256x4(feature_subhists, skip_id, old_val, 1 << power_feature_workgroups, hist_buf, reinterpret_cast<acc_type *>(shared_array), power_feature_workgroups);
}
}
// end of histogram256 stuff
} // namespace LightGBM
src/treelearner/kernels/histogram_16_64_256.hu deleted
/*!
* Copyright (c) 2020 IBM Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for license information.
*/
#ifndef LIGHTGBM_TREELEARNER_KERNELS_HISTOGRAM_16_64_256_HU_
#define LIGHTGBM_TREELEARNER_KERNELS_HISTOGRAM_16_64_256_HU_
#include "LightGBM/meta.h"
namespace LightGBM {
// use double precision or not
#ifndef USE_DP_FLOAT
#define USE_DP_FLOAT 1
#endif
// ignore hessian, and use the local memory for hessian as an additional bank for gradient
#ifndef CONST_HESSIAN
#define CONST_HESSIAN 0
#endif
typedef unsigned char uchar;
template<typename T>
__device__ double as_double(const T t) {
static_assert(sizeof(T) == sizeof(double), "size mismatch");
double d;
memcpy(&d, &t, sizeof(T));
return d;
}
template<typename T>
__device__ unsigned long long as_ulong_ulong(const T t) {
static_assert(sizeof(T) == sizeof(unsigned long long), "size mismatch");
unsigned long long u;
memcpy(&u, &t, sizeof(T));
return u;
}
template<typename T>
__device__ float as_float(const T t) {
static_assert(sizeof(T) == sizeof(float), "size mismatch");
float f;
memcpy(&f, &t, sizeof(T));
return f;
}
template<typename T>
__device__ unsigned int as_uint(const T t) {
static_assert(sizeof(T) == sizeof(unsigned int), "size mismatch");
unsigned int u;
memcpy(&u, &t, sizeof(T));
return u;
}
template<typename T>
__device__ uchar4 as_uchar4(const T t) {
static_assert(sizeof(T) == sizeof(uchar4), "size mismatch");
uchar4 u;
memcpy(&u, &t, sizeof(T));
return u;
}
#if USE_DP_FLOAT == 1
typedef double acc_type;
typedef unsigned long long acc_int_type;
#define as_acc_type as_double
#define as_acc_int_type as_ulong_ulong
#else
typedef float acc_type;
typedef unsigned int acc_int_type;
#define as_acc_type as_float
#define as_acc_int_type as_uint
#endif
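// usage sketch (illustrative): the kernels store each per-bin counter bit-for-bit inside an
// acc_type slot of the global sub-histogram buffer and recover it with as_acc_int_type() during
// the reduction, e.g.
//   acc_type slot;
//   acc_int_type cnt = 42;
//   memcpy(&slot, &cnt, sizeof(cnt));           // what the count write-out effectively does
//   acc_int_type back = as_acc_int_type(slot);  // back == 42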
// use all features and do not use feature mask
#ifndef ENABLE_ALL_FEATURES
#define ENABLE_ALL_FEATURES 1
#endif
// define all of the different kernels
#define DECLARE_CONST_BUF(name) \
__global__ void name(__global const uchar* restrict feature_data_base, \
const uchar* restrict feature_masks,\
const data_size_t feature_size,\
const data_size_t* restrict data_indices, \
const data_size_t num_data, \
const score_t* restrict ordered_gradients, \
const score_t* restrict ordered_hessians,\
char* __restrict__ output_buf,\
volatile int * sync_counters,\
acc_type* __restrict__ hist_buf_base, \
const size_t power_feature_workgroups);
#define DECLARE_CONST_HES_CONST_BUF(name) \
__global__ void name(const uchar* __restrict__ feature_data_base, \
const uchar* __restrict__ feature_masks,\
const data_size_t feature_size,\
const data_size_t* __restrict__ data_indices, \
const data_size_t num_data, \
const score_t* __restrict__ ordered_gradients, \
const score_t const_hessian,\
char* __restrict__ output_buf,\
volatile int * sync_counters,\
acc_type* __restrict__ hist_buf_base, \
const size_t power_feature_workgroups);
#define DECLARE_CONST_HES(name) \
__global__ void name(const uchar* feature_data_base, \
const uchar* __restrict__ feature_masks,\
const data_size_t feature_size,\
const data_size_t* data_indices, \
const data_size_t num_data, \
const score_t* ordered_gradients, \
const score_t const_hessian,\
char* __restrict__ output_buf, \
volatile int * sync_counters,\
acc_type* __restrict__ hist_buf_base, \
const size_t power_feature_workgroups);
#define DECLARE(name) \
__global__ void name(const uchar* feature_data_base, \
const uchar* __restrict__ feature_masks,\
const data_size_t feature_size,\
const data_size_t* data_indices, \
const data_size_t num_data, \
const score_t* ordered_gradients, \
const score_t* ordered_hessians,\
char* __restrict__ output_buf, \
volatile int * sync_counters,\
acc_type* __restrict__ hist_buf_base, \
const size_t power_feature_workgroups);
DECLARE_CONST_HES(histogram16_allfeats);
DECLARE_CONST_HES(histogram16_fulldata);
DECLARE_CONST_HES(histogram16);
DECLARE(histogram16_allfeats);
DECLARE(histogram16_fulldata);
DECLARE(histogram16);
DECLARE_CONST_HES(histogram64_allfeats);
DECLARE_CONST_HES(histogram64_fulldata);
DECLARE_CONST_HES(histogram64);
DECLARE(histogram64_allfeats);
DECLARE(histogram64_fulldata);
DECLARE(histogram64);
DECLARE_CONST_HES(histogram256_allfeats);
DECLARE_CONST_HES(histogram256_fulldata);
DECLARE_CONST_HES(histogram256);
DECLARE(histogram256_allfeats);
DECLARE(histogram256_fulldata);
DECLARE(histogram256);
} // namespace LightGBM
#endif // LIGHTGBM_TREELEARNER_KERNELS_HISTOGRAM_16_64_256_HU_