Commit 8487d0a2 authored by Huan Zhang, committed by Guolin Ke

Fix compilation problems with MSVC (#443)

* Fix warnings when compiled with -pedantic

* add -DBOOST_ALL_NO_LIB for windows build

* fix some more MSVC warnings

* Break the OpenCL string literals into smaller pieces to avoid MSVC error C2026 (the mechanism is sketched below)

The string was longer than the limit of 16380 single-byte characters.

This affects Visual Studio 2005 - 2015. Untested on VS 2017.
parent 79c7c09d
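
The string-literal fix relies on two standard C++ rules: adjacent string literals are concatenated, and a raw string literal can be closed and reopened at any top-level point in its content. Because the .cl kernels are included into the host code as one large raw string, inserting `)"" R""(` inside an OpenCL block comment splits the oversized literal into smaller adjacent pieces without changing the concatenated source the OpenCL compiler receives. A minimal sketch of the mechanism (the kernel body is illustrative, not taken from the commit):

    #include <cstdio>

    // Two adjacent raw string literals (the delimiter is a single '"'
    // character, as in the .cl files). The compiler concatenates them, so the
    // OpenCL compiler still sees one source, but MSVC never parses a single
    // literal longer than its 16380-character limit (error C2026). The split
    // point sits inside /* ... */ so the same text also compiles as plain
    // OpenCL C.
    const char *kernel_src =
    R""(
    __kernel void noop() { /* comment opens in the first chunk
    )""
    R""(
    */ }
    )"";

    int main() {
      std::printf("%s\n", kernel_src);
      return 0;
    }
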
@@ -43,6 +43,10 @@ if(USE_GPU)
     MESSAGE(STATUS "OpenCL include directory:" ${OpenCL_INCLUDE_DIRS})
     find_package(Boost 1.56.0 COMPONENTS filesystem system REQUIRED)
     include_directories(${Boost_INCLUDE_DIRS})
+    if (WIN32)
+        # disable autolinking in boost
+        ADD_DEFINITIONS(-DBOOST_ALL_NO_LIB)
+    endif()
     ADD_DEFINITIONS(-DUSE_GPU)
 endif(USE_GPU)
...
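
`-DBOOST_ALL_NO_LIB` disables Boost's auto-linking on MSVC, where the headers emit `#pragma comment(lib, ...)` directives whose mangled library names often do not match what CMake actually built or found; with the define, only the libraries `find_package` hands to the linker are used. A hedged sketch of what the define suppresses (the library name in the comment is illustrative):

    // On MSVC, Boost's auto_link machinery normally injects directives like
    //   #pragma comment(lib, "libboost_filesystem-vc140-mt-1_63.lib")
    // from the headers themselves. Defining BOOST_ALL_NO_LIB before including
    // any Boost header turns that off, leaving linking entirely to the build
    // system (here, CMake's find_package results).
    #define BOOST_ALL_NO_LIB 1
    #include <boost/filesystem.hpp>

    int main() {
      return boost::filesystem::exists(".") ? 0 : 1;
    }
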
@@ -104,7 +104,7 @@ int GPUTreeLearner::GetNumWorkgroupsPerFeature(data_size_t leaf_num_data) {
   // we roughly want 256 workgroups per device, and we have num_dense_feature4_ feature tuples.
   // also guarantee that there are at least 2K examples per workgroup
   double x = 256.0 / num_dense_feature4_;
-  int exp_workgroups_per_feature = ceil(log2(x));
+  int exp_workgroups_per_feature = (int)ceil(log2(x));
   double t = leaf_num_data / 1024.0;
 #if GPU_DEBUG >= 4
   printf("Computing histogram for %d examples and (%d * %d) feature groups\n", leaf_num_data, dword_features_, num_dense_feature4_);
@@ -191,7 +191,7 @@ void GPUTreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_featur
 }
 template <typename HistType>
-void GPUTreeLearner::WaitAndGetHistograms(HistogramBinEntry* histograms, const std::vector<int8_t>& is_feature_used) {
+void GPUTreeLearner::WaitAndGetHistograms(HistogramBinEntry* histograms) {
   HistType* hist_outputs = (HistType*) host_histogram_outputs_;
   // when the output is ready, the computation is done
   histograms_wait_obj_.wait();
@@ -207,7 +207,7 @@ void GPUTreeLearner::WaitAndGetHistograms(HistogramBinEntry* histograms, const s
       for (int j = 0; j < bin_size; ++j) {
         old_histogram_array[j].sum_gradients = hist_outputs[i * device_bin_size_+ j].sum_gradients;
         old_histogram_array[j].sum_hessians = hist_outputs[i * device_bin_size_ + j].sum_hessians;
-        old_histogram_array[j].cnt = hist_outputs[i * device_bin_size_ + j].cnt;
+        old_histogram_array[j].cnt = (data_size_t)hist_outputs[i * device_bin_size_ + j].cnt;
       }
     }
     else {
@@ -224,7 +224,7 @@ void GPUTreeLearner::WaitAndGetHistograms(HistogramBinEntry* histograms, const s
         }
         old_histogram_array[j].sum_gradients = sum_g;
         old_histogram_array[j].sum_hessians = sum_h;
-        old_histogram_array[j].cnt = cnt;
+        old_histogram_array[j].cnt = (data_size_t)cnt;
       }
     }
   }
@@ -323,11 +323,12 @@ void GPUTreeLearner::AllocateGPUMemory() {
   device_histogram_outputs_ = boost::compute::buffer(ctx_, num_dense_feature4_ * dword_features_ * device_bin_size_ * hist_bin_entry_sz_,
                               boost::compute::memory_object::write_only | boost::compute::memory_object::alloc_host_ptr, nullptr);
   // find the dense feature-groups and group then into Feature4 data structure (several feature-groups packed into 4 bytes)
-  int i, k, copied_feature4 = 0, dense_ind[dword_features_];
-  for (i = 0, k = 0; i < num_feature_groups_; ++i) {
+  int k = 0, copied_feature4 = 0;
+  std::vector<int> dense_dword_ind(dword_features_);
+  for (int i = 0; i < num_feature_groups_; ++i) {
     // looking for dword_features_ non-sparse feature-groups
     if (ordered_bins_[i] == nullptr) {
-      dense_ind[k] = i;
+      dense_dword_ind[k] = i;
       // decide if we need to redistribute the bin
       double t = device_bin_size_ / (double)train_data_->FeatureGroupNumBin(i);
       // multiplier must be a power of 2
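
The removed `int dense_ind[dword_features_]` is a variable-length array, a GCC/Clang extension that MSVC rejects outright (and that `-pedantic` flags); `std::vector` supplies the same runtime-sized storage portably, with identical indexing syntax. A hedged sketch of the substitution (names illustrative):

    #include <vector>

    // MSVC has no VLAs: `int buf[n]` with a runtime n does not compile.
    // std::vector is the portable replacement; element access is unchanged.
    void collect(int dword_features) {
      std::vector<int> dense_dword_ind(dword_features);  // sized at runtime
      for (int k = 0; k < dword_features; ++k) {
        dense_dword_ind[k] = k;  // placeholder fill; the real code stores group ids
      }
    }
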
@@ -345,7 +346,7 @@ void GPUTreeLearner::AllocateGPUMemory() {
       if (k == dword_features_) {
         k = 0;
         for (int j = 0; j < dword_features_; ++j) {
-          dense_feature_group_map_.push_back(dense_ind[j]);
+          dense_feature_group_map_.push_back(dense_dword_ind[j]);
         }
         copied_feature4++;
       }
@@ -369,7 +370,7 @@ void GPUTreeLearner::AllocateGPUMemory() {
   }
   // building Feature4 bundles; each thread handles dword_features_ features
   #pragma omp parallel for schedule(static)
-  for (unsigned int i = 0; i < dense_feature_group_map_.size() / dword_features_; ++i) {
+  for (int i = 0; i < (int)(dense_feature_group_map_.size() / dword_features_); ++i) {
     int tid = omp_get_thread_num();
     Feature4* host4 = host4_ptrs[tid];
     auto dense_ind = dense_feature_group_map_.begin() + i * dword_features_;
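
The loop index changes from `unsigned int` to `int` because MSVC implements OpenMP 2.0, which requires a signed integral index in a `parallel for` (error C3016 otherwise); the unsigned `size()` bound is therefore cast to `int` for the comparison. A minimal sketch (function and data illustrative):

    #include <vector>

    // MSVC's OpenMP 2.0 support rejects unsigned loop indices in a
    // `parallel for` (error C3016), so the container size is cast to a
    // signed bound.
    void process_all(const std::vector<double>& items) {
      #pragma omp parallel for schedule(static)
      for (int i = 0; i < (int)items.size(); ++i) {
        volatile double x = items[i] * 2.0;  // placeholder per-element work
        (void)x;
      }
    }
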
@@ -401,14 +402,14 @@ void GPUTreeLearner::AllocateGPUMemory() {
                                  *static_cast<Dense4bitsBinIterator*>(bin_iters[6]),
                                  *static_cast<Dense4bitsBinIterator*>(bin_iters[7])};
       for (int j = 0; j < num_data_; ++j) {
-        host4[j].s0 = (iters[0].RawGet(j) * dev_bin_mult[0] + ((j+0) & (dev_bin_mult[0] - 1)))
-                     |((iters[1].RawGet(j) * dev_bin_mult[1] + ((j+1) & (dev_bin_mult[1] - 1))) << 4);
-        host4[j].s1 = (iters[2].RawGet(j) * dev_bin_mult[2] + ((j+2) & (dev_bin_mult[2] - 1)))
-                     |((iters[3].RawGet(j) * dev_bin_mult[3] + ((j+3) & (dev_bin_mult[3] - 1))) << 4);
-        host4[j].s2 = (iters[4].RawGet(j) * dev_bin_mult[4] + ((j+4) & (dev_bin_mult[4] - 1)))
-                     |((iters[5].RawGet(j) * dev_bin_mult[5] + ((j+5) & (dev_bin_mult[5] - 1))) << 4);
-        host4[j].s3 = (iters[6].RawGet(j) * dev_bin_mult[6] + ((j+6) & (dev_bin_mult[6] - 1)))
-                     |((iters[7].RawGet(j) * dev_bin_mult[7] + ((j+7) & (dev_bin_mult[7] - 1))) << 4);
+        host4[j].s[0] = (uint8_t)((iters[0].RawGet(j) * dev_bin_mult[0] + ((j+0) & (dev_bin_mult[0] - 1)))
+                       |((iters[1].RawGet(j) * dev_bin_mult[1] + ((j+1) & (dev_bin_mult[1] - 1))) << 4));
+        host4[j].s[1] = (uint8_t)((iters[2].RawGet(j) * dev_bin_mult[2] + ((j+2) & (dev_bin_mult[2] - 1)))
+                       |((iters[3].RawGet(j) * dev_bin_mult[3] + ((j+3) & (dev_bin_mult[3] - 1))) << 4));
+        host4[j].s[2] = (uint8_t)((iters[4].RawGet(j) * dev_bin_mult[4] + ((j+4) & (dev_bin_mult[4] - 1)))
+                       |((iters[5].RawGet(j) * dev_bin_mult[5] + ((j+5) & (dev_bin_mult[5] - 1))) << 4));
+        host4[j].s[3] = (uint8_t)((iters[6].RawGet(j) * dev_bin_mult[6] + ((j+6) & (dev_bin_mult[6] - 1)))
+                       |((iters[7].RawGet(j) * dev_bin_mult[7] + ((j+7) & (dev_bin_mult[7] - 1))) << 4));
       }
     }
     else if (dword_features_ == 4) {
@@ -420,14 +421,14 @@ void GPUTreeLearner::AllocateGPUMemory() {
           // Dense bin
           DenseBinIterator<uint8_t> iter = *static_cast<DenseBinIterator<uint8_t>*>(bin_iter);
           for (int j = 0; j < num_data_; ++j) {
-            host4[j].s[s_idx] = iter.RawGet(j) * dev_bin_mult[s_idx] + ((j+s_idx) & (dev_bin_mult[s_idx] - 1));
+            host4[j].s[s_idx] = (uint8_t)(iter.RawGet(j) * dev_bin_mult[s_idx] + ((j+s_idx) & (dev_bin_mult[s_idx] - 1)));
           }
         }
         else if (dynamic_cast<Dense4bitsBinIterator*>(bin_iter) != 0) {
           // Dense 4-bit bin
           Dense4bitsBinIterator iter = *static_cast<Dense4bitsBinIterator*>(bin_iter);
           for (int j = 0; j < num_data_; ++j) {
-            host4[j].s[s_idx] = iter.RawGet(j) * dev_bin_mult[s_idx] + ((j+s_idx) & (dev_bin_mult[s_idx] - 1));
+            host4[j].s[s_idx] = (uint8_t)(iter.RawGet(j) * dev_bin_mult[s_idx] + ((j+s_idx) & (dev_bin_mult[s_idx] - 1)));
           }
         }
         else {
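
Besides renaming `s0`..`s3` to `s[0]`..`s[3]`, each packed byte gains an explicit `(uint8_t)` cast: bitwise operators promote their `uint8_t` operands to `int`, so storing the result back into a byte is an implicit narrowing that MSVC reports. A small sketch of packing two 4-bit bin values into one byte (values illustrative):

    #include <cstdint>

    // Bitwise ops on uint8_t promote to int; assigning the int result back
    // to a byte narrows implicitly, which MSVC warns about. The cast makes
    // the narrowing explicit.
    uint8_t pack_two_4bit_bins(int lo_bin, int hi_bin) {
      return (uint8_t)((lo_bin & 0xf) | ((hi_bin & 0xf) << 4));
    }
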
@@ -458,38 +459,38 @@ void GPUTreeLearner::AllocateGPUMemory() {
 #if GPU_DEBUG >= 1
     printf("%d features left\n", k);
 #endif
-    for (i = 0; i < k; ++i) {
+    for (int i = 0; i < k; ++i) {
       if (dword_features_ == 8) {
-        BinIterator* bin_iter = train_data_->FeatureGroupIterator(dense_ind[i]);
+        BinIterator* bin_iter = train_data_->FeatureGroupIterator(dense_dword_ind[i]);
         if (dynamic_cast<Dense4bitsBinIterator*>(bin_iter) != 0) {
           Dense4bitsBinIterator iter = *static_cast<Dense4bitsBinIterator*>(bin_iter);
           #pragma omp parallel for schedule(static)
           for (int j = 0; j < num_data_; ++j) {
-            host4[j].s[i >> 1] |= ((iter.RawGet(j) * device_bin_mults_[copied_feature4 * dword_features_ + i]
+            host4[j].s[i >> 1] |= (uint8_t)((iter.RawGet(j) * device_bin_mults_[copied_feature4 * dword_features_ + i]
                                   + ((j+i) & (device_bin_mults_[copied_feature4 * dword_features_ + i] - 1)))
                                   << ((i & 1) << 2));
           }
         }
         else {
-          Log::Fatal("GPU tree learner assumes that all bins are Dense4bitsBin when num_bin <= 16, but feature %d is not.", dense_ind[i]);
+          Log::Fatal("GPU tree learner assumes that all bins are Dense4bitsBin when num_bin <= 16, but feature %d is not.", dense_dword_ind[i]);
         }
       }
       else if (dword_features_ == 4) {
-        BinIterator* bin_iter = train_data_->FeatureGroupIterator(dense_ind[i]);
+        BinIterator* bin_iter = train_data_->FeatureGroupIterator(dense_dword_ind[i]);
         if (dynamic_cast<DenseBinIterator<uint8_t>*>(bin_iter) != 0) {
           DenseBinIterator<uint8_t> iter = *static_cast<DenseBinIterator<uint8_t>*>(bin_iter);
           #pragma omp parallel for schedule(static)
           for (int j = 0; j < num_data_; ++j) {
-            host4[j].s[i] = iter.RawGet(j) * device_bin_mults_[copied_feature4 * dword_features_ + i]
-                           + ((j+i) & (device_bin_mults_[copied_feature4 * dword_features_ + i] - 1));
+            host4[j].s[i] = (uint8_t)(iter.RawGet(j) * device_bin_mults_[copied_feature4 * dword_features_ + i]
+                           + ((j+i) & (device_bin_mults_[copied_feature4 * dword_features_ + i] - 1)));
           }
         }
         else if (dynamic_cast<Dense4bitsBinIterator*>(bin_iter) != 0) {
           Dense4bitsBinIterator iter = *static_cast<Dense4bitsBinIterator*>(bin_iter);
           #pragma omp parallel for schedule(static)
           for (int j = 0; j < num_data_; ++j) {
-            host4[j].s[i] = iter.RawGet(j) * device_bin_mults_[copied_feature4 * dword_features_ + i]
-                           + ((j+i) & (device_bin_mults_[copied_feature4 * dword_features_ + i] - 1));
+            host4[j].s[i] = (uint8_t)(iter.RawGet(j) * device_bin_mults_[copied_feature4 * dword_features_ + i]
+                           + ((j+i) & (device_bin_mults_[copied_feature4 * dword_features_ + i] - 1)));
           }
         }
         else {
@@ -504,18 +505,18 @@ void GPUTreeLearner::AllocateGPUMemory() {
     if (dword_features_ == 8) {
       #pragma omp parallel for schedule(static)
       for (int j = 0; j < num_data_; ++j) {
-        for (i = k; i < dword_features_; ++i) {
+        for (int i = k; i < dword_features_; ++i) {
           // fill this empty feature with some "random" value
-          host4[j].s[i >> 1] |= ((j & 0xf) << ((i & 1) << 2));
+          host4[j].s[i >> 1] |= (uint8_t)((j & 0xf) << ((i & 1) << 2));
         }
       }
     }
     else if (dword_features_ == 4) {
       #pragma omp parallel for schedule(static)
       for (int j = 0; j < num_data_; ++j) {
-        for (i = k; i < dword_features_; ++i) {
+        for (int i = k; i < dword_features_; ++i) {
           // fill this empty feature with some "random" value
-          host4[j].s[i] = j;
+          host4[j].s[i] = (uint8_t)j;
         }
       }
     }
@@ -525,8 +526,8 @@ void GPUTreeLearner::AllocateGPUMemory() {
 #if GPU_DEBUG >= 1
     printf("Last features copied to device\n");
 #endif
-    for (i = 0; i < k; ++i) {
-      dense_feature_group_map_.push_back(dense_ind[i]);
+    for (int i = 0; i < k; ++i) {
+      dense_feature_group_map_.push_back(dense_dword_ind[i]);
     }
   }
   // deallocate pinned space for feature copying
@@ -542,12 +543,12 @@ void GPUTreeLearner::AllocateGPUMemory() {
              end_time * 1e-3, sparse_feature_group_map_.size());
 #if GPU_DEBUG >= 1
   printf("Dense feature group list (size %lu): ", dense_feature_group_map_.size());
-  for (i = 0; i < num_dense_feature_groups_; ++i) {
+  for (int i = 0; i < num_dense_feature_groups_; ++i) {
     printf("%d ", dense_feature_group_map_[i]);
   }
   printf("\n");
   printf("Sparse feature group list (size %lu): ", sparse_feature_group_map_.size());
-  for (i = 0; i < num_feature_groups_ - num_dense_feature_groups_; ++i) {
+  for (int i = 0; i < num_feature_groups_ - num_dense_feature_groups_; ++i) {
     printf("%d ", sparse_feature_group_map_[i]);
   }
   printf("\n");
@@ -584,10 +585,10 @@ void GPUTreeLearner::BuildGPUKernels() {
     }
     catch (boost::compute::opencl_error &e) {
       if (program.build_log().size() > 0) {
-        Log::Fatal("GPU program built failure:\n %s", program.build_log().c_str());
+        Log::Fatal("GPU program built failure: %s\n %s", e.what(), program.build_log().c_str());
       }
       else {
-        Log::Fatal("GPU program built failure, log unavailable");
+        Log::Fatal("GPU program built failure: %s\nlog unavailable", e.what());
      }
    }
    histogram_kernels_[i] = program.create_kernel(kernel_name_);
@@ -599,10 +600,10 @@ void GPUTreeLearner::BuildGPUKernels() {
     }
     catch (boost::compute::opencl_error &e) {
       if (program.build_log().size() > 0) {
-        Log::Fatal("GPU program built failure:\n %s", program.build_log().c_str());
+        Log::Fatal("GPU program built failure: %s\n %s", e.what(), program.build_log().c_str());
       }
       else {
-        Log::Fatal("GPU program built failure, log unavailable");
+        Log::Fatal("GPU program built failure: %s\nlog unavailable", e.what());
      }
    }
    histogram_allfeats_kernels_[i] = program.create_kernel(kernel_name_);
@@ -614,10 +615,10 @@ void GPUTreeLearner::BuildGPUKernels() {
     }
     catch (boost::compute::opencl_error &e) {
       if (program.build_log().size() > 0) {
-        Log::Fatal("GPU program built failure:\n %s", program.build_log().c_str());
+        Log::Fatal("GPU program built failure: %s\n %s", e.what(), program.build_log().c_str());
       }
       else {
-        Log::Fatal("GPU program built failure, log unavailable");
+        Log::Fatal("GPU program built failure: %s\nlog unavailable", e.what());
      }
    }
    histogram_fulldata_kernels_[i] = program.create_kernel(kernel_name_);
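
All three kernel-build sites now include `e.what()` in the fatal message, so the OpenCL error text is reported even when the build log is empty. A hedged sketch of the pattern using the boost::compute API (helper name illustrative):

    #include <boost/compute/core.hpp>
    #include <boost/compute/exception.hpp>
    #include <cstdio>
    #include <string>

    // Report both the OpenCL error text and the compiler's build log; the
    // log alone can be empty when the failure happens before the device
    // compiler produces any output.
    void build_or_report(boost::compute::program &program, const std::string &options) {
      try {
        program.build(options);
      } catch (boost::compute::opencl_error &e) {
        const std::string log = program.build_log();
        std::fprintf(stderr, "GPU program build failure: %s\n%s\n",
                     e.what(), log.empty() ? "log unavailable" : log.c_str());
      }
    }
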
@@ -979,11 +980,11 @@ void GPUTreeLearner::ConstructHistograms(const std::vector<int8_t>& is_feature_u
   if (is_gpu_used) {
     if (tree_config_->gpu_use_dp) {
       // use double precision
-      WaitAndGetHistograms<HistogramBinEntry>(ptr_smaller_leaf_hist_data, is_feature_used);
+      WaitAndGetHistograms<HistogramBinEntry>(ptr_smaller_leaf_hist_data);
     }
     else {
       // use single precision
-      WaitAndGetHistograms<GPUHistogramBinEntry>(ptr_smaller_leaf_hist_data, is_feature_used);
+      WaitAndGetHistograms<GPUHistogramBinEntry>(ptr_smaller_leaf_hist_data);
     }
   }
@@ -1033,11 +1034,11 @@ void GPUTreeLearner::ConstructHistograms(const std::vector<int8_t>& is_feature_u
   if (is_gpu_used) {
     if (tree_config_->gpu_use_dp) {
       // use double precision
-      WaitAndGetHistograms<HistogramBinEntry>(ptr_larger_leaf_hist_data, is_feature_used);
+      WaitAndGetHistograms<HistogramBinEntry>(ptr_larger_leaf_hist_data);
     }
     else {
       // use single precision
-      WaitAndGetHistograms<GPUHistogramBinEntry>(ptr_larger_leaf_hist_data, is_feature_used);
+      WaitAndGetHistograms<GPUHistogramBinEntry>(ptr_larger_leaf_hist_data);
     }
   }
 }
...
@@ -64,15 +64,7 @@ protected:
 private:
   /*! \brief 4-byte feature tuple used by GPU kernels */
   struct Feature4 {
-    union {
-      unsigned char s[4];
-      struct {
-        unsigned char s0;
-        unsigned char s1;
-        unsigned char s2;
-        unsigned char s3;
-      };
-    };
+    uint8_t s[4];
   };
   /*! \brief Single precision histogram entiry for GPU */
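
Replacing the union-with-anonymous-struct by a bare array fixes a -pedantic diagnostic: ISO C++ has no anonymous struct members (they are a compiler extension), and reading `s[4]` after writing `s0`..`s3` relies on type punning through an inactive union member anyway. Indexing one array expresses the same 4-byte layout within the standard. A minimal sketch of before and after (field names as in the diff):

    #include <cstdint>

    // Non-standard form (extension; -pedantic warns): an anonymous struct
    // inside a union, with s0..s3 aliasing the array elements.
    //   struct Feature4 {
    //     union {
    //       uint8_t s[4];
    //       struct { uint8_t s0, s1, s2, s3; };
    //     };
    //   };
    // Standard form: one array, accessed as s[0]..s[3].
    struct Feature4 {
      uint8_t s[4];
    };

    static_assert(sizeof(Feature4) == 4, "feature tuple must stay 4 bytes");
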
@@ -123,10 +115,9 @@ private:
   /*!
    * \brief Wait for GPU kernel execution and read histogram
    * \param histograms Destination of histogram results from GPU.
-   * \param is_feature_used A predicate vector for enabling each feature
    */
   template <typename HistType>
-  void WaitAndGetHistograms(HistogramBinEntry* histograms, const std::vector<int8_t>& is_feature_used);
+  void WaitAndGetHistograms(HistogramBinEntry* histograms);
   /*!
    * \brief Construct GPU histogram asynchronously.
@@ -173,7 +164,7 @@ private:
   const char *kernel64_src_ =
 #include "ocl/histogram64.cl"
   ;
-  /*! \brief GPU kernel for 64 bins */
+  /*! \brief GPU kernel for 16 bins */
   const char *kernel16_src_ =
 #include "ocl/histogram16.cl"
   ;
...
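
The header change mirrors the .cpp: `is_feature_used` is dropped from WaitAndGetHistograms, apparently because the implementation no longer reads it, which MSVC flags as an unreferenced formal parameter (warning C4100 at /W4). Where a parameter must remain for interface compatibility, leaving it unnamed is the usual alternative; a hedged sketch (function name illustrative):

    // Two ways to avoid C4100 ("unreferenced formal parameter"):
    // 1. remove the parameter outright, as this commit does, or
    // 2. keep the slot but omit the name when the signature is fixed:
    void on_histogram_ready(int /*reserved_flags*/) {
      // handler body uses no flags yet
    }
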
@@ -149,6 +149,10 @@ inline void atomic_local_add_f(__local acc_type *addr, const float val)
 #endif
 }
+/* Makes MSVC happy with long string literal
+)""
+R""(
+*/
 // this function will be called by histogram16
 // we have one sub-histogram of one feature in registers, and need to read others
 void within_kernel_reduction16x8(uchar8 feature_mask,
@@ -204,6 +208,11 @@ void within_kernel_reduction16x8(uchar8 feature_mask,
 }
+/* Makes MSVC happy with long string literal
+)""
+R""(
+*/
 __attribute__((reqd_work_group_size(LOCAL_SIZE_0, 1, 1)))
 #if USE_CONSTANT_BUF == 1
 __kernel void histogram16(__global const uchar4* restrict feature_data_base,
@@ -379,6 +388,10 @@ __kernel void histogram16(__global const uchar4* feature_data_base,
 #endif
     feature4 = feature_data[ind];
+/* Makes MSVC happy with long string literal
+)""
+R""(
+*/
     // there are 2^POWER_FEATURE_WORKGROUPS workgroups processing each feature4
     for (uint i = subglobal_tid; i < num_data; i += subglobal_size) {
         // prefetch the next iteration variables
@@ -601,6 +614,11 @@ __kernel void histogram16(__global const uchar4* feature_data_base,
         feature4 = feature4_next;
     }
     barrier(CLK_LOCAL_MEM_FENCE);
+/* Makes MSVC happy with long string literal
+)""
+R""(
+*/
 #if ENABLE_ALL_FEATURES == 0
     // restore feature_mask
...
@@ -125,6 +125,10 @@ inline void atomic_local_add_f(__local acc_type *addr, const float val)
 #endif
 }
+/* Makes MSVC happy with long string literal
+)""
+R""(
+*/
 // this function will be called by histogram256
 // we have one sub-histogram of one feature in local memory, and need to read others
 void within_kernel_reduction256x4(uchar4 feature_mask,
@@ -332,8 +336,10 @@ void within_kernel_reduction256x4(uchar4 feature_mask,
 #endif
 }
-#define printf
+/* Makes MSVC happy with long string literal
+)""
+R""(
+*/
 __attribute__((reqd_work_group_size(LOCAL_SIZE_0, 1, 1)))
 #if USE_CONSTANT_BUF == 1
 __kernel void histogram256(__global const uchar4* restrict feature_data_base,
@@ -453,6 +459,11 @@ __kernel void histogram256(__global const uchar4* feature_data_base,
     acc_type s1_stat1 = 0.0f, s1_stat2 = 0.0f;
     acc_type s0_stat1 = 0.0f, s0_stat2 = 0.0f;
+/* Makes MSVC happy with long string literal
+)""
+R""(
+*/
     // there are 2^POWER_FEATURE_WORKGROUPS workgroups processing each feature4
     for (uint i = subglobal_tid; i < num_data; i += subglobal_size) {
         // prefetch the next iteration variables
@@ -659,6 +670,10 @@ __kernel void histogram256(__global const uchar4* feature_data_base,
 #endif
     barrier(CLK_LOCAL_MEM_FENCE);
+/* Makes MSVC happy with long string literal
+)""
+R""(
+*/
 #if ENABLE_ALL_FEATURES == 0
     // restore feature_mask
     feature_mask = feature_masks[group_feature];
...
@@ -142,6 +142,11 @@ inline void atomic_local_add_f(__local acc_type *addr, const float val)
 #endif
 }
+/* Makes MSVC happy with long string literal
+)""
+R""(
+*/
 // this function will be called by histogram64
 // we have one sub-histogram of one feature in registers, and need to read others
 void within_kernel_reduction64x4(uchar4 feature_mask,
@@ -199,8 +204,10 @@ void within_kernel_reduction64x4(uchar4 feature_mask,
     }
 }
-#define printf
+/* Makes MSVC happy with long string literal
+)""
+R""(
+*/
 __attribute__((reqd_work_group_size(LOCAL_SIZE_0, 1, 1)))
 #if USE_CONSTANT_BUF == 1
 __kernel void histogram64(__global const uchar4* restrict feature_data_base,
@@ -360,6 +367,10 @@ __kernel void histogram64(__global const uchar4* feature_data_base,
     acc_type s1_stat1 = 0.0f, s1_stat2 = 0.0f;
     acc_type s0_stat1 = 0.0f, s0_stat2 = 0.0f;
+/* Makes MSVC happy with long string literal
+)""
+R""(
+*/
     // there are 2^POWER_FEATURE_WORKGROUPS workgroups processing each feature4
     for (uint i = subglobal_tid; i < num_data; i += subglobal_size) {
         // prefetch the next iteration variables
@@ -567,6 +578,10 @@ __kernel void histogram64(__global const uchar4* feature_data_base,
 #endif
     barrier(CLK_LOCAL_MEM_FENCE);
+/* Makes MSVC happy with long string literal
+)""
+R""(
+*/
 #if ENABLE_ALL_FEATURES == 0
     // restore feature_mask
     feature_mask = feature_masks[group_feature];
...