Commit 8487d0a2 authored by Huan Zhang, committed by Guolin Ke

Fix compilation problems with MSVC (#443)

* Fix warnings when compiled with -pedantic

* add -DBOOST_ALL_NO_LIB for windows build

* fix some more MSVC warnings

* Break the OpenCL string literals into smaller pieces to avoid MSVC error C2026 (the mechanism is sketched below)

The string was longer than the limit of 16380 single-byte characters.

This affects Visual Studio 2005 - 2015. Untested on VS 2017.
parent 79c7c09d
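
The string-literal fix relies on two standard C++ rules: adjacent string literals are concatenated, and a raw string literal can be closed and reopened at any top-level point in its content. Because the .cl kernels are included into the host code as one large raw string, inserting `)"" R""(` inside an OpenCL block comment splits the oversized literal into smaller adjacent pieces without changing the concatenated source the OpenCL compiler receives. A minimal sketch of the mechanism (the kernel body is illustrative, not taken from the commit):

    #include <cstdio>

    // Two adjacent raw string literals (the delimiter is a single '"'
    // character, as in the .cl files). The compiler concatenates them, so the
    // OpenCL compiler still sees one source, but MSVC never parses a single
    // literal longer than its 16380-character limit (error C2026). The split
    // point sits inside /* ... */ so the same text also compiles as plain
    // OpenCL C.
    const char *kernel_src =
    R""(
    __kernel void noop() { /* comment opens in the first chunk
    )""
    R""(
    */ }
    )"";

    int main() {
      std::printf("%s\n", kernel_src);
      return 0;
    }
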
@@ -43,6 +43,10 @@ if(USE_GPU)
     MESSAGE(STATUS "OpenCL include directory:" ${OpenCL_INCLUDE_DIRS})
     find_package(Boost 1.56.0 COMPONENTS filesystem system REQUIRED)
     include_directories(${Boost_INCLUDE_DIRS})
+    if (WIN32)
+        # disable autolinking in boost
+        ADD_DEFINITIONS(-DBOOST_ALL_NO_LIB)
+    endif()
     ADD_DEFINITIONS(-DUSE_GPU)
 endif(USE_GPU)
...
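
`-DBOOST_ALL_NO_LIB` disables Boost's auto-linking on MSVC, where the headers emit `#pragma comment(lib, ...)` directives whose mangled library names often do not match what CMake actually built or found; with the define, only the libraries `find_package` hands to the linker are used. A hedged sketch of what the define suppresses (the library name in the comment is illustrative):

    // On MSVC, Boost's auto_link machinery normally injects directives like
    //   #pragma comment(lib, "libboost_filesystem-vc140-mt-1_63.lib")
    // from the headers themselves. Defining BOOST_ALL_NO_LIB before including
    // any Boost header turns that off, leaving linking entirely to the build
    // system (here, CMake's find_package results).
    #define BOOST_ALL_NO_LIB 1
    #include <boost/filesystem.hpp>

    int main() {
      return boost::filesystem::exists(".") ? 0 : 1;
    }
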
@@ -104,7 +104,7 @@ int GPUTreeLearner::GetNumWorkgroupsPerFeature(data_size_t leaf_num_data) {
   // we roughly want 256 workgroups per device, and we have num_dense_feature4_ feature tuples.
   // also guarantee that there are at least 2K examples per workgroup
   double x = 256.0 / num_dense_feature4_;
-  int exp_workgroups_per_feature = ceil(log2(x));
+  int exp_workgroups_per_feature = (int)ceil(log2(x));
   double t = leaf_num_data / 1024.0;
 #if GPU_DEBUG >= 4
   printf("Computing histogram for %d examples and (%d * %d) feature groups\n", leaf_num_data, dword_features_, num_dense_feature4_);
@@ -191,7 +191,7 @@ void GPUTreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_featur
 }
 template <typename HistType>
-void GPUTreeLearner::WaitAndGetHistograms(HistogramBinEntry* histograms, const std::vector<int8_t>& is_feature_used) {
+void GPUTreeLearner::WaitAndGetHistograms(HistogramBinEntry* histograms) {
   HistType* hist_outputs = (HistType*) host_histogram_outputs_;
   // when the output is ready, the computation is done
   histograms_wait_obj_.wait();
@@ -207,7 +207,7 @@ void GPUTreeLearner::WaitAndGetHistograms(HistogramBinEntry* histograms, const s
       for (int j = 0; j < bin_size; ++j) {
         old_histogram_array[j].sum_gradients = hist_outputs[i * device_bin_size_+ j].sum_gradients;
         old_histogram_array[j].sum_hessians = hist_outputs[i * device_bin_size_ + j].sum_hessians;
-        old_histogram_array[j].cnt = hist_outputs[i * device_bin_size_ + j].cnt;
+        old_histogram_array[j].cnt = (data_size_t)hist_outputs[i * device_bin_size_ + j].cnt;
       }
     }
     else {
@@ -224,7 +224,7 @@ void GPUTreeLearner::WaitAndGetHistograms(HistogramBinEntry* histograms, const s
         }
         old_histogram_array[j].sum_gradients = sum_g;
         old_histogram_array[j].sum_hessians = sum_h;
-        old_histogram_array[j].cnt = cnt;
+        old_histogram_array[j].cnt = (data_size_t)cnt;
       }
     }
   }
@@ -323,11 +323,12 @@ void GPUTreeLearner::AllocateGPUMemory() {
   device_histogram_outputs_ = boost::compute::buffer(ctx_, num_dense_feature4_ * dword_features_ * device_bin_size_ * hist_bin_entry_sz_,
                               boost::compute::memory_object::write_only | boost::compute::memory_object::alloc_host_ptr, nullptr);
   // find the dense feature-groups and group then into Feature4 data structure (several feature-groups packed into 4 bytes)
-  int i, k, copied_feature4 = 0, dense_ind[dword_features_];
-  for (i = 0, k = 0; i < num_feature_groups_; ++i) {
+  int k = 0, copied_feature4 = 0;
+  std::vector<int> dense_dword_ind(dword_features_);
+  for (int i = 0; i < num_feature_groups_; ++i) {
     // looking for dword_features_ non-sparse feature-groups
     if (ordered_bins_[i] == nullptr) {
-      dense_ind[k] = i;
+      dense_dword_ind[k] = i;
       // decide if we need to redistribute the bin
       double t = device_bin_size_ / (double)train_data_->FeatureGroupNumBin(i);
       // multiplier must be a power of 2
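
The removed `int dense_ind[dword_features_]` is a variable-length array, a GCC/Clang extension that MSVC rejects outright (and that `-pedantic` flags); `std::vector` supplies the same runtime-sized storage portably, with identical indexing syntax. A hedged sketch of the substitution (names illustrative):

    #include <vector>

    // MSVC has no VLAs: `int buf[n]` with a runtime n does not compile.
    // std::vector is the portable replacement; element access is unchanged.
    void collect(int dword_features) {
      std::vector<int> dense_dword_ind(dword_features);  // sized at runtime
      for (int k = 0; k < dword_features; ++k) {
        dense_dword_ind[k] = k;  // placeholder fill; the real code stores group ids
      }
    }
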
@@ -345,7 +346,7 @@ void GPUTreeLearner::AllocateGPUMemory() {
       if (k == dword_features_) {
         k = 0;
         for (int j = 0; j < dword_features_; ++j) {
-          dense_feature_group_map_.push_back(dense_ind[j]);
+          dense_feature_group_map_.push_back(dense_dword_ind[j]);
         }
         copied_feature4++;
       }
@@ -369,7 +370,7 @@ void GPUTreeLearner::AllocateGPUMemory() {
   }
   // building Feature4 bundles; each thread handles dword_features_ features
   #pragma omp parallel for schedule(static)
-  for (unsigned int i = 0; i < dense_feature_group_map_.size() / dword_features_; ++i) {
+  for (int i = 0; i < (int)(dense_feature_group_map_.size() / dword_features_); ++i) {
     int tid = omp_get_thread_num();
     Feature4* host4 = host4_ptrs[tid];
     auto dense_ind = dense_feature_group_map_.begin() + i * dword_features_;
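
The loop index changes from `unsigned int` to `int` because MSVC implements OpenMP 2.0, which requires a signed integral index in a `parallel for` (error C3016 otherwise); the unsigned `size()` bound is therefore cast to `int` for the comparison. A minimal sketch (function and data illustrative):

    #include <vector>

    // MSVC's OpenMP 2.0 support rejects unsigned loop indices in a
    // `parallel for` (error C3016), so the container size is cast to a
    // signed bound.
    void process_all(const std::vector<double>& items) {
      #pragma omp parallel for schedule(static)
      for (int i = 0; i < (int)items.size(); ++i) {
        volatile double x = items[i] * 2.0;  // placeholder per-element work
        (void)x;
      }
    }
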
@@ -401,14 +402,14 @@ void GPUTreeLearner::AllocateGPUMemory() {
                                  *static_cast<Dense4bitsBinIterator*>(bin_iters[6]),
                                  *static_cast<Dense4bitsBinIterator*>(bin_iters[7])};
       for (int j = 0; j < num_data_; ++j) {
-        host4[j].s0 = (iters[0].RawGet(j) * dev_bin_mult[0] + ((j+0) & (dev_bin_mult[0] - 1)))
-                     |((iters[1].RawGet(j) * dev_bin_mult[1] + ((j+1) & (dev_bin_mult[1] - 1))) << 4);
-        host4[j].s1 = (iters[2].RawGet(j) * dev_bin_mult[2] + ((j+2) & (dev_bin_mult[2] - 1)))
-                     |((iters[3].RawGet(j) * dev_bin_mult[3] + ((j+3) & (dev_bin_mult[3] - 1))) << 4);
-        host4[j].s2 = (iters[4].RawGet(j) * dev_bin_mult[4] + ((j+4) & (dev_bin_mult[4] - 1)))
-                     |((iters[5].RawGet(j) * dev_bin_mult[5] + ((j+5) & (dev_bin_mult[5] - 1))) << 4);
-        host4[j].s3 = (iters[6].RawGet(j) * dev_bin_mult[6] + ((j+6) & (dev_bin_mult[6] - 1)))
-                     |((iters[7].RawGet(j) * dev_bin_mult[7] + ((j+7) & (dev_bin_mult[7] - 1))) << 4);
+        host4[j].s[0] = (uint8_t)((iters[0].RawGet(j) * dev_bin_mult[0] + ((j+0) & (dev_bin_mult[0] - 1)))
+                       |((iters[1].RawGet(j) * dev_bin_mult[1] + ((j+1) & (dev_bin_mult[1] - 1))) << 4));
+        host4[j].s[1] = (uint8_t)((iters[2].RawGet(j) * dev_bin_mult[2] + ((j+2) & (dev_bin_mult[2] - 1)))
+                       |((iters[3].RawGet(j) * dev_bin_mult[3] + ((j+3) & (dev_bin_mult[3] - 1))) << 4));
+        host4[j].s[2] = (uint8_t)((iters[4].RawGet(j) * dev_bin_mult[4] + ((j+4) & (dev_bin_mult[4] - 1)))
+                       |((iters[5].RawGet(j) * dev_bin_mult[5] + ((j+5) & (dev_bin_mult[5] - 1))) << 4));
+        host4[j].s[3] = (uint8_t)((iters[6].RawGet(j) * dev_bin_mult[6] + ((j+6) & (dev_bin_mult[6] - 1)))
+                       |((iters[7].RawGet(j) * dev_bin_mult[7] + ((j+7) & (dev_bin_mult[7] - 1))) << 4));
       }
     }
     else if (dword_features_ == 4) {
@@ -420,14 +421,14 @@ void GPUTreeLearner::AllocateGPUMemory() {
           // Dense bin
           DenseBinIterator<uint8_t> iter = *static_cast<DenseBinIterator<uint8_t>*>(bin_iter);
           for (int j = 0; j < num_data_; ++j) {
-            host4[j].s[s_idx] = iter.RawGet(j) * dev_bin_mult[s_idx] + ((j+s_idx) & (dev_bin_mult[s_idx] - 1));
+            host4[j].s[s_idx] = (uint8_t)(iter.RawGet(j) * dev_bin_mult[s_idx] + ((j+s_idx) & (dev_bin_mult[s_idx] - 1)));
           }
         }
         else if (dynamic_cast<Dense4bitsBinIterator*>(bin_iter) != 0) {
           // Dense 4-bit bin
           Dense4bitsBinIterator iter = *static_cast<Dense4bitsBinIterator*>(bin_iter);
           for (int j = 0; j < num_data_; ++j) {
-            host4[j].s[s_idx] = iter.RawGet(j) * dev_bin_mult[s_idx] + ((j+s_idx) & (dev_bin_mult[s_idx] - 1));
+            host4[j].s[s_idx] = (uint8_t)(iter.RawGet(j) * dev_bin_mult[s_idx] + ((j+s_idx) & (dev_bin_mult[s_idx] - 1)));
           }
         }
         else {
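
Besides renaming `s0`..`s3` to `s[0]`..`s[3]`, each packed byte gains an explicit `(uint8_t)` cast: bitwise operators promote their `uint8_t` operands to `int`, so storing the result back into a byte is an implicit narrowing that MSVC reports. A small sketch of packing two 4-bit bin values into one byte (values illustrative):

    #include <cstdint>

    // Bitwise ops on uint8_t promote to int; assigning the int result back
    // to a byte narrows implicitly, which MSVC warns about. The cast makes
    // the narrowing explicit.
    uint8_t pack_two_4bit_bins(int lo_bin, int hi_bin) {
      return (uint8_t)((lo_bin & 0xf) | ((hi_bin & 0xf) << 4));
    }
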
@@ -458,38 +459,38 @@ void GPUTreeLearner::AllocateGPUMemory() {
 #if GPU_DEBUG >= 1
     printf("%d features left\n", k);
 #endif
-    for (i = 0; i < k; ++i) {
+    for (int i = 0; i < k; ++i) {
       if (dword_features_ == 8) {
-        BinIterator* bin_iter = train_data_->FeatureGroupIterator(dense_ind[i]);
+        BinIterator* bin_iter = train_data_->FeatureGroupIterator(dense_dword_ind[i]);
         if (dynamic_cast<Dense4bitsBinIterator*>(bin_iter) != 0) {
           Dense4bitsBinIterator iter = *static_cast<Dense4bitsBinIterator*>(bin_iter);
           #pragma omp parallel for schedule(static)
           for (int j = 0; j < num_data_; ++j) {
-            host4[j].s[i >> 1] |= ((iter.RawGet(j) * device_bin_mults_[copied_feature4 * dword_features_ + i]
+            host4[j].s[i >> 1] |= (uint8_t)((iter.RawGet(j) * device_bin_mults_[copied_feature4 * dword_features_ + i]
                                   + ((j+i) & (device_bin_mults_[copied_feature4 * dword_features_ + i] - 1)))
                                   << ((i & 1) << 2));
           }
         }
         else {
-          Log::Fatal("GPU tree learner assumes that all bins are Dense4bitsBin when num_bin <= 16, but feature %d is not.", dense_ind[i]);
+          Log::Fatal("GPU tree learner assumes that all bins are Dense4bitsBin when num_bin <= 16, but feature %d is not.", dense_dword_ind[i]);
         }
       }
       else if (dword_features_ == 4) {
-        BinIterator* bin_iter = train_data_->FeatureGroupIterator(dense_ind[i]);
+        BinIterator* bin_iter = train_data_->FeatureGroupIterator(dense_dword_ind[i]);
         if (dynamic_cast<DenseBinIterator<uint8_t>*>(bin_iter) != 0) {
           DenseBinIterator<uint8_t> iter = *static_cast<DenseBinIterator<uint8_t>*>(bin_iter);
           #pragma omp parallel for schedule(static)
           for (int j = 0; j < num_data_; ++j) {
-            host4[j].s[i] = iter.RawGet(j) * device_bin_mults_[copied_feature4 * dword_features_ + i]
-                           + ((j+i) & (device_bin_mults_[copied_feature4 * dword_features_ + i] - 1));
+            host4[j].s[i] = (uint8_t)(iter.RawGet(j) * device_bin_mults_[copied_feature4 * dword_features_ + i]
+                           + ((j+i) & (device_bin_mults_[copied_feature4 * dword_features_ + i] - 1)));
           }
         }
         else if (dynamic_cast<Dense4bitsBinIterator*>(bin_iter) != 0) {
           Dense4bitsBinIterator iter = *static_cast<Dense4bitsBinIterator*>(bin_iter);
           #pragma omp parallel for schedule(static)
           for (int j = 0; j < num_data_; ++j) {
-            host4[j].s[i] = iter.RawGet(j) * device_bin_mults_[copied_feature4 * dword_features_ + i]
-                           + ((j+i) & (device_bin_mults_[copied_feature4 * dword_features_ + i] - 1));
+            host4[j].s[i] = (uint8_t)(iter.RawGet(j) * device_bin_mults_[copied_feature4 * dword_features_ + i]
+                           + ((j+i) & (device_bin_mults_[copied_feature4 * dword_features_ + i] - 1)));
           }
         }
         else {
@@ -504,18 +505,18 @@ void GPUTreeLearner::AllocateGPUMemory() {
     if (dword_features_ == 8) {
       #pragma omp parallel for schedule(static)
       for (int j = 0; j < num_data_; ++j) {
-        for (i = k; i < dword_features_; ++i) {
+        for (int i = k; i < dword_features_; ++i) {
           // fill this empty feature with some "random" value
-          host4[j].s[i >> 1] |= ((j & 0xf) << ((i & 1) << 2));
+          host4[j].s[i >> 1] |= (uint8_t)((j & 0xf) << ((i & 1) << 2));
         }
       }
     }
     else if (dword_features_ == 4) {
       #pragma omp parallel for schedule(static)
       for (int j = 0; j < num_data_; ++j) {
-        for (i = k; i < dword_features_; ++i) {
+        for (int i = k; i < dword_features_; ++i) {
           // fill this empty feature with some "random" value
-          host4[j].s[i] = j;
+          host4[j].s[i] = (uint8_t)j;
         }
       }
     }
@@ -525,8 +526,8 @@ void GPUTreeLearner::AllocateGPUMemory() {
 #if GPU_DEBUG >= 1
     printf("Last features copied to device\n");
 #endif
-    for (i = 0; i < k; ++i) {
-      dense_feature_group_map_.push_back(dense_ind[i]);
+    for (int i = 0; i < k; ++i) {
+      dense_feature_group_map_.push_back(dense_dword_ind[i]);
     }
   }
   // deallocate pinned space for feature copying
@@ -542,12 +543,12 @@ void GPUTreeLearner::AllocateGPUMemory() {
              end_time * 1e-3, sparse_feature_group_map_.size());
 #if GPU_DEBUG >= 1
   printf("Dense feature group list (size %lu): ", dense_feature_group_map_.size());
-  for (i = 0; i < num_dense_feature_groups_; ++i) {
+  for (int i = 0; i < num_dense_feature_groups_; ++i) {
     printf("%d ", dense_feature_group_map_[i]);
   }
   printf("\n");
   printf("Sparse feature group list (size %lu): ", sparse_feature_group_map_.size());
-  for (i = 0; i < num_feature_groups_ - num_dense_feature_groups_; ++i) {
+  for (int i = 0; i < num_feature_groups_ - num_dense_feature_groups_; ++i) {
     printf("%d ", sparse_feature_group_map_[i]);
   }
   printf("\n");
@@ -584,10 +585,10 @@ void GPUTreeLearner::BuildGPUKernels() {
     }
     catch (boost::compute::opencl_error &e) {
       if (program.build_log().size() > 0) {
-        Log::Fatal("GPU program built failure:\n %s", program.build_log().c_str());
+        Log::Fatal("GPU program built failure: %s\n %s", e.what(), program.build_log().c_str());
       }
       else {
-        Log::Fatal("GPU program built failure, log unavailable");
+        Log::Fatal("GPU program built failure: %s\nlog unavailable", e.what());
      }
    }
    histogram_kernels_[i] = program.create_kernel(kernel_name_);
@@ -599,10 +600,10 @@ void GPUTreeLearner::BuildGPUKernels() {
     }
     catch (boost::compute::opencl_error &e) {
       if (program.build_log().size() > 0) {
-        Log::Fatal("GPU program built failure:\n %s", program.build_log().c_str());
+        Log::Fatal("GPU program built failure: %s\n %s", e.what(), program.build_log().c_str());
       }
       else {
-        Log::Fatal("GPU program built failure, log unavailable");
+        Log::Fatal("GPU program built failure: %s\nlog unavailable", e.what());
      }
    }
    histogram_allfeats_kernels_[i] = program.create_kernel(kernel_name_);
@@ -614,10 +615,10 @@ void GPUTreeLearner::BuildGPUKernels() {
     }
     catch (boost::compute::opencl_error &e) {
       if (program.build_log().size() > 0) {
-        Log::Fatal("GPU program built failure:\n %s", program.build_log().c_str());
+        Log::Fatal("GPU program built failure: %s\n %s", e.what(), program.build_log().c_str());
       }
       else {
-        Log::Fatal("GPU program built failure, log unavailable");
+        Log::Fatal("GPU program built failure: %s\nlog unavailable", e.what());
      }
    }
    histogram_fulldata_kernels_[i] = program.create_kernel(kernel_name_);
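
All three kernel-build sites now include `e.what()` in the fatal message, so the OpenCL error text is reported even when the build log is empty. A hedged sketch of the pattern using the boost::compute API (helper name illustrative):

    #include <boost/compute/core.hpp>
    #include <boost/compute/exception.hpp>
    #include <cstdio>
    #include <string>

    // Report both the OpenCL error text and the compiler's build log; the
    // log alone can be empty when the failure happens before the device
    // compiler produces any output.
    void build_or_report(boost::compute::program &program, const std::string &options) {
      try {
        program.build(options);
      } catch (boost::compute::opencl_error &e) {
        const std::string log = program.build_log();
        std::fprintf(stderr, "GPU program build failure: %s\n%s\n",
                     e.what(), log.empty() ? "log unavailable" : log.c_str());
      }
    }
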
@@ -979,11 +980,11 @@ void GPUTreeLearner::ConstructHistograms(const std::vector<int8_t>& is_feature_u
   if (is_gpu_used) {
     if (tree_config_->gpu_use_dp) {
       // use double precision
-      WaitAndGetHistograms<HistogramBinEntry>(ptr_smaller_leaf_hist_data, is_feature_used);
+      WaitAndGetHistograms<HistogramBinEntry>(ptr_smaller_leaf_hist_data);
     }
     else {
       // use single precision
-      WaitAndGetHistograms<GPUHistogramBinEntry>(ptr_smaller_leaf_hist_data, is_feature_used);
+      WaitAndGetHistograms<GPUHistogramBinEntry>(ptr_smaller_leaf_hist_data);
     }
   }
@@ -1033,11 +1034,11 @@ void GPUTreeLearner::ConstructHistograms(const std::vector<int8_t>& is_feature_u
   if (is_gpu_used) {
     if (tree_config_->gpu_use_dp) {
       // use double precision
-      WaitAndGetHistograms<HistogramBinEntry>(ptr_larger_leaf_hist_data, is_feature_used);
+      WaitAndGetHistograms<HistogramBinEntry>(ptr_larger_leaf_hist_data);
     }
     else {
       // use single precision
-      WaitAndGetHistograms<GPUHistogramBinEntry>(ptr_larger_leaf_hist_data, is_feature_used);
+      WaitAndGetHistograms<GPUHistogramBinEntry>(ptr_larger_leaf_hist_data);
     }
   }
 }
...
@@ -64,15 +64,7 @@ protected:
 private:
   /*! \brief 4-byte feature tuple used by GPU kernels */
   struct Feature4 {
-    union {
-      unsigned char s[4];
-      struct {
-        unsigned char s0;
-        unsigned char s1;
-        unsigned char s2;
-        unsigned char s3;
-      };
-    };
+    uint8_t s[4];
   };
   /*! \brief Single precision histogram entiry for GPU */
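
Replacing the union-with-anonymous-struct by a bare array fixes a -pedantic diagnostic: ISO C++ has no anonymous struct members (they are a compiler extension), and reading `s[4]` after writing `s0`..`s3` relies on type punning through an inactive union member anyway. Indexing one array expresses the same 4-byte layout within the standard. A minimal sketch of before and after (field names as in the diff):

    #include <cstdint>

    // Non-standard form (extension; -pedantic warns): an anonymous struct
    // inside a union, with s0..s3 aliasing the array elements.
    //   struct Feature4 {
    //     union {
    //       uint8_t s[4];
    //       struct { uint8_t s0, s1, s2, s3; };
    //     };
    //   };
    // Standard form: one array, accessed as s[0]..s[3].
    struct Feature4 {
      uint8_t s[4];
    };

    static_assert(sizeof(Feature4) == 4, "feature tuple must stay 4 bytes");
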
@@ -123,10 +115,9 @@ private:
   /*!
    * \brief Wait for GPU kernel execution and read histogram
    * \param histograms Destination of histogram results from GPU.
-   * \param is_feature_used A predicate vector for enabling each feature
    */
   template <typename HistType>
-  void WaitAndGetHistograms(HistogramBinEntry* histograms, const std::vector<int8_t>& is_feature_used);
+  void WaitAndGetHistograms(HistogramBinEntry* histograms);
   /*!
    * \brief Construct GPU histogram asynchronously.
@@ -173,7 +164,7 @@ private:
   const char *kernel64_src_ =
 #include "ocl/histogram64.cl"
   ;
-  /*! \brief GPU kernel for 64 bins */
+  /*! \brief GPU kernel for 16 bins */
   const char *kernel16_src_ =
 #include "ocl/histogram16.cl"
   ;
...
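
The header change mirrors the .cpp: `is_feature_used` is dropped from WaitAndGetHistograms, apparently because the implementation no longer reads it, which MSVC flags as an unreferenced formal parameter (warning C4100 at /W4). Where a parameter must remain for interface compatibility, leaving it unnamed is the usual alternative; a hedged sketch (function name illustrative):

    // Two ways to avoid C4100 ("unreferenced formal parameter"):
    // 1. remove the parameter outright, as this commit does, or
    // 2. keep the slot but omit the name when the signature is fixed:
    void on_histogram_ready(int /*reserved_flags*/) {
      // handler body uses no flags yet
    }
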
@@ -149,6 +149,10 @@ inline void atomic_local_add_f(__local acc_type *addr, const float val)
 #endif
 }
+/* Makes MSVC happy with long string literal
+)""
+R""(
+*/
 // this function will be called by histogram16
 // we have one sub-histogram of one feature in registers, and need to read others
 void within_kernel_reduction16x8(uchar8 feature_mask,
@@ -204,6 +208,11 @@ void within_kernel_reduction16x8(uchar8 feature_mask,
 }
+/* Makes MSVC happy with long string literal
+)""
+R""(
+*/
 __attribute__((reqd_work_group_size(LOCAL_SIZE_0, 1, 1)))
 #if USE_CONSTANT_BUF == 1
 __kernel void histogram16(__global const uchar4* restrict feature_data_base,
@@ -379,6 +388,10 @@ __kernel void histogram16(__global const uchar4* feature_data_base,
 #endif
     feature4 = feature_data[ind];
+/* Makes MSVC happy with long string literal
+)""
+R""(
+*/
     // there are 2^POWER_FEATURE_WORKGROUPS workgroups processing each feature4
     for (uint i = subglobal_tid; i < num_data; i += subglobal_size) {
         // prefetch the next iteration variables
@@ -601,6 +614,11 @@ __kernel void histogram16(__global const uchar4* feature_data_base,
         feature4 = feature4_next;
     }
     barrier(CLK_LOCAL_MEM_FENCE);
+/* Makes MSVC happy with long string literal
+)""
+R""(
+*/
 #if ENABLE_ALL_FEATURES == 0
     // restore feature_mask
...
@@ -125,6 +125,10 @@ inline void atomic_local_add_f(__local acc_type *addr, const float val)
 #endif
 }
+/* Makes MSVC happy with long string literal
+)""
+R""(
+*/
 // this function will be called by histogram256
 // we have one sub-histogram of one feature in local memory, and need to read others
 void within_kernel_reduction256x4(uchar4 feature_mask,
@@ -332,8 +336,10 @@ void within_kernel_reduction256x4(uchar4 feature_mask,
 #endif
 }
-#define printf
+/* Makes MSVC happy with long string literal
+)""
+R""(
+*/
 __attribute__((reqd_work_group_size(LOCAL_SIZE_0, 1, 1)))
 #if USE_CONSTANT_BUF == 1
 __kernel void histogram256(__global const uchar4* restrict feature_data_base,
@@ -453,6 +459,11 @@ __kernel void histogram256(__global const uchar4* feature_data_base,
     acc_type s1_stat1 = 0.0f, s1_stat2 = 0.0f;
     acc_type s0_stat1 = 0.0f, s0_stat2 = 0.0f;
+/* Makes MSVC happy with long string literal
+)""
+R""(
+*/
     // there are 2^POWER_FEATURE_WORKGROUPS workgroups processing each feature4
     for (uint i = subglobal_tid; i < num_data; i += subglobal_size) {
         // prefetch the next iteration variables
@@ -659,6 +670,10 @@ __kernel void histogram256(__global const uchar4* feature_data_base,
 #endif
     barrier(CLK_LOCAL_MEM_FENCE);
+/* Makes MSVC happy with long string literal
+)""
+R""(
+*/
 #if ENABLE_ALL_FEATURES == 0
     // restore feature_mask
     feature_mask = feature_masks[group_feature];
...
@@ -142,6 +142,11 @@ inline void atomic_local_add_f(__local acc_type *addr, const float val)
 #endif
 }
+/* Makes MSVC happy with long string literal
+)""
+R""(
+*/
 // this function will be called by histogram64
 // we have one sub-histogram of one feature in registers, and need to read others
 void within_kernel_reduction64x4(uchar4 feature_mask,
@@ -199,8 +204,10 @@ void within_kernel_reduction64x4(uchar4 feature_mask,
     }
 }
-#define printf
+/* Makes MSVC happy with long string literal
+)""
+R""(
+*/
 __attribute__((reqd_work_group_size(LOCAL_SIZE_0, 1, 1)))
 #if USE_CONSTANT_BUF == 1
 __kernel void histogram64(__global const uchar4* restrict feature_data_base,
@@ -360,6 +367,10 @@ __kernel void histogram64(__global const uchar4* feature_data_base,
     acc_type s1_stat1 = 0.0f, s1_stat2 = 0.0f;
     acc_type s0_stat1 = 0.0f, s0_stat2 = 0.0f;
+/* Makes MSVC happy with long string literal
+)""
+R""(
+*/
     // there are 2^POWER_FEATURE_WORKGROUPS workgroups processing each feature4
     for (uint i = subglobal_tid; i < num_data; i += subglobal_size) {
         // prefetch the next iteration variables
@@ -567,6 +578,10 @@ __kernel void histogram64(__global const uchar4* feature_data_base,
 #endif
     barrier(CLK_LOCAL_MEM_FENCE);
+/* Makes MSVC happy with long string literal
+)""
+R""(
+*/
 #if ENABLE_ALL_FEATURES == 0
     // restore feature_mask
     feature_mask = feature_masks[group_feature];
...