Fix GPU memory allocation overflow (#4928)

305369dd · 文佳鹏 · GitHub · 5b42c2c3 · 305369dd
Unverified Commit 305369dd authored Jan 08, 2022 by 文佳鹏 Committed by GitHub Jan 08, 2022
Hide whitespace changes
Inline Side-by-side

Showing with 3 additions and 3 deletions

src/treelearner/gpu_tree_learner.cpp src/treelearner/gpu_tree_learner.cpp +3 -3

No files found.
--- a/src/treelearner/gpu_tree_learner.cpp
+++ b/src/treelearner/gpu_tree_learner.cpp
@@ -245,7 +245,7 @@ void GPUTreeLearner::AllocateGPUMemory() {
  }
  // allocate memory for all features (FIXME: 4 GB barrier on some devices, need to split to multiple buffers)
  device_features_.reset();
-  device_features_ = std::unique_ptr<boost::compute::vector<Feature4>>(new boost::compute::vector<Feature4>(num_dense_feature4_ * num_data_, ctx_));
+  device_features_ = std::unique_ptr<boost::compute::vector<Feature4>>(new boost::compute::vector<Feature4>((uint64_t)num_dense_feature4_ * num_data_, ctx_));
  // unpin old buffer if necessary before destructing them
  if (ptr_pinned_gradients_) {
    queue_.enqueue_unmap_buffer(pinned_gradients_, ptr_pinned_gradients_);
@@ -427,7 +427,7 @@ void GPUTreeLearner::AllocateGPUMemory() {
    }
    #pragma omp critical
    queue_.enqueue_write_buffer(device_features_->get_buffer(),
-                        i * num_data_ * sizeof(Feature4), num_data_ * sizeof(Feature4), host4);
+                        (uint64_t)i * num_data_ * sizeof(Feature4), num_data_ * sizeof(Feature4), host4);
    #if GPU_DEBUG >= 1
    printf("first example of feature-group tuple is: %d %d %d %d\n", host4[0].s[0], host4[0].s[1], host4[0].s[2], host4[0].s[3]);
    printf("Feature-groups copied to device with multipliers ");
@@ -503,7 +503,7 @@ void GPUTreeLearner::AllocateGPUMemory() {
    }
    // copying the last 1 to (dword_features - 1) feature-groups in the last tuple
    queue_.enqueue_write_buffer(device_features_->get_buffer(),
-                        (num_dense_feature4_ - 1) * num_data_ * sizeof(Feature4), num_data_ * sizeof(Feature4), host4);
+                        (num_dense_feature4_ - 1) * (uint64_t)num_data_ * sizeof(Feature4), num_data_ * sizeof(Feature4), host4);
    #if GPU_DEBUG >= 1
    printf("Last features copied to device\n");
    #endif