[CUDA] Add multiclass_ova objective for cuda_exp (#5491)

f1d3181c · shiyu1994 · GitHub · 06a1ee25 · f1d3181c · f1d3181c
Unverified Commit f1d3181c authored Nov 06, 2022 by shiyu1994 Committed by GitHub Nov 05, 2022
6 changed files
--- a/src/objective/cuda/cuda_binary_objective.cpp
+++ b/src/objective/cuda/cuda_binary_objective.cpp
@@ -69,7 +69,7 @@ double CUDABinaryLogloss::BoostFromScore(int) const {
  CopyFromCUDADeviceToHost<double>(&boost_from_score, cuda_boost_from_score_, 1, __FILE__, __LINE__);
  double pavg = 0.0f;
  CopyFromCUDADeviceToHost<double>(&pavg, cuda_sum_weights_, 1, __FILE__, __LINE__);
-  Log::Info("[%s:%s]: pavg=%f -> initscore=%f",  GetName(), __func__, pavg, boost_from_score);
+  Log::Info("[%s:%s]: pavg=%f -> initscore=%f", GetName(), __func__, pavg, boost_from_score);
  return boost_from_score;
 }


--- a/src/objective/cuda/cuda_binary_objective.cu
+++ b/src/objective/cuda/cuda_binary_objective.cu
@@ -12,9 +12,9 @@

 namespace LightGBM {

-template <bool IS_OVA, bool USE_WEIGHT>
+template <bool USE_WEIGHT>
 __global__ void BoostFromScoreKernel_1_BinaryLogloss(const label_t* cuda_labels, const data_size_t num_data, double* out_cuda_sum_labels,
-                                                     double* out_cuda_sum_weights, const label_t* cuda_weights, const int ova_class_id) {
+                                                     double* out_cuda_sum_weights, const label_t* cuda_weights) {
  __shared__ double shared_buffer[32];
  const uint32_t mask = 0xffffffff;
  const uint32_t warpLane = threadIdx.x % warpSize;
@@ -27,12 +27,12 @@ __global__ void BoostFromScoreKernel_1_BinaryLogloss(const label_t* cuda_labels,
    if (USE_WEIGHT) {
      const label_t cuda_label = cuda_labels[index];
      const double sample_weight = cuda_weights[index];
-      const label_t label = IS_OVA ? (static_cast<int>(cuda_label) == ova_class_id ? 1 : 0) : (cuda_label > 0 ? 1 : 0);
+      const label_t label = cuda_label > 0 ? 1 : 0;
      label_value = label * sample_weight;
      weight_value = sample_weight;
    } else {
      const label_t cuda_label = cuda_labels[index];
-      label_value = IS_OVA ? (static_cast<int>(cuda_label) == ova_class_id ? 1 : 0) : (cuda_label > 0 ? 1 : 0);
+      label_value = cuda_label > 0 ? 1 : 0;
    }
  }
  for (uint32_t offset = warpSize / 2; offset >= 1; offset >>= 1) {
@@ -88,22 +88,13 @@ __global__ void BoostFromScoreKernel_2_BinaryLogloss(double* out_cuda_sum_labels

 void CUDABinaryLogloss::LaunchBoostFromScoreKernel() const {
  const int num_blocks = (num_data_ + CALC_INIT_SCORE_BLOCK_SIZE_BINARY - 1) / CALC_INIT_SCORE_BLOCK_SIZE_BINARY;
-  if (ova_class_id_ == -1) {
-    if (cuda_weights_ == nullptr) {
-      BoostFromScoreKernel_1_BinaryLogloss<false, false><<<num_blocks, CALC_INIT_SCORE_BLOCK_SIZE_BINARY>>>
-        (cuda_label_, num_data_, cuda_boost_from_score_, cuda_sum_weights_, cuda_weights_, ova_class_id_);
-    } else {
-      BoostFromScoreKernel_1_BinaryLogloss<false, true><<<num_blocks, CALC_INIT_SCORE_BLOCK_SIZE_BINARY>>>
-        (cuda_label_, num_data_, cuda_boost_from_score_, cuda_sum_weights_, cuda_weights_, ova_class_id_);
-    }
+  SetCUDAMemory<double>(cuda_boost_from_score_, 0, 1, __FILE__, __LINE__);
+  if (cuda_weights_ == nullptr) {
+    BoostFromScoreKernel_1_BinaryLogloss<false><<<num_blocks, CALC_INIT_SCORE_BLOCK_SIZE_BINARY>>>
+      (cuda_label_, num_data_, cuda_boost_from_score_, cuda_sum_weights_, cuda_weights_);
  } else {
-    if (cuda_weights_ == nullptr) {
-      BoostFromScoreKernel_1_BinaryLogloss<true, false><<<num_blocks, CALC_INIT_SCORE_BLOCK_SIZE_BINARY>>>
-        (cuda_label_, num_data_, cuda_boost_from_score_, cuda_sum_weights_, cuda_weights_, ova_class_id_);
-    } else {
-      BoostFromScoreKernel_1_BinaryLogloss<true, true><<<num_blocks, CALC_INIT_SCORE_BLOCK_SIZE_BINARY>>>
-        (cuda_label_, num_data_, cuda_boost_from_score_, cuda_sum_weights_, cuda_weights_, ova_class_id_);
-    }
+    BoostFromScoreKernel_1_BinaryLogloss<true><<<num_blocks, CALC_INIT_SCORE_BLOCK_SIZE_BINARY>>>
+      (cuda_label_, num_data_, cuda_boost_from_score_, cuda_sum_weights_, cuda_weights_);
  }
  SynchronizeCUDADevice(__FILE__, __LINE__);
  if (cuda_weights_ == nullptr) {
@@ -114,15 +105,15 @@ void CUDABinaryLogloss::LaunchBoostFromScoreKernel() const {
  SynchronizeCUDADevice(__FILE__, __LINE__);
 }

-template <bool USE_LABEL_WEIGHT, bool USE_WEIGHT, bool IS_OVA>
+template <bool USE_LABEL_WEIGHT, bool USE_WEIGHT>
 __global__ void GetGradientsKernel_BinaryLogloss(const double* cuda_scores, const label_t* cuda_labels,
-  const double* cuda_label_weights, const label_t* cuda_weights, const int ova_class_id,
+  const double* cuda_label_weights, const label_t* cuda_weights,
  const double sigmoid, const data_size_t num_data,
  score_t* cuda_out_gradients, score_t* cuda_out_hessians) {
  const data_size_t data_index = static_cast<data_size_t>(blockDim.x * blockIdx.x + threadIdx.x);
  if (data_index < num_data) {
    const label_t cuda_label = static_cast<int>(cuda_labels[data_index]);
-    const int label = IS_OVA ? (cuda_label == ova_class_id ? 1 : -1) : (cuda_label > 0 ? 1 : -1);
+    const int label = cuda_label > 0 ? 1 : -1;
    const double response = -label * sigmoid / (1.0f + exp(label * sigmoid * cuda_scores[data_index]));
    const double abs_response = fabs(response);
    if (!USE_WEIGHT) {
@@ -153,7 +144,6 @@ __global__ void GetGradientsKernel_BinaryLogloss(const double* cuda_scores, cons
  cuda_label_, \
  cuda_label_weights_, \
  cuda_weights_, \
-  ova_class_id_, \
  sigmoid_, \
  num_data_, \
  gradients, \
@@ -161,33 +151,17 @@ __global__ void GetGradientsKernel_BinaryLogloss(const double* cuda_scores, cons

 void CUDABinaryLogloss::LaunchGetGradientsKernel(const double* scores, score_t* gradients, score_t* hessians) const {
  const int num_blocks = (num_data_ + GET_GRADIENTS_BLOCK_SIZE_BINARY - 1) / GET_GRADIENTS_BLOCK_SIZE_BINARY;
-  if (ova_class_id_ == -1) {
-    if (cuda_label_weights_ == nullptr) {
-      if (cuda_weights_ == nullptr) {
-        GetGradientsKernel_BinaryLogloss<false, false, false><<<num_blocks, GET_GRADIENTS_BLOCK_SIZE_BINARY>>>(GetGradientsKernel_BinaryLogloss_ARGS);
-      } else {
-        GetGradientsKernel_BinaryLogloss<false, true, false><<<num_blocks, GET_GRADIENTS_BLOCK_SIZE_BINARY>>>(GetGradientsKernel_BinaryLogloss_ARGS);
-      }
+  if (cuda_label_weights_ == nullptr) {
+    if (cuda_weights_ == nullptr) {
+      GetGradientsKernel_BinaryLogloss<false, false><<<num_blocks, GET_GRADIENTS_BLOCK_SIZE_BINARY>>>(GetGradientsKernel_BinaryLogloss_ARGS);
    } else {
-      if (cuda_weights_ == nullptr) {
-        GetGradientsKernel_BinaryLogloss<true, false, false><<<num_blocks, GET_GRADIENTS_BLOCK_SIZE_BINARY>>>(GetGradientsKernel_BinaryLogloss_ARGS);
-      } else {
-        GetGradientsKernel_BinaryLogloss<true, true, false><<<num_blocks, GET_GRADIENTS_BLOCK_SIZE_BINARY>>>(GetGradientsKernel_BinaryLogloss_ARGS);
-      }
+      GetGradientsKernel_BinaryLogloss<false, true><<<num_blocks, GET_GRADIENTS_BLOCK_SIZE_BINARY>>>(GetGradientsKernel_BinaryLogloss_ARGS);
    }
  } else {
-    if (cuda_label_weights_ == nullptr) {
-      if (cuda_weights_ == nullptr) {
-        GetGradientsKernel_BinaryLogloss<false, false, true><<<num_blocks, GET_GRADIENTS_BLOCK_SIZE_BINARY>>>(GetGradientsKernel_BinaryLogloss_ARGS);
-      } else {
-        GetGradientsKernel_BinaryLogloss<false, true, true><<<num_blocks, GET_GRADIENTS_BLOCK_SIZE_BINARY>>>(GetGradientsKernel_BinaryLogloss_ARGS);
-      }
+    if (cuda_weights_ == nullptr) {
+      GetGradientsKernel_BinaryLogloss<true, false><<<num_blocks, GET_GRADIENTS_BLOCK_SIZE_BINARY>>>(GetGradientsKernel_BinaryLogloss_ARGS);
    } else {
-      if (cuda_weights_ == nullptr) {
-        GetGradientsKernel_BinaryLogloss<true, false, true><<<num_blocks, GET_GRADIENTS_BLOCK_SIZE_BINARY>>>(GetGradientsKernel_BinaryLogloss_ARGS);
-      } else {
-        GetGradientsKernel_BinaryLogloss<true, true, true><<<num_blocks, GET_GRADIENTS_BLOCK_SIZE_BINARY>>>(GetGradientsKernel_BinaryLogloss_ARGS);
-      }
+      GetGradientsKernel_BinaryLogloss<true, true><<<num_blocks, GET_GRADIENTS_BLOCK_SIZE_BINARY>>>(GetGradientsKernel_BinaryLogloss_ARGS);
    }
  }
 }

--- a/src/objective/cuda/cuda_multiclass_objective.cpp
+++ b/src/objective/cuda/cuda_multiclass_objective.cpp
@@ -36,6 +36,37 @@ void CUDAMulticlassSoftmax::ConvertOutputCUDA(const data_size_t num_data, const
 }


+CUDAMulticlassOVA::CUDAMulticlassOVA(const Config& config): MulticlassOVA(config) {
+  for (int i = 0; i < num_class_; ++i) {
+    cuda_binary_loss_.emplace_back(new CUDABinaryLogloss(config, i));
+  }
+}
+
+CUDAMulticlassOVA::CUDAMulticlassOVA(const std::vector<std::string>& strs): MulticlassOVA(strs) {}
+
+CUDAMulticlassOVA::~CUDAMulticlassOVA() {}
+
+void CUDAMulticlassOVA::Init(const Metadata& metadata, data_size_t num_data) {
+  MulticlassOVA::Init(metadata, num_data);
+  for (int i = 0; i < num_class_; ++i) {
+    cuda_binary_loss_[i]->Init(metadata, num_data);
+  }
+}
+
+void CUDAMulticlassOVA::GetGradients(const double* score, score_t* gradients, score_t* hessians) const {
+  for (int i = 0; i < num_class_; ++i) {
+    int64_t offset = static_cast<int64_t>(num_data_) * i;
+    cuda_binary_loss_[i]->GetGradients(score + offset, gradients + offset, hessians + offset);
+  }
+}
+
+void CUDAMulticlassOVA::ConvertOutputCUDA(const data_size_t num_data, const double* input, double* output) const {
+  for (int i = 0; i < num_class_; ++i) {
+    cuda_binary_loss_[i]->ConvertOutputCUDA(num_data, input + i * num_data, output + i * num_data);
+  }
+}
+
+
 }  // namespace LightGBM

 #endif  // USE_CUDA_EXP
--- a/src/objective/cuda/cuda_multiclass_objective.hpp
+++ b/src/objective/cuda/cuda_multiclass_objective.hpp
@@ -9,9 +9,12 @@

 #include <LightGBM/cuda/cuda_objective_function.hpp>

+#include <memory>
 #include <string>
 #include <vector>

+#include "cuda_binary_objective.hpp"
+
 #include "../multiclass_objective.hpp"

 #define GET_GRADIENTS_BLOCK_SIZE_MULTICLASS (1024)
@@ -54,6 +57,35 @@ class CUDAMulticlassSoftmax: public CUDAObjectiveInterface, public MulticlassSof
 };


+class CUDAMulticlassOVA: public CUDAObjectiveInterface, public MulticlassOVA {
+ public:
+  explicit CUDAMulticlassOVA(const Config& config);
+
+  explicit CUDAMulticlassOVA(const std::vector<std::string>& strs);
+
+  void Init(const Metadata& metadata, data_size_t num_data) override;
+
+  void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override;
+
+  void ConvertOutputCUDA(const data_size_t num_data, const double* input, double* output) const override;
+
+  double BoostFromScore(int class_id) const override {
+    return cuda_binary_loss_[class_id]->BoostFromScore(0);
+  }
+
+  bool ClassNeedTrain(int class_id) const override {
+    return cuda_binary_loss_[class_id]->ClassNeedTrain(0);
+  }
+
+  ~CUDAMulticlassOVA();
+
+  bool IsCUDAObjective() const override { return true; }
+
+ private:
+  std::vector<std::unique_ptr<CUDABinaryLogloss>> cuda_binary_loss_;
+};
+
+
 }  // namespace LightGBM

 #endif  // USE_CUDA_EXP

--- a/src/objective/multiclass_objective.hpp
+++ b/src/objective/multiclass_objective.hpp
@@ -266,7 +266,7 @@ class MulticlassOVA: public ObjectiveFunction {
    return binary_loss_[class_id]->ClassNeedTrain(0);
  }

- private:
+ protected:
  /*! \brief Number of data */
  data_size_t num_data_;
  /*! \brief Number of classes */

--- a/src/objective/objective_function.cpp
+++ b/src/objective/objective_function.cpp
@@ -43,8 +43,7 @@ ObjectiveFunction* ObjectiveFunction::CreateObjectiveFunction(const std::string&
    } else if (type == std::string("multiclass")) {
      return new CUDAMulticlassSoftmax(config);
    } else if (type == std::string("multiclassova")) {
-      Log::Warning("Objective multiclassova is not implemented in cuda_exp version. Fall back to boosting on CPU.");
-      return new MulticlassOVA(config);
+      return new CUDAMulticlassOVA(config);
    } else if (type == std::string("cross_entropy")) {
      Log::Warning("Objective cross_entropy is not implemented in cuda_exp version. Fall back to boosting on CPU.");
      return new CrossEntropy(config);