Unverified commit 1d5f46f6 authored by shiyu1994, committed by GitHub

[CUDA] Add lambdarank objective for cuda_exp (#5453)



* add lambdarank for cuda_exp

* support unlimited number of ranks in labels

* fix lint errors

* remove warning for lambdarank with cuda_exp

* Update src/objective/cuda/cuda_rank_objective.hpp

Co-authored-by: Nikita Titov <nekit94-08@mail.ru>

* Update src/objective/cuda/cuda_rank_objective.hpp

Co-authored-by: Nikita Titov <nekit94-08@mail.ru>
parent c9a3b479
@@ -18,17 +18,11 @@
#include <algorithm>
-#define NUM_BANKS_DATA_PARTITION (16)
-#define LOG_NUM_BANKS_DATA_PARTITION (4)
#define GLOBAL_PREFIX_SUM_BLOCK_SIZE (1024)
#define BITONIC_SORT_NUM_ELEMENTS (1024)
#define BITONIC_SORT_DEPTH (11)
#define BITONIC_SORT_QUERY_ITEM_BLOCK_SIZE (10)
-#define CONFLICT_FREE_INDEX(n) \
-  ((n) + ((n) >> LOG_NUM_BANKS_DATA_PARTITION))
namespace LightGBM {

template <typename T>
@@ -223,6 +217,54 @@ __device__ __forceinline__ void BitonicArgSort_1024(const VAL_T* scores, INDEX_T
  }
}
template <typename VAL_T, typename INDEX_T, bool ASCENDING>
__device__ __forceinline__ void BitonicArgSort_2048(const VAL_T* scores, INDEX_T* indices) {
  // Sort each 1024-element half with a bitonic network, in opposite directions,
  // so that the 2048 elements form a single bitonic sequence afterwards.
  for (INDEX_T base = 0; base < 2048; base += 1024) {
    for (INDEX_T outer_depth = 10; outer_depth >= 1; --outer_depth) {
      const INDEX_T outer_segment_length = 1 << (11 - outer_depth);
      const INDEX_T outer_segment_index = threadIdx.x / outer_segment_length;
      const bool ascending = ((base == 0) ^ ASCENDING) ? (outer_segment_index % 2 > 0) : (outer_segment_index % 2 == 0);
      for (INDEX_T inner_depth = outer_depth; inner_depth < 11; ++inner_depth) {
        const INDEX_T segment_length = 1 << (11 - inner_depth);
        const INDEX_T half_segment_length = segment_length >> 1;
        const INDEX_T half_segment_index = threadIdx.x / half_segment_length;
        if (half_segment_index % 2 == 0) {
          const INDEX_T index_to_compare = threadIdx.x + half_segment_length + base;
          if ((scores[indices[threadIdx.x + base]] > scores[indices[index_to_compare]]) == ascending) {
            const INDEX_T index = indices[threadIdx.x + base];
            indices[threadIdx.x + base] = indices[index_to_compare];
            indices[index_to_compare] = index;
          }
        }
        __syncthreads();
      }
    }
  }
  // Merge step across the two halves: keep the larger score in the first half.
  const unsigned int index_to_compare = threadIdx.x + 1024;
  if (scores[indices[index_to_compare]] > scores[indices[threadIdx.x]]) {
    const INDEX_T temp_index = indices[index_to_compare];
    indices[index_to_compare] = indices[threadIdx.x];
    indices[threadIdx.x] = temp_index;
  }
  __syncthreads();
  // Final bitonic clean within each half to produce the fully sorted output.
  for (INDEX_T base = 0; base < 2048; base += 1024) {
    for (INDEX_T inner_depth = 1; inner_depth < 11; ++inner_depth) {
      const INDEX_T segment_length = 1 << (11 - inner_depth);
      const INDEX_T half_segment_length = segment_length >> 1;
      const INDEX_T half_segment_index = threadIdx.x / half_segment_length;
      if (half_segment_index % 2 == 0) {
        const INDEX_T index_to_compare = threadIdx.x + half_segment_length + base;
        if (scores[indices[threadIdx.x + base]] < scores[indices[index_to_compare]]) {
          const INDEX_T index = indices[threadIdx.x + base];
          indices[threadIdx.x + base] = indices[index_to_compare];
          indices[index_to_compare] = index;
        }
      }
      __syncthreads();
    }
  }
}
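As a sanity check on the network above, here is a hedged host-side sketch, not part of the PR, of the same compare-exchange pattern run serially; `BitonicArgSortHost` is a hypothetical name, and it uses the textbook `i ^ stride` partner formulation rather than the thread/segment arithmetic of the device code:

```cpp
#include <algorithm>
#include <cassert>
#include <numeric>
#include <vector>

// Hypothetical serial reference: bitonic argsort over a power-of-two number
// of elements, ordering indices by their scores. The device routines above
// execute the same compare-exchange network with one thread per pair.
template <typename VAL_T, typename INDEX_T, bool ASCENDING>
void BitonicArgSortHost(const VAL_T* scores, INDEX_T* indices, INDEX_T n) {
  for (INDEX_T block = 2; block <= n; block <<= 1) {               // bitonic build
    for (INDEX_T stride = block >> 1; stride > 0; stride >>= 1) {  // bitonic merge
      for (INDEX_T i = 0; i < n; ++i) {
        const INDEX_T j = i ^ stride;  // compare-exchange partner
        if (j > i) {
          const bool ascending = (((i & block) == 0) == ASCENDING);
          if ((scores[indices[i]] > scores[indices[j]]) == ascending) {
            std::swap(indices[i], indices[j]);
          }
        }
      }
    }
  }
}

int main() {
  std::vector<double> scores = {0.3, 0.9, 0.1, 0.5, 0.7, 0.2, 0.8, 0.4};
  std::vector<int> indices(scores.size());
  std::iota(indices.begin(), indices.end(), 0);
  BitonicArgSortHost<double, int, false>(scores.data(), indices.data(),
                                         static_cast<int>(indices.size()));
  assert(scores[indices[0]] == 0.9);  // descending, as the ranking kernels use
  return 0;
}
```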
template <typename VAL_T, typename INDEX_T, bool ASCENDING, uint32_t BLOCK_DIM, uint32_t MAX_DEPTH>
__device__ void BitonicArgSortDevice(const VAL_T* values, INDEX_T* indices, const int len) {
  __shared__ VAL_T shared_values[BLOCK_DIM];
@@ -387,6 +429,12 @@ __device__ void BitonicArgSortDevice(const VAL_T* values, INDEX_T* indices, cons
  }
}
void BitonicArgSortItemsGlobal(
    const double* scores,
    const int num_queries,
    const data_size_t* cuda_query_boundaries,
    data_size_t* out_indices);
template <typename VAL_T, typename INDEX_T, bool ASCENDING>
void BitonicArgSortGlobal(const VAL_T* values, INDEX_T* indices, const size_t len);
...
@@ -77,6 +77,34 @@ void ShufflePrefixSumGlobal(uint64_t* values, size_t len, uint64_t* block_prefix
  ShufflePrefixSumGlobalInner<uint64_t>(values, len, block_prefix_sum_buffer);
}
__global__ void BitonicArgSortItemsGlobalKernel(const double* scores,
                                                const int num_queries,
                                                const data_size_t* cuda_query_boundaries,
                                                data_size_t* out_indices) {
  // Each block handles up to BITONIC_SORT_QUERY_ITEM_BLOCK_SIZE queries in turn.
  const int query_index_start = static_cast<int>(blockIdx.x) * BITONIC_SORT_QUERY_ITEM_BLOCK_SIZE;
  const int query_index_end = min(query_index_start + BITONIC_SORT_QUERY_ITEM_BLOCK_SIZE, num_queries);
  for (int query_index = query_index_start; query_index < query_index_end; ++query_index) {
    const data_size_t query_item_start = cuda_query_boundaries[query_index];
    const data_size_t query_item_end = cuda_query_boundaries[query_index + 1];
    const data_size_t num_items_in_query = query_item_end - query_item_start;
    // Argsort this query's scores in descending order (ASCENDING = false).
    BitonicArgSortDevice<double, data_size_t, false, BITONIC_SORT_NUM_ELEMENTS, 11>(scores + query_item_start,
                                                                                    out_indices + query_item_start,
                                                                                    num_items_in_query);
    __syncthreads();
  }
}
void BitonicArgSortItemsGlobal(
    const double* scores,
    const int num_queries,
    const data_size_t* cuda_query_boundaries,
    data_size_t* out_indices) {
  const int num_blocks = (num_queries + BITONIC_SORT_QUERY_ITEM_BLOCK_SIZE - 1) / BITONIC_SORT_QUERY_ITEM_BLOCK_SIZE;
  BitonicArgSortItemsGlobalKernel<<<num_blocks, BITONIC_SORT_NUM_ELEMENTS>>>(
      scores, num_queries, cuda_query_boundaries, out_indices);
  SynchronizeCUDADevice(__FILE__, __LINE__);
}
template <typename T>
__global__ void BlockReduceSum(T* block_buffer, const data_size_t num_blocks) {
  __shared__ T shared_buffer[32];
...
/*!
* Copyright (c) 2021 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for
* license information.
*/
#ifdef USE_CUDA_EXP
#include <string>
#include <vector>
#include "cuda_rank_objective.hpp"
namespace LightGBM {
CUDALambdarankNDCG::CUDALambdarankNDCG(const Config& config):
LambdarankNDCG(config) {}
CUDALambdarankNDCG::CUDALambdarankNDCG(const std::vector<std::string>& strs): LambdarankNDCG(strs) {}
void CUDALambdarankNDCG::Init(const Metadata& metadata, data_size_t num_data) {
  const int num_threads = OMP_NUM_THREADS();
  LambdarankNDCG::Init(metadata, num_data);
  // Find the maximum number of items in any single query, in parallel over threads.
  std::vector<uint16_t> thread_max_num_items_in_query(num_threads);
  Threading::For<data_size_t>(0, num_queries_, 1,
    [this, &thread_max_num_items_in_query] (int thread_index, data_size_t start, data_size_t end) {
      for (data_size_t query_index = start; query_index < end; ++query_index) {
        const data_size_t query_item_count = query_boundaries_[query_index + 1] - query_boundaries_[query_index];
        if (query_item_count > thread_max_num_items_in_query[thread_index]) {
          thread_max_num_items_in_query[thread_index] = query_item_count;
        }
      }
    });
  data_size_t max_items_in_query = 0;
  for (int thread_index = 0; thread_index < num_threads; ++thread_index) {
    if (thread_max_num_items_in_query[thread_index] > max_items_in_query) {
      max_items_in_query = thread_max_num_items_in_query[thread_index];
    }
  }
  // Round the maximum query length up to the next power of two, as required
  // by the bitonic sorting networks used in the gradient kernels.
  max_items_in_query_aligned_ = 1;
  --max_items_in_query;
  while (max_items_in_query > 0) {
    max_items_in_query >>= 1;
    max_items_in_query_aligned_ <<= 1;
  }
  // Queries longer than 2048 items fall back to a global-memory sorting buffer.
  if (max_items_in_query_aligned_ > 2048) {
    cuda_item_indices_buffer_.Resize(static_cast<size_t>(metadata.query_boundaries()[metadata.num_queries()]));
  }
  cuda_labels_ = metadata.cuda_metadata()->cuda_label();
  cuda_query_boundaries_ = metadata.cuda_metadata()->cuda_query_boundaries();
  cuda_inverse_max_dcgs_.Resize(inverse_max_dcgs_.size());
  CopyFromHostToCUDADevice(cuda_inverse_max_dcgs_.RawData(), inverse_max_dcgs_.data(), inverse_max_dcgs_.size(), __FILE__, __LINE__);
  cuda_label_gain_.Resize(label_gain_.size());
  CopyFromHostToCUDADevice(cuda_label_gain_.RawData(), label_gain_.data(), label_gain_.size(), __FILE__, __LINE__);
}
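For reference, the rounding loop in `Init` computes the smallest power of two greater than or equal to the longest query. A standalone sketch with a hypothetical helper name, not part of the PR:

```cpp
#include <cassert>

// Hypothetical helper mirroring the rounding loop in Init():
// returns the smallest power of two >= n, for n >= 1.
int NextPowerOfTwo(int n) {
  int aligned = 1;
  --n;
  while (n > 0) {
    n >>= 1;
    aligned <<= 1;
  }
  return aligned;
}

int main() {
  assert(NextPowerOfTwo(1) == 1);
  assert(NextPowerOfTwo(5) == 8);        // e.g. longest query has 5 items
  assert(NextPowerOfTwo(1024) == 1024);
  assert(NextPowerOfTwo(1025) == 2048);  // above 2048 the global buffer path is taken
  return 0;
}
```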
void CUDALambdarankNDCG::GetGradients(const double* score, score_t* gradients, score_t* hessians) const {
  LaunchGetGradientsKernel(score, gradients, hessians);
}
} // namespace LightGBM
#endif // USE_CUDA_EXP
/*!
* Copyright (c) 2021 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for
* license information.
*/
#ifndef LIGHTGBM_OBJECTIVE_CUDA_CUDA_RANK_OBJECTIVE_HPP_
#define LIGHTGBM_OBJECTIVE_CUDA_CUDA_RANK_OBJECTIVE_HPP_
#ifdef USE_CUDA_EXP
#define NUM_QUERY_PER_BLOCK (10)
#include <LightGBM/cuda/cuda_objective_function.hpp>
#include <LightGBM/utils/threading.h>
#include <fstream>
#include <string>
#include <vector>
#include "../rank_objective.hpp"
namespace LightGBM {
class CUDALambdarankNDCG : public CUDAObjectiveInterface, public LambdarankNDCG {
 public:
  explicit CUDALambdarankNDCG(const Config& config);

  explicit CUDALambdarankNDCG(const std::vector<std::string>& strs);

  void Init(const Metadata& metadata, data_size_t num_data) override;

  void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override;

  bool IsCUDAObjective() const override { return true; }

 protected:
  void LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const;

  // CUDA memory, held by this object
  CUDAVector<double> cuda_inverse_max_dcgs_;
  CUDAVector<double> cuda_label_gain_;
  CUDAVector<int> cuda_item_indices_buffer_;
  // CUDA memory, held by other objects
  const label_t* cuda_labels_;
  const data_size_t* cuda_query_boundaries_;
  // Host memory
  int max_items_in_query_aligned_;
};
} // namespace LightGBM
#endif // USE_CUDA_EXP
#endif // LIGHTGBM_OBJECTIVE_CUDA_CUDA_RANK_OBJECTIVE_HPP_
@@ -11,6 +11,7 @@
#include "xentropy_objective.hpp" #include "xentropy_objective.hpp"
#include "cuda/cuda_binary_objective.hpp" #include "cuda/cuda_binary_objective.hpp"
#include "cuda/cuda_rank_objective.hpp"
#include "cuda/cuda_regression_objective.hpp" #include "cuda/cuda_regression_objective.hpp"
namespace LightGBM { namespace LightGBM {
@@ -38,8 +39,7 @@ ObjectiveFunction* ObjectiveFunction::CreateObjectiveFunction(const std::string&
} else if (type == std::string("binary")) { } else if (type == std::string("binary")) {
return new CUDABinaryLogloss(config); return new CUDABinaryLogloss(config);
} else if (type == std::string("lambdarank")) { } else if (type == std::string("lambdarank")) {
Log::Warning("Objective lambdarank is not implemented in cuda_exp version. Fall back to boosting on CPU."); return new CUDALambdarankNDCG(config);
return new LambdarankNDCG(config);
} else if (type == std::string("rank_xendcg")) { } else if (type == std::string("rank_xendcg")) {
Log::Warning("Objective rank_xendcg is not implemented in cuda_exp version. Fall back to boosting on CPU."); Log::Warning("Objective rank_xendcg is not implemented in cuda_exp version. Fall back to boosting on CPU.");
return new RankXENDCG(config); return new RankXENDCG(config);
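With this hunk, selecting the lambdarank objective under the experimental CUDA device no longer warns and falls back to CPU boosting. A minimal config sketch that would now exercise the CUDA path; parameter names follow the standard LightGBM config format and the data paths are placeholders:

```
# illustrative LightGBM config
task = train
objective = lambdarank
device_type = cuda_exp
data = rank.train
valid = rank.test
```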
...
@@ -255,7 +255,7 @@ class LambdarankNDCG : public RankingObjective {
  const char* GetName() const override { return "lambdarank"; }
- private:
+ protected:
  /*! \brief Sigmoid param */
  double sigmoid_;
  /*! \brief Normalize the lambdas or not */
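The visibility change lets `CUDALambdarankNDCG` read the tuned parameters directly when launching its gradient kernel. A hedged sketch of why this matters; the kernel name, argument list, and launch shape are assumptions, since the real launcher lives in a file not shown here:

```cpp
// Hypothetical sketch: with sigmoid_ protected instead of private, the CUDA
// subclass can forward it to a device kernel without extra accessors.
// GetGradientsKernel_LambdarankNDCG and the launch shape are invented names.
void CUDALambdarankNDCG::LaunchGetGradientsKernel(const double* score,
                                                  score_t* gradients,
                                                  score_t* hessians) const {
  const int num_blocks = (num_queries_ + NUM_QUERY_PER_BLOCK - 1) / NUM_QUERY_PER_BLOCK;
  GetGradientsKernel_LambdarankNDCG<<<num_blocks, BITONIC_SORT_NUM_ELEMENTS>>>(
      score, cuda_labels_, num_queries_, cuda_query_boundaries_,
      cuda_inverse_max_dcgs_.RawData(), cuda_label_gain_.RawData(),
      sigmoid_,  // inherited from LambdarankNDCG, accessible because protected
      gradients, hessians);
  SynchronizeCUDADevice(__FILE__, __LINE__);
}
```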
...