Unverified commit bee732af, authored by Guolin Ke, committed by GitHub
Browse files

better document for bin_construct_sample_cnt (#3521)



* better document for bin_construct_sample_cnt

* add warnings
Co-authored-by: StrikerRUS <nekit94-12@hotmail.com>
parent 335ba231
...@@ -634,12 +634,14 @@ Dataset Parameters ...@@ -634,12 +634,14 @@ Dataset Parameters
- ``bin_construct_sample_cnt`` :raw-html:`<a id="bin_construct_sample_cnt" title="Permalink to this parameter" href="#bin_construct_sample_cnt">&#x1F517;&#xFE0E;</a>`, default = ``200000``, type = int, aliases: ``subsample_for_bin``, constraints: ``bin_construct_sample_cnt > 0`` - ``bin_construct_sample_cnt`` :raw-html:`<a id="bin_construct_sample_cnt" title="Permalink to this parameter" href="#bin_construct_sample_cnt">&#x1F517;&#xFE0E;</a>`, default = ``200000``, type = int, aliases: ``subsample_for_bin``, constraints: ``bin_construct_sample_cnt > 0``
- number of data that sampled to construct histogram bins - number of data that sampled to construct feature discrete bins
- setting this to larger value will give better training result, but will increase data loading time - setting this to larger value will give better training result, but may increase data loading time
- set this to larger value if data is very sparse - set this to larger value if data is very sparse
- **Note**: don't set this to small values, otherwise, you may encounter unexpected errors and poor accuracy
- ``data_random_seed`` :raw-html:`<a id="data_random_seed" title="Permalink to this parameter" href="#data_random_seed">&#x1F517;&#xFE0E;</a>`, default = ``1``, type = int, aliases: ``data_seed`` - ``data_random_seed`` :raw-html:`<a id="data_random_seed" title="Permalink to this parameter" href="#data_random_seed">&#x1F517;&#xFE0E;</a>`, default = ``1``, type = int, aliases: ``data_seed``
- random seed for sampling data to construct histogram bins - random seed for sampling data to construct histogram bins
......
...@@ -580,9 +580,10 @@ struct Config { ...@@ -580,9 +580,10 @@ struct Config {
// alias = subsample_for_bin // alias = subsample_for_bin
// check = >0 // check = >0
// desc = number of data that sampled to construct histogram bins // desc = number of data that sampled to construct feature discrete bins
// desc = setting this to larger value will give better training result, but will increase data loading time // desc = setting this to larger value will give better training result, but may increase data loading time
// desc = set this to larger value if data is very sparse // desc = set this to larger value if data is very sparse
// desc = **Note**: don't set this to small values, otherwise, you may encounter unexpected errors and poor accuracy
int bin_construct_sample_cnt = 200000; int bin_construct_sample_cnt = 200000;
// alias = data_seed // alias = data_seed
......
...@@ -164,6 +164,16 @@ void DatasetLoader::SetHeader(const char* filename) { ...@@ -164,6 +164,16 @@ void DatasetLoader::SetHeader(const char* filename) {
} }
} }
// Emits a warning when the number of rows sampled for bin construction looks
// too small relative to the dataset.
//
// sample_cnt: number of rows actually sampled.
// num_data:   total number of rows the sample was drawn from.
//
// The warning is logged only when BOTH conditions hold:
//   * sample_cnt is below 100000, and
//   * the sample covers strictly less than 1/5 (20%) of num_data.
// A sample of exactly 20% does not warn, and num_data == 0 never warns.
void CheckSampleSize(size_t sample_cnt, size_t num_data) {
  // Samples at or above this size are never reported, whatever their share.
  const size_t kSampleCntFloor = 100000;
  // The sample must cover at least 1 / kMinFractionInverse of the data.
  const size_t kMinFractionInverse = 5;

  if (sample_cnt >= kSampleCntFloor) {
    return;
  }
  // Cross-multiplied form of `sample_cnt / num_data < 0.2`. Integer arithmetic
  // keeps the 20% boundary exact (comparing a double ratio with the float
  // literal 0.2f treats exactly 20% as "below") and needs no division, so
  // num_data == 0 is handled without special-casing. The product cannot
  // overflow because sample_cnt < kSampleCntFloor at this point.
  if (sample_cnt * kMinFractionInverse < num_data) {
    Log::Warning(
        "Using too small ``bin_construct_sample_cnt`` may encounter "
        "unexpected errors and poor accuracy.");
  }
}
Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_machines) { Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_machines) {
// don't support query id in data file when training in parallel // don't support query id in data file when training in parallel
if (num_machines > 1 && !config_.pre_partition) { if (num_machines > 1 && !config_.pre_partition) {
...@@ -190,6 +200,8 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac ...@@ -190,6 +200,8 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac
dataset->num_data_ = static_cast<data_size_t>(text_data.size()); dataset->num_data_ = static_cast<data_size_t>(text_data.size());
// sample data // sample data
auto sample_data = SampleTextDataFromMemory(text_data); auto sample_data = SampleTextDataFromMemory(text_data);
CheckSampleSize(sample_data.size(),
static_cast<size_t>(dataset->num_data_));
// construct feature bin mappers // construct feature bin mappers
ConstructBinMappersFromTextData(rank, num_machines, sample_data, parser.get(), dataset.get()); ConstructBinMappersFromTextData(rank, num_machines, sample_data, parser.get(), dataset.get());
// initialize label // initialize label
...@@ -205,6 +217,8 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac ...@@ -205,6 +217,8 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac
} else { } else {
dataset->num_data_ = num_global_data; dataset->num_data_ = num_global_data;
} }
CheckSampleSize(sample_data.size(),
static_cast<size_t>(dataset->num_data_));
// construct feature bin mappers // construct feature bin mappers
ConstructBinMappersFromTextData(rank, num_machines, sample_data, parser.get(), dataset.get()); ConstructBinMappersFromTextData(rank, num_machines, sample_data, parser.get(), dataset.get());
// initialize label // initialize label
...@@ -540,6 +554,7 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b ...@@ -540,6 +554,7 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b
Dataset* DatasetLoader::ConstructFromSampleData(double** sample_values, Dataset* DatasetLoader::ConstructFromSampleData(double** sample_values,
int** sample_indices, int num_col, const int* num_per_col, int** sample_indices, int num_col, const int* num_per_col,
size_t total_sample_size, data_size_t num_data) { size_t total_sample_size, data_size_t num_data) {
CheckSampleSize(total_sample_size, static_cast<size_t>(num_data));
int num_total_features = num_col; int num_total_features = num_col;
if (Network::num_machines() > 1) { if (Network::num_machines() > 1) {
num_total_features = Network::GlobalSyncUpByMax(num_total_features); num_total_features = Network::GlobalSyncUpByMax(num_total_features);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment