Unverified Commit bee732af authored by Guolin Ke's avatar Guolin Ke Committed by GitHub
Browse files

better document for bin_construct_sample_cnt (#3521)



* better document for bin_construct_sample_cnt

* add warnings
Co-authored-by: default avatarStrikerRUS <nekit94-12@hotmail.com>
parent 335ba231
......@@ -634,12 +634,14 @@ Dataset Parameters
- ``bin_construct_sample_cnt`` :raw-html:`<a id="bin_construct_sample_cnt" title="Permalink to this parameter" href="#bin_construct_sample_cnt">&#x1F517;&#xFE0E;</a>`, default = ``200000``, type = int, aliases: ``subsample_for_bin``, constraints: ``bin_construct_sample_cnt > 0``
- number of data that sampled to construct histogram bins
- number of data points that are sampled to construct feature discrete bins
- setting this to larger value will give better training result, but will increase data loading time
- setting this to larger value will give better training result, but may increase data loading time
- set this to larger value if data is very sparse
- **Note**: do not set this to a small value; otherwise, you may encounter unexpected errors and poor accuracy
- ``data_random_seed`` :raw-html:`<a id="data_random_seed" title="Permalink to this parameter" href="#data_random_seed">&#x1F517;&#xFE0E;</a>`, default = ``1``, type = int, aliases: ``data_seed``
- random seed for sampling data to construct histogram bins
......
......@@ -580,9 +580,10 @@ struct Config {
// alias = subsample_for_bin
// check = >0
// desc = number of data that sampled to construct histogram bins
// desc = setting this to larger value will give better training result, but will increase data loading time
// desc = number of data points that are sampled to construct feature discrete bins
// desc = setting this to larger value will give better training result, but may increase data loading time
// desc = set this to larger value if data is very sparse
// desc = **Note**: do not set this to a small value; otherwise, you may encounter unexpected errors and poor accuracy
int bin_construct_sample_cnt = 200000;
// alias = data_seed
......
......@@ -164,6 +164,16 @@ void DatasetLoader::SetHeader(const char* filename) {
}
}
// Warns when the sample used to construct feature bins is likely too small:
// the sample covers less than 20% of the dataset AND is below 100k rows in
// absolute terms. A too-small ``bin_construct_sample_cnt`` can lead to
// unexpected errors and poor accuracy.
//
// \param sample_cnt Number of rows actually sampled for bin construction.
// \param num_data   Total number of rows in the dataset.
void CheckSampleSize(size_t sample_cnt, size_t num_data) {
  // Guard against division by zero on an empty dataset; nothing to warn about.
  if (num_data == 0) {
    return;
  }
  // Use a double literal (0.2, not 0.2f) so the comparison stays in double
  // precision, matching the static_cast<double> on the left-hand side.
  const double sample_ratio =
      static_cast<double>(sample_cnt) / static_cast<double>(num_data);
  if (sample_ratio < 0.2 && sample_cnt < 100000) {
    Log::Warning(
        "Using too small ``bin_construct_sample_cnt`` may encounter "
        "unexpected errors and poor accuracy.");
  }
}
Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_machines) {
// don't support query id in data file when training in parallel
if (num_machines > 1 && !config_.pre_partition) {
......@@ -190,6 +200,8 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac
dataset->num_data_ = static_cast<data_size_t>(text_data.size());
// sample data
auto sample_data = SampleTextDataFromMemory(text_data);
CheckSampleSize(sample_data.size(),
static_cast<size_t>(dataset->num_data_));
// construct feature bin mappers
ConstructBinMappersFromTextData(rank, num_machines, sample_data, parser.get(), dataset.get());
// initialize label
......@@ -205,6 +217,8 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac
} else {
dataset->num_data_ = num_global_data;
}
CheckSampleSize(sample_data.size(),
static_cast<size_t>(dataset->num_data_));
// construct feature bin mappers
ConstructBinMappersFromTextData(rank, num_machines, sample_data, parser.get(), dataset.get());
// initialize label
......@@ -540,6 +554,7 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b
Dataset* DatasetLoader::ConstructFromSampleData(double** sample_values,
int** sample_indices, int num_col, const int* num_per_col,
size_t total_sample_size, data_size_t num_data) {
CheckSampleSize(total_sample_size, static_cast<size_t>(num_data));
int num_total_features = num_col;
if (Network::num_machines() > 1) {
num_total_features = Network::GlobalSyncUpByMax(num_total_features);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment