Commit 713f5e6c authored by Nikita Titov, committed by Guolin Ke
Browse files

[docs] generate parameters description from config file. Stage 1 (#1409)

* generate parameters description from config file

* made pylint happy

* added checks description

* fixed links handling in desc and descl2 fields

* refine Core Parameters section

* removed excess quotes from default values

* fixed parameter type retrieve

* added note about auto-generated parameters
parent f660b5fe
.. List of parameters is auto generated by LightGBM\helper\parameter_generator.py from LightGBM\include\LightGBM\config.h file.
Parameters Parameters
========== ==========
...@@ -23,6 +25,8 @@ By using config files, one line can only contain one parameter. You can use ``#` ...@@ -23,6 +25,8 @@ By using config files, one line can only contain one parameter. You can use ``#`
If one parameter appears in both command line and config file, LightGBM will use the parameter in command line. If one parameter appears in both command line and config file, LightGBM will use the parameter in command line.
.. start params list
Core Parameters Core Parameters
--------------- ---------------
...@@ -721,6 +725,8 @@ This feature is only supported in command line version yet. ...@@ -721,6 +725,8 @@ This feature is only supported in command line version yet.
- output file name of converted model - output file name of converted model
.. end params list
Others Others
------ ------
......
...@@ -3,6 +3,10 @@ Documentation ...@@ -3,6 +3,10 @@ Documentation
Documentation for LightGBM is generated using `Sphinx <http://www.sphinx-doc.org/>`__. Documentation for LightGBM is generated using `Sphinx <http://www.sphinx-doc.org/>`__.
List of parameters and their descriptions in `Parameters.rst <https://github.com/Microsoft/LightGBM/blob/master/docs/Parameters.rst>`__
is generated automatically from comments in `config file <https://github.com/Microsoft/LightGBM/blob/master/include/LightGBM/config.h>`__
by `this script <https://github.com/Microsoft/LightGBM/blob/master/helper/parameter_generator.py>`__.
After each commit on ``master``, documentation is updated and published to `Read the Docs <https://lightgbm.readthedocs.io/>`__. After each commit on ``master``, documentation is updated and published to `Read the Docs <https://lightgbm.readthedocs.io/>`__.
Build Build
......
...@@ -24,7 +24,7 @@ def GetParameterInfos(config_hpp): ...@@ -24,7 +24,7 @@ def GetParameterInfos(config_hpp):
elif cur_key is not None: elif cur_key is not None:
line = line.strip() line = line.strip()
if line.startswith("//"): if line.startswith("//"):
tokens = line.split("//")[1].split("=") tokens = line[2:].split("=")
key = tokens[0].strip() key = tokens[0].strip()
val = '='.join(tokens[1:]).strip() val = '='.join(tokens[1:]).strip()
if key not in cur_info: if key not in cur_info:
...@@ -54,7 +54,7 @@ def GetParameterInfos(config_hpp): ...@@ -54,7 +54,7 @@ def GetParameterInfos(config_hpp):
cur_info["name"] = [tokens[1][:-1].strip()] cur_info["name"] = [tokens[1][:-1].strip()]
member_infos[-1].append(cur_info) member_infos[-1].append(cur_info)
cur_info = {} cur_info = {}
return (keys, member_infos) return keys, member_infos
def GetNames(infos): def GetNames(infos):
...@@ -97,11 +97,72 @@ def SetOneVarFromString(name, type, checks): ...@@ -97,11 +97,72 @@ def SetOneVarFromString(name, type, checks):
return ret return ret
def GenParameterDescription(sections, descriptions, params_rst):
    """Write descriptions of parameters to the documentation file.

    Replaces the text between the ``.. start params list`` and
    ``.. end params list`` markers in ``params_rst`` with an
    auto-generated list of parameters.

    Parameters
    ----------
    sections : list of strings
        Names of parameter sections.
    descriptions : list of lists of dicts
        Structured descriptions of parameters, one inner list per section.
    params_rst : string
        Path to the reStructuredText file with parameters documentation.

    Raises
    ------
    ValueError
        If the params list markers cannot be found in ``params_rst``.
    """
    def parse_check(check, reverse=False):
        """Split a check constraint like '>=0' into (number, sign).

        The sign is 1 or 2 characters long; try the 1-char split first and
        fall back to 2 chars if the remainder is not a valid number.
        With ``reverse=True`` the sign is flipped, which is used to render
        two-sided constraints in the form ``lo < name < hi``.
        """
        try:
            idx = 1
            float(check[idx:])
        except ValueError:
            idx = 2
            float(check[idx:])
        if reverse:
            reversed_sign = {'<': '>', '>': '<', '<=': '>=', '>=': '<='}
            return check[idx:], reversed_sign[check[:idx]]
        else:
            return check[idx:], check[:idx]

    params_to_write = []
    for section_name, section_params in zip(sections, descriptions):
        # Section title with an RST underline of matching length.
        params_to_write.append('{0}\n{1}'.format(section_name, '-' * len(section_name)))
        for param_desc in section_params:
            name = param_desc['name'][0]
            # Strip surrounding quotes from the default value, but keep
            # a literal '""' (empty-string default) as-is.
            default_raw = param_desc['default'][0]
            default = default_raw.strip('"') if len(default_raw.strip('"')) > 0 else default_raw
            # An explicit 'type' comment wins over the C++ member type;
            # drop namespace qualifiers and template brackets
            # (e.g. 'std::vector<std::string>' -> 'string').
            param_type = param_desc.get('type', param_desc['inner_type'])[0].split(':')[-1].split('<')[-1].strip('>')
            options = param_desc.get('options', [])
            if len(options) > 0:
                options_str = ', options: ``{0}``'.format('``, ``'.join([x.strip() for x in options[0].split(',')]))
            else:
                options_str = ''
            aliases = param_desc.get('alias', [])
            if len(aliases) > 0:
                aliases_str = ', aliases: ``{0}``'.format('``, ``'.join([x.strip() for x in aliases[0].split(',')]))
            else:
                aliases_str = ''
            checks = sorted(param_desc.get('check', []))
            checks_len = len(checks)
            if checks_len > 1:
                # Two constraints: render as a single chained comparison,
                # e.g. '0.0 <= name <= 1.0'.
                number1, sign1 = parse_check(checks[0])
                number2, sign2 = parse_check(checks[1], reverse=True)
                checks_str = ', ``{0} {1} {2} {3} {4}``'.format(number2, sign2, name, sign1, number1)
            elif checks_len == 1:
                number, sign = parse_check(checks[0])
                checks_str = ', ``{0} {1} {2}``'.format(name, sign, number)
            else:
                checks_str = ''
            main_desc = '- ``{0}``, default = ``{1}``, type = {2}{3}{4}{5}'.format(name, default, param_type, options_str, aliases_str, checks_str)
            params_to_write.append(main_desc)
            # Indent sub-descriptions by 3 spaces per nesting level; desc
            # entries are (key, text) pairs whose key ends with the level
            # digit (e.g. 'l1', 'l2') -- presumably set by GetParameterInfos.
            params_to_write.extend([' ' * 3 * int(desc[0][-1]) + '- ' + desc[1] for desc in param_desc['desc']])
    with open(params_rst) as original_params_file:
        all_lines = original_params_file.read()
    before, start_sep, _ = all_lines.partition('.. start params list\n\n')
    _, end_sep, after = all_lines.partition('\n\n.. end params list')
    # Fail loudly instead of silently writing a corrupted file when the
    # markers are missing (partition returns empty separators in that case).
    if not start_sep or not end_sep:
        raise ValueError("Can't find params list markers in {0}".format(params_rst))
    with open(params_rst, "w") as new_params_file:
        new_params_file.write(before)
        new_params_file.write(start_sep)
        new_params_file.write('\n\n'.join(params_to_write))
        new_params_file.write(end_sep)
        new_params_file.write(after)
def GenParameterCode(config_hpp, config_out_cpp): def GenParameterCode(config_hpp, config_out_cpp):
keys, infos = GetParameterInfos(config_hpp) keys, infos = GetParameterInfos(config_hpp)
names = GetNames(infos) names = GetNames(infos)
alias = GetAlias(infos) alias = GetAlias(infos)
str_to_write = "/// This file is auto generated by LightGBM\\helper\\parameter_generator.py\n" str_to_write = "/// This file is auto generated by LightGBM\\helper\\parameter_generator.py from LightGBM\\include\\LightGBM\\config.h file.\n"
str_to_write += "#include<LightGBM/config.h>\nnamespace LightGBM {\n" str_to_write += "#include<LightGBM/config.h>\nnamespace LightGBM {\n"
# alias table # alias table
str_to_write += "std::unordered_map<std::string, std::string> Config::alias_table({\n" str_to_write += "std::unordered_map<std::string, std::string> Config::alias_table({\n"
...@@ -151,8 +212,12 @@ def GenParameterCode(config_hpp, config_out_cpp): ...@@ -151,8 +212,12 @@ def GenParameterCode(config_hpp, config_out_cpp):
with open(config_out_cpp, "w") as config_out_cpp_file: with open(config_out_cpp, "w") as config_out_cpp_file:
config_out_cpp_file.write(str_to_write) config_out_cpp_file.write(str_to_write)
return keys, infos
if __name__ == "__main__": if __name__ == "__main__":
config_hpp = os.path.join(os.path.pardir, 'include', 'LightGBM', 'config.h') config_hpp = os.path.join(os.path.pardir, 'include', 'LightGBM', 'config.h')
config_out_cpp = os.path.join(os.path.pardir, 'src', 'io', 'config_auto.cpp') config_out_cpp = os.path.join(os.path.pardir, 'src', 'io', 'config_auto.cpp')
GenParameterCode(config_hpp, config_out_cpp) params_rst = os.path.join(os.path.pardir, 'docs', 'Parameters.rst')
sections, descriptions = GenParameterCode(config_hpp, config_out_cpp)
GenParameterDescription(sections, descriptions, params_rst)
/// desc and descl2 fields must be written in reStructuredText format
#ifndef LIGHTGBM_CONFIG_H_ #ifndef LIGHTGBM_CONFIG_H_
#define LIGHTGBM_CONFIG_H_ #define LIGHTGBM_CONFIG_H_
...@@ -76,126 +78,125 @@ public: ...@@ -76,126 +78,125 @@ public:
#pragma region Core Parameters #pragma region Core Parameters
// [doc-only] // [doc-only]
// alias=config_file // alias = config_file
// desc=path of config file // desc = path of config file
// desc=**Note**: Only can be used in CLI version // desc = **Note**: only can be used in CLI version
std::string config = ""; std::string config = "";
// [doc-only] // [doc-only]
// type=enum // type = enum
// default=train // default = train
// options=train,predict,convert_model,refit // options = train, predict, convert_model, refit
// alias=task_type // alias = task_type
// desc=``train``, alias=\ ``training``, for training // desc = ``train``, for training, aliases: ``training``
// desc=``predict``, alias=\ ``prediction``, ``test``, for prediction // desc = ``predict``, for prediction, aliases: ``prediction``, ``test``
// desc=``convert_model``, for converting model file into if-else format, see more information in `Convert model parameters <#convert-model-parameters>`__ // desc = ``convert_model``, for converting model file into if-else format, see more information in `IO Parameters <#io-parameters>`__
// desc=``refit``, alias = \ ``refit_tree``, refit existing models with new data // desc = ``refit``, for refitting existing models with new data, aliases: ``refit_tree``
// desc=**Note**: Only can be used in CLI version // desc = **Note**: only can be used in CLI version
TaskType task = TaskType::kTrain; TaskType task = TaskType::kTrain;
// [doc-only] // [doc-only]
// type=enum // type = enum
// options=regression,regression_l1,huber,fair,poisson,quantile,mape,gammma,tweedie,binary,multiclass,multiclassova,xentropy,xentlambda,lambdarank // options = regression, regression_l1, huber, fair, poisson, quantile, mape, gammma, tweedie, binary, multiclass, multiclassova, xentropy, xentlambda, lambdarank
// alias=application,app,objective_type // alias = objective_type, app, application
// desc=regression application // desc = regression application
// descl2=``regression_l2``, L2 loss, alias=\ ``regression``, ``mean_squared_error``, ``mse``, ``l2_root``, ``root_mean_squared_error``, ``rmse`` // descl2 = ``regression_l2``, L2 loss, aliases: ``regression``, ``mean_squared_error``, ``mse``, ``l2_root``, ``root_mean_squared_error``, ``rmse``
// descl2=``regression_l1``, L1 loss, alias=\ ``mean_absolute_error``, ``mae`` // descl2 = ``regression_l1``, L1 loss, aliases: ``mean_absolute_error``, ``mae``
// descl2=``huber``, `Huber loss`_ // descl2 = ``huber``, `Huber loss <https://en.wikipedia.org/wiki/Huber_loss>`__
// descl2=``fair``, `Fair loss`_ // descl2 = ``fair``, `Fair loss <https://www.kaggle.com/c/allstate-claims-severity/discussion/24520>`__
// descl2=``poisson``, `Poisson regression`_ // descl2 = ``poisson``, `Poisson regression <https://en.wikipedia.org/wiki/Poisson_regression>`__
// descl2=``quantile``, `Quantile regression`_ // descl2 = ``quantile``, `Quantile regression <https://en.wikipedia.org/wiki/Quantile_regression>`__
// descl2=``mape``, `MAPE loss`_, alias=\ ``mean_absolute_percentage_error`` // descl2 = ``mape``, `MAPE loss <https://en.wikipedia.org/wiki/Mean_absolute_percentage_error>`__, aliases: ``mean_absolute_percentage_error``
// descl2=``gamma``, Gamma regression with log-link. It might be useful, e.g., for modeling insurance claims severity, or for any target that might be `gamma-distributed`_ // descl2 = ``gamma``, Gamma regression with log-link. It might be useful, e.g., for modeling insurance claims severity, or for any target that might be `gamma-distributed <https://en.wikipedia.org/wiki/Gamma_distribution#Applications>`__
// descl2=``tweedie``, Tweedie regression with log-link. It might be useful, e.g., for modeling total loss in insurance, or for any target that might be `tweedie-distributed`_ // descl2 = ``tweedie``, Tweedie regression with log-link. It might be useful, e.g., for modeling total loss in insurance, or for any target that might be `tweedie-distributed <https://en.wikipedia.org/wiki/Tweedie_distribution#Applications>`__
// desc=``binary``, binary `log loss`_ classification application // desc = ``binary``, binary `log loss <https://en.wikipedia.org/wiki/Cross_entropy>`__ classification (or logistic regression). Requires labels in {0, 1}; see ``xentropy`` for general probability labels in [0, 1]
// desc=multi-class classification application // desc = multi-class classification application
// descl2=``multiclass``, `softmax`_ objective function, alias=\ ``softmax`` // descl2 = ``multiclass``, `softmax <https://en.wikipedia.org/wiki/Softmax_function>`__ objective function, aliases: ``softmax``
// descl2=``multiclassova``, `One-vs-All`_ binary objective function, alias=\ ``multiclass_ova``, ``ova``, ``ovr`` // descl2 = ``multiclassova``, `One-vs-All <https://en.wikipedia.org/wiki/Multiclass_classification#One-vs.-rest>`__ binary objective function, aliases: ``multiclass_ova``, ``ova``, ``ovr``
// descl2=``num_class`` should be set as well // descl2 = ``num_class`` should be set as well
// desc=cross-entropy application // desc = cross-entropy application
// descl2=``xentropy``, objective function for cross-entropy (with optional linear weights), alias=\ ``cross_entropy`` // descl2 = ``xentropy``, objective function for cross-entropy (with optional linear weights), aliases: ``cross_entropy``
// descl2=``xentlambda``, alternative parameterization of cross-entropy, alias=\ ``cross_entropy_lambda`` // descl2 = ``xentlambda``, alternative parameterization of cross-entropy, aliases: ``cross_entropy_lambda``
// descl2=the label is anything in interval [0, 1] // descl2 = label is anything in interval [0, 1]
// desc=``lambdarank``, `lambdarank`_ application // desc = ``lambdarank``, `lambdarank <https://papers.nips.cc/paper/2971-learning-to-rank-with-nonsmooth-cost-functions.pdf>`__ application
// descl2=the label should be ``int`` type in lambdarank tasks, and larger number represent the higher relevance (e.g. 0:bad, 1:fair, 2:good, 3:perfect) // descl2 = label should be ``int`` type in lambdarank tasks, and larger number represents the higher relevance (e.g. 0:bad, 1:fair, 2:good, 3:perfect)
// descl2=`label_gain <#objective-parameters>`__ can be used to set the gain(weight) of ``int`` label // descl2 = `label_gain <#objective-parameters>`__ can be used to set the gain (weight) of ``int`` label
// descl2=all values in ``label`` must be smaller than number of elements in ``label_gain`` // descl2 = all values in ``label`` must be smaller than number of elements in ``label_gain``
std::string objective = "regression"; std::string objective = "regression";
// [doc-only] // [doc-only]
// type=enum // type = enum
// alias=boosting_type,boost // alias = boosting_type, boost
// options=gbdt,rf,dart,goss // options = gbdt, rf, dart, goss
// desc=``gbdt``, traditional Gradient Boosting Decision Tree // desc = ``gbdt``, traditional Gradient Boosting Decision Tree
// desc=``rf``, Random Forest // desc = ``rf``, Random Forest
// desc=``dart``, `Dropouts meet Multiple Additive Regression Trees`_ // desc = ``dart``, `Dropouts meet Multiple Additive Regression Trees <https://arxiv.org/abs/1505.01866>`__
// desc=``goss``, Gradient - based One - Side Sampling // desc = ``goss``, Gradient-based One-Side Sampling
std::string boosting = "gbdt"; std::string boosting = "gbdt";
// alias=train,train_data,data_filename // alias = train, train_data, data_filename
// desc=training data, LightGBM will train from this data // desc = training data, LightGBM will train from this data
std::string data = ""; std::string data = "";
// alias=test,valid_data,test_data,valid_filenames // alias = test, valid_data, test_data, valid_filenames
// desc=validation/test data, LightGBM will output metrics for these data // default = ""
// desc=support multi validation data, separate by ``,`` // desc = validation/test data, LightGBM will output metrics for these data
// desc = support multiple validation data, separated by ``,``
std::vector<std::string> valid; std::vector<std::string> valid;
// alias=num_iteration,num_tree,num_trees,num_round,num_rounds,num_boost_round,n_estimators // alias = num_iteration, num_tree, num_trees, num_round, num_rounds, num_boost_round, n_estimators
// check=>=0 // check = >=0
// desc=number of boosting iterations // desc = number of boosting iterations
// desc=**Note**: for Python/R package,**this parameter is ignored**, use num_boost_round (Python) or nrounds (R) input arguments of train and cv methods instead // desc = **Note**: for Python/R package, **this parameter is ignored**, use ``num_boost_round`` (Python) or ``nrounds`` (R) input arguments of ``train`` and ``cv`` methods instead
// desc=**Note**: internally,LightGBM constructs num_class * num_iterations trees for multiclass problems // desc = **Note**: internally, LightGBM constructs ``num_class * num_iterations`` trees for multi-class classification problems
int num_iterations = 100; int num_iterations = 100;
// alias=shrinkage_rate // alias = shrinkage_rate
// check=>0 // check = >0
// desc=shrinkage rate // desc = shrinkage rate
// desc=in dart,it also affects on normalization weights of dropped trees // desc = in ``dart``, it also affects on normalization weights of dropped trees
double learning_rate = 0.1; double learning_rate = 0.1;
// default=31 // default = 31
// alias = num_leaf // alias = num_leaf
// check=>1 // check = >1
// desc=max number of leaves in one tree // desc = max number of leaves in one tree
int num_leaves = kDefaultNumLeaves; int num_leaves = kDefaultNumLeaves;
// [doc-only] // [doc-only]
// type=enum // type = enum
// options=serial, feature, data, voting // options = serial, feature, data, voting
// alias = tree, tree_learner_type // alias = tree, tree_learner_type
// desc=serial,single machine tree learner // desc = ``serial``, single machine tree learner
// desc=feature,alias=feature_parallel,feature parallel tree learner // desc = ``feature``, feature parallel tree learner, aliases: ``feature_parallel``
// desc=data,alias=data_parallel,data parallel tree learner // desc = ``data``, data parallel tree learner, aliases: ``data_parallel``
// desc=voting,alias=voting_parallel,voting parallel tree learner // desc = ``voting``, voting parallel tree learner, aliases: ``voting_parallel``
// desc=refer to `Parallel Learning Guide <./Parallel-Learning-Guide.rst>`__ to get more details // desc = refer to `Parallel Learning Guide <./Parallel-Learning-Guide.rst>`__ to get more details
std::string tree_learner = "serial"; std::string tree_learner = "serial";
// default=OpenMP_default
// alias = num_thread, nthread, nthreads // alias = num_thread, nthread, nthreads
// desc = number of threads for LightGBM // desc = number of threads for LightGBM
// desc=for the best speed,set this to the number of **real CPU cores**, // desc = ``0`` means default number of threads in OpenMP
// not the number of threads(most CPU using `hyper-threading`_ to generate 2 threads per CPU core) // desc = for the best speed, set this to the number of **real CPU cores**, not the number of threads (most CPUs use `hyper-threading <https://en.wikipedia.org/wiki/Hyper-threading>`__ to generate 2 threads per CPU core)
// desc=do not set it too large if your dataset is small (do not use 64 threads for a dataset with 10,000 rows for instance) // desc = do not set it too large if your dataset is small (for instance, do not use 64 threads for a dataset with 10,000 rows)
// desc=be aware a task manager or any similar CPU monitoring tool might report cores not being fully utilized. **This is normal** // desc = be aware a task manager or any similar CPU monitoring tool might report that cores are not being fully utilized. **This is normal**
// desc=for parallel learning,should not use full CPU cores since this will cause poor performance for the network // desc = for parallel learning, do not use all CPU cores because this will cause poor performance for the network communication
int num_threads = 0; int num_threads = 0;
// [doc-only] // [doc-only]
// options=cpu,gpu // type = enum
// desc = choose device for the tree learning, you can use GPU to achieve the faster learning // options = cpu, gpu
// desc=**Note**: it is recommended to use the smaller max_bin (e.g. 63) to get the better speed up // desc = device for the tree learning, you can use GPU to achieve the faster learning
// desc=**Note**: for the faster speed,GPU use 32-bit float point to sum up by default,may affect the accuracy for some tasks. // desc = **Note**: it is recommended to use the smaller ``max_bin`` (e.g. 63) to get the better speed up
// desc=You can set gpu_use_dp = true to enable 64 - bit float point, but it will slow down the training // desc = **Note**: for the faster speed, GPU uses 32-bit float point to sum up by default, so this may affect the accuracy for some tasks. You can set ``gpu_use_dp=true`` to enable 64-bit float point, but it will slow down the training
// desc=**Note**: refer to `Installation Guide <./Installation-Guide.rst#build-gpu-version>`__ to build with GPU // desc = **Note**: refer to `Installation Guide <./Installation-Guide.rst#build-gpu-version>`__ to build LightGBM with GPU support
std::string device_type = "cpu"; std::string device_type = "cpu";
// [doc-only] // [doc-only]
// alias=random_seed // alias = random_seed
// desc=Use this seed to generate seeds for others, e.g. data_random_seed. // desc = this seed is used to generate other seeds, e.g. ``data_random_seed``, ``feature_fraction_seed``
// desc=Will be override if set other seeds as well // desc = will be overridden, if you set other seeds
// default=none
int seed = 0; int seed = 0;
#pragma endregion #pragma endregion
...@@ -497,6 +498,7 @@ public: ...@@ -497,6 +498,7 @@ public:
std::string initscore_filename = ""; std::string initscore_filename = "";
// alias=valid_data_init_scores,valid_init_score_file,valid_init_score // alias=valid_data_init_scores,valid_init_score_file,valid_init_score
// default=""
// desc=path to validation initial score file,"" will use valid_data_file + .init (if exists) // desc=path to validation initial score file,"" will use valid_data_file + .init (if exists)
// desc=separate by ,for multi-validation data // desc=separate by ,for multi-validation data
std::vector<std::string> valid_data_initscores; std::vector<std::string> valid_data_initscores;
......
/// This file is auto generated by LightGBM\helper\parameter_generator.py /// This file is auto generated by LightGBM\helper\parameter_generator.py from LightGBM\include\LightGBM\config.h file.
#include<LightGBM/config.h> #include<LightGBM/config.h>
namespace LightGBM { namespace LightGBM {
std::unordered_map<std::string, std::string> Config::alias_table({ std::unordered_map<std::string, std::string> Config::alias_table({
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment