add Chinese translation (#661)
# -*- coding: utf-8 -*-
#
# Configuration file for the Sphinx documentation builder.
#
# This file does only contain a selection of the most common options. For a
# full list see the documentation:
# http://www.sphinx-doc.org/en/master/config
# -- Path setup --------------------------------------------------------------
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# import os
# import sys
# sys.path.insert(0, os.path.abspath('.'))
from recommonmark.parser import CommonMarkParser
from recommonmark.transform import AutoStructify
# -- Project information ---------------------------------------------------
project = 'Neural Network Intelligence'
copyright = '2019, Microsoft'
author = 'Microsoft'
# The short X.Y version
version = ''
# The full version, including alpha/beta/rc tags
release = 'v0.5'
# -- General configuration ---------------------------------------------------
# If your documentation needs a minimal Sphinx version, state it here.
#
# needs_sphinx = '1.0'
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    'sphinx.ext.autodoc',
    'sphinx.ext.mathjax',
    'sphinx_markdown_tables',
    'sphinxarg.ext',
]
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
# The suffix(es) of source filenames.
# You can specify multiple suffixes as a list of strings:
#
source_parsers = {
    '.md': CommonMarkParser
}
source_suffix = ['.rst', '.md']
# The master toctree document.
master_doc = 'index'
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
language = None
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = None
# -- Options for HTML output -------------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
#
html_theme_options = {
    'logo_only': True,
}
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
# html_static_path = ['_static']
# Custom sidebar templates, must be a dictionary that maps document names
# to template names.
#
# The default sidebars (for documents that don't match any pattern) are
# defined by theme itself. Builtin themes are using these templates by
# default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
# 'searchbox.html']``.
#
# html_sidebars = {}
html_logo = './img/nni_logo_dark.png'
# -- Options for HTMLHelp output ---------------------------------------------
# Output file base name for HTML help builder.
htmlhelp_basename = 'NeuralNetworkIntelligencedoc'
# -- Options for LaTeX output ------------------------------------------------
latex_elements = {
    # The paper size ('letterpaper' or 'a4paper').
    #
    # 'papersize': 'letterpaper',

    # The font size ('10pt', '11pt' or '12pt').
    #
    # 'pointsize': '10pt',

    # Additional stuff for the LaTeX preamble.
    #
    # 'preamble': '',

    # Latex figure (float) alignment
    #
    # 'figure_align': 'htbp',
}
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [
    (master_doc, 'NeuralNetworkIntelligence.tex', 'Neural Network Intelligence Documentation',
     'Microsoft', 'manual'),
]
# -- Options for manual page output ------------------------------------------
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
    (master_doc, 'neuralnetworkintelligence', 'Neural Network Intelligence Documentation',
     [author], 1)
]
# -- Options for Texinfo output ----------------------------------------------
# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
# dir menu entry, description, category)
texinfo_documents = [
    (master_doc, 'NeuralNetworkIntelligence', 'Neural Network Intelligence Documentation',
     author, 'NeuralNetworkIntelligence', 'One line description of project.',
     'Miscellaneous'),
]
# -- Options for Epub output -------------------------------------------------
# Bibliographic Dublin Core info.
epub_title = project
# The unique identifier of the text. This can be a ISBN number
# or the project homepage.
#
# epub_identifier = ''
# A unique identification for the text.
#
# epub_uid = ''
# A list of files that should not be packed into the epub file.
epub_exclude_files = ['search.html']
# -- Extension configuration -------------------------------------------------
github_doc_root = 'https://github.com/Microsoft/nni/tree/master/doc/'
def setup(app):
    app.add_config_value('recommonmark_config', {
        'url_resolver': lambda url: github_doc_root + url if url.startswith('..') else url,
        'enable_auto_toc_tree': False,
    }, True)
    app.add_transform(AutoStructify)
# GBDT

Gradient boosting is a machine learning technique for regression and classification problems. It produces a prediction model in the form of an ensemble of weak prediction models, typically decision trees. Like other boosting methods, it builds the model in a stage-wise fashion and optimizes it with a differentiable loss function.
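To make the stage-wise idea concrete, here is a minimal toy sketch (an illustration only, not how LightGBM is implemented): with squared loss, the negative gradient is simply the current residual, so each stage fits a weak learner to the residuals and adds it to the model.

```python
# Toy stage-wise gradient boosting with squared loss on a single feature.
# Illustrative only; real GBDT libraries use far more sophisticated trees.
import numpy as np

def fit_stump(x, residual):
    """Fit a depth-1 regression tree (a stump) to the residuals."""
    best = None
    for t in np.unique(x):
        left, right = residual[x <= t], residual[x > t]
        if len(left) == 0 or len(right) == 0:
            continue
        # squared error of predicting each side's mean
        err = ((left - left.mean()) ** 2).sum() + ((right - right.mean()) ** 2).sum()
        if best is None or err < best[0]:
            best = (err, t, left.mean(), right.mean())
    _, t, lv, rv = best
    return lambda z: np.where(z <= t, lv, rv)

def toy_gbdt(x, y, n_stages=50, lr=0.1):
    pred = np.full_like(y, y.mean(), dtype=float)
    for _ in range(n_stages):
        residual = y - pred               # negative gradient of squared loss
        stump = fit_stump(x, residual)    # the weak learner fits the gradient
        pred += lr * stump(x)             # stage-wise additive update
    return pred
```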
Gradient boosting decision tree (GBDT) has many popular implementations, such as [LightGBM](https://github.com/Microsoft/LightGBM), [xgboost](https://github.com/dmlc/xgboost), and [catboost](https://github.com/catboost/catboost). GBDT is an important tool for solving classic machine learning problems, and it is a robust algorithm that can be applied in many domains. The better its hyperparameters are, the better performance it achieves.

NNI is a platform for hyperparameter tuning; with it you can try the various built-in search algorithms and run multiple trials in parallel.
## 1. Search space for GBDT

GBDT has many hyperparameters, but which of them affect performance or computation speed? Based on practical experience, we suggest the following (using lightgbm as an example):

> * For better accuracy
>   * `learning_rate`. The range of `learning_rate` should be [0.001, 0.9].
>   * `num_leaves`. `num_leaves` is related to `max_depth`, so there is no need to tune both of them at the same time (see the short calculation after this list).
>   * `bagging_freq`. `bagging_freq` can be [1, 2, 4, 8, 10].
>   * `num_iterations`. Can be set larger in order to reach the desired fitting accuracy.
> * For faster speed
>   * `bagging_fraction`. The range of `bagging_fraction` should be [0.7, 1.0].
>   * `feature_fraction`. The range of `feature_fraction` should be [0.6, 1.0].
>   * `max_bin`.
> * To avoid overfitting
>   * `min_data_in_leaf`. Depends on the dataset.
>   * `min_sum_hessian_in_leaf`. Depends on the dataset.
>   * `lambda_l1` and `lambda_l2`.
>   * `min_gain_to_split`.
>   * `num_leaves`.

For more information, see [lightgbm](https://lightgbm.readthedocs.io/en/latest/Parameters-Tuning.html) and [autoxgboost](https://github.com/ja-thomas/autoxgboost/blob/master/poster_2018.pdf).
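The `num_leaves`/`max_depth` remark above comes down to a line of arithmetic: a binary tree of depth `max_depth` has at most 2^`max_depth` leaves, so fixing either parameter effectively bounds the other.

```python
# Why num_leaves and max_depth need not both be tuned: a binary tree of
# depth max_depth has at most 2**max_depth leaves, so one bound implies
# the other.
max_depth = 6
print(2 ** max_depth)  # 64: an upper bound on num_leaves at this depth
```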
## 2. Task description

"auto-gbdt" is based on LightGBM and NNI. The dataset consists of [training data](https://github.com/Microsoft/nni/blob/master/examples/trials/auto-gbdt/data/regression.train) and [test data](https://github.com/Microsoft/nni/blob/master/examples/trials/auto-gbdt/data/regression.test). Given the features and labels in the data, we train a GBDT regression model and use it to make predictions.

## 3. How to run NNI

### 3.1 Prepare the trial code

The basic code is as follows:
```python
...

def get_default_parameters():
    ...
    return params

def load_data(train_path='./data/regression.train', test_path='./data/regression.test'):
    '''
    Load or create the dataset
    '''
    ...
    return lgb_train, lgb_eval, X_test, y_test

def run(lgb_train, lgb_eval, params, X_test, y_test):
    # train
    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=20,
                    valid_sets=lgb_eval,
                    early_stopping_rounds=5)

    # predict
    y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)

    # evaluate
    rmse = mean_squared_error(y_test, y_pred) ** 0.5
    print('The rmse of prediction is:', rmse)

if __name__ == '__main__':
    lgb_train, lgb_eval, X_test, y_test = load_data()
    PARAMS = get_default_parameters()

    # train
    run(lgb_train, lgb_eval, PARAMS, X_test, y_test)
```
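The snippet above elides its imports and the body of `get_default_parameters()`. As a hedged sketch of what they might look like (the actual main.py in the repository may differ), one could write:

```python
# Hypothetical supporting code for the snippet above; the real main.py
# in the NNI repository may differ.
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

def get_default_parameters():
    # Illustrative defaults for the regression task, not tuned values.
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'num_leaves': 31,
        'learning_rate': 0.1,
        'bagging_fraction': 0.9,
        'bagging_freq': 2
    }
    return params
```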
### 3.2 Prepare the search space

If you want to tune `num_leaves`, `learning_rate`, `bagging_fraction`, and `bagging_freq`, you can create a [search_space.json](https://github.com/Microsoft/nni/blob/master/examples/trials/auto-gbdt/search_space.json) file as follows:

```json
{
    "num_leaves": {"_type": "choice", "_value": [31, 28, 24, 20]},
    "learning_rate": {"_type": "choice", "_value": [0.01, 0.05, 0.1, 0.2]},
    "bagging_fraction": {"_type": "uniform", "_value": [0.7, 1.0]},
    "bagging_freq": {"_type": "choice", "_value": [1, 2, 4, 8, 10]}
}
```

See [here](./SearchSpaceSpec.md) to learn more about variable types.
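For intuition, each trial receives one concrete point sampled from this space. A hypothetical sample might look like the following (values chosen for illustration):

```python
# A hypothetical configuration NNI's tuner might draw from the search
# space above; the keys match the fields of search_space.json.
sampled_params = {
    "num_leaves": 24,           # one of the "choice" values
    "learning_rate": 0.05,      # one of the "choice" values
    "bagging_fraction": 0.83,   # drawn uniformly from [0.7, 1.0]
    "bagging_freq": 4           # one of the "choice" values
}
```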
### 3.3 Use the NNI SDK in your code

```diff
+import nni
...

def get_default_parameters():
    ...
    return params

def load_data(train_path='./data/regression.train', test_path='./data/regression.test'):
    '''
    Load or create the dataset
    '''
    ...
    return lgb_train, lgb_eval, X_test, y_test

def run(lgb_train, lgb_eval, params, X_test, y_test):
    # train
    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=20,
                    valid_sets=lgb_eval,
                    early_stopping_rounds=5)

    # predict
    y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)

    # evaluate
    rmse = mean_squared_error(y_test, y_pred) ** 0.5
    print('The rmse of prediction is:', rmse)
+   nni.report_final_result(rmse)

if __name__ == '__main__':
    lgb_train, lgb_eval, X_test, y_test = load_data()
+   RECEIVED_PARAMS = nni.get_next_parameter()
    PARAMS = get_default_parameters()
+   PARAMS.update(RECEIVED_PARAMS)

    # train
    run(lgb_train, lgb_eval, PARAMS, X_test, y_test)
```
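An optional extension, not part of the original example: besides the final RMSE, each boosting round's validation metric can be reported so the NNI WebUI shows trial progress (and assessors, if configured, can stop bad trials early). A minimal sketch using a LightGBM callback, reusing the names from the snippet above:

```python
# Optional: report each boosting round's validation metric to NNI.
# A sketch only; names (params, lgb_train, lgb_eval) follow the snippet above.
import nni

def report_to_nni(env):
    # env.evaluation_result_list holds tuples of
    # (dataset_name, metric_name, value, is_higher_better)
    _, _, value, _ = env.evaluation_result_list[0]
    nni.report_intermediate_result(value)

gbm = lgb.train(params,
                lgb_train,
                num_boost_round=20,
                valid_sets=lgb_eval,
                early_stopping_rounds=5,
                callbacks=[report_to_nni])
```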
### 3.4 Write the config file and run the experiment

In the config file, you can set:

* Experiment settings: `trialConcurrency`, `maxExecDuration`, `maxTrialNum`, `trial gpuNum`, etc.
* Platform settings: `trainingServicePlatform`, etc.
* Path settings: `searchSpacePath`, `trial codeDir`, etc.
* Algorithm settings: the `Tuner` algorithm, the optimization direction (`optimize_mode`), etc.
An example config.yml:

```yaml
authorName: default
experimentName: example_auto-gbdt
trialConcurrency: 1
maxExecDuration: 10h
maxTrialNum: 10
#choice: local, remote, pai
trainingServicePlatform: local
searchSpacePath: search_space.json
#choice: true, false
useAnnotation: false
tuner:
  #choice: TPE, Random, Anneal, Evolution, BatchTuner
  #SMAC (SMAC should be installed through nnictl first)
  builtinTunerName: TPE
  classArgs:
    #choice: maximize, minimize
    optimize_mode: minimize
trial:
  command: python3 main.py
  codeDir: .
  gpuNum: 0
```
Start the experiment with the following command:
```bash
nnictl create --config ./config.yml
```