Unverified Commit 0fb78620 authored by SparkSnail, committed by GitHub

Merge pull request #238 from microsoft/master

merge master
parents 3ee09617 b8d19e45
@@ -45,6 +45,7 @@ NNI_INSTALL_PATH ?= $(INSTALL_PREFIX)/nni
BIN_FOLDER ?= $(ROOT_FOLDER)/bin
NNI_PKG_FOLDER ?= $(ROOT_FOLDER)/nni
NASUI_PKG_FOLDER ?= $(ROOT_FOLDER)/nni/nasui
## Dependency information
NNI_DEPENDENCY_FOLDER = /tmp/$(USER)
@@ -132,6 +133,8 @@ clean:
-rm -rf src/sdk/pynni/nni_sdk.egg-info
-rm -rf src/webui/build
-rm -rf src/webui/node_modules
-rm -rf src/nasui/build
-rm -rf src/nasui/node_modules
# Main targets end
@@ -193,6 +196,11 @@ install-node-modules:
sed -ie 's/$(NNI_VERSION_TEMPLATE)/$(NNI_VERSION_VALUE)/' $(NNI_PKG_FOLDER)/package.json
$(NNI_YARN) --prod --cwd $(NNI_PKG_FOLDER)
cp -r src/webui/build $(NNI_PKG_FOLDER)/static
# Install nasui
mkdir -p $(NASUI_PKG_FOLDER)
cp -rf src/nasui/build $(NASUI_PKG_FOLDER)
cp src/nasui/server.js $(NASUI_PKG_FOLDER)
.PHONY: dev-install-node-modules
dev-install-node-modules:
@@ -203,6 +211,8 @@ dev-install-node-modules:
sed -ie 's/$(NNI_VERSION_TEMPLATE)/$(NNI_VERSION_VALUE)/' $(NNI_PKG_FOLDER)/package.json
ln -sf ${PWD}/src/nni_manager/node_modules $(NNI_PKG_FOLDER)/node_modules
ln -sf ${PWD}/src/webui/build $(NNI_PKG_FOLDER)/static
ln -sf ${PWD}/src/nasui/build $(NASUI_PKG_FOLDER)/build
ln -sf ${PWD}/src/nasui/server.js $(NASUI_PKG_FOLDER)/server.js
.PHONY: install-scripts
install-scripts:
...
@@ -28,7 +28,8 @@ The tool manages automated machine learning (AutoML) experiments, **dispatches a
### **NNI v1.4 has been released! &nbsp;<a href="#nni-released-reminder"><img width="48" src="docs/img/release_icon.png"></a>**
## **NNI capabilities in a glance**
NNI provides a command line tool as well as a user-friendly WebUI to manage training experiments. With the extensible API, you can customize your own AutoML algorithms and training services. To make it easy for new users, NNI also provides a set of built-in, state-of-the-art AutoML algorithms and out-of-the-box support for popular training platforms.
The following table summarizes NNI's current capabilities. We are gradually adding new ones, and we'd love to have your contributions.
@@ -105,24 +106,24 @@ Within the following table, we summarized the current NNI capabilities, we are g
<b>Heuristic search</b>
<ul>
<li><a href="docs/en_US/Tuner/BuiltinTuner.md#Evolution">Naïve Evolution</a></li>
<li><a href="docs/en_US/Tuner/BuiltinTuner.md#Anneal">Anneal</a></li>
<li><a href="docs/en_US/Tuner/BuiltinTuner.md#Hyperband">Hyperband</a></li>
</ul>
<b>Bayesian optimization</b>
<ul>
<li><a href="docs/en_US/Tuner/BuiltinTuner.md#BOHB">BOHB</a></li>
<li><a href="docs/en_US/Tuner/BuiltinTuner.md#TPE">TPE</a></li>
<li><a href="docs/en_US/Tuner/BuiltinTuner.md#SMAC">SMAC</a></li>
<li><a href="docs/en_US/Tuner/BuiltinTuner.md#MetisTuner">Metis Tuner</a></li>
<li><a href="docs/en_US/Tuner/BuiltinTuner.md#GPTuner">GP Tuner</a></li>
</ul>
<b>RL Based</b>
<ul>
<li><a href="docs/en_US/Tuner/BuiltinTuner.md#PPOTuner">PPO Tuner</a></li>
</ul>
</ul>
<a href="docs/en_US/NAS/Overview.md">Neural Architecture Search</a>
<ul>
<ul>
<li><a href="docs/en_US/NAS/ENAS.md">ENAS</a></li>
<li><a href="docs/en_US/NAS/DARTS.md">DARTS</a></li>
@@ -131,7 +132,7 @@ Within the following table, we summarized the current NNI capabilities, we are g
<li><a href="docs/en_US/NAS/SPOS.md">SPOS</a></li>
<li><a href="docs/en_US/NAS/Proxylessnas.md">ProxylessNAS</a></li>
<li><a href="docs/en_US/Tuner/BuiltinTuner.md#NetworkMorphism">Network Morphism</a></li>
</ul>
</ul>
<a href="docs/en_US/Compressor/Overview.md">Model Compression</a>
<ul>
@@ -155,7 +156,7 @@ Within the following table, we summarized the current NNI capabilities, we are g
<a href="docs/en_US/Assessor/BuiltinAssessor.md">Early Stop Algorithms</a>
<ul>
<li><a href="docs/en_US/Assessor/BuiltinAssessor.md#Medianstop">Median Stop</a></li>
<li><a href="docs/en_US/Assessor/BuiltinAssessor.md#Curvefitting">Curve Fitting</a></li>
</ul>
</td>
<td>
@@ -288,8 +289,9 @@ You can use these commands to get more information about the experiment
</table>
## **Documentation**
* To learn what NNI is, read the [NNI Overview](https://nni.readthedocs.io/en/latest/Overview.html).
* To get familiar with how to use NNI, read the [documentation](https://nni.readthedocs.io/en/latest/index.html).
* To get started and install NNI on your system, refer to [Install NNI](https://nni.readthedocs.io/en/latest/installation.html).
## **Contributing**
@@ -300,6 +302,7 @@ When you submit a pull request, a CLA-bot will automatically determine whether y
This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). For more information, see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact opencode@microsoft.com with any additional questions or comments.
After getting familiar with the contribution agreements, you are ready to create your first PR =). Follow the NNI developer tutorials to get started:
* We recommend that new contributors start with simple issues: ['good first issue'](https://github.com/Microsoft/nni/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) or ['help-wanted'](https://github.com/microsoft/nni/issues?q=is%3Aopen+is%3Aissue+label%3A%22help+wanted%22).
* [NNI developer environment installation tutorial](docs/en_US/Tutorial/SetupNniDeveloperEnvironment.md)
* [How to debug](docs/en_US/Tutorial/HowToDebug.md)
@@ -311,15 +314,14 @@ After getting familiar with contribution agreements, you are ready to create you
## **External Repositories and References**
With the authors' permission, we have listed a set of NNI usage examples and relevant articles.
* ### **External Repositories** ###
* Run [ENAS](examples/tuners/enas_nni/README.md) with NNI
* Run [Neural Network Architecture Search](examples/trials/nas_cifar10/README.md) with NNI
* [Automatic Feature Engineering](examples/feature_engineering/auto-feature-engineering/README.md) with NNI
* [Hyperparameter Tuning for Matrix Factorization](https://github.com/microsoft/recommenders/blob/master/notebooks/04_model_select_and_optimize/nni_surprise_svd.ipynb) with NNI
* [scikit-nni](https://github.com/ksachdeva/scikit-nni) Hyper-parameter search for scikit-learn pipelines using NNI
* ### **Relevant Articles** ###
* [Hyper Parameter Optimization Comparison](docs/en_US/CommunitySharings/HpoComparision.md)
* [Neural Architecture Search Comparison](docs/en_US/CommunitySharings/NasComparision.md)
* [Parallelizing a Sequential Algorithm TPE](docs/en_US/CommunitySharings/ParallelizingTpeSearch.md)
@@ -330,11 +332,13 @@ With authors' permission, we listed a set of NNI usage examples and relevant art
* **Blog (in Chinese)** - [A summary of NNI new capabilities in 2019](https://mp.weixin.qq.com/s/7_KRT-rRojQbNuJzkjFMuA) by @squirrelsc
## **Feedback**
* Discuss on the NNI [Gitter](https://gitter.im/Microsoft/nni?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) channel.
* [File an issue](https://github.com/microsoft/nni/issues/new/choose) on GitHub.
* Ask a question with NNI tags on [Stack Overflow](https://stackoverflow.com/questions/tagged/nni?sort=Newest&edited=true).
## Related Projects
Targeting openness and advancing state-of-the-art technology, [Microsoft Research (MSR)](https://www.microsoft.com/en-us/research/group/systems-research-group-asia/) has also released a few other open-source projects.
* [OpenPAI](https://github.com/Microsoft/pai) : an open-source platform that provides complete AI model training and resource management capabilities; it is easy to extend and supports on-premise, cloud, and hybrid environments at various scales.
...
@@ -52,24 +52,12 @@ jobs:
displayName: 'Run flake8 tests to find Python syntax errors and undefined names'
- script: |
cd test
source scripts/unittest.sh
displayName: 'Unit test'
- script: |
cd test
python3 nni_test/nnitest/run_tests.py --config config/pr_tests.yml
displayName: 'Simple test'
- script: |
cd test
python3 tuner_test.py
displayName: 'Built-in tuners / assessors tests'
- script: |
cd test
python3 metrics_test.py
displayName: 'Trial job metrics test'
- script: |
cd test
python3 cli_test.py
displayName: 'nnicli test'
- script: |
cd docs/en_US/
sphinx-build -M html . _build -W
@@ -101,20 +89,12 @@ jobs:
displayName: 'Install dependencies'
- script: |
cd test
source scripts/unittest.sh
displayName: 'Unit test'
- script: |
cd test
python3 nni_test/nnitest/run_tests.py --config config/pr_tests.yml
displayName: 'Simple test'
- script: |
cd test
python3 tuner_test.py
displayName: 'Built-in tuners / assessors tests'
- script: |
cd test
python3 cli_test.py
displayName: 'nnicli test'
- job: 'basic_test_pr_Windows'
pool:
@@ -137,13 +117,9 @@ jobs:
displayName: 'Install dependencies'
- script: |
cd test
powershell.exe -file scripts/unittest.ps1
displayName: 'unit test'
- script: |
cd test
python nni_test/nnitest/run_tests.py --config config/pr_tests.yml
displayName: 'Simple test'
- script: |
cd test
PATH=$HOME/.local/bin:$PATH python3 cli_test.py
displayName: 'nnicli test'
@@ -36,10 +36,13 @@ build:
tar -xf $(NNI_YARN_TARBALL) -C $(NNI_YARN_FOLDER) --strip-components 1
cd $(CWD)../../src/nni_manager && $(NNI_YARN) && $(NNI_YARN) build
cd $(CWD)../../src/webui && $(NNI_YARN) && $(NNI_YARN) build
cd $(CWD)../../src/nasui && $(NNI_YARN) && $(NNI_YARN) build
rm -rf $(CWD)nni
cp -r $(CWD)../../src/nni_manager/dist $(CWD)nni
cp -r $(CWD)../../src/nni_manager/config $(CWD)nni
cp -r $(CWD)../../src/webui/build $(CWD)nni/static
cp -r $(CWD)../../src/nasui/build $(CWD)nni/nasui
cp $(CWD)../../src/nasui/server.js $(CWD)nni/nasui
cp $(CWD)../../src/nni_manager/package.json $(CWD)nni
sed -ie 's/$(NNI_VERSION_TEMPLATE)/$(NNI_VERSION_VALUE)/' $(CWD)nni/package.json
cd $(CWD)nni && $(NNI_YARN) --prod
...
@@ -46,11 +46,16 @@ yarn build
cd $CWD\..\..\src\webui
yarn
yarn build
cd $CWD\..\..\src\nasui
yarn
yarn build
if(Test-Path $CWD\nni){
Remove-Item $CWD\nni -r -fo
}
Copy-Item $CWD\..\..\src\nni_manager\dist $CWD\nni -Recurse
Copy-Item $CWD\..\..\src\webui\build $CWD\nni\static -Recurse
Copy-Item $CWD\..\..\src\nasui\build $CWD\nni\nasui -Recurse
Copy-Item $CWD\..\..\src\nasui\server.js $CWD\nni\nasui -Recurse
Copy-Item $CWD\..\..\src\nni_manager\package.json $CWD\nni
(Get-Content $CWD\nni\package.json).replace($NNI_VERSION_TEMPLATE, $NNI_VERSION_VALUE) | Set-Content $CWD\nni\package.json
cd $CWD\nni
...
# Built-in Assessors
NNI provides state-of-the-art tuning algorithms within our builtin-assessors and makes them easy to use. Below is a brief overview of NNI's current builtin Assessors.
Note: Click the **Assessor's name** to get each Assessor's installation requirements, suggested usage scenario, and a config example. A link to a detailed description of each algorithm is provided at the end of the suggested scenario for each Assessor.
Currently, we support the following Assessors:
|Assessor|Brief Introduction of Algorithm|
|---|---|
|[__Medianstop__](#MedianStop)|Medianstop is a simple early stopping rule. It stops a pending trial X at step S if the trial’s best objective value by step S is strictly worse than the median value of the running averages of all completed trials’ objectives reported up to step S. [Reference Paper](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/46180.pdf)|
|[__Curvefitting__](#Curvefitting)|Curve Fitting Assessor is an LPA (learning, predicting, assessing) algorithm. It stops a pending trial X at step S if the prediction of the final epoch's performance is worse than the best final performance in the trial history. In this algorithm, we use 12 curves to fit the accuracy curve. [Reference Paper](http://aad.informatik.uni-freiburg.de/papers/15-IJCAI-Extrapolation_of_Learning_Curves.pdf)|
## Usage of Builtin Assessors
Using the builtin assessors provided by the NNI SDK requires declaring the **builtinAssessorName** and **classArgs** in the `config.yml` file. In this part, we introduce the detailed usage, along with the suggested scenarios, classArgs requirements, and an example for each assessor.
Note: Please follow the provided format when writing your `config.yml` file.
<a name="MedianStop"></a>
@@ -25,12 +25,12 @@ Note: Please follow the format when you write your `config.yml` file.
**Suggested scenario**
It's applicable to a wide range of performance curves; thus, it can be used in various scenarios to speed up the tuning progress. [Detailed Description](./MedianstopAssessor.md)
**classArgs requirements:**
* **optimize_mode** (*maximize or minimize, optional, default = maximize*) - If 'maximize', assessor will **stop** the trial with smaller expectation. If 'minimize', assessor will **stop** the trial with larger expectation.
* **start_step** (*int, optional, default = 0*) - A trial is determined to be stopped or not only after receiving start_step number of reported intermediate results.
**Usage example:**
@@ -53,15 +53,15 @@ assessor:
**Suggested scenario**
It's applicable to a wide range of performance curves; thus, it can be used in various scenarios to speed up the tuning progress. Even better, it's able to handle and assess curves with similar performance. [Detailed Description](./CurvefittingAssessor.md)
**classArgs requirements:**
* **epoch_num** (*int, **required***) - The total number of epochs. We need to know the number of epochs to determine which points we need to predict.
* **optimize_mode** (*maximize or minimize, optional, default = maximize*) - If 'maximize', assessor will **stop** the trial with smaller expectation. If 'minimize', assessor will **stop** the trial with larger expectation.
* **start_step** (*int, optional, default = 6*) - A trial is determined to be stopped or not only after receiving start_step number of reported intermediate results.
* **threshold** (*float, optional, default = 0.95*) - The threshold that we use to decide to early stop the worst performance curve. For example: if threshold = 0.95, optimize_mode = maximize, and the best performance in the history is 0.9, then we will stop the trial whose predicted value is lower than 0.95 * 0.9 = 0.855.
* **gap** (*int, optional, default = 1*) - The gap interval between Assessor judgements. For example: if gap = 2, start_step = 6, then we will assess the result when we get the 6th, 8th, 10th, 12th... intermediate results.
**Usage example:**
...
@@ -2,9 +2,9 @@ Curve Fitting Assessor on NNI
===
## 1. Introduction
The Curve Fitting Assessor is an LPA (learning, predicting, assessing) algorithm. It stops a pending trial X at step S if the prediction of the final epoch's performance is worse than the best final performance in the trial history.
In this algorithm, we use 12 curves to fit the learning curve. The set of parametric curve models is chosen from this [reference paper][1]. The learning curves' shape coincides with our prior knowledge about the form of learning curves: they are typically increasing, saturating functions.
![](../../img/curvefitting_learning_curve.PNG)
@@ -12,21 +12,21 @@ We combine all learning curve models into a single, more powerful model. This co
![](../../img/curvefitting_f_comb.gif)
with the new combined parameter vector
![](../../img/curvefitting_expression_xi.gif)
We assume additive Gaussian noise, with the noise parameter initialized to its maximum likelihood estimate.
We determine the maximum-probability value of the new combined parameter vector by learning from the historical data, use that value to predict future trial performance, and stop inadequate experiments to save computing resources.
Concretely, this algorithm goes through three stages of learning, predicting, and assessing (a simplified sketch follows the list below).
* Step 1: Learning. We learn from the trial history of the current trial and determine \xi from a Bayesian perspective. First, we fit each curve using the least-squares method, implemented by `fit_theta`. After we obtain the parameters, we filter the curves and remove outliers, implemented by `filter_curve`. Finally, we use the MCMC sampling method, implemented by `mcmc_sampling`, to adjust the weight of each curve. At this point, we have determined all the parameters in \xi.
* Step 2: Predicting. We calculate the expected final accuracy, implemented by `f_comb`, at the target position (i.e., the total number of epochs) using \xi and the formula of the combined model.
* Step 3: Assessing. If the fitting result doesn't converge, the predicted value will be `None`; in this case we return `AssessResult.Good` to ask for more accuracy information and predict again later. Otherwise, the `predict()` function returns a positive value: if this value is strictly greater than the best final performance in history * `THRESHOLD` (default value = 0.95), we return `AssessResult.Good`; otherwise, we return `AssessResult.Bad`.
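The decision flow above can be illustrated with a minimal, self-contained sketch. This is not the assessor's actual implementation: it replaces the 12-curve ensemble and MCMC weighting with a toy saturating curve fitted by least squares, purely to show the learn/predict/assess steps.

```python
import numpy as np

def assess_trial(history, best_final, epoch_num, threshold=0.95):
    """Toy learn/predict/assess flow; the real assessor fits 12 curves and MCMC-samples their weights."""
    steps = np.arange(1, len(history) + 1, dtype=float)
    # Step 1 (learning): least-squares fit of a toy saturating curve acc(x) = a - b / x
    design = np.column_stack([np.ones_like(steps), -1.0 / steps])
    (a, b), *_ = np.linalg.lstsq(design, np.asarray(history, dtype=float), rcond=None)
    # Step 2 (predicting): extrapolate the fitted curve to the final epoch
    predicted = a - b / epoch_num
    # Step 3 (assessing): keep the trial only if the prediction beats best_final * threshold
    return "Good" if predicted > best_final * threshold else "Bad"

print(assess_trial([0.50, 0.70, 0.78, 0.82], best_final=0.90, epoch_num=20))
```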
The figure below shows the result of our algorithm on MNIST trial history data, where the green points represent the data obtained by the Assessor, the blue points represent the future but unknown data, and the red line is the curve predicted by the Curve Fitting Assessor.
@@ -60,11 +60,11 @@ assessor:
```
## 3. File Structure
The assessor has a lot of different files, functions, and classes. Here we briefly describe a few of them.
* `curvefunctions.py` includes all the function expressions and default parameters.
* `modelfactory.py` includes learning and predicting; the corresponding calculation part is also implemented here.
* `curvefitting_assessor.py` is the assessor which receives the trial history and assesses whether to early-stop the trial.
## 4. TODO
* Further improve the accuracy of the prediction and test it on more models.
...
@@ -3,4 +3,4 @@ Medianstop Assessor on NNI
## Median Stop
Medianstop is a simple early stopping rule mentioned in this [paper](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/46180.pdf). It stops a pending trial X after step S if the trial’s best objective value by step S is strictly worse than the median value of the running averages of all completed trials’ objectives reported up to step S.
\ No newline at end of file
@@ -13,6 +13,8 @@ Index of supported pruning algorithms
* [Filter Pruners with Activation Rank](#activationrankfilterpruner)
* [APoZ Rank Pruner](#activationapozrankfilterpruner)
* [Activation Mean Rank Pruner](#activationmeanrankfilterpruner)
* [Filter Pruners with Gradient Rank](#gradientrankfilterpruner)
* [Taylor FO On Weight Pruner](#taylorfoweightfilterpruner)
## Level Pruner
@@ -281,7 +283,7 @@ pruner.compress()
- **op_types:** Only Conv1d and Conv2d are supported in L2Filter Pruner
## ActivationRankFilterPruner
ActivationRankFilterPruner is a series of pruners which prune the filters with the smallest importance criterion calculated from the output activations of convolution layers to achieve a preset level of network sparsity.
### ActivationAPoZRankFilterPruner
@@ -341,4 +343,42 @@ You can view example for more information
#### User configuration for ActivationMeanRankFilterPruner
- **sparsity:** The percentage of convolutional filters to be pruned.
- **op_types:** Only Conv2d is supported in ActivationMeanRankFilterPruner.
## GradientRankFilterPruner
GradientRankFilterPruner is a series of pruners which prune the filters with the smallest importance criterion calculated from the gradients of convolution layers to achieve a preset level of network sparsity.
### TaylorFOWeightFilterPruner
We implemented it as a one-shot pruner: it prunes convolutional layers based on the first-order Taylor expansion on weights. The estimated importance of filters is defined as in the paper [Importance Estimation for Neural Network Pruning](http://jankautz.com/publications/Importance4NNPruning_CVPR19.pdf). Other pruning criteria mentioned in this paper will be supported in future releases.
>
![](../../img/importance_estimation_sum.png)
#### Usage
PyTorch code
```python
from nni.compression.torch import TaylorFOWeightFilterPruner
config_list = [{
'sparsity': 0.5,
'op_types': ['Conv2d']
}]
pruner = TaylorFOWeightFilterPruner(model, config_list, optimizer)
pruner.compress()
```
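For intuition, the criterion can be sketched as follows. This is an illustrative, self-contained computation of one common form of the first-order Taylor importance (the squared sum of per-weight weight-times-gradient contributions within each filter); the pruner's exact formula follows the referenced paper, so treat this only as a sketch.

```python
import torch

def taylor_fo_importance(weight, weight_grad):
    # Per-filter score: (sum over the filter's weights of w * dL/dw) ** 2
    contribution = weight * weight_grad          # elementwise first-order contribution
    per_filter = contribution.flatten(1).sum(1)  # aggregate over each output filter
    return per_filter.pow(2)                     # one importance score per filter

# Toy usage: a fake Conv2d weight with 16 output filters and a dummy loss
weight = torch.randn(16, 3, 3, 3, requires_grad=True)
loss = (weight ** 2).sum()
loss.backward()
print(taylor_fo_importance(weight.detach(), weight.grad))  # tensor of 16 scores
```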
You can view the example for more information.
#### User configuration for TaylorFOWeightFilterPruner
- **sparsity:** The percentage of convolutional filters to be pruned.
- **op_types:** Currently only Conv2d is supported in TaylorFOWeightFilterPruner.
\ No newline at end of file
@@ -94,7 +94,7 @@ For advanced usages, e.g., users want to manipulate the way modules in `LayerCho
## Implement a Distributed NAS Tuner
Before learning how to write a distributed NAS tuner, users should first learn how to write a general tuner. Read [Customize Tuner](../Tuner/CustomizeTuner.md) for tutorials.
When users call "[nnictl ss_gen](../Tutorial/Nnictl.md)" to generate a search space file, a search space file like this will be generated:
@@ -129,4 +129,4 @@ This is the exact search space tuners will receive in `update_search_space`. It'
}
```
Send it through `generate_parameters`, and the tuner would look like any HPO tuner. Refer to [SPOS](./SPOS.md) example code for an example.
\ No newline at end of file
@@ -8,11 +8,11 @@
![](../../img/nas_abstract_illustration.png)
Modern Neural Architecture Search (NAS) methods usually incorporate [three dimensions][1]: search space, search strategy, and performance estimation strategy. The search space often contains a limited number of neural network architectures to explore, while the search strategy samples architectures from the search space, gets estimations of their performance, and evolves itself. Ideally, the search strategy should find the best architecture in the search space and report it to users. After users obtain the "best architecture", many methods use a "retrain step", which trains the network with the same pipeline as any traditional model.
## Implement a Search Space
Assuming we've got a baseline model, what should we do to be empowered with NAS? Taking [MNIST on PyTorch](https://github.com/pytorch/examples/blob/master/mnist/main.py) as an example, the code might look like this:
```python
from nni.nas.pytorch import mutables
@@ -37,9 +37,9 @@ class Net(nn.Module):
return output
```
The example above adds an option of choosing conv5x5 at conv1. The modification is as simple as declaring a `LayerChoice` with the original conv3x3 and a new conv5x5 as its parameter. That's it! You don't have to modify the forward function in any way. You can imagine conv1 as any other module without NAS.
So how about the possibilities of connections? This can be done using `InputChoice`. To allow for a skip connection on the MNIST example, we add another layer called conv3. In the following example, a possible connection from conv2 is added to the output of conv3.
```python
from nni.nas.pytorch import mutables
@@ -67,21 +67,21 @@ class Net(nn.Module):
return output
```
Input choice can be thought of as a callable module that receives a list of tensors and outputs the concatenation/sum/mean of some of them (sum by default), or `None` if none is selected. Like layer choices, input choices should be **initialized in `__init__` and called in `forward`**. We will see later that this is to allow search algorithms to identify these choices and do necessary preparations.
`LayerChoice` and `InputChoice` are both **mutables**. Mutable means "changeable". As opposed to traditional deep learning layers/modules, which have fixed operation types once defined, models with mutables are essentially a series of possible models.
Users can specify a **key** for each mutable. By default, NNI will assign one for you that is globally unique, but in case users want to share choices (for example, there are two `LayerChoice`s with the same candidate operations and you want them to have the same choice, i.e., if the first one chooses the i-th op, the second one also chooses the i-th op), they can give them the same key. The key marks the identity of this choice and will be used in the dumped checkpoint. So if you want to increase the readability of your exported architecture, manually assigning keys to each mutable would be a good idea. For advanced usage on mutables, see [Mutables](./NasReference.md). A small sketch of explicit keys and an `InputChoice` skip connection is shown below.
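Below is a minimal sketch of the ideas above. It assumes the `LayerChoice`/`InputChoice` signatures of the NNI 1.x NAS API (the `key`, `n_candidates`, and `n_chosen` arguments) and is illustrative rather than a drop-in replacement for the examples in this document.

```python
import torch.nn as nn
from nni.nas.pytorch import mutables

class TinyNet(nn.Module):
    def __init__(self):
        super().__init__()
        # Explicit keys make the exported architecture JSON easier to read.
        self.conv1 = mutables.LayerChoice([nn.Conv2d(1, 8, 3, padding=1),
                                           nn.Conv2d(1, 8, 5, padding=2)], key="conv1_op")
        self.conv2 = nn.Conv2d(8, 8, 3, padding=1)
        # Optionally feed conv1's output back in as a skip connection (or pick nothing).
        self.skip = mutables.InputChoice(n_candidates=1, n_chosen=1, key="skip")

    def forward(self, x):
        x1 = self.conv1(x)
        x2 = self.conv2(x1)
        skip = self.skip([x1])  # returns a tensor, or None if nothing is chosen
        return x2 + skip if skip is not None else x2
```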
## Use a Search Algorithm
There are at least two ways users can do the search, differing in how the search space is explored and how trials are spawned. One runs NAS distributedly, which can be as naive as enumerating all the architectures and training each one from scratch, or can involve leveraging more advanced techniques, such as [SMASH][8], [ENAS][2], [DARTS][1], [FBNet][3], [ProxylessNAS][4], [SPOS][5], [Single-Path NAS][6], [Understanding One-shot][7] and [GDAS][9]. Since training many different architectures is known to be expensive, another family of methods, called one-shot NAS, builds a supernet containing every candidate in the search space as its subnetwork, and in each step, a subnetwork or combination of several subnetworks is trained.
Currently, several one-shot NAS methods are supported on NNI. For example, `DartsTrainer`, which uses SGD to train architecture weights and model weights iteratively, and `ENASTrainer`, which [uses a controller to train the model][2]. New and more efficient NAS trainers keep emerging in the research community, and some will be implemented in future releases of NNI.
### One-Shot NAS
Each one-shot NAS algorithm implements a trainer, for which users can find usage details in the description of each algorithm. Here is a simple example, demonstrating how users can use `EnasTrainer`.
```python
# this is exactly same as traditional model training
@@ -117,15 +117,15 @@ trainer.train() # training
trainer.export(file="model_dir/final_architecture.json") # export the final architecture to file
```
Users can directly run their training file through `python3 train.py` without `nnictl`. After training, users can export the best of the found models through `trainer.export()`.
Normally, the trainer exposes a few arguments that you can customize, for example, the loss function, the metrics function, the optimizer, and the datasets. These should satisfy most usage needs, and we do our best to make sure our built-in trainers work on as many models, tasks, and datasets as possible. But there is no guarantee. For example, some trainers have the assumption that the task is a classification task; some trainers might have a different definition of "epoch" (e.g., an ENAS epoch = some child steps + some controller steps); most trainers do not have support for distributed training: they won't wrap your model with `DataParallel` or `DistributedDataParallel` to do that. So after a few tryouts, if you want to actually use the trainers on your very customized applications, you might need to [customize your trainer](#extend-the-ability-of-one-shot-trainers).
### Distributed NAS
Neural architecture search was originally executed by running each child model independently as a trial job. We also support this search approach, and it naturally fits within the NNI hyper-parameter tuning framework, where the Tuner generates the child model for the next trial and trials run in the training service.
To use this mode, there is no need to change the search space expressed with the NNI NAS API (i.e., `LayerChoice`, `InputChoice`, `MutableScope`). After the model is initialized, apply the function `get_and_apply_next_architecture` on the model. One-shot NAS trainers are not used in this mode. Here is a simple example:
```python
model = Net()
@@ -137,17 +137,17 @@ acc = test(model) # test the trained model
nni.report_final_result(acc) # report the performance of the chosen architecture
```
The search space should be generated and sent to the Tuner. As with the NNI NAS API, the search space is embedded in the user code. Users can use "[nnictl ss_gen](../Tutorial/Nnictl.md)" to generate the search space file. Then put the path of the generated search space in the field `searchSpacePath` of `config.yml`. The other fields in `config.yml` can be filled in by referring to [this tutorial](../Tutorial/QuickStart.md).
You can use the [NNI tuners](../Tuner/BuiltinTuner.md) to do the search. Currently, only PPO Tuner supports NAS search spaces.
We support a standalone mode for easy debugging, where you can directly run the trial command without launching an NNI experiment. This is for checking whether your trial code can run correctly. The first candidate(s) are chosen for `LayerChoice` and `InputChoice` in this standalone mode.
A complete example can be found [here](https://github.com/microsoft/nni/tree/master/examples/nas/classic_nas/config_nas.yml).
### Retrain with Exported Architecture
After the search phase, it's time to train the found architecture. Unlike many open-source NAS algorithms, which write a whole new model specifically for retraining, we found that the search model and the retrained model are usually very similar, and therefore you can construct your final model with the exact same model code. For example:
```python
model = Net()
@@ -163,9 +163,9 @@ The JSON is simply a mapping from mutable keys to one-hot or multi-hot represent
}
```
After applying, the model is fixed and ready for final training. The model works as a single model, although it might contain more parameters than expected. This comes with pros and cons. The good side is that you can directly load the checkpoint dumped from the supernet during the search phase and start retraining from there. However, this is also a model with redundant parameters, which may cause problems when trying to count the number of parameters in the model. For deeper reasons and possible workarounds, see [Trainers](./NasReference.md).
Also, refer to [DARTS](./DARTS.md) for code exemplifying retraining.
[1]: https://arxiv.org/abs/1808.05377 [1]: https://arxiv.org/abs/1808.05377
[2]: https://arxiv.org/abs/1802.03268 [2]: https://arxiv.org/abs/1802.03268
...@@ -175,4 +175,4 @@ Also refer to [DARTS](./DARTS.md) for example code of retraining. ...@@ -175,4 +175,4 @@ Also refer to [DARTS](./DARTS.md) for example code of retraining.
[6]: https://arxiv.org/abs/1904.02877
[7]: http://proceedings.mlr.press/v80/bender18a
[8]: https://arxiv.org/abs/1708.05344
[9]: https://arxiv.org/abs/1910.04465
# Neural Architecture Search (NAS) on NNI

Automatic neural architecture search is taking an increasingly important role in finding better models. Recent research has proved the feasibility of automatic NAS and has led to models that beat many manually designed and tuned ones. Some representative works are [NASNet][2], [ENAS][1], [DARTS][3], [Network Morphism][4], and [Evolution][5], and new innovations keep emerging.

However, it takes great effort to implement NAS algorithms, and it's hard to reuse the code base of an existing algorithm for a new one. To facilitate NAS innovations (e.g., designing and implementing new NAS models, comparing different NAS models side by side), an easy-to-use and flexible programming interface is crucial.

With this motivation, our ambition is to provide a unified architecture in NNI, to accelerate innovations on NAS, and to apply state-of-the-art algorithms to real-world problems faster.

With the unified interface, there are two different modes for architecture search. [One](#supported-one-shot-nas-algorithms) is the so-called one-shot NAS, where a super-net is built based on a search space and one-shot training is used to generate a good-performing child model. [The other](#supported-distributed-nas-algorithms) is the traditional search-based approach, where each child model within the search space runs as an independent trial. The performance result is sent back to the tuner, and the tuner generates a new child model.
## Supported One-shot NAS Algorithms

NNI currently supports the NAS algorithms listed below and is adding more. Users can reproduce an algorithm or use it on their own dataset. We also encourage users to implement other algorithms with the [NNI API](#use-nni-api), to benefit more people.
|Name|Brief Introduction of Algorithm|
|---|---|
| [ENAS](ENAS.md) | [Efficient Neural Architecture Search via Parameter Sharing](https://arxiv.org/abs/1802.03268). In ENAS, a controller learns to discover neural network architectures by searching for an optimal subgraph within a large computational graph. It uses parameter sharing between child models to achieve fast speed and excellent performance. |
| [DARTS](DARTS.md) | [DARTS: Differentiable Architecture Search](https://arxiv.org/abs/1806.09055) introduces a novel algorithm for differentiable network architecture search based on bilevel optimization. |
| [P-DARTS](PDARTS.md) | [Progressive Differentiable Architecture Search: Bridging the Depth Gap between Search and Evaluation](https://arxiv.org/abs/1904.12760) is based on DARTS. It introduces an efficient algorithm which allows the depth of searched architectures to grow gradually during the training procedure. |
| [SPOS](SPOS.md) | [Single Path One-Shot Neural Architecture Search with Uniform Sampling](https://arxiv.org/abs/1904.00420) constructs a simplified supernet trained with a uniform path sampling method and applies an evolutionary algorithm to efficiently search for the best-performing architectures. |
| [CDARTS](CDARTS.md) | [Cyclic Differentiable Architecture Search](https://arxiv.org/abs/****) builds a cyclic feedback mechanism between the search and evaluation networks. It introduces a cyclic differentiable architecture search framework which integrates the two networks into a unified architecture. |
| [ProxylessNAS](Proxylessnas.md) | [ProxylessNAS: Direct Neural Architecture Search on Target Task and Hardware](https://arxiv.org/abs/1812.00332). |
One-shot algorithms run **standalone without nnictl**. Only the PyTorch version has been implemented. TensorFlow 2.x will be supported in a future release.

Here are some common dependencies needed to run the examples. PyTorch needs to be above 1.2 to use ``BoolTensor``.
## Supported Distributed NAS Algorithms
|Name|Brief Introduction of Algorithm|
|---|---|
| [SPOS's 2nd stage](SPOS.md) | [Single Path One-Shot Neural Architecture Search with Uniform Sampling](https://arxiv.org/abs/1904.00420) constructs a simplified supernet trained with a uniform path sampling method and applies an evolutionary algorithm to efficiently search for the best-performing architectures. |
```eval_rst
.. Note:: SPOS is a two-stage algorithm, whose first stage is one-shot and whose second stage is distributed, leveraging the result of the first stage as a checkpoint.
```
## Use NNI API

The programming interface for designing and searching a model is often needed in two scenarios.

1. When designing a neural network, there may be multiple operation choices on a layer, sub-model, or connection, and it's undetermined which one or which combination performs best. So, there needs to be an easy way to express the candidate layers or sub-models.
2. When applying NAS on a neural network, there needs to be a unified way to express the search space of architectures, so that trial code doesn't have to be updated for different search algorithms.

[Here](./NasGuide.md) is the user guide to get started with using NAS on NNI.

## Reference and Feedback
# NAS Quick Start

The NAS feature provided by NNI has two key components: APIs for expressing the search space and NAS training approaches. The former is for users to easily specify a class of models (i.e., the candidate models specified by the search space) which may perform well. The latter is for users to easily apply state-of-the-art NAS training approaches to their own model.

Here we use a simple example to demonstrate how to tune your model architecture with the NNI NAS APIs step by step. The complete code of this example can be found [here](https://github.com/microsoft/nni/tree/master/examples/nas/naive).

## Write your model with NAS APIs

Instead of writing a concrete neural model, you can write a class of neural models using the two NAS APIs `LayerChoice` and `InputChoice`. For example, if you think either of two operations might work in the first convolution layer, you can choose one of them with `LayerChoice`, as shown by `self.conv1` in the code. Similarly, the second convolution layer `self.conv2` also chooses one from two operations. Up to this point, four candidate neural networks are specified. `self.skipconnect` uses `InputChoice` to specify two choices: adding a skip connection or not.
```python
import torch.nn as nn

class Net(nn.Module):
    def __init__(self):
        # ...
        self.fc3 = nn.Linear(84, 10)
    # ...
```
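Since the snippet above is abridged, here is a minimal, self-contained sketch of what such a model can look like. It assumes the `LayerChoice` and `InputChoice` classes from `nni.nas.pytorch.mutables` (the import path and the behavior of `InputChoice` returning `None` when no input is chosen follow the NNI 1.x examples and may differ in other releases); layer sizes are made up for illustration:

```python
import torch.nn as nn
import torch.nn.functional as F
from nni.nas.pytorch import mutables  # NNI 1.x import path (assumption)

class ToyNet(nn.Module):
    def __init__(self):
        super().__init__()
        # Two candidate operations for the first layer; the NAS algorithm picks one.
        self.conv1 = mutables.LayerChoice([
            nn.Conv2d(3, 16, 3, padding=1),
            nn.Conv2d(3, 16, 5, padding=2),
        ])
        # Two candidate operations for the second layer.
        self.conv2 = mutables.LayerChoice([
            nn.Conv2d(16, 16, 3, padding=1),
            nn.Conv2d(16, 16, 5, padding=2),
        ])
        # Whether to feed a skip connection around conv2, or drop it.
        self.skipconnect = mutables.InputChoice(n_candidates=1)
        self.fc = nn.Linear(16, 10)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        residual = x
        x = F.relu(self.conv2(x))
        skip = self.skipconnect([residual])  # None when "no skip" is chosen
        if skip is not None:
            x = x + skip
        x = F.adaptive_avg_pool2d(x, 1).flatten(1)
        return self.fc(x)
```

With two alternatives in each of the three mutables, this toy class spans eight candidate architectures; a NAS trainer or tuner then searches over them.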
For a detailed description of `LayerChoice` and `InputChoice`, please refer to [the NAS guide](NasGuide.md).

## Choose a NAS trainer

After the model is instantiated, it is time to train it with a NAS trainer. Different trainers use different approaches to search for the best member of the class of neural models that you specified. NNI provides several popular NAS training approaches, such as DARTS and ENAS. Here we use `DartsTrainer` as an example. After the trainer is instantiated, invoke `trainer.train()` to do the search.
```python
trainer = DartsTrainer(net,
                       # ... (the remaining trainer arguments are omitted here)
                       )
trainer.train()
```
## Export the best model

After the search (i.e., `trainer.train()`) is done, we want to get the best-performing model; simply call `trainer.export("final_arch.json")` to export the found neural architecture to a file.

## NAS visualization

We are working on NAS visualization and will release this feature soon.

## Retrain the exported best model

It is simple to retrain the found (exported) neural architecture. Step one, instantiate the model you defined above. Step two, invoke `apply_fixed_architecture` on the model. Then the model becomes the found (exported) one, and you can use traditional model training to train it.
```python
model = Net()
# ...
```
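A rough, self-contained sketch of those two steps might look like the following. The `apply_fixed_architecture` import path follows NNI 1.x, and the dataset, input shape, and optimizer settings are placeholders rather than the values used in the original example:

```python
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from nni.nas.pytorch.fixed import apply_fixed_architecture  # NNI 1.x path (assumption)

model = Net()                                        # step one: same model code as during the search
apply_fixed_architecture(model, "final_arch.json")   # step two: fix the exported choices

# From here on it is ordinary PyTorch training; data and hyperparameters are placeholders
# (adjust the input shape to whatever your Net expects).
data = TensorDataset(torch.randn(256, 3, 32, 32), torch.randint(0, 10, (256,)))
loader = DataLoader(data, batch_size=32, shuffle=True)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

for epoch in range(5):
    for inputs, targets in loader:
        optimizer.zero_grad()
        loss = criterion(model(inputs), targets)
        loss.backward()
        optimizer.step()
```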
# Overview

NNI (Neural Network Intelligence) is a toolkit to help users design and tune machine learning models (e.g., their hyperparameters), neural network architectures, or complex systems' parameters in an efficient and automatic way. NNI has several appealing properties: ease of use, scalability, flexibility, and efficiency.

* **Ease of use**: NNI can be easily installed through python pip. Only a few lines need to be added to your code in order to use NNI's power. You can use both the command-line tool and the WebUI to work with your experiments.
* **Scalability**: Tuning hyperparameters or the neural architecture often demands a large amount of computational resources, while NNI is designed to fully leverage different computation resources, such as remote machines and training platforms (e.g., OpenPAI, Kubernetes). Hundreds of trials can run in parallel, depending on the capacity of your configured training platforms.
* **Flexibility**: Besides rich built-in algorithms, NNI allows users to customize various hyperparameter tuning algorithms, neural architecture search algorithms, early stopping algorithms, etc. Users can also extend NNI with more training platforms, such as virtual machines or Kubernetes services in the cloud. Moreover, NNI can connect to external environments to tune special applications/models on them.
* **Efficiency**: We are intensively working on more efficient model tuning at both the system and the algorithm level. For example, we leverage early feedback to speed up the tuning procedure.
The figure below shows the high-level architecture of NNI.
## Key Concepts

* *Experiment*: One task of, for example, finding the best hyperparameters of a model or finding the best neural network architecture. It consists of trials and AutoML algorithms.
* *Search Space*: The feasible region for tuning the model, for example, the value range of each hyperparameter.
* *Configuration*: An instance of the search space, that is, a specific value for each hyperparameter.
* *Trial*: An individual attempt at applying a new configuration (e.g., a set of hyperparameter values or a specific neural architecture). Trial code should be able to run with the provided configuration.
* *Tuner*: An AutoML algorithm, which generates a new configuration for the next try. A new trial will run with this configuration.
* *Assessor*: Analyzes a trial's intermediate results (e.g., periodically evaluated accuracy on a test dataset) to tell whether the trial can be stopped early or not.
* *Training Platform*: Where trials are executed. Depending on your experiment's configuration, it could be your local machine, remote servers, or a large-scale training platform (e.g., OpenPAI, Kubernetes).
Basically, an experiment runs as follows: the tuner receives the search space and generates configurations. These configurations are submitted to training platforms, such as the local machine, remote machines, or training clusters. Their performance is reported back to the tuner. Then, new configurations are generated and submitted.

For each experiment, the user only needs to define a search space and update a few lines of code, and then leverage NNI's built-in Tuner/Assessor and training platforms to search for the best hyperparameters and/or neural architecture. There are basically 3 steps:

>Step 1: [Define search space](Tutorial/SearchSpaceSpec.md)
<p align="center">
<img src="https://user-images.githubusercontent.com/23273522/51816627-5d13db80-2302-11e9-8f3e-627e260203d5.jpg" alt="drawing"/>
</p>
For more details about how to run an experiment, please refer to [Get Started](Tutorial/QuickStart.md).

## Core Features

NNI provides a key capability to run multiple instances in parallel to find the best combinations of parameters. This feature can be used in various domains, like finding the best hyperparameters for a deep learning model or finding the best configuration for a database or another complex system with real data.

NNI also provides algorithm toolkits for machine learning and deep learning, especially neural architecture search (NAS) algorithms, model compression algorithms, and feature engineering algorithms.

### Hyperparameter Tuning

This is a core and basic feature of NNI; we provide many popular [automatic tuning algorithms](Tuner/BuiltinTuner.md) (i.e., tuners) and [early stop algorithms](Assessor/BuiltinAssessor.md) (i.e., assessors). You can follow the [Quick Start](Tutorial/QuickStart.md) to tune your model (or system): go through the three steps above and then start an NNI experiment.

### General NAS Framework

This NAS framework is for users to easily specify candidate neural architectures; for example, one can specify multiple candidate operations (e.g., separable conv, dilated conv) for a single layer and specify possible skip connections. NNI will find the best candidate automatically. On the other hand, the NAS framework provides a simple interface for another type of user (e.g., NAS algorithm researchers) to implement new NAS algorithms. A detailed description of NAS and its usage can be found [here](NAS/Overview.md).

NNI supports many one-shot NAS algorithms, such as ENAS and DARTS, through the NNI trial SDK. To use these algorithms you do not have to start an NNI experiment. Instead, import an algorithm in your trial code and simply run it. If you want to tune the hyperparameters in the algorithms or want to run multiple instances, you can choose a tuner and start an NNI experiment.

Other than one-shot NAS, NAS can also run in a classic mode where each candidate architecture runs as an independent trial job. In this mode, similar to hyperparameter tuning, users have to start an NNI experiment and choose a tuner for NAS.
### Model Compression

Model compression on NNI includes pruning algorithms and quantization algorithms. These algorithms are provided through the NNI trial SDK. Users can directly use them in their trial code and run that code without starting an NNI experiment. A detailed description of model compression and its usage can be found [here](Compressor/Overview.md).

There are different types of hyperparameters in model compression. One type is the hyperparameters in the input configuration (e.g., sparsity, quantization bits) of a compression algorithm. The other type is the hyperparameters of the compression algorithms themselves. Here, NNI's hyperparameter tuning can help a lot in finding the best compressed model automatically. A simple example can be found [here](Compressor/AutoCompression.md).
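As a sketch of the first type of hyperparameter, the snippet below prunes a model with a target sparsity; it assumes the `LevelPruner` from the NNI 1.x compression SDK (`nni.compression.torch`), and the model and the sparsity value are placeholders:

```python
import torch.nn as nn
from nni.compression.torch import LevelPruner  # NNI 1.x import path (assumption)

model = nn.Sequential(nn.Linear(128, 64), nn.ReLU(), nn.Linear(64, 10))  # placeholder model

# 'sparsity' is exactly the kind of input-configuration hyperparameter
# that NNI's hyperparameter tuning could search over.
config_list = [{'sparsity': 0.8, 'op_types': ['default']}]
pruner = LevelPruner(model, config_list)
model = pruner.compress()
```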
### Automatic Feature Engineering

Automatic feature engineering is for users to find the best features for their tasks. A detailed description of automatic feature engineering and its usage can be found [here](FeatureEngineering/Overview.md). It is supported through the NNI trial SDK, which means you do not have to create an NNI experiment. Instead, simply import a built-in auto-feature-engineering algorithm in your trial code and directly run your trial code.

The auto-feature-engineering algorithms usually have a number of hyperparameters themselves. If you want to automatically tune those hyperparameters, you can leverage the hyperparameter tuning of NNI; that is, choose a tuning algorithm (i.e., a tuner) and start an NNI experiment for it.
# Write a Trial Run on NNI

A **Trial** in NNI is an individual attempt at applying a configuration (e.g., a set of hyper-parameters) to a model.

To define an NNI trial, you need to first define the set of parameters (i.e., the search space) and then update the model. NNI provides two approaches for you to define a trial: [NNI API](#nni-api) and [NNI Python annotation](#nni-annotation). You can also refer to [here](#more-examples) for more trial examples.

<a name="nni-api"></a>

## NNI API
### Step 1 - Prepare a SearchSpace file

An example is shown below:

```json
{
    ...
}
```
Refer to [SearchSpaceSpec.md](../Tutorial/SearchSpaceSpec.md) to learn more about search spaces. The tuner will generate configurations from this search space, that is, it chooses a value for each hyperparameter from the specified range.
### Step 2 - Update model code

- Import NNI

```python
import nni
```

- Get the configuration from the tuner

```python
RECEIVED_PARAMS = nni.get_next_parameter()
```

- Report metric data periodically

```python
nni.report_intermediate_result(metrics)
```
`metrics` can be any Python object. If you use an NNI built-in tuner/assessor, `metrics` can only take two formats: 1) a number, e.g., a float or an int, or 2) a dict object that has a key named `default` whose value is a number. These `metrics` are reported to the [assessor](../Assessor/BuiltinAssessor.md). Usually, `metrics` is a periodically evaluated loss or accuracy.

- Report the performance of the configuration

```python
nni.report_final_result(metrics)
```

`metrics` can also be any Python object. If you use an NNI built-in tuner/assessor, `metrics` follows the same format rule as in `report_intermediate_result`; the number indicates the model's performance, for example, its accuracy or loss. These `metrics` are reported to the [tuner](../Tuner/BuiltinTuner.md).
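Putting Step 2 together, a minimal trial sketch might look like the following; the hyperparameter name and the simulated metric are placeholders for your real training code:

```python
import random
import nni

params = nni.get_next_parameter()          # e.g. {'learning_rate': 0.01}; {} in standalone mode
lr = params.get('learning_rate', 0.01)     # hypothetical key; would be passed to your optimizer

best_acc = 0.0
for epoch in range(5):
    # Stand-in for training and evaluating one epoch with the received hyperparameters.
    acc = min(1.0, 0.5 + 0.1 * epoch + random.uniform(0.0, 0.05))
    best_acc = max(best_acc, acc)
    nni.report_intermediate_result(acc)    # a plain number, or a dict with a 'default' key

nni.report_final_result(best_acc)          # the final metric sent to the tuner
```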
### Step 3 - Enable NNI API

To enable NNI API mode, you need to set useAnnotation to *false* and provide the path of the SearchSpace file you defined in step 1:

```yaml
useAnnotation: false
searchSpacePath: /path/to/your/search_space.json
```

You can refer to [here](../Tutorial/ExperimentConfig.md) for more information about experiment configuration.
<a name="nni-annotation"></a>

## NNI Python Annotation

An alternative to writing a trial is to use NNI's annotation syntax for Python. NNI annotations are simple: they work like comments in your code. You don't have to make structural or any other big changes to your existing code. With a few lines of NNI annotation, you will be able to:

* annotate the variables you want to tune
* specify the range in which you want to tune the variables
* annotate which variable you want to report as an intermediate result to the `assessor`
* annotate which variable you want to report as the final result (e.g., model accuracy) to the `tuner`

Again, taking MNIST as an example, it only requires two steps to write a trial with NNI Annotation.
### Step 1 - Update code with annotations

The following is a TensorFlow code snippet for NNI Annotation, where the highlighted four lines are annotations that:

1. tune batch\_size and dropout\_rate
2. report test\_acc every 100 steps
3. finally report test\_acc as the final result

It's worth noting that, since these newly added lines are merely annotations, they do not change your previous code logic; you can still run your code as usual in environments without NNI installed.
```diff
with tf.Session() as sess:
    ...
```
**NOTE**:

- `@nni.variable` takes effect on the line following it, which should be an assignment statement whose left-hand side must match the `name` given in the `@nni.variable` annotation (see the short snippet below).
- `@nni.report_intermediate_result`/`@nni.report_final_result` will send the data to the assessor/tuner at that line.
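For instance, a minimal sketch of the annotation forms (variable names are illustrative; see the annotation spec for the full syntax):

```python
"""@nni.variable(nni.choice(0.25, 0.5, 0.75), name=dropout_rate)"""
dropout_rate = 0.5        # default used when the code runs without NNI

"""@nni.variable(nni.choice(32, 64, 128), name=batch_size)"""
batch_size = 64

"""@nni.report_intermediate_result(test_acc)"""   # periodic metric for the assessor
"""@nni.report_final_result(test_acc)"""          # final metric for the tuner
```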
For more information about annotation syntax and its usage, please refer to [Annotation](../Tutorial/AnnotationSpec.md).

### Step 2 - Enable NNI Annotation

In the YAML configuration file, you need to set *useAnnotation* to true to enable NNI Annotation:

```yaml
useAnnotation: true
```
## Standalone mode for debugging

NNI supports a standalone mode in which trial code can run without starting an NNI experiment. This makes it more convenient to find bugs in trial code. NNI Annotation natively supports standalone mode, since the added NNI-related lines are comments. The NNI trial APIs behave differently in standalone mode: some APIs return dummy values, and some APIs do not really report values. Please refer to the following list for these APIs.
```python
# NOTE: please assign default values to the hyperparameters in your trial code
nni.get_next_parameter # return {}
# ...
nni.get_trial_id # return "STANDALONE"
nni.get_sequence_id # return 0
```
You can try standalone mode with the [mnist example](https://github.com/microsoft/nni/tree/master/examples/trials/mnist-tfv1). Simply run `python3 mnist.py` under the code directory. The trial code should successfully run with the default hyperparameter values.

For more information on debugging, please refer to [How to Debug](../Tutorial/HowToDebug.md).
## Where are my trials?

### Local Mode

In NNI, every trial has a dedicated directory to which it outputs its own data. In each trial, an environment variable called `NNI_OUTPUT_DIR` is exported. Under this directory, you can find each trial's code, data, and other logs. In addition, each trial's log (including stdout) is redirected to a file named `trial.log` under that directory.

If NNI Annotation is used, the trial's converted code is in another temporary directory. You can check this in a file named `run.sh` under the directory indicated by `NNI_OUTPUT_DIR`. The second line (i.e., the `cd` command) of this file changes the working directory to the actual directory where the code is located. Below is an example of `run.sh`:
```bash
#!/bin/bash
# ...
echo $? `date +%s%3N` >/home/user_name/nni/experiments/$experiment_id$/trials/$t...
```
### Other Modes

When running trials on other platforms, such as remote machines or PAI, the environment variable `NNI_OUTPUT_DIR` only refers to the output directory of the trial; the trial code and `run.sh` might not be there. However, `trial.log` will be transmitted back to the local machine in the trial's directory, which defaults to `~/nni/experiments/$experiment_id$/trials/$trial_id$/`.

For more information, please refer to [HowToDebug](../Tutorial/HowToDebug.md).

<a name="more-examples"></a>

## More Trial Examples
Batch Tuner on NNI
===

## Batch Tuner

Batch tuner allows users to simply provide several configurations (i.e., choices of hyper-parameters) for their trial code. After finishing all the configurations, the experiment is done. Batch tuner only supports the type `choice` in the [search space spec](../Tutorial/SearchSpaceSpec.md).

Suggested scenario: If the configurations you want to try have already been decided, you can list them in the SearchSpace file (using `choice`) and run them with the batch tuner.
BOHB Advisor on NNI
===

## 1. Introduction

BOHB is a robust and efficient hyperparameter tuning algorithm proposed in [this reference paper](https://arxiv.org/abs/1807.01774). BO is an abbreviation for "Bayesian Optimization" and HB is an abbreviation for "Hyperband".

BOHB relies on HB (Hyperband) to determine how many configurations to evaluate with which budget, but it **replaces the random selection of configurations at the beginning of each HB iteration by a model-based search (Bayesian Optimization)**. Once the desired number of configurations for the iteration is reached, the standard successive halving procedure is carried out using these configurations. We keep track of the performance of all function evaluations g(x, b) of configurations x on all budgets b to use as a basis for our models in later iterations.

Below we divide the introduction of the BOHB process into two parts:
### HB (Hyperband)

We follow Hyperband's way of choosing the budgets and continue to use SuccessiveHalving. For more details, you can refer to the [Hyperband in NNI](HyperbandAdvisor.md) page and the [reference paper for Hyperband](https://arxiv.org/abs/1603.06560). This procedure is summarized by the pseudocode below.

![](../../img/bohb_1.png)
### BO (Bayesian Optimization)

The BO part of BOHB closely resembles TPE, with one major difference: we opted for a single multidimensional KDE, compared to the hierarchy of one-dimensional KDEs used in TPE, in order to better handle interaction effects in the input space.

Tree Parzen Estimator (TPE): uses a KDE (kernel density estimator) to model the densities.

![](../../img/bohb_2.png)

To fit useful KDEs, we require a minimum number of data points Nmin; this is set to d + 1 for our experiments, where d is the number of hyperparameters. To build a model as early as possible, we do not wait until Nb = |Db|, the number of observations for budget b, is large enough to satisfy q · Nb ≥ Nmin. Instead, after initializing with Nmin + 2 random configurations, we choose the

![](../../img/bohb_3.png)

best and worst configurations, respectively, to model the two densities.

Note that we also sample a constant fraction, named **random fraction**, of the configurations uniformly at random.
## 2. Workflow

![](../../img/bohb_6.jpg)

This image shows the workflow of BOHB. Here we set max_budget = 9, min_budget = 1, eta = 3, and the others to their defaults. In this case, s_max = 2, so we continuously run the {s=2, s=1, s=0, s=2, s=1, s=0, ...} cycle. In each stage of SuccessiveHalving (the orange box), we pick the top 1/eta configurations and run them again with more budget, repeating the SuccessiveHalving stages until the end of the iteration. At the same time, we collect the configurations, budgets, and final metrics of each trial and use these to build a multidimensional KDE model keyed by budget.
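To make the bracket cycle above concrete, here is a rough sketch (not NNI's exact code) of how the SuccessiveHalving schedule for max_budget = 9, min_budget = 1, eta = 3 can be laid out:

```python
import math

def hyperband_brackets(min_budget=1, max_budget=9, eta=3):
    """Print (number of configurations, budget per configuration) for each SuccessiveHalving round."""
    s_max = 0
    while min_budget * eta ** (s_max + 1) <= max_budget:
        s_max += 1                                           # here: s_max = 2
    for s in range(s_max, -1, -1):                           # brackets s = 2, 1, 0 (then the cycle repeats)
        n = math.ceil((s_max + 1) * eta ** s / (s + 1))      # configurations sampled for this bracket
        r = min_budget * eta ** (s_max - s)                  # budget of the first round
        rounds = [(n // eta ** i, r * eta ** i) for i in range(s + 1)]
        print(f"bracket s={s}: {rounds}")

hyperband_brackets()
# bracket s=2: [(9, 1), (3, 3), (1, 9)]
# bracket s=1: [(5, 3), (1, 9)]
# bracket s=0: [(3, 9)]
```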
The multidimensional KDE is used to guide the selection of configurations for the next iteration.

The sampling procedure (using the multidimensional KDE to guide selection) is summarized by the pseudocode below.

![](../../img/bohb_4.png)
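In plain Python, the idea of that sampling step might be sketched as follows for a single one-dimensional hyperparameter. This is only an illustration of the l(x)/g(x) criterion (the `top_n_percent`, `num_samples`, and `bandwidth_factor` arguments mirror the parameters described in the Usage section), not NNI's multidimensional, budget-aware implementation:

```python
import numpy as np
from scipy.stats import gaussian_kde

def propose_next(observations, losses, top_n_percent=15, num_samples=64, bandwidth_factor=3.0):
    """Toy 1-D sketch: split observations into good/bad, fit KDEs l and g,
    sample candidates from a widened l, and return the one maximizing l(x)/g(x)."""
    order = np.argsort(losses)                               # smaller loss = better
    n_good = max(2, int(len(order) * top_n_percent / 100))   # need at least 2 points per KDE
    good, bad = observations[order[:n_good]], observations[order[n_good:]]
    l, g = gaussian_kde(good), gaussian_kde(bad)
    widened = gaussian_kde(good, bw_method=l.factor * bandwidth_factor)
    candidates = widened.resample(num_samples).ravel()
    scores = l(candidates) / np.maximum(g(candidates), 1e-32)
    return candidates[np.argmax(scores)]

# Example: past observations of one hyperparameter and their losses.
rng = np.random.default_rng(0)
obs = rng.uniform(0, 1, 30)
losses = (obs - 0.7) ** 2 + rng.normal(0, 0.01, 30)
print(propose_next(obs, losses))    # should land near 0.7
```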
## 3. Usage

The BOHB advisor requires the [ConfigSpace](https://github.com/automl/ConfigSpace) package; ConfigSpace can be installed with the following command before first use.

```bash
nnictl package install --name=BOHB
```

To use the BOHB advisor, add the following spec to your experiment's YAML configuration file:

```yaml
advisor:
  # ...
    min_bandwidth: 0.001
```
**classArgs Requirements:**

* **optimize_mode** (*maximize or minimize, optional, default = maximize*) - If 'maximize', the tuner will try to maximize metrics. If 'minimize', the tuner will try to minimize metrics.
* **min_budget** (*int, optional, default = 1*) - The smallest budget to assign to a trial job (the budget can be the number of mini-batches or epochs). Must be positive.
* **max_budget** (*int, optional, default = 3*) - The largest budget to assign to a trial job (the budget can be the number of mini-batches or epochs). Must be larger than min_budget.
* **eta** (*int, optional, default = 3*) - In each iteration, a complete run of sequential halving is executed. In it, after evaluating each configuration on the same subset size, only a fraction of 1/eta of them 'advances' to the next round. Must be greater than or equal to 2.
* **min_points_in_model** (*int, optional, default = None*) - Number of observations needed to start building a KDE. The default 'None' means dim+1; when the number of completed trials for a budget is equal to or larger than `max{dim+1, min_points_in_model}`, BOHB will start to build a KDE model for this budget and then use that KDE model to guide configuration selection. Must be positive. (dim is the number of hyperparameters in the search space.)
* **top_n_percent** (*int, optional, default = 15*) - Percentage (between 1 and 99) of the observations which are considered good. Good points and bad points are used for building the KDE models. For example, if you have 100 observed trials and top_n_percent is 15, then the top 15% of points will be used for building the good-point model "l(x)", and the remaining 85% of points will be used for building the bad-point model "g(x)".
* **num_samples** (*int, optional, default = 64*) - Number of samples used to optimize EI. We sample "num_samples" points, compare the values of l(x)/g(x), and return the point with the maximum l(x)/g(x) value as the next configuration if optimize_mode is `maximize`; otherwise, we return the one with the smallest value.
* **random_fraction** (*float, optional, default = 0.33*) - Fraction of purely random configurations that are sampled from the prior without the model.
* **bandwidth_factor** (*float, optional, default = 3.0*) - To encourage diversity, the points proposed to optimize EI are sampled from a 'widened' KDE where the bandwidth is multiplied by this factor. We suggest using the default value if you are not familiar with KDEs.
* **min_bandwidth** (*float, optional, default = 0.001*) - To keep diversity, even when all (good) samples have the same value for one of the parameters, a minimum bandwidth (default: 1e-3) is used instead of zero. We suggest using the default value if you are not familiar with KDEs.

*Please note that the float type currently only supports decimal representations. You have to use 0.333 instead of 1/3 and 0.001 instead of 1e-3.*
## 4. File Structure

The advisor consists of many different files, functions, and classes. Here, we only give a brief introduction to the most important files:

* `bohb_advisor.py` Definition of BOHB; handles interaction with the dispatcher, including generating new trials and processing results. Also includes the implementation of the HB (Hyperband) part.
* `config_generator.py` Includes the implementation of the BO (Bayesian Optimization) part. The function *get_config* generates new configurations based on BO; the function *new_result* updates the model with a new result.
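As a rough illustration of how these two parts fit together, the loop below is a heavily simplified assumption rather than the actual advisor code; only the *get_config* and *new_result* calls correspond to the functions described above.

```python
# Heavily simplified sketch of one Hyperband bracket driven by the config generator.
# `cg` stands in for the Bayesian-optimization part (config_generator), and
# `train_and_evaluate(config, budget)` stands in for running one trial.
def run_bracket(cg, budgets, n_configs, train_and_evaluate):
    configs = [cg.get_config(budget=budgets[0]) for _ in range(n_configs)]
    for budget in budgets:                                          # successive-halving rungs
        results = []
        for config in configs:
            loss = train_and_evaluate(config, budget)
            cg.new_result(loss=loss, budget=budget, config=config)  # update the KDE models
            results.append((loss, config))
        results.sort(key=lambda item: item[0])                      # smaller loss is better
        configs = [config for _, config in results[: max(1, len(results) // 3)]]
    return configs[0]                                               # best surviving configuration
```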
## 5. Experiment
...@@ -94,8 +94,8 @@ The advisor has a lot of different files, functions and classes. Here we will on
code implementation: [examples/trials/mnist-advisor](https://github.com/Microsoft/nni/tree/master/examples/trials/)

We chose BOHB to build a CNN on the MNIST dataset. The following are our final experimental results:

![](../../img/bohb_5.png)

More experimental results can be found in the [reference paper](https://arxiv.org/abs/1807.01774). We can see that BOHB makes good use of previous results and achieves a balanced trade-off between exploration and exploitation.
...@@ -3,4 +3,4 @@ Naive Evolution Tuners on NNI

## Naive Evolution
Naive Evolution comes from [Large-Scale Evolution of Image Classifiers](https://arxiv.org/pdf/1703.01041.pdf). It randomly initializes a population based on the search space. For each generation, it chooses the better ones and performs some mutation (e.g., changing a hyperparameter, adding/removing one layer) on them to get the next generation. Naive Evolution requires many trials to work, but it is very simple and easy to extend with new features. A minimal sketch of the idea is shown below.
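The following toy loop is purely illustrative: the function names, the categorical-only search space, and the keep-the-better-half selection rule are assumptions made for the example, not the tuner's actual code.

```python
import random

def naive_evolution(search_space, evaluate, population_size=20, generations=10):
    """Toy evolution loop; `search_space` maps each hyperparameter name to a list of choices."""

    def random_individual():
        return {name: random.choice(values) for name, values in search_space.items()}

    def mutate(individual):
        child = dict(individual)
        name = random.choice(list(search_space))          # change one hyperparameter at random
        child[name] = random.choice(search_space[name])
        return child

    population = [random_individual() for _ in range(population_size)]
    for _ in range(generations):
        ranked = sorted(population, key=evaluate, reverse=True)    # higher score = better
        parents = ranked[: max(1, population_size // 2)]           # keep the better half
        children = [mutate(random.choice(parents)) for _ in range(population_size - len(parents))]
        population = parents + children
    return max(population, key=evaluate)
```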
...@@ -3,10 +3,10 @@ GP Tuner on NNI

## GP Tuner
Bayesian optimization works by constructing a posterior distribution of functions (a Gaussian Process) that best describes the function you want to optimize. As the number of observations grows, the posterior distribution improves, and the algorithm becomes more certain of which regions in parameter space are worth exploring and which are not.
GP Tuner is designed to minimize the number of steps required to find a combination of parameters close to the optimal combination. To do so, this method uses a proxy optimization problem (finding the maximum of the acquisition function) that, albeit still a hard problem, is cheaper (in the computational sense) to solve, and common tools can be employed. Therefore, Bayesian Optimization is suggested for situations where sampling the function to be optimized is very expensive.
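The sketch below illustrates the proxy-optimization idea with an expected-improvement acquisition maximized over random candidates. It uses scikit-learn's `GaussianProcessRegressor` purely for illustration; it is not the GP Tuner's actual implementation, and the function name and candidate-sampling strategy are assumptions for the example.

```python
import numpy as np
from scipy.stats import norm
from sklearn.gaussian_process import GaussianProcessRegressor

def propose_next(X, y, bounds, n_candidates=1000, rng=np.random):
    """Fit a GP to past observations (X, y) and return the candidate maximizing expected improvement.

    `bounds` is a (d, 2) array of per-dimension lower/upper limits.
    """
    X, y = np.asarray(X, dtype=float), np.asarray(y, dtype=float)
    gp = GaussianProcessRegressor(normalize_y=True).fit(X, y)

    # The proxy problem: optimize the acquisition function over cheap random candidates
    # instead of evaluating the expensive objective itself.
    candidates = rng.uniform(bounds[:, 0], bounds[:, 1], size=(n_candidates, bounds.shape[0]))
    mu, sigma = gp.predict(candidates, return_std=True)

    best = y.min()                                    # assume we are minimizing the objective
    improvement = best - mu
    z = improvement / np.maximum(sigma, 1e-9)
    ei = improvement * norm.cdf(z) + sigma * norm.pdf(z)   # expected improvement
    return candidates[np.argmax(ei)]
```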
Note that the only acceptable types within the search space are `randint`, `uniform`, `quniform`, `loguniform`, `qloguniform`, and numerical `choice`.
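For illustration, a search space along these lines uses only the accepted types; the parameter names and ranges below are hypothetical and written as a Python dict, whereas NNI normally reads the search space from a JSON file.

```python
# Hypothetical search space restricted to the types GP Tuner accepts.
search_space = {
    "optimizer_steps": {"_type": "randint",    "_value": [10, 100]},
    "dropout_rate":    {"_type": "uniform",    "_value": [0.1, 0.5]},
    "batch_size":      {"_type": "quniform",   "_value": [16, 128, 16]},
    "learning_rate":   {"_type": "loguniform", "_value": [1e-4, 1e-1]},
    "hidden_size":     {"_type": "choice",     "_value": [64, 128, 256]},  # numerical choice only
}
```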
This optimization approach is described in Section 3 of [Algorithms for Hyper-Parameter Optimization](https://papers.nips.cc/paper/4443-algorithms-for-hyper-parameter-optimization.pdf).