Unverified commit 36e6e350, authored by SparkSnail, committed by GitHub

Merge pull request #221 from microsoft/master

merge master
parents 543239c6 7cbde508
@@ -17,3 +17,4 @@
 Network Morphism<Tuner/NetworkmorphismTuner>
 Hyperband<Tuner/HyperbandAdvisor>
 BOHB<Tuner/BohbAdvisor>
+PPO Tuner <Tuner/PPOTuner>
@@ -28,7 +28,7 @@ author = 'Microsoft'
 # The short X.Y version
 version = ''
 # The full version, including alpha/beta/rc tags
-release = 'v1.1'
+release = 'v1.2'
 # -- General configuration ---------------------------------------------------
...
@@ -10,3 +10,4 @@
 Scikit-learn<./TrialExample/SklearnExamples>
 EvolutionSQuAD<./TrialExample/SquadEvolutionExamples>
 GBDT<./TrialExample/GbdtExample>
+RocksDB <./TrialExample/RocksdbExamples>
-#################
 Feature Engineering
-#################
+===================
 We are glad to announce the alpha release of NNI's feature engineering package.
 It is still in an experimental phase and will evolve based on user feedback.
...
-#################
+##############
 NAS Algorithms
-#################
+##############
 Automated neural architecture search (NAS) plays an increasingly important role in finding better models.
 Recent research has demonstrated the feasibility of automated NAS and discovered models that outperform manually designed and tuned ones.
@@ -20,6 +20,6 @@ NAS Algorithms
 Overview <NAS/Overview>
 NAS Interface <NAS/NasInterface>
-ENAS <NAS/Overview>
-DARTS <NAS/Overview>
+ENAS <NAS/ENAS>
+DARTS <NAS/DARTS>
 P-DARTS <NAS/Overview>
@@ -10,3 +10,4 @@
 Configuration<Tutorial/ExperimentConfig>
 Search Space<Tutorial/SearchSpaceSpec>
 Implement a Training Service<TrainingService/HowToImplementTrainingService>
+Framework Library <SupportedFramework_Library>
@@ -2,8 +2,9 @@ Introduction to Training Services Supported by NNI
 =====================================
 .. toctree::
+    Overview <./TrainingService/SupportTrainingService>
     Local<./TrainingService/LocalMode>
     Remote<./TrainingService/RemoteMachineMode>
     OpenPAI<./TrainingService/PaiMode>
     Kubeflow<./TrainingService/KubeflowMode>
     FrameworkController<./TrainingService/FrameworkControllerMode>
\ No newline at end of file
@@ -116,10 +116,6 @@ trial:
     memoryMB: 32869
     #The docker image to run NNI job on OpenPAI
     image: msranni/nni:latest
-    #The hdfs directory to store data on OpenPAI, format 'hdfs://host:port/directory'
-    dataDir: hdfs://10.10.10.10:9000/username/nni
-    #The hdfs directory to store output data generated by NNI, format 'hdfs://host:port/directory'
-    outputDir: hdfs://10.10.10.10:9000/username/nni
 paiConfig:
     #The username to login OpenPAI
     userName: username
@@ -129,7 +125,7 @@ paiConfig:
     host: 10.10.10.10
 ```
-Please change the default value to your personal account and machine information. Including `nniManagerIp`, `dataDir`, `outputDir`, `userName`, `passWord` and `host`.
+Please change the default values to your personal account and machine information, including `nniManagerIp`, `userName`, `passWord` and `host`.
 In the "trial" part, if you want to use a GPU to perform the architecture search, change `gpuNum` from `0` to `1`. You may also need to increase `maxTrialNum` and `maxExecDuration`, depending on how long you are willing to wait for the search result.
...
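Collecting the edits above, the OpenPAI-related part of the experiment config reduces to the fields shown in the diff; a minimal sketch (every value is a placeholder to replace with your own account and server information):

```yaml
trial:
  gpuNum: 1                  # 1 to run the search on a GPU, 0 for CPU only
  cpuNum: 1
  memoryMB: 32869
  image: msranni/nni:latest  # Docker image for NNI jobs on OpenPAI
paiConfig:
  userName: username         # placeholder: your OpenPAI account
  passWord: password         # placeholder: your OpenPAI password
  host: 10.10.10.10          # placeholder: OpenPAI RESTful server address
```

Note that `dataDir` and `outputDir` are no longer part of the `trial` section after this change.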
@@ -113,22 +113,18 @@
     gpuNum: 0
     cpuNum: 1
     memoryMB: 32869
     # The Docker image to run NNI jobs on OpenPAI
     image: msranni/nni:latest
-    # The HDFS directory on OpenPAI for storing data, format 'hdfs://host:port/directory'
-    dataDir: hdfs://10.10.10.10:9000/username/nni
-    # The HDFS directory on OpenPAI for storing output, format 'hdfs://host:port/directory'
-    outputDir: hdfs://10.10.10.10:9000/username/nni
 paiConfig:
     # The username to log in to OpenPAI
     userName: username
     # The password to log in to OpenPAI
     passWord: password
     # The RESTful server address of OpenPAI
     host: 10.10.10.10
-Change the default values to your personal account and server information, including `nniManagerIp`, `dataDir`, `outputDir`, `userName`, `passWord` and `host`.
+Change the default values to your personal account and server information, including `nniManagerIp`, `userName`, `passWord` and `host`.
 In the "trial" part, if you want to use a GPU for the architecture search, change `gpuNum` from `0` to `1`. Increase `maxTrialNum` and `maxExecDuration` according to the expected training time.
...
@@ -74,6 +74,8 @@ class IpcInterface {
         this.readBuffer = Buffer.alloc(0);
         this.incomingStream.on('data', (data: Buffer) => { this.receive(data); });
+        this.incomingStream.on('error', (error: Error) => { this.eventEmitter.emit('error', error); });
+        this.outgoingStream.on('error', (error: Error) => { this.eventEmitter.emit('error', error); });
     }
     /**
@@ -106,6 +108,10 @@ class IpcInterface {
         this.eventEmitter.on('command', listener);
     }
+    public onError(listener: (error: Error) => void): void {
+        this.eventEmitter.on('error', listener);
+    }
     /**
      * Deal with incoming data from process
      * Invoke listeners for each complete command received, save incomplete command to buffer
...
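The pattern this hunk adds — re-emitting both streams' `'error'` events on one shared `EventEmitter`, so a single `onError` subscription covers both pipes — can be sketched in isolation. The class and variable names below are stand-ins for illustration, not NNI's actual implementation:

```typescript
import { EventEmitter } from 'events';
import { PassThrough } from 'stream';

// Stand-in for IpcInterface's error wiring: both the incoming and the
// outgoing stream forward their 'error' events to one shared emitter.
class Channel {
    private readonly events: EventEmitter = new EventEmitter();

    constructor(incoming: PassThrough, outgoing: PassThrough) {
        incoming.on('error', (error: Error) => { this.events.emit('error', error); });
        outgoing.on('error', (error: Error) => { this.events.emit('error', error); });
    }

    public onError(listener: (error: Error) => void): void {
        this.events.on('error', listener);
    }
}

const incoming = new PassThrough();
const outgoing = new PassThrough();
const channel = new Channel(incoming, outgoing);

const seen: string[] = [];
channel.onError((error: Error) => { seen.push(error.message); });

// An error on either stream now reaches the single subscriber.
incoming.emit('error', new Error('read failed'));
outgoing.emit('error', new Error('write failed'));
```

Attaching the `'error'` listeners also prevents Node from crashing the process on an unhandled stream error, which is exactly what the hunk below relies on.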
@@ -652,6 +652,10 @@ class NNIManager implements Manager {
             this.criticalError(NNIError.FromError(err, 'Tuner command event error: '));
         });
     });
+    this.dispatcher.onError((error: Error) => {
+        this.log.error(`Dispatcher error: ${error.message}`);
+        this.criticalError(new Error('Dispatcher stream error, tuner may have crashed.'));
+    });
 }
 private sendInitTunerCommands(): void {
...
@@ -63,7 +63,8 @@
     "lodash.merge": "^4.6.2",
     "node.extend": "^1.1.7",
     "hoek": "^4.2.1",
-    "js-yaml": "^3.13.1"
+    "js-yaml": "^3.13.1",
+    "npm": "^6.13.4"
 },
 "engines": {
     "node": ">=10.0.0"
...
@@ -442,7 +442,7 @@ class RemoteMachineTrainingService implements TrainingService {
     async (tick: number) => {
         const cmdresult: RemoteCommandResult = await SSHClientUtility.remoteExeCommand(
             `tail -n 1 ${unixPathJoin(remoteGpuScriptCollectorDir, 'gpu_metrics')}`, conn);
-        if (cmdresult !== undefined && cmdresult.stdout !== undefined) {
+        if (cmdresult !== undefined && cmdresult.stdout !== undefined && cmdresult.stdout.length > 0) {
             rmMeta.gpuSummary = <GPUSummary>JSON.parse(cmdresult.stdout);
             if (rmMeta.gpuSummary.gpuCount === 0) {
                 this.log.warning(`No GPU found on remote machine ${rmMeta.ip}`);
...
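The extra `length > 0` check matters because `JSON.parse('')` throws a `SyntaxError`, and `tail` legitimately produces empty output while the metrics file has not been written yet. A stand-alone sketch of the guard (the helper name and interface are hypothetical, not NNI's API):

```typescript
// Hypothetical helper mirroring the guarded parse above: return
// undefined instead of throwing when the command produced no output.
interface GpuSummary {
    gpuCount: number;
}

function parseGpuSummary(stdout: string | undefined): GpuSummary | undefined {
    if (stdout === undefined || stdout.length === 0) {
        return undefined; // metrics file empty or not yet written
    }
    return <GpuSummary>JSON.parse(stdout);
}

const ok = parseGpuSummary('{"gpuCount": 2}');
const empty = parseGpuSummary('');
```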
@@ -101,10 +101,9 @@ export namespace SSHClientUtility {
  * @param sshClient SSH client
  */
 export async function copyDirectoryToRemote(localDirectory: string, remoteDirectory: string, sshClient: Client, remoteOS: string): Promise<void> {
-    const deferred: Deferred<void> = new Deferred<void>();
-    const tmpTarName: string = `${uniqueString(10)}.tar.gz`;
-    const localTarPath: string = path.join(os.tmpdir(), tmpTarName);
-    const remoteTarPath: string = unixPathJoin(getRemoteTmpDir(remoteOS), tmpTarName);
+    const tmpSuffix: string = uniqueString(5);
+    const localTarPath: string = path.join(os.tmpdir(), `nni_tmp_local_${tmpSuffix}.tar.gz`);
+    const remoteTarPath: string = unixPathJoin(getRemoteTmpDir(remoteOS), `nni_tmp_remote_${tmpSuffix}.tar.gz`);
     // Compress files in local directory to experiment root directory
     await tarAdd(localTarPath, localDirectory);
@@ -114,9 +113,6 @@ export namespace SSHClientUtility {
     // Decompress the remote compressed file and delete it
     await remoteExeCommand(`tar -oxzf ${remoteTarPath} -C ${remoteDirectory}`, sshClient);
     await remoteExeCommand(`rm ${remoteTarPath}`, sshClient);
-    deferred.resolve();
-    return deferred.promise;
 }
 export function getRemoteFileContent(filePath: string, sshClient: Client): Promise<string> {
...
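The removed `Deferred` plumbing is redundant inside an `async` function: the promise an `async` function returns already resolves when its body finishes, so there is nothing to `resolve()` by hand. A minimal before/after sketch under illustrative names (not NNI's actual helpers):

```typescript
// Before: a "deferred"-style promise constructed and resolved by hand.
function copyOld(step: () => Promise<void>): Promise<void> {
    return new Promise<void>((resolve) => {
        step().then(() => { resolve(); });
    });
}

// After: `async`/`await` yields the same behaviour with no manual
// resolve/return — the returned promise settles when the body completes.
async function copyNew(step: () => Promise<void>): Promise<void> {
    await step();
}

let calls: number = 0;
const step = async (): Promise<void> => { calls += 1; };

const p: Promise<void> = copyNew(step);
const pOld: Promise<void> = copyOld(step);
```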
@@ -33,8 +33,8 @@ from torch.utils.data import DataLoader, Dataset
 # pylint: disable=E0611
 from torch.utils.data.dataloader import _DataLoaderIter, _utils
-import nni.feature_engineering.gradient_selector.constants as constants
-import nni.feature_engineering.gradient_selector.syssettings as syssettings
+from . import constants
+from . import syssettings
 torch.set_default_tensor_type(syssettings.torch.tensortype)
 sparsetensor = syssettings.torch.sparse.tensortype
...
@@ -26,10 +26,10 @@ import torch
 from sklearn.feature_selection import SelectKBest, \
     f_classif, mutual_info_classif, f_regression, mutual_info_regression
-import nni.feature_engineering.gradient_selector.constants as constants
-import nni.feature_engineering.gradient_selector.syssettings as syssettings
-from nni.feature_engineering.gradient_selector.learnability import Solver
-from nni.feature_engineering.gradient_selector.utils import EMA
+from . import constants
+from . import syssettings
+from .learnability import Solver
+from .utils import EMA
 torch.set_default_tensor_type(syssettings.torch.tensortype)
...
@@ -30,9 +30,9 @@ from sklearn.utils.validation import check_is_fitted
 import torch
 from nni.feature_engineering.feature_selector import FeatureSelector
-import nni.feature_engineering.gradient_selector.constants as constants
-from nni.feature_engineering.gradient_selector.fginitialize import PrepareData
-from nni.feature_engineering.gradient_selector.fgtrain import _train
+from . import constants
+from .fginitialize import PrepareData
+from .fgtrain import _train
 class FeatureGradientSelector(FeatureSelector, BaseEstimator, SelectorMixin):
...
@@ -25,9 +25,9 @@ import scipy.special
 import torch
 import torch.nn as nn
-import nni.feature_engineering.gradient_selector.constants as constants
-import nni.feature_engineering.gradient_selector.syssettings as syssettings
-from nni.feature_engineering.gradient_selector.fginitialize import ChunkDataLoader
+from . import constants
+from . import syssettings
+from .fginitialize import ChunkDataLoader
 torch.set_default_tensor_type(syssettings.torch.tensortype)
 sparsetensor = syssettings.torch.sparse.tensortype
...