"vscode:/vscode.git/clone" did not exist on "70368cec0a993694bde8ffa98d894b8bcb60d41c"
Unverified commit d628942b authored by SparkSnail, committed by GitHub

Merge pull request #2492 from microsoft/v1.6

merge V1.6 back
parents a4bbb796 7d5feeb9
......@@ -60,6 +60,7 @@ NNI_YARN ?= PATH=$(BIN_FOLDER):$${PATH} $(NNI_YARN_FOLDER)/bin/yarn
## Version number
NNI_VERSION_VALUE = $(shell git describe --tags)
NNI_VERSION_VALUE := $(NNI_VERSION_VALUE:v%=%)
NNI_VERSION_TEMPLATE = 999.0.0-developing
# Main targets
......
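Both Makefile hunks derive the package version from `git describe` and strip the leading `v` with a substitution reference (`$(NNI_VERSION_VALUE:v%=%)`); the install.ps1 hunk further down does the same with `.substring(1)`. A minimal Python sketch of that normalization, under the assumption that tags look like `v1.6`:

```python
import re
import subprocess

def nni_version() -> str:
    """Mimic the Makefile: `git describe --tags` with any leading 'v' stripped."""
    tag = subprocess.check_output(["git", "describe", "--tags"], text=True).strip()
    return re.sub(r"^v", "", tag)  # "v1.6" -> "1.6"; "1.6" stays "1.6"
```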
......@@ -25,7 +25,7 @@ The tool manages automated machine learning (AutoML) experiments, **dispatches a
* Researchers and data scientists who want to easily **implement and experiment with new AutoML algorithms**, be it a hyperparameter tuning algorithm, a neural architecture search algorithm, or a model compression algorithm.
* ML Platform owners who want to **support AutoML in their platform**.
### **[NNI v1.5 has been released!](https://github.com/microsoft/nni/releases) &nbsp;<a href="#nni-released-reminder"><img width="48" src="docs/img/release_icon.png"></a>**
### **[NNI v1.6 has been released!](https://github.com/microsoft/nni/releases) &nbsp;<a href="#nni-released-reminder"><img width="48" src="docs/img/release_icon.png"></a>**
## **NNI capabilities in a glance**
......@@ -239,7 +239,7 @@ The following example is built on TensorFlow 1.x. Make sure **TensorFlow 1.x is
* Download the examples by cloning the source code.
```bash
git clone -b v1.5 https://github.com/Microsoft/nni.git
git clone -b v1.6 https://github.com/Microsoft/nni.git
```
* Run the MNIST example.
......
......@@ -13,6 +13,7 @@ endif
TIME_STAMP = $(shell date -u "+%y%m%d%H%M")
NNI_VERSION_VALUE = $(shell git describe --tags --abbrev=0)
NNI_VERSION_VALUE := $(NNI_VERSION_VALUE:v%=%)
# To include time stamp in version value, run:
# make version_ts=true build
......@@ -25,6 +26,7 @@ NNI_YARN_FOLDER ?= $(CWD)nni-yarn
NNI_YARN := PATH=$(CWD)node-$(OS_SPEC)-x64/bin:$${PATH} $(NNI_YARN_FOLDER)/bin/yarn
.PHONY: build
build:
# Building version $(NNI_VERSION_VALUE)
python3 -m pip install --user --upgrade setuptools wheel
wget -q https://aka.ms/nni/nodejs-download/$(OS_SPEC) -O $(CWD)node-$(OS_SPEC)-x64.tar.xz
rm -rf $(CWD)node-$(OS_SPEC)-x64
......
......@@ -15,6 +15,7 @@ else{
$TIME_STAMP = date -u "+%y%m%d%H%M"
$NNI_VERSION_VALUE = git describe --tags --abbrev=0
$NNI_VERSION_VALUE = $NNI_VERSION_VALUE.substring(1)
# To include time stamp in version value, run:
# make version_ts=true build
......
# ChangeLog
## Release 1.6 - 5/26/2020
### Major Features
#### New Features and Improvements
* Raise the IPC limit to 1,000,000 (100W)
* Improve code storage upload logic among trials on non-local platforms
* Support `__version__` for the SDK version
* Support Windows dev install
#### Web UI
* Show trial error messages
* Finalize the homepage layout
* Refactor the overview page's best-trials module
* Remove multiphase from the web UI
* Add a tooltip for trial concurrency on the overview page
* Show top trials in the hyper-parameter graph
#### HPO Updates
* Improve PBT failure handling and support experiment resume for PBT
#### NAS Updates
* NAS support for TensorFlow 2.0 (preview): [TF2.0 NAS examples](https://github.com/microsoft/nni/tree/master/examples/nas/naive-tf)
* Use OrderedDict for LayerChoice
* Prettify the format of export
* Replace layer choice with the selected module after applying a fixed architecture
#### Model Compression Updates
* PyTorch 1.4 support for model compression
#### Training Service Updates
* Update PAI YAML merge logic
* Support Windows as a remote machine in remote mode ([Remote Mode](https://github.com/microsoft/nni/blob/master/docs/en_US/TrainingService/RemoteMachineMode.md#windows))
### Bug Fixes
* Fix dev install
* Fix SPOS example crash when checkpoints do not have state_dict
* Fix table sort issue when an experiment has failed trials
* Support multiple Python environments (conda, pyenv, etc.)
## Release 1.5 - 4/13/2020
### New Features and Documentation
......
......@@ -2,7 +2,6 @@
# Licensed under the MIT license.
import tensorflow as tf
from tensorflow.data import Dataset
def get_dataset():
(x_train, y_train), (x_valid, y_valid) = tf.keras.datasets.cifar10.load_data()
......
......@@ -566,7 +566,7 @@ class NNIManager implements Manager {
assert(this.status.status === 'RUNNING' ||
this.status.status === 'DONE' ||
this.status.status === 'NO_MORE_TRIAL' ||
this.status.status === 'TUNER_NO_MORE_TRIAL');
this.status.status === 'TUNER_NO_MORE_TRIAL', `Actual status: ${this.status.status}`);
if (this.experimentProfile.execDuration > this.experimentProfile.params.maxExecDuration ||
this.currSubmittedTrialNum >= this.experimentProfile.params.maxTrialNum) {
if (this.status.status !== 'DONE') {
......
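The only functional change in this hunk is diagnostic: the bare `assert` gains a message, so a failure now reports the unexpected status instead of an anonymous assertion error. A sketch of the same pattern in Python, with names chosen for illustration:

```python
VALID_STATUSES = {"RUNNING", "DONE", "NO_MORE_TRIAL", "TUNER_NO_MORE_TRIAL"}

def check_status(status: str) -> None:
    # Including the offending value turns a bare AssertionError into a useful log line.
    assert status in VALID_STATUSES, f"Actual status: {status}"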
......@@ -47,10 +47,10 @@ export NNI_EXP_ID={4}
export NNI_CODE_DIR={5}
export NNI_TRIAL_SEQ_ID={6}
{7}
mkdir -p $NNI_SYS_DIR
mkdir -p $NNI_SYS_DIR/code
mkdir -p $NNI_OUTPUT_DIR
cp -r $NNI_CODE_DIR/. $NNI_SYS_DIR
cd $NNI_SYS_DIR
sh install_nni.sh
cp -r $NNI_CODE_DIR/. $NNI_SYS_DIR/code
sh $NNI_SYS_DIR/install_nni.sh
cd $NNI_SYS_DIR/code
python3 -m nni_trial_tool.trial_keeper --trial_command '{8}' --nnimanager_ip {9} --nnimanager_port {10} \
--nni_manager_version '{11}' --log_collection '{12}' 1>$NNI_OUTPUT_DIR/trialkeeper_stdout 2>$NNI_OUTPUT_DIR/trialkeeper_stderr`;
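The recurring theme in these launch-script changes is a layout change: trial code is no longer copied into `$NNI_SYS_DIR` itself but into a `code` subdirectory, so user files cannot collide with the runtime files (`install_nni.sh`, trial keeper output) that live at the top level. A rough Python sketch of the new layout, with the directory roles taken from the shell template above (`shutil.copytree(..., dirs_exist_ok=True)` needs Python 3.8+):

```python
import shutil
from pathlib import Path

def prepare_trial_dirs(nni_sys_dir: str, nni_code_dir: str, nni_output_dir: str) -> None:
    """Mirror the shell template: mkdir -p the dirs, then copy user code into .../code."""
    code_dst = Path(nni_sys_dir) / "code"
    code_dst.mkdir(parents=True, exist_ok=True)   # mkdir -p $NNI_SYS_DIR/code
    Path(nni_output_dir).mkdir(parents=True, exist_ok=True)
    # cp -r $NNI_CODE_DIR/. $NNI_SYS_DIR/code
    shutil.copytree(nni_code_dir, code_dst, dirs_exist_ok=True)
```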
......@@ -477,16 +477,14 @@ class LocalTrainingService implements TrainingService {
private getScript(localTrialConfig: TrialConfig, workingDirectory: string): string[] {
const script: string[] = [];
if (process.platform === 'win32') {
script.push(`Copy-Item $env:NNI_CODE_DIR\\* -Destination $env:NNI_SYS_DIR -Recurse`);
script.push(`cd $env:NNI_SYS_DIR`);
script.push(`cd $env:NNI_CODE_DIR`);
script.push(
`cmd.exe /c ${localTrialConfig.command} 2>"${path.join(workingDirectory, 'stderr')}"`,
`$NOW_DATE = [int64](([datetime]::UtcNow)-(get-date "1/1/1970")).TotalSeconds`,
`$NOW_DATE = "$NOW_DATE" + (Get-Date -Format fff).ToString()`,
`Write $LASTEXITCODE " " $NOW_DATE | Out-File "${path.join(workingDirectory, '.nni', 'state')}" -NoNewline -encoding utf8`);
} else {
script.push(`cp -r $NNI_CODE_DIR/. $NNI_SYS_DIR`);
script.push(`cd $NNI_SYS_DIR`);
script.push(`cd $NNI_CODE_DIR`);
script.push(`eval ${localTrialConfig.command} 2>"${path.join(workingDirectory, 'stderr')}"`);
if (process.platform === 'darwin') {
// https://superuser.com/questions/599072/how-to-get-bash-execution-time-in-milliseconds-under-mac-os-x
......
......@@ -31,6 +31,6 @@ fi`;
export const PAI_K8S_TRIAL_COMMAND_FORMAT: string =
`export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} NNI_TRIAL_SEQ_ID={4} MULTI_PHASE={5} \
&& NNI_CODE_DIR={6} && cp -r $NNI_CODE_DIR/. $NNI_SYS_DIR && cd $NNI_SYS_DIR && sh install_nni.sh \
&& python3 -m nni_trial_tool.trial_keeper --trial_command '{7}' --nnimanager_ip '{8}' --nnimanager_port '{9}' \
&& NNI_CODE_DIR={6} && mkdir -p $NNI_SYS_DIR/code && cp -r $NNI_CODE_DIR/. $NNI_SYS_DIR/code && sh $NNI_SYS_DIR/install_nni.sh \
&& cd $NNI_SYS_DIR/code && python3 -m nni_trial_tool.trial_keeper --trial_command '{7}' --nnimanager_ip '{8}' --nnimanager_port '{9}' \
--nni_manager_version '{10}' --log_collection '{11}'`;
......@@ -54,7 +54,7 @@ const yaml = require('js-yaml');
class PAIK8STrainingService extends PAITrainingService {
protected paiTrialConfig: NNIPAIK8STrialConfig | undefined;
private copyExpCodeDirPromise?: Promise<void>;
private paiJobConfig: undefined;
private paiJobConfig: any;
private nniVersion: string | undefined;
constructor() {
super();
......@@ -190,7 +190,7 @@ class PAIK8STrainingService extends PAITrainingService {
let nniJobConfig: any = undefined;
if (this.paiTrialConfig.paiConfigPath) {
nniJobConfig = this.paiJobConfig;
nniJobConfig = JSON.parse(JSON.stringify(this.paiJobConfig)); //Trick for deep clone in Typescript
nniJobConfig.name = jobName;
// Each taskRole will generate new command in NNI's command format
// Each command will be formatted to NNI style
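The deep-clone line fixes an aliasing bug: `paiJobConfig` was previously assigned by reference, so per-trial mutations such as `nniJobConfig.name = jobName` leaked into the shared template and affected every subsequent trial. `JSON.parse(JSON.stringify(...))` is, as the inline comment says, the usual TypeScript trick for deep-cloning plain data. A Python sketch of the same pitfall and fix, with illustrative names:

```python
import copy

template = {"name": "template", "taskRoles": {"main": {"command": "python train.py"}}}

# Bug: plain assignment aliases the template, so mutating the "copy" corrupts it.
job_a = template
job_a["name"] = "trial-a"
assert template["name"] == "trial-a"   # the shared template was silently modified

# Fix: deep-clone per trial, analogous to JSON.parse(JSON.stringify(template)).
template["name"] = "template"          # restore for the demo
job_b = copy.deepcopy(template)
job_b["name"] = "trial-b"
assert template["name"] == "template"  # the template is untouched
```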
......@@ -290,8 +290,6 @@ class PAIK8STrainingService extends PAITrainingService {
await this.writeParameterFile(trialJobDetail.logPath, trialJobDetail.form.hyperParameters);
}
//Copy codeDir files to local working folder
await execCopydir(this.paiTrialConfig.codeDir, trialJobDetail.logPath);
//Generate Job Configuration in yaml format
const paiJobConfig = this.generateJobConfigInYamlFormat(trialJobDetail);
this.log.debug(paiJobConfig);
......
......@@ -22,10 +22,10 @@ class LinuxCommands extends OsCommands {
export NNI_PLATFORM=remote NNI_SYS_DIR=${workingDirectory} NNI_OUTPUT_DIR=${workingDirectory} NNI_TRIAL_JOB_ID=${trialJobId} \
NNI_EXP_ID=${experimentId} NNI_TRIAL_SEQ_ID=${trialSequenceId} NNI_CODE_DIR=${codeDir}
export MULTI_PHASE=${isMultiPhase}
cp -r $NNI_CODE_DIR/. $NNI_SYS_DIR
cd $NNI_SYS_DIR
sh install_nni.sh
mkdir -p $NNI_SYS_DIR/code
cp -r $NNI_CODE_DIR/. $NNI_SYS_DIR/code
sh $NNI_SYS_DIR/install_nni.sh
cd $NNI_SYS_DIR/code
python3 -m nni_trial_tool.trial_keeper --trial_command '${cudaVisibleSetting} ${command}' --nnimanager_ip '${nniManagerAddress}' \
--nnimanager_port '${nniManagerPort}' --nni_manager_version '${nniManagerVersion}' \
--job_id_file ${jobIdFileName} \
......@@ -93,9 +93,9 @@ class LinuxCommands extends OsCommands {
return result;
}
public killChildProcesses(pidFileName: string): string {
public killChildProcesses(pidFileName: string, killSelf: boolean): string {
// prevent trialkeeper to be killed, so it can save exit code.
const command = `list_descendants ()
let command = `list_descendants ()
{
local children=$(ps -o pid= --ppid "$1")
......@@ -107,6 +107,9 @@ class LinuxCommands extends OsCommands {
echo "$children"
}
kill $(list_descendants \`cat '${pidFileName}'\`)`
if (killSelf) {
command += `\nkill \`cat '${pidFileName}'\``
}
return command;
}
......
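The `killSelf` flag is the substantive change in both OS command classes: the Linux variant recursively collects descendants of the PID recorded in the pid file (via `ps -o pid= --ppid`) and kills them, and can now also kill the recorded process itself, which the remote training service uses when tearing down the GPU metrics collector. A hedged sketch of the same traversal using the third-party `psutil` package (not what NNI ships, but it performs the identical walk on both Linux and Windows):

```python
import psutil

def kill_child_processes(pid_file: str, kill_self: bool = False) -> None:
    """Kill all descendants of the PID in pid_file; optionally the process itself."""
    with open(pid_file) as f:
        root = psutil.Process(int(f.read().strip()))
    for child in root.children(recursive=True):  # same set as the recursive ps walk
        child.kill()
    if kill_self:
        root.kill()
```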
......@@ -28,9 +28,9 @@ class WindowsCommands extends OsCommands {
set MULTI_PHASE=${isMultiPhase}
set NNI_CODE_DIR=${codeDir}
${cudaVisibleSetting !== "" ? "set " + cudaVisibleSetting : ""}
robocopy /s %NNI_CODE_DIR%/. %NNI_SYS_DIR%
cd %NNI_SYS_DIR%
md %NNI_SYS_DIR%/code
robocopy /s %NNI_CODE_DIR%/. %NNI_SYS_DIR%/code
cd %NNI_SYS_DIR%/code
python -c "import nni" 2>nul
if not %ERRORLEVEL% EQU 0 (
echo installing NNI as exit code of "import nni" is %ERRORLEVEL%
......@@ -102,11 +102,14 @@ class WindowsCommands extends OsCommands {
return result;
}
public killChildProcesses(pidFileName: string): string {
const command = `powershell "$ppid=(type ${pidFileName}); function Kill-Tree {Param([int]$subppid);` +
public killChildProcesses(pidFileName: string, killSelf: boolean): string {
let command = `powershell "$ppid=(type ${pidFileName}); function Kill-Tree {Param([int]$subppid);` +
`Get-CimInstance Win32_Process | Where-Object { $_.ParentProcessId -eq $subppid } | ForEach-Object { Kill-Tree $_.ProcessId }; ` +
`if ($subppid -ne $ppid){Stop-Process -Id $subppid}}` +
`if ($subppid -ne $ppid){Stop-Process -Id $subppid -Force}}` +
`kill-tree $ppid"`;
if (killSelf){
command += `;Stop-Process -Id $ppid`;
}
return command;
}
......
......@@ -25,7 +25,7 @@ abstract class OsCommands {
public abstract readLastLines(fileName: string, lineCount: number): string;
public abstract isProcessAliveCommand(pidFileName: string): string;
public abstract isProcessAliveProcessOutput(result: RemoteCommandResult): boolean;
public abstract killChildProcesses(pidFileName: string): string;
public abstract killChildProcesses(pidFileName: string, killSelf: boolean): string;
public abstract extractFile(tarFileName: string, targetFolder: string): string;
public abstract executeScript(script: string, isFile: boolean): string;
......
......@@ -96,8 +96,8 @@ class RemoteMachineTrainingService implements TrainingService {
}
}
if (restServer.getErrorMessage !== undefined) {
throw new Error(restServer.getErrorMessage);
this.stopping = true;
throw new Error(restServer.getErrorMessage);
}
await delay(3000);
}
......@@ -394,7 +394,7 @@ class RemoteMachineTrainingService implements TrainingService {
if (executor !== undefined) {
this.log.info(`killing gpu metric collector on ${executor.name}`);
const gpuJobPidPath: string = executor.joinPath(executor.getRemoteScriptsPath(getExperimentId()), 'pid');
await executor.killChildProcesses(gpuJobPidPath);
await executor.killChildProcesses(gpuJobPidPath, true);
}
executorManager.releaseAllExecutor();
}
......@@ -460,6 +460,10 @@ class RemoteMachineTrainingService implements TrainingService {
this.timer.unsubscribe(disposable);
}
}
if (this.stopping){
this.timer.unsubscribe(disposable);
this.log.debug(`Stopped GPU collector on ${rmMeta.ip}, since experiment is exiting.`);
}
collectingCount.pop();
}
}
......
......@@ -230,8 +230,8 @@ class ShellExecutor {
return result !== undefined ? result : false;
}
public async killChildProcesses(pidFileName: string): Promise<boolean> {
const commandText = this.osCommands && this.osCommands.killChildProcesses(pidFileName);
public async killChildProcesses(pidFileName: string, killSelf: boolean = false): Promise<boolean> {
const commandText = this.osCommands && this.osCommands.killChildProcesses(pidFileName, killSelf);
const commandResult = await this.execute(commandText);
return commandResult.exitCode == 0;
}
......
......@@ -4,7 +4,6 @@
import logging
import tensorflow as tf
from tensorflow.data import Dataset
from tensorflow.keras.optimizers import Adam
from nni.nas.tensorflow.utils import AverageMeterGroup, fill_zero_grads
......@@ -39,9 +38,9 @@ class EnasTrainer:
x, y = dataset_train
split = int(len(x) * 0.9)
self.train_set = Dataset.from_tensor_slices((x[:split], y[:split]))
self.valid_set = Dataset.from_tensor_slices((x[split:], y[split:]))
self.test_set = Dataset.from_tensor_slices(dataset_valid)
self.train_set = tf.data.Dataset.from_tensor_slices((x[:split], y[:split]))
self.valid_set = tf.data.Dataset.from_tensor_slices((x[split:], y[split:]))
self.test_set = tf.data.Dataset.from_tensor_slices(dataset_valid)
self.mutator = EnasMutator(model)
self.mutator_optim = Adam(learning_rate=mutator_lr)
......@@ -151,9 +150,9 @@ class EnasTrainer:
def _create_train_loader(self):
train_set = self.train_set.shuffle(1000000).batch(self.batch_size)
test_set = self.test_set.shuffle(1000000).batch(self.batch_size)
train_set = self.train_set.shuffle(1000000).repeat().batch(self.batch_size)
test_set = self.test_set.shuffle(1000000).repeat().batch(self.batch_size)
return iter(train_set), iter(test_set)
def _create_validate_loader(self):
return iter(self.test_set.shuffle(1000000).batch(self.batch_size))
return iter(self.test_set.shuffle(1000000).repeat().batch(self.batch_size))
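The `.repeat()` calls matter because the trainer consumes these datasets through `iter(...)` across multiple epochs: a finite `tf.data.Dataset` iterator is exhausted after one pass and then raises `StopIteration`, while `.repeat()` with no argument re-yields the elements indefinitely. A minimal sketch of the difference, assuming TensorFlow 2.x eager mode:

```python
import tensorflow as tf

ds = tf.data.Dataset.from_tensor_slices([1, 2, 3])

finite = iter(ds.batch(2))
next(finite), next(finite)   # two batches: [1, 2] and [3]
# next(finite) would now raise StopIteration: the dataset is exhausted

endless = iter(ds.repeat().batch(2))   # repeat elements first, then batch
for _ in range(10):
    next(endless)                      # never exhausts; batches may span epochs
```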
......@@ -11,31 +11,9 @@ import traceback
from xml.dom import minidom
def check_ready_to_run():
if sys.platform == 'win32':
pgrep_output = subprocess.check_output(
'wmic process where "CommandLine like \'%nni_gpu_tool.gpu_metrics_collector%\' and name like \'%python%\'" get processId')
pidList = pgrep_output.decode("utf-8").strip().split()
pidList.pop(0) # remove the key word 'ProcessId'
pidList = list(map(int, pidList))
pidList.remove(os.getpid())
return not pidList
else:
pgrep_output = subprocess.check_output('pgrep -afu "$(whoami)" \'python3 -m nni_gpu_tool.gpu_metrics_collector\'', shell=True)
pidList = []
for pid in pgrep_output.splitlines():
pid = pid.decode()
if "pgrep " in pid or pid.startswith('%s ' % os.getpid()) or pid.startswith('%s ' % os.getppid()):
continue
pidList.append(pid)
return not pidList
def main(argv):
metrics_output_dir = os.environ['METRIC_OUTPUT_DIR']
if check_ready_to_run() == False:
print("GPU metrics collector is already running. exiting...")
exit(2)
cmd = 'nvidia-smi -q -x'.split()
while(True):
try:
......
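What survives this hunk is the collector's core loop: shell out to `nvidia-smi -q -x` and parse the XML with `minidom` (both visible above). The duplicate-instance guard is removed because, per the release notes, supporting multiple Python environments made the pgrep/wmic self-detection unreliable. A hedged sketch of the parsing step, with tag names assumed from nvidia-smi's XML schema:

```python
import subprocess
from xml.dom import minidom

def read_gpu_utilization() -> list:
    """Run `nvidia-smi -q -x` and return per-GPU utilization strings (e.g. '53 %')."""
    xml = subprocess.check_output(["nvidia-smi", "-q", "-x"])
    doc = minidom.parseString(xml)
    # <gpu><utilization><gpu_util>NN %</gpu_util>... (tag names per nvidia-smi's schema)
    return [gpu.getElementsByTagName("gpu_util")[0].firstChild.data.strip()
            for gpu in doc.getElementsByTagName("gpu")]
```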