"vscode:/vscode.git/clone" did not exist on "70368cec0a993694bde8ffa98d894b8bcb60d41c"
Unverified commit d628942b authored by SparkSnail, committed by GitHub

Merge pull request #2492 from microsoft/v1.6

merge V1.6 back
parents a4bbb796 7d5feeb9
......@@ -60,6 +60,7 @@ NNI_YARN ?= PATH=$(BIN_FOLDER):$${PATH} $(NNI_YARN_FOLDER)/bin/yarn
## Version number
NNI_VERSION_VALUE = $(shell git describe --tags)
NNI_VERSION_VALUE := $(NNI_VERSION_VALUE:v%=%)
NNI_VERSION_TEMPLATE = 999.0.0-developing
# Main targets
......
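Both Makefile hunks derive the package version from `git describe` and strip the leading `v` with a substitution reference (`$(NNI_VERSION_VALUE:v%=%)`); the install.ps1 hunk further down does the same with `.substring(1)`. A minimal Python sketch of that normalization, under the assumption that tags look like `v1.6`:

```python
import re
import subprocess

def nni_version() -> str:
    """Mimic the Makefile: `git describe --tags` with any leading 'v' stripped."""
    tag = subprocess.check_output(["git", "describe", "--tags"], text=True).strip()
    return re.sub(r"^v", "", tag)  # "v1.6" -> "1.6"; "1.6" stays "1.6"
```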
......@@ -25,7 +25,7 @@ The tool manages automated machine learning (AutoML) experiments, **dispatches a
* Researchers and data scientists who want to easily **implement and experiment with new AutoML algorithms**, be it a hyperparameter tuning algorithm, a neural architecture search algorithm, or a model compression algorithm.
* ML Platform owners who want to **support AutoML in their platform**.
### **[NNI v1.5 has been released!](https://github.com/microsoft/nni/releases) &nbsp;<a href="#nni-released-reminder"><img width="48" src="docs/img/release_icon.png"></a>**
### **[NNI v1.6 has been released!](https://github.com/microsoft/nni/releases) &nbsp;<a href="#nni-released-reminder"><img width="48" src="docs/img/release_icon.png"></a>**
## **NNI capabilities in a glance**
......@@ -239,7 +239,7 @@ The following example is built on TensorFlow 1.x. Make sure **TensorFlow 1.x is
* Download the examples by cloning the source code.
```bash
git clone -b v1.5 https://github.com/Microsoft/nni.git
git clone -b v1.6 https://github.com/Microsoft/nni.git
```
* Run the MNIST example.
......
......@@ -13,6 +13,7 @@ endif
TIME_STAMP = $(shell date -u "+%y%m%d%H%M")
NNI_VERSION_VALUE = $(shell git describe --tags --abbrev=0)
NNI_VERSION_VALUE := $(NNI_VERSION_VALUE:v%=%)
# To include time stamp in version value, run:
# make version_ts=true build
......@@ -25,6 +26,7 @@ NNI_YARN_FOLDER ?= $(CWD)nni-yarn
NNI_YARN := PATH=$(CWD)node-$(OS_SPEC)-x64/bin:$${PATH} $(NNI_YARN_FOLDER)/bin/yarn
.PHONY: build
build:
# Building version $(NNI_VERSION_VALUE)
python3 -m pip install --user --upgrade setuptools wheel
wget -q https://aka.ms/nni/nodejs-download/$(OS_SPEC) -O $(CWD)node-$(OS_SPEC)-x64.tar.xz
rm -rf $(CWD)node-$(OS_SPEC)-x64
......
......@@ -15,6 +15,7 @@ else{
$TIME_STAMP = date -u "+%y%m%d%H%M"
$NNI_VERSION_VALUE = git describe --tags --abbrev=0
$NNI_VERSION_VALUE = $NNI_VERSION_VALUE.substring(1)
# To include time stamp in version value, run:
# make version_ts=true build
......
# ChangeLog
## Release 1.6 - 5/26/2020
### Major Features
#### New Features and Improvements
* Raise the IPC limit to 1,000,000 (100W)
* Improve code storage upload logic among trials on non-local platforms
* Support `__version__` for the SDK version
* Support Windows dev install
#### Web UI
* Show trial error messages
* Finalize the homepage layout
* Refactor the overview page's best-trials module
* Remove multiphase from the web UI
* Add a tooltip for trial concurrency on the overview page
* Show top trials in the hyper-parameter graph
#### HPO Updates
* Improve PBT failure handling and support experiment resume for PBT
#### NAS Updates
* NAS support for TensorFlow 2.0 (preview): [TF2.0 NAS examples](https://github.com/microsoft/nni/tree/master/examples/nas/naive-tf)
* Use OrderedDict for LayerChoice
* Prettify the format of export
* Replace layer choice with the selected module after applying a fixed architecture
#### Model Compression Updates
* PyTorch 1.4 support for model compression
#### Training Service Updates
* Update PAI YAML merge logic
* Support Windows as a remote machine in remote mode ([Remote Mode](https://github.com/microsoft/nni/blob/master/docs/en_US/TrainingService/RemoteMachineMode.md#windows))
### Bug Fixes
* Fix dev install
* Fix SPOS example crash when checkpoints do not have state_dict
* Fix table sort issue when an experiment has failed trials
* Support multiple Python environments (conda, pyenv, etc.)
## Release 1.5 - 4/13/2020
### New Features and Documentation
......
......@@ -2,7 +2,6 @@
# Licensed under the MIT license.
import tensorflow as tf
from tensorflow.data import Dataset
def get_dataset():
(x_train, y_train), (x_valid, y_valid) = tf.keras.datasets.cifar10.load_data()
......
......@@ -566,7 +566,7 @@ class NNIManager implements Manager {
assert(this.status.status === 'RUNNING' ||
this.status.status === 'DONE' ||
this.status.status === 'NO_MORE_TRIAL' ||
this.status.status === 'TUNER_NO_MORE_TRIAL');
this.status.status === 'TUNER_NO_MORE_TRIAL', `Actual status: ${this.status.status}`);
if (this.experimentProfile.execDuration > this.experimentProfile.params.maxExecDuration ||
this.currSubmittedTrialNum >= this.experimentProfile.params.maxTrialNum) {
if (this.status.status !== 'DONE') {
......
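The only functional change in this hunk is diagnostic: the bare `assert` gains a message, so a failure now reports the unexpected status instead of an anonymous assertion error. A sketch of the same pattern in Python, with names chosen for illustration:

```python
VALID_STATUSES = {"RUNNING", "DONE", "NO_MORE_TRIAL", "TUNER_NO_MORE_TRIAL"}

def check_status(status: str) -> None:
    # Including the offending value turns a bare AssertionError into a useful log line.
    assert status in VALID_STATUSES, f"Actual status: {status}"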
......@@ -47,10 +47,10 @@ export NNI_EXP_ID={4}
export NNI_CODE_DIR={5}
export NNI_TRIAL_SEQ_ID={6}
{7}
mkdir -p $NNI_SYS_DIR
mkdir -p $NNI_SYS_DIR/code
mkdir -p $NNI_OUTPUT_DIR
cp -r $NNI_CODE_DIR/. $NNI_SYS_DIR
cd $NNI_SYS_DIR
sh install_nni.sh
cp -r $NNI_CODE_DIR/. $NNI_SYS_DIR/code
sh $NNI_SYS_DIR/install_nni.sh
cd $NNI_SYS_DIR/code
python3 -m nni_trial_tool.trial_keeper --trial_command '{8}' --nnimanager_ip {9} --nnimanager_port {10} \
--nni_manager_version '{11}' --log_collection '{12}' 1>$NNI_OUTPUT_DIR/trialkeeper_stdout 2>$NNI_OUTPUT_DIR/trialkeeper_stderr`;
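The recurring theme in these launch-script changes is a layout change: trial code is no longer copied into `$NNI_SYS_DIR` itself but into a `code` subdirectory, so user files cannot collide with the runtime files (`install_nni.sh`, trial keeper output) that live at the top level. A rough Python sketch of the new layout, with the directory roles taken from the shell template above (`shutil.copytree(..., dirs_exist_ok=True)` needs Python 3.8+):

```python
import shutil
from pathlib import Path

def prepare_trial_dirs(nni_sys_dir: str, nni_code_dir: str, nni_output_dir: str) -> None:
    """Mirror the shell template: mkdir -p the dirs, then copy user code into .../code."""
    code_dst = Path(nni_sys_dir) / "code"
    code_dst.mkdir(parents=True, exist_ok=True)   # mkdir -p $NNI_SYS_DIR/code
    Path(nni_output_dir).mkdir(parents=True, exist_ok=True)
    # cp -r $NNI_CODE_DIR/. $NNI_SYS_DIR/code
    shutil.copytree(nni_code_dir, code_dst, dirs_exist_ok=True)
```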
......@@ -477,16 +477,14 @@ class LocalTrainingService implements TrainingService {
private getScript(localTrialConfig: TrialConfig, workingDirectory: string): string[] {
const script: string[] = [];
if (process.platform === 'win32') {
script.push(`Copy-Item $env:NNI_CODE_DIR\\* -Destination $env:NNI_SYS_DIR -Recurse`);
script.push(`cd $env:NNI_SYS_DIR`);
script.push(`cd $env:NNI_CODE_DIR`);
script.push(
`cmd.exe /c ${localTrialConfig.command} 2>"${path.join(workingDirectory, 'stderr')}"`,
`$NOW_DATE = [int64](([datetime]::UtcNow)-(get-date "1/1/1970")).TotalSeconds`,
`$NOW_DATE = "$NOW_DATE" + (Get-Date -Format fff).ToString()`,
`Write $LASTEXITCODE " " $NOW_DATE | Out-File "${path.join(workingDirectory, '.nni', 'state')}" -NoNewline -encoding utf8`);
} else {
script.push(`cp -r $NNI_CODE_DIR/. $NNI_SYS_DIR`);
script.push(`cd $NNI_SYS_DIR`);
script.push(`cd $NNI_CODE_DIR`);
script.push(`eval ${localTrialConfig.command} 2>"${path.join(workingDirectory, 'stderr')}"`);
if (process.platform === 'darwin') {
// https://superuser.com/questions/599072/how-to-get-bash-execution-time-in-milliseconds-under-mac-os-x
......
......@@ -31,6 +31,6 @@ fi`;
export const PAI_K8S_TRIAL_COMMAND_FORMAT: string =
`export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} NNI_TRIAL_SEQ_ID={4} MULTI_PHASE={5} \
&& NNI_CODE_DIR={6} && cp -r $NNI_CODE_DIR/. $NNI_SYS_DIR && cd $NNI_SYS_DIR && sh install_nni.sh \
&& python3 -m nni_trial_tool.trial_keeper --trial_command '{7}' --nnimanager_ip '{8}' --nnimanager_port '{9}' \
&& NNI_CODE_DIR={6} && mkdir -p $NNI_SYS_DIR/code && cp -r $NNI_CODE_DIR/. $NNI_SYS_DIR/code && sh $NNI_SYS_DIR/install_nni.sh \
&& cd $NNI_SYS_DIR/code && python3 -m nni_trial_tool.trial_keeper --trial_command '{7}' --nnimanager_ip '{8}' --nnimanager_port '{9}' \
--nni_manager_version '{10}' --log_collection '{11}'`;
......@@ -54,7 +54,7 @@ const yaml = require('js-yaml');
class PAIK8STrainingService extends PAITrainingService {
protected paiTrialConfig: NNIPAIK8STrialConfig | undefined;
private copyExpCodeDirPromise?: Promise<void>;
private paiJobConfig: undefined;
private paiJobConfig: any;
private nniVersion: string | undefined;
constructor() {
super();
......@@ -190,7 +190,7 @@ class PAIK8STrainingService extends PAITrainingService {
let nniJobConfig: any = undefined;
if (this.paiTrialConfig.paiConfigPath) {
nniJobConfig = this.paiJobConfig;
nniJobConfig = JSON.parse(JSON.stringify(this.paiJobConfig)); //Trick for deep clone in Typescript
nniJobConfig.name = jobName;
// Each taskRole will generate new command in NNI's command format
// Each command will be formatted to NNI style
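The deep-clone line fixes an aliasing bug: `paiJobConfig` was previously assigned by reference, so per-trial mutations such as `nniJobConfig.name = jobName` leaked into the shared template and affected every subsequent trial. `JSON.parse(JSON.stringify(...))` is, as the inline comment says, the usual TypeScript trick for deep-cloning plain data. A Python sketch of the same pitfall and fix, with illustrative names:

```python
import copy

template = {"name": "template", "taskRoles": {"main": {"command": "python train.py"}}}

# Bug: plain assignment aliases the template, so mutating the "copy" corrupts it.
job_a = template
job_a["name"] = "trial-a"
assert template["name"] == "trial-a"   # the shared template was silently modified

# Fix: deep-clone per trial, analogous to JSON.parse(JSON.stringify(template)).
template["name"] = "template"          # restore for the demo
job_b = copy.deepcopy(template)
job_b["name"] = "trial-b"
assert template["name"] == "template"  # the template is untouched
```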
......@@ -290,8 +290,6 @@ class PAIK8STrainingService extends PAITrainingService {
await this.writeParameterFile(trialJobDetail.logPath, trialJobDetail.form.hyperParameters);
}
//Copy codeDir files to local working folder
await execCopydir(this.paiTrialConfig.codeDir, trialJobDetail.logPath);
//Generate Job Configuration in yaml format
const paiJobConfig = this.generateJobConfigInYamlFormat(trialJobDetail);
this.log.debug(paiJobConfig);
......
......@@ -22,10 +22,10 @@ class LinuxCommands extends OsCommands {
export NNI_PLATFORM=remote NNI_SYS_DIR=${workingDirectory} NNI_OUTPUT_DIR=${workingDirectory} NNI_TRIAL_JOB_ID=${trialJobId} \
NNI_EXP_ID=${experimentId} NNI_TRIAL_SEQ_ID=${trialSequenceId} NNI_CODE_DIR=${codeDir}
export MULTI_PHASE=${isMultiPhase}
cp -r $NNI_CODE_DIR/. $NNI_SYS_DIR
cd $NNI_SYS_DIR
sh install_nni.sh
mkdir -p $NNI_SYS_DIR/code
cp -r $NNI_CODE_DIR/. $NNI_SYS_DIR/code
sh $NNI_SYS_DIR/install_nni.sh
cd $NNI_SYS_DIR/code
python3 -m nni_trial_tool.trial_keeper --trial_command '${cudaVisibleSetting} ${command}' --nnimanager_ip '${nniManagerAddress}' \
--nnimanager_port '${nniManagerPort}' --nni_manager_version '${nniManagerVersion}' \
--job_id_file ${jobIdFileName} \
......@@ -93,9 +93,9 @@ class LinuxCommands extends OsCommands {
return result;
}
public killChildProcesses(pidFileName: string): string {
public killChildProcesses(pidFileName: string, killSelf: boolean): string {
// prevent trialkeeper to be killed, so it can save exit code.
const command = `list_descendants ()
let command = `list_descendants ()
{
local children=$(ps -o pid= --ppid "$1")
......@@ -107,6 +107,9 @@ class LinuxCommands extends OsCommands {
echo "$children"
}
kill $(list_descendants \`cat '${pidFileName}'\`)`
if (killSelf) {
command += `\nkill \`cat '${pidFileName}'\``
}
return command;
}
......
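The `killSelf` flag is the substantive change in both OS command classes: the Linux variant recursively collects descendants of the PID recorded in the pid file (via `ps -o pid= --ppid`) and kills them, and can now also kill the recorded process itself, which the remote training service uses when tearing down the GPU metrics collector. A hedged sketch of the same traversal using the third-party `psutil` package (not what NNI ships, but it performs the identical walk on both Linux and Windows):

```python
import psutil

def kill_child_processes(pid_file: str, kill_self: bool = False) -> None:
    """Kill all descendants of the PID in pid_file; optionally the process itself."""
    with open(pid_file) as f:
        root = psutil.Process(int(f.read().strip()))
    for child in root.children(recursive=True):  # same set as the recursive ps walk
        child.kill()
    if kill_self:
        root.kill()
```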
......@@ -28,9 +28,9 @@ class WindowsCommands extends OsCommands {
set MULTI_PHASE=${isMultiPhase}
set NNI_CODE_DIR=${codeDir}
${cudaVisibleSetting !== "" ? "set " + cudaVisibleSetting : ""}
robocopy /s %NNI_CODE_DIR%/. %NNI_SYS_DIR%
cd %NNI_SYS_DIR%
md %NNI_SYS_DIR%/code
robocopy /s %NNI_CODE_DIR%/. %NNI_SYS_DIR%/code
cd %NNI_SYS_DIR%/code
python -c "import nni" 2>nul
if not %ERRORLEVEL% EQU 0 (
echo installing NNI as exit code of "import nni" is %ERRORLEVEL%
......@@ -102,11 +102,14 @@ class WindowsCommands extends OsCommands {
return result;
}
public killChildProcesses(pidFileName: string): string {
const command = `powershell "$ppid=(type ${pidFileName}); function Kill-Tree {Param([int]$subppid);` +
public killChildProcesses(pidFileName: string, killSelf: boolean): string {
let command = `powershell "$ppid=(type ${pidFileName}); function Kill-Tree {Param([int]$subppid);` +
`Get-CimInstance Win32_Process | Where-Object { $_.ParentProcessId -eq $subppid } | ForEach-Object { Kill-Tree $_.ProcessId }; ` +
`if ($subppid -ne $ppid){Stop-Process -Id $subppid}}` +
`if ($subppid -ne $ppid){Stop-Process -Id $subppid -Force}}` +
`kill-tree $ppid"`;
if (killSelf){
command += `;Stop-Process -Id $ppid`;
}
return command;
}
......
......@@ -25,7 +25,7 @@ abstract class OsCommands {
public abstract readLastLines(fileName: string, lineCount: number): string;
public abstract isProcessAliveCommand(pidFileName: string): string;
public abstract isProcessAliveProcessOutput(result: RemoteCommandResult): boolean;
public abstract killChildProcesses(pidFileName: string): string;
public abstract killChildProcesses(pidFileName: string, killSelf: boolean): string;
public abstract extractFile(tarFileName: string, targetFolder: string): string;
public abstract executeScript(script: string, isFile: boolean): string;
......
......@@ -96,8 +96,8 @@ class RemoteMachineTrainingService implements TrainingService {
}
}
if (restServer.getErrorMessage !== undefined) {
throw new Error(restServer.getErrorMessage);
this.stopping = true;
throw new Error(restServer.getErrorMessage);
}
await delay(3000);
}
......@@ -394,7 +394,7 @@ class RemoteMachineTrainingService implements TrainingService {
if (executor !== undefined) {
this.log.info(`killing gpu metric collector on ${executor.name}`);
const gpuJobPidPath: string = executor.joinPath(executor.getRemoteScriptsPath(getExperimentId()), 'pid');
await executor.killChildProcesses(gpuJobPidPath);
await executor.killChildProcesses(gpuJobPidPath, true);
}
executorManager.releaseAllExecutor();
}
......@@ -460,6 +460,10 @@ class RemoteMachineTrainingService implements TrainingService {
this.timer.unsubscribe(disposable);
}
}
if (this.stopping){
this.timer.unsubscribe(disposable);
this.log.debug(`Stopped GPU collector on ${rmMeta.ip}, since experiment is exiting.`);
}
collectingCount.pop();
}
}
......
......@@ -230,8 +230,8 @@ class ShellExecutor {
return result !== undefined ? result : false;
}
public async killChildProcesses(pidFileName: string): Promise<boolean> {
const commandText = this.osCommands && this.osCommands.killChildProcesses(pidFileName);
public async killChildProcesses(pidFileName: string, killSelf: boolean = false): Promise<boolean> {
const commandText = this.osCommands && this.osCommands.killChildProcesses(pidFileName, killSelf);
const commandResult = await this.execute(commandText);
return commandResult.exitCode == 0;
}
......
......@@ -4,7 +4,6 @@
import logging
import tensorflow as tf
from tensorflow.data import Dataset
from tensorflow.keras.optimizers import Adam
from nni.nas.tensorflow.utils import AverageMeterGroup, fill_zero_grads
......@@ -39,9 +38,9 @@ class EnasTrainer:
x, y = dataset_train
split = int(len(x) * 0.9)
self.train_set = Dataset.from_tensor_slices((x[:split], y[:split]))
self.valid_set = Dataset.from_tensor_slices((x[split:], y[split:]))
self.test_set = Dataset.from_tensor_slices(dataset_valid)
self.train_set = tf.data.Dataset.from_tensor_slices((x[:split], y[:split]))
self.valid_set = tf.data.Dataset.from_tensor_slices((x[split:], y[split:]))
self.test_set = tf.data.Dataset.from_tensor_slices(dataset_valid)
self.mutator = EnasMutator(model)
self.mutator_optim = Adam(learning_rate=mutator_lr)
......@@ -151,9 +150,9 @@ class EnasTrainer:
def _create_train_loader(self):
train_set = self.train_set.shuffle(1000000).batch(self.batch_size)
test_set = self.test_set.shuffle(1000000).batch(self.batch_size)
train_set = self.train_set.shuffle(1000000).repeat().batch(self.batch_size)
test_set = self.test_set.shuffle(1000000).repeat().batch(self.batch_size)
return iter(train_set), iter(test_set)
def _create_validate_loader(self):
return iter(self.test_set.shuffle(1000000).batch(self.batch_size))
return iter(self.test_set.shuffle(1000000).repeat().batch(self.batch_size))
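The `.repeat()` calls matter because the trainer consumes these datasets through `iter(...)` across multiple epochs: a finite `tf.data.Dataset` iterator is exhausted after one pass and then raises `StopIteration`, while `.repeat()` with no argument re-yields the elements indefinitely. A minimal sketch of the difference, assuming TensorFlow 2.x eager mode:

```python
import tensorflow as tf

ds = tf.data.Dataset.from_tensor_slices([1, 2, 3])

finite = iter(ds.batch(2))
next(finite), next(finite)   # two batches: [1, 2] and [3]
# next(finite) would now raise StopIteration: the dataset is exhausted

endless = iter(ds.repeat().batch(2))   # repeat elements first, then batch
for _ in range(10):
    next(endless)                      # never exhausts; batches may span epochs
```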
......@@ -11,31 +11,9 @@ import traceback
from xml.dom import minidom
def check_ready_to_run():
if sys.platform == 'win32':
pgrep_output = subprocess.check_output(
'wmic process where "CommandLine like \'%nni_gpu_tool.gpu_metrics_collector%\' and name like \'%python%\'" get processId')
pidList = pgrep_output.decode("utf-8").strip().split()
pidList.pop(0) # remove the key word 'ProcessId'
pidList = list(map(int, pidList))
pidList.remove(os.getpid())
return not pidList
else:
pgrep_output = subprocess.check_output('pgrep -afu "$(whoami)" \'python3 -m nni_gpu_tool.gpu_metrics_collector\'', shell=True)
pidList = []
for pid in pgrep_output.splitlines():
pid = pid.decode()
if "pgrep " in pid or pid.startswith('%s ' % os.getpid()) or pid.startswith('%s ' % os.getppid()):
continue
pidList.append(pid)
return not pidList
def main(argv):
metrics_output_dir = os.environ['METRIC_OUTPUT_DIR']
if check_ready_to_run() == False:
print("GPU metrics collector is already running. exiting...")
exit(2)
cmd = 'nvidia-smi -q -x'.split()
while(True):
try:
......
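What survives this hunk is the collector's core loop: shell out to `nvidia-smi -q -x` and parse the XML with `minidom` (both visible above). The duplicate-instance guard is removed because, per the release notes, supporting multiple Python environments made the pgrep/wmic self-detection unreliable. A hedged sketch of the parsing step, with tag names assumed from nvidia-smi's XML schema:

```python
import subprocess
from xml.dom import minidom

def read_gpu_utilization() -> list:
    """Run `nvidia-smi -q -x` and return per-GPU utilization strings (e.g. '53 %')."""
    xml = subprocess.check_output(["nvidia-smi", "-q", "-x"])
    doc = minidom.parseString(xml)
    # <gpu><utilization><gpu_util>NN %</gpu_util>... (tag names per nvidia-smi's schema)
    return [gpu.getElementsByTagName("gpu_util")[0].firstChild.data.strip()
            for gpu in doc.getElementsByTagName("gpu")]
```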