Unverified Commit d628942b authored by SparkSnail's avatar SparkSnail Committed by GitHub
Browse files

Merge pull request #2492 from microsoft/v1.6

merge V1.6 back
parents a4bbb796 7d5feeb9
...@@ -60,6 +60,7 @@ NNI_YARN ?= PATH=$(BIN_FOLDER):$${PATH} $(NNI_YARN_FOLDER)/bin/yarn ...@@ -60,6 +60,7 @@ NNI_YARN ?= PATH=$(BIN_FOLDER):$${PATH} $(NNI_YARN_FOLDER)/bin/yarn
## Version number ## Version number
NNI_VERSION_VALUE = $(shell git describe --tags) NNI_VERSION_VALUE = $(shell git describe --tags)
NNI_VERSION_VALUE := $(NNI_VERSION_VALUE:v%=%)
NNI_VERSION_TEMPLATE = 999.0.0-developing NNI_VERSION_TEMPLATE = 999.0.0-developing
# Main targets # Main targets
......
...@@ -25,7 +25,7 @@ The tool manages automated machine learning (AutoML) experiments, **dispatches a ...@@ -25,7 +25,7 @@ The tool manages automated machine learning (AutoML) experiments, **dispatches a
* Researchers and data scientists who want to easily **implement and experiment new AutoML algorithms**, may it be: hyperparameter tuning algorithm, neural architect search algorithm or model compression algorithm. * Researchers and data scientists who want to easily **implement and experiment new AutoML algorithms**, may it be: hyperparameter tuning algorithm, neural architect search algorithm or model compression algorithm.
* ML Platform owners who want to **support AutoML in their platform**. * ML Platform owners who want to **support AutoML in their platform**.
### **[NNI v1.5 has been released!](https://github.com/microsoft/nni/releases) &nbsp;<a href="#nni-released-reminder"><img width="48" src="docs/img/release_icon.png"></a>** ### **[NNI v1.6 has been released!](https://github.com/microsoft/nni/releases) &nbsp;<a href="#nni-released-reminder"><img width="48" src="docs/img/release_icon.png"></a>**
## **NNI capabilities in a glance** ## **NNI capabilities in a glance**
...@@ -239,7 +239,7 @@ The following example is built on TensorFlow 1.x. Make sure **TensorFlow 1.x is ...@@ -239,7 +239,7 @@ The following example is built on TensorFlow 1.x. Make sure **TensorFlow 1.x is
* Download the examples via clone the source code. * Download the examples via clone the source code.
```bash ```bash
git clone -b v1.5 https://github.com/Microsoft/nni.git git clone -b v1.6 https://github.com/Microsoft/nni.git
``` ```
* Run the MNIST example. * Run the MNIST example.
......
...@@ -13,6 +13,7 @@ endif ...@@ -13,6 +13,7 @@ endif
TIME_STAMP = $(shell date -u "+%y%m%d%H%M") TIME_STAMP = $(shell date -u "+%y%m%d%H%M")
NNI_VERSION_VALUE = $(shell git describe --tags --abbrev=0) NNI_VERSION_VALUE = $(shell git describe --tags --abbrev=0)
NNI_VERSION_VALUE := $(NNI_VERSION_VALUE:v%=%)
# To include time stamp in version value, run: # To include time stamp in version value, run:
# make version_ts=true build # make version_ts=true build
...@@ -25,6 +26,7 @@ NNI_YARN_FOLDER ?= $(CWD)nni-yarn ...@@ -25,6 +26,7 @@ NNI_YARN_FOLDER ?= $(CWD)nni-yarn
NNI_YARN := PATH=$(CWD)node-$(OS_SPEC)-x64/bin:$${PATH} $(NNI_YARN_FOLDER)/bin/yarn NNI_YARN := PATH=$(CWD)node-$(OS_SPEC)-x64/bin:$${PATH} $(NNI_YARN_FOLDER)/bin/yarn
.PHONY: build .PHONY: build
build: build:
# Building version $(NNI_VERSION_VALUE)
python3 -m pip install --user --upgrade setuptools wheel python3 -m pip install --user --upgrade setuptools wheel
wget -q https://aka.ms/nni/nodejs-download/$(OS_SPEC) -O $(CWD)node-$(OS_SPEC)-x64.tar.xz wget -q https://aka.ms/nni/nodejs-download/$(OS_SPEC) -O $(CWD)node-$(OS_SPEC)-x64.tar.xz
rm -rf $(CWD)node-$(OS_SPEC)-x64 rm -rf $(CWD)node-$(OS_SPEC)-x64
......
...@@ -15,6 +15,7 @@ else{ ...@@ -15,6 +15,7 @@ else{
$TIME_STAMP = date -u "+%y%m%d%H%M" $TIME_STAMP = date -u "+%y%m%d%H%M"
$NNI_VERSION_VALUE = git describe --tags --abbrev=0 $NNI_VERSION_VALUE = git describe --tags --abbrev=0
$NNI_VERSION_VALUE = $NNI_VERSION_VALUE.substring(1)
# To include time stamp in version value, run: # To include time stamp in version value, run:
# make version_ts=true build # make version_ts=true build
......
# ChangeLog # ChangeLog
## Release 1.6 - 5/26/2020
### Major Features
#### New Features and improvement
* Improve IPC limitation to 100W
* Improve code storage upload logic among trials in non-local platform
* Support `__version__` for SDK version
* Support Windows dev install
#### Web UI
* Show trial error message
* Finalize homepage layout
* Refactor overview's best trials module
* Remove multiphase from WebUI
* Add tooltip for trial concurrency in the overview page
* Show top trials for hyper-parameter graph
#### HPO Updates
* Improve PBT on failure handling and support experiment resume for PBT
#### NAS Updates
* NAS support for TensorFlow 2.0 (preview) [TF2.0 NAS examples](https://github.com/microsoft/nni/tree/master/examples/nas/naive-tf)
* Use OrderedDict for LayerChoice
* Prettify the format of export
* Replace layer choice with selected module after applied fixed architecture
#### Model Compression Updates
* Model compression PyTorch 1.4 support
#### Training Service Updates
* Update PAI yaml merge logic
* Support Windows as a remote machine in remote mode [Remote Mode](https://github.com/microsoft/nni/blob/master/docs/en_US/TrainingService/RemoteMachineMode.md#windows)
### Bug Fix
* Fix dev install
* SPOS example crash when the checkpoints do not have state_dict
* Fix table sort issue when experiment had failed trial
* Support multi python env (conda, pyenv etc)
## Release 1.5 - 4/13/2020 ## Release 1.5 - 4/13/2020
### New Features and Documentation ### New Features and Documentation
......
...@@ -2,7 +2,6 @@ ...@@ -2,7 +2,6 @@
# Licensed under the MIT license. # Licensed under the MIT license.
import tensorflow as tf import tensorflow as tf
from tensorflow.data import Dataset
def get_dataset(): def get_dataset():
(x_train, y_train), (x_valid, y_valid) = tf.keras.datasets.cifar10.load_data() (x_train, y_train), (x_valid, y_valid) = tf.keras.datasets.cifar10.load_data()
......
...@@ -566,7 +566,7 @@ class NNIManager implements Manager { ...@@ -566,7 +566,7 @@ class NNIManager implements Manager {
assert(this.status.status === 'RUNNING' || assert(this.status.status === 'RUNNING' ||
this.status.status === 'DONE' || this.status.status === 'DONE' ||
this.status.status === 'NO_MORE_TRIAL' || this.status.status === 'NO_MORE_TRIAL' ||
this.status.status === 'TUNER_NO_MORE_TRIAL'); this.status.status === 'TUNER_NO_MORE_TRIAL', `Actual status: ${this.status.status}`);
if (this.experimentProfile.execDuration > this.experimentProfile.params.maxExecDuration || if (this.experimentProfile.execDuration > this.experimentProfile.params.maxExecDuration ||
this.currSubmittedTrialNum >= this.experimentProfile.params.maxTrialNum) { this.currSubmittedTrialNum >= this.experimentProfile.params.maxTrialNum) {
if (this.status.status !== 'DONE') { if (this.status.status !== 'DONE') {
......
...@@ -47,10 +47,10 @@ export NNI_EXP_ID={4} ...@@ -47,10 +47,10 @@ export NNI_EXP_ID={4}
export NNI_CODE_DIR={5} export NNI_CODE_DIR={5}
export NNI_TRIAL_SEQ_ID={6} export NNI_TRIAL_SEQ_ID={6}
{7} {7}
mkdir -p $NNI_SYS_DIR mkdir -p $NNI_SYS_DIR/code
mkdir -p $NNI_OUTPUT_DIR mkdir -p $NNI_OUTPUT_DIR
cp -r $NNI_CODE_DIR/. $NNI_SYS_DIR cp -r $NNI_CODE_DIR/. $NNI_SYS_DIR/code
cd $NNI_SYS_DIR sh $NNI_SYS_DIR/install_nni.sh
sh install_nni.sh cd $NNI_SYS_DIR/code
python3 -m nni_trial_tool.trial_keeper --trial_command '{8}' --nnimanager_ip {9} --nnimanager_port {10} \ python3 -m nni_trial_tool.trial_keeper --trial_command '{8}' --nnimanager_ip {9} --nnimanager_port {10} \
--nni_manager_version '{11}' --log_collection '{12}' 1>$NNI_OUTPUT_DIR/trialkeeper_stdout 2>$NNI_OUTPUT_DIR/trialkeeper_stderr`; --nni_manager_version '{11}' --log_collection '{12}' 1>$NNI_OUTPUT_DIR/trialkeeper_stdout 2>$NNI_OUTPUT_DIR/trialkeeper_stderr`;
...@@ -477,16 +477,14 @@ class LocalTrainingService implements TrainingService { ...@@ -477,16 +477,14 @@ class LocalTrainingService implements TrainingService {
private getScript(localTrialConfig: TrialConfig, workingDirectory: string): string[] { private getScript(localTrialConfig: TrialConfig, workingDirectory: string): string[] {
const script: string[] = []; const script: string[] = [];
if (process.platform === 'win32') { if (process.platform === 'win32') {
script.push(`Copy-Item $env:NNI_CODE_DIR\\* -Destination $env:NNI_SYS_DIR -Recurse`); script.push(`cd $env:NNI_CODE_DIR`);
script.push(`cd $env:NNI_SYS_DIR`);
script.push( script.push(
`cmd.exe /c ${localTrialConfig.command} 2>"${path.join(workingDirectory, 'stderr')}"`, `cmd.exe /c ${localTrialConfig.command} 2>"${path.join(workingDirectory, 'stderr')}"`,
`$NOW_DATE = [int64](([datetime]::UtcNow)-(get-date "1/1/1970")).TotalSeconds`, `$NOW_DATE = [int64](([datetime]::UtcNow)-(get-date "1/1/1970")).TotalSeconds`,
`$NOW_DATE = "$NOW_DATE" + (Get-Date -Format fff).ToString()`, `$NOW_DATE = "$NOW_DATE" + (Get-Date -Format fff).ToString()`,
`Write $LASTEXITCODE " " $NOW_DATE | Out-File "${path.join(workingDirectory, '.nni', 'state')}" -NoNewline -encoding utf8`); `Write $LASTEXITCODE " " $NOW_DATE | Out-File "${path.join(workingDirectory, '.nni', 'state')}" -NoNewline -encoding utf8`);
} else { } else {
script.push(`cp -r $NNI_CODE_DIR/. $NNI_SYS_DIR`); script.push(`cd $NNI_CODE_DIR`);
script.push(`cd $NNI_SYS_DIR`);
script.push(`eval ${localTrialConfig.command} 2>"${path.join(workingDirectory, 'stderr')}"`); script.push(`eval ${localTrialConfig.command} 2>"${path.join(workingDirectory, 'stderr')}"`);
if (process.platform === 'darwin') { if (process.platform === 'darwin') {
// https://superuser.com/questions/599072/how-to-get-bash-execution-time-in-milliseconds-under-mac-os-x // https://superuser.com/questions/599072/how-to-get-bash-execution-time-in-milliseconds-under-mac-os-x
......
...@@ -31,6 +31,6 @@ fi`; ...@@ -31,6 +31,6 @@ fi`;
export const PAI_K8S_TRIAL_COMMAND_FORMAT: string = export const PAI_K8S_TRIAL_COMMAND_FORMAT: string =
`export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} NNI_TRIAL_SEQ_ID={4} MULTI_PHASE={5} \ `export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} NNI_TRIAL_SEQ_ID={4} MULTI_PHASE={5} \
&& NNI_CODE_DIR={6} && cp -r $NNI_CODE_DIR/. $NNI_SYS_DIR && cd $NNI_SYS_DIR && sh install_nni.sh \ && NNI_CODE_DIR={6} && mkdir -p $NNI_SYS_DIR/code && cp -r $NNI_CODE_DIR/. $NNI_SYS_DIR/code && sh $NNI_SYS_DIR/install_nni.sh \
&& python3 -m nni_trial_tool.trial_keeper --trial_command '{7}' --nnimanager_ip '{8}' --nnimanager_port '{9}' \ && cd $NNI_SYS_DIR/code && python3 -m nni_trial_tool.trial_keeper --trial_command '{7}' --nnimanager_ip '{8}' --nnimanager_port '{9}' \
--nni_manager_version '{10}' --log_collection '{11}'`; --nni_manager_version '{10}' --log_collection '{11}'`;
...@@ -54,7 +54,7 @@ const yaml = require('js-yaml'); ...@@ -54,7 +54,7 @@ const yaml = require('js-yaml');
class PAIK8STrainingService extends PAITrainingService { class PAIK8STrainingService extends PAITrainingService {
protected paiTrialConfig: NNIPAIK8STrialConfig | undefined; protected paiTrialConfig: NNIPAIK8STrialConfig | undefined;
private copyExpCodeDirPromise?: Promise<void>; private copyExpCodeDirPromise?: Promise<void>;
private paiJobConfig: undefined; private paiJobConfig: any;
private nniVersion: string | undefined; private nniVersion: string | undefined;
constructor() { constructor() {
super(); super();
...@@ -190,7 +190,7 @@ class PAIK8STrainingService extends PAITrainingService { ...@@ -190,7 +190,7 @@ class PAIK8STrainingService extends PAITrainingService {
let nniJobConfig: any = undefined; let nniJobConfig: any = undefined;
if (this.paiTrialConfig.paiConfigPath) { if (this.paiTrialConfig.paiConfigPath) {
nniJobConfig = this.paiJobConfig; nniJobConfig = JSON.parse(JSON.stringify(this.paiJobConfig)); //Trick for deep clone in Typescript
nniJobConfig.name = jobName; nniJobConfig.name = jobName;
// Each taskRole will generate new command in NNI's command format // Each taskRole will generate new command in NNI's command format
// Each command will be formatted to NNI style // Each command will be formatted to NNI style
...@@ -290,8 +290,6 @@ class PAIK8STrainingService extends PAITrainingService { ...@@ -290,8 +290,6 @@ class PAIK8STrainingService extends PAITrainingService {
await this.writeParameterFile(trialJobDetail.logPath, trialJobDetail.form.hyperParameters); await this.writeParameterFile(trialJobDetail.logPath, trialJobDetail.form.hyperParameters);
} }
//Copy codeDir files to local working folder
await execCopydir(this.paiTrialConfig.codeDir, trialJobDetail.logPath);
//Generate Job Configuration in yaml format //Generate Job Configuration in yaml format
const paiJobConfig = this.generateJobConfigInYamlFormat(trialJobDetail); const paiJobConfig = this.generateJobConfigInYamlFormat(trialJobDetail);
this.log.debug(paiJobConfig); this.log.debug(paiJobConfig);
......
...@@ -22,10 +22,10 @@ class LinuxCommands extends OsCommands { ...@@ -22,10 +22,10 @@ class LinuxCommands extends OsCommands {
export NNI_PLATFORM=remote NNI_SYS_DIR=${workingDirectory} NNI_OUTPUT_DIR=${workingDirectory} NNI_TRIAL_JOB_ID=${trialJobId} \ export NNI_PLATFORM=remote NNI_SYS_DIR=${workingDirectory} NNI_OUTPUT_DIR=${workingDirectory} NNI_TRIAL_JOB_ID=${trialJobId} \
NNI_EXP_ID=${experimentId} NNI_TRIAL_SEQ_ID=${trialSequenceId} NNI_CODE_DIR=${codeDir} NNI_EXP_ID=${experimentId} NNI_TRIAL_SEQ_ID=${trialSequenceId} NNI_CODE_DIR=${codeDir}
export MULTI_PHASE=${isMultiPhase} export MULTI_PHASE=${isMultiPhase}
mkdir -p $NNI_SYS_DIR/code
cp -r $NNI_CODE_DIR/. $NNI_SYS_DIR cp -r $NNI_CODE_DIR/. $NNI_SYS_DIR/code
cd $NNI_SYS_DIR sh $NNI_SYS_DIR/install_nni.sh
sh install_nni.sh cd $NNI_SYS_DIR/code
python3 -m nni_trial_tool.trial_keeper --trial_command '${cudaVisibleSetting} ${command}' --nnimanager_ip '${nniManagerAddress}' \ python3 -m nni_trial_tool.trial_keeper --trial_command '${cudaVisibleSetting} ${command}' --nnimanager_ip '${nniManagerAddress}' \
--nnimanager_port '${nniManagerPort}' --nni_manager_version '${nniManagerVersion}' \ --nnimanager_port '${nniManagerPort}' --nni_manager_version '${nniManagerVersion}' \
--job_id_file ${jobIdFileName} \ --job_id_file ${jobIdFileName} \
...@@ -93,9 +93,9 @@ class LinuxCommands extends OsCommands { ...@@ -93,9 +93,9 @@ class LinuxCommands extends OsCommands {
return result; return result;
} }
public killChildProcesses(pidFileName: string): string { public killChildProcesses(pidFileName: string, killSelf: boolean): string {
// prevent trialkeeper to be killed, so it can save exit code. // prevent trialkeeper to be killed, so it can save exit code.
const command = `list_descendants () let command = `list_descendants ()
{ {
local children=$(ps -o pid= --ppid "$1") local children=$(ps -o pid= --ppid "$1")
...@@ -107,6 +107,9 @@ class LinuxCommands extends OsCommands { ...@@ -107,6 +107,9 @@ class LinuxCommands extends OsCommands {
echo "$children" echo "$children"
} }
kill $(list_descendants \`cat '${pidFileName}'\`)` kill $(list_descendants \`cat '${pidFileName}'\`)`
if (killSelf) {
command += `\nkill \`cat '${pidFileName}'\``
}
return command; return command;
} }
......
...@@ -28,9 +28,9 @@ class WindowsCommands extends OsCommands { ...@@ -28,9 +28,9 @@ class WindowsCommands extends OsCommands {
set MULTI_PHASE=${isMultiPhase} set MULTI_PHASE=${isMultiPhase}
set NNI_CODE_DIR=${codeDir} set NNI_CODE_DIR=${codeDir}
${cudaVisibleSetting !== "" ? "set " + cudaVisibleSetting : ""} ${cudaVisibleSetting !== "" ? "set " + cudaVisibleSetting : ""}
md %NNI_SYS_DIR%/code
robocopy /s %NNI_CODE_DIR%/. %NNI_SYS_DIR% robocopy /s %NNI_CODE_DIR%/. %NNI_SYS_DIR%/code
cd %NNI_SYS_DIR% cd %NNI_SYS_DIR%/code
python -c "import nni" 2>nul python -c "import nni" 2>nul
if not %ERRORLEVEL% EQU 0 ( if not %ERRORLEVEL% EQU 0 (
echo installing NNI as exit code of "import nni" is %ERRORLEVEL% echo installing NNI as exit code of "import nni" is %ERRORLEVEL%
...@@ -102,11 +102,14 @@ class WindowsCommands extends OsCommands { ...@@ -102,11 +102,14 @@ class WindowsCommands extends OsCommands {
return result; return result;
} }
public killChildProcesses(pidFileName: string): string { public killChildProcesses(pidFileName: string, killSelf: boolean): string {
const command = `powershell "$ppid=(type ${pidFileName}); function Kill-Tree {Param([int]$subppid);` + let command = `powershell "$ppid=(type ${pidFileName}); function Kill-Tree {Param([int]$subppid);` +
`Get-CimInstance Win32_Process | Where-Object { $_.ParentProcessId -eq $subppid } | ForEach-Object { Kill-Tree $_.ProcessId }; ` + `Get-CimInstance Win32_Process | Where-Object { $_.ParentProcessId -eq $subppid } | ForEach-Object { Kill-Tree $_.ProcessId }; ` +
`if ($subppid -ne $ppid){Stop-Process -Id $subppid}}` + `if ($subppid -ne $ppid){Stop-Process -Id $subppid -Force"}}` +
`kill-tree $ppid"`; `kill-tree $ppid"`;
if (killSelf){
command += `;Stop-Process -Id $ppid`;
}
return command; return command;
} }
......
...@@ -25,7 +25,7 @@ abstract class OsCommands { ...@@ -25,7 +25,7 @@ abstract class OsCommands {
public abstract readLastLines(fileName: string, lineCount: number): string; public abstract readLastLines(fileName: string, lineCount: number): string;
public abstract isProcessAliveCommand(pidFileName: string): string; public abstract isProcessAliveCommand(pidFileName: string): string;
public abstract isProcessAliveProcessOutput(result: RemoteCommandResult): boolean; public abstract isProcessAliveProcessOutput(result: RemoteCommandResult): boolean;
public abstract killChildProcesses(pidFileName: string): string; public abstract killChildProcesses(pidFileName: string, killSelf: boolean): string;
public abstract extractFile(tarFileName: string, targetFolder: string): string; public abstract extractFile(tarFileName: string, targetFolder: string): string;
public abstract executeScript(script: string, isFile: boolean): string; public abstract executeScript(script: string, isFile: boolean): string;
......
...@@ -96,8 +96,8 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -96,8 +96,8 @@ class RemoteMachineTrainingService implements TrainingService {
} }
} }
if (restServer.getErrorMessage !== undefined) { if (restServer.getErrorMessage !== undefined) {
throw new Error(restServer.getErrorMessage);
this.stopping = true; this.stopping = true;
throw new Error(restServer.getErrorMessage);
} }
await delay(3000); await delay(3000);
} }
...@@ -394,7 +394,7 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -394,7 +394,7 @@ class RemoteMachineTrainingService implements TrainingService {
if (executor !== undefined) { if (executor !== undefined) {
this.log.info(`killing gpu metric collector on ${executor.name}`); this.log.info(`killing gpu metric collector on ${executor.name}`);
const gpuJobPidPath: string = executor.joinPath(executor.getRemoteScriptsPath(getExperimentId()), 'pid'); const gpuJobPidPath: string = executor.joinPath(executor.getRemoteScriptsPath(getExperimentId()), 'pid');
await executor.killChildProcesses(gpuJobPidPath); await executor.killChildProcesses(gpuJobPidPath, true);
} }
executorManager.releaseAllExecutor(); executorManager.releaseAllExecutor();
} }
...@@ -460,6 +460,10 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -460,6 +460,10 @@ class RemoteMachineTrainingService implements TrainingService {
this.timer.unsubscribe(disposable); this.timer.unsubscribe(disposable);
} }
} }
if (this.stopping){
this.timer.unsubscribe(disposable);
this.log.debug(`Stopped GPU collector on ${rmMeta.ip}, since experiment is exiting.`);
}
collectingCount.pop(); collectingCount.pop();
} }
} }
......
...@@ -230,8 +230,8 @@ class ShellExecutor { ...@@ -230,8 +230,8 @@ class ShellExecutor {
return result !== undefined ? result : false; return result !== undefined ? result : false;
} }
public async killChildProcesses(pidFileName: string): Promise<boolean> { public async killChildProcesses(pidFileName: string, killSelf: boolean = false): Promise<boolean> {
const commandText = this.osCommands && this.osCommands.killChildProcesses(pidFileName); const commandText = this.osCommands && this.osCommands.killChildProcesses(pidFileName, killSelf);
const commandResult = await this.execute(commandText); const commandResult = await this.execute(commandText);
return commandResult.exitCode == 0; return commandResult.exitCode == 0;
} }
......
...@@ -4,7 +4,6 @@ ...@@ -4,7 +4,6 @@
import logging import logging
import tensorflow as tf import tensorflow as tf
from tensorflow.data import Dataset
from tensorflow.keras.optimizers import Adam from tensorflow.keras.optimizers import Adam
from nni.nas.tensorflow.utils import AverageMeterGroup, fill_zero_grads from nni.nas.tensorflow.utils import AverageMeterGroup, fill_zero_grads
...@@ -39,9 +38,9 @@ class EnasTrainer: ...@@ -39,9 +38,9 @@ class EnasTrainer:
x, y = dataset_train x, y = dataset_train
split = int(len(x) * 0.9) split = int(len(x) * 0.9)
self.train_set = Dataset.from_tensor_slices((x[:split], y[:split])) self.train_set = tf.data.Dataset.from_tensor_slices((x[:split], y[:split]))
self.valid_set = Dataset.from_tensor_slices((x[split:], y[split:])) self.valid_set = tf.data.Dataset.from_tensor_slices((x[split:], y[split:]))
self.test_set = Dataset.from_tensor_slices(dataset_valid) self.test_set = tf.data.Dataset.from_tensor_slices(dataset_valid)
self.mutator = EnasMutator(model) self.mutator = EnasMutator(model)
self.mutator_optim = Adam(learning_rate=mutator_lr) self.mutator_optim = Adam(learning_rate=mutator_lr)
...@@ -151,9 +150,9 @@ class EnasTrainer: ...@@ -151,9 +150,9 @@ class EnasTrainer:
def _create_train_loader(self): def _create_train_loader(self):
train_set = self.train_set.shuffle(1000000).batch(self.batch_size) train_set = self.train_set.shuffle(1000000).repeat().batch(self.batch_size)
test_set = self.test_set.shuffle(1000000).batch(self.batch_size) test_set = self.test_set.shuffle(1000000).repeat().batch(self.batch_size)
return iter(train_set), iter(test_set) return iter(train_set), iter(test_set)
def _create_validate_loader(self): def _create_validate_loader(self):
return iter(self.test_set.shuffle(1000000).batch(self.batch_size)) return iter(self.test_set.shuffle(1000000).repeat().batch(self.batch_size))
...@@ -11,31 +11,9 @@ import traceback ...@@ -11,31 +11,9 @@ import traceback
from xml.dom import minidom from xml.dom import minidom
def check_ready_to_run():
if sys.platform == 'win32':
pgrep_output = subprocess.check_output(
'wmic process where "CommandLine like \'%nni_gpu_tool.gpu_metrics_collector%\' and name like \'%python%\'" get processId')
pidList = pgrep_output.decode("utf-8").strip().split()
pidList.pop(0) # remove the key word 'ProcessId'
pidList = list(map(int, pidList))
pidList.remove(os.getpid())
return not pidList
else:
pgrep_output = subprocess.check_output('pgrep -afu "$(whoami)" \'python3 -m nni_gpu_tool.gpu_metrics_collector\'', shell=True)
pidList = []
for pid in pgrep_output.splitlines():
pid = pid.decode()
if "pgrep " in pid or pid.startswith('%s ' % os.getpid()) or pid.startswith('%s ' % os.getppid()):
continue
pidList.append(pid)
return not pidList
def main(argv): def main(argv):
metrics_output_dir = os.environ['METRIC_OUTPUT_DIR'] metrics_output_dir = os.environ['METRIC_OUTPUT_DIR']
if check_ready_to_run() == False:
print("GPU metrics collector is already running. exiting...")
exit(2)
cmd = 'nvidia-smi -q -x'.split() cmd = 'nvidia-smi -q -x'.split()
while(True): while(True):
try: try:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment