Unverified Commit 0663218b authored by SparkSnail's avatar SparkSnail Committed by GitHub
Browse files

Merge pull request #163 from Microsoft/master

merge master
parents 6c9360a5 cf983800
...@@ -43,7 +43,7 @@ def init_params(net): ...@@ -43,7 +43,7 @@ def init_params(net):
term_width = 0 term_width = 0
try: try:
_, term_width = os.popen('stty size', 'r').read().split() term_width = os.get_terminal_size().columns
except Exception as exception: except Exception as exception:
term_width = 200 term_width = 200
term_width = int(term_width) term_width = int(term_width)
......
# Use TLS 1.2 for the web requests issued by this script (see the WebClient downloads below).
[Net.ServicePointManager]::SecurityProtocol = [Net.SecurityProtocolType]::Tls12
# Install switches. NOTE(review): only $install_node is tested in the visible part of this
# script; $install_yarn is assigned here but not referenced afterwards - confirm.
$install_node = $true
$install_yarn = $true
# nodejs
# Download locations for the Node.js zip archive and the Yarn tarball.
$nodeUrl = "https://aka.ms/nni/nodejs-download/win64"
$yarnUrl = "https://yarnpkg.com/latest.tar.gz"
# Wildcard patterns matching the directory names produced when the archives are unpacked.
$unzipNodeDir = "node-v*"
$unzipYarnDir = "yarn-v*"
# Per-user working folder that holds the downloaded archives and the unpacked tools.
$NNI_DEPENDENCY_FOLDER = "C:\tmp\$env:USERNAME"
# Verify that python is on PATH and that its version is at least 3.5.
$WHICH_PYTHON = where.exe python
if($WHICH_PYTHON -eq $null){
    throw "Can not find python"
}
else{
    # `python -V` prints e.g. "Python 3.10.4" (older interpreters print it on stderr,
    # hence the 2>&1 redirect).
    $pyVersion = & python -V 2>&1
    # Extract "major.minor" with a regex instead of a fixed-width substring: the old
    # substring(7,3) turned "Python 3.10.4" into "3.1", which then compared as < 3.5.
    if(([string]$pyVersion) -match '(\d+)\.(\d+)'){
        $pyVersion = "{0}.{1}" -f $Matches[1], $Matches[2]
    }
    else{
        throw "Can not determine python version"
    }
    # Compare as System.Version so that 3.10 sorts after 3.5 (a [double] compare does not).
    if([version]$pyVersion -lt [version]"3.5"){
        throw "python version should >= 3.5"
    }
}
# Abort early when pip is not on PATH.
$WHICH_PIP = where.exe pip
if($WHICH_PIP -eq $null){
throw "Can not find pip"
}
$env:PYTHONIOENCODING = "UTF-8"
# Resolve the python location, the NNI package folder and the Scripts folder.
# Inside a virtual environment all three are derived from $env:VIRTUAL_ENV.
if($env:VIRTUAL_ENV){
$NNI_PYTHON3 = $env:VIRTUAL_ENV + "\Scripts"
$NNI_PKG_FOLDER = $env:VIRTUAL_ENV + "\nni"
$NNI_PYTHON_SCRIPTS = $NNI_PYTHON3
}
else{
# Outside a virtual environment, ask python for the first entry of site.getsitepackages().
$NNI_PYTHON3 = $(python -c 'import site; from pathlib import Path; print(Path(site.getsitepackages()[0]))')
$NNI_PKG_FOLDER = $NNI_PYTHON3 + "\nni"
$NNI_PYTHON_SCRIPTS = $NNI_PYTHON3 + "\Scripts"
}
# Command line used later (through cmd /c) to pip-install the package in the current directory.
$PIP_INSTALL = """$NNI_PYTHON3\python"" -m pip install ."
# Make sure the dependency working folder exists.
if(!(Test-Path $NNI_DEPENDENCY_FOLDER)){
New-Item $NNI_DEPENDENCY_FOLDER -ItemType Directory
}
# Locations of the downloaded archives and of the unpacked node / yarn folders.
$NNI_NODE_ZIP = $NNI_DEPENDENCY_FOLDER+"\nni-node.zip"
$NNI_NODE_FOLDER = $NNI_DEPENDENCY_FOLDER+"\nni-node"
$NNI_YARN_TARBALL = $NNI_DEPENDENCY_FOLDER+"\nni-yarn.tar.gz"
$NNI_YARN_FOLDER = $NNI_DEPENDENCY_FOLDER+"\nni-yarn"
$NNI_YARN = $NNI_YARN_FOLDER +"\bin\yarn"
## Version number
# The value from `git describe --tags` replaces the template string in setup.py and package.json below.
$NNI_VERSION_VALUE = $(git describe --tags)
$NNI_VERSION_TEMPLATE = "999.0.0-developing"
# Download the Node.js archive unless a previous run already left it in place.
if(!(Test-Path $NNI_NODE_ZIP)){
Write-Host "Downloading Node..."
(New-Object Net.WebClient).DownloadFile($nodeUrl, $NNI_NODE_ZIP)
}
# Download the Yarn tarball unless a previous run already left it in place.
if(!(Test-Path $NNI_YARN_TARBALL)){
Write-Host "Downloading Yarn..."
(New-Object Net.WebClient).DownloadFile($yarnUrl, $NNI_YARN_TARBALL)
}
# Double every backslash in both paths: they are embedded below inside double-quoted
# Python string literals, where a single backslash would start an escape sequence.
# NOTE: from here on both variables hold the doubled-backslash form.
$NNI_YARN_TARBALL = $NNI_YARN_TARBALL -split '\\' -join '\\'
$NNI_DEPENDENCY_FOLDER = $NNI_DEPENDENCY_FOLDER -split '\\' -join '\\'
# Generate a small Python helper (extract.py) that unpacks the Yarn tarball into the
# dependency folder using the tarfile module.
$SCRIPT_PATH = $NNI_DEPENDENCY_FOLDER + '\extract.py'
$SCRIPT = "import tarfile",
("tar = tarfile.open(""{0}"")" -f $NNI_YARN_TARBALL),
("tar.extractall(""{0}"")" -f $NNI_DEPENDENCY_FOLDER),
"tar.close()"
[System.IO.File]::WriteAllLines($SCRIPT_PATH, $SCRIPT)
# Load the assembly that provides System.IO.Compression.ZipFile, used by Unzip below.
Add-Type -AssemblyName System.IO.Compression.FileSystem
# Extract the zip archive $zipfile into the directory $outpath.
function Unzip{
param([string]$zipfile, [string]$outpath)
[System.IO.Compression.ZipFile]::ExtractToDirectory($zipfile, $outpath)
}
# Unpack Node.js and Yarn into the dependency folder (both steps are guarded by $install_node).
if ($install_node) {
### nodejs install
# Unpack only when the renamed target folder does not exist yet.
if(!(Test-Path $NNI_NODE_FOLDER)){
Unzip $NNI_NODE_ZIP $NNI_DEPENDENCY_FOLDER
# The archive unpacks to a directory matching "node-v*"; rename it to the fixed name "nni-node".
$unzipNodeDir = Get-ChildItem "$NNI_DEPENDENCY_FOLDER\$unzipNodeDir"
Rename-Item $unzipNodeDir "nni-node"
}
# Copy node.exe into the Scripts folder; that folder is prepended to PATH before the builds below.
Copy-Item "$NNI_NODE_FOLDER\node.exe" $NNI_PYTHON_SCRIPTS -Recurse -Force
### yarn install
if(!(Test-Path $NNI_YARN_FOLDER)){
# Run the generated extract.py with python to unpack the Yarn tarball.
cmd /C """$NNI_PYTHON3\python""" $SCRIPT_PATH
# The tarball unpacks to a directory matching "yarn-v*"; rename it to the fixed name "nni-yarn".
$unzipYarnDir = Get-ChildItem "$NNI_DEPENDENCY_FOLDER\$unzipYarnDir"
Rename-Item $unzipYarnDir "nni-yarn"
}
}
## install-python-modules:
### Installing Python SDK
# Stamp the real version into setup.py (in place), then pip-install the current directory.
(Get-Content setup.py).replace($NNI_VERSION_TEMPLATE, $NNI_VERSION_VALUE) | Set-Content setup.py
cmd /c $PIP_INSTALL
# Building NNI Manager
# Prepend the Scripts folder (where node.exe was copied above) to PATH for the yarn builds.
$env:PATH=$NNI_PYTHON_SCRIPTS+';'+$env:PATH
cd src\nni_manager
cmd /c $NNI_YARN
cmd /c $NNI_YARN build
# Copy the config folder into the build output.
Copy-Item config -Destination .\dist\ -Recurse -Force
# Building WebUI
cd ..\webui
cmd /c $NNI_YARN
cmd /c $NNI_YARN build
cd ..\..
## install-node-modules
# Create the package folder if missing and then delete it recursively, so that no
# package folder exists when the build output is copied to that path below.
if(!(Test-Path $NNI_PKG_FOLDER)){
New-Item $NNI_PKG_FOLDER -ItemType Directory
}
Remove-Item $NNI_PKG_FOLDER -Recurse -Force
Copy-Item "src\nni_manager\dist" $NNI_PKG_FOLDER -Recurse
Copy-Item "src\nni_manager\package.json" $NNI_PKG_FOLDER
# Stamp the real version into the copied package.json.
$PKG_JSON = $NNI_PKG_FOLDER + "\package.json"
(Get-Content $PKG_JSON).replace($NNI_VERSION_TEMPLATE, $NNI_VERSION_VALUE) | Set-Content $PKG_JSON
# Run yarn with --prod inside the package folder.
cmd /c $NNI_YARN --prod --cwd $NNI_PKG_FOLDER
# Copy the WebUI build output to the package's "static" folder.
$NNI_PKG_FOLDER_STATIC = $NNI_PKG_FOLDER + "\static"
Copy-Item "src\webui\build" $NNI_PKG_FOLDER_STATIC -Recurse
...@@ -51,11 +51,12 @@ setup( ...@@ -51,11 +51,12 @@ setup(
'json_tricks', 'json_tricks',
'numpy', 'numpy',
'psutil', 'psutil',
'pyyaml', 'ruamel.yaml',
'requests', 'requests',
'scipy', 'scipy',
'schema', 'schema',
'PythonWebHDFS' 'PythonWebHDFS',
'colorama'
], ],
entry_points = { entry_points = {
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
import { ExperimentProfile, TrialJobStatistics } from './manager'; import { ExperimentProfile, TrialJobStatistics } from './manager';
import { TrialJobDetail, TrialJobStatus } from './trainingService'; import { TrialJobDetail, TrialJobStatus } from './trainingService';
type TrialJobEvent = TrialJobStatus | 'USER_TO_CANCEL' | 'ADD_CUSTOMIZED' | 'ADD_HYPERPARAMETER'; type TrialJobEvent = TrialJobStatus | 'USER_TO_CANCEL' | 'ADD_CUSTOMIZED' | 'ADD_HYPERPARAMETER' | 'IMPORT_DATA';
type MetricType = 'PERIODICAL' | 'FINAL' | 'CUSTOM' | 'REQUEST_PARAMETER'; type MetricType = 'PERIODICAL' | 'FINAL' | 'CUSTOM' | 'REQUEST_PARAMETER';
interface ExperimentProfileRecord { interface ExperimentProfileRecord {
......
...@@ -99,6 +99,7 @@ abstract class Manager { ...@@ -99,6 +99,7 @@ abstract class Manager {
public abstract stopExperiment(): Promise<void>; public abstract stopExperiment(): Promise<void>;
public abstract getExperimentProfile(): Promise<ExperimentProfile>; public abstract getExperimentProfile(): Promise<ExperimentProfile>;
public abstract updateExperimentProfile(experimentProfile: ExperimentProfile, updateType: ProfileUpdateType): Promise<void>; public abstract updateExperimentProfile(experimentProfile: ExperimentProfile, updateType: ProfileUpdateType): Promise<void>;
public abstract importData(data: string): Promise<void>;
public abstract addCustomizedTrialJob(hyperParams: string): Promise<void>; public abstract addCustomizedTrialJob(hyperParams: string): Promise<void>;
public abstract cancelTrialJobByUser(trialJobId: string): Promise<void>; public abstract cancelTrialJobByUser(trialJobId: string): Promise<void>;
......
...@@ -22,6 +22,8 @@ ...@@ -22,6 +22,8 @@
import * as assert from 'assert'; import * as assert from 'assert';
import { randomBytes } from 'crypto'; import { randomBytes } from 'crypto';
import * as cpp from 'child-process-promise'; import * as cpp from 'child-process-promise';
import * as cp from 'child_process';
import { ChildProcess, spawn, StdioOptions } from 'child_process';
import * as fs from 'fs'; import * as fs from 'fs';
import * as os from 'os'; import * as os from 'os';
import * as path from 'path'; import * as path from 'path';
...@@ -32,6 +34,7 @@ import * as util from 'util'; ...@@ -32,6 +34,7 @@ import * as util from 'util';
import { Database, DataStore } from './datastore'; import { Database, DataStore } from './datastore';
import { ExperimentStartupInfo, getExperimentId, getExperimentStartupInfo, setExperimentStartupInfo } from './experimentStartupInfo'; import { ExperimentStartupInfo, getExperimentId, getExperimentStartupInfo, setExperimentStartupInfo } from './experimentStartupInfo';
import { Manager } from './manager'; import { Manager } from './manager';
import { TrialConfig } from '../training_service/common/trialConfig';
import { HyperParameters, TrainingService, TrialJobStatus } from './trainingService'; import { HyperParameters, TrainingService, TrialJobStatus } from './trainingService';
import { getLogger } from './log'; import { getLogger } from './log';
...@@ -146,6 +149,23 @@ function parseArg(names: string[]): string { ...@@ -146,6 +149,23 @@ function parseArg(names: string[]): string {
return ''; return '';
} }
/**
 * Serialize class arguments so they can be appended to a command line.
 *
 * On win32 the value is JSON-encoded once; on every other platform the JSON text
 * is JSON-encoded a second time, which wraps it in double quotes and escapes the
 * inner quotes.
 *
 * @param args value to serialize
 * @returns the encoded text
 */
function encodeCmdLineArgs(args:any):any{
    const jsonText = JSON.stringify(args);

    return process.platform === 'win32' ? jsonText : JSON.stringify(jsonText);
}
/**
 * Name of the Python executable to invoke: 'python' on win32, 'python3' elsewhere.
 */
function getCmdPy():string{
    return process.platform === 'win32' ? 'python' : 'python3';
}
/** /**
* Generate command line to start automl algorithm(s), * Generate command line to start automl algorithm(s),
* either start advisor or start a process which runs tuner and assessor * either start advisor or start a process which runs tuner and assessor
...@@ -179,8 +199,7 @@ function getMsgDispatcherCommand(tuner: any, assessor: any, advisor: any, multiP ...@@ -179,8 +199,7 @@ function getMsgDispatcherCommand(tuner: any, assessor: any, advisor: any, multiP
if (!tuner && !advisor) { if (!tuner && !advisor) {
throw new Error('Error: specify neither tuner nor advisor is not allowed'); throw new Error('Error: specify neither tuner nor advisor is not allowed');
} }
let command: string = `${getCmdPy()} -m nni`;
let command: string = `python3 -m nni`;
if (multiPhase) { if (multiPhase) {
command += ' --multi_phase'; command += ' --multi_phase';
} }
...@@ -192,7 +211,7 @@ function getMsgDispatcherCommand(tuner: any, assessor: any, advisor: any, multiP ...@@ -192,7 +211,7 @@ function getMsgDispatcherCommand(tuner: any, assessor: any, advisor: any, multiP
if (advisor) { if (advisor) {
command += ` --advisor_class_name ${advisor.className}`; command += ` --advisor_class_name ${advisor.className}`;
if (advisor.classArgs !== undefined) { if (advisor.classArgs !== undefined) {
command += ` --advisor_args ${JSON.stringify(JSON.stringify(advisor.classArgs))}`; command += ` --advisor_args ${encodeCmdLineArgs(advisor.classArgs)}`;
} }
if (advisor.codeDir !== undefined && advisor.codeDir.length > 1) { if (advisor.codeDir !== undefined && advisor.codeDir.length > 1) {
command += ` --advisor_directory ${advisor.codeDir}`; command += ` --advisor_directory ${advisor.codeDir}`;
...@@ -203,7 +222,7 @@ function getMsgDispatcherCommand(tuner: any, assessor: any, advisor: any, multiP ...@@ -203,7 +222,7 @@ function getMsgDispatcherCommand(tuner: any, assessor: any, advisor: any, multiP
} else { } else {
command += ` --tuner_class_name ${tuner.className}`; command += ` --tuner_class_name ${tuner.className}`;
if (tuner.classArgs !== undefined) { if (tuner.classArgs !== undefined) {
command += ` --tuner_args ${JSON.stringify(JSON.stringify(tuner.classArgs))}`; command += ` --tuner_args ${encodeCmdLineArgs(tuner.classArgs)}`;
} }
if (tuner.codeDir !== undefined && tuner.codeDir.length > 1) { if (tuner.codeDir !== undefined && tuner.codeDir.length > 1) {
command += ` --tuner_directory ${tuner.codeDir}`; command += ` --tuner_directory ${tuner.codeDir}`;
...@@ -215,7 +234,7 @@ function getMsgDispatcherCommand(tuner: any, assessor: any, advisor: any, multiP ...@@ -215,7 +234,7 @@ function getMsgDispatcherCommand(tuner: any, assessor: any, advisor: any, multiP
if (assessor !== undefined && assessor.className !== undefined) { if (assessor !== undefined && assessor.className !== undefined) {
command += ` --assessor_class_name ${assessor.className}`; command += ` --assessor_class_name ${assessor.className}`;
if (assessor.classArgs !== undefined) { if (assessor.classArgs !== undefined) {
command += ` --assessor_args ${JSON.stringify(JSON.stringify(assessor.classArgs))}`; command += ` --assessor_args ${encodeCmdLineArgs(assessor.classArgs)}`;
} }
if (assessor.codeDir !== undefined && assessor.codeDir.length > 1) { if (assessor.codeDir !== undefined && assessor.codeDir.length > 1) {
command += ` --assessor_directory ${assessor.codeDir}`; command += ` --assessor_directory ${assessor.codeDir}`;
...@@ -363,6 +382,83 @@ async function getVersion(): Promise<string> { ...@@ -363,6 +382,83 @@ async function getVersion(): Promise<string> {
return deferred.promise; return deferred.promise;
} }
/**
 * Start `command` as a child process.
 *
 * On win32 the command line is split on single spaces into an executable and its
 * arguments and spawned without a shell. On other platforms the whole string is
 * handed to a shell unchanged.
 *
 * @param command full command line to run
 * @param stdio stdio configuration passed to spawn
 * @param newCwd working directory of the child
 * @param newEnv environment of the child
 * @returns the spawned ChildProcess
 */
function getTunerProc(command: string, stdio: StdioOptions, newCwd: string, newEnv: any): ChildProcess{
    const onWindows: boolean = process.platform === "win32";
    // First space-delimited token on Windows, the untouched command line otherwise.
    const executable: string = onWindows ? command.split(" ", 1)[0] : command;
    // Everything after the executable, split on spaces (Windows only).
    const argv: string[] = onWindows ? command.slice(executable.length + 1).split(" ") : [];

    return spawn(executable, argv, {
        stdio,
        cwd: newCwd,
        env: newEnv,
        shell: !onWindows
    });
}
/**
 * Judge whether the process with the given pid is alive.
 *
 * The check sends signal 0 through `process.kill`, which performs the existence
 * test without delivering a signal and works on every platform. This avoids
 * spawning a shell (or blocking on PowerShell) for every poll and never puts the
 * pid into a command string.
 *
 * @param pid process id to probe; values that are not a positive integer are
 *            reported as not alive
 * @returns promise resolving to true when the process exists and can be
 *          signalled by the current user, false otherwise
 */
async function isAlive(pid:any): Promise<boolean>{
    const target: number = Number(pid);
    // pid <= 0 addresses process groups rather than a single process; treat it,
    // like NaN, as "no such process".
    if (!Number.isInteger(target) || target <= 0) {
        return false;
    }
    try {
        process.kill(target, 0);

        return true;
    } catch (error) {
        // Any failure (no such process, not permitted, ...) is reported as not
        // alive, matching the result of the former `kill -0` probe.
        return false;
    }
}
/**
 * Forcefully kill the process with the given pid (best effort).
 *
 * SIGKILL is sent through `process.kill` instead of shelling out to `kill -9` or
 * `taskkill`, so no extra process is spawned and the pid never ends up in a
 * command string. Failures are swallowed on purpose: the usual cause is that the
 * process has already exited.
 *
 * @param pid process id to kill; values that are not a positive integer are ignored
 * @returns promise that always resolves once the signal has been sent or skipped
 */
async function killPid(pid:any): Promise<void>{
    const target: number = Number(pid);
    // pid <= 0 would address process groups; never signal those from here.
    if (!Number.isInteger(target) || target <= 0) {
        return;
    }
    try {
        process.kill(target, 'SIGKILL');
    } catch (error) {
        // pid does not exist, do nothing here
    }
}
/**
 * Line terminator for the current platform: "\r\n" on win32, "\n" elsewhere.
 */
function getNewLine(): string{
    return process.platform === "win32" ? "\r\n" : "\n";
}
export {countFilesRecursively, getRemoteTmpDir, generateParamFileName, getMsgDispatcherCommand, getCheckpointDir, export {countFilesRecursively, getRemoteTmpDir, generateParamFileName, getMsgDispatcherCommand, getCheckpointDir,
getLogDir, getExperimentRootDir, getJobCancelStatus, getDefaultDatabaseDir, getIPV4Address, getLogDir, getExperimentRootDir, getJobCancelStatus, getDefaultDatabaseDir, getIPV4Address,
mkDirP, delay, prepareUnitTest, parseArg, cleanupUnitTest, uniqueString, randomSelect, getLogLevel, getVersion }; mkDirP, delay, prepareUnitTest, parseArg, cleanupUnitTest, uniqueString, randomSelect, getLogLevel, getVersion, getCmdPy, getTunerProc, isAlive, killPid, getNewLine };
...@@ -22,6 +22,7 @@ const INITIALIZE = 'IN'; ...@@ -22,6 +22,7 @@ const INITIALIZE = 'IN';
const REQUEST_TRIAL_JOBS = 'GE'; const REQUEST_TRIAL_JOBS = 'GE';
const REPORT_METRIC_DATA = 'ME'; const REPORT_METRIC_DATA = 'ME';
const UPDATE_SEARCH_SPACE = 'SS'; const UPDATE_SEARCH_SPACE = 'SS';
const IMPORT_DATA = 'FD'
const ADD_CUSTOMIZED_TRIAL_JOB = 'AD'; const ADD_CUSTOMIZED_TRIAL_JOB = 'AD';
const TRIAL_END = 'EN'; const TRIAL_END = 'EN';
const TERMINATE = 'TE'; const TERMINATE = 'TE';
...@@ -38,6 +39,7 @@ const TUNER_COMMANDS: Set<string> = new Set([ ...@@ -38,6 +39,7 @@ const TUNER_COMMANDS: Set<string> = new Set([
REQUEST_TRIAL_JOBS, REQUEST_TRIAL_JOBS,
REPORT_METRIC_DATA, REPORT_METRIC_DATA,
UPDATE_SEARCH_SPACE, UPDATE_SEARCH_SPACE,
IMPORT_DATA,
ADD_CUSTOMIZED_TRIAL_JOB, ADD_CUSTOMIZED_TRIAL_JOB,
TERMINATE, TERMINATE,
PING, PING,
...@@ -62,6 +64,7 @@ export { ...@@ -62,6 +64,7 @@ export {
REQUEST_TRIAL_JOBS, REQUEST_TRIAL_JOBS,
REPORT_METRIC_DATA, REPORT_METRIC_DATA,
UPDATE_SEARCH_SPACE, UPDATE_SEARCH_SPACE,
IMPORT_DATA,
ADD_CUSTOMIZED_TRIAL_JOB, ADD_CUSTOMIZED_TRIAL_JOB,
TRIAL_END, TRIAL_END,
TERMINATE, TERMINATE,
......
...@@ -35,10 +35,10 @@ import { ...@@ -35,10 +35,10 @@ import {
import { import {
TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric, TrialJobStatus TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric, TrialJobStatus
} from '../common/trainingService'; } from '../common/trainingService';
import { delay, getCheckpointDir, getExperimentRootDir, getLogDir, getMsgDispatcherCommand, mkDirP, getLogLevel } from '../common/utils'; import { delay, getCheckpointDir, getExperimentRootDir, getLogDir, getMsgDispatcherCommand, mkDirP, getTunerProc, getLogLevel, isAlive, killPid } from '../common/utils';
import { import {
ADD_CUSTOMIZED_TRIAL_JOB, INITIALIZE, INITIALIZED, KILL_TRIAL_JOB, NEW_TRIAL_JOB, NO_MORE_TRIAL_JOBS, PING, ADD_CUSTOMIZED_TRIAL_JOB, INITIALIZE, INITIALIZED, KILL_TRIAL_JOB, NEW_TRIAL_JOB, NO_MORE_TRIAL_JOBS, PING,
REPORT_METRIC_DATA, REQUEST_TRIAL_JOBS, SEND_TRIAL_JOB_PARAMETER, TERMINATE, TRIAL_END, UPDATE_SEARCH_SPACE REPORT_METRIC_DATA, REQUEST_TRIAL_JOBS, SEND_TRIAL_JOB_PARAMETER, TERMINATE, TRIAL_END, UPDATE_SEARCH_SPACE, IMPORT_DATA
} from './commands'; } from './commands';
import { createDispatcherInterface, IpcInterface } from './ipcInterface'; import { createDispatcherInterface, IpcInterface } from './ipcInterface';
...@@ -99,6 +99,17 @@ class NNIManager implements Manager { ...@@ -99,6 +99,17 @@ class NNIManager implements Manager {
return this.storeExperimentProfile(); return this.storeExperimentProfile();
} }
/**
 * Send imported data to the dispatcher and record the import in the data store.
 *
 * @param data payload forwarded with the IMPORT_DATA command and stored with the
 *             'IMPORT_DATA' trial job event
 * @returns the promise of the data store write; rejected with an Error when no
 *          dispatcher has been set up yet
 */
public importData(data: string): Promise<void> {
    if (this.dispatcher !== undefined) {
        this.dispatcher.sendCommand(IMPORT_DATA, data);

        // The event is stored with an empty string as its second argument.
        return this.dataStore.storeTrialJobEvent('IMPORT_DATA', '', data);
    }

    return Promise.reject(new Error('tuner has not been setup'));
}
public addCustomizedTrialJob(hyperParams: string): Promise<void> { public addCustomizedTrialJob(hyperParams: string): Promise<void> {
if (this.currSubmittedTrialNum >= this.experimentProfile.params.maxTrialNum) { if (this.currSubmittedTrialNum >= this.experimentProfile.params.maxTrialNum) {
return Promise.reject( return Promise.reject(
...@@ -290,12 +301,7 @@ class NNIManager implements Manager { ...@@ -290,12 +301,7 @@ class NNIManager implements Manager {
NNI_INCLUDE_INTERMEDIATE_RESULTS: includeIntermediateResultsEnv NNI_INCLUDE_INTERMEDIATE_RESULTS: includeIntermediateResultsEnv
}; };
let newEnv = Object.assign({}, process.env, nniEnv); let newEnv = Object.assign({}, process.env, nniEnv);
const tunerProc: ChildProcess = spawn(command, [], { const tunerProc: ChildProcess = getTunerProc(command,stdio,newCwd,newEnv);
stdio,
cwd: newCwd,
env: newEnv,
shell: true
});
this.dispatcherPid = tunerProc.pid; this.dispatcherPid = tunerProc.pid;
this.dispatcher = createDispatcherInterface(tunerProc); this.dispatcher = createDispatcherInterface(tunerProc);
...@@ -341,16 +347,10 @@ class NNIManager implements Manager { ...@@ -341,16 +347,10 @@ class NNIManager implements Manager {
// gracefully terminate tuner and assessor here, wait at most 30 seconds. // gracefully terminate tuner and assessor here, wait at most 30 seconds.
for (let i: number = 0; i < 30; i++) { for (let i: number = 0; i < 30; i++) {
if (!tunerAlive) { break; } if (!tunerAlive) { break; }
try { tunerAlive = await isAlive(this.dispatcherPid);
await cpp.exec(`kill -0 ${this.dispatcherPid}`);
} catch (error) { tunerAlive = false; }
await delay(1000); await delay(1000);
} }
try { await killPid(this.dispatcherPid);
await cpp.exec(`kill -9 ${this.dispatcherPid}`);
} catch (error) {
// this.tunerPid does not exist, do nothing here
}
const trialJobList: TrialJobDetail[] = await this.trainingService.listTrialJobs(); const trialJobList: TrialJobDetail[] = await this.trainingService.listTrialJobs();
// TO DO: to promise all // TO DO: to promise all
for (const trialJob of trialJobList) { for (const trialJob of trialJobList) {
......
...@@ -42,6 +42,7 @@ describe('Unit test for dataStore', () => { ...@@ -42,6 +42,7 @@ describe('Unit test for dataStore', () => {
}); });
after(() => { after(() => {
ds.close();
cleanupUnitTest(); cleanupUnitTest();
}); });
......
...@@ -18,11 +18,10 @@ ...@@ -18,11 +18,10 @@
*/ */
'use strict'; 'use strict';
import * as assert from 'assert'; import * as assert from 'assert';
import { ChildProcess, spawn, StdioOptions } from 'child_process'; import { ChildProcess, spawn, StdioOptions } from 'child_process';
import { Deferred } from 'ts-deferred'; import { Deferred } from 'ts-deferred';
import { cleanupUnitTest, prepareUnitTest } from '../../common/utils'; import { cleanupUnitTest, prepareUnitTest, getTunerProc, getCmdPy } from '../../common/utils';
import * as CommandType from '../commands'; import * as CommandType from '../commands';
import { createDispatcherInterface, IpcInterface } from '../ipcInterface'; import { createDispatcherInterface, IpcInterface } from '../ipcInterface';
import { NNIError } from '../../common/errors'; import { NNIError } from '../../common/errors';
...@@ -39,15 +38,21 @@ function runProcess(): Promise<Error | null> { ...@@ -39,15 +38,21 @@ function runProcess(): Promise<Error | null> {
// create fake assessor process // create fake assessor process
const stdio: StdioOptions = ['ignore', 'pipe', process.stderr, 'pipe', 'pipe']; const stdio: StdioOptions = ['ignore', 'pipe', process.stderr, 'pipe', 'pipe'];
const proc: ChildProcess = spawn('python3 assessor.py', [], { stdio, cwd: 'core/test', shell: true }); const command: string = getCmdPy() + ' assessor.py';
const proc: ChildProcess = getTunerProc(command, stdio, 'core/test', process.env);
// record its sent/received commands on exit // record its sent/received commands on exit
proc.on('error', (error: Error): void => { deferred.resolve(error); }); proc.on('error', (error: Error): void => { deferred.resolve(error); });
proc.on('exit', (code: number): void => { proc.on('exit', (code: number): void => {
if (code !== 0) { if (code !== 0) {
deferred.resolve(new Error(`return code: ${code}`)); deferred.resolve(new Error(`return code: ${code}`));
} else { } else {
sentCommands = proc.stdout.read().toString().split('\n'); let str = proc.stdout.read().toString();
if(str.search("\r\n")!=-1){
sentCommands = str.split("\r\n");
}
else{
sentCommands = str.split('\n');
}
deferred.resolve(null); deferred.resolve(null);
} }
}); });
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
import * as assert from 'assert'; import * as assert from 'assert';
import { ChildProcess, spawn, StdioOptions } from 'child_process'; import { ChildProcess, spawn, StdioOptions } from 'child_process';
import { Deferred } from 'ts-deferred'; import { Deferred } from 'ts-deferred';
import { cleanupUnitTest, prepareUnitTest, getMsgDispatcherCommand } from '../../common/utils'; import { cleanupUnitTest, prepareUnitTest, getMsgDispatcherCommand, getTunerProc } from '../../common/utils';
import * as CommandType from '../commands'; import * as CommandType from '../commands';
import { createDispatcherInterface, IpcInterface } from '../ipcInterface'; import { createDispatcherInterface, IpcInterface } from '../ipcInterface';
...@@ -50,9 +50,7 @@ function startProcess(): void { ...@@ -50,9 +50,7 @@ function startProcess(): void {
// advisor // advisor
undefined undefined
); );
const proc: ChildProcess = getTunerProc(dispatcherCmd, stdio, 'core/test', process.env);
const proc: ChildProcess = spawn(dispatcherCmd, [], { stdio, cwd: 'core/test', shell: true });
proc.on('error', (error: Error): void => { proc.on('error', (error: Error): void => {
procExit = true; procExit = true;
procError = true; procError = true;
......
...@@ -33,6 +33,7 @@ import { NNIManager } from '../nnimanager'; ...@@ -33,6 +33,7 @@ import { NNIManager } from '../nnimanager';
import { SqlDB } from '../sqlDatabase'; import { SqlDB } from '../sqlDatabase';
import { MockedTrainingService } from './mockedTrainingService'; import { MockedTrainingService } from './mockedTrainingService';
import { MockedDataStore } from './mockedDatastore'; import { MockedDataStore } from './mockedDatastore';
import * as path from 'path';
async function initContainer(): Promise<void> { async function initContainer(): Promise<void> {
prepareUnitTest(); prepareUnitTest();
...@@ -183,7 +184,7 @@ describe('Unit test for nnimanager', function () { ...@@ -183,7 +184,7 @@ describe('Unit test for nnimanager', function () {
it('test getExperimentProfile', () => { it('test getExperimentProfile', () => {
return nniManager.getExperimentProfile().then((experimentProfile) => { return nniManager.getExperimentProfile().then((experimentProfile) => {
expect(experimentProfile.id).to.be.equal('unittest'); expect(experimentProfile.id).to.be.equal('unittest');
expect(experimentProfile.logDir).to.be.equal(os.homedir()+'/nni/experiments/unittest'); expect(experimentProfile.logDir).to.be.equal(path.join(os.homedir(),'nni','experiments','unittest'));
}).catch((error) => { }).catch((error) => {
assert.fail(error); assert.fail(error);
......
...@@ -3,7 +3,6 @@ ...@@ -3,7 +3,6 @@
"version": "999.0.0-developing", "version": "999.0.0-developing",
"main": "index.js", "main": "index.js",
"scripts": { "scripts": {
"postbuild": "cp -rf config ./dist/",
"build": "tsc", "build": "tsc",
"test": "nyc mocha -r ts-node/register -t 15000 --recursive **/*.test.ts --exclude node_modules/**/**/*.test.ts --colors", "test": "nyc mocha -r ts-node/register -t 15000 --recursive **/*.test.ts --exclude node_modules/**/**/*.test.ts --colors",
"start": "node dist/main.js", "start": "node dist/main.js",
...@@ -35,7 +34,7 @@ ...@@ -35,7 +34,7 @@
"@types/express": "^4.16.0", "@types/express": "^4.16.0",
"@types/glob": "^7.1.1", "@types/glob": "^7.1.1",
"@types/mocha": "^5.2.5", "@types/mocha": "^5.2.5",
"@types/node": "^10.12.18", "@types/node": "10.12.18",
"@types/request": "^2.47.1", "@types/request": "^2.47.1",
"@types/rx": "^4.1.1", "@types/rx": "^4.1.1",
"@types/sqlite3": "^3.1.3", "@types/sqlite3": "^3.1.3",
......
...@@ -63,6 +63,7 @@ class NNIRestHandler { ...@@ -63,6 +63,7 @@ class NNIRestHandler {
this.checkStatus(router); this.checkStatus(router);
this.getExperimentProfile(router); this.getExperimentProfile(router);
this.updateExperimentProfile(router); this.updateExperimentProfile(router);
this.importData(router);
this.startExperiment(router); this.startExperiment(router);
this.getTrialJobStatistics(router); this.getTrialJobStatistics(router);
this.setClusterMetaData(router); this.setClusterMetaData(router);
...@@ -145,6 +146,16 @@ class NNIRestHandler { ...@@ -145,6 +146,16 @@ class NNIRestHandler {
}); });
} }
/**
 * Register POST /experiment/import-data.
 *
 * The request body is re-serialized to a JSON string and passed to the manager's
 * importData; an empty response is sent on success, and failures go through
 * handle_error.
 */
private importData(router: Router): void {
    router.post('/experiment/import-data', (req: Request, res: Response) => {
        const payload: string = JSON.stringify(req.body);
        this.nniManager.importData(payload)
            .then(() => {
                res.send();
            })
            .catch((err: Error) => {
                this.handle_error(err, res);
            });
    });
}
private startExperiment(router: Router): void { private startExperiment(router: Router): void {
router.post('/experiment', expressJoi(ValidationSchemas.STARTEXPERIMENT), (req: Request, res: Response) => { router.post('/experiment', expressJoi(ValidationSchemas.STARTEXPERIMENT), (req: Request, res: Response) => {
if (isNewExperiment()) { if (isNewExperiment()) {
......
...@@ -46,6 +46,9 @@ export class MockedNNIManager extends Manager { ...@@ -46,6 +46,9 @@ export class MockedNNIManager extends Manager {
public updateExperimentProfile(experimentProfile: ExperimentProfile, updateType: ProfileUpdateType): Promise<void> { public updateExperimentProfile(experimentProfile: ExperimentProfile, updateType: ProfileUpdateType): Promise<void> {
return Promise.resolve(); return Promise.resolve();
} }
// Mock implementation: ignores `data` and returns an already-resolved promise.
public importData(data: string): Promise<void> {
return Promise.resolve();
}
public getTrialJobStatistics(): Promise<TrialJobStatistics[]> { public getTrialJobStatistics(): Promise<TrialJobStatistics[]> {
const deferred: Deferred<TrialJobStatistics[]> = new Deferred<TrialJobStatistics[]>(); const deferred: Deferred<TrialJobStatistics[]> = new Deferred<TrialJobStatistics[]>();
deferred.resolve([{ deferred.resolve([{
......
...@@ -59,10 +59,17 @@ export class GPUSummary { ...@@ -59,10 +59,17 @@ export class GPUSummary {
} }
} }
export const GPU_INFO_COLLECTOR_FORMAT: string = export const GPU_INFO_COLLECTOR_FORMAT_LINUX: string =
` `
#!/bin/bash #!/bin/bash
export METRIC_OUTPUT_DIR={0} export METRIC_OUTPUT_DIR={0}
echo $$ >{1} echo $$ >{1}
python3 -m nni_gpu_tool.gpu_metrics_collector python3 -m nni_gpu_tool.gpu_metrics_collector
` `
/**
 * PowerShell template that starts the GPU metrics collector on Windows.
 *
 * Placeholders (filled in with String.Format):
 *   {0} - folder exported as METRIC_OUTPUT_DIR
 *   {1} - file that receives the id of the started collector process
 *
 * Both placeholders are wrapped in double quotes so that paths containing spaces
 * are passed to PowerShell as a single argument.
 */
export const GPU_INFO_COLLECTOR_FORMAT_WINDOWS: string =
`
$env:METRIC_OUTPUT_DIR="{0}"
$app = Start-Process "python" -ArgumentList "-m nni_gpu_tool.gpu_metrics_collector" -passthru -NoNewWindow
Write $app.ID | Out-File "{1}" -NoNewline -encoding utf8
`;
\ No newline at end of file
...@@ -22,6 +22,12 @@ import { getLogger } from "common/log"; ...@@ -22,6 +22,12 @@ import { getLogger } from "common/log";
'use strict'; 'use strict';
import { countFilesRecursively } from '../../common/utils' import { countFilesRecursively } from '../../common/utils'
import * as cpp from 'child-process-promise';
import * as cp from 'child_process';
import { GPU_INFO_COLLECTOR_FORMAT_LINUX, GPU_INFO_COLLECTOR_FORMAT_WINDOWS } from './gpuData'
import * as path from 'path';
import { String } from 'typescript-string-operations';
import { file } from "../../node_modules/@types/tmp";
/** /**
* Validate codeDir, calculate file count recursively under codeDir, and throw error if any rule is broken * Validate codeDir, calculate file count recursively under codeDir, and throw error if any rule is broken
...@@ -46,3 +52,130 @@ export async function validateCodeDir(codeDir: string) : Promise<number> { ...@@ -46,3 +52,130 @@ export async function validateCodeDir(codeDir: string) : Promise<number> {
return fileCount; return fileCount;
} }
/**
 * Create a directory, including missing parents, by running a platform command:
 * PowerShell `New-Item ... -Force` on win32 and `mkdir -p` elsewhere.
 * The path is interpolated into the command line as given.
 * @param directory path of the directory to create
 */
export async function execMkdir(directory: string): Promise<void> {
    const mkdirCommand: string = process.platform === 'win32'
        ? `powershell.exe New-Item -Path ${directory} -ItemType "directory" -Force`
        : `mkdir -p ${directory}`;
    await cpp.exec(mkdirCommand);
}
/**
 * Create a file by running a platform command: PowerShell `New-Item ... -Force`
 * on win32 and `touch` elsewhere.
 * The path is interpolated into the command line as given.
 * @param filename path of the file to create
 */
export async function execNewFile(filename: string): Promise<void> {
    const newFileCommand: string = process.platform === 'win32'
        ? `powershell.exe New-Item -Path ${filename} -ItemType "file" -Force`
        : `touch ${filename}`;
    await cpp.exec(newFileCommand);
}
/**
 * Run a script file: through `powershell.exe -file` on win32, through `bash` elsewhere.
 * @param filePath path of the script to execute
 * @returns the ChildProcess created by child_process.exec
 */
export function execScript(filePath: string): cp.ChildProcess {
    const launchCommand: string = process.platform === 'win32'
        ? `powershell.exe -file ${filePath}`
        : `bash ${filePath}`;

    return cp.exec(launchCommand);
}
/**
 * Read the last line of a file by running a platform command:
 * PowerShell `Get-Content ... -Tail 1` on win32, `tail -n 1` elsewhere.
 * @param filePath path of the file to read
 * @returns the result object of the executed command
 */
export async function execTail(filePath: string): Promise<cpp.childProcessPromise.Result> {
    const tailCommand: string = process.platform === 'win32'
        ? `powershell.exe Get-Content ${filePath} -Tail 1`
        : `tail -n 1 ${filePath}`;
    const cmdresult: cpp.childProcessPromise.Result = await cpp.exec(tailCommand);

    return cmdresult;
}
/**
 * Delete a directory and everything below it.
 *
 * Runs `rm -rf` on POSIX platforms and PowerShell `Remove-Item -Recurse -Force`
 * on win32. The switches are required on Windows so that non-empty directories
 * and hidden or read-only entries are removed as well, matching the POSIX branch.
 * The path is interpolated into the command line as given.
 * @param directory path of the directory to delete
 */
export async function execRemove(directory: string): Promise<void>{
    if (process.platform === 'win32') {
        await cpp.exec(`powershell.exe Remove-Item ${directory} -Recurse -Force`);
    } else {
        await cpp.exec(`rm -rf ${directory}`);
    }
}
/**
 * Kill processes related to `pid` by running a platform command:
 * `taskkill /PID <pid> /T /F` on win32, `pkill -P <pid>` elsewhere.
 * @param pid process id passed to the kill command
 */
export async function execKill(pid: string): Promise<void>{
    const killCommand: string = process.platform === 'win32'
        ? `cmd /c taskkill /PID ${pid} /T /F`
        : `pkill -P ${pid}`;
    await cpp.exec(killCommand);
}
/**
 * Build the script statement that sets an environment variable:
 * `$env:KEY="VALUE"` (PowerShell) on win32, `export KEY=VALUE` elsewhere.
 * @param variable name/value pair to set
 * @returns command string
 */
export function setEnvironmentVariable(variable: { key: string; value: string }): string{
    const isWindows: boolean = process.platform === 'win32';

    return isWindows
        ? `$env:${variable.key}="${variable.value}"`
        : `export ${variable.key}=${variable.value}`;
}
/**
 * Append the platform's script extension to a file name prefix:
 * '.ps1' on win32, '.sh' elsewhere.
 * @param fileNamePrefix script file name without extension
 * @returns the script file name
 */
export function getScriptName(fileNamePrefix: string): string {
    const extension: string = process.platform === 'win32' ? '.ps1' : '.sh';

    return fileNamePrefix + extension;
}
/**
 * Build the content of the GPU metrics collector script for the current platform.
 *
 * The Windows or Linux template is filled with the given folder ({0}) and with
 * the path of a file named 'pid' inside that folder ({1}).
 * @param gpuMetricCollectorScriptFolder folder substituted into the template
 * @returns the script text
 */
export function getgpuMetricsCollectorScriptContent(gpuMetricCollectorScriptFolder: string): string {
    const scriptTemplate: string = process.platform === 'win32'
        ? GPU_INFO_COLLECTOR_FORMAT_WINDOWS
        : GPU_INFO_COLLECTOR_FORMAT_LINUX;
    const pidFilePath: string = path.join(gpuMetricCollectorScriptFolder, 'pid');

    return String.Format(scriptTemplate, gpuMetricCollectorScriptFolder, pidFilePath);
}
...@@ -25,9 +25,10 @@ import * as fs from 'fs'; ...@@ -25,9 +25,10 @@ import * as fs from 'fs';
import * as os from 'os'; import * as os from 'os';
import * as path from 'path'; import * as path from 'path';
import { String } from 'typescript-string-operations'; import { String } from 'typescript-string-operations';
import { execMkdir, getScriptName, getgpuMetricsCollectorScriptContent, execScript, execTail, execRemove, execKill } from '../common/util'
import { getLogger, Logger } from '../../common/log'; import { getLogger, Logger } from '../../common/log';
import { delay } from '../../common/utils'; import { delay } from '../../common/utils';
import { GPU_INFO_COLLECTOR_FORMAT, GPUInfo, GPUSummary } from '../common/gpuData'; import { GPUInfo, GPUSummary } from '../common/gpuData';
/** /**
* GPUScheduler for local training service * GPUScheduler for local training service
...@@ -57,6 +58,19 @@ class GPUScheduler { ...@@ -57,6 +58,19 @@ class GPUScheduler {
} }
} }
/**
 * Create the collector script folder, write the gpu_metrics_collector
 * script into it and launch that script.
 */
private async runGpuMetricsCollectorScript(): Promise<void> {
    await execMkdir(this.gpuMetricCollectorScriptFolder);

    // getScriptName appends the platform-specific script extension
    const scriptFileName = getScriptName('gpu_metrics_collector');
    const scriptPath: string = path.join(this.gpuMetricCollectorScriptFolder, scriptFileName);
    const scriptContent: string = getgpuMetricsCollectorScriptContent(this.gpuMetricCollectorScriptFolder);
    await fs.promises.writeFile(scriptPath, scriptContent, { encoding: 'utf8' });

    // Launched without awaiting; the returned value is not kept
    execScript(scriptPath);
}
public getAvailableGPUIndices(): number[] { public getAvailableGPUIndices(): number[] {
if (this.gpuSummary !== undefined) { if (this.gpuSummary !== undefined) {
return this.gpuSummary.gpuInfos.filter((info: GPUInfo) => info.activeProcessNum === 0) return this.gpuSummary.gpuInfos.filter((info: GPUInfo) => info.activeProcessNum === 0)
...@@ -78,33 +92,16 @@ class GPUScheduler { ...@@ -78,33 +92,16 @@ class GPUScheduler {
this.stopping = true; this.stopping = true;
try { try {
const pid: string = await fs.promises.readFile(path.join(this.gpuMetricCollectorScriptFolder, 'pid'), 'utf8'); const pid: string = await fs.promises.readFile(path.join(this.gpuMetricCollectorScriptFolder, 'pid'), 'utf8');
await cpp.exec(`pkill -P ${pid}`); await execKill(pid);
await cpp.exec(`rm -rf ${this.gpuMetricCollectorScriptFolder}`); await execRemove(this.gpuMetricCollectorScriptFolder);
} catch (error) { } catch (error) {
this.log.error(`GPU scheduler error: ${error}`); this.log.error(`GPU scheduler error: ${error}`);
} }
} }
/**
 * Create the collector script folder, write gpu_metrics_collector.sh
 * into it and start the script with bash without waiting for it.
 */
private async runGpuMetricsCollectorScript(): Promise<void> {
    await cpp.exec(`mkdir -p ${this.gpuMetricCollectorScriptFolder}`);

    const scriptPath: string = path.join(this.gpuMetricCollectorScriptFolder, 'gpu_metrics_collector.sh');
    // Template arguments: the collector folder and the pid file inside it
    const scriptContent: string = String.Format(
        GPU_INFO_COLLECTOR_FORMAT,
        this.gpuMetricCollectorScriptFolder,
        path.join(this.gpuMetricCollectorScriptFolder, 'pid')
    );
    await fs.promises.writeFile(scriptPath, scriptContent, { encoding: 'utf8' });

    // Fire and forget: the child process handle is not kept
    cp.exec(`bash ${scriptPath}`);
}
private async updateGPUSummary(): Promise<void> { private async updateGPUSummary(): Promise<void> {
const cmdresult: cpp.childProcessPromise.Result = const cmdresult: cpp.childProcessPromise.Result =
await cpp.exec(`tail -n 1 ${path.join(this.gpuMetricCollectorScriptFolder, 'gpu_metrics')}`); await execTail(path.join(this.gpuMetricCollectorScriptFolder, 'gpu_metrics'));
if (cmdresult && cmdresult.stdout) { if (cmdresult && cmdresult.stdout) {
this.gpuSummary = <GPUSummary>JSON.parse(cmdresult.stdout); this.gpuSummary = <GPUSummary>JSON.parse(cmdresult.stdout);
} else { } else {
......
...@@ -18,7 +18,6 @@ ...@@ -18,7 +18,6 @@
*/ */
'use strict'; 'use strict';
import * as cpp from 'child-process-promise'; import * as cpp from 'child-process-promise';
import * as cp from 'child_process'; import * as cp from 'child_process';
import { EventEmitter } from 'events'; import { EventEmitter } from 'events';
...@@ -32,7 +31,8 @@ import { ...@@ -32,7 +31,8 @@ import {
HostJobApplicationForm, HyperParameters, JobApplicationForm, TrainingService, TrialJobApplicationForm, HostJobApplicationForm, HyperParameters, JobApplicationForm, TrainingService, TrialJobApplicationForm,
TrialJobDetail, TrialJobMetric, TrialJobStatus TrialJobDetail, TrialJobMetric, TrialJobStatus
} from '../../common/trainingService'; } from '../../common/trainingService';
import { delay, generateParamFileName, getExperimentRootDir, getJobCancelStatus, uniqueString } from '../../common/utils'; import { delay, generateParamFileName, getExperimentRootDir, getJobCancelStatus, uniqueString, isAlive, getNewLine } from '../../common/utils';
import { execMkdir, getScriptName, execScript, setEnvironmentVariable, execNewFile } from '../common/util'
import { TrialConfig } from '../common/trialConfig'; import { TrialConfig } from '../common/trialConfig';
import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey'; import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey';
import { GPUScheduler } from './gpuScheduler'; import { GPUScheduler } from './gpuScheduler';
...@@ -169,14 +169,7 @@ class LocalTrainingService implements TrainingService { ...@@ -169,14 +169,7 @@ class LocalTrainingService implements TrainingService {
return this.getHostJob(trialJobId); return this.getHostJob(trialJobId);
} }
if (trialJob.status === 'RUNNING') { if (trialJob.status === 'RUNNING') {
let alive: boolean = false; let alive: boolean = await isAlive(trialJob.pid);
try {
await cpp.exec(`kill -0 ${trialJob.pid}`);
alive = true;
} catch (error) {
//ignore
}
if (!alive) { if (!alive) {
trialJob.endTime = Date.now(); trialJob.endTime = Date.now();
this.setTrialJobStatus(trialJob, 'FAILED'); this.setTrialJobStatus(trialJob, 'FAILED');
...@@ -284,7 +277,9 @@ class LocalTrainingService implements TrainingService { ...@@ -284,7 +277,9 @@ class LocalTrainingService implements TrainingService {
public async setClusterMetadata(key: string, value: string): Promise<void> { public async setClusterMetadata(key: string, value: string): Promise<void> {
if (!this.initialized) { if (!this.initialized) {
this.rootDir = getExperimentRootDir(); this.rootDir = getExperimentRootDir();
await cpp.exec(`mkdir -p ${this.rootDir}`); if(!fs.existsSync(this.rootDir)){
await cpp.exec(`powershell.exe mkdir ${this.rootDir}`);
}
this.initialized = true; this.initialized = true;
} }
switch (key) { switch (key) {
...@@ -369,7 +364,7 @@ class LocalTrainingService implements TrainingService { ...@@ -369,7 +364,7 @@ class LocalTrainingService implements TrainingService {
private getEnvironmentVariables( private getEnvironmentVariables(
trialJobDetail: TrialJobDetail, trialJobDetail: TrialJobDetail,
resource?: { gpuIndices: number[] }): { key: string; value: string }[] { resource: { gpuIndices: number[] }): { key: string; value: string }[] {
const envVariables: { key: string; value: string }[] = [ const envVariables: { key: string; value: string }[] = [
{ key: 'NNI_PLATFORM', value: 'local' }, { key: 'NNI_PLATFORM', value: 'local' },
{ key: 'NNI_SYS_DIR', value: trialJobDetail.workingDirectory }, { key: 'NNI_SYS_DIR', value: trialJobDetail.workingDirectory },
...@@ -379,12 +374,10 @@ class LocalTrainingService implements TrainingService { ...@@ -379,12 +374,10 @@ class LocalTrainingService implements TrainingService {
{ key: 'MULTI_PHASE', value: this.isMultiPhase.toString() } { key: 'MULTI_PHASE', value: this.isMultiPhase.toString() }
]; ];
if (resource !== undefined && resource.gpuIndices.length > 0) {
envVariables.push({ envVariables.push({
key: 'CUDA_VISIBLE_DEVICES', key: 'CUDA_VISIBLE_DEVICES',
value: this.gpuScheduler === undefined ? '' : resource.gpuIndices.join(',') value: this.gpuScheduler === undefined ? '-1' : resource.gpuIndices.join(',')
}); });
}
return envVariables; return envVariables;
} }
...@@ -467,36 +460,52 @@ class LocalTrainingService implements TrainingService { ...@@ -467,36 +460,52 @@ class LocalTrainingService implements TrainingService {
} }
} }
/**
 * Build the platform-specific script lines that run the trial command.
 * The lines redirect the command's stderr to `<workingDirectory>/stderr` and
 * then write its exit code plus a seconds-since-epoch timestamp suffixed
 * with `000` to `<workingDirectory>/.nni/state`.
 * @param localTrailConfig trial configuration whose `command` is executed
 * @param workingDirectory directory that receives the `stderr` and `.nni/state` files
 * @returns PowerShell lines on Windows, shell lines otherwise
 */
private getScript(localTrailConfig: TrialConfig, workingDirectory: string): string[] {
    const stderrPath: string = path.join(workingDirectory, 'stderr');
    const statePath: string = path.join(workingDirectory, '.nni', 'state');

    if (process.platform === 'win32') {
        return [
            `cmd /c ${localTrailConfig.command} 2>${stderrPath}`,
            `$NOW_DATE = [int64](([datetime]::UtcNow)-(get-date "1/1/1970")).TotalSeconds`,
            `$NOW_DATE = "$NOW_DATE" + "000"`,
            `Write $LASTEXITCODE " " $NOW_DATE | Out-File ${statePath} -NoNewline -encoding utf8`
        ];
    }

    return [
        `eval ${localTrailConfig.command} 2>${stderrPath}`,
        `echo $? \`date +%s000\` >${statePath}`
    ];
}
private async runTrialJob(trialJobId: string, resource: {gpuIndices: number[]}): Promise<void> { private async runTrialJob(trialJobId: string, resource: {gpuIndices: number[]}): Promise<void> {
const trialJobDetail: LocalTrialJobDetail = <LocalTrialJobDetail>this.jobMap.get(trialJobId); const trialJobDetail: LocalTrialJobDetail = <LocalTrialJobDetail>this.jobMap.get(trialJobId);
const variables: { key: string; value: string }[] = this.getEnvironmentVariables(trialJobDetail, resource); const variables: { key: string; value: string }[] = this.getEnvironmentVariables(trialJobDetail, resource);
const runScriptLines: string[] = [];
if (!this.localTrailConfig) { if (!this.localTrailConfig) {
throw new Error('trial config is not initialized'); throw new Error('trial config is not initialized');
} }
runScriptLines.push( const runScriptLines: string[] = [];
'#!/bin/bash', if (process.platform !== "win32"){
`cd ${this.localTrailConfig.codeDir}`); runScriptLines.push('#!/bin/bash');
}
runScriptLines.push(`cd ${this.localTrailConfig.codeDir}`);
for (const variable of variables) { for (const variable of variables) {
runScriptLines.push(`export ${variable.key}=${variable.value}`); runScriptLines.push(setEnvironmentVariable(variable));
} }
runScriptLines.push( const scripts: string[] = this.getScript(this.localTrailConfig, trialJobDetail.workingDirectory);
`eval ${this.localTrailConfig.command} 2>${path.join(trialJobDetail.workingDirectory, 'stderr')}`, scripts.forEach(script => {
`echo $? \`date +%s000\` >${path.join(trialJobDetail.workingDirectory, '.nni', 'state')}`); runScriptLines.push(script);
});
await cpp.exec(`mkdir -p ${trialJobDetail.workingDirectory}`); await execMkdir(trialJobDetail.workingDirectory);
await cpp.exec(`mkdir -p ${path.join(trialJobDetail.workingDirectory, '.nni')}`); await execMkdir(path.join(trialJobDetail.workingDirectory, '.nni'));
await cpp.exec(`touch ${path.join(trialJobDetail.workingDirectory, '.nni', 'metrics')}`); await execNewFile(path.join(trialJobDetail.workingDirectory, '.nni', 'metrics'));
await fs.promises.writeFile( const scriptName: string = getScriptName('run');
path.join(trialJobDetail.workingDirectory, 'run.sh'), runScriptLines.join('\n'), { encoding: 'utf8', mode: 0o777 }); await fs.promises.writeFile(path.join(trialJobDetail.workingDirectory, scriptName), runScriptLines.join(getNewLine()), { encoding: 'utf8', mode: 0o777 });
await this.writeParameterFile(trialJobDetail.workingDirectory, (<TrialJobApplicationForm>trialJobDetail.form).hyperParameters); await this.writeParameterFile(trialJobDetail.workingDirectory, (<TrialJobApplicationForm>trialJobDetail.form).hyperParameters);
const process: cp.ChildProcess = cp.exec(`bash ${path.join(trialJobDetail.workingDirectory, 'run.sh')}`); const trialJobProcess: cp.ChildProcess = execScript(path.join(trialJobDetail.workingDirectory, scriptName));
this.setTrialJobStatus(trialJobDetail, 'RUNNING'); this.setTrialJobStatus(trialJobDetail, 'RUNNING');
trialJobDetail.startTime = Date.now(); trialJobDetail.startTime = Date.now();
trialJobDetail.pid = process.pid; trialJobDetail.pid = trialJobProcess.pid;
this.setExtraProperties(trialJobDetail, resource); this.setExtraProperties(trialJobDetail, resource);
let buffer: Buffer = Buffer.alloc(0); let buffer: Buffer = Buffer.alloc(0);
......
...@@ -46,7 +46,7 @@ import { ...@@ -46,7 +46,7 @@ import {
RemoteMachineScheduleInfo, RemoteMachineScheduleResult, SSHClient, SSHClientManager, RemoteMachineScheduleInfo, RemoteMachineScheduleResult, SSHClient, SSHClientManager,
RemoteMachineTrialJobDetail, ScheduleResultType, REMOTEMACHINE_TRIAL_COMMAND_FORMAT RemoteMachineTrialJobDetail, ScheduleResultType, REMOTEMACHINE_TRIAL_COMMAND_FORMAT
} from './remoteMachineData'; } from './remoteMachineData';
import { GPU_INFO_COLLECTOR_FORMAT } from '../common/gpuData'; import { GPU_INFO_COLLECTOR_FORMAT_LINUX } from '../common/gpuData';
import { SSHClientUtility } from './sshClientUtility'; import { SSHClientUtility } from './sshClientUtility';
import { validateCodeDir } from '../common/util'; import { validateCodeDir } from '../common/util';
import { RemoteMachineJobRestServer } from './remoteMachineJobRestServer'; import { RemoteMachineJobRestServer } from './remoteMachineJobRestServer';
...@@ -452,7 +452,7 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -452,7 +452,7 @@ class RemoteMachineTrainingService implements TrainingService {
let gpuMetricsCollectorScriptPath: string = path.join(gpuMetricCollectorScriptFolder, userName, 'gpu_metrics_collector.sh'); let gpuMetricsCollectorScriptPath: string = path.join(gpuMetricCollectorScriptFolder, userName, 'gpu_metrics_collector.sh');
const remoteGPUScriptsDir: string = this.getRemoteScriptsPath(userName); // This directory is used to store gpu_metrics and pid created by script const remoteGPUScriptsDir: string = this.getRemoteScriptsPath(userName); // This directory is used to store gpu_metrics and pid created by script
const gpuMetricsCollectorScriptContent: string = String.Format( const gpuMetricsCollectorScriptContent: string = String.Format(
GPU_INFO_COLLECTOR_FORMAT, GPU_INFO_COLLECTOR_FORMAT_LINUX,
remoteGPUScriptsDir, remoteGPUScriptsDir,
path.join(remoteGPUScriptsDir, 'pid'), path.join(remoteGPUScriptsDir, 'pid'),
); );
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment