Unverified Commit 0663218b authored by SparkSnail's avatar SparkSnail Committed by GitHub
Browse files

Merge pull request #163 from Microsoft/master

merge master
parents 6c9360a5 cf983800
......@@ -43,7 +43,7 @@ def init_params(net):
term_width = 0
try:
_, term_width = os.popen('stty size', 'r').read().split()
term_width = os.get_terminal_size().columns
except Exception as exception:
term_width = 200
term_width = int(term_width)
......
[Net.ServicePointManager]::SecurityProtocol = [Net.SecurityProtocolType]::Tls12
$install_node = $true
$install_yarn = $true
# nodejs
$nodeUrl = "https://aka.ms/nni/nodejs-download/win64"
$yarnUrl = "https://yarnpkg.com/latest.tar.gz"
$unzipNodeDir = "node-v*"
$unzipYarnDir = "yarn-v*"
$NNI_DEPENDENCY_FOLDER = "C:\tmp\$env:USERNAME"
# Verify that python is on PATH and that its version is at least 3.5.
$WHICH_PYTHON = where.exe python
if ($null -eq $WHICH_PYTHON) {
    throw "Can not find python"
}
else {
    # 2>&1 merges stderr into the captured text, so the version banner is
    # picked up whichever stream python writes it to.
    $pyVersionText = [string](& python -V 2>&1)
    # Extract "major.minor" with a regex instead of a fixed-width substring:
    # substring(7,3) turned "Python 3.10.4" into "3.1" and wrongly rejected it.
    if ($pyVersionText -notmatch '(\d+)\.(\d+)') {
        throw "Can not determine python version from '$pyVersionText'"
    }
    # Compare as [version] so that 3.10 sorts after 3.5 (a [double] compare does not).
    $pyVersion = [version]("{0}.{1}" -f $Matches[1], $Matches[2])
    if ($pyVersion -lt [version]"3.5") {
        throw "python version should >= 3.5"
    }
}
$WHICH_PIP = where.exe pip
if($WHICH_PIP -eq $null){
throw "Can not find pip"
}
$env:PYTHONIOENCODING = "UTF-8"
if($env:VIRTUAL_ENV){
$NNI_PYTHON3 = $env:VIRTUAL_ENV + "\Scripts"
$NNI_PKG_FOLDER = $env:VIRTUAL_ENV + "\nni"
$NNI_PYTHON_SCRIPTS = $NNI_PYTHON3
}
else{
$NNI_PYTHON3 = $(python -c 'import site; from pathlib import Path; print(Path(site.getsitepackages()[0]))')
$NNI_PKG_FOLDER = $NNI_PYTHON3 + "\nni"
$NNI_PYTHON_SCRIPTS = $NNI_PYTHON3 + "\Scripts"
}
$PIP_INSTALL = """$NNI_PYTHON3\python"" -m pip install ."
if(!(Test-Path $NNI_DEPENDENCY_FOLDER)){
New-Item $NNI_DEPENDENCY_FOLDER -ItemType Directory
}
$NNI_NODE_ZIP = $NNI_DEPENDENCY_FOLDER+"\nni-node.zip"
$NNI_NODE_FOLDER = $NNI_DEPENDENCY_FOLDER+"\nni-node"
$NNI_YARN_TARBALL = $NNI_DEPENDENCY_FOLDER+"\nni-yarn.tar.gz"
$NNI_YARN_FOLDER = $NNI_DEPENDENCY_FOLDER+"\nni-yarn"
$NNI_YARN = $NNI_YARN_FOLDER +"\bin\yarn"
## Version number
$NNI_VERSION_VALUE = $(git describe --tags)
$NNI_VERSION_TEMPLATE = "999.0.0-developing"
if(!(Test-Path $NNI_NODE_ZIP)){
Write-Host "Downloading Node..."
(New-Object Net.WebClient).DownloadFile($nodeUrl, $NNI_NODE_ZIP)
}
if(!(Test-Path $NNI_YARN_TARBALL)){
Write-Host "Downloading Yarn..."
(New-Object Net.WebClient).DownloadFile($yarnUrl, $NNI_YARN_TARBALL)
}
# Generate a small Python script (extract.py) that unpacks the Yarn tarball
# into the dependency folder using Python's tarfile module.
#
# -split '\\' takes a regex, so it splits on every single backslash; -join '\\'
# re-joins the pieces with two literal backslashes. The net effect is that each
# backslash is doubled, which keeps the Windows paths intact once they are
# embedded in the Python string literals below.
# NOTE: $NNI_DEPENDENCY_FOLDER is reassigned here, so it keeps the doubled
# backslashes wherever it is used later in this script.
$NNI_YARN_TARBALL = $NNI_YARN_TARBALL -split '\\' -join '\\'
$NNI_DEPENDENCY_FOLDER = $NNI_DEPENDENCY_FOLDER -split '\\' -join '\\'
$SCRIPT_PATH = $NNI_DEPENDENCY_FOLDER + '\extract.py'
# $SCRIPT is an array with one element per line of the Python script; the ""
# pairs are escaped double quotes inside the PowerShell string, and -f fills {0}.
$SCRIPT = "import tarfile",
("tar = tarfile.open(""{0}"")" -f $NNI_YARN_TARBALL),
("tar.extractall(""{0}"")" -f $NNI_DEPENDENCY_FOLDER),
"tar.close()"
# Write the array to disk, one element per line.
[System.IO.File]::WriteAllLines($SCRIPT_PATH, $SCRIPT)
Add-Type -AssemblyName System.IO.Compression.FileSystem
# Extract every entry of the zip archive $zipfile into the directory $outpath.
# Uses the .NET ZipFile class, so the System.IO.Compression.FileSystem assembly
# must already be loaded when this function is called.
function Unzip {
    param(
        [string]$zipfile,
        [string]$outpath
    )
    [System.IO.Compression.ZipFile]::ExtractToDirectory($zipfile, $outpath)
}
# Unpack Node.js and Yarn from the previously downloaded archives.
# Both steps are guarded by $install_node; each extraction is skipped when its
# target folder (nni-node / nni-yarn) already exists.
if ($install_node) {
### nodejs install
if(!(Test-Path $NNI_NODE_FOLDER)){
# The zip unpacks to a versioned "node-v*" folder; locate it and rename it to
# the fixed name "nni-node" so that $NNI_NODE_FOLDER resolves.
Unzip $NNI_NODE_ZIP $NNI_DEPENDENCY_FOLDER
$unzipNodeDir = Get-ChildItem "$NNI_DEPENDENCY_FOLDER\$unzipNodeDir"
Rename-Item $unzipNodeDir "nni-node"
}
# Copy node.exe into the Python scripts folder (overwriting any existing copy).
Copy-Item "$NNI_NODE_FOLDER\node.exe" $NNI_PYTHON_SCRIPTS -Recurse -Force
### yarn install
if(!(Test-Path $NNI_YARN_FOLDER)){
# Run the generated extract.py with the selected Python to unpack the Yarn
# tarball, then rename the versioned "yarn-v*" folder to "nni-yarn".
cmd /C """$NNI_PYTHON3\python""" $SCRIPT_PATH
$unzipYarnDir = Get-ChildItem "$NNI_DEPENDENCY_FOLDER\$unzipYarnDir"
Rename-Item $unzipYarnDir "nni-yarn"
}
}
## install-python-modules:
### Installing Python SDK
(Get-Content setup.py).replace($NNI_VERSION_TEMPLATE, $NNI_VERSION_VALUE) | Set-Content setup.py
cmd /c $PIP_INSTALL
# Building NNI Manager
$env:PATH=$NNI_PYTHON_SCRIPTS+';'+$env:PATH
cd src\nni_manager
cmd /c $NNI_YARN
cmd /c $NNI_YARN build
Copy-Item config -Destination .\dist\ -Recurse -Force
# Building WebUI
cd ..\webui
cmd /c $NNI_YARN
cmd /c $NNI_YARN build
cd ..\..
## install-node-modules
if(!(Test-Path $NNI_PKG_FOLDER)){
New-Item $NNI_PKG_FOLDER -ItemType Directory
}
Remove-Item $NNI_PKG_FOLDER -Recurse -Force
Copy-Item "src\nni_manager\dist" $NNI_PKG_FOLDER -Recurse
Copy-Item "src\nni_manager\package.json" $NNI_PKG_FOLDER
$PKG_JSON = $NNI_PKG_FOLDER + "\package.json"
(Get-Content $PKG_JSON).replace($NNI_VERSION_TEMPLATE, $NNI_VERSION_VALUE) | Set-Content $PKG_JSON
cmd /c $NNI_YARN --prod --cwd $NNI_PKG_FOLDER
$NNI_PKG_FOLDER_STATIC = $NNI_PKG_FOLDER + "\static"
Copy-Item "src\webui\build" $NNI_PKG_FOLDER_STATIC -Recurse
......@@ -51,11 +51,12 @@ setup(
'json_tricks',
'numpy',
'psutil',
'pyyaml',
'ruamel.yaml',
'requests',
'scipy',
'schema',
'PythonWebHDFS'
'PythonWebHDFS',
'colorama'
],
entry_points = {
......
......@@ -22,7 +22,7 @@
import { ExperimentProfile, TrialJobStatistics } from './manager';
import { TrialJobDetail, TrialJobStatus } from './trainingService';
type TrialJobEvent = TrialJobStatus | 'USER_TO_CANCEL' | 'ADD_CUSTOMIZED' | 'ADD_HYPERPARAMETER';
type TrialJobEvent = TrialJobStatus | 'USER_TO_CANCEL' | 'ADD_CUSTOMIZED' | 'ADD_HYPERPARAMETER' | 'IMPORT_DATA';
type MetricType = 'PERIODICAL' | 'FINAL' | 'CUSTOM' | 'REQUEST_PARAMETER';
interface ExperimentProfileRecord {
......
......@@ -99,6 +99,7 @@ abstract class Manager {
public abstract stopExperiment(): Promise<void>;
public abstract getExperimentProfile(): Promise<ExperimentProfile>;
public abstract updateExperimentProfile(experimentProfile: ExperimentProfile, updateType: ProfileUpdateType): Promise<void>;
public abstract importData(data: string): Promise<void>;
public abstract addCustomizedTrialJob(hyperParams: string): Promise<void>;
public abstract cancelTrialJobByUser(trialJobId: string): Promise<void>;
......
......@@ -22,6 +22,8 @@
import * as assert from 'assert';
import { randomBytes } from 'crypto';
import * as cpp from 'child-process-promise';
import * as cp from 'child_process';
import { ChildProcess, spawn, StdioOptions } from 'child_process';
import * as fs from 'fs';
import * as os from 'os';
import * as path from 'path';
......@@ -32,6 +34,7 @@ import * as util from 'util';
import { Database, DataStore } from './datastore';
import { ExperimentStartupInfo, getExperimentId, getExperimentStartupInfo, setExperimentStartupInfo } from './experimentStartupInfo';
import { Manager } from './manager';
import { TrialConfig } from '../training_service/common/trialConfig';
import { HyperParameters, TrainingService, TrialJobStatus } from './trainingService';
import { getLogger } from './log';
......@@ -146,6 +149,23 @@ function parseArg(names: string[]): string {
return '';
}
/**
 * Serialize an argument object for embedding in a command line.
 *
 * On Windows the value is JSON-encoded once. On every other platform the JSON
 * text is JSON-encoded a second time, which wraps it in double quotes and
 * escapes the inner quotes (presumably so that a shell hands it over as a
 * single argument).
 *
 * @param args value to serialize
 * @returns the encoded text
 */
function encodeCmdLineArgs(args:any):any{
    const encodedOnce = JSON.stringify(args);
    return process.platform === 'win32' ? encodedOnce : JSON.stringify(encodedOnce);
}
/**
 * Name of the Python executable to invoke on the current platform:
 * 'python' on Windows, 'python3' everywhere else.
 */
function getCmdPy():string{
    return process.platform === 'win32' ? 'python' : 'python3';
}
/**
* Generate command line to start automl algorithm(s),
* either start advisor or start a process which runs tuner and assessor
......@@ -179,8 +199,7 @@ function getMsgDispatcherCommand(tuner: any, assessor: any, advisor: any, multiP
if (!tuner && !advisor) {
throw new Error('Error: specify neither tuner nor advisor is not allowed');
}
let command: string = `python3 -m nni`;
let command: string = `${getCmdPy()} -m nni`;
if (multiPhase) {
command += ' --multi_phase';
}
......@@ -192,7 +211,7 @@ function getMsgDispatcherCommand(tuner: any, assessor: any, advisor: any, multiP
if (advisor) {
command += ` --advisor_class_name ${advisor.className}`;
if (advisor.classArgs !== undefined) {
command += ` --advisor_args ${JSON.stringify(JSON.stringify(advisor.classArgs))}`;
command += ` --advisor_args ${encodeCmdLineArgs(advisor.classArgs)}`;
}
if (advisor.codeDir !== undefined && advisor.codeDir.length > 1) {
command += ` --advisor_directory ${advisor.codeDir}`;
......@@ -203,7 +222,7 @@ function getMsgDispatcherCommand(tuner: any, assessor: any, advisor: any, multiP
} else {
command += ` --tuner_class_name ${tuner.className}`;
if (tuner.classArgs !== undefined) {
command += ` --tuner_args ${JSON.stringify(JSON.stringify(tuner.classArgs))}`;
command += ` --tuner_args ${encodeCmdLineArgs(tuner.classArgs)}`;
}
if (tuner.codeDir !== undefined && tuner.codeDir.length > 1) {
command += ` --tuner_directory ${tuner.codeDir}`;
......@@ -215,7 +234,7 @@ function getMsgDispatcherCommand(tuner: any, assessor: any, advisor: any, multiP
if (assessor !== undefined && assessor.className !== undefined) {
command += ` --assessor_class_name ${assessor.className}`;
if (assessor.classArgs !== undefined) {
command += ` --assessor_args ${JSON.stringify(JSON.stringify(assessor.classArgs))}`;
command += ` --assessor_args ${encodeCmdLineArgs(assessor.classArgs)}`;
}
if (assessor.codeDir !== undefined && assessor.codeDir.length > 1) {
command += ` --assessor_directory ${assessor.codeDir}`;
......@@ -363,6 +382,83 @@ async function getVersion(): Promise<string> {
return deferred.promise;
}
/**
 * Run a command as a child process.
 *
 * On non-Windows platforms the whole command string is handed to a shell.
 * On Windows no shell is used: the command is split on spaces into an
 * executable name and its arguments. Empty tokens (produced by leading,
 * trailing or repeated spaces, or by a command without any argument) are
 * dropped so that no empty-string argument is passed to the child.
 * NOTE: the split is purely space-based, so an argument that itself contains
 * a space is still broken into several arguments on Windows.
 *
 * @param command command line to run
 * @param stdio stdio configuration forwarded to spawn
 * @param newCwd working directory of the child process
 * @param newEnv environment of the child process
 * @returns the spawned child process
 */
function getTunerProc(command: string, stdio: StdioOptions, newCwd: string, newEnv: any): ChildProcess{
    let cmd: string = command;
    let arg: string[] = [];
    let newShell: boolean = true;
    if (process.platform === 'win32') {
        const tokens: string[] = command.split(' ').filter((token: string): boolean => token.length > 0);
        cmd = tokens.length > 0 ? tokens[0] : '';
        arg = tokens.slice(1);
        newShell = false;
    }

    return spawn(cmd, arg, {
        stdio,
        cwd: newCwd,
        env: newEnv,
        shell: newShell
    });
}
/**
 * Judge whether the process with the given pid is alive.
 *
 * Uses signal 0, which performs the existence/permission check without
 * actually delivering a signal, instead of spawning `kill -0` or PowerShell.
 * This works the same way on every platform, does not block the event loop
 * and never interpolates the pid into a shell command.
 *
 * @param pid process id (number or numeric string)
 * @returns true when the process could be signalled; false when the pid is
 *          not a positive integer or the check fails for any reason
 */
async function isAlive(pid:any): Promise<boolean>{
    const numericPid: number = Number(pid);
    // pid <= 0 would address process groups rather than a single process.
    if (!Number.isInteger(numericPid) || numericPid <= 0) {
        return false;
    }
    try {
        process.kill(numericPid, 0);

        return true;
    } catch (error) {
        // Failure (no such process, or not permitted) is reported as "not alive",
        // matching the previous shell-based check.
        return false;
    }
}
/**
 * Forcefully kill the process with the given pid.
 *
 * Sends SIGKILL through process.kill instead of spawning `kill -9` or
 * `taskkill`, so the pid is never interpolated into a shell command.
 * Errors are deliberately ignored: a missing process means there is nothing
 * left to kill.
 *
 * @param pid process id (number or numeric string); anything that is not a
 *            positive integer is ignored so that a process group is never
 *            signalled by accident
 */
async function killPid(pid:any): Promise<void>{
    const numericPid: number = Number(pid);
    if (!Number.isInteger(numericPid) || numericPid <= 0) {
        return;
    }
    try {
        process.kill(numericPid, 'SIGKILL');
    } catch (error) {
        // pid does not exist, do nothing here
    }
}
/**
 * Line separator for the current platform: CRLF on Windows, LF elsewhere.
 */
function getNewLine(): string{
    return process.platform === "win32" ? "\r\n" : "\n";
}
export {countFilesRecursively, getRemoteTmpDir, generateParamFileName, getMsgDispatcherCommand, getCheckpointDir,
getLogDir, getExperimentRootDir, getJobCancelStatus, getDefaultDatabaseDir, getIPV4Address,
mkDirP, delay, prepareUnitTest, parseArg, cleanupUnitTest, uniqueString, randomSelect, getLogLevel, getVersion };
mkDirP, delay, prepareUnitTest, parseArg, cleanupUnitTest, uniqueString, randomSelect, getLogLevel, getVersion, getCmdPy, getTunerProc, isAlive, killPid, getNewLine };
......@@ -22,6 +22,7 @@ const INITIALIZE = 'IN';
const REQUEST_TRIAL_JOBS = 'GE';
const REPORT_METRIC_DATA = 'ME';
const UPDATE_SEARCH_SPACE = 'SS';
const IMPORT_DATA = 'FD'
const ADD_CUSTOMIZED_TRIAL_JOB = 'AD';
const TRIAL_END = 'EN';
const TERMINATE = 'TE';
......@@ -38,6 +39,7 @@ const TUNER_COMMANDS: Set<string> = new Set([
REQUEST_TRIAL_JOBS,
REPORT_METRIC_DATA,
UPDATE_SEARCH_SPACE,
IMPORT_DATA,
ADD_CUSTOMIZED_TRIAL_JOB,
TERMINATE,
PING,
......@@ -62,6 +64,7 @@ export {
REQUEST_TRIAL_JOBS,
REPORT_METRIC_DATA,
UPDATE_SEARCH_SPACE,
IMPORT_DATA,
ADD_CUSTOMIZED_TRIAL_JOB,
TRIAL_END,
TERMINATE,
......
......@@ -35,10 +35,10 @@ import {
import {
TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric, TrialJobStatus
} from '../common/trainingService';
import { delay, getCheckpointDir, getExperimentRootDir, getLogDir, getMsgDispatcherCommand, mkDirP, getLogLevel } from '../common/utils';
import { delay, getCheckpointDir, getExperimentRootDir, getLogDir, getMsgDispatcherCommand, mkDirP, getTunerProc, getLogLevel, isAlive, killPid } from '../common/utils';
import {
ADD_CUSTOMIZED_TRIAL_JOB, INITIALIZE, INITIALIZED, KILL_TRIAL_JOB, NEW_TRIAL_JOB, NO_MORE_TRIAL_JOBS, PING,
REPORT_METRIC_DATA, REQUEST_TRIAL_JOBS, SEND_TRIAL_JOB_PARAMETER, TERMINATE, TRIAL_END, UPDATE_SEARCH_SPACE
REPORT_METRIC_DATA, REQUEST_TRIAL_JOBS, SEND_TRIAL_JOB_PARAMETER, TERMINATE, TRIAL_END, UPDATE_SEARCH_SPACE, IMPORT_DATA
} from './commands';
import { createDispatcherInterface, IpcInterface } from './ipcInterface';
......@@ -99,6 +99,17 @@ class NNIManager implements Manager {
return this.storeExperimentProfile();
}
/**
 * Forward imported data to the dispatcher and record the import in the data store.
 *
 * @param data payload sent with the IMPORT_DATA command and stored with the event
 * @returns the data store's promise, or a rejected promise when no dispatcher exists yet
 */
public importData(data: string): Promise<void> {
    if (this.dispatcher !== undefined) {
        this.dispatcher.sendCommand(IMPORT_DATA, data);

        return this.dataStore.storeTrialJobEvent('IMPORT_DATA', '', data);
    }

    return Promise.reject(new Error('tuner has not been setup'));
}
public addCustomizedTrialJob(hyperParams: string): Promise<void> {
if (this.currSubmittedTrialNum >= this.experimentProfile.params.maxTrialNum) {
return Promise.reject(
......@@ -290,12 +301,7 @@ class NNIManager implements Manager {
NNI_INCLUDE_INTERMEDIATE_RESULTS: includeIntermediateResultsEnv
};
let newEnv = Object.assign({}, process.env, nniEnv);
const tunerProc: ChildProcess = spawn(command, [], {
stdio,
cwd: newCwd,
env: newEnv,
shell: true
});
const tunerProc: ChildProcess = getTunerProc(command,stdio,newCwd,newEnv);
this.dispatcherPid = tunerProc.pid;
this.dispatcher = createDispatcherInterface(tunerProc);
......@@ -341,16 +347,10 @@ class NNIManager implements Manager {
// gracefully terminate tuner and assessor here, wait at most 30 seconds.
for (let i: number = 0; i < 30; i++) {
if (!tunerAlive) { break; }
try {
await cpp.exec(`kill -0 ${this.dispatcherPid}`);
} catch (error) { tunerAlive = false; }
tunerAlive = await isAlive(this.dispatcherPid);
await delay(1000);
}
try {
await cpp.exec(`kill -9 ${this.dispatcherPid}`);
} catch (error) {
// this.tunerPid does not exist, do nothing here
}
await killPid(this.dispatcherPid);
const trialJobList: TrialJobDetail[] = await this.trainingService.listTrialJobs();
// TO DO: to promise all
for (const trialJob of trialJobList) {
......
......@@ -42,6 +42,7 @@ describe('Unit test for dataStore', () => {
});
after(() => {
ds.close();
cleanupUnitTest();
});
......
......@@ -18,11 +18,10 @@
*/
'use strict';
import * as assert from 'assert';
import { ChildProcess, spawn, StdioOptions } from 'child_process';
import { Deferred } from 'ts-deferred';
import { cleanupUnitTest, prepareUnitTest } from '../../common/utils';
import { cleanupUnitTest, prepareUnitTest, getTunerProc, getCmdPy } from '../../common/utils';
import * as CommandType from '../commands';
import { createDispatcherInterface, IpcInterface } from '../ipcInterface';
import { NNIError } from '../../common/errors';
......@@ -39,15 +38,21 @@ function runProcess(): Promise<Error | null> {
// create fake assessor process
const stdio: StdioOptions = ['ignore', 'pipe', process.stderr, 'pipe', 'pipe'];
const proc: ChildProcess = spawn('python3 assessor.py', [], { stdio, cwd: 'core/test', shell: true });
const command: string = getCmdPy() + ' assessor.py';
const proc: ChildProcess = getTunerProc(command, stdio, 'core/test', process.env);
// record its sent/received commands on exit
proc.on('error', (error: Error): void => { deferred.resolve(error); });
proc.on('exit', (code: number): void => {
if (code !== 0) {
deferred.resolve(new Error(`return code: ${code}`));
} else {
sentCommands = proc.stdout.read().toString().split('\n');
let str = proc.stdout.read().toString();
if(str.search("\r\n")!=-1){
sentCommands = str.split("\r\n");
}
else{
sentCommands = str.split('\n');
}
deferred.resolve(null);
}
});
......
......@@ -22,7 +22,7 @@
import * as assert from 'assert';
import { ChildProcess, spawn, StdioOptions } from 'child_process';
import { Deferred } from 'ts-deferred';
import { cleanupUnitTest, prepareUnitTest, getMsgDispatcherCommand } from '../../common/utils';
import { cleanupUnitTest, prepareUnitTest, getMsgDispatcherCommand, getTunerProc } from '../../common/utils';
import * as CommandType from '../commands';
import { createDispatcherInterface, IpcInterface } from '../ipcInterface';
......@@ -50,9 +50,7 @@ function startProcess(): void {
// advisor
undefined
);
const proc: ChildProcess = spawn(dispatcherCmd, [], { stdio, cwd: 'core/test', shell: true });
const proc: ChildProcess = getTunerProc(dispatcherCmd, stdio, 'core/test', process.env);
proc.on('error', (error: Error): void => {
procExit = true;
procError = true;
......
......@@ -33,6 +33,7 @@ import { NNIManager } from '../nnimanager';
import { SqlDB } from '../sqlDatabase';
import { MockedTrainingService } from './mockedTrainingService';
import { MockedDataStore } from './mockedDatastore';
import * as path from 'path';
async function initContainer(): Promise<void> {
prepareUnitTest();
......@@ -183,7 +184,7 @@ describe('Unit test for nnimanager', function () {
it('test getExperimentProfile', () => {
return nniManager.getExperimentProfile().then((experimentProfile) => {
expect(experimentProfile.id).to.be.equal('unittest');
expect(experimentProfile.logDir).to.be.equal(os.homedir()+'/nni/experiments/unittest');
expect(experimentProfile.logDir).to.be.equal(path.join(os.homedir(),'nni','experiments','unittest'));
}).catch((error) => {
assert.fail(error);
......
......@@ -3,7 +3,6 @@
"version": "999.0.0-developing",
"main": "index.js",
"scripts": {
"postbuild": "cp -rf config ./dist/",
"build": "tsc",
"test": "nyc mocha -r ts-node/register -t 15000 --recursive **/*.test.ts --exclude node_modules/**/**/*.test.ts --colors",
"start": "node dist/main.js",
......@@ -35,7 +34,7 @@
"@types/express": "^4.16.0",
"@types/glob": "^7.1.1",
"@types/mocha": "^5.2.5",
"@types/node": "^10.12.18",
"@types/node": "10.12.18",
"@types/request": "^2.47.1",
"@types/rx": "^4.1.1",
"@types/sqlite3": "^3.1.3",
......
......@@ -63,6 +63,7 @@ class NNIRestHandler {
this.checkStatus(router);
this.getExperimentProfile(router);
this.updateExperimentProfile(router);
this.importData(router);
this.startExperiment(router);
this.getTrialJobStatistics(router);
this.setClusterMetaData(router);
......@@ -145,6 +146,16 @@ class NNIRestHandler {
});
}
/**
 * Register POST /experiment/import-data.
 * The request body is re-serialized to JSON text and handed to the manager;
 * an empty response is sent on success and failures go through handle_error.
 */
private importData(router: Router): void {
    const onImport = (req: Request, res: Response): void => {
        const payload: string = JSON.stringify(req.body);
        this.nniManager.importData(payload)
            .then(() => {
                res.send();
            })
            .catch((err: Error) => {
                this.handle_error(err, res);
            });
    };
    router.post('/experiment/import-data', onImport);
}
private startExperiment(router: Router): void {
router.post('/experiment', expressJoi(ValidationSchemas.STARTEXPERIMENT), (req: Request, res: Response) => {
if (isNewExperiment()) {
......
......@@ -46,6 +46,9 @@ export class MockedNNIManager extends Manager {
public updateExperimentProfile(experimentProfile: ExperimentProfile, updateType: ProfileUpdateType): Promise<void> {
return Promise.resolve();
}
/**
 * Mock implementation: ignores the data and resolves immediately.
 */
public importData(data: string): Promise<void> {
    return Promise.resolve<void>(undefined);
}
public getTrialJobStatistics(): Promise<TrialJobStatistics[]> {
const deferred: Deferred<TrialJobStatistics[]> = new Deferred<TrialJobStatistics[]>();
deferred.resolve([{
......
......@@ -59,10 +59,17 @@ export class GPUSummary {
}
}
export const GPU_INFO_COLLECTOR_FORMAT: string =
export const GPU_INFO_COLLECTOR_FORMAT_LINUX: string =
`
#!/bin/bash
export METRIC_OUTPUT_DIR={0}
echo $$ >{1}
python3 -m nni_gpu_tool.gpu_metrics_collector
`
export const GPU_INFO_COLLECTOR_FORMAT_WINDOWS: string =
`
$env:METRIC_OUTPUT_DIR="{0}"
$app = Start-Process "python" -ArgumentList "-m nni_gpu_tool.gpu_metrics_collector" -passthru -NoNewWindow
Write $app.ID | Out-File {1} -NoNewline -encoding utf8
`
\ No newline at end of file
......@@ -22,6 +22,12 @@ import { getLogger } from "common/log";
'use strict';
import { countFilesRecursively } from '../../common/utils'
import * as cpp from 'child-process-promise';
import * as cp from 'child_process';
import { GPU_INFO_COLLECTOR_FORMAT_LINUX, GPU_INFO_COLLECTOR_FORMAT_WINDOWS } from './gpuData'
import * as path from 'path';
import { String } from 'typescript-string-operations';
import { file } from "../../node_modules/@types/tmp";
/**
* Validate codeDir, calculate file count recursively under codeDir, and throw error if any rule is broken
......@@ -46,3 +52,130 @@ export async function validateCodeDir(codeDir: string) : Promise<number> {
return fileCount;
}
/**
 * Create a directory with a platform-specific shell command
 * (PowerShell `New-Item ... -Force` on Windows, `mkdir -p` elsewhere).
 * @param directory path of the directory to create
 */
export async function execMkdir(directory: string): Promise<void> {
    const command: string = process.platform === 'win32'
        ? `powershell.exe New-Item -Path ${directory} -ItemType "directory" -Force`
        : `mkdir -p ${directory}`;
    await cpp.exec(command);
}
/**
 * Create a file with a platform-specific shell command
 * (PowerShell `New-Item ... -Force` on Windows, `touch` elsewhere).
 * @param filename path of the file to create
 */
export async function execNewFile(filename: string): Promise<void> {
    const command: string = process.platform === 'win32'
        ? `powershell.exe New-Item -Path ${filename} -ItemType "file" -Force`
        : `touch ${filename}`;
    await cpp.exec(command);
}
/**
 * Launch a script file with the platform's interpreter
 * (`powershell.exe -file` on Windows, `bash` elsewhere).
 * @param filePath path of the script to run
 * @returns the child process started for the script
 */
export function execScript(filePath: string): cp.ChildProcess {
    const commandLine: string = process.platform === 'win32'
        ? `powershell.exe -file ${filePath}`
        : `bash ${filePath}`;

    return cp.exec(commandLine);
}
/**
 * Read the last line of a file through a platform-specific command
 * (PowerShell `Get-Content -Tail 1` on Windows, `tail -n 1` elsewhere).
 * @param filePath path of the file to read
 * @returns the result of the executed command
 */
export async function execTail(filePath: string): Promise<cpp.childProcessPromise.Result> {
    const command: string = process.platform === 'win32'
        ? `powershell.exe Get-Content ${filePath} -Tail 1`
        : `tail -n 1 ${filePath}`;

    return cpp.exec(command);
}
/**
 * Delete a directory and everything inside it.
 *
 * The Windows command passes `-Recurse -Force` so that it matches the
 * `rm -rf` used elsewhere: without them `Remove-Item` does not remove a
 * non-empty directory unattended.
 * @param directory path of the directory to delete
 */
export async function execRemove(directory: string): Promise<void>{
    const command: string = process.platform === 'win32'
        ? `powershell.exe Remove-Item ${directory} -Recurse -Force`
        : `rm -rf ${directory}`;
    await cpp.exec(command);
}
/**
 * Kill processes related to a pid with a platform-specific command:
 * `taskkill /PID <pid> /T /F` on Windows, `pkill -P <pid>` elsewhere.
 * @param pid process id passed to the kill command
 */
export async function execKill(pid: string): Promise<void>{
    const command: string = process.platform === 'win32'
        ? `cmd /c taskkill /PID ${pid} /T /F`
        : `pkill -P ${pid}`;
    await cpp.exec(command);
}
/**
 * Build the script line that sets an environment variable:
 * `$env:KEY="VALUE"` for PowerShell on Windows, `export KEY=VALUE` elsewhere.
 * @param variable key/value pair to set
 * @returns command string
 */
export function setEnvironmentVariable(variable: { key: string; value: string }): string{
    const { key, value } = variable;

    return process.platform === 'win32'
        ? `$env:${key}="${value}"`
        : `export ${key}=${value}`;
}
/**
 * Build a script file name by appending the platform's script extension
 * ('.ps1' on Windows, '.sh' elsewhere).
 * @param fileNamePrefix file name without extension
 * @returns the file name including the extension
 */
export function getScriptName(fileNamePrefix: string): string {
    const extension: string = process.platform === 'win32' ? '.ps1' : '.sh';

    return fileNamePrefix + extension;
}
/**
 * Build the content of the GPU metrics collector script for the current platform.
 * The platform's template receives the folder as placeholder {0} and the path
 * of the 'pid' file inside that folder as placeholder {1}.
 * @param gpuMetricCollectorScriptFolder folder used by the collector script
 * @returns the formatted script text
 */
export function getgpuMetricsCollectorScriptContent(gpuMetricCollectorScriptFolder: string): string {
    const template: string = process.platform === 'win32'
        ? GPU_INFO_COLLECTOR_FORMAT_WINDOWS
        : GPU_INFO_COLLECTOR_FORMAT_LINUX;
    const pidFilePath: string = path.join(gpuMetricCollectorScriptFolder, 'pid');

    return String.Format(template, gpuMetricCollectorScriptFolder, pidFilePath);
}
......@@ -25,9 +25,10 @@ import * as fs from 'fs';
import * as os from 'os';
import * as path from 'path';
import { String } from 'typescript-string-operations';
import { execMkdir, getScriptName, getgpuMetricsCollectorScriptContent, execScript, execTail, execRemove, execKill } from '../common/util'
import { getLogger, Logger } from '../../common/log';
import { delay } from '../../common/utils';
import { GPU_INFO_COLLECTOR_FORMAT, GPUInfo, GPUSummary } from '../common/gpuData';
import { GPUInfo, GPUSummary } from '../common/gpuData';
/**
* GPUScheduler for local training service
......@@ -57,6 +58,19 @@ class GPUScheduler {
}
}
/**
 * Write the platform-specific GPU metrics collector script into the
 * collector folder (creating the folder first) and launch it.
 * The launched script is not awaited.
 */
private async runGpuMetricsCollectorScript(): Promise<void> {
    const scriptFolder: string = this.gpuMetricCollectorScriptFolder;
    await execMkdir(scriptFolder);
    // generate gpu_metrics_collector script
    const scriptPath: string = path.join(scriptFolder, getScriptName('gpu_metrics_collector'));
    const scriptContent: string = getgpuMetricsCollectorScriptContent(scriptFolder);
    await fs.promises.writeFile(scriptPath, scriptContent, { encoding: 'utf8' });
    execScript(scriptPath);
}
public getAvailableGPUIndices(): number[] {
if (this.gpuSummary !== undefined) {
return this.gpuSummary.gpuInfos.filter((info: GPUInfo) => info.activeProcessNum === 0)
......@@ -78,33 +92,16 @@ class GPUScheduler {
this.stopping = true;
try {
const pid: string = await fs.promises.readFile(path.join(this.gpuMetricCollectorScriptFolder, 'pid'), 'utf8');
await cpp.exec(`pkill -P ${pid}`);
await cpp.exec(`rm -rf ${this.gpuMetricCollectorScriptFolder}`);
await execKill(pid);
await execRemove(this.gpuMetricCollectorScriptFolder);
} catch (error) {
this.log.error(`GPU scheduler error: ${error}`);
}
}
/**
* Generate gpu metric collector shell script in local machine,
* used to run in remote machine, and will be deleted after uploaded from local.
*/
private async runGpuMetricsCollectorScript(): Promise<void> {
await cpp.exec(`mkdir -p ${this.gpuMetricCollectorScriptFolder}`);
//generate gpu_metrics_collector.sh
const gpuMetricsCollectorScriptPath: string = path.join(this.gpuMetricCollectorScriptFolder, 'gpu_metrics_collector.sh');
const gpuMetricsCollectorScriptContent: string = String.Format(
GPU_INFO_COLLECTOR_FORMAT,
this.gpuMetricCollectorScriptFolder,
path.join(this.gpuMetricCollectorScriptFolder, 'pid')
);
await fs.promises.writeFile(gpuMetricsCollectorScriptPath, gpuMetricsCollectorScriptContent, { encoding: 'utf8' });
cp.exec(`bash ${gpuMetricsCollectorScriptPath}`);
}
private async updateGPUSummary(): Promise<void> {
const cmdresult: cpp.childProcessPromise.Result =
await cpp.exec(`tail -n 1 ${path.join(this.gpuMetricCollectorScriptFolder, 'gpu_metrics')}`);
await execTail(path.join(this.gpuMetricCollectorScriptFolder, 'gpu_metrics'));
if (cmdresult && cmdresult.stdout) {
this.gpuSummary = <GPUSummary>JSON.parse(cmdresult.stdout);
} else {
......
......@@ -18,7 +18,6 @@
*/
'use strict';
import * as cpp from 'child-process-promise';
import * as cp from 'child_process';
import { EventEmitter } from 'events';
......@@ -32,7 +31,8 @@ import {
HostJobApplicationForm, HyperParameters, JobApplicationForm, TrainingService, TrialJobApplicationForm,
TrialJobDetail, TrialJobMetric, TrialJobStatus
} from '../../common/trainingService';
import { delay, generateParamFileName, getExperimentRootDir, getJobCancelStatus, uniqueString } from '../../common/utils';
import { delay, generateParamFileName, getExperimentRootDir, getJobCancelStatus, uniqueString, isAlive, getNewLine } from '../../common/utils';
import { execMkdir, getScriptName, execScript, setEnvironmentVariable, execNewFile } from '../common/util'
import { TrialConfig } from '../common/trialConfig';
import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey';
import { GPUScheduler } from './gpuScheduler';
......@@ -169,14 +169,7 @@ class LocalTrainingService implements TrainingService {
return this.getHostJob(trialJobId);
}
if (trialJob.status === 'RUNNING') {
let alive: boolean = false;
try {
await cpp.exec(`kill -0 ${trialJob.pid}`);
alive = true;
} catch (error) {
//ignore
}
let alive: boolean = await isAlive(trialJob.pid);
if (!alive) {
trialJob.endTime = Date.now();
this.setTrialJobStatus(trialJob, 'FAILED');
......@@ -284,7 +277,9 @@ class LocalTrainingService implements TrainingService {
public async setClusterMetadata(key: string, value: string): Promise<void> {
if (!this.initialized) {
this.rootDir = getExperimentRootDir();
await cpp.exec(`mkdir -p ${this.rootDir}`);
if(!fs.existsSync(this.rootDir)){
await cpp.exec(`powershell.exe mkdir ${this.rootDir}`);
}
this.initialized = true;
}
switch (key) {
......@@ -369,7 +364,7 @@ class LocalTrainingService implements TrainingService {
private getEnvironmentVariables(
trialJobDetail: TrialJobDetail,
resource?: { gpuIndices: number[] }): { key: string; value: string }[] {
resource: { gpuIndices: number[] }): { key: string; value: string }[] {
const envVariables: { key: string; value: string }[] = [
{ key: 'NNI_PLATFORM', value: 'local' },
{ key: 'NNI_SYS_DIR', value: trialJobDetail.workingDirectory },
......@@ -379,12 +374,10 @@ class LocalTrainingService implements TrainingService {
{ key: 'MULTI_PHASE', value: this.isMultiPhase.toString() }
];
if (resource !== undefined && resource.gpuIndices.length > 0) {
envVariables.push({
key: 'CUDA_VISIBLE_DEVICES',
value: this.gpuScheduler === undefined ? '' : resource.gpuIndices.join(',')
value: this.gpuScheduler === undefined ? '-1' : resource.gpuIndices.join(',')
});
}
return envVariables;
}
......@@ -467,36 +460,52 @@ class LocalTrainingService implements TrainingService {
}
}
/**
 * Build the platform-specific script lines that run the trial command.
 * The command's stderr is redirected to `<workingDirectory>/stderr`, and the
 * exit code followed by a millisecond timestamp is written to
 * `<workingDirectory>/.nni/state`.
 *
 * @param localTrailConfig trial configuration providing the command to run
 * @param workingDirectory working directory of the trial job
 * @returns the script lines, in execution order
 */
private getScript(localTrailConfig: TrialConfig, workingDirectory: string): string[]{
    const stderrPath: string = path.join(workingDirectory, 'stderr');
    const statePath: string = path.join(workingDirectory, '.nni', 'state');

    if (process.platform !== "win32") {
        return [
            `eval ${localTrailConfig.command} 2>${stderrPath}`,
            `echo $? \`date +%s000\` >${statePath}`
        ];
    }

    // PowerShell: seconds since 1970 with "000" appended to form milliseconds.
    return [
        `cmd /c ${localTrailConfig.command} 2>${stderrPath}`,
        `$NOW_DATE = [int64](([datetime]::UtcNow)-(get-date "1/1/1970")).TotalSeconds`,
        `$NOW_DATE = "$NOW_DATE" + "000"`,
        `Write $LASTEXITCODE " " $NOW_DATE | Out-File ${statePath} -NoNewline -encoding utf8`
    ];
}
private async runTrialJob(trialJobId: string, resource: {gpuIndices: number[]}): Promise<void> {
const trialJobDetail: LocalTrialJobDetail = <LocalTrialJobDetail>this.jobMap.get(trialJobId);
const variables: { key: string; value: string }[] = this.getEnvironmentVariables(trialJobDetail, resource);
const runScriptLines: string[] = [];
if (!this.localTrailConfig) {
throw new Error('trial config is not initialized');
}
runScriptLines.push(
'#!/bin/bash',
`cd ${this.localTrailConfig.codeDir}`);
const runScriptLines: string[] = [];
if (process.platform !== "win32"){
runScriptLines.push('#!/bin/bash');
}
runScriptLines.push(`cd ${this.localTrailConfig.codeDir}`);
for (const variable of variables) {
runScriptLines.push(`export ${variable.key}=${variable.value}`);
runScriptLines.push(setEnvironmentVariable(variable));
}
runScriptLines.push(
`eval ${this.localTrailConfig.command} 2>${path.join(trialJobDetail.workingDirectory, 'stderr')}`,
`echo $? \`date +%s000\` >${path.join(trialJobDetail.workingDirectory, '.nni', 'state')}`);
await cpp.exec(`mkdir -p ${trialJobDetail.workingDirectory}`);
await cpp.exec(`mkdir -p ${path.join(trialJobDetail.workingDirectory, '.nni')}`);
await cpp.exec(`touch ${path.join(trialJobDetail.workingDirectory, '.nni', 'metrics')}`);
await fs.promises.writeFile(
path.join(trialJobDetail.workingDirectory, 'run.sh'), runScriptLines.join('\n'), { encoding: 'utf8', mode: 0o777 });
const scripts: string[] = this.getScript(this.localTrailConfig, trialJobDetail.workingDirectory);
scripts.forEach(script => {
runScriptLines.push(script);
});
await execMkdir(trialJobDetail.workingDirectory);
await execMkdir(path.join(trialJobDetail.workingDirectory, '.nni'));
await execNewFile(path.join(trialJobDetail.workingDirectory, '.nni', 'metrics'));
const scriptName: string = getScriptName('run');
await fs.promises.writeFile(path.join(trialJobDetail.workingDirectory, scriptName), runScriptLines.join(getNewLine()), { encoding: 'utf8', mode: 0o777 });
await this.writeParameterFile(trialJobDetail.workingDirectory, (<TrialJobApplicationForm>trialJobDetail.form).hyperParameters);
const process: cp.ChildProcess = cp.exec(`bash ${path.join(trialJobDetail.workingDirectory, 'run.sh')}`);
const trialJobProcess: cp.ChildProcess = execScript(path.join(trialJobDetail.workingDirectory, scriptName));
this.setTrialJobStatus(trialJobDetail, 'RUNNING');
trialJobDetail.startTime = Date.now();
trialJobDetail.pid = process.pid;
trialJobDetail.pid = trialJobProcess.pid;
this.setExtraProperties(trialJobDetail, resource);
let buffer: Buffer = Buffer.alloc(0);
......
......@@ -46,7 +46,7 @@ import {
RemoteMachineScheduleInfo, RemoteMachineScheduleResult, SSHClient, SSHClientManager,
RemoteMachineTrialJobDetail, ScheduleResultType, REMOTEMACHINE_TRIAL_COMMAND_FORMAT
} from './remoteMachineData';
import { GPU_INFO_COLLECTOR_FORMAT } from '../common/gpuData';
import { GPU_INFO_COLLECTOR_FORMAT_LINUX } from '../common/gpuData';
import { SSHClientUtility } from './sshClientUtility';
import { validateCodeDir } from '../common/util';
import { RemoteMachineJobRestServer } from './remoteMachineJobRestServer';
......@@ -452,7 +452,7 @@ class RemoteMachineTrainingService implements TrainingService {
let gpuMetricsCollectorScriptPath: string = path.join(gpuMetricCollectorScriptFolder, userName, 'gpu_metrics_collector.sh');
const remoteGPUScriptsDir: string = this.getRemoteScriptsPath(userName); // This directory is used to store gpu_metrics and pid created by script
const gpuMetricsCollectorScriptContent: string = String.Format(
GPU_INFO_COLLECTOR_FORMAT,
GPU_INFO_COLLECTOR_FORMAT_LINUX,
remoteGPUScriptsDir,
path.join(remoteGPUScriptsDir, 'pid'),
);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment