Unverified Commit 87ed70cd authored by fishyds's avatar fishyds Committed by GitHub
Browse files

Merge pull request #4 from Microsoft/merge-from-dogfood-v1-0824

[Code merge] Merge code from dogfood-v1 branch
parents f1f6f880 61d47a4d
......@@ -19,9 +19,11 @@
'use strict';
import * as assert from 'assert';
import * as nodeNvidiaSmi from 'node-nvidia-smi';
import { delay } from '../../common/utils';
import { GPUInfo, GPUSummary } from '../common/gpuData';
import { getLogger, Logger } from '../../common/log';
/* Example of nvidia-smi result
{
......@@ -287,9 +289,13 @@ class GPUScheduler {
private gpuSummary!: GPUSummary;
private stopping: boolean;
private log: Logger;
private nvdmNotFoundRegex: RegExp;
/**
 * Initializes the scheduler: not stopping, the logger returned by getLogger(),
 * and the pattern tested against GPU-summary read errors to recognise a
 * missing nvidia-smi binary.
 */
constructor() {
    this.stopping = false;
    this.log = getLogger();
    // Deliberately no `g` flag: RegExp.prototype.test() on a global regex is stateful
    // (it resumes from lastIndex after a match), so reusing one instance across
    // several test() calls could miss a later match. The pattern is only used with test().
    this.nvdmNotFoundRegex = /nvidia-smi: not found/i;
}
public async run(): Promise<void> {
......@@ -297,7 +303,11 @@ class GPUScheduler {
try {
this.gpuSummary = await this.readGPUSummary();
} catch (error) {
console.error('Read GPU summary failed with error', error);
this.log.error('Read GPU summary failed with error: ', error);
// If nvidia-smi command is not found, break the gpu summary reading loop to avoid unnecessary periodically checking
if(this.nvdmNotFoundRegex.test(error)) {
break;
}
}
await delay(5000);
}
......@@ -315,28 +325,42 @@ class GPUScheduler {
this.stopping = true;
}
/**
 * Converts parsed nvidia-smi output into a list of GPUInfo records.
 *
 * `nvidia_smi_log.gpu` is typed as either a single summary object or an array of
 * summaries. The shape is detected from the value itself instead of being inferred
 * from the `attached_gpus` count, so a one-element array or a lone object is
 * handled regardless of the reported count.
 *
 * @param data parsed nvidia-smi result
 * @returns one GPUInfo per GPU summary found in `data`
 * @throws AssertionError when `attached_gpus` does not parse to a positive number
 */
private generateEmbededGPUSummary(data: nodeNvidiaSmi.GPUInfo) : GPUInfo[] {
    const gpuNumber : number = parseInt(data.nvidia_smi_log.attached_gpus, 10);
    assert(gpuNumber > 0);

    const gpu : nodeNvidiaSmi.EmbededGPUSummary | nodeNvidiaSmi.EmbededGPUSummary[] = data.nvidia_smi_log.gpu;
    // Normalize to an array so both shapes go through the same conversion path.
    const summaries : nodeNvidiaSmi.EmbededGPUSummary[] = Array.isArray(gpu) ? gpu : [gpu];

    return summaries.map((summary : nodeNvidiaSmi.EmbededGPUSummary) => this.convertGPUSummaryToInfo(summary));
}
/**
 * Builds a GPUInfo from one nvidia-smi GPU entry.
 *
 * The first constructor argument is 1 when the entry's `process` field is an
 * object and 0 otherwise; the utilization strings are parsed as floats and the
 * minor number as a base-10 integer.
 *
 * @param embededGPUSummary a single GPU entry from the nvidia-smi result
 * @returns the corresponding GPUInfo
 */
private convertGPUSummaryToInfo(embededGPUSummary : nodeNvidiaSmi.EmbededGPUSummary) : GPUInfo {
    const processFlag : number = typeof embededGPUSummary.process === 'object' ? 1 : 0;
    const utilization = embededGPUSummary.utilization;
    const memUtil : number = parseFloat(utilization.memory_util);
    const gpuUtil : number = parseFloat(utilization.gpu_util);
    const minor : number = parseInt(embededGPUSummary.minor_number, 10);

    return new GPUInfo(processFlag, memUtil, gpuUtil, minor);
}
private readGPUSummary(): Promise<GPUSummary> {
return new Promise((resolve: Function, reject: Function): void => {
nodeNvidiaSmi((error: Error, data: nodeNvidiaSmi.GPUInfo) => {
if (error !== undefined) {
if (error) {
reject(error);
} else {
const gpuNumber : number = parseInt(data.nvidia_smi_log.attached_gpus, 10);
const gpuSummary: GPUSummary = new GPUSummary(
parseInt(data.nvidia_smi_log.attached_gpus, 10),
gpuNumber,
Date().toString(),
data.nvidia_smi_log.gpu.map((gpuInfo: {
minor_number: string;
utilization: {
gpu_util: string;
memory_util: string;
};
process: string | object;
}) => new GPUInfo(
typeof gpuInfo.process === 'object' ? 1 : 0,
parseFloat(gpuInfo.utilization.memory_util),
parseFloat(gpuInfo.utilization.gpu_util),
parseInt(gpuInfo.minor_number, 10)
))
this.generateEmbededGPUSummary(data)
);
resolve(gpuSummary);
}
......
......@@ -27,6 +27,8 @@ import * as path from 'path';
import * as ts from 'tail-stream';
import { NNIError, NNIErrorNames } from '../../common/errors';
import { getLogger, Logger } from '../../common/log';
import { TrialConfig } from '../common/trialConfig';
import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey';
import {
HostJobApplicationForm, JobApplicationForm, TrainingService, TrialJobApplicationForm,
TrialJobDetail, TrialJobMetric, TrialJobStatus
......@@ -92,9 +94,8 @@ class LocalTrainingService implements TrainingService {
private initialized: boolean;
private stopping: boolean;
private rootDir!: string;
private codeDir!: string;
private command!: string;
private log: Logger;
protected log: Logger;
protected localTrailConfig?: TrialConfig;
constructor() {
this.eventEmitter = new EventEmitter();
......@@ -227,11 +228,12 @@ class LocalTrainingService implements TrainingService {
this.initialized = true;
}
switch (key) {
case 'codeDir':
this.codeDir = value;
break;
case 'command':
this.command = value;
case TrialConfigMetadataKey.TRIAL_CONFIG:
this.localTrailConfig = <TrialConfig>JSON.parse(value);
// Parse trial config failed, throw Error
if (!this.localTrailConfig) {
throw new Error('trial config parsed failed');
}
break;
default:
}
......@@ -239,10 +241,14 @@ class LocalTrainingService implements TrainingService {
public getClusterMetadata(key: string): Promise<string> {
switch (key) {
case 'codeDir':
return Promise.resolve(this.codeDir);
case 'command':
return Promise.resolve(this.command);
case TrialConfigMetadataKey.TRIAL_CONFIG:
let getResult : Promise<string>;
if(!this.localTrailConfig) {
getResult = Promise.reject(new NNIError(NNIErrorNames.NOT_FOUND, `${key} is never set yet`));
} else {
getResult = Promise.resolve(!this.localTrailConfig? '' : JSON.stringify(this.localTrailConfig));
}
return getResult;
default:
return Promise.reject(new NNIError(NNIErrorNames.NOT_FOUND, 'Key not found'));
}
......@@ -292,14 +298,18 @@ class LocalTrainingService implements TrainingService {
const variables: { key: string; value: string }[] = this.getEnvironmentVariables(trialJobDetail, resource);
const runScriptLines: string[] = [];
if (!this.localTrailConfig) {
throw new Error('trial config is not initialized');
}
runScriptLines.push(
'#!/bin/bash',
`cd ${this.codeDir}`);
`cd ${this.localTrailConfig.codeDir}`);
for (const variable of variables) {
runScriptLines.push(`export ${variable.key}=${variable.value}`);
}
runScriptLines.push(
`eval ${this.command} 2>${path.join(trialJobDetail.workingDirectory, '.nni', 'stderr')}`,
`eval ${this.localTrailConfig.command} 2>${path.join(trialJobDetail.workingDirectory, '.nni', 'stderr')}`,
`echo $? \`date +%s%3N\` >${path.join(trialJobDetail.workingDirectory, '.nni', 'state')}`);
await cpp.exec(`mkdir -p ${trialJobDetail.workingDirectory}`);
......
......@@ -22,6 +22,7 @@
import { TrialJobDetail, TrialJobStatus } from '../../common/trainingService';
import { GPUScheduler } from './gpuScheduler';
import { LocalTrainingService } from './localTrainingService';
import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey';
type LocalTrialJobDetailForGPU = TrialJobDetail & { gpuIndices: number[] };
......@@ -52,8 +53,14 @@ class LocalTrainingServiceForGPU extends LocalTrainingService {
public async setClusterMetadata(key: string, value: string): Promise<void> {
await super.setClusterMetadata(key, value);
switch (key) {
case 'requiredGPUNum':
this.requiredGPUNum = parseInt(value, 10);
case TrialConfigMetadataKey.TRIAL_CONFIG:
if(this.localTrailConfig !== undefined) {
this.requiredGPUNum = this.localTrailConfig.gpuNum;
} else {
// If no valid trial config is initialized, set requiredGPUNum to 0 as fallback value.
this.requiredGPUNum = 0;
}
this.log.info('required GPU number is ' + this.requiredGPUNum);
if (this.gpuScheduler === undefined) {
this.gpuScheduler = new GPUScheduler();
}
......@@ -62,15 +69,6 @@ class LocalTrainingServiceForGPU extends LocalTrainingService {
}
}
/**
 * Looks up a cluster metadata value by key.
 *
 * 'requiredGPUNum' is answered here with the stringified GPU count; every
 * other key is forwarded to the base class implementation.
 *
 * @param key metadata key
 * @returns a promise for the metadata value as a string
 */
public getClusterMetadata(key: string): Promise<string> {
    if (key === 'requiredGPUNum') {
        return Promise.resolve(`${this.requiredGPUNum}`);
    }

    return super.getClusterMetadata(key);
}
public cleanUp(): Promise<void> {
if (this.gpuScheduler !== undefined) {
this.gpuScheduler.stop();
......@@ -80,7 +78,7 @@ class LocalTrainingServiceForGPU extends LocalTrainingService {
}
protected onTrialJobStatusChanged(trialJob: LocalTrialJobDetailForGPU, oldStatus: TrialJobStatus): void {
if (trialJob.gpuIndices.length !== 0) {
if (trialJob.gpuIndices !== undefined && trialJob.gpuIndices.length !== 0) {
if (oldStatus === 'RUNNING' && trialJob.status !== 'RUNNING') {
for (const index of trialJob.gpuIndices) {
this.availableGPUIndices[index] = false;
......
......@@ -24,7 +24,7 @@ import { EventEmitter } from 'events';
import * as path from 'path';
import { Client } from 'ssh2';
import { getLogger, Logger } from '../../common/log';
import { TrialJobStatus } from '../../common/trainingService';
import { TrialJobStatus, TrialJobDetail } from '../../common/trainingService';
import { JobMetrics, RemoteCommandResult, RemoteMachineMeta, RemoteMachineTrialJobDetail } from './remoteMachineData';
import { SSHClientUtility } from './sshClientUtility';
......@@ -56,8 +56,12 @@ export class MetricsCollector {
if (rmMetrics !== undefined && rmMetrics.length > 0) {
rmMetrics.forEach((jobMetrics) => {
const trialJobId : string = jobMetrics.jobId;
const trialJobDetail : RemoteMachineTrialJobDetail = <RemoteMachineTrialJobDetail>this.trialJobsMap.get(trialJobId);
assert(trialJobDetail);
// If job status is not alive again, remove its GPU reservation
if(!['RUNNING'].includes(jobMetrics.jobStatus)) {
trialJobDetail.status = jobMetrics.jobStatus;
this.log.info(`Set trialjob ${trialJobDetail.id} status to ${trialJobDetail.status}`);
runningJobsMap.forEach((jobIds: string[], rmMeta: RemoteMachineMeta) => {
// If remote machine has no GPU, gpuReservcation is not initialized, so check if it's undefined
if(rmMeta.gpuReservation !== undefined) {
......@@ -81,11 +85,19 @@ export class MetricsCollector {
if (status.includes(trialJob.status)) {
if (map.has(trialJob.rmMeta)) {
const ids = map.get(trialJob.rmMeta);
if (ids !== undefined) {
if (ids !== undefined && !ids.includes(id)) {
ids.push(id);
}
} else {
map.set(trialJob.rmMeta, [id]);
let initJobIds : string[] = [id];
// If the remote machine has jobs reserve GPU, also put that jobs into list to get metrics data
if(trialJob.rmMeta.gpuReservation !== undefined) {
const concatJobIds : string[] = initJobIds.concat(Array.from(trialJob.rmMeta.gpuReservation.values()));
initJobIds = concatJobIds.filter((item, pos) => concatJobIds.indexOf(item) === pos);
}
map.set(trialJob.rmMeta, initJobIds);
}
}
});
......
......@@ -23,15 +23,6 @@ import { Client } from 'ssh2';
import { JobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../common/trainingService';
import { GPUSummary } from '../common/gpuData';
/**
* Enum of key for remote machine metadata for configuration
*/
export enum RemoteMachineMetadataKey {
MACHINE_LIST = 'machine_list',
TRIAL_CONFIG = 'trial_config',
EXPERIMENT_ID = 'experimentId',
RANDOM_SCHEDULER = 'random_scheduler'
}
/**
* Metadata of remote machine for configuration and status query
......@@ -54,21 +45,6 @@ export class RemoteMachineMeta {
}
}
/**
* Configuration for trial job on remote machine
*/
export class RemoteMachineTrialConfig {
public readonly command : string;
public readonly codeDir : string;
public readonly gpuNum : number;
constructor(command : string, codeDir : string, gpuNum : number) {
this.command = command;
this.codeDir = codeDir;
this.gpuNum = gpuNum;
}
}
/**
* The execution result for command executed on remote machine
*/
......
......@@ -37,12 +37,14 @@ import {
} from '../../common/trainingService';
import { delay, getExperimentRootDir, uniqueString } from '../../common/utils';
import { GPUSummary } from '../common/gpuData';
import { TrialConfig } from '../common/trialConfig';
import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey';
import { GPUScheduler } from './gpuScheduler';
import { MetricsCollector } from './metricsCollector';
import {
HOSTJOBSHELLFORMAT, RemoteCommandResult, RemoteMachineMeta, RemoteMachineMetadataKey,
HOSTJOBSHELLFORMAT, RemoteCommandResult, RemoteMachineMeta,
REMOTEMACHINERUNSHELLFORMAT, RemoteMachineScheduleInfo, RemoteMachineScheduleResult,
RemoteMachineTrialConfig, RemoteMachineTrialJobDetail, ScheduleResultType
RemoteMachineTrialJobDetail, ScheduleResultType
} from './remoteMachineData';
import { SSHClientUtility } from './sshClientUtility';
......@@ -56,7 +58,7 @@ class RemoteMachineTrainingService implements TrainingService {
// Experiment root directory
private expRootDir: string;
private remoteExpRootDir: string;
private trialConfig: RemoteMachineTrialConfig | undefined;
private trialConfig: TrialConfig | undefined;
private gpuScheduler: GPUScheduler;
private jobQueue: string[];
private timer: ObservableTimer;
......@@ -93,7 +95,7 @@ class RemoteMachineTrainingService implements TrainingService {
// Wait to schedule job in next time iteration
break;
}
};
}
const metricsCollector: MetricsCollector = new MetricsCollector(
this.machineSSHClientMap, this.trialJobsMap, this.remoteExpRootDir, this.metricsEmitter);
await metricsCollector.collectMetrics();
......@@ -186,6 +188,7 @@ class RemoteMachineTrainingService implements TrainingService {
form);
this.jobQueue.push(trialJobId);
this.trialJobsMap.set(trialJobId, trialJobDetail);
return Promise.resolve(trialJobDetail);
} else {
return Promise.reject(new Error(`Job form not supported: ${JSON.stringify(form)}, jobType should be HOST or TRIAL.`));
......@@ -207,7 +210,7 @@ class RemoteMachineTrainingService implements TrainingService {
// Remove the job with trialJobId from job queue
const index : number = this.jobQueue.indexOf(trialJobId);
if(index >= 0) {
if (index >= 0) {
this.jobQueue.splice(index, 1);
}
......@@ -243,11 +246,11 @@ class RemoteMachineTrainingService implements TrainingService {
*/
public async setClusterMetadata(key: string, value: string): Promise<void> {
switch (key) {
case RemoteMachineMetadataKey.MACHINE_LIST:
case TrialConfigMetadataKey.MACHINE_LIST:
await this.setupConnections(value);
break;
case RemoteMachineMetadataKey.TRIAL_CONFIG:
const remoteMachineTrailConfig: RemoteMachineTrialConfig = <RemoteMachineTrialConfig>JSON.parse(value);
case TrialConfigMetadataKey.TRIAL_CONFIG:
const remoteMachineTrailConfig: TrialConfig = <TrialConfig>JSON.parse(value);
// Parse trial config failed, throw Error
if (!remoteMachineTrailConfig) {
throw new Error('trial config parsed failed');
......@@ -351,7 +354,7 @@ class RemoteMachineTrainingService implements TrainingService {
this.log.error(errorMessage);
deferred.reject();
throw new NNIError(NNIErrorNames.RESOURCE_NOT_AVAILABLE, errorMessage);
} else if(rmScheduleResult.resultType == ScheduleResultType.SUCCEED
} else if (rmScheduleResult.resultType === ScheduleResultType.SUCCEED
&& rmScheduleResult.scheduleInfo !== undefined) {
const rmScheduleInfo : RemoteMachineScheduleInfo = rmScheduleResult.scheduleInfo;
const trialWorkingFolder: string = path.join(this.remoteExpRootDir, 'trials', trialJobId);
......@@ -364,11 +367,11 @@ class RemoteMachineTrainingService implements TrainingService {
trialJobDetail.rmMeta = rmScheduleInfo.rmMeta;
deferred.resolve(true);
} else if(rmScheduleResult.resultType == ScheduleResultType.TMP_NO_AVAILABLE_GPU) {
} else if (rmScheduleResult.resultType === ScheduleResultType.TMP_NO_AVAILABLE_GPU) {
this.log.info(`Right now no available GPU can be allocated for trial ${trialJobId}, will try to schedule later`);
deferred.resolve(false);
} else {
deferred.reject('Invalid schedule resutl type: ' + rmScheduleResult.resultType);
deferred.reject(`Invalid schedule resutl type: ${rmScheduleResult.resultType}`);
}
return deferred.promise;
......
......@@ -19,14 +19,16 @@
'use strict';
import * as assert from 'assert';
import * as cpp from 'child-process-promise';
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
import { Client, ClientChannel, SFTPWrapper } from 'ssh2';
import * as stream from "stream";
import * as stream from 'stream';
import { Deferred } from 'ts-deferred';
import { NNIError, NNIErrorNames } from '../../common/errors';
import { getExperimentRootDir } from '../../common/utils';
import { getLogger } from '../../common/log';
import { uniqueString } from '../../common/utils';
import { RemoteCommandResult } from './remoteMachineData';
/**
......@@ -43,17 +45,18 @@ export namespace SSHClientUtility {
*/
export async function copyDirectoryToRemote(localDirectory : string, remoteDirectory : string, sshClient : Client) : Promise<void> {
const deferred: Deferred<void> = new Deferred<void>();
const localCompressedDir: string = path.join(getExperimentRootDir(), 'directory.tar.gz');
const remoteCompressedDir: string = path.join(remoteDirectory, 'directory.tar.gz');
const tmpTarName: string = `${uniqueString(10)}.tar.gz`;
const localTarPath: string = path.join(os.tmpdir(), tmpTarName);
const remoteTarPath: string = path.join(os.tmpdir(), tmpTarName);
// Compress files in local directory to experiment root directory
await cpp.exec(`tar -czf ${localCompressedDir} -C ${localDirectory} .`);
await cpp.exec(`tar -czf ${localTarPath} -C ${localDirectory} .`);
// Copy the compressed file to remoteDirectory and delete it
await copyFileToRemote(localCompressedDir, remoteCompressedDir, sshClient);
await cpp.exec(`rm ${localCompressedDir}`);
await copyFileToRemote(localTarPath, remoteTarPath, sshClient);
await cpp.exec(`rm ${localTarPath}`);
// Decompress the remote compressed file in and delete it
await remoteExeCommand(`tar -oxzf ${remoteCompressedDir} -C ${remoteDirectory}`, sshClient);
await remoteExeCommand(`rm ${remoteCompressedDir}`, sshClient);
await remoteExeCommand(`tar -oxzf ${remoteTarPath} -C ${remoteDirectory}`, sshClient);
await remoteExeCommand(`rm ${remoteTarPath}`, sshClient);
deferred.resolve();
return deferred.promise;
......@@ -65,18 +68,23 @@ export namespace SSHClientUtility {
* @param remoteFilePath the target path in remote machine
* @param sshClient SSH Client
*/
export function copyFileToRemote(localFilePath : string, remoteFilePath : string, sshClient : Client) : Promise<string> {
const deferred: Deferred<string> = new Deferred<string>();
export function copyFileToRemote(localFilePath : string, remoteFilePath : string, sshClient : Client) : Promise<boolean> {
assert(sshClient !== undefined);
const deferred: Deferred<boolean> = new Deferred<boolean>();
sshClient.sftp((err : Error, sftp : SFTPWrapper) => {
if (err) {
deferred.reject();
getLogger().error(`copyFileToRemote: ${err.message}, ${localFilePath}, ${remoteFilePath}`);
deferred.reject(err);
return;
}
assert(sftp !== undefined);
sftp.fastPut(localFilePath, remoteFilePath, (fastPutErr : Error) => {
sftp.end();
if (fastPutErr) {
deferred.reject();
deferred.reject(fastPutErr);
} else {
deferred.resolve('success');
deferred.resolve(true);
}
});
});
......@@ -97,14 +105,16 @@ export namespace SSHClientUtility {
client.exec(command, (err : Error, channel : ClientChannel) => {
if (err) {
getLogger().error(`remoteExeCommand: ${err.message}`);
deferred.reject(err);
return;
}
channel.on('data', function(data : any, dataStderr : any) {
channel.on('data', (data : any, dataStderr : any) => {
if (dataStderr) {
stderr += data.toString();
}
else {
} else {
stdout += data.toString();
}
}).on('exit', (code, signal) => {
......@@ -124,7 +134,10 @@ export namespace SSHClientUtility {
const deferred: Deferred<string> = new Deferred<string>();
sshClient.sftp((err: Error, sftp : SFTPWrapper) => {
if (err) {
getLogger().error(`getRemoteFileContent: ${err.message}`);
deferred.reject(new Error(`SFTP error: ${err.message}`));
return;
}
try {
const sftpStream : stream.Readable = sftp.createReadStream(filePath);
......@@ -133,11 +146,16 @@ export namespace SSHClientUtility {
sftpStream.on('data', (data : Buffer | string) => {
dataBuffer += data;
}).on('error', (streamErr: Error) => {
sftp.end();
deferred.reject(new NNIError(NNIErrorNames.NOT_FOUND, streamErr.message));
}).on('end', () => {
// sftp connection need to be released manually once operation is done
sftp.end();
deferred.resolve(dataBuffer);
});
} catch (error) {
getLogger().error(`getRemoteFileContent: ${error.message}`);
sftp.end();
deferred.reject(new Error(`SFTP error: ${error.message}`));
}
});
......
......@@ -27,7 +27,7 @@ import * as tmp from 'tmp';
import * as component from '../../common/component';
import { TrialJobApplicationForm, TrialJobDetail, TrainingService } from '../../common/trainingService';
import { cleanupUnitTest, delay, prepareUnitTest } from '../../common/utils';
import { RemoteMachineMetadataKey } from '../remote_machine/remoteMachineData';
import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey';
import { RemoteMachineTrainingService } from '../remote_machine/remoteMachineTrainingService';
// copy mockedTrail.py to local folder
......@@ -95,9 +95,9 @@ describe('Unit Test for RemoteMachineTrainingService', () => {
if (skip) {
return;
}
await remoteMachineTrainingService.setClusterMetadata(RemoteMachineMetadataKey.MACHINE_LIST, machineList);
await remoteMachineTrainingService.setClusterMetadata(TrialConfigMetadataKey.MACHINE_LIST, machineList);
await remoteMachineTrainingService.setClusterMetadata(
RemoteMachineMetadataKey.TRIAL_CONFIG, `{"command":"sleep 1h && echo ","codeDir":"${localCodeDir}","gpuNum":1}`);
TrialConfigMetadataKey.TRIAL_CONFIG, `{"command":"sleep 1h && echo ","codeDir":"${localCodeDir}","gpuNum":1}`);
const form: TrialJobApplicationForm = {
jobType: 'TRIAL',
hyperParameters: 'mock hyperparameters'
......@@ -126,11 +126,11 @@ describe('Unit Test for RemoteMachineTrainingService', () => {
return;
}
// set machine list'
await remoteMachineTrainingService.setClusterMetadata(RemoteMachineMetadataKey.MACHINE_LIST, machineList);
await remoteMachineTrainingService.setClusterMetadata(TrialConfigMetadataKey.MACHINE_LIST, machineList);
// set meta data
const trialConfig: string = `{\"command\":\"python3 mockedTrial.py\", \"codeDir\":\"${localCodeDir}\",\"gpuNum\":0}`
await remoteMachineTrainingService.setClusterMetadata(RemoteMachineMetadataKey.TRIAL_CONFIG, trialConfig);
await remoteMachineTrainingService.setClusterMetadata(TrialConfigMetadataKey.TRIAL_CONFIG, trialConfig);
// submit job
const form: TrialJobApplicationForm = {
......
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'use strict';
import * as cpp from 'child-process-promise';
import * as fs from 'fs';
import { Client } from 'ssh2';
import { Deferred } from 'ts-deferred';
import { SSHClientUtility } from '../remote_machine/sshClientUtility';
const LOCALFILE: string = '/tmp/sshclientUTData';
const REMOTEFILE: string = '/tmp/sshclientUTData';
/**
 * Uploads LOCALFILE to REMOTEFILE over a fresh SFTP session on `conn`.
 *
 * The SFTP session is ended once the transfer callback fires, whether or not
 * the transfer succeeded.
 *
 * @param conn SSH client used to open the SFTP session
 * @returns a promise that resolves when the upload finishes and rejects with
 *          the SFTP or transfer error otherwise
 */
async function copyFile(conn: Client): Promise<void> {
    return new Promise<void>((resolve, reject) => {
        conn.sftp((err, sftp) => {
            if (err) {
                reject(err);

                return;
            }

            const onTransferDone = (fastPutErr: Error): void => {
                // Release the SFTP session before settling the promise.
                sftp.end();
                if (!fastPutErr) {
                    resolve();
                } else {
                    reject(fastPutErr);
                }
            };

            sftp.fastPut(LOCALFILE, REMOTEFILE, onTransferDone);
        });
    });
}
/**
 * Calls SSHClientUtility.copyFileToRemote 500 times in sequence on the same
 * connection, logging the iteration index before each call.
 *
 * @param conn SSH client shared by all iterations
 */
async function copyFileToRemoteLoop(conn: Client): Promise<void> {
    let round: number = 0;
    while (round < 500) {
        console.log(round);
        await SSHClientUtility.copyFileToRemote(LOCALFILE, REMOTEFILE, conn);
        round += 1;
    }
}
/**
 * Runs `ls` through SSHClientUtility.remoteExeCommand 500 times in sequence on
 * the same connection, logging the iteration index before each call.
 *
 * @param conn SSH client shared by all iterations
 */
async function remoteExeCommandLoop(conn: Client): Promise<void> {
    let round: number = 0;
    while (round < 500) {
        console.log(round);
        await SSHClientUtility.remoteExeCommand('ls', conn);
        round += 1;
    }
}
/**
 * Reads REMOTEFILE through SSHClientUtility.getRemoteFileContent 500 times in
 * sequence on the same connection, logging the iteration index before each call.
 *
 * @param conn SSH client shared by all iterations
 */
async function getRemoteFileContentLoop(conn: Client): Promise<void> {
    let round: number = 0;
    while (round < 500) {
        console.log(round);
        await SSHClientUtility.getRemoteFileContent(REMOTEFILE, conn);
        round += 1;
    }
}
/**
 * Stress test for SSHClientUtility: after an initial upload, several upload,
 * command and read loops run concurrently over one SSH connection.
 * Connection settings are read from ../../.vscode/rminfo.json.
 */
describe('sshClientUtility test', () => {
    // NOTE(review): `skip` starts as true and is only ever assigned true, so the test body
    // below never runs even when rminfo.json is readable. Presumably this keeps the stress
    // test disabled on purpose — confirm, and initialise to false if it is meant to run.
    let skip: boolean = true;
    let rmMeta: any;
    try {
        rmMeta = JSON.parse(fs.readFileSync('../../.vscode/rminfo.json', 'utf8'));
    } catch (err) {
        skip = true;
    }

    // Create the local fixture file that the loops upload.
    before(async () => {
        await cpp.exec(`echo '1234' > ${LOCALFILE}`);
    });

    // Remove the local fixture file.
    after(() => {
        fs.unlinkSync(LOCALFILE);
    });

    it('Test SSHClientUtility', (done) => {
        if (skip) {
            done();

            return;
        }

        const conn: Client = new Client();

        // Report the outcome to mocha exactly once and always release the connection.
        let finished: boolean = false;
        const finish = (err?: any): void => {
            if (finished) {
                return;
            }
            finished = true;
            conn.end();
            done(err);
        };

        conn.on('ready', async () => {
            try {
                await copyFile(conn);
                await Promise.all([
                    copyFileToRemoteLoop(conn),
                    copyFileToRemoteLoop(conn),
                    copyFileToRemoteLoop(conn),
                    remoteExeCommandLoop(conn),
                    getRemoteFileContentLoop(conn)
                ]);
                finish();
            } catch (err) {
                // Without this, a rejected await is an unhandled rejection and the test
                // only fails by timing out.
                finish(err);
            }
        }).on('error', (connErr: Error) => {
            // Connection-level failures (e.g. authentication or network errors) fail the test directly.
            finish(connErr);
        }).connect(rmMeta);
    });
});
......@@ -2,17 +2,19 @@ declare module 'node-nvidia-smi' {
function smi(callback: (error: Error, data: smi.GPUInfo) => void): void;
namespace smi {
interface GPUInfo {
nvidia_smi_log: {
attached_gpus: string;
gpu: {
interface EmbededGPUSummary {
minor_number: string;
utilization: {
gpu_util: string;
memory_util: string;
};
process: string | object;
}[];
}
interface GPUInfo {
nvidia_smi_log: {
attached_gpus: string;
gpu: EmbededGPUSummary[] | EmbededGPUSummary;
};
}
}
......
......@@ -18,7 +18,7 @@
# OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# ==================================================================================================
import logging
from enum import Enum
......@@ -64,8 +64,14 @@ def receive():
Returns a tuple of command (CommandType) and payload (str)
"""
header = _in_file.read(8)
logging.getLogger(__name__).debug('Received command, header: [%s]' % header)
if header is None or len(header) < 8:
# Pipe EOF encountered
logging.getLogger(__name__).debug('Pipe EOF encountered')
return None, None
length = int(header[2:])
data = _in_file.read(length)
command = CommandType(header[:2])
data = data.decode('utf8')
logging.getLogger(__name__).debug('Received command, data: [%s]' % data)
return command, data
......@@ -127,6 +127,8 @@ def _handle_request(tuner):
_logger.debug('waiting receive_message')
command, data = receive()
if command is None:
return False
_logger.debug(command)
_logger.debug(data)
......
......@@ -18,7 +18,6 @@ Click the tab "Overview".
* See the trials with good performance.
* See search_space json.
* See complete trial cdf graph.
### View job accuracy
......@@ -29,7 +28,7 @@ Click the tab "Optimization Progress" to see the point graph of all trials. Hove
Click the tab "Hyper Parameter" to see the parallel graph.
* You can select the percentage to cut down some lines.
* Choose two axes to swap its positions
* Choose two axes to swap their positions
### View trial status
......@@ -39,11 +38,10 @@ Click the tab "Trial Status" to see the status of the all trials. Specifically:
* Trial detail: trial's id, trial's duration, start time, end time, status and accuracy.
* Kill: you can kill a job whose status is running.
* Tensor: you can view a job's TensorFlow graph; it links to the TensorBoard page.
* Log: click the button to see the logs of NNI and PAI.
### Control
Click the tab "Control" to add a new trial or update the search_space file.
Click the tab "Control" to add a new trial or update the search_space file and some experiment parameters.
### View Tensorboard Graph
......
......@@ -4,22 +4,6 @@
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
<meta name="theme-color" content="#000000">
<!--
manifest.json provides metadata used when your web app is added to the
homescreen on Android. See https://developers.google.com/web/fundamentals/engage-and-retain/web-app-manifest/
-->
<link rel="manifest" href="%PUBLIC_URL%/manifest.json">
<link rel="shortcut icon" href="%PUBLIC_URL%/icon.jpg">
<!--
Notice the use of %PUBLIC_URL% in the tags above.
It will be replaced with the URL of the `public` folder during the build.
Only files inside the `public` folder can be referenced from the HTML.
Unlike "/favicon.ico" or "favicon.ico", "%PUBLIC_URL%/favicon.ico" will
work correctly both with client-side routing and a non-root public URL.
Learn how to configure a non-root public URL by running `npm run build`.
-->
<title>Neural Network Intelligence</title>
</head>
......
.header_title{
width: 100%;
height: 60px;
line-height: 60px;
font-size: 24px;
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
color: white;
background-color: rgb(60,141,188) ;
user-select: none;
text-align: center;
......
......@@ -6,7 +6,8 @@ class App extends React.Component<{}, {}> {
render () {
return (
<div className="App">
<header className="header_title"><img src={require('./logo.jpg')} alt=""/></header>
{/* <header className="header_title"><img src={require('./logo.jpg')} alt=""/></header> */}
<header className="header_title">Neural Network Intelligence</header>
<div className="content">
<SlideBar />
<div className="right">{this.props.children}</div>
......
......@@ -203,7 +203,7 @@ class Control extends React.Component<{}, ControlState> {
// update trial number parameters
trialParameterMess = (exper: Experiments, str: string) => {
this.getUpdateExample();
axios(`${MANAGER_IP}/experiment`, {
method: 'PUT',
headers: {
......@@ -216,6 +216,7 @@ class Control extends React.Component<{}, ControlState> {
}).then(res => {
if (res.status === 200) {
message.success(`Update ${str.toLocaleLowerCase()} successfully`);
this.getUpdateExample();
} else {
message.error(`Update ${str.toLocaleLowerCase()} failed`);
}
......@@ -284,8 +285,8 @@ class Control extends React.Component<{}, ControlState> {
}
userUpdateSeaspace = () => {
this.updateSearchLoad();
this.getUpdateExample();
const { updateSearch } = this.state;
if (updateSearch !== '' || updateSearch !== null) {
const { experiment } = this.state;
......
......@@ -2,12 +2,12 @@ import * as React from 'react';
import axios from 'axios';
import { Table, Select, Row, Col, Icon } from 'antd';
import { MANAGER_IP, overviewItem, roundNum } from '../const';
import ReactEcharts from 'echarts-for-react';
// import ReactEcharts from 'echarts-for-react';
const Option = Select.Option;
import JSONTree from 'react-json-tree';
require('echarts/lib/chart/line');
require('echarts/lib/component/tooltip');
require('echarts/lib/component/title');
// require('echarts/lib/chart/line');
// require('echarts/lib/component/tooltip');
// require('echarts/lib/component/title');
require('../style/sessionpro.css');
interface TableObj {
......@@ -266,26 +266,26 @@ class Sessionpro extends React.Component<{}, SessionState> {
});
}
// draw CDF
const { trialRun } = this.state;
if (this._isMounted) {
this.setState({
option: this.getOption(trialRun)
});
}
// const { trialRun } = this.state;
// if (this._isMounted) {
// this.setState({
// option: this.getOption(trialRun)
// });
// }
// CDF graph 'No data' judge
if (trialRun.length === 0) {
if (this._isMounted) {
this.setState({
noData: 'No data'
});
}
} else {
if (this._isMounted) {
this.setState({
noData: ''
});
}
}
// if (trialRun.length === 0) {
// if (this._isMounted) {
// this.setState({
// noData: 'No data'
// });
// }
// } else {
// if (this._isMounted) {
// this.setState({
// noData: ''
// });
// }
// }
}
});
}
......@@ -372,7 +372,8 @@ class Sessionpro extends React.Component<{}, SessionState> {
};
const {
trialProfile, searchSpace, tunerAssessor, tableData, option, noData
trialProfile, searchSpace, tunerAssessor, tableData,
// option, noData
} = this.state;
let running;
if (trialProfile.endTime === 'not over') {
......@@ -500,13 +501,13 @@ class Sessionpro extends React.Component<{}, SessionState> {
scroll={{ x: '100%', y: 540 }}
/>
</div>
<div className="cdf">
{/* <div className="cdf">
<ReactEcharts
option={option}
style={{ height: 500, padding: '0px' }}
/>
<div className="addNodata">{noData}</div>
</div>
</div> */}
</div>
);
}
......
......@@ -43,7 +43,7 @@ WARNING_INFO = 'Waining: %s'
EXPERIMENT_SUCCESS_INFO = 'Start experiment success! The experiment id is %s, and the restful server post is %s.\n' \
'You can use these commands to get more information about this experiment:\n' \
' commands description\n' \
'1. nnictl experiment ls list all of experiments\n' \
'1. nnictl experiment show show the information of experiments\n' \
'2. nnictl trial ls list all of trial jobs\n' \
'3. nnictl stop stop a experiment\n' \
'4. nnictl trial kill kill a trial job by id\n' \
......
......@@ -54,13 +54,20 @@ def start_rest_server(manager, port, platform, mode, experiment_id=None):
process = Popen(cmds, stdout=stdout_file, stderr=stderr_file)
return process
def set_local_config(experiment_config, port):
'''Call setClusterMetadata (rest PUT /parameters/cluster-metadata) to pass platform and machineList"'''
def set_trial_config(experiment_config, port):
'''set trial configuration'''
request_data = dict()
request_data['codeDir'] = experiment_config['trial']['trialCodeDir']
request_data['command'] = experiment_config['trial']['trialCommand']
value_dict = dict()
value_dict['command'] = experiment_config['trial']['trialCommand']
value_dict['codeDir'] = experiment_config['trial']['trialCodeDir']
value_dict['gpuNum'] = experiment_config['trial']['trialGpuNum']
request_data['trial_config'] = value_dict
response = rest_put(cluster_metadata_url(port), json.dumps(request_data), 20)
return True if response and response.status_code == 200 else False
return True if response.status_code == 200 else False
def set_local_config(experiment_config, port):
    '''Set local configuration by delegating to set_trial_config and returning its result.'''
    result = set_trial_config(experiment_config, port)
    return result
def set_remote_config(experiment_config, port):
'''Call setClusterMetadata to pass trial'''
......@@ -72,14 +79,7 @@ def set_remote_config(experiment_config, port):
return False
#set trial_config
request_data = dict()
value_dict = dict()
value_dict['command'] = experiment_config['trial']['trialCommand']
value_dict['codeDir'] = experiment_config['trial']['trialCodeDir']
value_dict['gpuNum'] = experiment_config['trial']['trialGpuNum']
request_data['trial_config'] = value_dict
response = rest_put(cluster_metadata_url(port), json.dumps(request_data), 20)
return True if response.status_code == 200 else False
return set_trial_config(experiment_config, port)
def set_experiment(experiment_config, mode, port):
'''Call startExperiment (rest POST /experiment) with yaml file content'''
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment