"src/git@developer.sourcefind.cn:gaoqiong/migraphx.git" did not exist on "f73162a75c16e818dcebc6bae397e40d82b04b5c"
Commit 61d47a4d authored by Deshui Yu's avatar Deshui Yu
Browse files

[Code merge] Merge code from dogfood-v1 branch

parent f1f6f880
...@@ -19,9 +19,11 @@ ...@@ -19,9 +19,11 @@
'use strict'; 'use strict';
import * as assert from 'assert';
import * as nodeNvidiaSmi from 'node-nvidia-smi'; import * as nodeNvidiaSmi from 'node-nvidia-smi';
import { delay } from '../../common/utils'; import { delay } from '../../common/utils';
import { GPUInfo, GPUSummary } from '../common/gpuData'; import { GPUInfo, GPUSummary } from '../common/gpuData';
import { getLogger, Logger } from '../../common/log';
/* Example of nvidia-smi result /* Example of nvidia-smi result
{ {
...@@ -287,9 +289,13 @@ class GPUScheduler { ...@@ -287,9 +289,13 @@ class GPUScheduler {
private gpuSummary!: GPUSummary; private gpuSummary!: GPUSummary;
private stopping: boolean; private stopping: boolean;
private log: Logger;
private nvdmNotFoundRegex: RegExp;
/**
 * Set up the scheduler's initial state: not stopping, a logger obtained from
 * getLogger(), and the pattern used to recognise a missing nvidia-smi binary.
 */
constructor() {
    this.stopping = false;
    this.log = getLogger();
    // Deliberately no `g` flag: a global RegExp keeps `lastIndex` between calls, so
    // repeated `.test()` calls on the same instance can start mid-string and miss a
    // match. This pattern is only used for a yes/no check, so case-insensitivity is enough.
    this.nvdmNotFoundRegex = /nvidia-smi: not found/i;
}
public async run(): Promise<void> { public async run(): Promise<void> {
...@@ -297,7 +303,11 @@ class GPUScheduler { ...@@ -297,7 +303,11 @@ class GPUScheduler {
try { try {
this.gpuSummary = await this.readGPUSummary(); this.gpuSummary = await this.readGPUSummary();
} catch (error) { } catch (error) {
console.error('Read GPU summary failed with error', error); this.log.error('Read GPU summary failed with error: ', error);
// If the nvidia-smi command is not found, break out of the GPU summary reading loop to avoid pointless periodic checks
if(this.nvdmNotFoundRegex.test(error)) {
break;
}
} }
await delay(5000); await delay(5000);
} }
...@@ -315,28 +325,42 @@ class GPUScheduler { ...@@ -315,28 +325,42 @@ class GPUScheduler {
this.stopping = true; this.stopping = true;
} }
/**
 * Build the per-GPU info list from a parsed nvidia-smi result.
 *
 * `nvidia_smi_log.gpu` is handled both as a single summary object and as an array of
 * summaries. The shape is detected from the value itself (Array.isArray) rather than
 * inferred from `attached_gpus`, so a mismatch between the reported count and the actual
 * payload shape no longer ends in a bad cast.
 *
 * @param data parsed nvidia-smi output
 * @returns one GPUInfo per entry found in `data.nvidia_smi_log.gpu`
 * @throws AssertionError when `attached_gpus` does not parse to a number greater than 0
 */
private generateEmbededGPUSummary(data: nodeNvidiaSmi.GPUInfo) : GPUInfo[] {
    const gpuNumber : number = parseInt(data.nvidia_smi_log.attached_gpus, 10);
    assert(gpuNumber > 0);

    // Normalise the payload to an array, whichever shape it arrived in.
    const rawGpu = data.nvidia_smi_log.gpu;
    const embededGPUSummaryArray : nodeNvidiaSmi.EmbededGPUSummary[] = Array.isArray(rawGpu)
        ? <nodeNvidiaSmi.EmbededGPUSummary[]>rawGpu
        : [<nodeNvidiaSmi.EmbededGPUSummary>rawGpu];

    return embededGPUSummaryArray.map(
        (embededGPUSummary : nodeNvidiaSmi.EmbededGPUSummary) => this.convertGPUSummaryToInfo(embededGPUSummary));
}
/**
 * Translate one raw nvidia-smi GPU entry into a GPUInfo.
 *
 * The GPUInfo constructor receives, in order: 1 if `process` is an object (0 otherwise),
 * the parsed `utilization.memory_util`, the parsed `utilization.gpu_util`, and the
 * `minor_number` parsed as a base-10 integer.
 *
 * @param embededGPUSummary a single GPU entry from the nvidia-smi result
 * @returns the GPUInfo built from that entry
 */
private convertGPUSummaryToInfo(embededGPUSummary : nodeNvidiaSmi.EmbededGPUSummary) : GPUInfo {
    const processFlag : number = typeof embededGPUSummary.process === 'object' ? 1 : 0;
    const memoryUtil : number = parseFloat(embededGPUSummary.utilization.memory_util);
    const gpuUtil : number = parseFloat(embededGPUSummary.utilization.gpu_util);
    const minorNumber : number = parseInt(embededGPUSummary.minor_number, 10);

    return new GPUInfo(processFlag, memoryUtil, gpuUtil, minorNumber);
}
private readGPUSummary(): Promise<GPUSummary> { private readGPUSummary(): Promise<GPUSummary> {
return new Promise((resolve: Function, reject: Function): void => { return new Promise((resolve: Function, reject: Function): void => {
nodeNvidiaSmi((error: Error, data: nodeNvidiaSmi.GPUInfo) => { nodeNvidiaSmi((error: Error, data: nodeNvidiaSmi.GPUInfo) => {
if (error !== undefined) { if (error) {
reject(error); reject(error);
} else { } else {
const gpuNumber : number = parseInt(data.nvidia_smi_log.attached_gpus, 10);
const gpuSummary: GPUSummary = new GPUSummary( const gpuSummary: GPUSummary = new GPUSummary(
parseInt(data.nvidia_smi_log.attached_gpus, 10), gpuNumber,
Date().toString(), Date().toString(),
data.nvidia_smi_log.gpu.map((gpuInfo: { this.generateEmbededGPUSummary(data)
minor_number: string;
utilization: {
gpu_util: string;
memory_util: string;
};
process: string | object;
}) => new GPUInfo(
typeof gpuInfo.process === 'object' ? 1 : 0,
parseFloat(gpuInfo.utilization.memory_util),
parseFloat(gpuInfo.utilization.gpu_util),
parseInt(gpuInfo.minor_number, 10)
))
); );
resolve(gpuSummary); resolve(gpuSummary);
} }
......
...@@ -27,6 +27,8 @@ import * as path from 'path'; ...@@ -27,6 +27,8 @@ import * as path from 'path';
import * as ts from 'tail-stream'; import * as ts from 'tail-stream';
import { NNIError, NNIErrorNames } from '../../common/errors'; import { NNIError, NNIErrorNames } from '../../common/errors';
import { getLogger, Logger } from '../../common/log'; import { getLogger, Logger } from '../../common/log';
import { TrialConfig } from '../common/trialConfig';
import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey';
import { import {
HostJobApplicationForm, JobApplicationForm, TrainingService, TrialJobApplicationForm, HostJobApplicationForm, JobApplicationForm, TrainingService, TrialJobApplicationForm,
TrialJobDetail, TrialJobMetric, TrialJobStatus TrialJobDetail, TrialJobMetric, TrialJobStatus
...@@ -92,9 +94,8 @@ class LocalTrainingService implements TrainingService { ...@@ -92,9 +94,8 @@ class LocalTrainingService implements TrainingService {
private initialized: boolean; private initialized: boolean;
private stopping: boolean; private stopping: boolean;
private rootDir!: string; private rootDir!: string;
private codeDir!: string; protected log: Logger;
private command!: string; protected localTrailConfig?: TrialConfig;
private log: Logger;
constructor() { constructor() {
this.eventEmitter = new EventEmitter(); this.eventEmitter = new EventEmitter();
...@@ -227,11 +228,12 @@ class LocalTrainingService implements TrainingService { ...@@ -227,11 +228,12 @@ class LocalTrainingService implements TrainingService {
this.initialized = true; this.initialized = true;
} }
switch (key) { switch (key) {
case 'codeDir': case TrialConfigMetadataKey.TRIAL_CONFIG:
this.codeDir = value; this.localTrailConfig = <TrialConfig>JSON.parse(value);
break; // Parse trial config failed, throw Error
case 'command': if (!this.localTrailConfig) {
this.command = value; throw new Error('trial config parsed failed');
}
break; break;
default: default:
} }
...@@ -239,10 +241,14 @@ class LocalTrainingService implements TrainingService { ...@@ -239,10 +241,14 @@ class LocalTrainingService implements TrainingService {
public getClusterMetadata(key: string): Promise<string> { public getClusterMetadata(key: string): Promise<string> {
switch (key) { switch (key) {
case 'codeDir': case TrialConfigMetadataKey.TRIAL_CONFIG:
return Promise.resolve(this.codeDir); let getResult : Promise<string>;
case 'command': if(!this.localTrailConfig) {
return Promise.resolve(this.command); getResult = Promise.reject(new NNIError(NNIErrorNames.NOT_FOUND, `${key} is never set yet`));
} else {
getResult = Promise.resolve(!this.localTrailConfig? '' : JSON.stringify(this.localTrailConfig));
}
return getResult;
default: default:
return Promise.reject(new NNIError(NNIErrorNames.NOT_FOUND, 'Key not found')); return Promise.reject(new NNIError(NNIErrorNames.NOT_FOUND, 'Key not found'));
} }
...@@ -292,14 +298,18 @@ class LocalTrainingService implements TrainingService { ...@@ -292,14 +298,18 @@ class LocalTrainingService implements TrainingService {
const variables: { key: string; value: string }[] = this.getEnvironmentVariables(trialJobDetail, resource); const variables: { key: string; value: string }[] = this.getEnvironmentVariables(trialJobDetail, resource);
const runScriptLines: string[] = []; const runScriptLines: string[] = [];
if (!this.localTrailConfig) {
throw new Error('trial config is not initialized');
}
runScriptLines.push( runScriptLines.push(
'#!/bin/bash', '#!/bin/bash',
`cd ${this.codeDir}`); `cd ${this.localTrailConfig.codeDir}`);
for (const variable of variables) { for (const variable of variables) {
runScriptLines.push(`export ${variable.key}=${variable.value}`); runScriptLines.push(`export ${variable.key}=${variable.value}`);
} }
runScriptLines.push( runScriptLines.push(
`eval ${this.command} 2>${path.join(trialJobDetail.workingDirectory, '.nni', 'stderr')}`, `eval ${this.localTrailConfig.command} 2>${path.join(trialJobDetail.workingDirectory, '.nni', 'stderr')}`,
`echo $? \`date +%s%3N\` >${path.join(trialJobDetail.workingDirectory, '.nni', 'state')}`); `echo $? \`date +%s%3N\` >${path.join(trialJobDetail.workingDirectory, '.nni', 'state')}`);
await cpp.exec(`mkdir -p ${trialJobDetail.workingDirectory}`); await cpp.exec(`mkdir -p ${trialJobDetail.workingDirectory}`);
......
...@@ -22,6 +22,7 @@ ...@@ -22,6 +22,7 @@
import { TrialJobDetail, TrialJobStatus } from '../../common/trainingService'; import { TrialJobDetail, TrialJobStatus } from '../../common/trainingService';
import { GPUScheduler } from './gpuScheduler'; import { GPUScheduler } from './gpuScheduler';
import { LocalTrainingService } from './localTrainingService'; import { LocalTrainingService } from './localTrainingService';
import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey';
type LocalTrialJobDetailForGPU = TrialJobDetail & { gpuIndices: number[] }; type LocalTrialJobDetailForGPU = TrialJobDetail & { gpuIndices: number[] };
...@@ -52,8 +53,14 @@ class LocalTrainingServiceForGPU extends LocalTrainingService { ...@@ -52,8 +53,14 @@ class LocalTrainingServiceForGPU extends LocalTrainingService {
public async setClusterMetadata(key: string, value: string): Promise<void> { public async setClusterMetadata(key: string, value: string): Promise<void> {
await super.setClusterMetadata(key, value); await super.setClusterMetadata(key, value);
switch (key) { switch (key) {
case 'requiredGPUNum': case TrialConfigMetadataKey.TRIAL_CONFIG:
this.requiredGPUNum = parseInt(value, 10); if(this.localTrailConfig !== undefined) {
this.requiredGPUNum = this.localTrailConfig.gpuNum;
} else {
// If no valid trial config is initialized, set requiredGPUNum to 0 as fallback value.
this.requiredGPUNum = 0;
}
this.log.info('required GPU number is ' + this.requiredGPUNum);
if (this.gpuScheduler === undefined) { if (this.gpuScheduler === undefined) {
this.gpuScheduler = new GPUScheduler(); this.gpuScheduler = new GPUScheduler();
} }
...@@ -62,15 +69,6 @@ class LocalTrainingServiceForGPU extends LocalTrainingService { ...@@ -62,15 +69,6 @@ class LocalTrainingServiceForGPU extends LocalTrainingService {
} }
} }
/**
 * Look up a cluster metadata value by key.
 *
 * The 'requiredGPUNum' key is answered here with the current value rendered as a string;
 * every other key is forwarded to the base class implementation.
 *
 * @param key metadata key to read
 * @returns a promise for the stored value
 */
public getClusterMetadata(key: string): Promise<string> {
    if (key === 'requiredGPUNum') {
        return Promise.resolve(`${this.requiredGPUNum}`);
    }

    return super.getClusterMetadata(key);
}
public cleanUp(): Promise<void> { public cleanUp(): Promise<void> {
if (this.gpuScheduler !== undefined) { if (this.gpuScheduler !== undefined) {
this.gpuScheduler.stop(); this.gpuScheduler.stop();
...@@ -80,7 +78,7 @@ class LocalTrainingServiceForGPU extends LocalTrainingService { ...@@ -80,7 +78,7 @@ class LocalTrainingServiceForGPU extends LocalTrainingService {
} }
protected onTrialJobStatusChanged(trialJob: LocalTrialJobDetailForGPU, oldStatus: TrialJobStatus): void { protected onTrialJobStatusChanged(trialJob: LocalTrialJobDetailForGPU, oldStatus: TrialJobStatus): void {
if (trialJob.gpuIndices.length !== 0) { if (trialJob.gpuIndices !== undefined && trialJob.gpuIndices.length !== 0) {
if (oldStatus === 'RUNNING' && trialJob.status !== 'RUNNING') { if (oldStatus === 'RUNNING' && trialJob.status !== 'RUNNING') {
for (const index of trialJob.gpuIndices) { for (const index of trialJob.gpuIndices) {
this.availableGPUIndices[index] = false; this.availableGPUIndices[index] = false;
......
...@@ -24,7 +24,7 @@ import { EventEmitter } from 'events'; ...@@ -24,7 +24,7 @@ import { EventEmitter } from 'events';
import * as path from 'path'; import * as path from 'path';
import { Client } from 'ssh2'; import { Client } from 'ssh2';
import { getLogger, Logger } from '../../common/log'; import { getLogger, Logger } from '../../common/log';
import { TrialJobStatus } from '../../common/trainingService'; import { TrialJobStatus, TrialJobDetail } from '../../common/trainingService';
import { JobMetrics, RemoteCommandResult, RemoteMachineMeta, RemoteMachineTrialJobDetail } from './remoteMachineData'; import { JobMetrics, RemoteCommandResult, RemoteMachineMeta, RemoteMachineTrialJobDetail } from './remoteMachineData';
import { SSHClientUtility } from './sshClientUtility'; import { SSHClientUtility } from './sshClientUtility';
...@@ -56,8 +56,12 @@ export class MetricsCollector { ...@@ -56,8 +56,12 @@ export class MetricsCollector {
if (rmMetrics !== undefined && rmMetrics.length > 0) { if (rmMetrics !== undefined && rmMetrics.length > 0) {
rmMetrics.forEach((jobMetrics) => { rmMetrics.forEach((jobMetrics) => {
const trialJobId : string = jobMetrics.jobId; const trialJobId : string = jobMetrics.jobId;
const trialJobDetail : RemoteMachineTrialJobDetail = <RemoteMachineTrialJobDetail>this.trialJobsMap.get(trialJobId);
assert(trialJobDetail);
// If job status is not alive again, remove its GPU reservation // If job status is not alive again, remove its GPU reservation
if(!['RUNNING'].includes(jobMetrics.jobStatus)) { if(!['RUNNING'].includes(jobMetrics.jobStatus)) {
trialJobDetail.status = jobMetrics.jobStatus;
this.log.info(`Set trialjob ${trialJobDetail.id} status to ${trialJobDetail.status}`);
runningJobsMap.forEach((jobIds: string[], rmMeta: RemoteMachineMeta) => { runningJobsMap.forEach((jobIds: string[], rmMeta: RemoteMachineMeta) => {
// If the remote machine has no GPU, gpuReservation is not initialized, so check whether it is undefined // If the remote machine has no GPU, gpuReservation is not initialized, so check whether it is undefined
if(rmMeta.gpuReservation !== undefined) { if(rmMeta.gpuReservation !== undefined) {
...@@ -81,11 +85,19 @@ export class MetricsCollector { ...@@ -81,11 +85,19 @@ export class MetricsCollector {
if (status.includes(trialJob.status)) { if (status.includes(trialJob.status)) {
if (map.has(trialJob.rmMeta)) { if (map.has(trialJob.rmMeta)) {
const ids = map.get(trialJob.rmMeta); const ids = map.get(trialJob.rmMeta);
if (ids !== undefined) { if (ids !== undefined && !ids.includes(id)) {
ids.push(id); ids.push(id);
} }
} else { } else {
map.set(trialJob.rmMeta, [id]); let initJobIds : string[] = [id];
// If jobs on this remote machine have reserved GPUs, also add those jobs to the list so their metrics data is fetched
if(trialJob.rmMeta.gpuReservation !== undefined) {
const concatJobIds : string[] = initJobIds.concat(Array.from(trialJob.rmMeta.gpuReservation.values()));
initJobIds = concatJobIds.filter((item, pos) => concatJobIds.indexOf(item) === pos);
}
map.set(trialJob.rmMeta, initJobIds);
} }
} }
}); });
......
...@@ -23,15 +23,6 @@ import { Client } from 'ssh2'; ...@@ -23,15 +23,6 @@ import { Client } from 'ssh2';
import { JobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../common/trainingService'; import { JobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../common/trainingService';
import { GPUSummary } from '../common/gpuData'; import { GPUSummary } from '../common/gpuData';
/**
 * Enum of key for remote machine metadata for configuration.
 * The string values are the keys passed to the training service's setClusterMetadata().
 */
export enum RemoteMachineMetadataKey {
// Value is handed to the training service's connection setup.
MACHINE_LIST = 'machine_list',
// Value is a JSON string that is parsed into the trial configuration.
TRIAL_CONFIG = 'trial_config',
// NOTE(review): presumably carries the experiment id; no usage is visible here - confirm.
EXPERIMENT_ID = 'experimentId',
// NOTE(review): presumably selects random scheduling; no usage is visible here - confirm.
RANDOM_SCHEDULER = 'random_scheduler'
}
/** /**
* Metadata of remote machine for configuration and status query * Metadata of remote machine for configuration and status query
...@@ -54,21 +45,6 @@ export class RemoteMachineMeta { ...@@ -54,21 +45,6 @@ export class RemoteMachineMeta {
} }
} }
/**
 * Read-only configuration for a trial job on a remote machine.
 *
 * All three values are fixed at construction time and exposed as public readonly
 * properties, in the order command, codeDir, gpuNum.
 */
export class RemoteMachineTrialConfig {
    /**
     * @param command command string of the trial
     * @param codeDir code directory of the trial
     * @param gpuNum GPU count of the trial (presumably the number required per trial - confirm against the scheduler)
     */
    constructor(
        public readonly command : string,
        public readonly codeDir : string,
        public readonly gpuNum : number
    ) {}
}
/** /**
* The execution result for command executed on remote machine * The execution result for command executed on remote machine
*/ */
......
...@@ -37,12 +37,14 @@ import { ...@@ -37,12 +37,14 @@ import {
} from '../../common/trainingService'; } from '../../common/trainingService';
import { delay, getExperimentRootDir, uniqueString } from '../../common/utils'; import { delay, getExperimentRootDir, uniqueString } from '../../common/utils';
import { GPUSummary } from '../common/gpuData'; import { GPUSummary } from '../common/gpuData';
import { TrialConfig } from '../common/trialConfig';
import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey';
import { GPUScheduler } from './gpuScheduler'; import { GPUScheduler } from './gpuScheduler';
import { MetricsCollector } from './metricsCollector'; import { MetricsCollector } from './metricsCollector';
import { import {
HOSTJOBSHELLFORMAT, RemoteCommandResult, RemoteMachineMeta, RemoteMachineMetadataKey, HOSTJOBSHELLFORMAT, RemoteCommandResult, RemoteMachineMeta,
REMOTEMACHINERUNSHELLFORMAT, RemoteMachineScheduleInfo, RemoteMachineScheduleResult, REMOTEMACHINERUNSHELLFORMAT, RemoteMachineScheduleInfo, RemoteMachineScheduleResult,
RemoteMachineTrialConfig, RemoteMachineTrialJobDetail, ScheduleResultType RemoteMachineTrialJobDetail, ScheduleResultType
} from './remoteMachineData'; } from './remoteMachineData';
import { SSHClientUtility } from './sshClientUtility'; import { SSHClientUtility } from './sshClientUtility';
...@@ -56,7 +58,7 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -56,7 +58,7 @@ class RemoteMachineTrainingService implements TrainingService {
// Experiment root directory // Experiment root directory
private expRootDir: string; private expRootDir: string;
private remoteExpRootDir: string; private remoteExpRootDir: string;
private trialConfig: RemoteMachineTrialConfig | undefined; private trialConfig: TrialConfig | undefined;
private gpuScheduler: GPUScheduler; private gpuScheduler: GPUScheduler;
private jobQueue: string[]; private jobQueue: string[];
private timer: ObservableTimer; private timer: ObservableTimer;
...@@ -93,7 +95,7 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -93,7 +95,7 @@ class RemoteMachineTrainingService implements TrainingService {
// Wait to schedule job in next time iteration // Wait to schedule job in next time iteration
break; break;
} }
}; }
const metricsCollector: MetricsCollector = new MetricsCollector( const metricsCollector: MetricsCollector = new MetricsCollector(
this.machineSSHClientMap, this.trialJobsMap, this.remoteExpRootDir, this.metricsEmitter); this.machineSSHClientMap, this.trialJobsMap, this.remoteExpRootDir, this.metricsEmitter);
await metricsCollector.collectMetrics(); await metricsCollector.collectMetrics();
...@@ -186,6 +188,7 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -186,6 +188,7 @@ class RemoteMachineTrainingService implements TrainingService {
form); form);
this.jobQueue.push(trialJobId); this.jobQueue.push(trialJobId);
this.trialJobsMap.set(trialJobId, trialJobDetail); this.trialJobsMap.set(trialJobId, trialJobDetail);
return Promise.resolve(trialJobDetail); return Promise.resolve(trialJobDetail);
} else { } else {
return Promise.reject(new Error(`Job form not supported: ${JSON.stringify(form)}, jobType should be HOST or TRIAL.`)); return Promise.reject(new Error(`Job form not supported: ${JSON.stringify(form)}, jobType should be HOST or TRIAL.`));
...@@ -207,7 +210,7 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -207,7 +210,7 @@ class RemoteMachineTrainingService implements TrainingService {
// Remove the job with trialJobId from job queue // Remove the job with trialJobId from job queue
const index : number = this.jobQueue.indexOf(trialJobId); const index : number = this.jobQueue.indexOf(trialJobId);
if(index >= 0) { if (index >= 0) {
this.jobQueue.splice(index, 1); this.jobQueue.splice(index, 1);
} }
...@@ -243,11 +246,11 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -243,11 +246,11 @@ class RemoteMachineTrainingService implements TrainingService {
*/ */
public async setClusterMetadata(key: string, value: string): Promise<void> { public async setClusterMetadata(key: string, value: string): Promise<void> {
switch (key) { switch (key) {
case RemoteMachineMetadataKey.MACHINE_LIST: case TrialConfigMetadataKey.MACHINE_LIST:
await this.setupConnections(value); await this.setupConnections(value);
break; break;
case RemoteMachineMetadataKey.TRIAL_CONFIG: case TrialConfigMetadataKey.TRIAL_CONFIG:
const remoteMachineTrailConfig: RemoteMachineTrialConfig = <RemoteMachineTrialConfig>JSON.parse(value); const remoteMachineTrailConfig: TrialConfig = <TrialConfig>JSON.parse(value);
// Parse trial config failed, throw Error // Parse trial config failed, throw Error
if (!remoteMachineTrailConfig) { if (!remoteMachineTrailConfig) {
throw new Error('trial config parsed failed'); throw new Error('trial config parsed failed');
...@@ -351,7 +354,7 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -351,7 +354,7 @@ class RemoteMachineTrainingService implements TrainingService {
this.log.error(errorMessage); this.log.error(errorMessage);
deferred.reject(); deferred.reject();
throw new NNIError(NNIErrorNames.RESOURCE_NOT_AVAILABLE, errorMessage); throw new NNIError(NNIErrorNames.RESOURCE_NOT_AVAILABLE, errorMessage);
} else if(rmScheduleResult.resultType == ScheduleResultType.SUCCEED } else if (rmScheduleResult.resultType === ScheduleResultType.SUCCEED
&& rmScheduleResult.scheduleInfo !== undefined) { && rmScheduleResult.scheduleInfo !== undefined) {
const rmScheduleInfo : RemoteMachineScheduleInfo = rmScheduleResult.scheduleInfo; const rmScheduleInfo : RemoteMachineScheduleInfo = rmScheduleResult.scheduleInfo;
const trialWorkingFolder: string = path.join(this.remoteExpRootDir, 'trials', trialJobId); const trialWorkingFolder: string = path.join(this.remoteExpRootDir, 'trials', trialJobId);
...@@ -364,11 +367,11 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -364,11 +367,11 @@ class RemoteMachineTrainingService implements TrainingService {
trialJobDetail.rmMeta = rmScheduleInfo.rmMeta; trialJobDetail.rmMeta = rmScheduleInfo.rmMeta;
deferred.resolve(true); deferred.resolve(true);
} else if(rmScheduleResult.resultType == ScheduleResultType.TMP_NO_AVAILABLE_GPU) { } else if (rmScheduleResult.resultType === ScheduleResultType.TMP_NO_AVAILABLE_GPU) {
this.log.info(`Right now no available GPU can be allocated for trial ${trialJobId}, will try to schedule later`); this.log.info(`Right now no available GPU can be allocated for trial ${trialJobId}, will try to schedule later`);
deferred.resolve(false); deferred.resolve(false);
} else { } else {
deferred.reject('Invalid schedule resutl type: ' + rmScheduleResult.resultType); deferred.reject(`Invalid schedule resutl type: ${rmScheduleResult.resultType}`);
} }
return deferred.promise; return deferred.promise;
......
...@@ -19,14 +19,16 @@ ...@@ -19,14 +19,16 @@
'use strict'; 'use strict';
import * as assert from 'assert';
import * as cpp from 'child-process-promise'; import * as cpp from 'child-process-promise';
import * as fs from 'fs';
import * as path from 'path'; import * as path from 'path';
import * as os from 'os';
import { Client, ClientChannel, SFTPWrapper } from 'ssh2'; import { Client, ClientChannel, SFTPWrapper } from 'ssh2';
import * as stream from "stream"; import * as stream from 'stream';
import { Deferred } from 'ts-deferred'; import { Deferred } from 'ts-deferred';
import { NNIError, NNIErrorNames } from '../../common/errors'; import { NNIError, NNIErrorNames } from '../../common/errors';
import { getExperimentRootDir } from '../../common/utils'; import { getLogger } from '../../common/log';
import { uniqueString } from '../../common/utils';
import { RemoteCommandResult } from './remoteMachineData'; import { RemoteCommandResult } from './remoteMachineData';
/** /**
...@@ -43,17 +45,18 @@ export namespace SSHClientUtility { ...@@ -43,17 +45,18 @@ export namespace SSHClientUtility {
*/ */
export async function copyDirectoryToRemote(localDirectory : string, remoteDirectory : string, sshClient : Client) : Promise<void> { export async function copyDirectoryToRemote(localDirectory : string, remoteDirectory : string, sshClient : Client) : Promise<void> {
const deferred: Deferred<void> = new Deferred<void>(); const deferred: Deferred<void> = new Deferred<void>();
const localCompressedDir: string = path.join(getExperimentRootDir(), 'directory.tar.gz'); const tmpTarName: string = `${uniqueString(10)}.tar.gz`;
const remoteCompressedDir: string = path.join(remoteDirectory, 'directory.tar.gz'); const localTarPath: string = path.join(os.tmpdir(), tmpTarName);
const remoteTarPath: string = path.join(os.tmpdir(), tmpTarName);
// Compress files in local directory to experiment root directory // Compress files in local directory to experiment root directory
await cpp.exec(`tar -czf ${localCompressedDir} -C ${localDirectory} .`); await cpp.exec(`tar -czf ${localTarPath} -C ${localDirectory} .`);
// Copy the compressed file to remoteDirectory and delete it // Copy the compressed file to remoteDirectory and delete it
await copyFileToRemote(localCompressedDir, remoteCompressedDir, sshClient); await copyFileToRemote(localTarPath, remoteTarPath, sshClient);
await cpp.exec(`rm ${localCompressedDir}`); await cpp.exec(`rm ${localTarPath}`);
// Decompress the remote compressed file in and delete it // Decompress the remote compressed file in and delete it
await remoteExeCommand(`tar -oxzf ${remoteCompressedDir} -C ${remoteDirectory}`, sshClient); await remoteExeCommand(`tar -oxzf ${remoteTarPath} -C ${remoteDirectory}`, sshClient);
await remoteExeCommand(`rm ${remoteCompressedDir}`, sshClient); await remoteExeCommand(`rm ${remoteTarPath}`, sshClient);
deferred.resolve(); deferred.resolve();
return deferred.promise; return deferred.promise;
...@@ -65,18 +68,23 @@ export namespace SSHClientUtility { ...@@ -65,18 +68,23 @@ export namespace SSHClientUtility {
* @param remoteFilePath the target path in remote machine * @param remoteFilePath the target path in remote machine
* @param sshClient SSH Client * @param sshClient SSH Client
*/ */
export function copyFileToRemote(localFilePath : string, remoteFilePath : string, sshClient : Client) : Promise<string> { export function copyFileToRemote(localFilePath : string, remoteFilePath : string, sshClient : Client) : Promise<boolean> {
const deferred: Deferred<string> = new Deferred<string>(); assert(sshClient !== undefined);
const deferred: Deferred<boolean> = new Deferred<boolean>();
sshClient.sftp((err : Error, sftp : SFTPWrapper) => { sshClient.sftp((err : Error, sftp : SFTPWrapper) => {
if (err) { if (err) {
deferred.reject(); getLogger().error(`copyFileToRemote: ${err.message}, ${localFilePath}, ${remoteFilePath}`);
deferred.reject(err);
return;
} }
assert(sftp !== undefined);
sftp.fastPut(localFilePath, remoteFilePath, (fastPutErr : Error) => { sftp.fastPut(localFilePath, remoteFilePath, (fastPutErr : Error) => {
sftp.end(); sftp.end();
if (fastPutErr) { if (fastPutErr) {
deferred.reject(); deferred.reject(fastPutErr);
} else { } else {
deferred.resolve('success'); deferred.resolve(true);
} }
}); });
}); });
...@@ -97,14 +105,16 @@ export namespace SSHClientUtility { ...@@ -97,14 +105,16 @@ export namespace SSHClientUtility {
client.exec(command, (err : Error, channel : ClientChannel) => { client.exec(command, (err : Error, channel : ClientChannel) => {
if (err) { if (err) {
getLogger().error(`remoteExeCommand: ${err.message}`);
deferred.reject(err); deferred.reject(err);
return;
} }
channel.on('data', function(data : any, dataStderr : any) { channel.on('data', (data : any, dataStderr : any) => {
if (dataStderr) { if (dataStderr) {
stderr += data.toString(); stderr += data.toString();
} } else {
else {
stdout += data.toString(); stdout += data.toString();
} }
}).on('exit', (code, signal) => { }).on('exit', (code, signal) => {
...@@ -124,7 +134,10 @@ export namespace SSHClientUtility { ...@@ -124,7 +134,10 @@ export namespace SSHClientUtility {
const deferred: Deferred<string> = new Deferred<string>(); const deferred: Deferred<string> = new Deferred<string>();
sshClient.sftp((err: Error, sftp : SFTPWrapper) => { sshClient.sftp((err: Error, sftp : SFTPWrapper) => {
if (err) { if (err) {
getLogger().error(`getRemoteFileContent: ${err.message}`);
deferred.reject(new Error(`SFTP error: ${err.message}`)); deferred.reject(new Error(`SFTP error: ${err.message}`));
return;
} }
try { try {
const sftpStream : stream.Readable = sftp.createReadStream(filePath); const sftpStream : stream.Readable = sftp.createReadStream(filePath);
...@@ -133,11 +146,16 @@ export namespace SSHClientUtility { ...@@ -133,11 +146,16 @@ export namespace SSHClientUtility {
sftpStream.on('data', (data : Buffer | string) => { sftpStream.on('data', (data : Buffer | string) => {
dataBuffer += data; dataBuffer += data;
}).on('error', (streamErr: Error) => { }).on('error', (streamErr: Error) => {
sftp.end();
deferred.reject(new NNIError(NNIErrorNames.NOT_FOUND, streamErr.message)); deferred.reject(new NNIError(NNIErrorNames.NOT_FOUND, streamErr.message));
}).on('end', () => { }).on('end', () => {
// The sftp connection needs to be released manually once the operation is done
sftp.end();
deferred.resolve(dataBuffer); deferred.resolve(dataBuffer);
}); });
} catch (error) { } catch (error) {
getLogger().error(`getRemoteFileContent: ${error.message}`);
sftp.end();
deferred.reject(new Error(`SFTP error: ${error.message}`)); deferred.reject(new Error(`SFTP error: ${error.message}`));
} }
}); });
......
...@@ -27,7 +27,7 @@ import * as tmp from 'tmp'; ...@@ -27,7 +27,7 @@ import * as tmp from 'tmp';
import * as component from '../../common/component'; import * as component from '../../common/component';
import { TrialJobApplicationForm, TrialJobDetail, TrainingService } from '../../common/trainingService'; import { TrialJobApplicationForm, TrialJobDetail, TrainingService } from '../../common/trainingService';
import { cleanupUnitTest, delay, prepareUnitTest } from '../../common/utils'; import { cleanupUnitTest, delay, prepareUnitTest } from '../../common/utils';
import { RemoteMachineMetadataKey } from '../remote_machine/remoteMachineData'; import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey';
import { RemoteMachineTrainingService } from '../remote_machine/remoteMachineTrainingService'; import { RemoteMachineTrainingService } from '../remote_machine/remoteMachineTrainingService';
// copy mockedTrail.py to local folder // copy mockedTrail.py to local folder
...@@ -95,9 +95,9 @@ describe('Unit Test for RemoteMachineTrainingService', () => { ...@@ -95,9 +95,9 @@ describe('Unit Test for RemoteMachineTrainingService', () => {
if (skip) { if (skip) {
return; return;
} }
await remoteMachineTrainingService.setClusterMetadata(RemoteMachineMetadataKey.MACHINE_LIST, machineList); await remoteMachineTrainingService.setClusterMetadata(TrialConfigMetadataKey.MACHINE_LIST, machineList);
await remoteMachineTrainingService.setClusterMetadata( await remoteMachineTrainingService.setClusterMetadata(
RemoteMachineMetadataKey.TRIAL_CONFIG, `{"command":"sleep 1h && echo ","codeDir":"${localCodeDir}","gpuNum":1}`); TrialConfigMetadataKey.TRIAL_CONFIG, `{"command":"sleep 1h && echo ","codeDir":"${localCodeDir}","gpuNum":1}`);
const form: TrialJobApplicationForm = { const form: TrialJobApplicationForm = {
jobType: 'TRIAL', jobType: 'TRIAL',
hyperParameters: 'mock hyperparameters' hyperParameters: 'mock hyperparameters'
...@@ -126,11 +126,11 @@ describe('Unit Test for RemoteMachineTrainingService', () => { ...@@ -126,11 +126,11 @@ describe('Unit Test for RemoteMachineTrainingService', () => {
return; return;
} }
// set machine list' // set machine list'
await remoteMachineTrainingService.setClusterMetadata(RemoteMachineMetadataKey.MACHINE_LIST, machineList); await remoteMachineTrainingService.setClusterMetadata(TrialConfigMetadataKey.MACHINE_LIST, machineList);
// set meta data // set meta data
const trialConfig: string = `{\"command\":\"python3 mockedTrial.py\", \"codeDir\":\"${localCodeDir}\",\"gpuNum\":0}` const trialConfig: string = `{\"command\":\"python3 mockedTrial.py\", \"codeDir\":\"${localCodeDir}\",\"gpuNum\":0}`
await remoteMachineTrainingService.setClusterMetadata(RemoteMachineMetadataKey.TRIAL_CONFIG, trialConfig); await remoteMachineTrainingService.setClusterMetadata(TrialConfigMetadataKey.TRIAL_CONFIG, trialConfig);
// submit job // submit job
const form: TrialJobApplicationForm = { const form: TrialJobApplicationForm = {
......
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'use strict';
import * as cpp from 'child-process-promise';
import * as fs from 'fs';
import { Client } from 'ssh2';
import { Deferred } from 'ts-deferred';
import { SSHClientUtility } from '../remote_machine/sshClientUtility';
// Local scratch file: created by the `before` hook, used as the upload source, removed by the `after` hook.
const LOCALFILE: string = '/tmp/sshclientUTData';
// Path on the remote machine that the uploads write to and getRemoteFileContentLoop reads back.
const REMOTEFILE: string = '/tmp/sshclientUTData';
/**
 * Upload LOCALFILE to REMOTEFILE through an SFTP session opened on `conn`.
 *
 * @param conn SSH client on which the SFTP session is requested
 * @returns a promise that resolves once the transfer callback reports success and
 *          rejects with the error from opening the session or from the transfer
 */
async function copyFile(conn: Client): Promise<void> {
    const uploadDone: Deferred<void> = new Deferred<void>();

    conn.sftp((sessionErr, session) => {
        if (sessionErr) {
            uploadDone.reject(sessionErr);

            return;
        }

        const onTransferFinished = (transferErr: Error): void => {
            // End the SFTP session first, whatever the outcome of the transfer.
            session.end();
            if (!transferErr) {
                uploadDone.resolve();
            } else {
                uploadDone.reject(transferErr);
            }
        };

        session.fastPut(LOCALFILE, REMOTEFILE, onTransferFinished);
    });

    return uploadDone.promise;
}
/**
 * Call SSHClientUtility.copyFileToRemote 500 times, one after another, on the same
 * connection, logging the iteration index before each call.
 *
 * @param conn SSH client passed to every call
 */
async function copyFileToRemoteLoop(conn: Client): Promise<void> {
    let round: number = 0;
    while (round < 500) {
        console.log(round);
        await SSHClientUtility.copyFileToRemote(LOCALFILE, REMOTEFILE, conn);
        round += 1;
    }
}
/**
 * Call SSHClientUtility.remoteExeCommand with the command 'ls' 500 times, one after
 * another, on the same connection, logging the iteration index before each call.
 *
 * @param conn SSH client passed to every call
 */
async function remoteExeCommandLoop(conn: Client): Promise<void> {
    let round: number = 0;
    while (round < 500) {
        console.log(round);
        await SSHClientUtility.remoteExeCommand('ls', conn);
        round += 1;
    }
}
/**
 * Call SSHClientUtility.getRemoteFileContent for REMOTEFILE 500 times, one after
 * another, on the same connection, logging the iteration index before each call.
 *
 * @param conn SSH client passed to every call
 */
async function getRemoteFileContentLoop(conn: Client): Promise<void> {
    let round: number = 0;
    while (round < 500) {
        console.log(round);
        await SSHClientUtility.getRemoteFileContent(REMOTEFILE, conn);
        round += 1;
    }
}
/**
 * Stress test for SSHClientUtility: several upload / exec / read loops share one SSH
 * connection and run concurrently.
 *
 * The test needs a reachable remote machine, described by '../../.vscode/rminfo.json'
 * (the parsed object is passed straight to Client.connect). When that file is missing
 * or is not valid JSON, the test is skipped.
 */
describe('sshClientUtility test', () => {
    // Start from "run the test"; only a missing/unreadable config turns it off.
    // (Initialising this to true made the test unconditionally skipped.)
    let skip: boolean = false;
    let rmMeta: any;
    try {
        rmMeta = JSON.parse(fs.readFileSync('../../.vscode/rminfo.json', 'utf8'));
    } catch (err) {
        skip = true;
    }

    // Create the local file that the copy helpers upload.
    before(async () => {
        await cpp.exec(`echo '1234' > ${LOCALFILE}`);
    });

    // Remove the local file again once the suite has finished.
    after(() => {
        fs.unlinkSync(LOCALFILE);
    });

    it('Test SSHClientUtility', (done) => {
        if (skip) {
            done();

            return;
        }

        const conn: Client = new Client();

        // Report the outcome exactly once and always release the connection.
        let finished: boolean = false;
        const finish = (failure?: Error): void => {
            if (finished) {
                return;
            }
            finished = true;
            conn.end();
            if (failure) {
                done(failure);
            } else {
                done();
            }
        };

        // Connection-level failures (bad host, auth, dropped link) fail the test
        // immediately instead of leaving it to time out.
        conn.on('error', (connErr: Error) => {
            finish(connErr);
        });

        conn.on('ready', async () => {
            try {
                await copyFile(conn);
                await Promise.all([
                    copyFileToRemoteLoop(conn),
                    copyFileToRemoteLoop(conn),
                    copyFileToRemoteLoop(conn),
                    remoteExeCommandLoop(conn),
                    getRemoteFileContentLoop(conn)
                ]);
                finish();
            } catch (err) {
                // Without this, a rejected operation would be an unhandled rejection
                // and `done` would never be called.
                finish(err instanceof Error ? err : new Error(String(err)));
            }
        }).connect(rmMeta);
    });
});
...@@ -2,17 +2,19 @@ declare module 'node-nvidia-smi' { ...@@ -2,17 +2,19 @@ declare module 'node-nvidia-smi' {
function smi(callback: (error: Error, data: smi.GPUInfo) => void): void; function smi(callback: (error: Error, data: smi.GPUInfo) => void): void;
namespace smi { namespace smi {
interface GPUInfo { interface EmbededGPUSummary {
nvidia_smi_log: {
attached_gpus: string;
gpu: {
minor_number: string; minor_number: string;
utilization: { utilization: {
gpu_util: string; gpu_util: string;
memory_util: string; memory_util: string;
}; };
process: string | object; process: string | object;
}[]; }
interface GPUInfo {
nvidia_smi_log: {
attached_gpus: string;
gpu: EmbededGPUSummary[] | EmbededGPUSummary;
}; };
} }
} }
......
...@@ -18,7 +18,7 @@ ...@@ -18,7 +18,7 @@
# OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. # OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# ================================================================================================== # ==================================================================================================
import logging
from enum import Enum from enum import Enum
...@@ -64,8 +64,14 @@ def receive(): ...@@ -64,8 +64,14 @@ def receive():
Returns a tuple of command (CommandType) and payload (str) Returns a tuple of command (CommandType) and payload (str)
""" """
header = _in_file.read(8) header = _in_file.read(8)
logging.getLogger(__name__).debug('Received command, header: [%s]' % header)
if header is None or len(header) < 8:
# Pipe EOF encountered
logging.getLogger(__name__).debug('Pipe EOF encountered')
return None, None
length = int(header[2:]) length = int(header[2:])
data = _in_file.read(length) data = _in_file.read(length)
command = CommandType(header[:2]) command = CommandType(header[:2])
data = data.decode('utf8') data = data.decode('utf8')
logging.getLogger(__name__).debug('Received command, data: [%s]' % data)
return command, data return command, data
...@@ -127,6 +127,8 @@ def _handle_request(tuner): ...@@ -127,6 +127,8 @@ def _handle_request(tuner):
_logger.debug('waiting receive_message') _logger.debug('waiting receive_message')
command, data = receive() command, data = receive()
if command is None:
return False
_logger.debug(command) _logger.debug(command)
_logger.debug(data) _logger.debug(data)
......
...@@ -18,7 +18,6 @@ Click the tab "Overview". ...@@ -18,7 +18,6 @@ Click the tab "Overview".
* See good performance trial. * See good performance trial.
* See search_space json. * See search_space json.
* See complete trial cdf graph.
### View job accuracy ### View job accuracy
...@@ -29,7 +28,7 @@ Click the tab "Optimization Progress" to see the point graph of all trials. Hove ...@@ -29,7 +28,7 @@ Click the tab "Optimization Progress" to see the point graph of all trials. Hove
Click the tab "Hyper Parameter" to see the parallel graph. Click the tab "Hyper Parameter" to see the parallel graph.
* You can select the percentage to cut down some lines. * You can select the percentage to cut down some lines.
* Choose two axes to swap its positions * Choose two axis to swap its positions
### View trial status ### View trial status
...@@ -39,11 +38,10 @@ Click the tab "Trial Status" to see the status of the all trials. Specifically: ...@@ -39,11 +38,10 @@ Click the tab "Trial Status" to see the status of the all trials. Specifically:
* Trial detail: trial's id, trial's duration, start time, end time, status and accuracy. * Trial detail: trial's id, trial's duration, start time, end time, status and accuracy.
* Kill: you can kill a job that status is running. * Kill: you can kill a job that status is running.
* Tensor: you can see a job in the tensorflow graph, it will link to the Tensorboard page. * Tensor: you can see a job in the tensorflow graph, it will link to the Tensorboard page.
* Log: click the button, you can see the log about NNI and pai.
### Control ### Control
Click the tab "Control" to add a new trial or update the search_space file. Click the tab "Control" to add a new trial or update the search_space file and some experiment parameters.
### View Tensorboard Graph ### View Tensorboard Graph
......
...@@ -4,22 +4,6 @@ ...@@ -4,22 +4,6 @@
<head> <head>
<meta charset="utf-8"> <meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no"> <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
<meta name="theme-color" content="#000000">
<!--
manifest.json provides metadata used when your web app is added to the
homescreen on Android. See https://developers.google.com/web/fundamentals/engage-and-retain/web-app-manifest/
-->
<link rel="manifest" href="%PUBLIC_URL%/manifest.json">
<link rel="shortcut icon" href="%PUBLIC_URL%/icon.jpg">
<!--
Notice the use of %PUBLIC_URL% in the tags above.
It will be replaced with the URL of the `public` folder during the build.
Only files inside the `public` folder can be referenced from the HTML.
Unlike "/favicon.ico" or "favicon.ico", "%PUBLIC_URL%/favicon.ico" will
work correctly both with client-side routing and a non-root public URL.
Learn how to configure a non-root public URL by running `npm run build`.
-->
<title>Neural Network Intelligence</title> <title>Neural Network Intelligence</title>
</head> </head>
......
.header_title{ .header_title{
width: 100%; width: 100%;
height: 60px; height: 60px;
line-height: 60px;
font-size: 24px;
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
color: white;
background-color: rgb(60,141,188) ; background-color: rgb(60,141,188) ;
user-select: none; user-select: none;
text-align: center; text-align: center;
......
...@@ -6,7 +6,8 @@ class App extends React.Component<{}, {}> { ...@@ -6,7 +6,8 @@ class App extends React.Component<{}, {}> {
render () { render () {
return ( return (
<div className="App"> <div className="App">
<header className="header_title"><img src={require('./logo.jpg')} alt=""/></header> {/* <header className="header_title"><img src={require('./logo.jpg')} alt=""/></header> */}
<header className="header_title">Neural Network Intelligence</header>
<div className="content"> <div className="content">
<SlideBar /> <SlideBar />
<div className="right">{this.props.children}</div> <div className="right">{this.props.children}</div>
......
...@@ -203,7 +203,7 @@ class Control extends React.Component<{}, ControlState> { ...@@ -203,7 +203,7 @@ class Control extends React.Component<{}, ControlState> {
// update trial number parameters // update trial number parameters
trialParameterMess = (exper: Experiments, str: string) => { trialParameterMess = (exper: Experiments, str: string) => {
this.getUpdateExample();
axios(`${MANAGER_IP}/experiment`, { axios(`${MANAGER_IP}/experiment`, {
method: 'PUT', method: 'PUT',
headers: { headers: {
...@@ -216,6 +216,7 @@ class Control extends React.Component<{}, ControlState> { ...@@ -216,6 +216,7 @@ class Control extends React.Component<{}, ControlState> {
}).then(res => { }).then(res => {
if (res.status === 200) { if (res.status === 200) {
message.success(`Update ${str.toLocaleLowerCase()} successfully`); message.success(`Update ${str.toLocaleLowerCase()} successfully`);
this.getUpdateExample();
} else { } else {
message.error(`Update ${str.toLocaleLowerCase()} failed`); message.error(`Update ${str.toLocaleLowerCase()} failed`);
} }
...@@ -284,8 +285,8 @@ class Control extends React.Component<{}, ControlState> { ...@@ -284,8 +285,8 @@ class Control extends React.Component<{}, ControlState> {
} }
userUpdateSeaspace = () => { userUpdateSeaspace = () => {
this.updateSearchLoad(); this.updateSearchLoad();
this.getUpdateExample();
const { updateSearch } = this.state; const { updateSearch } = this.state;
if (updateSearch !== '' || updateSearch !== null) { if (updateSearch !== '' || updateSearch !== null) {
const { experiment } = this.state; const { experiment } = this.state;
......
...@@ -2,12 +2,12 @@ import * as React from 'react'; ...@@ -2,12 +2,12 @@ import * as React from 'react';
import axios from 'axios'; import axios from 'axios';
import { Table, Select, Row, Col, Icon } from 'antd'; import { Table, Select, Row, Col, Icon } from 'antd';
import { MANAGER_IP, overviewItem, roundNum } from '../const'; import { MANAGER_IP, overviewItem, roundNum } from '../const';
import ReactEcharts from 'echarts-for-react'; // import ReactEcharts from 'echarts-for-react';
const Option = Select.Option; const Option = Select.Option;
import JSONTree from 'react-json-tree'; import JSONTree from 'react-json-tree';
require('echarts/lib/chart/line'); // require('echarts/lib/chart/line');
require('echarts/lib/component/tooltip'); // require('echarts/lib/component/tooltip');
require('echarts/lib/component/title'); // require('echarts/lib/component/title');
require('../style/sessionpro.css'); require('../style/sessionpro.css');
interface TableObj { interface TableObj {
...@@ -266,26 +266,26 @@ class Sessionpro extends React.Component<{}, SessionState> { ...@@ -266,26 +266,26 @@ class Sessionpro extends React.Component<{}, SessionState> {
}); });
} }
// draw CDF // draw CDF
const { trialRun } = this.state; // const { trialRun } = this.state;
if (this._isMounted) { // if (this._isMounted) {
this.setState({ // this.setState({
option: this.getOption(trialRun) // option: this.getOption(trialRun)
}); // });
} // }
// CDF graph 'No data' judge // CDF graph 'No data' judge
if (trialRun.length === 0) { // if (trialRun.length === 0) {
if (this._isMounted) { // if (this._isMounted) {
this.setState({ // this.setState({
noData: 'No data' // noData: 'No data'
}); // });
} // }
} else { // } else {
if (this._isMounted) { // if (this._isMounted) {
this.setState({ // this.setState({
noData: '' // noData: ''
}); // });
} // }
} // }
} }
}); });
} }
...@@ -372,7 +372,8 @@ class Sessionpro extends React.Component<{}, SessionState> { ...@@ -372,7 +372,8 @@ class Sessionpro extends React.Component<{}, SessionState> {
}; };
const { const {
trialProfile, searchSpace, tunerAssessor, tableData, option, noData trialProfile, searchSpace, tunerAssessor, tableData,
// option, noData
} = this.state; } = this.state;
let running; let running;
if (trialProfile.endTime === 'not over') { if (trialProfile.endTime === 'not over') {
...@@ -500,13 +501,13 @@ class Sessionpro extends React.Component<{}, SessionState> { ...@@ -500,13 +501,13 @@ class Sessionpro extends React.Component<{}, SessionState> {
scroll={{ x: '100%', y: 540 }} scroll={{ x: '100%', y: 540 }}
/> />
</div> </div>
<div className="cdf"> {/* <div className="cdf">
<ReactEcharts <ReactEcharts
option={option} option={option}
style={{ height: 500, padding: '0px' }} style={{ height: 500, padding: '0px' }}
/> />
<div className="addNodata">{noData}</div> <div className="addNodata">{noData}</div>
</div> </div> */}
</div> </div>
); );
} }
......
...@@ -43,7 +43,7 @@ WARNING_INFO = 'Waining: %s' ...@@ -43,7 +43,7 @@ WARNING_INFO = 'Waining: %s'
EXPERIMENT_SUCCESS_INFO = 'Start experiment success! The experiment id is %s, and the restful server post is %s.\n' \ EXPERIMENT_SUCCESS_INFO = 'Start experiment success! The experiment id is %s, and the restful server post is %s.\n' \
'You can use these commands to get more information about this experiment:\n' \ 'You can use these commands to get more information about this experiment:\n' \
' commands description\n' \ ' commands description\n' \
'1. nnictl experiment ls list all of experiments\n' \ '1. nnictl experiment show show the information of experiments\n' \
'2. nnictl trial ls list all of trial jobs\n' \ '2. nnictl trial ls list all of trial jobs\n' \
'3. nnictl stop stop a experiment\n' \ '3. nnictl stop stop a experiment\n' \
'4. nnictl trial kill kill a trial job by id\n' \ '4. nnictl trial kill kill a trial job by id\n' \
......
...@@ -54,13 +54,20 @@ def start_rest_server(manager, port, platform, mode, experiment_id=None): ...@@ -54,13 +54,20 @@ def start_rest_server(manager, port, platform, mode, experiment_id=None):
process = Popen(cmds, stdout=stdout_file, stderr=stderr_file) process = Popen(cmds, stdout=stdout_file, stderr=stderr_file)
return process return process
def set_local_config(experiment_config, port): def set_trial_config(experiment_config, port):
'''Call setClusterMetadata (rest PUT /parameters/cluster-metadata) to pass platform and machineList"''' '''set trial configuration'''
request_data = dict() request_data = dict()
request_data['codeDir'] = experiment_config['trial']['trialCodeDir'] value_dict = dict()
request_data['command'] = experiment_config['trial']['trialCommand'] value_dict['command'] = experiment_config['trial']['trialCommand']
value_dict['codeDir'] = experiment_config['trial']['trialCodeDir']
value_dict['gpuNum'] = experiment_config['trial']['trialGpuNum']
request_data['trial_config'] = value_dict
response = rest_put(cluster_metadata_url(port), json.dumps(request_data), 20) response = rest_put(cluster_metadata_url(port), json.dumps(request_data), 20)
return True if response and response.status_code == 200 else False return True if response.status_code == 200 else False
def set_local_config(experiment_config, port):
'''set local configuration'''
return set_trial_config(experiment_config, port)
def set_remote_config(experiment_config, port): def set_remote_config(experiment_config, port):
'''Call setClusterMetadata to pass trial''' '''Call setClusterMetadata to pass trial'''
...@@ -72,14 +79,7 @@ def set_remote_config(experiment_config, port): ...@@ -72,14 +79,7 @@ def set_remote_config(experiment_config, port):
return False return False
#set trial_config #set trial_config
request_data = dict() return set_trial_config(experiment_config, port)
value_dict = dict()
value_dict['command'] = experiment_config['trial']['trialCommand']
value_dict['codeDir'] = experiment_config['trial']['trialCodeDir']
value_dict['gpuNum'] = experiment_config['trial']['trialGpuNum']
request_data['trial_config'] = value_dict
response = rest_put(cluster_metadata_url(port), json.dumps(request_data), 20)
return True if response.status_code == 200 else False
def set_experiment(experiment_config, mode, port): def set_experiment(experiment_config, mode, port):
'''Call startExperiment (rest POST /experiment) with yaml file content''' '''Call startExperiment (rest POST /experiment) with yaml file content'''
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment