Unverified Commit 143c6615 authored by Chi Song's avatar Chi Song Committed by GitHub
Browse files

Reusable environment support GPU scheduler, add test cases and refactoring. (#2627)

parent 8a20c348
...@@ -222,15 +222,16 @@ function getIPV4Address(): string { ...@@ -222,15 +222,16 @@ function getIPV4Address(): string {
return cachedipv4Address; return cachedipv4Address;
} }
if (os.networkInterfaces().eth0) { const networkInterfaces = os.networkInterfaces();
for (const item of os.networkInterfaces().eth0) { if (networkInterfaces.eth0) {
for (const item of networkInterfaces.eth0) {
if (item.family === 'IPv4') { if (item.family === 'IPv4') {
cachedipv4Address = item.address; cachedipv4Address = item.address;
return cachedipv4Address; return cachedipv4Address;
} }
} }
} else { } else {
throw Error('getIPV4Address() failed because os.networkInterfaces().eth0 is undefined.'); throw Error(`getIPV4Address() failed because os.networkInterfaces().eth0 is undefined. Please specify NNI manager IP in config.`);
} }
throw Error('getIPV4Address() failed because no valid IPv4 address found.') throw Error('getIPV4Address() failed because no valid IPv4 address found.')
......
...@@ -39,6 +39,7 @@ ...@@ -39,6 +39,7 @@
"@types/express": "^4.16.0", "@types/express": "^4.16.0",
"@types/glob": "^7.1.1", "@types/glob": "^7.1.1",
"@types/js-base64": "^2.3.1", "@types/js-base64": "^2.3.1",
"@types/js-yaml": "^3.12.5",
"@types/mocha": "^5.2.5", "@types/mocha": "^5.2.5",
"@types/node": "10.12.18", "@types/node": "10.12.18",
"@types/request": "^2.47.1", "@types/request": "^2.47.1",
......
...@@ -107,6 +107,11 @@ export namespace ValidationSchemas { ...@@ -107,6 +107,11 @@ export namespace ValidationSchemas {
token: joi.string().min(1), token: joi.string().min(1),
host: joi.string().min(1).required(), host: joi.string().min(1).required(),
reuse: joi.boolean(), reuse: joi.boolean(),
cpuNum: joi.number().min(1),
memoryMB: joi.number().min(100),
gpuNum: joi.number().min(1),
maxTrialNumPerGpu: joi.number(),
useActiveGpu: joi.boolean(),
}), }),
kubeflow_config: joi.object({ // eslint-disable-line @typescript-eslint/camelcase kubeflow_config: joi.object({ // eslint-disable-line @typescript-eslint/camelcase
operator: joi.string().min(1).required(), operator: joi.string().min(1).required(),
......
...@@ -3,6 +3,17 @@ ...@@ -3,6 +3,17 @@
'use strict'; 'use strict';
export enum ScheduleResultType {
// Schedule succeeded
SUCCEED,
// Temporarily, no enough available GPU right now
TMP_NO_AVAILABLE_GPU,
// Cannot match requirement even if all GPU are a
REQUIRE_EXCEED_TOTAL
}
/** /**
* GPU Infromation class * GPU Infromation class
* Representing the dynamic and static information retrieved from Nvidia-smi * Representing the dynamic and static information retrieved from Nvidia-smi
...@@ -52,6 +63,19 @@ export class GPUSummary { ...@@ -52,6 +63,19 @@ export class GPUSummary {
} }
} }
export function parseGpuIndices(gpuIndices?: string): Set<number> | undefined {
if (gpuIndices !== undefined) {
const indices: number[] = gpuIndices.split(',')
.map((x: string) => parseInt(x, 10));
if (indices.length > 0) {
return new Set(indices);
} else {
throw new Error('gpuIndices can not be empty if specified.');
}
}
}
export const GPU_INFO_COLLECTOR_FORMAT_WINDOWS: string = export const GPU_INFO_COLLECTOR_FORMAT_WINDOWS: string =
` `
$env:METRIC_OUTPUT_DIR="{0}" $env:METRIC_OUTPUT_DIR="{0}"
......
...@@ -17,6 +17,10 @@ export class TrialConfig { ...@@ -17,6 +17,10 @@ export class TrialConfig {
// Required GPU number for trial job. The number should be in [0,100] // Required GPU number for trial job. The number should be in [0,100]
public readonly gpuNum: number; public readonly gpuNum: number;
// this flag uses for UT now.
// in future, all environments should be reusable, and this can be configurable by user.
public reuseEnvironment: boolean | undefined = true;
/** /**
* Constructor * Constructor
* @param command Trail command * @param command Trail command
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
'use strict'; 'use strict';
import { TrialJobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../common/trainingService'; import { TrialJobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../common/trainingService';
export class PAIClusterConfig { export class PAIClusterConfig {
public readonly userName: string; public readonly userName: string;
...@@ -12,6 +12,13 @@ export class PAIClusterConfig { ...@@ -12,6 +12,13 @@ export class PAIClusterConfig {
public readonly token?: string; public readonly token?: string;
public readonly reuse?: boolean; public readonly reuse?: boolean;
public cpuNum?: number;
public memoryMB?: number;
public gpuNum?: number;
public useActiveGpu?: boolean;
public maxTrialNumPerGpu?: number;
/** /**
* Constructor * Constructor
* @param userName User name of PAI Cluster * @param userName User name of PAI Cluster
...@@ -20,12 +27,16 @@ export class PAIClusterConfig { ...@@ -20,12 +27,16 @@ export class PAIClusterConfig {
* @param token PAI token of PAI Cluster * @param token PAI token of PAI Cluster
* @param reuse If job is reusable for multiple trials * @param reuse If job is reusable for multiple trials
*/ */
constructor(userName: string, host: string, passWord?: string, token?: string, reuse?: boolean) { constructor(userName: string, host: string, passWord?: string, token?: string, reuse?: boolean,
cpuNum?: number, memoryMB?: number, gpuNum?: number) {
this.userName = userName; this.userName = userName;
this.passWord = passWord; this.passWord = passWord;
this.host = host; this.host = host;
this.token = token; this.token = token;
this.reuse = reuse; this.reuse = reuse;
this.cpuNum = cpuNum;
this.memoryMB = memoryMB;
this.gpuNum = gpuNum;
} }
} }
......
...@@ -6,10 +6,8 @@ ...@@ -6,10 +6,8 @@
import * as assert from 'assert'; import * as assert from 'assert';
import { getLogger, Logger } from '../../common/log'; import { getLogger, Logger } from '../../common/log';
import { randomSelect } from '../../common/utils'; import { randomSelect } from '../../common/utils';
import { GPUInfo } from '../common/gpuData'; import { GPUInfo, parseGpuIndices, ScheduleResultType } from '../common/gpuData';
import { import { ExecutorManager, RemoteMachineMeta, RemoteMachineScheduleResult, RemoteMachineTrialJobDetail } from './remoteMachineData';
parseGpuIndices, RemoteMachineMeta, RemoteMachineScheduleResult, RemoteMachineTrialJobDetail, ScheduleResultType, ExecutorManager
} from './remoteMachineData';
type SCHEDULE_POLICY_NAME = 'random' | 'round-robin'; type SCHEDULE_POLICY_NAME = 'random' | 'round-robin';
...@@ -39,7 +37,7 @@ export class GPUScheduler { ...@@ -39,7 +37,7 @@ export class GPUScheduler {
* @param requiredGPUNum required GPU number * @param requiredGPUNum required GPU number
*/ */
public scheduleMachine(requiredGPUNum: number | undefined, trialJobDetail: RemoteMachineTrialJobDetail): RemoteMachineScheduleResult { public scheduleMachine(requiredGPUNum: number | undefined, trialJobDetail: RemoteMachineTrialJobDetail): RemoteMachineScheduleResult {
if(requiredGPUNum === undefined) { if (requiredGPUNum === undefined) {
requiredGPUNum = 0; requiredGPUNum = 0;
} }
assert(requiredGPUNum >= 0); assert(requiredGPUNum >= 0);
...@@ -48,7 +46,7 @@ export class GPUScheduler { ...@@ -48,7 +46,7 @@ export class GPUScheduler {
// Step 1: Check if required GPU number not exceeds the total GPU number in all machines // Step 1: Check if required GPU number not exceeds the total GPU number in all machines
const eligibleRM: RemoteMachineMeta[] = allRMs.filter((rmMeta: RemoteMachineMeta) => const eligibleRM: RemoteMachineMeta[] = allRMs.filter((rmMeta: RemoteMachineMeta) =>
rmMeta.gpuSummary === undefined || requiredGPUNum === 0 || (requiredGPUNum !== undefined && rmMeta.gpuSummary.gpuCount >= requiredGPUNum)); rmMeta.gpuSummary === undefined || requiredGPUNum === 0 || (requiredGPUNum !== undefined && rmMeta.gpuSummary.gpuCount >= requiredGPUNum));
if (eligibleRM.length === 0) { if (eligibleRM.length === 0) {
// If the required gpu number exceeds the upper limit of all machine's GPU number // If the required gpu number exceeds the upper limit of all machine's GPU number
// Return REQUIRE_EXCEED_TOTAL directly // Return REQUIRE_EXCEED_TOTAL directly
...@@ -75,8 +73,8 @@ export class GPUScheduler { ...@@ -75,8 +73,8 @@ export class GPUScheduler {
this.log.warning(`Scheduler: trialJob id ${trialJobDetail.id}, no machine can be scheduled, return TMP_NO_AVAILABLE_GPU `); this.log.warning(`Scheduler: trialJob id ${trialJobDetail.id}, no machine can be scheduled, return TMP_NO_AVAILABLE_GPU `);
return { return {
resultType : ScheduleResultType.TMP_NO_AVAILABLE_GPU, resultType: ScheduleResultType.TMP_NO_AVAILABLE_GPU,
scheduleInfo : undefined scheduleInfo: undefined
}; };
} }
...@@ -159,7 +157,7 @@ export class GPUScheduler { ...@@ -159,7 +157,7 @@ export class GPUScheduler {
const num: number | undefined = rmMeta.occupiedGpuIndexMap.get(gpuInfo.index); const num: number | undefined = rmMeta.occupiedGpuIndexMap.get(gpuInfo.index);
const maxTrialNumPerGpu: number = rmMeta.maxTrialNumPerGpu ? rmMeta.maxTrialNumPerGpu : 1; const maxTrialNumPerGpu: number = rmMeta.maxTrialNumPerGpu ? rmMeta.maxTrialNumPerGpu : 1;
if ((num === undefined && (!rmMeta.useActiveGpu && gpuInfo.activeProcessNum === 0 || rmMeta.useActiveGpu)) || if ((num === undefined && (!rmMeta.useActiveGpu && gpuInfo.activeProcessNum === 0 || rmMeta.useActiveGpu)) ||
(num !== undefined && num < maxTrialNumPerGpu)) { (num !== undefined && num < maxTrialNumPerGpu)) {
availableGPUs.push(gpuInfo); availableGPUs.push(gpuInfo);
} }
} else { } else {
...@@ -200,7 +198,7 @@ export class GPUScheduler { ...@@ -200,7 +198,7 @@ export class GPUScheduler {
} }
private allocateHost(requiredGPUNum: number, rmMeta: RemoteMachineMeta, private allocateHost(requiredGPUNum: number, rmMeta: RemoteMachineMeta,
gpuInfos: GPUInfo[], trialJobDetail: RemoteMachineTrialJobDetail): RemoteMachineScheduleResult { gpuInfos: GPUInfo[], trialJobDetail: RemoteMachineTrialJobDetail): RemoteMachineScheduleResult {
assert(gpuInfos.length >= requiredGPUNum); assert(gpuInfos.length >= requiredGPUNum);
const allocatedGPUs: GPUInfo[] = this.selectGPUsForTrial(gpuInfos, requiredGPUNum); const allocatedGPUs: GPUInfo[] = this.selectGPUsForTrial(gpuInfos, requiredGPUNum);
allocatedGPUs.forEach((gpuInfo: GPUInfo) => { allocatedGPUs.forEach((gpuInfo: GPUInfo) => {
...@@ -222,10 +220,10 @@ export class GPUScheduler { ...@@ -222,10 +220,10 @@ export class GPUScheduler {
scheduleInfo: { scheduleInfo: {
rmMeta: rmMeta, rmMeta: rmMeta,
cudaVisibleDevice: allocatedGPUs cudaVisibleDevice: allocatedGPUs
.map((gpuInfo: GPUInfo) => { .map((gpuInfo: GPUInfo) => {
return gpuInfo.index; return gpuInfo.index;
}) })
.join(',') .join(',')
} }
}; };
} }
......
...@@ -4,7 +4,7 @@ ...@@ -4,7 +4,7 @@
'use strict'; 'use strict';
import { TrialJobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../common/trainingService'; import { TrialJobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../common/trainingService';
import { GPUInfo, GPUSummary } from '../common/gpuData'; import { GPUInfo, GPUSummary, ScheduleResultType } from '../common/gpuData';
import { ShellExecutor } from './shellExecutor'; import { ShellExecutor } from './shellExecutor';
/** /**
...@@ -25,18 +25,6 @@ export class RemoteMachineMeta { ...@@ -25,18 +25,6 @@ export class RemoteMachineMeta {
public readonly useActiveGpu?: boolean = false; public readonly useActiveGpu?: boolean = false;
} }
export function parseGpuIndices(gpuIndices?: string): Set<number> | undefined {
if (gpuIndices !== undefined) {
const indices: number[] = gpuIndices.split(',')
.map((x: string) => parseInt(x, 10));
if (indices.length > 0) {
return new Set(indices);
} else {
throw new Error('gpuIndices can not be empty if specified.');
}
}
}
/** /**
* The execution result for command executed on remote machine * The execution result for command executed on remote machine
*/ */
...@@ -168,14 +156,3 @@ export class ExecutorManager { ...@@ -168,14 +156,3 @@ export class ExecutorManager {
export type RemoteMachineScheduleResult = { scheduleInfo: RemoteMachineScheduleInfo | undefined; resultType: ScheduleResultType }; export type RemoteMachineScheduleResult = { scheduleInfo: RemoteMachineScheduleInfo | undefined; resultType: ScheduleResultType };
export type RemoteMachineScheduleInfo = { rmMeta: RemoteMachineMeta; cudaVisibleDevice: string }; export type RemoteMachineScheduleInfo = { rmMeta: RemoteMachineMeta; cudaVisibleDevice: string };
export enum ScheduleResultType {
// Schedule succeeded
SUCCEED,
// Temporarily, no enough available GPU right now
TMP_NO_AVAILABLE_GPU,
// Cannot match requirement even if all GPU are a
REQUIRE_EXCEED_TOTAL
}
...@@ -7,6 +7,7 @@ import * as assert from 'assert'; ...@@ -7,6 +7,7 @@ import * as assert from 'assert';
import { EventEmitter } from 'events'; import { EventEmitter } from 'events';
import * as fs from 'fs'; import * as fs from 'fs';
import * as path from 'path'; import * as path from 'path';
import { ShellExecutor } from 'training_service/remote_machine/shellExecutor';
import { Deferred } from 'ts-deferred'; import { Deferred } from 'ts-deferred';
import * as component from '../../common/component'; import * as component from '../../common/component';
import { NNIError, NNIErrorNames } from '../../common/errors'; import { NNIError, NNIErrorNames } from '../../common/errors';
...@@ -22,18 +23,16 @@ import { ...@@ -22,18 +23,16 @@ import {
getVersion, uniqueString getVersion, uniqueString
} from '../../common/utils'; } from '../../common/utils';
import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData'; import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData';
import { GPUSummary } from '../common/gpuData'; import { GPUSummary, ScheduleResultType } from '../common/gpuData';
import { TrialConfig } from '../common/trialConfig'; import { TrialConfig } from '../common/trialConfig';
import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey'; import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey';
import { execMkdir, validateCodeDir } from '../common/util'; import { execMkdir, validateCodeDir } from '../common/util';
import { GPUScheduler } from './gpuScheduler'; import { GPUScheduler } from './gpuScheduler';
import { import {
RemoteMachineMeta, ExecutorManager, RemoteMachineMeta,
RemoteMachineScheduleInfo, RemoteMachineScheduleResult, RemoteMachineTrialJobDetail, RemoteMachineScheduleInfo, RemoteMachineScheduleResult, RemoteMachineTrialJobDetail
ScheduleResultType, ExecutorManager
} from './remoteMachineData'; } from './remoteMachineData';
import { RemoteMachineJobRestServer } from './remoteMachineJobRestServer'; import { RemoteMachineJobRestServer } from './remoteMachineJobRestServer';
import { ShellExecutor } from 'training_service/remote_machine/shellExecutor';
/** /**
* Training Service implementation for Remote Machine (Linux) * Training Service implementation for Remote Machine (Linux)
......
...@@ -3,7 +3,6 @@ ...@@ -3,7 +3,6 @@
'use strict'; 'use strict';
import { EventEmitter } from 'events';
import { delay } from "../../../common/utils"; import { delay } from "../../../common/utils";
import { AMLEnvironmentInformation } from '../aml/amlConfig'; import { AMLEnvironmentInformation } from '../aml/amlConfig';
import { CommandChannel, RunnerConnection } from "../commandChannel"; import { CommandChannel, RunnerConnection } from "../commandChannel";
...@@ -15,11 +14,7 @@ class AMLRunnerConnection extends RunnerConnection { ...@@ -15,11 +14,7 @@ class AMLRunnerConnection extends RunnerConnection {
export class AMLCommandChannel extends CommandChannel { export class AMLCommandChannel extends CommandChannel {
private stopping: boolean = false; private stopping: boolean = false;
private sendQueues: [EnvironmentInformation, string][] = []; private sendQueues: [EnvironmentInformation, string][] = [];
private readonly NNI_METRICS_PATTERN: string = `NNISDK_MEb'(?<metrics>.*?)'`;
public constructor(commandEmitter: EventEmitter) {
super(commandEmitter);
}
public get channelName(): Channel { public get channelName(): Channel {
return "aml"; return "aml";
} }
...@@ -99,11 +94,11 @@ export class AMLCommandChannel extends CommandChannel { ...@@ -99,11 +94,11 @@ export class AMLCommandChannel extends CommandChannel {
const messages = command['trial_runner']; const messages = command['trial_runner'];
if (messages) { if (messages) {
if (messages instanceof Object && currentMessageIndex < messages.length - 1) { if (messages instanceof Object && currentMessageIndex < messages.length - 1) {
for (let index = currentMessageIndex + 1; index < messages.length; index ++) { for (let index = currentMessageIndex + 1; index < messages.length; index++) {
this.handleCommand(runnerConnection.environment, messages[index]); this.handleCommand(runnerConnection.environment, messages[index]);
} }
currentMessageIndex = messages.length - 1; currentMessageIndex = messages.length - 1;
} else if (currentMessageIndex === -1){ } else if (currentMessageIndex === -1) {
this.handleCommand(runnerConnection.environment, messages); this.handleCommand(runnerConnection.environment, messages);
currentMessageIndex += 1; currentMessageIndex += 1;
} }
......
...@@ -3,10 +3,10 @@ ...@@ -3,10 +3,10 @@
'use strict'; 'use strict';
import { GPUSummary } from "training_service/common/gpuData"; import { EventEmitter } from "events";
import { getLogger, Logger } from "../../common/log"; import { getLogger, Logger } from "../../common/log";
import { TrialJobStatus } from "../../common/trainingService"; import { TrialJobStatus } from "../../common/trainingService";
import { EventEmitter } from "events"; import { GPUInfo } from "../../training_service/common/gpuData";
import { WebCommandChannel } from "./channels/webCommandChannel"; import { WebCommandChannel } from "./channels/webCommandChannel";
import { CommandChannel } from "./commandChannel"; import { CommandChannel } from "./commandChannel";
...@@ -14,24 +14,50 @@ import { CommandChannel } from "./commandChannel"; ...@@ -14,24 +14,50 @@ import { CommandChannel } from "./commandChannel";
export type EnvironmentStatus = 'UNKNOWN' | 'WAITING' | 'RUNNING' | 'SUCCEEDED' | 'FAILED' | 'USER_CANCELED'; export type EnvironmentStatus = 'UNKNOWN' | 'WAITING' | 'RUNNING' | 'SUCCEEDED' | 'FAILED' | 'USER_CANCELED';
export type Channel = "web" | "file" | "aml" | "ut"; export type Channel = "web" | "file" | "aml" | "ut";
export class TrialGpuSummary {
// GPU count on the machine
public gpuCount: number;
// The timestamp when GPU summary data queried
public timestamp: string;
// The array of GPU information for each GPU card
public gpuInfos: GPUInfo[];
// GPU assigned status
public assignedGpuIndexMap: Map<number, number> = new Map<number, number>();
constructor(gpuCount: number, timestamp: string, gpuInfos: GPUInfo[]) {
this.gpuCount = gpuCount;
this.timestamp = timestamp;
this.gpuInfos = gpuInfos;
}
}
export class EnvironmentInformation { export class EnvironmentInformation {
// node id is 5 chars, so won't conflict.
private readonly defaultNodeId = "default";
private log: Logger; private log: Logger;
private isNoGpuWarned: boolean = false;
// NNI environment ID
public id: string;
// training platform unique job ID.
public jobId: string;
// training platform job friendly name, in case it's different with job ID.
public jobName: string;
// key states // key states
// true: environment is ready to run trial.
public isIdle: boolean = false;
// true: environment is running, waiting, or unknown. // true: environment is running, waiting, or unknown.
public isAlive: boolean = true; public isAlive: boolean = true;
// true: Runner is initialized, and can receive trials.
public isRunnerReady: boolean = false;
// don't set status in environment directly, use setFinalState function to set a final state. // don't set status in environment directly, use setFinalState function to set a final state.
public status: EnvironmentStatus = "UNKNOWN"; public status: EnvironmentStatus = "UNKNOWN";
// true: environment is ready to run trial.
public runningTrialCount: number = 0;
// uses to count how many trial runs on this environment.
// it can be used in many scenarios, but for now, it uses for reusable.
public assignedTrialCount: number = 0;
// NNI environment ID
public id: string;
// training platform unique job ID.
public envId: string;
// training platform job friendly name, in case it's different with job ID.
public name: string;
public trackingUrl: string = ""; public trackingUrl: string = "";
public workingFolder: string = ""; public workingFolder: string = "";
public runnerWorkingFolder: string = ""; public runnerWorkingFolder: string = "";
...@@ -40,41 +66,82 @@ export class EnvironmentInformation { ...@@ -40,41 +66,82 @@ export class EnvironmentInformation {
// it's used to aggregate node status for multiple node trial // it's used to aggregate node status for multiple node trial
public nodes: Map<string, NodeInfomation>; public nodes: Map<string, NodeInfomation>;
public gpuSummary: Map<string, GPUSummary> = new Map<string, GPUSummary>(); public gpuSummaries: Map<string, TrialGpuSummary> = new Map<string, TrialGpuSummary>();
constructor(id: string, jobName: string, jobId?: string) { // use can specify which gpus can be used by NNI.
// it's usable for sharable environment like remote machine.
public usableGpus?: number[];
// user can specify how to use GPU resource for an environment, like local and remote.
public maxTrialNumberPerGpu?: number;
public useActiveGpu?: boolean;
constructor(id: string, name: string, envId?: string) {
this.log = getLogger(); this.log = getLogger();
this.id = id; this.id = id;
this.jobName = jobName; this.name = name;
this.jobId = jobId ? jobId : jobName; this.envId = envId ? envId : name;
this.nodes = new Map<string, NodeInfomation>(); this.nodes = new Map<string, NodeInfomation>();
} }
public setFinalStatus(status: EnvironmentStatus): void { public setStatus(status: EnvironmentStatus): void {
switch (status) { if (this.status !== status) {
case 'WAITING': this.log.info(`EnvironmentInformation: ${this.envId} change status from ${this.status} to ${status}.`)
case 'SUCCEEDED': this.status = status;
case 'FAILED': }
case 'USER_CANCELED': }
this.status = status;
break; public setGpuSummary(nodeId: string, newGpuSummary: TrialGpuSummary): void {
default: if (nodeId === null || nodeId === undefined) {
this.log.error(`Environment: job ${this.jobId} set an invalid final state ${status}.`); nodeId = this.defaultNodeId;
break; }
const originalGpuSummary = this.gpuSummaries.get(nodeId);
if (undefined === originalGpuSummary) {
newGpuSummary.assignedGpuIndexMap = new Map<number, number>();
this.gpuSummaries.set(nodeId, newGpuSummary);
} else {
originalGpuSummary.gpuCount = newGpuSummary.gpuCount;
originalGpuSummary.timestamp = newGpuSummary.timestamp;
originalGpuSummary.gpuInfos = newGpuSummary.gpuInfos;
}
}
public get defaultGpuSummary(): TrialGpuSummary | undefined {
const gpuSummary = this.gpuSummaries.get(this.defaultNodeId);
if (gpuSummary === undefined) {
if (false === this.isNoGpuWarned) {
this.log.warning(`EnvironmentInformation: ${this.envId} no default gpu found. current gpu info ${JSON.stringify(this.gpuSummaries)}`);
this.isNoGpuWarned = true;
}
} else {
this.isNoGpuWarned = false;
} }
return gpuSummary;
} }
} }
export abstract class EnvironmentService { export abstract class EnvironmentService {
public abstract get hasStorageService(): boolean; public abstract get hasStorageService(): boolean;
public abstract config(key: string, value: string): Promise<void>; public abstract config(key: string, value: string): Promise<void>;
public abstract refreshEnvironmentsStatus(environments: EnvironmentInformation[]): Promise<void>; public abstract refreshEnvironmentsStatus(environments: EnvironmentInformation[]): Promise<void>;
public abstract startEnvironment(environment: EnvironmentInformation): Promise<void>; public abstract startEnvironment(environment: EnvironmentInformation): Promise<void>;
public abstract stopEnvironment(environment: EnvironmentInformation): Promise<void>; public abstract stopEnvironment(environment: EnvironmentInformation): Promise<void>;
public getCommandChannel(commandEmitter: EventEmitter): CommandChannel { // It depends on environment pressure and settings
// for example, OpenPAI relies on API calls, and there is an limitation for frequence, so it need to be bigger.
public get environmentMaintenceLoopInterval(): number {
return 5000;
}
// it's needed in two scenario
// 1. remote machine has fixed number, so it can return false, when all environment are assigned.
// 2. If there are consistent error on requested environments, for example, authentication failure on platform.
public get hasMoreEnvironments(): boolean {
return true;
}
public createCommandChannel(commandEmitter: EventEmitter): CommandChannel {
return new WebCommandChannel(commandEmitter); return new WebCommandChannel(commandEmitter);
} }
...@@ -101,7 +168,7 @@ export class RunnerSettings { ...@@ -101,7 +168,7 @@ export class RunnerSettings {
public nniManagerVersion: string = ""; public nniManagerVersion: string = "";
public logCollection: string = "none"; public logCollection: string = "none";
public command: string = ""; public command: string = "";
public enableGpuCollector: boolean = false; public enableGpuCollector: boolean = true;
// specify which communication channel is used by runner. // specify which communication channel is used by runner.
// supported channel includes: rest, storage, aml // supported channel includes: rest, storage, aml
......
...@@ -3,24 +3,20 @@ ...@@ -3,24 +3,20 @@
'use strict'; 'use strict';
import { EventEmitter } from "events";
import * as fs from 'fs'; import * as fs from 'fs';
import * as path from 'path'; import * as path from 'path';
import * as component from '../../../common/component'; import * as component from '../../../common/component';
import { getExperimentId } from '../../../common/experimentStartupInfo'; import { getExperimentId } from '../../../common/experimentStartupInfo';
import { getLogger, Logger } from '../../../common/log'; import { getLogger, Logger } from '../../../common/log';
import { getExperimentRootDir } from '../../../common/utils';
import { TrialConfigMetadataKey } from '../../common/trialConfigMetadataKey'; import { TrialConfigMetadataKey } from '../../common/trialConfigMetadataKey';
import { AMLClusterConfig, AMLTrialConfig } from '../aml/amlConfig';
import { EnvironmentInformation, EnvironmentService } from '../environment';
import { AMLEnvironmentInformation } from '../aml/amlConfig';
import { AMLClient } from '../aml/amlClient';
import {
NNIManagerIpConfig,
} from '../../../common/trainingService';
import { validateCodeDir } from '../../common/util'; import { validateCodeDir } from '../../common/util';
import { getExperimentRootDir } from '../../../common/utils'; import { AMLClient } from '../aml/amlClient';
import { AMLClusterConfig, AMLEnvironmentInformation, AMLTrialConfig } from '../aml/amlConfig';
import { AMLCommandChannel } from '../channels/amlCommandChannel'; import { AMLCommandChannel } from '../channels/amlCommandChannel';
import { CommandChannel } from "../commandChannel"; import { CommandChannel } from "../commandChannel";
import { EventEmitter } from "events"; import { EnvironmentInformation, EnvironmentService, EnvironmentStatus } from '../environment';
/** /**
...@@ -28,17 +24,11 @@ import { EventEmitter } from "events"; ...@@ -28,17 +24,11 @@ import { EventEmitter } from "events";
*/ */
@component.Singleton @component.Singleton
export class AMLEnvironmentService extends EnvironmentService { export class AMLEnvironmentService extends EnvironmentService {
private readonly log: Logger = getLogger(); private readonly log: Logger = getLogger();
public amlClusterConfig: AMLClusterConfig | undefined; public amlClusterConfig: AMLClusterConfig | undefined;
public amlTrialConfig: AMLTrialConfig | undefined; public amlTrialConfig: AMLTrialConfig | undefined;
private amlJobConfig: any;
private stopping: boolean = false;
private versionCheck: boolean = true;
private isMultiPhase: boolean = false;
private nniVersion?: string;
private experimentId: string; private experimentId: string;
private nniManagerIpConfig?: NNIManagerIpConfig;
private experimentRootDir: string; private experimentRootDir: string;
constructor() { constructor() {
...@@ -51,7 +41,7 @@ export class AMLEnvironmentService extends EnvironmentService { ...@@ -51,7 +41,7 @@ export class AMLEnvironmentService extends EnvironmentService {
return false; return false;
} }
public getCommandChannel(commandEmitter: EventEmitter): CommandChannel { public createCommandChannel(commandEmitter: EventEmitter): CommandChannel {
return new AMLCommandChannel(commandEmitter); return new AMLCommandChannel(commandEmitter);
} }
...@@ -83,29 +73,31 @@ export class AMLEnvironmentService extends EnvironmentService { ...@@ -83,29 +73,31 @@ export class AMLEnvironmentService extends EnvironmentService {
public async refreshEnvironmentsStatus(environments: EnvironmentInformation[]): Promise<void> { public async refreshEnvironmentsStatus(environments: EnvironmentInformation[]): Promise<void> {
environments.forEach(async (environment) => { environments.forEach(async (environment) => {
const amlClient = (environment as AMLEnvironmentInformation).amlClient; const amlClient = (environment as AMLEnvironmentInformation).amlClient;
if (!amlClient) { if (!amlClient) {
throw new Error('AML client not initialized!'); throw new Error('AML client not initialized!');
} }
const status = await amlClient.updateStatus(environment.status); const newStatus = await amlClient.updateStatus(environment.status);
switch (status.toUpperCase()) { switch (newStatus.toUpperCase()) {
case 'WAITING': case 'WAITING':
case 'RUNNING':
case 'QUEUED': case 'QUEUED':
// RUNNING status is set by runner, and ignore waiting status environment.setStatus('WAITING');
break;
case 'RUNNING':
environment.setStatus('RUNNING');
break; break;
case 'COMPLETED': case 'COMPLETED':
case 'SUCCEEDED': case 'SUCCEEDED':
environment.setFinalStatus('SUCCEEDED'); environment.setStatus('SUCCEEDED');
break; break;
case 'FAILED': case 'FAILED':
environment.setFinalStatus('FAILED'); environment.setStatus(newStatus.toUpperCase() as EnvironmentStatus);
break; break;
case 'STOPPED': case 'STOPPED':
case 'STOPPING': case 'STOPPING':
environment.setFinalStatus('USER_CANCELED'); environment.setStatus('USER_CANCELED');
break; break;
default: default:
environment.setFinalStatus('UNKNOWN'); environment.setStatus('UNKNOWN');
} }
}); });
} }
...@@ -120,7 +112,7 @@ export class AMLEnvironmentService extends EnvironmentService { ...@@ -120,7 +112,7 @@ export class AMLEnvironmentService extends EnvironmentService {
const amlEnvironment: AMLEnvironmentInformation = environment as AMLEnvironmentInformation; const amlEnvironment: AMLEnvironmentInformation = environment as AMLEnvironmentInformation;
const environmentLocalTempFolder = path.join(this.experimentRootDir, this.experimentId, "environment-temp"); const environmentLocalTempFolder = path.join(this.experimentRootDir, this.experimentId, "environment-temp");
environment.command = `import os\nos.system('${amlEnvironment.command}')`; environment.command = `import os\nos.system('${amlEnvironment.command}')`;
await fs.promises.writeFile(path.join(environmentLocalTempFolder, 'nni_script.py'), amlEnvironment.command ,{ encoding: 'utf8' }); await fs.promises.writeFile(path.join(environmentLocalTempFolder, 'nni_script.py'), amlEnvironment.command, { encoding: 'utf8' });
const amlClient = new AMLClient( const amlClient = new AMLClient(
this.amlClusterConfig.subscriptionId, this.amlClusterConfig.subscriptionId,
this.amlClusterConfig.resourceGroup, this.amlClusterConfig.resourceGroup,
......
...@@ -4,6 +4,7 @@ ...@@ -4,6 +4,7 @@
'use strict'; 'use strict';
import * as fs from 'fs'; import * as fs from 'fs';
import * as yaml from 'js-yaml';
import * as request from 'request'; import * as request from 'request';
import { Deferred } from 'ts-deferred'; import { Deferred } from 'ts-deferred';
import * as component from '../../../common/component'; import * as component from '../../../common/component';
...@@ -15,7 +16,6 @@ import { NNIPAIK8STrialConfig } from '../../pai/paiK8S/paiK8SConfig'; ...@@ -15,7 +16,6 @@ import { NNIPAIK8STrialConfig } from '../../pai/paiK8S/paiK8SConfig';
import { EnvironmentInformation, EnvironmentService } from '../environment'; import { EnvironmentInformation, EnvironmentService } from '../environment';
import { StorageService } from '../storageService'; import { StorageService } from '../storageService';
const yaml = require('js-yaml');
/** /**
* Collector PAI jobs info from PAI cluster, and update pai job status locally * Collector PAI jobs info from PAI cluster, and update pai job status locally
...@@ -40,6 +40,10 @@ export class OpenPaiEnvironmentService extends EnvironmentService { ...@@ -40,6 +40,10 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
this.experimentId = getExperimentId(); this.experimentId = getExperimentId();
} }
public get environmentMaintenceLoopInterval(): number {
return 5000;
}
public get hasStorageService(): boolean { public get hasStorageService(): boolean {
return true; return true;
} }
...@@ -72,6 +76,16 @@ export class OpenPaiEnvironmentService extends EnvironmentService { ...@@ -72,6 +76,16 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
if (this.paiTrialConfig.paiConfigPath) { if (this.paiTrialConfig.paiConfigPath) {
this.paiJobConfig = yaml.safeLoad(fs.readFileSync(this.paiTrialConfig.paiConfigPath, 'utf8')); this.paiJobConfig = yaml.safeLoad(fs.readFileSync(this.paiTrialConfig.paiConfigPath, 'utf8'));
} }
if (this.paiClusterConfig.gpuNum === undefined) {
this.paiClusterConfig.gpuNum = this.paiTrialConfig.gpuNum;
}
if (this.paiClusterConfig.cpuNum === undefined) {
this.paiClusterConfig.cpuNum = this.paiTrialConfig.cpuNum;
}
if (this.paiClusterConfig.memoryMB === undefined) {
this.paiClusterConfig.memoryMB = this.paiTrialConfig.memoryMB;
}
break; break;
} }
default: default:
...@@ -111,37 +125,35 @@ export class OpenPaiEnvironmentService extends EnvironmentService { ...@@ -111,37 +125,35 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
}); });
environments.forEach((environment) => { environments.forEach((environment) => {
if (jobInfos.has(environment.jobId)) { if (jobInfos.has(environment.envId)) {
const jobResponse = jobInfos.get(environment.jobId); const jobResponse = jobInfos.get(environment.envId);
if (jobResponse && jobResponse.state) { if (jobResponse && jobResponse.state) {
const oldEnvironmentStatus = environment.status; const oldEnvironmentStatus = environment.status;
switch (jobResponse.state) { switch (jobResponse.state) {
case 'RUNNING': case 'RUNNING':
case 'WAITING': case 'WAITING':
// RUNNING status is set by runner, and ignore waiting status
break;
case 'SUCCEEDED': case 'SUCCEEDED':
case 'FAILED': case 'FAILED':
environment.setFinalStatus(jobResponse.state); environment.setStatus(jobResponse.state);
break; break;
case 'STOPPED': case 'STOPPED':
case 'STOPPING': case 'STOPPING':
environment.setFinalStatus('USER_CANCELED'); environment.setStatus('USER_CANCELED');
break; break;
default: default:
this.log.error(`OpenPAI: job ${environment.jobId} returns unknown state ${jobResponse.state}.`); this.log.error(`OpenPAI: job ${environment.envId} returns unknown state ${jobResponse.state}.`);
environment.setFinalStatus('UNKNOWN'); environment.setStatus('UNKNOWN');
} }
if (oldEnvironmentStatus !== environment.status) { if (oldEnvironmentStatus !== environment.status) {
this.log.debug(`OpenPAI: job ${environment.jobId} change status ${oldEnvironmentStatus} to ${environment.status} due to job is ${jobResponse.state}.`) this.log.debug(`OpenPAI: job ${environment.envId} change status ${oldEnvironmentStatus} to ${environment.status} due to job is ${jobResponse.state}.`)
} }
} else { } else {
this.log.error(`OpenPAI: job ${environment.jobId} has no state returned. body:${JSON.stringify(jobResponse)}`); this.log.error(`OpenPAI: job ${environment.envId} has no state returned. body:${JSON.stringify(jobResponse)}`);
// some error happens, and mark this environment // some error happens, and mark this environment
environment.status = 'FAILED'; environment.status = 'FAILED';
} }
} else { } else {
this.log.error(`OpenPAI job ${environment.jobId} is not found in job list.`); this.log.error(`OpenPAI job ${environment.envId} is not found in job list.`);
environment.status = 'UNKNOWN'; environment.status = 'UNKNOWN';
} }
}); });
...@@ -169,8 +181,10 @@ export class OpenPaiEnvironmentService extends EnvironmentService { ...@@ -169,8 +181,10 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
// Step 1. Prepare PAI job configuration // Step 1. Prepare PAI job configuration
const environmentRoot = `${this.paiTrialConfig.containerNFSMountPath}/${this.experimentId}`; const environmentRoot = `${this.paiTrialConfig.containerNFSMountPath}/${this.experimentId}`;
environment.runnerWorkingFolder = `${environmentRoot}/envs/${environment.id}`; environment.runnerWorkingFolder = `${environmentRoot}/envs/${environment.id}`;
environment.command = `cd ${environmentRoot} && ${environment.command}` environment.command = `cd ${environmentRoot} && ${environment.command}`;
environment.trackingUrl = `${this.protocol}://${this.paiClusterConfig.host}/job-detail.html?username=${this.paiClusterConfig.userName}&jobName=${environment.jobId}` environment.trackingUrl = `${this.protocol}://${this.paiClusterConfig.host}/job-detail.html?username=${this.paiClusterConfig.userName}&jobName=${environment.envId}`;
environment.useActiveGpu = this.paiClusterConfig.useActiveGpu;
environment.maxTrialNumberPerGpu = this.paiClusterConfig.maxTrialNumPerGpu;
// Step 2. Generate Job Configuration in yaml format // Step 2. Generate Job Configuration in yaml format
const paiJobConfig = this.generateJobConfigInYamlFormat(environment); const paiJobConfig = this.generateJobConfigInYamlFormat(environment);
...@@ -189,7 +203,7 @@ export class OpenPaiEnvironmentService extends EnvironmentService { ...@@ -189,7 +203,7 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
request(submitJobRequest, (error, response, body) => { request(submitJobRequest, (error, response, body) => {
if ((error !== undefined && error !== null) || response.statusCode >= 400) { if ((error !== undefined && error !== null) || response.statusCode >= 400) {
const errorMessage: string = (error !== undefined && error !== null) ? error.message : const errorMessage: string = (error !== undefined && error !== null) ? error.message :
`start environment ${environment.jobId} failed, http code:${response.statusCode}, http body: ${body}`; `start environment ${environment.envId} failed, http code:${response.statusCode}, http body: ${body}`;
this.log.error(errorMessage); this.log.error(errorMessage);
environment.status = 'FAILED'; environment.status = 'FAILED';
...@@ -211,7 +225,7 @@ export class OpenPaiEnvironmentService extends EnvironmentService { ...@@ -211,7 +225,7 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
} }
const stopJobRequest: request.Options = { const stopJobRequest: request.Options = {
uri: `${this.protocol}://${this.paiClusterConfig.host}/rest-server/api/v2/jobs/${this.paiClusterConfig.userName}~${environment.jobId}/executionType`, uri: `${this.protocol}://${this.paiClusterConfig.host}/rest-server/api/v2/jobs/${this.paiClusterConfig.userName}~${environment.envId}/executionType`,
method: 'PUT', method: 'PUT',
json: true, json: true,
body: { value: 'STOP' }, body: { value: 'STOP' },
...@@ -222,17 +236,17 @@ export class OpenPaiEnvironmentService extends EnvironmentService { ...@@ -222,17 +236,17 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
} }
}; };
this.log.debug(`stopping OpenPAI environment ${environment.jobId}, ${stopJobRequest.uri}`); this.log.debug(`stopping OpenPAI environment ${environment.envId}, ${stopJobRequest.uri}`);
try { try {
request(stopJobRequest, (error, response, _body) => { request(stopJobRequest, (error, response, _body) => {
try { try {
if ((error !== undefined && error !== null) || (response && response.statusCode >= 400)) { if ((error !== undefined && error !== null) || (response && response.statusCode >= 400)) {
this.log.error(`OpenPAI: stop job ${environment.jobId} failed with ${response.statusCode}\n${error}`); this.log.error(`OpenPAI: stop job ${environment.envId} failed with ${response.statusCode}\n${error}`);
deferred.reject((error !== undefined && error !== null) ? error : deferred.reject((error !== undefined && error !== null) ? error :
`Stop trial failed, http code: ${response.statusCode}`); `Stop trial failed, http code: ${response.statusCode}`);
} else { } else {
this.log.info(`OpenPAI job ${environment.jobId} stopped.`); this.log.info(`OpenPAI job ${environment.envId} stopped.`);
} }
deferred.resolve(); deferred.resolve();
} catch (error) { } catch (error) {
...@@ -265,7 +279,7 @@ export class OpenPaiEnvironmentService extends EnvironmentService { ...@@ -265,7 +279,7 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
if (this.paiTrialConfig === undefined) { if (this.paiTrialConfig === undefined) {
throw new Error('trial config is not initialized'); throw new Error('trial config is not initialized');
} }
const jobName = environment.jobId; const jobName = environment.envId;
let nniJobConfig: any = undefined; let nniJobConfig: any = undefined;
if (this.paiTrialConfig.paiConfigPath) { if (this.paiTrialConfig.paiConfigPath) {
...@@ -284,7 +298,6 @@ export class OpenPaiEnvironmentService extends EnvironmentService { ...@@ -284,7 +298,6 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
environment.nodeCount += instanceCount; environment.nodeCount += instanceCount;
} }
// Each taskRole will generate new command in NNI's command format // Each taskRole will generate new command in NNI's command format
// Each command will be formatted to NNI style // Each command will be formatted to NNI style
for (const taskRoleName in nniJobConfig.taskRoles) { for (const taskRoleName in nniJobConfig.taskRoles) {
...@@ -298,6 +311,19 @@ export class OpenPaiEnvironmentService extends EnvironmentService { ...@@ -298,6 +311,19 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
} }
} else { } else {
if (this.paiClusterConfig === undefined) {
throw new Error('PAI Cluster config is not initialized');
}
if (this.paiClusterConfig.gpuNum === undefined) {
throw new Error('PAI Cluster gpuNum is not initialized');
}
if (this.paiClusterConfig.cpuNum === undefined) {
throw new Error('PAI Cluster cpuNum is not initialized');
}
if (this.paiClusterConfig.memoryMB === undefined) {
throw new Error('PAI Cluster memoryMB is not initialized');
}
nniJobConfig = { nniJobConfig = {
protocolVersion: 2, protocolVersion: 2,
name: jobName, name: jobName,
...@@ -320,9 +346,9 @@ export class OpenPaiEnvironmentService extends EnvironmentService { ...@@ -320,9 +346,9 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
taskRetryCount: 0, taskRetryCount: 0,
dockerImage: 'docker_image_0', dockerImage: 'docker_image_0',
resourcePerInstance: { resourcePerInstance: {
gpu: this.paiTrialConfig.gpuNum, gpu: this.paiClusterConfig.gpuNum,
cpu: this.paiTrialConfig.cpuNum, cpu: this.paiClusterConfig.cpuNum,
memoryMB: this.paiTrialConfig.memoryMB memoryMB: this.paiClusterConfig.memoryMB
}, },
commands: [ commands: [
environment.command environment.command
......
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
'use strict';
import * as assert from 'assert';
import { getLogger, Logger } from '../../common/log';
import { randomSelect } from '../../common/utils';
import { GPUInfo, ScheduleResultType } from '../common/gpuData';
import { EnvironmentInformation } from './environment';
import { TrialDetail } from './trial';
type SCHEDULE_POLICY_NAME = 'random' | 'round-robin';
export class GpuSchedulerSetting {
public useActiveGpu: boolean = false;
public maxTrialNumberPerGpu: number = 1;
}
export type GpuScheduleResult = {
resultType: ScheduleResultType;
environment: EnvironmentInformation | undefined;
gpuIndices: GPUInfo[] | undefined;
};
/**
* A simple GPU scheduler implementation
*/
export class GpuScheduler {
// private readonly machineExecutorMap: Set<TrialDetail>;
private readonly log: Logger = getLogger();
private readonly policyName: SCHEDULE_POLICY_NAME = 'round-robin';
private defaultSetting: GpuSchedulerSetting;
private roundRobinIndex: number = 0;
/**
* Constructor
* @param environments map from remote machine to executor
*/
constructor(gpuSchedulerSetting: GpuSchedulerSetting | undefined = undefined) {
if (undefined === gpuSchedulerSetting) {
gpuSchedulerSetting = new GpuSchedulerSetting();
}
this.defaultSetting = gpuSchedulerSetting;
}
public setSettings(gpuSchedulerSetting: GpuSchedulerSetting): void {
this.defaultSetting = gpuSchedulerSetting;
}
/**
* Schedule a machine according to the constraints (requiredGPUNum)
* @param requiredGPUNum required GPU number
*/
public scheduleMachine(environments: EnvironmentInformation[], requiredGPUNum: number | undefined, trialDetail: TrialDetail): GpuScheduleResult {
if (requiredGPUNum === undefined) {
requiredGPUNum = 0;
}
assert(requiredGPUNum >= 0);
// Step 1: Check if required GPU number not exceeds the total GPU number in all machines
const eligibleEnvironments: EnvironmentInformation[] = environments.filter((environment: EnvironmentInformation) =>
environment.defaultGpuSummary === undefined || requiredGPUNum === 0 || (requiredGPUNum !== undefined && environment.defaultGpuSummary.gpuCount >= requiredGPUNum));
if (eligibleEnvironments.length === 0) {
// If the required gpu number exceeds the upper limit of all machine's GPU number
// Return REQUIRE_EXCEED_TOTAL directly
return ({
resultType: ScheduleResultType.REQUIRE_EXCEED_TOTAL,
gpuIndices: undefined,
environment: undefined,
});
}
// Step 2: Allocate Host/GPU for specified trial job
// Currenty the requireGPUNum parameter for all trial jobs are identical.
if (requiredGPUNum > 0) {
// Trial job requires GPU
const result: GpuScheduleResult | undefined = this.scheduleGPUHost(environments, requiredGPUNum, trialDetail);
if (result !== undefined) {
return result;
}
} else {
// Trail job does not need GPU
const allocatedRm: EnvironmentInformation = this.selectMachine(environments, environments);
return this.allocateHost(requiredGPUNum, allocatedRm, [], trialDetail);
}
return {
resultType: ScheduleResultType.TMP_NO_AVAILABLE_GPU,
gpuIndices: undefined,
environment: undefined,
};
}
/**
* remove the job's gpu reversion
*/
public removeGpuReservation(trial: TrialDetail): void {
if (trial.environment !== undefined &&
trial.environment.defaultGpuSummary !== undefined &&
trial.assignedGpus !== undefined &&
trial.assignedGpus.length > 0) {
for (const gpuInfo of trial.assignedGpus) {
const defaultGpuSummary = trial.environment.defaultGpuSummary;
const num: number | undefined = defaultGpuSummary.assignedGpuIndexMap.get(gpuInfo.index);
if (num !== undefined) {
if (num === 1) {
defaultGpuSummary.assignedGpuIndexMap.delete(gpuInfo.index);
} else {
defaultGpuSummary.assignedGpuIndexMap.set(gpuInfo.index, num - 1);
}
}
}
}
}
private scheduleGPUHost(environments: EnvironmentInformation[], requiredGPUNumber: number, trial: TrialDetail): GpuScheduleResult | undefined {
const totalResourceMap: Map<EnvironmentInformation, GPUInfo[]> = this.gpuResourceDetection(environments);
const qualifiedEnvironments: EnvironmentInformation[] = [];
totalResourceMap.forEach((gpuInfos: GPUInfo[], environment: EnvironmentInformation) => {
if (gpuInfos !== undefined && gpuInfos.length >= requiredGPUNumber) {
qualifiedEnvironments.push(environment);
}
});
if (qualifiedEnvironments.length > 0) {
const allocatedEnvironment: EnvironmentInformation = this.selectMachine(qualifiedEnvironments, environments);
const gpuInfos: GPUInfo[] | undefined = totalResourceMap.get(allocatedEnvironment);
if (gpuInfos !== undefined) { // should always true
return this.allocateHost(requiredGPUNumber, allocatedEnvironment, gpuInfos, trial);
} else {
assert(false, 'gpuInfos is undefined');
}
}
}
/**
* Detect available GPU resource for an environment
* @returns Available GPUs on environments
*/
private gpuResourceDetection(environments: EnvironmentInformation[]): Map<EnvironmentInformation, GPUInfo[]> {
const totalResourceMap: Map<EnvironmentInformation, GPUInfo[]> = new Map<EnvironmentInformation, GPUInfo[]>();
environments.forEach((environment: EnvironmentInformation) => {
// Assgin totoal GPU count as init available GPU number
if (environment.defaultGpuSummary !== undefined) {
const defaultGpuSummary = environment.defaultGpuSummary;
const availableGPUs: GPUInfo[] = [];
const designatedGpuIndices: Set<number> = new Set<number>(environment.usableGpus);
if (designatedGpuIndices.size > 0) {
for (const gpuIndex of designatedGpuIndices) {
if (gpuIndex >= environment.defaultGpuSummary.gpuCount) {
throw new Error(`Specified GPU index not found: ${gpuIndex}`);
}
}
}
if (undefined !== defaultGpuSummary.gpuInfos) {
defaultGpuSummary.gpuInfos.forEach((gpuInfo: GPUInfo) => {
// if the GPU has active process, OR be reserved by a job,
// or index not in gpuIndices configuration in machineList,
// or trial number on a GPU reach max number,
// We should NOT allocate this GPU
// if users set useActiveGpu, use the gpu whether there is another activeProcess
if (designatedGpuIndices.size === 0 || designatedGpuIndices.has(gpuInfo.index)) {
if (defaultGpuSummary.assignedGpuIndexMap !== undefined) {
const num: number | undefined = defaultGpuSummary.assignedGpuIndexMap.get(gpuInfo.index);
const maxTrialNumberPerGpu: number = environment.maxTrialNumberPerGpu ? environment.maxTrialNumberPerGpu : this.defaultSetting.maxTrialNumberPerGpu;
const useActiveGpu: boolean = environment.useActiveGpu ? environment.useActiveGpu : this.defaultSetting.useActiveGpu;
if ((num === undefined && (!useActiveGpu && gpuInfo.activeProcessNum === 0 || useActiveGpu)) ||
(num !== undefined && num < maxTrialNumberPerGpu)) {
availableGPUs.push(gpuInfo);
}
} else {
throw new Error(`occupiedGpuIndexMap is undefined!`);
}
}
});
}
totalResourceMap.set(environment, availableGPUs);
}
});
return totalResourceMap;
}
private selectMachine(qualifiedEnvironments: EnvironmentInformation[], allEnvironments: EnvironmentInformation[]): EnvironmentInformation {
assert(qualifiedEnvironments !== undefined && qualifiedEnvironments.length > 0);
if (this.policyName === 'random') {
return randomSelect(qualifiedEnvironments);
} else if (this.policyName === 'round-robin') {
return this.roundRobinSelect(qualifiedEnvironments, allEnvironments);
} else {
throw new Error(`Unsupported schedule policy: ${this.policyName}`);
}
}
private roundRobinSelect(qualifiedEnvironments: EnvironmentInformation[], allEnvironments: EnvironmentInformation[]): EnvironmentInformation {
while (!qualifiedEnvironments.includes(allEnvironments[this.roundRobinIndex % allEnvironments.length])) {
this.roundRobinIndex++;
}
return allEnvironments[this.roundRobinIndex++ % allEnvironments.length];
}
private selectGPUsForTrial(gpuInfos: GPUInfo[], requiredGPUNum: number): GPUInfo[] {
// Sequentially allocate GPUs
return gpuInfos.slice(0, requiredGPUNum);
}
private allocateHost(requiredGPUNum: number, environment: EnvironmentInformation,
gpuInfos: GPUInfo[], trialDetails: TrialDetail): GpuScheduleResult {
assert(gpuInfos.length >= requiredGPUNum);
const allocatedGPUs: GPUInfo[] = this.selectGPUsForTrial(gpuInfos, requiredGPUNum);
const defaultGpuSummary = environment.defaultGpuSummary;
if (undefined === defaultGpuSummary) {
throw new Error(`Environment ${environment.id} defaultGpuSummary shouldn't be undefined!`);
}
allocatedGPUs.forEach((gpuInfo: GPUInfo) => {
let num: number | undefined = defaultGpuSummary.assignedGpuIndexMap.get(gpuInfo.index);
if (num === undefined) {
num = 0;
}
defaultGpuSummary.assignedGpuIndexMap.set(gpuInfo.index, num + 1);
});
trialDetails.assignedGpus = allocatedGPUs;
return {
resultType: ScheduleResultType.SUCCEED,
environment: environment,
gpuIndices: allocatedGPUs,
};
}
}
...@@ -83,7 +83,7 @@ export abstract class StorageService { ...@@ -83,7 +83,7 @@ export abstract class StorageService {
localPath = this.expandPath(false, localPath); localPath = this.expandPath(false, localPath);
remotePath = this.expandPath(true, remotePath); remotePath = this.expandPath(true, remotePath);
this.logger.debug(`copy remotePath: ${remotePath} to localPath: ${localPath}`); this.logger.debug(`copy remotePath: ${remotePath} to localPath: ${localPath}`);
return await this.internalCopy(localPath, remotePath, true, true, false); return await this.internalCopy(remotePath, localPath, true, true, false);
} }
public async removeDirectory(remotePath: string, isRecursive: boolean): Promise<void> { public async removeDirectory(remotePath: string, isRecursive: boolean): Promise<void> {
...@@ -151,7 +151,7 @@ export abstract class StorageService { ...@@ -151,7 +151,7 @@ export abstract class StorageService {
localPath = this.expandPath(false, localPath); localPath = this.expandPath(false, localPath);
remotePath = this.expandPath(true, remotePath); remotePath = this.expandPath(true, remotePath);
this.logger.debug(`copy file remotePath: ${remotePath} to localPath: ${localPath}`); this.logger.debug(`copy file remotePath: ${remotePath} to localPath: ${localPath}`);
await this.internalCopy(localPath, remotePath, false, true, false); await this.internalCopy(remotePath, localPath, false, true, false);
} }
public async removeFile(remotePath: string): Promise<void> { public async removeFile(remotePath: string): Promise<void> {
......
...@@ -17,12 +17,12 @@ export class MountedStorageService extends StorageService { ...@@ -17,12 +17,12 @@ export class MountedStorageService extends StorageService {
if (isRecursive) { if (isRecursive) {
const children = await fs.promises.readdir(path); const children = await fs.promises.readdir(path);
for (const file of children) { for (const file of children) {
const stat = await fs.promises.lstat(file); const filePath = this.internalJoin(path, file);
this.internalRemove(file, stat.isDirectory(), isRecursive); const stat = await fs.promises.lstat(filePath);
await this.internalRemove(filePath, stat.isDirectory(), isRecursive);
} }
} else {
await fs.promises.rmdir(path);
} }
await fs.promises.rmdir(path);
} else { } else {
await fs.promises.unlink(path); await fs.promises.unlink(path);
} }
...@@ -98,7 +98,7 @@ export class MountedStorageService extends StorageService { ...@@ -98,7 +98,7 @@ export class MountedStorageService extends StorageService {
{ {
encoding: "utf8", encoding: "utf8",
start: current, start: current,
end: readLength + current, end: readLength + current - 1,
}).on("data", (data) => { }).on("data", (data) => {
result += data; result += data;
}).on("end", () => { }).on("end", () => {
......
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
'use strict';
import * as chai from 'chai';
import * as fs from 'fs';
import * as path from 'path';
import { getLogger, Logger } from "../../../common/log";
import { cleanupUnitTest, prepareUnitTest } from '../../../common/utils';
import { MountedStorageService } from "../storages/mountedStorageService";
import chaiAsPromised = require("chai-as-promised");
async function remove(removedPath: string, isDirectory: boolean, isRecursive: boolean): Promise<void> {
if (isDirectory) {
if (isRecursive) {
const children = await fs.promises.readdir(removedPath);
for (const fileName of children) {
const filePath = path.join(removedPath, fileName);
const stat = await fs.promises.lstat(filePath);
await remove(filePath, stat.isDirectory(), isRecursive);
}
}
await fs.promises.rmdir(removedPath);
} else {
await fs.promises.unlink(removedPath);
}
}
describe('Unit Test for MountedStorageService', () => {
let service: MountedStorageService;
let log: Logger;
let localPath = "reusableut/local";
let mountedPath = "reusableut/mounted";
const testPath = "testpath";
const testFileName = "testfile.txt";
let localCopiedPath: string;
let localFileName: string;
let mountedFileName: string;
before(() => {
chai.should();
chai.use(chaiAsPromised);
prepareUnitTest();
log = getLogger();
const testRoot = path.dirname(__filename);
localPath = path.join(testRoot, localPath);
mountedPath = path.join(testRoot, mountedPath);
service = new MountedStorageService();
service.initialize(localPath, mountedPath);
localCopiedPath = path.join(localPath, testPath);
localFileName = path.join(localCopiedPath, testFileName);
mountedFileName = path.join(testPath, testFileName);
});
after(() => {
cleanupUnitTest();
});
beforeEach(async () => {
if (!fs.existsSync(localPath)) {
await fs.promises.mkdir(localPath, { recursive: true });
}
if (!fs.existsSync(mountedPath)) {
await fs.promises.mkdir(mountedPath, { recursive: true });
}
log.info(`localFileName: ${localFileName}`);
await fs.promises.mkdir(localCopiedPath, { recursive: true });
await fs.promises.writeFile(localFileName, "hello world");
});
afterEach(async () => {
const testRootPath = path.normalize(`${localPath}/../../reusableut`);
await remove(testRootPath, true, true);
});
it('copyAndRename', async () => {
await service.copyDirectory(localCopiedPath, ".");
chai.expect(fs.existsSync(mountedPath));
const newName = `${testFileName}new`;
await service.rename(mountedFileName, newName);
chai.assert.isFalse(fs.existsSync(testPath));
const newTestPath = `${mountedFileName}new`;
chai.assert.isTrue(await service.exists(newTestPath));
await service.copyFileBack(newTestPath, ".");
const localNewFileName = `${localPath}/${newName}`;
chai.assert.isTrue(fs.existsSync(localNewFileName));
fs.unlinkSync(`${localFileName}`);
fs.rmdirSync(`${localPath}/${testPath}`);
await service.copyDirectoryBack(`${mountedPath}/${testPath}`, `.`);
const localNewName = `${localFileName}new`;
chai.assert.isTrue(fs.existsSync(localNewName));
})
it('FileContentTest', async () => {
const savedFileName = "savedfile.txt";
await service.save("01234", savedFileName);
chai.expect(fs.existsSync(savedFileName));
let content = await service.readFileContent(savedFileName, 0, -1);
chai.assert.equal(content, "01234");
await service.save("56789", savedFileName, true);
content = await service.readFileContent(savedFileName, 0, -1);
chai.assert.equal(content, "0123456789");
content = await service.readFileContent(savedFileName, -1, 1);
chai.assert.equal(content, "0");
content = await service.readFileContent(savedFileName, 5, 1);
chai.assert.equal(content, "5");
content = await service.readFileContent(savedFileName, 5, -1);
chai.assert.equal(content, "56789");
});
});
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
import { encodeCommand } from "../../../core/ipcInterface";
import { Command, CommandChannel, RunnerConnection } from "../commandChannel";
import { Channel, EnvironmentInformation } from "../environment";
class UtRunnerConnection extends RunnerConnection {
}
export class UtCommandChannel extends CommandChannel {
private readonly receivedCommands: Command[] = [];
public get channelName(): Channel {
return "ut";
}
public async testSendCommandToTrialDispatcher(environment: EnvironmentInformation, commandType: string, commandData: any) {
const content = encodeCommand(commandType, JSON.stringify(commandData));
this.log.debug(`UtCommandChannel: env ${environment.id} send test command ${content}`);
this.handleCommand(environment, content.toString("utf8"));
}
public async testReceiveCommandFromTrialDispatcher(): Promise<Command | undefined> {
return this.receivedCommands.shift();
}
public async config(_key: string, value: any): Promise<void> {
// do nothing
}
public async start(): Promise<void> {
// do nothing
}
public async stop(): Promise<void> {
// do nothing
}
public async run(): Promise<void> {
// do nothing
}
protected async sendCommandInternal(environment: EnvironmentInformation, message: string): Promise<void> {
const parsedCommands = this.parseCommands(message);
for (const parsedCommand of parsedCommands) {
const command = new Command(environment, parsedCommand[0], parsedCommand[1]);
this.receivedCommands.push(command);
}
}
protected createRunnerConnection(environment: EnvironmentInformation): RunnerConnection {
// do nothing
return new UtRunnerConnection(environment);
}
}
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
import { EnvironmentInformation, EnvironmentService, EnvironmentStatus } from "../environment";
import { EventEmitter } from "events";
import { CommandChannel } from "../commandChannel";
import { UtCommandChannel } from "./utCommandChannel";
export class UtEnvironmentService extends EnvironmentService {
private commandChannel: UtCommandChannel | undefined;
private allEnvironments = new Map<string, EnvironmentInformation>();
private hasMoreEnvironmentsInternal = true;
constructor() {
super();
}
public get hasStorageService(): boolean {
// storage service is tested by integration testing.
return false;
}
public get environmentMaintenceLoopInterval(): number {
return 1;
}
public testSetEnvironmentStatus(environment: EnvironmentInformation, newStatus: EnvironmentStatus): void {
environment.status = newStatus;
}
public testReset(): void {
this.allEnvironments.clear();
}
public testGetEnvironments(): Map<string, EnvironmentInformation> {
return this.allEnvironments;
}
public testGetCommandChannel(): UtCommandChannel {
if (this.commandChannel === undefined) {
throw new Error(`command channel shouldn't be undefined.`);
}
return this.commandChannel;
}
public testSetNoMoreEnvironment(hasMore: boolean): void {
this.hasMoreEnvironmentsInternal = hasMore;
}
public get hasMoreEnvironments(): boolean {
return this.hasMoreEnvironmentsInternal;
}
public createCommandChannel(commandEmitter: EventEmitter): CommandChannel {
this.commandChannel = new UtCommandChannel(commandEmitter)
return this.commandChannel;
}
public async config(_key: string, _value: string): Promise<void> {
// do nothing
}
public async refreshEnvironmentsStatus(environments: EnvironmentInformation[]): Promise<void> {
// do nothing
}
public async startEnvironment(environment: EnvironmentInformation): Promise<void> {
if (!this.allEnvironments.has(environment.id)) {
this.allEnvironments.set(environment.id, environment);
environment.status = "WAITING";
}
}
public async stopEnvironment(environment: EnvironmentInformation): Promise<void> {
environment.status = "USER_CANCELED";
}
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment