Unverified Commit 204b1eba authored by SparkSnail's avatar SparkSnail Committed by GitHub
Browse files

Set gpuNum as optional (#1389)

parent 555334de
...@@ -266,7 +266,7 @@ machineList: ...@@ -266,7 +266,7 @@ machineList:
* __gpuNum__ * __gpuNum__
__gpuNum__ specifies the gpu number to run the tuner process. The value of this field should be a positive number. __gpuNum__ specifies the gpu number to run the tuner process. The value of this field should be a positive number. If the field is not set, NNI will not set `CUDA_VISIBLE_DEVICES` in the generated script (that is, it will not use `CUDA_VISIBLE_DEVICES` to control which GPUs are visible to the trial command), and it will not manage GPU resources.
Note: users could only specify one way to set tuner, for example, set {tunerName, optimizationMode} or {tunerCommand, tunerCwd}, and could not set them both. Note: users could only specify one way to set tuner, for example, set {tunerName, optimizationMode} or {tunerCommand, tunerCwd}, and could not set them both.
......
...@@ -307,9 +307,11 @@ class LocalTrainingService implements TrainingService { ...@@ -307,9 +307,11 @@ class LocalTrainingService implements TrainingService {
if (this.localTrailConfig === undefined) { if (this.localTrailConfig === undefined) {
throw new Error('trial config parsed failed'); throw new Error('trial config parsed failed');
} }
this.log.info(`required GPU number is ${this.localTrailConfig.gpuNum}`); if (this.localTrailConfig.gpuNum !== undefined) {
if (this.gpuScheduler === undefined && this.localTrailConfig.gpuNum > 0) { this.log.info(`required GPU number is ${this.localTrailConfig.gpuNum}`);
this.gpuScheduler = new GPUScheduler(); if (this.gpuScheduler === undefined && this.localTrailConfig.gpuNum > 0) {
this.gpuScheduler = new GPUScheduler();
}
} }
break; break;
case TrialConfigMetadataKey.LOCAL_CONFIG: case TrialConfigMetadataKey.LOCAL_CONFIG:
...@@ -399,7 +401,8 @@ class LocalTrainingService implements TrainingService { ...@@ -399,7 +401,8 @@ class LocalTrainingService implements TrainingService {
private getEnvironmentVariables( private getEnvironmentVariables(
trialJobDetail: TrialJobDetail, trialJobDetail: TrialJobDetail,
resource: { gpuIndices: number[] }): { key: string; value: string }[] { resource: { gpuIndices: number[] },
gpuNum: number | undefined): { key: string; value: string }[] {
const envVariables: { key: string; value: string }[] = [ const envVariables: { key: string; value: string }[] = [
{ key: 'NNI_PLATFORM', value: 'local' }, { key: 'NNI_PLATFORM', value: 'local' },
{ key: 'NNI_EXP_ID', value: this.experimentId }, { key: 'NNI_EXP_ID', value: this.experimentId },
...@@ -409,11 +412,12 @@ class LocalTrainingService implements TrainingService { ...@@ -409,11 +412,12 @@ class LocalTrainingService implements TrainingService {
{ key: 'NNI_TRIAL_SEQ_ID', value: trialJobDetail.sequenceId.toString() }, { key: 'NNI_TRIAL_SEQ_ID', value: trialJobDetail.sequenceId.toString() },
{ key: 'MULTI_PHASE', value: this.isMultiPhase.toString() } { key: 'MULTI_PHASE', value: this.isMultiPhase.toString() }
]; ];
if (gpuNum !== undefined) {
envVariables.push({ envVariables.push({
key: 'CUDA_VISIBLE_DEVICES', key: 'CUDA_VISIBLE_DEVICES',
value: this.gpuScheduler === undefined ? '-1' : resource.gpuIndices.join(',') value: this.gpuScheduler === undefined ? '-1' : resource.gpuIndices.join(',')
}); });
}
return envVariables; return envVariables;
} }
...@@ -490,6 +494,7 @@ class LocalTrainingService implements TrainingService { ...@@ -490,6 +494,7 @@ class LocalTrainingService implements TrainingService {
if (!success) { if (!success) {
break; break;
} }
this.occupyResource(resource); this.occupyResource(resource);
await this.runTrialJob(trialJobId, resource); await this.runTrialJob(trialJobId, resource);
} }
...@@ -526,7 +531,10 @@ class LocalTrainingService implements TrainingService { ...@@ -526,7 +531,10 @@ class LocalTrainingService implements TrainingService {
private async runTrialJob(trialJobId: string, resource: {gpuIndices: number[]}): Promise<void> { private async runTrialJob(trialJobId: string, resource: {gpuIndices: number[]}): Promise<void> {
const trialJobDetail: LocalTrialJobDetail = <LocalTrialJobDetail>this.jobMap.get(trialJobId); const trialJobDetail: LocalTrialJobDetail = <LocalTrialJobDetail>this.jobMap.get(trialJobId);
const variables: { key: string; value: string }[] = this.getEnvironmentVariables(trialJobDetail, resource); if (this.localTrailConfig === undefined) {
throw new Error(`localTrialConfig not initialized!`);
}
const variables: { key: string; value: string }[] = this.getEnvironmentVariables(trialJobDetail, resource, this.localTrailConfig.gpuNum);
if (this.localTrailConfig === undefined) { if (this.localTrailConfig === undefined) {
throw new Error('trial config is not initialized'); throw new Error('trial config is not initialized');
......
...@@ -48,14 +48,17 @@ export class GPUScheduler { ...@@ -48,14 +48,17 @@ export class GPUScheduler {
* Schedule a machine according to the constraints (requiredGPUNum) * Schedule a machine according to the constraints (requiredGPUNum)
* @param requiredGPUNum required GPU number * @param requiredGPUNum required GPU number
*/ */
public scheduleMachine(requiredGPUNum: number, trialJobDetail : RemoteMachineTrialJobDetail) : RemoteMachineScheduleResult { public scheduleMachine(requiredGPUNum: number | undefined, trialJobDetail : RemoteMachineTrialJobDetail) : RemoteMachineScheduleResult {
if(requiredGPUNum === undefined) {
requiredGPUNum = 0;
}
assert(requiredGPUNum >= 0); assert(requiredGPUNum >= 0);
const allRMs: RemoteMachineMeta[] = Array.from(this.machineSSHClientMap.keys()); const allRMs: RemoteMachineMeta[] = Array.from(this.machineSSHClientMap.keys());
assert(allRMs.length > 0); assert(allRMs.length > 0);
// Step 1: Check if required GPU number not exceeds the total GPU number in all machines // Step 1: Check if required GPU number not exceeds the total GPU number in all machines
const eligibleRM: RemoteMachineMeta[] = allRMs.filter((rmMeta : RemoteMachineMeta) => const eligibleRM: RemoteMachineMeta[] = allRMs.filter((rmMeta : RemoteMachineMeta) =>
rmMeta.gpuSummary === undefined || requiredGPUNum === 0 || rmMeta.gpuSummary.gpuCount >= requiredGPUNum); rmMeta.gpuSummary === undefined || requiredGPUNum === 0 || (requiredGPUNum !== undefined && rmMeta.gpuSummary.gpuCount >= requiredGPUNum));
if (eligibleRM.length === 0) { if (eligibleRM.length === 0) {
// If the required gpu number exceeds the upper limit of all machine's GPU number // If the required gpu number exceeds the upper limit of all machine's GPU number
// Return REQUIRE_EXCEED_TOTAL directly // Return REQUIRE_EXCEED_TOTAL directly
......
...@@ -601,12 +601,16 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -601,12 +601,16 @@ class RemoteMachineTrainingService implements TrainingService {
let command: string; let command: string;
// Set CUDA_VISIBLE_DEVICES environment variable based on cuda_visible_device // Set CUDA_VISIBLE_DEVICES environment variable based on cuda_visible_device
// If no valid cuda_visible_device is defined, set CUDA_VISIBLE_DEVICES to empty string to hide GPU device // If no valid cuda_visible_device is defined, set CUDA_VISIBLE_DEVICES to empty string to hide GPU device
if (typeof cuda_visible_device === 'string' && cuda_visible_device.length > 0) { // If gpuNum is undefined, will not set CUDA_VISIBLE_DEVICES in script
command = `CUDA_VISIBLE_DEVICES=${cuda_visible_device} ${this.trialConfig.command}`; if (this.trialConfig.gpuNum === undefined) {
command = this.trialConfig.command;
} else { } else {
command = `CUDA_VISIBLE_DEVICES=" " ${this.trialConfig.command}`; if (typeof cuda_visible_device === 'string' && cuda_visible_device.length > 0) {
command = `CUDA_VISIBLE_DEVICES=${cuda_visible_device} ${this.trialConfig.command}`;
} else {
command = `CUDA_VISIBLE_DEVICES=" " ${this.trialConfig.command}`;
}
} }
// tslint:disable-next-line: strict-boolean-expressions // tslint:disable-next-line: strict-boolean-expressions
const nniManagerIp: string = this.nniManagerIpConfig ? this.nniManagerIpConfig.nniManagerIp : getIPV4Address(); const nniManagerIp: string = this.nniManagerIpConfig ? this.nniManagerIpConfig.nniManagerIp : getIPV4Address();
if (this.remoteRestServerPort === undefined) { if (this.remoteRestServerPort === undefined) {
......
...@@ -220,7 +220,7 @@ common_trial_schema = { ...@@ -220,7 +220,7 @@ common_trial_schema = {
'trial':{ 'trial':{
'command': setType('command', str), 'command': setType('command', str),
'codeDir': setPathCheck('codeDir'), 'codeDir': setPathCheck('codeDir'),
'gpuNum': setNumberRange('gpuNum', int, 0, 99999), Optional('gpuNum'): setNumberRange('gpuNum', int, 0, 99999),
Optional('nasMode'): setChoice('classic_mode', 'enas_mode', 'oneshot_mode') Optional('nasMode'): setChoice('classic_mode', 'enas_mode', 'oneshot_mode')
} }
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment