"git@developer.sourcefind.cn:OpenDAS/nni.git" did not exist on "eab0da154b4c8cf68f95ef294844649c9e17ee60"
Unverified Commit 28e26ae9 authored by fishyds, committed by GitHub
Browse files

[Kubeflow Training Service] Explicitly set cuda_visible_devices env var (#388)

* Use different output folder for ps and worker

* Add cuda_visible_devices env var if gpuNum is 0
parent 1df750e2
...@@ -57,22 +57,4 @@ export class KubeflowTrialJobDetail implements TrialJobDetail { ...@@ -57,22 +57,4 @@ export class KubeflowTrialJobDetail implements TrialJobDetail {
} }
} }
/**
 * Bash script template for launching a trial on Kubeflow.
 *
 * The text carries positional placeholders meant for a String.Format-style formatter:
 *   {0} NNI_SYS_DIR, {1} NNI_OUTPUT_DIR, {2} NNI_TRIAL_JOB_ID, {3} NNI_EXP_ID,
 *   {4} NNI_CODE_DIR, {5} NNI_TRIAL_SEQ_ID, {6} trial command,
 *   {7} NNI manager IP, {8} NNI manager port.
 *
 * The script exports those variables, creates the system and output directories,
 * copies the code directory into the system directory, runs install_nni.sh and
 * finally starts nni_trial_tool.trial_keeper with stdout/stderr redirected into
 * the output directory. The template ends with a trailing newline.
 */
export const KUBEFLOW_RUN_SHELL_FORMAT: string = [
    '#!/bin/bash',
    'export NNI_PLATFORM=kubeflow',
    'export NNI_SYS_DIR={0}',
    'export NNI_OUTPUT_DIR={1}',
    'export MULTI_PHASE=false',
    'export NNI_TRIAL_JOB_ID={2}',
    'export NNI_EXP_ID={3}',
    'export NNI_CODE_DIR={4}',
    'export NNI_TRIAL_SEQ_ID={5}',
    'mkdir -p $NNI_SYS_DIR',
    'mkdir -p $NNI_OUTPUT_DIR',
    'cp -rT $NNI_CODE_DIR $NNI_SYS_DIR',
    'cd $NNI_SYS_DIR',
    'sh install_nni.sh  # Check and install NNI pkg',
    `python3 -m nni_trial_tool.trial_keeper --trial_command '{6}' --nnimanager_ip '{7}' --nnimanager_port '{8}' 1>$NNI_OUTPUT_DIR/trialkeeper_stdout 2>$NNI_OUTPUT_DIR/trialkeeper_stderr`,
    '' // keeps the trailing newline of the original template
].join('\n');
/**
 * String-literal union of the TFJob states this module distinguishes.
 * NOTE(review): presumably mirrors the condition types reported by the Kubeflow
 * tf-operator; confirm against the job info collector.
 */
export type KubeflowTFJobType = 'Created' | 'Running' | 'Failed' | 'Succeeded';
\ No newline at end of file
...@@ -30,15 +30,14 @@ import { EventEmitter } from 'events'; ...@@ -30,15 +30,14 @@ import { EventEmitter } from 'events';
import { getExperimentId, getInitTrialSequenceId } from '../../common/experimentStartupInfo'; import { getExperimentId, getInitTrialSequenceId } from '../../common/experimentStartupInfo';
import { getLogger, Logger } from '../../common/log'; import { getLogger, Logger } from '../../common/log';
import { MethodNotImplementedError } from '../../common/errors'; import { MethodNotImplementedError } from '../../common/errors';
import { String } from 'typescript-string-operations';
import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey'; import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey';
import { import {
JobApplicationForm, TrainingService, TrialJobApplicationForm, JobApplicationForm, TrainingService, TrialJobApplicationForm,
TrialJobDetail, TrialJobMetric TrialJobDetail, TrialJobMetric
} from '../../common/trainingService'; } from '../../common/trainingService';
import { delay, generateParamFileName, getExperimentRootDir, getIPV4Address, uniqueString } from '../../common/utils'; import { delay, generateParamFileName, getExperimentRootDir, getIPV4Address, uniqueString } from '../../common/utils';
import { KubeflowClusterConfig, kubeflowOperatorMap, KubeflowTrialConfig, KubeflowTrialConfigTemplate, NFSConfig } from './kubeflowConfig'; import { KubeflowClusterConfig, kubeflowOperatorMap, KubeflowTrialConfig, NFSConfig } from './kubeflowConfig';
import { KubeflowTrialJobDetail, KUBEFLOW_RUN_SHELL_FORMAT } from './kubeflowData'; import { KubeflowTrialJobDetail } from './kubeflowData';
import { KubeflowJobRestServer } from './kubeflowJobRestServer'; import { KubeflowJobRestServer } from './kubeflowJobRestServer';
import { KubeflowJobInfoCollector } from './kubeflowJobInfoCollector'; import { KubeflowJobInfoCollector } from './kubeflowJobInfoCollector';
...@@ -455,18 +454,46 @@ class KubeflowTrainingService implements TrainingService { ...@@ -455,18 +454,46 @@ class KubeflowTrainingService implements TrainingService {
*/ */
private genereateRunScript(trialJobId: string, trialWorkingFolder: string,
                           command: string, trialSequenceId: string, roleType: DistTrainRole): string {
    // Builds the bash script that a Kubeflow pod of the given role executes:
    // it exports the NNI_* environment variables, prepares the working
    // directories and finally launches nni_trial_tool.trial_keeper.
    // Each role writes into its own `<role>_output` folder under `output`.
    const runScriptLines: string[] = [
        '#!/bin/bash',
        'export NNI_PLATFORM=kubeflow',
        `export NNI_SYS_DIR=$PWD/nni/${trialJobId}`,
        `export NNI_OUTPUT_DIR=${path.join(trialWorkingFolder, 'output', `${roleType}_output`)}`,
        'export MULTI_PHASE=false',
        `export NNI_TRIAL_JOB_ID=${trialJobId}`,
        `export NNI_EXP_ID=${getExperimentId()}`,
        `export NNI_CODE_DIR=${trialWorkingFolder}`,
        `export NNI_TRIAL_SEQ_ID=${trialSequenceId}`
    ];

    // Nvidia device plugin for K8S has a known issue that requesting zero GPUs allocates all GPUs
    // Refer to https://github.com/NVIDIA/k8s-device-plugin/issues/61
    // So we have to explicitly set CUDA_VISIBLE_DEVICES to empty if user sets gpuNum to 0 in NNI config file
    const trialConfig = this.kubeflowTrialConfig;
    if (trialConfig) {
        // Pick the replica spec matching this role; any other role has no GPU setting to inspect.
        const roleConfig = roleType === 'ps'
            ? trialConfig.ps
            : (roleType === 'worker' ? trialConfig.worker : undefined);
        if (roleConfig && roleConfig.gpuNum === 0) {
            runScriptLines.push(`export CUDA_VISIBLE_DEVICES=''`);
        }
    }

    // The trial command is embedded in a single-quoted shell word, so every
    // embedded single quote must be rewritten as '\'' (close quote, escaped
    // quote, reopen quote); otherwise the quote would terminate the word and
    // the command would reach trial_keeper truncated or split.
    const quotedCommand: string = command.replace(/'/g, "'\\''");

    runScriptLines.push(
        'mkdir -p $NNI_SYS_DIR',
        'mkdir -p $NNI_OUTPUT_DIR',
        'cp -rT $NNI_CODE_DIR $NNI_SYS_DIR',
        'cd $NNI_SYS_DIR',
        'sh install_nni.sh # Check and install NNI pkg',
        `python3 -m nni_trial_tool.trial_keeper --trial_command '${quotedCommand}' --nnimanager_ip '${getIPV4Address()}' --nnimanager_port '${this.kubeflowRestServerPort}' 1>$NNI_OUTPUT_DIR/trialkeeper_stdout 2>$NNI_OUTPUT_DIR/trialkeeper_stderr`
    );

    return runScriptLines.join('\n');
}
private generateSequenceId(): number { private generateSequenceId(): number {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment