Unverified Commit ee25377d authored by SparkSnail's avatar SparkSnail Committed by GitHub
Browse files

Update pai yaml merge method (#2369)

parent f5dbcd81
...@@ -92,8 +92,18 @@ Compared with [LocalMode](LocalMode.md) and [RemoteMachineMode](RemoteMachineMod ...@@ -92,8 +92,18 @@ Compared with [LocalMode](LocalMode.md) and [RemoteMachineMode](RemoteMachineMod
* Required key. Set the mount path in your container used in PAI. * Required key. Set the mount path in your container used in PAI.
* paiStoragePlugin * paiStoragePlugin
* Optional key. Set the storage plugin name used in PAI. If it is not set in trial configuration, it should be set in the config file specified in `paiConfigPath` field. * Optional key. Set the storage plugin name used in PAI. If it is not set in trial configuration, it should be set in the config file specified in `paiConfigPath` field.
* command
* Optional key. Set the command to run in the PAI container. It is required when `paiConfigPath` is not set.
* paiConfigPath * paiConfigPath
* Optional key. Set the file path of pai job configuration, the file is in yaml format. * Optional key. Set the file path of pai job configuration, the file is in yaml format.
If users set `paiConfigPath` in NNI's configuration file, there is no need to specify the fields `command`, `paiStoragePlugin`, `virtualCluster`, `image`, `memoryMB`, `cpuNum`, `gpuNum` in the `trial` configuration. These fields will use the values from the config file specified by `paiConfigPath`.
```
Note:
1. The job name in PAI's configuration file will be replaced by a new job name created by NNI, in the format nni_exp_${this.experimentId}_trial_${trialJobId}.
2. If users set multiple taskRoles in PAI's configuration file, NNI will wrap all of these taskRoles and start multiple tasks in one trial job. Users should ensure that only one taskRole reports metrics to NNI; otherwise there may be conflict errors.
```
Once you have filled in the NNI experiment config file and saved it (for example, as exp_pai.yml), run the following command Once you have filled in the NNI experiment config file and saved it (for example, as exp_pai.yml), run the following command
......
...@@ -14,7 +14,6 @@ ...@@ -14,7 +14,6 @@
"azure-storage": "^2.10.2", "azure-storage": "^2.10.2",
"chai-as-promised": "^7.1.1", "chai-as-promised": "^7.1.1",
"child-process-promise": "^2.2.1", "child-process-promise": "^2.2.1",
"deepmerge": "^4.2.2",
"express": "^4.16.3", "express": "^4.16.3",
"express-joi-validator": "^2.0.0", "express-joi-validator": "^2.0.0",
"js-base64": "^2.4.9", "js-base64": "^2.4.9",
......
...@@ -44,7 +44,6 @@ import { PAIClusterConfig, PAITrialJobDetail } from '../paiConfig'; ...@@ -44,7 +44,6 @@ import { PAIClusterConfig, PAITrialJobDetail } from '../paiConfig';
import { PAIJobRestServer } from '../paiJobRestServer'; import { PAIJobRestServer } from '../paiJobRestServer';
const yaml = require('js-yaml'); const yaml = require('js-yaml');
const deepmerge = require('deepmerge');
/** /**
* Training Service implementation for OpenPAI (Open Platform for AI) * Training Service implementation for OpenPAI (Open Platform for AI)
...@@ -53,9 +52,11 @@ const deepmerge = require('deepmerge'); ...@@ -53,9 +52,11 @@ const deepmerge = require('deepmerge');
@component.Singleton @component.Singleton
class PAIK8STrainingService extends PAITrainingService { class PAIK8STrainingService extends PAITrainingService {
protected paiTrialConfig: NNIPAIK8STrialConfig | undefined; protected paiTrialConfig: NNIPAIK8STrialConfig | undefined;
private paiJobConfig: undefined;
private nniVersion: string | undefined;
constructor() { constructor() {
super(); super();
} }
public async setClusterMetadata(key: string, value: string): Promise<void> { public async setClusterMetadata(key: string, value: string): Promise<void> {
...@@ -84,9 +85,13 @@ class PAIK8STrainingService extends PAITrainingService { ...@@ -84,9 +85,13 @@ class PAIK8STrainingService extends PAITrainingService {
this.paiTrialConfig = <NNIPAIK8STrialConfig>JSON.parse(value); this.paiTrialConfig = <NNIPAIK8STrialConfig>JSON.parse(value);
// Validate to make sure codeDir doesn't have too many files // Validate to make sure codeDir doesn't have too many files
await validateCodeDir(this.paiTrialConfig.codeDir); await validateCodeDir(this.paiTrialConfig.codeDir);
if (this.paiTrialConfig.paiConfigPath) {
this.paiJobConfig = yaml.safeLoad(fs.readFileSync(this.paiTrialConfig.paiConfigPath, 'utf8'));
}
break; break;
case TrialConfigMetadataKey.VERSION_CHECK: case TrialConfigMetadataKey.VERSION_CHECK:
this.versionCheck = (value === 'true' || value === 'True'); this.versionCheck = (value === 'true' || value === 'True');
this.nniVersion = this.versionCheck ? await getVersion() : '';
break; break;
case TrialConfigMetadataKey.LOG_COLLECTION: case TrialConfigMetadataKey.LOG_COLLECTION:
this.logCollection = value; this.logCollection = value;
...@@ -141,72 +146,100 @@ class PAIK8STrainingService extends PAITrainingService { ...@@ -141,72 +146,100 @@ class PAIK8STrainingService extends PAITrainingService {
return trialJobDetail; return trialJobDetail;
} }
public generateJobConfigInYamlFormat(trialJobId: string, command: string) { private generateNNITrialCommand(trialJobDetail: PAITrialJobDetail, command: string) {
if (this.paiTrialConfig === undefined) { if (this.paiTrialConfig === undefined) {
throw new Error('trial config is not initialized'); throw new Error('trial config is not initialized');
} }
const jobName = `nni_exp_${this.experimentId}_trial_${trialJobId}` const containerWorkingDir: string = `${this.paiTrialConfig.containerNFSMountPath}/${this.experimentId}/${trialJobDetail.id}`;
const paiJobConfig: any = { const nniManagerIp: string = this.nniManagerIpConfig ? this.nniManagerIpConfig.nniManagerIp : getIPV4Address();
protocolVersion: 2, const nniPaiTrialCommand: string = String.Format(
name: jobName, PAI_K8S_TRIAL_COMMAND_FORMAT,
type: 'job', `${containerWorkingDir}`,
jobRetryCount: 0, `${containerWorkingDir}/nnioutput`,
prerequisites: [ trialJobDetail.id,
{ this.experimentId,
type: 'dockerimage', trialJobDetail.form.sequenceId,
uri: this.paiTrialConfig.image, this.isMultiPhase,
name: 'docker_image_0' command,
} nniManagerIp,
], this.paiRestServerPort,
taskRoles: { this.nniVersion,
taskrole: { this.logCollection
instances: 1, )
completion: { .replace(/\r\n|\n|\r/gm, '');
minFailedInstances: 1,
minSucceededInstances: -1 return nniPaiTrialCommand;
},
taskRetryCount: 0, }
dockerImage: 'docker_image_0',
resourcePerInstance: { private generateJobConfigInYamlFormat(trialJobDetail: PAITrialJobDetail) {
gpu: this.paiTrialConfig.gpuNum, if (this.paiTrialConfig === undefined) {
cpu: this.paiTrialConfig.cpuNum, throw new Error('trial config is not initialized');
memoryMB: this.paiTrialConfig.memoryMB
},
commands: [
command
]
}
},
extras: {
'com.microsoft.pai.runtimeplugin': [
{
plugin: this.paiTrialConfig.paiStoragePlugin
}
],
submitFrom: 'submit-job-v2'
}
}
if (this.paiTrialConfig.virtualCluster) {
paiJobConfig.defaults= {
virtualCluster: this.paiTrialConfig.virtualCluster
}
} }
const jobName = `nni_exp_${this.experimentId}_trial_${trialJobDetail.id}`
let nniJobConfig: any = undefined;
if (this.paiTrialConfig.paiConfigPath) { if (this.paiTrialConfig.paiConfigPath) {
try { nniJobConfig = this.paiJobConfig;
const additionalPAIConfig = yaml.safeLoad(fs.readFileSync(this.paiTrialConfig.paiConfigPath, 'utf8')); nniJobConfig.name = jobName;
//deepmerge(x, y), if an element at the same key is present for both x and y, the value from y will appear in the result. // Each taskRole will generate new command in NNI's command format
//refer: https://github.com/TehShrike/deepmerge // Each command will be formatted to NNI style
const overwriteMerge = (destinationArray: any, sourceArray: any, options: any) => sourceArray; for(const taskRoleIndex in nniJobConfig.taskRoles) {
return yaml.safeDump(deepmerge(additionalPAIConfig, paiJobConfig, { arrayMerge: overwriteMerge })); const commands = nniJobConfig.taskRoles[taskRoleIndex].commands
} catch (error) { const nniTrialCommand = this.generateNNITrialCommand(trialJobDetail, commands.join(" && ").replace(/(["'$`\\])/g,'\\$1'));
this.log.error(`Error occurs during loading and merge ${this.paiTrialConfig.paiConfigPath} : ${error}`); nniJobConfig.taskRoles[taskRoleIndex].commands = [nniTrialCommand]
} }
} else { } else {
return yaml.safeDump(paiJobConfig); nniJobConfig = {
protocolVersion: 2,
name: jobName,
type: 'job',
jobRetryCount: 0,
prerequisites: [
{
type: 'dockerimage',
uri: this.paiTrialConfig.image,
name: 'docker_image_0'
}
],
taskRoles: {
taskrole: {
instances: 1,
completion: {
minFailedInstances: 1,
minSucceededInstances: -1
},
taskRetryCount: 0,
dockerImage: 'docker_image_0',
resourcePerInstance: {
gpu: this.paiTrialConfig.gpuNum,
cpu: this.paiTrialConfig.cpuNum,
memoryMB: this.paiTrialConfig.memoryMB
},
commands: [
this.generateNNITrialCommand(trialJobDetail, this.paiTrialConfig.command)
]
}
},
extras: {
'com.microsoft.pai.runtimeplugin': [
{
plugin: this.paiTrialConfig.paiStoragePlugin
}
],
submitFrom: 'submit-job-v2'
}
}
if (this.paiTrialConfig.virtualCluster) {
nniJobConfig.defaults = {
virtualCluster: this.paiTrialConfig.virtualCluster
}
}
} }
} return yaml.safeDump(nniJobConfig);
}
protected async submitTrialJobToPAI(trialJobId: string): Promise<boolean> { protected async submitTrialJobToPAI(trialJobId: string): Promise<boolean> {
const deferred: Deferred<boolean> = new Deferred<boolean>(); const deferred: Deferred<boolean> = new Deferred<boolean>();
...@@ -247,29 +280,8 @@ class PAIK8STrainingService extends PAITrainingService { ...@@ -247,29 +280,8 @@ class PAIK8STrainingService extends PAITrainingService {
//Copy codeDir files to local working folder //Copy codeDir files to local working folder
await execCopydir(this.paiTrialConfig.codeDir, trialJobDetail.logPath); await execCopydir(this.paiTrialConfig.codeDir, trialJobDetail.logPath);
//Generate Job Configuration in yaml format
const nniManagerIp: string = this.nniManagerIpConfig ? this.nniManagerIpConfig.nniManagerIp : getIPV4Address(); const paiJobConfig = this.generateJobConfigInYamlFormat(trialJobDetail);
const version: string = this.versionCheck ? await getVersion() : '';
const containerWorkingDir: string = `${this.paiTrialConfig.containerNFSMountPath}/${this.experimentId}/${trialJobId}`;
const nniPaiTrialCommand: string = String.Format(
PAI_K8S_TRIAL_COMMAND_FORMAT,
`${containerWorkingDir}`,
`${containerWorkingDir}/nnioutput`,
trialJobId,
this.experimentId,
trialJobDetail.form.sequenceId,
this.isMultiPhase,
this.paiTrialConfig.command,
nniManagerIp,
this.paiRestServerPort,
version,
this.logCollection
)
.replace(/\r\n|\n|\r/gm, '');
this.log.info(`nniPAItrial command is ${nniPaiTrialCommand.trim()}`);
const paiJobConfig = this.generateJobConfigInYamlFormat(trialJobId, nniPaiTrialCommand);
this.log.debug(paiJobConfig); this.log.debug(paiJobConfig);
// Step 3. Submit PAI job via Rest call // Step 3. Submit PAI job via Rest call
// Refer https://github.com/Microsoft/pai/blob/master/docs/rest-server/API.md for more detail about PAI Rest API // Refer https://github.com/Microsoft/pai/blob/master/docs/rest-server/API.md for more detail about PAI Rest API
......
...@@ -1332,11 +1332,6 @@ deepmerge@^2.1.1: ...@@ -1332,11 +1332,6 @@ deepmerge@^2.1.1:
version "2.2.1" version "2.2.1"
resolved "https://registry.yarnpkg.com/deepmerge/-/deepmerge-2.2.1.tgz#5d3ff22a01c00f645405a2fbc17d0778a1801170" resolved "https://registry.yarnpkg.com/deepmerge/-/deepmerge-2.2.1.tgz#5d3ff22a01c00f645405a2fbc17d0778a1801170"
deepmerge@^4.2.2:
version "4.2.2"
resolved "https://registry.yarnpkg.com/deepmerge/-/deepmerge-4.2.2.tgz#44d2ea3679b8f4d4ffba33f03d865fc1e7bf4955"
integrity sha512-FJ3UgI4gIl+PHZm53knsuSFpE+nESMr7M4v9QcgB7S63Kj/6WqMiFQJpBBYz1Pt+66bZpP3Q7Lye0Oo9MPKEdg==
default-require-extensions@^3.0.0: default-require-extensions@^3.0.0:
version "3.0.0" version "3.0.0"
resolved "https://registry.yarnpkg.com/default-require-extensions/-/default-require-extensions-3.0.0.tgz#e03f93aac9b2b6443fc52e5e4a37b3ad9ad8df96" resolved "https://registry.yarnpkg.com/default-require-extensions/-/default-require-extensions-3.0.0.tgz#e03f93aac9b2b6443fc52e5e4a37b3ad9ad8df96"
......
...@@ -287,7 +287,7 @@ pai_trial_schema = { ...@@ -287,7 +287,7 @@ pai_trial_schema = {
'codeDir': setPathCheck('codeDir'), 'codeDir': setPathCheck('codeDir'),
'nniManagerNFSMountPath': setPathCheck('nniManagerNFSMountPath'), 'nniManagerNFSMountPath': setPathCheck('nniManagerNFSMountPath'),
'containerNFSMountPath': setType('containerNFSMountPath', str), 'containerNFSMountPath': setType('containerNFSMountPath', str),
'command': setType('command', str), Optional('command'): setType('command', str),
Optional('gpuNum'): setNumberRange('gpuNum', int, 0, 99999), Optional('gpuNum'): setNumberRange('gpuNum', int, 0, 99999),
Optional('cpuNum'): setNumberRange('cpuNum', int, 0, 99999), Optional('cpuNum'): setNumberRange('cpuNum', int, 0, 99999),
Optional('memoryMB'): setType('memoryMB', int), Optional('memoryMB'): setType('memoryMB', int),
......
...@@ -266,35 +266,14 @@ def validate_pai_config_path(experiment_config): ...@@ -266,35 +266,14 @@ def validate_pai_config_path(experiment_config):
'''validate paiConfigPath field''' '''validate paiConfigPath field'''
if experiment_config.get('trainingServicePlatform') == 'pai': if experiment_config.get('trainingServicePlatform') == 'pai':
if experiment_config.get('trial', {}).get('paiConfigPath'): if experiment_config.get('trial', {}).get('paiConfigPath'):
# validate the file format of paiConfigPath, ensure it is yaml format # validate commands
pai_config = get_yml_content(experiment_config['trial']['paiConfigPath']) pai_config = get_yml_content(experiment_config['trial']['paiConfigPath'])
if experiment_config['trial'].get('image') is None: taskRoles_dict = pai_config.get('taskRoles')
if pai_config.get('prerequisites', [{}])[0].get('uri') is None: if not taskRoles_dict:
print_error('Please set image field, or set image uri in your own paiConfig!') print_error('Please set taskRoles in paiConfigPath config file!')
exit(1) exit(1)
experiment_config['trial']['image'] = pai_config['prerequisites'][0]['uri']
if experiment_config['trial'].get('gpuNum') is None:
if pai_config.get('taskRoles', {}).get('taskrole', {}).get('resourcePerInstance', {}).get('gpu') is None:
print_error('Please set gpuNum field, or set resourcePerInstance gpu in your own paiConfig!')
exit(1)
experiment_config['trial']['gpuNum'] = pai_config['taskRoles']['taskrole']['resourcePerInstance']['gpu']
if experiment_config['trial'].get('cpuNum') is None:
if pai_config.get('taskRoles', {}).get('taskrole', {}).get('resourcePerInstance', {}).get('cpu') is None:
print_error('Please set cpuNum field, or set resourcePerInstance cpu in your own paiConfig!')
exit(1)
experiment_config['trial']['cpuNum'] = pai_config['taskRoles']['taskrole']['resourcePerInstance']['cpu']
if experiment_config['trial'].get('memoryMB') is None:
if pai_config.get('taskRoles', {}).get('taskrole', {}).get('resourcePerInstance', {}).get('memoryMB', {}) is None:
print_error('Please set memoryMB field, or set resourcePerInstance memoryMB in your own paiConfig!')
exit(1)
experiment_config['trial']['memoryMB'] = pai_config['taskRoles']['taskrole']['resourcePerInstance']['memoryMB']
if experiment_config['trial'].get('paiStoragePlugin') is None:
if pai_config.get('extras', {}).get('com.microsoft.pai.runtimeplugin', [{}])[0].get('plugin') is None:
print_error('Please set paiStoragePlugin field, or set plugin in your own paiConfig!')
exit(1)
experiment_config['trial']['paiStoragePlugin'] = pai_config['extras']['com.microsoft.pai.runtimeplugin'][0]['plugin']
else: else:
pai_trial_fields_required_list = ['image', 'gpuNum', 'cpuNum', 'memoryMB', 'paiStoragePlugin'] pai_trial_fields_required_list = ['image', 'gpuNum', 'cpuNum', 'memoryMB', 'paiStoragePlugin', 'command']
for trial_field in pai_trial_fields_required_list: for trial_field in pai_trial_fields_required_list:
if experiment_config['trial'].get(trial_field) is None: if experiment_config['trial'].get(trial_field) is None:
print_error('Please set {0} in trial configuration,\ print_error('Please set {0} in trial configuration,\
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment