Merge pai config (#1965)

26aa1136 · SparkSnail · GitHub · 8e953fce · 26aa1136 · 26aa1136
Unverified Commit 26aa1136 authored Feb 07, 2020 by SparkSnail Committed by GitHub Feb 07, 2020
8 changed files
--- a/docs/en_US/TrainingService/PaiMode.md
+++ b/docs/en_US/TrainingService/PaiMode.md
@@ -48,12 +48,12 @@ Note: You should set `trainingServicePlatform: pai` in NNI config YAML file if y

 Compared with [LocalMode](LocalMode.md) and [RemoteMachineMode](RemoteMachineMode.md), trial configuration in pai mode have these additional keys:
 * cpuNum
-    * Required key. Should be positive number based on your trial program's CPU  requirement
+    * Optional key. Should be a positive number based on your trial program's CPU requirement. If it is not set in the trial configuration, it must be set in the config file specified by the `paiConfigPath` field.
 * memoryMB
-    * Required key. Should be positive number based on your trial program's memory requirement
+    * Optional key. Should be a positive number based on your trial program's memory requirement. If it is not set in the trial configuration, it must be set in the config file specified by the `paiConfigPath` field.
 * image
-    * Required key. In pai mode, your trial program will be scheduled by OpenPAI to run in [Docker container](https://www.docker.com/). This key is used to specify the Docker image used to create the container in which your trial will run.
-    * We already build a docker image [nnimsra/nni](https://hub.docker.com/r/msranni/nni/) on [Docker Hub](https://hub.docker.com/). It contains NNI python packages, Node modules and javascript artifact files required to start experiment, and all of NNI dependencies. The docker file used to build this image can be found at [here](https://github.com/Microsoft/nni/tree/master/deployment/docker/Dockerfile). You can either use this image directly in your config file, or build your own image based on it.
+    * Optional key. In pai mode, your trial program will be scheduled by OpenPAI to run in [Docker container](https://www.docker.com/). This key is used to specify the Docker image used to create the container in which your trial will run.
+    * We have already built a Docker image [msranni/nni](https://hub.docker.com/r/msranni/nni/) on [Docker Hub](https://hub.docker.com/). It contains the NNI Python packages, Node modules and JavaScript artifact files required to start an experiment, and all of NNI's dependencies. The Dockerfile used to build this image can be found [here](https://github.com/Microsoft/nni/tree/master/deployment/docker/Dockerfile). You can either use this image directly in your config file, or build your own image based on it. If `image` is not set in the trial configuration, it must be set in the config file specified by the `paiConfigPath` field.
 * virtualCluster
    * Optional key. Set the virtualCluster of OpenPAI. If omitted, the job will run on default virtual cluster.
 * nniManagerNFSMountPath
@@ -61,7 +61,9 @@ Compared with [LocalMode](LocalMode.md) and [RemoteMachineMode](RemoteMachineMod
 * containerNFSMountPath
    * Required key. Set the mount path in your container used in PAI.
 * paiStoragePlugin
-    * Required key. Set the storage plugin name used in PAI.
+    * Optional key. Set the storage plugin name used in PAI. If it is not set in the trial configuration, it must be set in the config file specified by the `paiConfigPath` field.
+* paiConfigPath
+    * Optional key. Set the path of the PAI job configuration file, which must be in YAML format.


 Once complete to fill NNI experiment config file and save (for example, save as exp_pai.yml), then run the following command

--- a/src/nni_manager/package.json
+++ b/src/nni_manager/package.json
@@ -13,6 +13,7 @@
    "azure-storage": "^2.10.2",
    "chai-as-promised": "^7.1.1",
    "child-process-promise": "^2.2.1",
+    "deepmerge": "^4.2.2",
    "express": "^4.16.3",
    "express-joi-validator": "^2.0.0",
    "js-base64": "^2.4.9",

--- a/src/nni_manager/rest_server/restValidationSchemas.ts
+++ b/src/nni_manager/rest_server/restValidationSchemas.ts
@@ -38,6 +38,7 @@ export namespace ValidationSchemas {
                authFile: joi.string(),
                nniManagerNFSMountPath: joi.string().min(1),
                containerNFSMountPath: joi.string().min(1),
+                paiConfigPath: joi.string(),
                paiStoragePlugin: joi.string().min(1),
                nasMode: joi.string().valid('classic_mode', 'enas_mode', 'oneshot_mode', 'darts_mode'),
                portList: joi.array().items(joi.object({

--- a/src/nni_manager/training_service/pai/paiK8S/paiK8SConfig.ts
+++ b/src/nni_manager/training_service/pai/paiK8S/paiK8SConfig.ts
@@ -31,10 +31,11 @@ export class NNIPAIK8STrialConfig extends TrialConfig {
    public readonly nniManagerNFSMountPath: string;
    public readonly containerNFSMountPath: string;
    public readonly paiStoragePlugin: string;
+    public readonly paiConfigPath?: string;

    constructor(command: string, codeDir: string, gpuNum: number, cpuNum: number, memoryMB: number,
                image: string, nniManagerNFSMountPath: string, containerNFSMountPath: string,
-                paiStoragePlugin: string, virtualCluster?: string) {
+                paiStoragePlugin: string, virtualCluster?: string, paiConfigPath?: string) {
        super(command, codeDir, gpuNum);
        this.cpuNum = cpuNum;
        this.memoryMB = memoryMB;
@@ -43,5 +44,6 @@ export class NNIPAIK8STrialConfig extends TrialConfig {
        this.nniManagerNFSMountPath = nniManagerNFSMountPath;
        this.containerNFSMountPath = containerNFSMountPath;
        this.paiStoragePlugin = paiStoragePlugin;
+        this.paiConfigPath = paiConfigPath;
    }
 }
--- a/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts
+++ b/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts
@@ -44,6 +44,7 @@ import { PAIClusterConfig, PAITrialJobDetail } from '../paiConfig';
 import { PAIJobRestServer } from '../paiJobRestServer';

 const yaml = require('js-yaml');
+const deepmerge = require('deepmerge');

 /**
 * Training Service implementation for OpenPAI (Open Platform for AI)
@@ -189,7 +190,19 @@ class PAIK8STrainingService extends PAITrainingService {
            }
        }

-        return yaml.safeDump(paiJobConfig);
+        if (this.paiTrialConfig.paiConfigPath) {
+            try {
+                const additionalPAIConfig = yaml.safeLoad(fs.readFileSync(this.paiTrialConfig.paiConfigPath, 'utf8'));
+                //deepmerge(x, y), if an element at the same key is present for both x and y, the value from y will appear in the result.
+                //refer: https://github.com/TehShrike/deepmerge
+                const overwriteMerge = (destinationArray: any, sourceArray: any, options: any) => sourceArray;
+                return yaml.safeDump(deepmerge(additionalPAIConfig, paiJobConfig, { arrayMerge: overwriteMerge }));
+            } catch (error) {
+                this.log.error(`Error occurs during loading and merge ${this.paiTrialConfig.paiConfigPath} : ${error}`);
+            }
+        } else {
+            return yaml.safeDump(paiJobConfig);
+        }
      }

    protected async submitTrialJobToPAI(trialJobId: string): Promise<boolean> {
@@ -258,7 +271,7 @@ class PAIK8STrainingService extends PAITrainingService {
        this.log.info(`nniPAItrial command is ${nniPaiTrialCommand.trim()}`);
        
        const paiJobConfig = this.generateJobConfigInYamlFormat(trialJobId, nniPaiTrialCommand);
-
+        this.log.debug(paiJobConfig);
        // Step 3. Submit PAI job via Rest call
        // Refer https://github.com/Microsoft/pai/blob/master/docs/rest-server/API.md for more detail about PAI Rest API
        const submitJobRequest: request.Options = {

--- a/src/nni_manager/yarn.lock
+++ b/src/nni_manager/yarn.lock
@@ -1112,6 +1112,11 @@ deepmerge@^2.1.1:
  version "2.2.1"
  resolved "https://registry.yarnpkg.com/deepmerge/-/deepmerge-2.2.1.tgz#5d3ff22a01c00f645405a2fbc17d0778a1801170"

+deepmerge@^4.2.2:
+  version "4.2.2"
+  resolved "https://registry.yarnpkg.com/deepmerge/-/deepmerge-4.2.2.tgz#44d2ea3679b8f4d4ffba33f03d865fc1e7bf4955"
+  integrity sha512-FJ3UgI4gIl+PHZm53knsuSFpE+nESMr7M4v9QcgB7S63Kj/6WqMiFQJpBBYz1Pt+66bZpP3Q7Lye0Oo9MPKEdg==
+
 default-require-extensions@^2.0.0:
  version "2.0.0"
  resolved "https://registry.yarnpkg.com/default-require-extensions/-/default-require-extensions-2.0.0.tgz#f5f8fbb18a7d6d50b21f641f649ebb522cfe24f7"

--- a/tools/nni_cmd/config_schema.py
+++ b/tools/nni_cmd/config_schema.py
@@ -271,16 +271,17 @@ pai_yarn_config_schema = {

 pai_trial_schema = {
    'trial':{
-        'command': setType('command', str),
        'codeDir': setPathCheck('codeDir'),
-        'gpuNum': setNumberRange('gpuNum', int, 0, 99999),
-        'cpuNum': setNumberRange('cpuNum', int, 0, 99999),
-        'memoryMB': setType('memoryMB', int),
-        'image': setType('image', str),
-        Optional('virtualCluster'): setType('virtualCluster', str),
        'nniManagerNFSMountPath': setPathCheck('nniManagerNFSMountPath'),
        'containerNFSMountPath': setType('containerNFSMountPath', str),
-        'paiStoragePlugin': setType('paiStoragePlugin', str)
+        'command': setType('command', str),
+        Optional('gpuNum'): setNumberRange('gpuNum', int, 0, 99999),
+        Optional('cpuNum'): setNumberRange('cpuNum', int, 0, 99999),
+        Optional('memoryMB'): setType('memoryMB', int),
+        Optional('image'): setType('image', str),
+        Optional('virtualCluster'): setType('virtualCluster', str),
+        Optional('paiStoragePlugin'): setType('paiStoragePlugin', str),
+        Optional('paiConfigPath'): And(os.path.exists, error=SCHEMA_PATH_ERROR % 'paiConfigPath')
    }
 }


--- a/tools/nni_cmd/launcher_utils.py
+++ b/tools/nni_cmd/launcher_utils.py
@@ -7,7 +7,7 @@ from schema import SchemaError
 from schema import Schema
 from .config_schema import LOCAL_CONFIG_SCHEMA, REMOTE_CONFIG_SCHEMA, PAI_CONFIG_SCHEMA, PAI_YARN_CONFIG_SCHEMA, KUBEFLOW_CONFIG_SCHEMA,\
                           FRAMEWORKCONTROLLER_CONFIG_SCHEMA, tuner_schema_dict, advisor_schema_dict, assessor_schema_dict
-from .common_utils import print_error, print_warning, print_normal
+from .common_utils import print_error, print_warning, print_normal, get_yml_content

 def expand_path(experiment_config, key):
    '''Change '~' to user home directory'''
@@ -63,6 +63,8 @@ def parse_path(experiment_config, config_path):
    if experiment_config.get('machineList'):
        for index in range(len(experiment_config['machineList'])):
            expand_path(experiment_config['machineList'][index], 'sshKeyPath')
+    if experiment_config['trial'].get('paiConfigPath'):
+        expand_path(experiment_config['trial'], 'paiConfigPath')

    #if users use relative path, convert it to absolute path
    root_path = os.path.dirname(config_path)
@@ -94,6 +96,8 @@ def parse_path(experiment_config, config_path):
    if experiment_config.get('machineList'):
        for index in range(len(experiment_config['machineList'])):
            parse_relative_path(root_path, experiment_config['machineList'][index], 'sshKeyPath')
+    if experiment_config['trial'].get('paiConfigPath'):
+        parse_relative_path(root_path, experiment_config['trial'], 'paiConfigPath')

 def validate_search_space_content(experiment_config):
    '''Validate searchspace content,
@@ -254,6 +258,45 @@ def validate_machine_list(experiment_config):
        print_error('Please set machineList!')
        exit(1)

+def validate_pai_config_path(experiment_config):
+    '''validate paiConfigPath field'''
+    if experiment_config.get('trainingServicePlatform') == 'pai':
+        if experiment_config.get('trial', {}).get('paiConfigPath'):
+            # validate the file format of paiConfigPath, ensure it is yaml format
+            pai_config = get_yml_content(experiment_config['trial']['paiConfigPath'])
+            if experiment_config['trial'].get('image') is None:
+                if pai_config.get('prerequisites', [{}])[0].get('uri') is None:
+                    print_error('Please set image field, or set image uri in your own paiConfig!')
+                    exit(1)
+                experiment_config['trial']['image'] = pai_config['prerequisites'][0]['uri']
+            if experiment_config['trial'].get('gpuNum') is None:
+                if pai_config.get('taskRoles', {}).get('taskrole', {}).get('resourcePerInstance', {}).get('gpu') is None:
+                    print_error('Please set gpuNum field, or set resourcePerInstance gpu in your own paiConfig!')
+                    exit(1)
+                experiment_config['trial']['gpuNum'] = pai_config['taskRoles']['taskrole']['resourcePerInstance']['gpu']
+            if experiment_config['trial'].get('cpuNum') is None:
+                if pai_config.get('taskRoles', {}).get('taskrole', {}).get('resourcePerInstance', {}).get('cpu') is None:
+                    print_error('Please set cpuNum field, or set resourcePerInstance cpu in your own paiConfig!')
+                    exit(1)
+                experiment_config['trial']['cpuNum'] = pai_config['taskRoles']['taskrole']['resourcePerInstance']['cpu']
+            if experiment_config['trial'].get('memoryMB') is None:
+                if pai_config.get('taskRoles', {}).get('taskrole', {}).get('resourcePerInstance', {}).get('memoryMB', {}) is None:
+                    print_error('Please set memoryMB field, or set resourcePerInstance memoryMB in your own paiConfig!')
+                    exit(1)
+                experiment_config['trial']['memoryMB'] = pai_config['taskRoles']['taskrole']['resourcePerInstance']['memoryMB']
+            if experiment_config['trial'].get('paiStoragePlugin') is None:
+                if pai_config.get('extras', {}).get('com.microsoft.pai.runtimeplugin', [{}])[0].get('plugin') is None:
+                    print_error('Please set paiStoragePlugin field, or set plugin in your own paiConfig!')
+                    exit(1)
+                experiment_config['trial']['paiStoragePlugin'] = pai_config['extras']['com.microsoft.pai.runtimeplugin'][0]['plugin']
+        else:
+            pai_trial_fields_required_list = ['image', 'gpuNum', 'cpuNum', 'memoryMB', 'paiStoragePlugin']
+            for trial_field in pai_trial_fields_required_list:
+                if experiment_config['trial'].get(trial_field) is None:
+                    print_error('Please set {0} in trial configuration,\
+                                or set additional pai configuration file path in paiConfigPath!'.format(trial_field))
+                    exit(1)
+
 def validate_pai_trial_conifg(experiment_config):
    '''validate the trial config in pai platform'''
    if experiment_config.get('trainingServicePlatform') in ['pai', 'paiYarn']:
@@ -269,6 +312,7 @@ def validate_pai_trial_conifg(experiment_config):
            print_warning(warning_information.format('dataDir'))
        if experiment_config.get('trial').get('outputDir'):
            print_warning(warning_information.format('outputDir'))
+        validate_pai_config_path(experiment_config)

 def validate_all_content(experiment_config, config_path):
    '''Validate whether experiment_config is valid'''