Unverified Commit 0b7d6260 authored by QuanluZhang's avatar QuanluZhang Committed by GitHub

support specifying gpus for tuner and advisor (#1556)

* support specifying gpu for tuner and advisor
parent 04d2d7cb
......@@ -35,7 +35,7 @@ tuner:
classArgs:
#choice: maximize, minimize
optimize_mode:
gpuNum:
gpuIndices:
trial:
command:
codeDir:
......@@ -71,14 +71,13 @@ tuner:
classArgs:
#choice: maximize, minimize
optimize_mode:
gpuNum:
gpuIndices:
assessor:
#choice: Medianstop
builtinAssessorName:
classArgs:
#choice: maximize, minimize
optimize_mode:
gpuNum:
trial:
command:
codeDir:
......@@ -113,14 +112,13 @@ tuner:
classArgs:
#choice: maximize, minimize
optimize_mode:
gpuNum:
gpuIndices:
assessor:
#choice: Medianstop
builtinAssessorName:
classArgs:
#choice: maximize, minimize
optimize_mode:
gpuNum:
trial:
command:
codeDir:
......@@ -245,11 +243,11 @@ machineList:
* __builtinTunerName__ and __classArgs__
* __builtinTunerName__
__builtinTunerName__ specifies the name of system tuner, NNI sdk provides four kinds of tuner, including {__TPE__, __Random__, __Anneal__, __Evolution__, __BatchTuner__, __GridSearch__}
__builtinTunerName__ specifies the name of a built-in tuner. The NNI SDK provides different tuners, introduced [here](../Tuner/BuiltinTuner.md).
* __classArgs__
__classArgs__ specifies the arguments of tuner algorithm. If the __builtinTunerName__ is in {__TPE__, __Random__, __Anneal__, __Evolution__}, user should set __optimize_mode__.
__classArgs__ specifies the arguments of the tuner algorithm. Please refer to [this file](../Tuner/BuiltinTuner.md) for the configurable arguments of each built-in tuner.
* __codeDir__, __classFileName__, __className__ and __classArgs__
* __codeDir__
......@@ -264,16 +262,16 @@ machineList:
__classArgs__ specifies the arguments of tuner algorithm.
* __gpuNum__
__gpuNum__ specifies the gpu number to run the tuner process. The value of this field should be a positive number. If the field is not set, NNI will not set `CUDA_VISIBLE_DEVICES` in script (that is, will not control the visibility of GPUs on trial command through `CUDA_VISIBLE_DEVICES`), and will not manage gpu resource.
* __gpuIndices__
Note: users could only specify one way to set tuner, for example, set {tunerName, optimizationMode} or {tunerCommand, tunerCwd}, and could not set them both.
__gpuIndices__ specifies the GPUs that can be used by the tuner process. Single or multiple GPU indices can be specified; multiple indices are separated by a comma (`,`), such as `1` or `0,1,3`. If the field is not set, `CUDA_VISIBLE_DEVICES` will be `''` in the script, that is, no GPU is visible to the tuner.
* __includeIntermediateResults__
If __includeIntermediateResults__ is true, the last intermediate result of the trial that is early stopped by assessor is sent to tuner as final result. The default value of __includeIntermediateResults__ is false.
Note: users could only use one way to specify tuner, either specifying `builtinTunerName` and `classArgs`, or specifying `codeDir`, `classFileName`, `className` and `classArgs`.
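The accepted `gpuIndices` format (a single index, or comma-separated indices) can be sketched as a validation check; this mirrors the `Or(int, And(str, lambda x: ...))` rule the schema changes in this commit introduce. The helper name `validate_gpu_indices` is illustrative, not from the NNI codebase:

```python
def validate_gpu_indices(value):
    """Accept a single GPU index (int) or a comma-separated string of indices."""
    if isinstance(value, int):
        return True
    if isinstance(value, str):
        try:
            # every comma-separated piece must parse as an int
            return len([int(i) for i in value.split(',')]) > 0
        except ValueError:
            return False
    return False
```

For example, `0` and `'0,1,3'` pass, while `''` or `'a,b'` fail.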
* __assessor__
* Description
......@@ -282,7 +280,7 @@ machineList:
* __builtinAssessorName__ and __classArgs__
* __builtinAssessorName__
__builtinAssessorName__ specifies the name of system assessor, NNI sdk provides one kind of assessor {__Medianstop__}
__builtinAssessorName__ specifies the name of a built-in assessor. The NNI SDK provides different assessors, introduced [here](../Assessor/BuiltinAssessor.md).
* __classArgs__
__classArgs__ specifies the arguments of assessor algorithm
......@@ -305,11 +303,39 @@ machineList:
__classArgs__ specifies the arguments of assessor algorithm.
* __gpuNum__
Note: users could only use one way to specify assessor, either specifying `builtinAssessorName` and `classArgs`, or specifying `codeDir`, `classFileName`, `className` and `classArgs`. If users do not want to use an assessor, the assessor field should be left empty.
* __advisor__
* Description
__gpuNum__ specifies the gpu number to run the assessor process. The value of this field should be a positive number.
__advisor__ specifies the advisor algorithm of the experiment. There are two ways to specify an advisor. One is to use an advisor provided by the NNI SDK, which requires setting __builtinAdvisorName__ and __classArgs__. The other is to use a user's own advisor file, which requires setting __codeDir__, __classFileName__, __className__ and __classArgs__.
* __builtinAdvisorName__ and __classArgs__
* __builtinAdvisorName__
Note: users' could only specify one way to set assessor, for example,set {assessorName, optimizationMode} or {assessorCommand, assessorCwd}, and users could not set them both.If users do not want to use assessor, assessor fileld should leave to empty.
__builtinAdvisorName__ specifies the name of a built-in advisor, NNI sdk provides [different advisors](../Tuner/BuiltinTuner.md).
* __classArgs__
__classArgs__ specifies the arguments of the advisor algorithm. Please refer to [this file](../Tuner/BuiltinTuner.md) for the configurable arguments of each built-in advisor.
* __codeDir__, __classFileName__, __className__ and __classArgs__
* __codeDir__
__codeDir__ specifies the directory of advisor code.
* __classFileName__
__classFileName__ specifies the name of advisor file.
* __className__
__className__ specifies the name of advisor class.
* __classArgs__
__classArgs__ specifies the arguments of the advisor algorithm.
* __gpuIndices__
__gpuIndices__ specifies the GPUs that can be used by the advisor process. Single or multiple GPU indices can be specified; multiple indices are separated by a comma (`,`), such as `1` or `0,1,3`. If the field is not set, `CUDA_VISIBLE_DEVICES` will be `''` in the script, that is, no GPU is visible to the advisor.
Note: users could only use one way to specify advisor, either specifying `builtinAdvisorName` and `classArgs`, or specifying `codeDir`, `classFileName`, `className` and `classArgs`.
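As the dispatcher changes in this commit show, `gpuIndices` takes effect by prefixing the dispatcher command with `CUDA_VISIBLE_DEVICES`. A minimal Python sketch of that logic follows (the actual implementation is the TypeScript change in `getMsgDispatcherCommand`; the function name here is illustrative):

```python
def with_gpu_visibility(command, gpu_indices=None):
    """Prefix a dispatcher command with CUDA_VISIBLE_DEVICES.

    When gpu_indices is unset, CUDA_VISIBLE_DEVICES is set to '' so that
    no GPU is visible to the tuner/advisor process.
    """
    if gpu_indices is not None:
        return f"CUDA_VISIBLE_DEVICES={gpu_indices} " + command
    return "CUDA_VISIBLE_DEVICES='' " + command
```

For example, `with_gpu_visibility('python3 -m nni', '0,1')` yields `CUDA_VISIBLE_DEVICES=0,1 python3 -m nni`.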
* __trial(local, remote)__
......@@ -560,7 +586,6 @@ machineList:
classArgs:
#choice: maximize, minimize
optimize_mode: maximize
gpuNum: 0
trial:
command: python3 mnist.py
codeDir: /nni/mnist
......@@ -586,14 +611,12 @@ machineList:
classArgs:
#choice: maximize, minimize
optimize_mode: maximize
gpuNum: 0
assessor:
#choice: Medianstop
builtinAssessorName: Medianstop
classArgs:
#choice: maximize, minimize
optimize_mode: maximize
gpuNum: 0
trial:
command: python3 mnist.py
codeDir: /nni/mnist
......@@ -620,7 +643,6 @@ machineList:
classArgs:
#choice: maximize, minimize
optimize_mode: maximize
gpuNum: 0
assessor:
codeDir: /nni/assessor
classFileName: myassessor.py
......@@ -628,7 +650,6 @@ machineList:
classArgs:
#choice: maximize, minimize
optimize_mode: maximize
gpuNum: 0
trial:
command: python3 mnist.py
codeDir: /nni/mnist
......@@ -656,7 +677,6 @@ machineList:
classArgs:
#choice: maximize, minimize
optimize_mode: maximize
gpuNum: 0
trial:
command: python3 mnist.py
codeDir: /nni/mnist
......@@ -780,7 +800,6 @@ machineList:
builtinAssessorName: Medianstop
classArgs:
optimize_mode: maximize
gpuNum: 0
trial:
codeDir: .
worker:
......
......@@ -15,6 +15,9 @@ tuner:
trials_per_update: 60
epochs_per_update: 12
minibatch_size: 10
#use GPU 0 for this tuner
#to specify multiple GPUs, list their indices separated by commas, e.g. three GPUs: gpuIndices: 0,1,2
gpuIndices: 0
trial:
command: sh ./macro_cifar10.sh
codeDir: ./
......
......@@ -219,6 +219,11 @@ function getMsgDispatcherCommand(tuner: any, assessor: any, advisor: any, multiP
if (advisor.classFileName !== undefined && advisor.classFileName.length > 1) {
command += ` --advisor_class_filename ${advisor.classFileName}`;
}
if (advisor.gpuIndices !== undefined) {
command = `CUDA_VISIBLE_DEVICES=${advisor.gpuIndices} ` + command;
} else {
command = `CUDA_VISIBLE_DEVICES='' ` + command;
}
} else {
command += ` --tuner_class_name ${tuner.className}`;
if (tuner.classArgs !== undefined) {
......@@ -243,6 +248,12 @@ function getMsgDispatcherCommand(tuner: any, assessor: any, advisor: any, multiP
command += ` --assessor_class_filename ${assessor.classFileName}`;
}
}
if (tuner.gpuIndices !== undefined) {
command = `CUDA_VISIBLE_DEVICES=${tuner.gpuIndices} ` + command;
} else {
command = `CUDA_VISIBLE_DEVICES='' ` + command;
}
}
return command;
......
......@@ -170,8 +170,8 @@ export namespace ValidationSchemas {
classFileName: joi.string(),
className: joi.string(),
classArgs: joi.any(),
gpuNum: joi.number().min(0),
checkpointDir: joi.string().allow('')
checkpointDir: joi.string().allow(''),
gpuIndices: joi.string()
}),
tuner: joi.object({
builtinTunerName: joi.string().valid('TPE', 'Random', 'Anneal', 'Evolution', 'SMAC', 'BatchTuner', 'GridSearch', 'NetworkMorphism', 'MetisTuner', 'GPTuner', 'PPOTuner'),
......@@ -179,9 +179,9 @@ export namespace ValidationSchemas {
classFileName: joi.string(),
className: joi.string(),
classArgs: joi.any(),
gpuNum: joi.number().min(0),
checkpointDir: joi.string().allow(''),
includeIntermediateResults: joi.boolean()
includeIntermediateResults: joi.boolean(),
gpuIndices: joi.string()
}),
assessor: joi.object({
builtinAssessorName: joi.string().valid('Medianstop', 'Curvefitting'),
......@@ -189,7 +189,6 @@ export namespace ValidationSchemas {
classFileName: joi.string(),
className: joi.string(),
classArgs: joi.any(),
gpuNum: joi.number().min(0),
checkpointDir: joi.string().allow('')
}),
clusterMetaData: joi.array().items(joi.object({
......
......@@ -23,7 +23,6 @@ ppo_tuner.py including:
"""
import os
os.environ["CUDA_VISIBLE_DEVICES"] = ""
import copy
import logging
import numpy as np
......
......@@ -14,14 +14,12 @@ tuner:
className: NaiveTuner
classArgs:
optimize_mode: maximize
gpuNum: 0
assessor:
codeDir: .
classFileName: naive_assessor.py
className: NaiveAssessor
classArgs:
optimize_mode: maximize
gpuNum: 0
trial:
command: python3 naive_trial.py
codeDir: .
......
......@@ -76,7 +76,7 @@ tuner_schema_dict = {
'optimize_mode': setChoice('optimize_mode', 'maximize', 'minimize'),
},
Optional('includeIntermediateResults'): setType('includeIntermediateResults', bool),
Optional('gpuNum'): setNumberRange('gpuNum', int, 0, 99999),
Optional('gpuIndices'): Or(int, And(str, lambda x: len([int(i) for i in x.split(',')]) > 0), error='gpuIndex format error!'),
},
('Evolution'): {
'builtinTunerName': setChoice('builtinTunerName', 'Evolution'),
......@@ -85,12 +85,12 @@ tuner_schema_dict = {
Optional('population_size'): setNumberRange('population_size', int, 0, 99999),
},
Optional('includeIntermediateResults'): setType('includeIntermediateResults', bool),
Optional('gpuNum'): setNumberRange('gpuNum', int, 0, 99999),
Optional('gpuIndices'): Or(int, And(str, lambda x: len([int(i) for i in x.split(',')]) > 0), error='gpuIndex format error!'),
},
('BatchTuner', 'GridSearch', 'Random'): {
'builtinTunerName': setChoice('builtinTunerName', 'BatchTuner', 'GridSearch', 'Random'),
Optional('includeIntermediateResults'): setType('includeIntermediateResults', bool),
Optional('gpuNum'): setNumberRange('gpuNum', int, 0, 99999),
Optional('gpuIndices'): Or(int, And(str, lambda x: len([int(i) for i in x.split(',')]) > 0), error='gpuIndex format error!'),
},
'TPE': {
'builtinTunerName': 'TPE',
......@@ -100,7 +100,7 @@ tuner_schema_dict = {
Optional('constant_liar_type'): setChoice('constant_liar_type', 'min', 'max', 'mean')
},
Optional('includeIntermediateResults'): setType('includeIntermediateResults', bool),
Optional('gpuNum'): setNumberRange('gpuNum', int, 0, 99999),
Optional('gpuIndices'): Or(int, And(str, lambda x: len([int(i) for i in x.split(',')]) > 0), error='gpuIndex format error!'),
},
'NetworkMorphism': {
'builtinTunerName': 'NetworkMorphism',
......@@ -112,7 +112,7 @@ tuner_schema_dict = {
Optional('n_output_node'): setType('n_output_node', int),
},
Optional('includeIntermediateResults'): setType('includeIntermediateResults', bool),
Optional('gpuNum'): setNumberRange('gpuNum', int, 0, 99999),
Optional('gpuIndices'): Or(int, And(str, lambda x: len([int(i) for i in x.split(',')]) > 0), error='gpuIndex format error!'),
},
'MetisTuner': {
'builtinTunerName': 'MetisTuner',
......@@ -124,7 +124,7 @@ tuner_schema_dict = {
Optional('cold_start_num'): setType('cold_start_num', int),
},
Optional('includeIntermediateResults'): setType('includeIntermediateResults', bool),
Optional('gpuNum'): setNumberRange('gpuNum', int, 0, 99999),
Optional('gpuIndices'): Or(int, And(str, lambda x: len([int(i) for i in x.split(',')]) > 0), error='gpuIndex format error!'),
},
'GPTuner': {
'builtinTunerName': 'GPTuner',
......@@ -140,7 +140,7 @@ tuner_schema_dict = {
Optional('selection_num_starting_points'): setType('selection_num_starting_points', int),
},
Optional('includeIntermediateResults'): setType('includeIntermediateResults', bool),
Optional('gpuNum'): setNumberRange('gpuNum', int, 0, 99999),
Optional('gpuIndices'): Or(int, And(str, lambda x: len([int(i) for i in x.split(',')]) > 0), error='gpuIndex format error!'),
},
'PPOTuner': {
'builtinTunerName': 'PPOTuner',
......@@ -158,7 +158,7 @@ tuner_schema_dict = {
Optional('cliprange'): setType('cliprange', float),
},
Optional('includeIntermediateResults'): setType('includeIntermediateResults', bool),
Optional('gpuNum'): setNumberRange('gpuNum', int, 0, 99999),
Optional('gpuIndices'): Or(int, And(str, lambda x: len([int(i) for i in x.split(',')]) > 0), error='gpuIndex format error!'),
},
'customized': {
'codeDir': setPathCheck('codeDir'),
......@@ -166,7 +166,7 @@ tuner_schema_dict = {
'className': setType('className', str),
Optional('classArgs'): dict,
Optional('includeIntermediateResults'): setType('includeIntermediateResults', bool),
Optional('gpuNum'): setNumberRange('gpuNum', int, 0, 99999),
Optional('gpuIndices'): Or(int, And(str, lambda x: len([int(i) for i in x.split(',')]) > 0), error='gpuIndex format error!'),
}
}
......@@ -178,7 +178,7 @@ advisor_schema_dict = {
Optional('R'): setType('R', int),
Optional('eta'): setType('eta', int)
},
Optional('gpuNum'): setNumberRange('gpuNum', int, 0, 99999),
Optional('gpuIndices'): Or(int, And(str, lambda x: len([int(i) for i in x.split(',')]) > 0), error='gpuIndex format error!'),
},
'BOHB':{
'builtinAdvisorName': Or('BOHB'),
......@@ -194,14 +194,14 @@ advisor_schema_dict = {
Optional('bandwidth_factor'): setNumberRange('bandwidth_factor', float, 0, 9999),
Optional('min_bandwidth'): setNumberRange('min_bandwidth', float, 0, 9999),
},
Optional('gpuNum'): setNumberRange('gpuNum', int, 0, 99999),
Optional('gpuIndices'): Or(int, And(str, lambda x: len([int(i) for i in x.split(',')]) > 0), error='gpuIndex format error!'),
},
'customized':{
'codeDir': setPathCheck('codeDir'),
'classFileName': setType('classFileName', str),
'className': setType('className', str),
Optional('classArgs'): dict,
Optional('gpuNum'): setNumberRange('gpuNum', int, 0, 99999),
Optional('gpuIndices'): Or(int, And(str, lambda x: len([int(i) for i in x.split(',')]) > 0), error='gpuIndex format error!'),
}
}
......@@ -212,7 +212,6 @@ assessor_schema_dict = {
Optional('optimize_mode'): setChoice('optimize_mode', 'maximize', 'minimize'),
Optional('start_step'): setNumberRange('start_step', int, 0, 9999),
},
Optional('gpuNum'): setNumberRange('gpuNum', int, 0, 99999),
},
'Curvefitting': {
'builtinAssessorName': 'Curvefitting',
......@@ -223,14 +222,12 @@ assessor_schema_dict = {
Optional('threshold'): setNumberRange('threshold', float, 0, 9999),
Optional('gap'): setNumberRange('gap', int, 1, 9999),
},
Optional('gpuNum'): setNumberRange('gpuNum', int, 0, 99999),
},
'customized': {
'codeDir': setPathCheck('codeDir'),
'classFileName': setType('classFileName', str),
'className': setType('className', str),
Optional('classArgs'): dict,
Optional('gpuNum'): setNumberRange('gpuNum', int, 0, 99999)
}
}
......
......@@ -296,10 +296,20 @@ def set_experiment(experiment_config, mode, port, config_file_name):
request_data['multiThread'] = experiment_config.get('multiThread')
if experiment_config.get('advisor'):
request_data['advisor'] = experiment_config['advisor']
if request_data['advisor'].get('gpuNum'):
print_error('gpuNum is deprecated, please use gpuIndices instead.')
if request_data['advisor'].get('gpuIndices') and isinstance(request_data['advisor'].get('gpuIndices'), int):
request_data['advisor']['gpuIndices'] = str(request_data['advisor'].get('gpuIndices'))
else:
request_data['tuner'] = experiment_config['tuner']
if request_data['tuner'].get('gpuNum'):
print_error('gpuNum is deprecated, please use gpuIndices instead.')
if request_data['tuner'].get('gpuIndices') and isinstance(request_data['tuner'].get('gpuIndices'), int):
request_data['tuner']['gpuIndices'] = str(request_data['tuner'].get('gpuIndices'))
if 'assessor' in experiment_config:
request_data['assessor'] = experiment_config['assessor']
if request_data['assessor'].get('gpuNum'):
print_error('gpuNum is deprecated, please remove it from your config file.')
#debug mode should disable version check
if experiment_config.get('debug') is not None:
request_data['versionCheck'] = not experiment_config.get('debug')
......
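The `set_experiment` changes above warn on the deprecated `gpuNum` and coerce an integer `gpuIndices` to a string before the config is sent to the REST server. A condensed, hypothetical sketch of that normalization (note: this sketch checks `is not None` rather than truthiness, so a GPU index of `0` is still converted, which the truthiness check in the diff would skip):

```python
def normalize_gpu_indices(section):
    """Warn on deprecated gpuNum and coerce an int gpuIndices to str.

    `section` is a tuner/advisor dict from the experiment config.
    """
    if section.get('gpuNum') is not None:
        print('gpuNum is deprecated, please use gpuIndices instead.')
    gpu_indices = section.get('gpuIndices')
    if gpu_indices is not None and isinstance(gpu_indices, int):
        # the REST schema expects gpuIndices as a string
        section['gpuIndices'] = str(gpu_indices)
    return section
```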