Reusable environment: support GPU scheduler, add test cases, and refactor. (#2627)

143c6615 · Chi Song · GitHub · 8a20c348 · 143c6615 · 143c6615
Unverified commit 143c6615, authored Jul 30, 2020 by Chi Song; committed by GitHub on Jul 30, 2020
10 changed files
--- a/src/nni_manager/training_service/reusable/trial.ts
+++ b/src/nni_manager/training_service/reusable/trial.ts
@@ -25,7 +25,7 @@ export class TrialDetail implements TrialJobDetail {
    // it's used to aggregate node status for multiple node trial
    public nodes: Map<string, NodeInfomation>;
    // assigned GPUs for multi-trial scheduled.
-    public assignedGpus: GPUInfo[] = [];
+    public assignedGpus: GPUInfo[] | undefined;

    public readonly TRIAL_METADATA_DIR = ".nni";


--- a/src/nni_manager/training_service/reusable/trialDispatcher.ts
+++ b/src/nni_manager/training_service/reusable/trialDispatcher.ts
--- a/src/nni_manager/yarn.lock
+++ b/src/nni_manager/yarn.lock
@@ -262,6 +262,10 @@
  version "2.3.1"
  resolved "https://registry.yarnpkg.com/@types/js-base64/-/js-base64-2.3.1.tgz#c39f14f129408a3d96a1105a650d8b2b6eeb4168"

+"@types/js-yaml@^3.12.5":
+  version "3.12.5"
+  resolved "https://registry.yarnpkg.com/@types/js-yaml/-/js-yaml-3.12.5.tgz#136d5e6a57a931e1cce6f9d8126aa98a9c92a6bb"
+
 "@types/json-schema@^7.0.3":
  version "7.0.3"
  resolved "https://registry.yarnpkg.com/@types/json-schema/-/json-schema-7.0.3.tgz#bdfd69d61e464dcc81b25159c270d75a73c1a636"
@@ -277,7 +281,6 @@
 "@types/minipass@*":
  version "2.2.0"
  resolved "https://registry.yarnpkg.com/@types/minipass/-/minipass-2.2.0.tgz#51ad404e8eb1fa961f75ec61205796807b6f9651"
-  integrity sha512-wuzZksN4w4kyfoOv/dlpov4NOunwutLA/q7uc00xU02ZyUY+aoM5PWIXEKBMnm0NHd4a+N71BMjq+x7+2Af1fg==
  dependencies:
    "@types/node" "*"

@@ -430,7 +433,6 @@
 "@types/tar@^4.0.3":
  version "4.0.3"
  resolved "https://registry.yarnpkg.com/@types/tar/-/tar-4.0.3.tgz#e2cce0b8ff4f285293243f5971bd7199176ac489"
-  integrity sha512-Z7AVMMlkI8NTWF0qGhC4QIX0zkV/+y0J8x7b/RsHrN0310+YNjoJd8UrApCiGBCWtKjxS9QhNqLi2UJNToh5hA==
  dependencies:
    "@types/minipass" "*"
    "@types/node" "*"
@@ -1017,7 +1019,6 @@ chownr@^1.1.2, chownr@^1.1.3:
 chownr@^2.0.0:
  version "2.0.0"
  resolved "https://registry.yarnpkg.com/chownr/-/chownr-2.0.0.tgz#15bfbe53d2eab4cf70f18a8cd68ebe5b3cb1dece"
-  integrity sha512-bIomtDF5KGpdogkLd9VspvFzk9KfpyyGlS8YFVZl7TGPBHL5snIOnxeshwVgPteQ9b4Eydl+pVbIyE1DcvCWgQ==

 ci-info@^1.5.0:
  version "1.6.0"
@@ -1912,7 +1913,6 @@ fs-minipass@^1.2.5:
 fs-minipass@^2.0.0:
  version "2.1.0"
  resolved "https://registry.yarnpkg.com/fs-minipass/-/fs-minipass-2.1.0.tgz#7f5036fdbf12c63c169190cbe4199c852271f9fb"
-  integrity sha512-V/JgOLFCS+R6Vcq0slCuaeWEdNC3ouDlJMNIsacH2VtALiu9mV4LPrHc5cDl8k5aw6J8jwgWWpiTo5RYhmIzvg==
  dependencies:
    minipass "^3.0.0"

@@ -2331,7 +2331,6 @@ ignore@^4.0.6:
 ignore@^5.1.4:
  version "5.1.4"
  resolved "https://registry.yarnpkg.com/ignore/-/ignore-5.1.4.tgz#84b7b3dbe64552b6ef0eca99f6743dbec6d97adf"
-  integrity sha512-MzbUSahkTW1u7JpKKjY7LCARd1fU5W2rLdxlM4kdkayuCwZImjkpluF9CM1aLewYJguPDqewLam18Y6AU69A8A==

 import-fresh@^3.0.0:
  version "3.2.1"
@@ -2650,7 +2649,6 @@ istanbul-lib-source-maps@^4.0.0:
 istanbul-reports@^3.0.2:
  version "3.0.2"
  resolved "https://registry.yarnpkg.com/istanbul-reports/-/istanbul-reports-3.0.2.tgz#d593210e5000683750cb09fc0644e4b6e27fd53b"
-  integrity sha512-9tZvz7AiR3PEDNGiV9vIouQ/EAcqMXFmkcA1CDFTwOB98OZVDL0PH9glHotf5Ugp6GCOTypfzGWI/OqjWNCRUw==
  dependencies:
    html-escaper "^2.0.0"
    istanbul-lib-report "^3.0.0"
@@ -3193,7 +3191,6 @@ minipass@^2.3.5, minipass@^2.8.6, minipass@^2.9.0:
 minipass@^3.0.0:
  version "3.1.3"
  resolved "https://registry.yarnpkg.com/minipass/-/minipass-3.1.3.tgz#7d42ff1f39635482e15f9cdb53184deebd5815fd"
-  integrity sha512-Mgd2GdMVzY+x3IJ+oHnVM+KG3lA5c8tnabyJKmHSaG2kAGpudxuOf8ToDkhumF7UzME7DecbQE9uOZhNm7PuJg==
  dependencies:
    yallist "^4.0.0"

@@ -3212,7 +3209,6 @@ minizlib@^1.2.1:
 minizlib@^2.1.0:
  version "2.1.0"
  resolved "https://registry.yarnpkg.com/minizlib/-/minizlib-2.1.0.tgz#fd52c645301ef09a63a2c209697c294c6ce02cf3"
-  integrity sha512-EzTZN/fjSvifSX0SlqUERCN39o6T40AMarPbv0MrarSFtIITCBh7bi+dU8nxGFHuqs9jdIAeoYoKuQAAASsPPA==
  dependencies:
    minipass "^3.0.0"
    yallist "^4.0.0"
@@ -3249,7 +3245,6 @@ mkdirp@^0.5.1:
 mkdirp@^1.0.3:
  version "1.0.4"
  resolved "https://registry.yarnpkg.com/mkdirp/-/mkdirp-1.0.4.tgz#3eb5ed62622756d79a5f0e2a221dfebad75c2f7e"
-  integrity sha512-vVqVZQyf3WLx2Shd0qJ9xuvqgAyKPLAiqITEtqW0oIUjzo3PePDd6fW9iFz30ef7Ysp/oiWqbhszeGWW2T6Gzw==

 mocha@^7.1.1:
  version "7.1.1"
@@ -3707,7 +3702,6 @@ number-is-nan@^1.0.0:
 nyc@^15.0.0:
  version "15.0.1"
  resolved "https://registry.yarnpkg.com/nyc/-/nyc-15.0.1.tgz#bd4d5c2b17f2ec04370365a5ca1fc0ed26f9f93d"
-  integrity sha512-n0MBXYBYRqa67IVt62qW1r/d9UH/Qtr7SF1w/nQLJ9KxvWF6b2xCHImRAixHN9tnMMYHC2P14uo6KddNGwMgGg==
  dependencies:
    "@istanbuljs/load-nyc-config" "^1.0.0"
    "@istanbuljs/schema" "^0.1.2"
@@ -5065,7 +5059,6 @@ tar@^4.4.10, tar@^4.4.12, tar@^4.4.13:
 tar@^6.0.2:
  version "6.0.2"
  resolved "https://registry.yarnpkg.com/tar/-/tar-6.0.2.tgz#5df17813468a6264ff14f766886c622b84ae2f39"
-  integrity sha512-Glo3jkRtPcvpDlAs/0+hozav78yoXKFr+c4wgw62NNMO3oo4AaJdCo21Uu7lcwr55h39W2XD1LMERc64wtbItg==
  dependencies:
    chownr "^2.0.0"
    fs-minipass "^2.0.0"
@@ -5541,7 +5534,6 @@ yallist@^3.0.2, yallist@^3.0.3:
 yallist@^4.0.0:
  version "4.0.0"
  resolved "https://registry.yarnpkg.com/yallist/-/yallist-4.0.0.tgz#9bb92790d9c0effec63be73519e11a35019a3a72"
-  integrity sha512-3wdGidZyq5PB084XLES5TpOSRA3wjXAlIWMhum2kRcv/41Sn2emQ0dycQW4uZXLejwKvg6EsvbdlVL+FYEct7A==

 yargs-parser@13.1.2, yargs-parser@^13.1.2:
  version "13.1.2"

--- a/test/nni_test/nnitest/generate_ts_config.py
+++ b/test/nni_test/nnitest/generate_ts_config.py
@@ -35,6 +35,8 @@ def update_training_service_config(args):
            config[args.ts]['paiConfig']['host'] = args.pai_host
        if args.pai_token is not None:
            config[args.ts]['paiConfig']['token'] = args.pai_token
+        if args.pai_reuse is not None:
+            config[args.ts]['paiConfig']['reuse'] = args.pai_reuse.lower() == 'true'
        if args.nni_docker_image is not None:
            config[args.ts]['trial']['image'] = args.nni_docker_image
        if args.nni_manager_nfs_mount_path is not None:
@@ -101,6 +103,7 @@ if __name__ == '__main__':
    parser.add_argument("--output_dir", type=str)
    parser.add_argument("--vc", type=str)
    parser.add_argument("--pai_token", type=str)
+    parser.add_argument("--pai_reuse", type=str)
    parser.add_argument("--pai_storage_config_name", type=str)
    parser.add_argument("--nni_manager_nfs_mount_path", type=str)
    parser.add_argument("--container_nfs_mount_path", type=str)

--- a/test/pipelines/pipelines-it-pai.yml
+++ b/test/pipelines/pipelines-it-pai.yml
@@ -57,7 +57,7 @@ jobs:

      echo "TEST_IMG:$TEST_IMG"
      cd test
-      python3 nni_test/nnitest/generate_ts_config.py --ts pai --pai_host $(pai_host) --pai_user $(pai_user) --nni_docker_image $TEST_IMG --pai_storage_config_name $(pai_storage_config_name)\
+      python3 nni_test/nnitest/generate_ts_config.py --ts pai --pai_reuse $(pai_reuse) --pai_host $(pai_host) --pai_user $(pai_user) --nni_docker_image $TEST_IMG --pai_storage_config_name $(pai_storage_config_name)\
       --pai_token $(pai_token) --nni_manager_nfs_mount_path $(nni_manager_nfs_mount_path) --container_nfs_mount_path $(container_nfs_mount_path) --nni_manager_ip $(nni_manager_ip) --vc $(virtual_cluster)
      PATH=$HOME/.local/bin:$PATH python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts pai
    displayName: 'integration test'
--- a/tools/nni_cmd/config_schema.py
+++ b/tools/nni_cmd/config_schema.py
@@ -14,10 +14,12 @@ def setType(key, valueType):
    '''check key type'''
    return And(valueType, error=SCHEMA_TYPE_ERROR % (key, valueType.__name__))

+
 def setChoice(key, *args):
    '''check choice'''
    return And(lambda n: n in args, error=SCHEMA_RANGE_ERROR % (key, str(args)))

+
 def setNumberRange(key, keyType, start, end):
    '''check number range'''
    return And(
@@ -25,16 +27,19 @@ def setNumberRange(key, keyType, start, end):
        And(lambda n: start <= n <= end, error=SCHEMA_RANGE_ERROR % (key, '(%s,%s)' % (start, end))),
    )

+
 def setPathCheck(key):
    '''check if path exist'''
    return And(os.path.exists, error=SCHEMA_PATH_ERROR % key)

+
 class AlgoSchema:
    """
    This class is the schema of 'tuner', 'assessor' and 'advisor' sections of experiment configuraion file.
    For example:
    AlgoSchema('tuner') creates the schema of tuner section.
    """
+
    def __init__(self, algo_type):
        """
        Parameters:
@@ -108,6 +113,7 @@ class AlgoSchema:
        Schema(self.algo_schema).validate(data)
        self.validate_extras(data, self.algo_type)

+
 common_schema = {
    'authorName': setType('authorName', str),
    'experimentName': setType('experimentName', str),
@@ -138,7 +144,7 @@ common_schema = {
 }

 common_trial_schema = {
-    'trial':{
+    'trial': {
        'command': setType('command', str),
        'codeDir': setPathCheck('codeDir'),
        Optional('gpuNum'): setNumberRange('gpuNum', int, 0, 99999),
@@ -147,7 +153,7 @@ common_trial_schema = {
 }

 pai_yarn_trial_schema = {
-    'trial':{
+    'trial': {
        'command': setType('command', str),
        'codeDir': setPathCheck('codeDir'),
        'gpuNum': setNumberRange('gpuNum', int, 0, 99999),
@@ -156,10 +162,10 @@ pai_yarn_trial_schema = {
        'image': setType('image', str),
        Optional('authFile'): And(os.path.exists, error=SCHEMA_PATH_ERROR % 'authFile'),
        Optional('shmMB'): setType('shmMB', int),
-        Optional('dataDir'): And(Regex(r'hdfs://(([0-9]{1,3}.){3}[0-9]{1,3})(:[0-9]{2,5})?(/.*)?'),\
-                            error='ERROR: dataDir format error, dataDir format is hdfs://xxx.xxx.xxx.xxx:xxx'),
-        Optional('outputDir'): And(Regex(r'hdfs://(([0-9]{1,3}.){3}[0-9]{1,3})(:[0-9]{2,5})?(/.*)?'),\
-                            error='ERROR: outputDir format error, outputDir format is hdfs://xxx.xxx.xxx.xxx:xxx'),
+        Optional('dataDir'): And(Regex(r'hdfs://(([0-9]{1,3}.){3}[0-9]{1,3})(:[0-9]{2,5})?(/.*)?'),
+                                 error='ERROR: dataDir format error, dataDir format is hdfs://xxx.xxx.xxx.xxx:xxx'),
+        Optional('outputDir'): And(Regex(r'hdfs://(([0-9]{1,3}.){3}[0-9]{1,3})(:[0-9]{2,5})?(/.*)?'),
+                                   error='ERROR: outputDir format error, outputDir format is hdfs://xxx.xxx.xxx.xxx:xxx'),
        Optional('virtualCluster'): setType('virtualCluster', str),
        Optional('nasMode'): setChoice('nasMode', 'classic_mode', 'enas_mode', 'oneshot_mode', 'darts_mode'),
        Optional('portList'): [{
@@ -184,7 +190,7 @@ pai_yarn_config_schema = {


 pai_trial_schema = {
-    'trial':{
+    'trial': {
        'codeDir': setPathCheck('codeDir'),
        'nniManagerNFSMountPath': setPathCheck('nniManagerNFSMountPath'),
        'containerNFSMountPath': setType('containerNFSMountPath', str),
@@ -200,21 +206,21 @@ pai_trial_schema = {
 }

 pai_config_schema = {
-    'paiConfig': Or({
-        'userName': setType('userName', str),
-        'passWord': setType('passWord', str),
-        'host': setType('host', str),
-        Optional('reuse'): setType('reuse', bool)
-    }, {
+    'paiConfig': {
        'userName': setType('userName', str),
-        'token': setType('token', str),
+        Or('passWord', 'token', only_one=True): str,
        'host': setType('host', str),
-        Optional('reuse'): setType('reuse', bool)
-    })
+        Optional('reuse'): setType('reuse', bool),
+        Optional('gpuNum'): setNumberRange('gpuNum', int, 0, 99999),
+        Optional('cpuNum'): setNumberRange('cpuNum', int, 0, 99999),
+        Optional('memoryMB'): setType('memoryMB', int),
+        Optional('maxTrialNumPerGpu'): setType('maxTrialNumPerGpu', int),
+        Optional('useActiveGpu'): setType('useActiveGpu', bool),
+    }
 }

 dlts_trial_schema = {
-    'trial':{
+    'trial': {
        'command': setType('command', str),
        'codeDir': setPathCheck('codeDir'),
        'gpuNum': setNumberRange('gpuNum', int, 0, 99999),
@@ -235,7 +241,7 @@ dlts_config_schema = {
 }

 aml_trial_schema = {
-    'trial':{
+    'trial': {
        'codeDir': setPathCheck('codeDir'),
        'command': setType('command', str),
        'image': setType('image', str),
@@ -252,7 +258,7 @@ aml_config_schema = {
 }

 kubeflow_trial_schema = {
-    'trial':{
+    'trial': {
        'codeDir':  setPathCheck('codeDir'),
        Optional('nasMode'): setChoice('nasMode', 'classic_mode', 'enas_mode', 'oneshot_mode', 'darts_mode'),
        Optional('ps'): {
@@ -273,7 +279,7 @@ kubeflow_trial_schema = {
            'image': setType('image', str),
            Optional('privateRegistryAuthPath'): And(os.path.exists, error=SCHEMA_PATH_ERROR % 'privateRegistryAuthPath')
        },
-        Optional('worker'):{
+        Optional('worker'): {
            'replicas': setType('replicas', int),
            'command': setType('command', str),
            'gpuNum': setNumberRange('gpuNum', int, 0, 99999),
@@ -286,7 +292,7 @@ kubeflow_trial_schema = {
 }

 kubeflow_config_schema = {
-    'kubeflowConfig':Or({
+    'kubeflowConfig': Or({
        'operator': setChoice('operator', 'tf-operator', 'pytorch-operator'),
        'apiVersion': setType('apiVersion', str),
        Optional('storage'): setChoice('storage', 'nfs', 'azureStorage'),
@@ -299,23 +305,23 @@ kubeflow_config_schema = {
        'apiVersion': setType('apiVersion', str),
        Optional('storage'): setChoice('storage', 'nfs', 'azureStorage'),
        'keyVault': {
-            'vaultName': And(Regex('([0-9]|[a-z]|[A-Z]|-){1,127}'),\
-                         error='ERROR: vaultName format error, vaultName support using (0-9|a-z|A-Z|-)'),
-            'name': And(Regex('([0-9]|[a-z]|[A-Z]|-){1,127}'),\
-                    error='ERROR: name format error, name support using (0-9|a-z|A-Z|-)')
+            'vaultName': And(Regex('([0-9]|[a-z]|[A-Z]|-){1,127}'),
+                             error='ERROR: vaultName format error, vaultName support using (0-9|a-z|A-Z|-)'),
+            'name': And(Regex('([0-9]|[a-z]|[A-Z]|-){1,127}'),
+                        error='ERROR: name format error, name support using (0-9|a-z|A-Z|-)')
        },
        'azureStorage': {
-            'accountName': And(Regex('([0-9]|[a-z]|[A-Z]|-){3,31}'),\
-                           error='ERROR: accountName format error, accountName support using (0-9|a-z|A-Z|-)'),
-            'azureShare': And(Regex('([0-9]|[a-z]|[A-Z]|-){3,63}'),\
-                          error='ERROR: azureShare format error, azureShare support using (0-9|a-z|A-Z|-)')
+            'accountName': And(Regex('([0-9]|[a-z]|[A-Z]|-){3,31}'),
+                               error='ERROR: accountName format error, accountName support using (0-9|a-z|A-Z|-)'),
+            'azureShare': And(Regex('([0-9]|[a-z]|[A-Z]|-){3,63}'),
+                              error='ERROR: azureShare format error, azureShare support using (0-9|a-z|A-Z|-)')
        },
        Optional('uploadRetryCount'): setNumberRange('uploadRetryCount', int, 1, 99999)
    })
 }

 frameworkcontroller_trial_schema = {
-    'trial':{
+    'trial': {
        'codeDir':  setPathCheck('codeDir'),
        'taskRoles': [{
            'name': setType('name', str),
@@ -335,7 +341,7 @@ frameworkcontroller_trial_schema = {
 }

 frameworkcontroller_config_schema = {
-    'frameworkcontrollerConfig':Or({
+    'frameworkcontrollerConfig': Or({
        Optional('storage'): setChoice('storage', 'nfs', 'azureStorage'),
        Optional('serviceAccountName'): setType('serviceAccountName', str),
        'nfs': {
@@ -346,23 +352,23 @@ frameworkcontroller_config_schema = {
        Optional('storage'): setChoice('storage', 'nfs', 'azureStorage'),
        Optional('serviceAccountName'): setType('serviceAccountName', str),
        'keyVault': {
-            'vaultName': And(Regex('([0-9]|[a-z]|[A-Z]|-){1,127}'),\
-                         error='ERROR: vaultName format error, vaultName support using (0-9|a-z|A-Z|-)'),
-            'name': And(Regex('([0-9]|[a-z]|[A-Z]|-){1,127}'),\
-                    error='ERROR: name format error, name support using (0-9|a-z|A-Z|-)')
+            'vaultName': And(Regex('([0-9]|[a-z]|[A-Z]|-){1,127}'),
+                             error='ERROR: vaultName format error, vaultName support using (0-9|a-z|A-Z|-)'),
+            'name': And(Regex('([0-9]|[a-z]|[A-Z]|-){1,127}'),
+                        error='ERROR: name format error, name support using (0-9|a-z|A-Z|-)')
        },
        'azureStorage': {
-            'accountName': And(Regex('([0-9]|[a-z]|[A-Z]|-){3,31}'),\
-                           error='ERROR: accountName format error, accountName support using (0-9|a-z|A-Z|-)'),
-            'azureShare': And(Regex('([0-9]|[a-z]|[A-Z]|-){3,63}'),\
-                          error='ERROR: azureShare format error, azureShare support using (0-9|a-z|A-Z|-)')
+            'accountName': And(Regex('([0-9]|[a-z]|[A-Z]|-){3,31}'),
+                               error='ERROR: accountName format error, accountName support using (0-9|a-z|A-Z|-)'),
+            'azureShare': And(Regex('([0-9]|[a-z]|[A-Z]|-){3,63}'),
+                              error='ERROR: azureShare format error, azureShare support using (0-9|a-z|A-Z|-)')
        },
        Optional('uploadRetryCount'): setNumberRange('uploadRetryCount', int, 1, 99999)
    })
 }

 machine_list_schema = {
-    'machineList':[Or(
+    'machineList': [Or(
        {
            'ip': setType('ip', str),
            Optional('port'): setNumberRange('port', int, 1, 65535),
@@ -395,6 +401,7 @@ training_service_schema_dict = {
    'dlts': Schema({**common_schema, **dlts_trial_schema, **dlts_config_schema}),
 }

+
 class NNIConfigSchema:
    def validate(self, data):
        train_service = data['trainingServicePlatform']
@@ -483,19 +490,25 @@ class NNIConfigSchema:
                if not taskRoles_dict:
                    raise SchemaError('Please set taskRoles in paiConfigPath config file!')
            else:
-                pai_trial_fields_required_list = ['image', 'gpuNum', 'cpuNum', 'memoryMB', 'paiStorageConfigName', 'command']
+                pai_trial_fields_required_list = ['image', 'paiStorageConfigName', 'command']
                for trial_field in pai_trial_fields_required_list:
                    if experiment_config['trial'].get(trial_field) is None:
                        raise SchemaError('Please set {0} in trial configuration,\
                                    or set additional pai configuration file path in paiConfigPath!'.format(trial_field))
+                pai_resource_fields_required_list = ['gpuNum', 'cpuNum', 'memoryMB']
+                for required_field in pai_resource_fields_required_list:
+                    if experiment_config['trial'].get(required_field) is None and \
+                            experiment_config['paiConfig'].get(required_field) is None:
+                        raise SchemaError('Please set {0} in trial or paiConfig configuration,\
+                                    or set additional pai configuration file path in paiConfigPath!'.format(required_field))

    def validate_pai_trial_conifg(self, experiment_config):
        '''validate the trial config in pai platform'''
        if experiment_config.get('trainingServicePlatform') in ['pai', 'paiYarn']:
            if experiment_config.get('trial').get('shmMB') and \
-            experiment_config['trial']['shmMB'] > experiment_config['trial']['memoryMB']:
+                    experiment_config['trial']['shmMB'] > experiment_config['trial']['memoryMB']:
                raise SchemaError('shmMB should be no more than memoryMB!')
-            #backward compatibility
+            # backward compatibility
            warning_information = '{0} is not supported in NNI anymore, please remove the field in config file!\
            please refer https://github.com/microsoft/nni/blob/master/docs/en_US/TrainingService/PaiMode.md#run-an-experiment\
            for the practices of how to get data and output model in trial code'
@@ -508,6 +521,6 @@ class NNIConfigSchema:
    def validate_eth0_device(self, experiment_config):
        '''validate whether the machine has eth0 device'''
        if experiment_config.get('trainingServicePlatform') not in ['local'] \
-        and not experiment_config.get('nniManagerIp') \
-        and 'eth0' not in netifaces.interfaces():
+                and not experiment_config.get('nniManagerIp') \
+                and 'eth0' not in netifaces.interfaces():
            raise SchemaError('This machine does not contain eth0 network device, please set nniManagerIp in config file!')
--- a/tools/nni_trial_tool/base_channel.py
+++ b/tools/nni_trial_tool/base_channel.py
@@ -57,7 +57,11 @@ class BaseChannel(ABC):

    def close(self):
        self.is_running = False
-        self._inner_close()
+        try:
+            self._inner_close()
+        except Exception as err:
+            # ignore any error on closing
+            print("error on closing channel: %s" % err)

    def send(self, command, data):
        """Send command to Training Service.

--- a/tools/nni_trial_tool/log_utils.py
+++ b/tools/nni_trial_tool/log_utils.py
@@ -82,7 +82,11 @@ class RemoteLogger(object):
        '''
        constructor
        '''
-        self.logger = logging.getLogger('nni_syslog_{}'.format(tag))
+        logger_name = 'nni_syslog_{}'.format(tag)
+        # include the trial id in the logger name so that multiple trials do not log to the same logger
+        if trial_id is not None:
+            logger_name = '{}_{}'.format(logger_name, trial_id)
+        self.logger = logging.getLogger(logger_name)
        self.log_level = log_level
        self.logger.setLevel(self.log_level)
        self.pipeReader = None

--- a/tools/nni_trial_tool/trial.py
+++ b/tools/nni_trial_tool/trial.py
@@ -86,11 +86,17 @@ class Trial:
                    break
                time.sleep(0.1)

+        trial_command = self.args.trial_command
+
+        gpuIndices = self.data.get('gpuIndices')
+        if (gpuIndices is not None):
+            trial_command = 'CUDA_VISIBLE_DEVICES="%s " %s' % (gpuIndices, trial_command)
+
        self.log_pipe_stdout = self.trial_syslogger_stdout.get_pipelog_reader()
-        self.process = Popen(self.args.trial_command, shell=True, stdout=self.log_pipe_stdout,
+        self.process = Popen(trial_command, shell=True, stdout=self.log_pipe_stdout,
                             stderr=self.log_pipe_stdout, cwd=trial_code_dir, env=dict(environ))
        nni_log(LogType.Info, '{0}: spawns a subprocess (pid {1}) to run command: {2}'.
-                format(self.name, self.process.pid, shlex.split(self.args.trial_command)))
+                format(self.name, self.process.pid, shlex.split(trial_command)))

    def save_parameter_file(self, command_data):
        parameters = command_data["parameters"]

--- a/tools/nni_trial_tool/web_channel.py
+++ b/tools/nni_trial_tool/web_channel.py
@@ -37,9 +37,9 @@ class WebChannel(BaseChannel):
    def _inner_close(self):
        if self.client is not None:
            self.client.close()
-            if self._event_loop.is_running():
-                self._event_loop.close()
            self.client = None
+            if self._event_loop.is_running():
+                self._event_loop.stop()
            self._event_loop = None

    def _inner_send(self, message):