launcher_utils.py 16.4 KB
Newer Older
liuzhe-lz's avatar
liuzhe-lz committed
1
2
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
Deshui Yu's avatar
Deshui Yu committed
3
4

import os
5
import json
chicm-ms's avatar
chicm-ms committed
6
7
from schema import SchemaError
from schema import Schema
George Cheng's avatar
George Cheng committed
8
9
10
from .config_schema import LOCAL_CONFIG_SCHEMA, REMOTE_CONFIG_SCHEMA, PAI_CONFIG_SCHEMA, PAI_YARN_CONFIG_SCHEMA, \
                           DLTS_CONFIG_SCHEMA, KUBEFLOW_CONFIG_SCHEMA, FRAMEWORKCONTROLLER_CONFIG_SCHEMA, \
                           tuner_schema_dict, advisor_schema_dict, assessor_schema_dict
SparkSnail's avatar
SparkSnail committed
11
from .common_utils import print_error, print_warning, print_normal, get_yml_content
12
13
14
15
16
17
18
19
20

def expand_path(experiment_config, key):
    '''Expand a leading '~' in experiment_config[key] to the user home directory.

    The dict is updated in place; a missing or empty value is left untouched.
    '''
    raw_path = experiment_config.get(key)
    if not raw_path:
        return
    experiment_config[key] = os.path.expanduser(raw_path)

def parse_relative_path(root_path, experiment_config, key):
    '''Turn a relative path stored in experiment_config[key] into an absolute one.

    The relative path is joined onto root_path and written back in place;
    the rewrite is reported through print_normal. Missing, empty and already
    absolute values are left untouched.
    '''
    current_path = experiment_config.get(key)
    if not current_path or os.path.isabs(current_path):
        return
    absolute_path = os.path.join(root_path, current_path)
    print_normal('expand %s: %s to %s ' % (key, current_path, absolute_path))
    experiment_config[key] = absolute_path

25
26
27
def parse_time(time):
    '''Convert a duration string such as '30m' or '2d' into seconds.

    time: a non-negative integer immediately followed by one unit character,
          's' (seconds), 'm' (minutes), 'h' (hours) or 'd' (days).
    Returns the duration as an int number of seconds.
    Prints an error and exits with status 1 when the value is empty, the unit
    is unknown, or the part before the unit is not a plain integer.
    '''
    seconds_per_unit = {'s': 1, 'm': 60, 'h': 3600, 'd': 86400}
    # An empty value has no unit character; report it instead of letting
    # time[-1] raise IndexError.
    if not time:
        print_error('time format error!')
        exit(1)
    unit = time[-1]
    if unit not in seconds_per_unit:
        print_error('the unit of time could only from {s, m, h, d}')
        exit(1)
    amount = time[:-1]
    # isdecimal() only accepts characters int() can parse; isdigit() also lets
    # through digits such as superscripts, for which int() raises ValueError.
    if not amount.isdecimal():
        print_error('time format error!')
        exit(1)
    return int(amount) * seconds_per_unit[unit]
Deshui Yu's avatar
Deshui Yu committed
37

38
39
40
41
42
def _collect_path_fields(experiment_config):
    '''Return (section, key) pairs for every path-valued field the config may contain'''
    fields = [(experiment_config, 'searchSpacePath')]
    trial = experiment_config.get('trial')
    if trial:
        fields.append((trial, 'codeDir'))
        fields.append((trial, 'authFile'))
        for role in ('ps', 'master', 'worker'):
            if trial.get(role):
                fields.append((trial[role], 'privateRegistryAuthPath'))
        for task_role in trial.get('taskRoles') or []:
            fields.append((task_role, 'privateRegistryAuthPath'))
    for section_name in ('tuner', 'assessor', 'advisor'):
        if experiment_config.get(section_name):
            fields.append((experiment_config[section_name], 'codeDir'))
    for machine in experiment_config.get('machineList') or []:
        fields.append((machine, 'sshKeyPath'))
    # paiConfigPath is guarded like every other trial field, so a config
    # without a trial section no longer raises KeyError here.
    if trial:
        fields.append((trial, 'paiConfigPath'))
    return fields


def parse_path(experiment_config, config_path):
    '''Parse path in config file.

    Every path-valued field of experiment_config is rewritten in place:
    first '~' is expanded to the user home directory, then relative paths are
    made absolute relative to the directory that contains config_path.
    Fields that are missing or empty are skipped.
    '''
    fields = _collect_path_fields(experiment_config)
    for section, key in fields:
        expand_path(section, key)
    # if users use relative path, convert it to absolute path
    root_path = os.path.dirname(config_path)
    for section, key in fields:
        parse_relative_path(root_path, section, key)
102
103

def validate_search_space_content(experiment_config):
    '''Validate searchspace content.

    The file named by experiment_config['searchSpacePath'] must be readable
    JSON holding an object whose values are objects with non-empty '_type'
    and '_value' entries. Otherwise an error is printed and the process exits
    with status 1.
    '''
    search_space_path = experiment_config.get('searchSpacePath')
    try:
        # The context manager closes the file even when json.load fails.
        with open(search_space_path, 'r') as search_space_file:
            search_space_content = json.load(search_space_file)
    except (OSError, TypeError, ValueError):
        # ValueError covers json.JSONDecodeError and decoding errors; TypeError
        # covers a missing path. SystemExit is deliberately not caught here.
        print_error('searchspace file is not a valid json format!')
        exit(1)
    if not isinstance(search_space_content, dict):
        print_error('searchspace file is not a valid json format!')
        exit(1)
    for value in search_space_content.values():
        if not isinstance(value, dict) or not value.get('_type') or not value.get('_value'):
            print_error('please use _type and _value to specify searchspace!')
            exit(1)

def validate_kubeflow_operators(experiment_config):
    '''Check that the kubeflow operator, the trial roles and the storage settings agree.

    Does nothing when kubeflowConfig is absent. Prints an error and exits
    with status 1 on the first inconsistency found.
    '''
    kubeflow_config = experiment_config.get('kubeflowConfig')
    if not kubeflow_config:
        return

    # Role requirements depend on the chosen operator.
    operator = kubeflow_config.get('operator')
    trial_config = experiment_config.get('trial')
    if operator == 'tf-operator':
        if trial_config.get('master') is not None:
            print_error('kubeflow with tf-operator can not set master')
            exit(1)
        if trial_config.get('worker') is None:
            print_error('kubeflow with tf-operator must set worker')
            exit(1)
    elif operator == 'pytorch-operator':
        if trial_config.get('ps') is not None:
            print_error('kubeflow with pytorch-operator can not set ps')
            exit(1)
        if trial_config.get('master') is None:
            print_error('kubeflow with pytorch-operator must set master')
            exit(1)

    # The declared storage type must come with its matching configuration.
    storage = kubeflow_config.get('storage')
    if storage == 'nfs' and kubeflow_config.get('nfs') is None:
        print_error('please set nfs configuration!')
        exit(1)
    elif storage == 'azureStorage' and kubeflow_config.get('azureStorage') is None:
        print_error('please set azureStorage configuration!')
        exit(1)
    elif storage is None and kubeflow_config.get('azureStorage'):
        print_error('please set storage type!')
        exit(1)
147

Deshui Yu's avatar
Deshui Yu committed
148
149
def validate_common_content(experiment_config):
    '''Validate the platform-independent parts of experiment_config and fill in defaults.

    Checks trainingServicePlatform, validates the whole config against the
    schema of that platform, then validates the tuner/advisor/assessor
    sections against the schema of their built-in name (or the 'customized'
    schema when no built-in name is given). On failure an error is printed
    and the process exits with status 1. Afterwards missing maxExecDuration,
    maxTrialNum and remote machine ports receive default values in place.
    '''
    platform = experiment_config.get('trainingServicePlatform')
    supported_platforms = ('local', 'remote', 'pai', 'kubeflow', 'frameworkcontroller', 'paiYarn', 'dlts')
    if not platform or platform not in supported_platforms:
        print_error('Please set correct trainingServicePlatform!')
        exit(1)

    platform_schemas = {
        'local': LOCAL_CONFIG_SCHEMA,
        'remote': REMOTE_CONFIG_SCHEMA,
        'pai': PAI_CONFIG_SCHEMA,
        'paiYarn': PAI_YARN_CONFIG_SCHEMA,
        'kubeflow': KUBEFLOW_CONFIG_SCHEMA,
        'frameworkcontroller': FRAMEWORKCONTROLLER_CONFIG_SCHEMA,
        'dlts': DLTS_CONFIG_SCHEMA,
    }
    # (section name, field holding the built-in name, schemas for that section)
    algorithm_sections = (
        ('tuner', 'builtinTunerName', tuner_schema_dict),
        ('advisor', 'builtinAdvisorName', advisor_schema_dict),
        ('assessor', 'builtinAssessorName', assessor_schema_dict),
    )

    try:
        platform_schemas[platform].validate(experiment_config)
        for section_name, builtin_field, section_schemas in algorithm_sections:
            section = experiment_config.get(section_name)
            if not section:
                continue
            builtin_name = section.get(builtin_field)
            if not builtin_name:
                Schema({**section_schemas['customized']}).validate(section)
                continue
            # Use the first schema whose key contains the built-in name.
            for schema_key in section_schemas:
                if builtin_name in schema_key:
                    Schema({**section_schemas[schema_key]}).validate(section)
                    break
            else:
                print_error('%s %s error!' % (section_name, builtin_field))
                exit(1)
    except SchemaError as error:
        print_error('Your config file is not correct, please check your config file content!')
        print_error(error.code)
        exit(1)

    # set default value
    if experiment_config.get('maxExecDuration') is None:
        experiment_config['maxExecDuration'] = '999d'
    if experiment_config.get('maxTrialNum') is None:
        experiment_config['maxTrialNum'] = 99999
    if platform == 'remote':
        for machine in experiment_config['machineList']:
            if machine.get('port') is None:
                machine['port'] = 22
Deshui Yu's avatar
Deshui Yu committed
205

QuanluZhang's avatar
QuanluZhang committed
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
def validate_customized_file(experiment_config, spec_key):
    '''
    Check that the class file of a customized tuner/assessor/advisor exists.
    spec_key: 'tuner', 'assessor', 'advisor'

    codeDir, classFileName and className must all be set, and
    codeDir/classFileName must exist on disk; otherwise an error is printed
    and the process exits with status 1.
    '''
    spec = experiment_config[spec_key]
    code_dir = spec.get('codeDir')
    class_file_name = spec.get('classFileName')
    fully_specified = code_dir and class_file_name and spec.get('className')
    if fully_specified and os.path.exists(os.path.join(code_dir, class_file_name)):
        return
    print_error('%s file directory is not valid!'%(spec_key))
    exit(1)

223
def parse_tuner_content(experiment_config):
    '''Check the class file of the tuner when no builtinTunerName is configured'''
    if experiment_config['tuner'].get('builtinTunerName'):
        return
    validate_customized_file(experiment_config, 'tuner')
Deshui Yu's avatar
Deshui Yu committed
227

228
def parse_assessor_content(experiment_config):
    '''Check the class file of the assessor, if one is configured without builtinAssessorName'''
    assessor = experiment_config.get('assessor')
    if not assessor or assessor.get('builtinAssessorName'):
        return
    validate_customized_file(experiment_config, 'assessor')
Deshui Yu's avatar
Deshui Yu committed
233

QuanluZhang's avatar
QuanluZhang committed
234
235
def parse_advisor_content(experiment_config):
    '''Check the class file of the advisor when no builtinAdvisorName is configured'''
    if experiment_config['advisor'].get('builtinAdvisorName'):
        return
    validate_customized_file(experiment_config, 'advisor')

def validate_annotation_content(experiment_config, spec_key, builtin_name):
    '''
    Check that useAnnotation and searchSpacePath are not set together, and
    that a search space is supplied when a built-in tuner/advisor needs one.
    spec_key: 'advisor' or 'tuner'
    builtin_name: 'builtinAdvisorName' or 'builtinTunerName'
    '''
    if experiment_config.get('useAnnotation'):
        if experiment_config.get('searchSpacePath'):
            print_error('If you set useAnnotation=true, please leave searchSpacePath empty')
            exit(1)
        return

    # validate searchSpaceFile
    # Customized algorithms and NetworkMorphism are not checked for a search space file.
    configured_name = experiment_config[spec_key].get(builtin_name)
    if not configured_name or configured_name == 'NetworkMorphism':
        return
    if experiment_config.get('searchSpacePath') is None:
        print_error('Please set searchSpacePath!')
        exit(1)
    validate_search_space_content(experiment_config)
Deshui Yu's avatar
Deshui Yu committed
258

259
260
261
def validate_machine_list(experiment_config):
    '''Require machineList to be present when the platform is remote'''
    uses_remote = experiment_config.get('trainingServicePlatform') == 'remote'
    if uses_remote and experiment_config.get('machineList') is None:
        print_error('Please set machineList!')
        exit(1)
Deshui Yu's avatar
Deshui Yu committed
264

SparkSnail's avatar
SparkSnail committed
265
266
267
268
def validate_pai_config_path(experiment_config):
    '''Validate the paiConfigPath field for the pai platform.

    When trial.paiConfigPath is set, the referenced yml file must define
    taskRoles. Otherwise the trial section itself must provide image, gpuNum,
    cpuNum, memoryMB, paiStorageConfigName and command. On failure an error
    is printed and the process exits with status 1. Other platforms are not
    checked.
    '''
    if experiment_config.get('trainingServicePlatform') != 'pai':
        return
    # Read the trial section the same tolerant way on both branches, so a
    # missing section is reported as missing fields instead of a KeyError.
    trial = experiment_config.get('trial') or {}
    if trial.get('paiConfigPath'):
        # validate commands
        pai_config = get_yml_content(trial['paiConfigPath'])
        # NOTE(review): get_yml_content is defined elsewhere; the falsy check
        # guards against it yielding None for an empty file - confirm.
        if not pai_config or not pai_config.get('taskRoles'):
            print_error('Please set taskRoles in paiConfigPath config file!')
            exit(1)
    else:
        pai_trial_fields_required_list = ['image', 'gpuNum', 'cpuNum', 'memoryMB', 'paiStorageConfigName', 'command']
        # Built from adjacent literals so the message no longer embeds the
        # source indentation that a backslash continuation inside the string added.
        missing_field_message = (
            'Please set {0} in trial configuration, '
            'or set additional pai configuration file path in paiConfigPath!'
        )
        for trial_field in pai_trial_fields_required_list:
            if trial.get(trial_field) is None:
                print_error(missing_field_message.format(trial_field))
                exit(1)

283
284
def validate_pai_trial_conifg(experiment_config):
    '''Validate the trial config in pai platform.

    For the 'pai' and 'paiYarn' platforms: shmMB must not exceed memoryMB,
    the retired dataDir/outputDir fields trigger a warning, and the
    paiConfigPath checks are run. Other platforms are not checked.
    '''
    if experiment_config.get('trainingServicePlatform') not in ['pai', 'paiYarn']:
        return
    trial = experiment_config.get('trial') or {}
    shm_mb = trial.get('shmMB')
    memory_mb = trial.get('memoryMB')
    # memoryMB can be absent (validate_pai_config_path only requires it when
    # paiConfigPath is not set), so compare only when both values are present
    # instead of raising KeyError.
    if shm_mb and memory_mb is not None and shm_mb > memory_mb:
        print_error('shmMB should be no more than memoryMB!')
        exit(1)
    #backward compatibility
    # Adjacent literals keep the message free of the source indentation that
    # backslash continuations inside the string used to embed.
    warning_information = (
        '{0} is not supported in NNI anymore, please remove the field in config file! '
        'please refer https://github.com/microsoft/nni/blob/master/docs/en_US/TrainingService/PaiMode.md#run-an-experiment '
        'for the practices of how to get data and output model in trial code'
    )
    for retired_field in ('dataDir', 'outputDir'):
        if trial.get(retired_field):
            print_warning(warning_information.format(retired_field))
    validate_pai_config_path(experiment_config)
299

300
def validate_all_content(experiment_config, config_path):
    '''Resolve paths in experiment_config and run every validation step on it.

    maxExecDuration is replaced in place by its value in seconds. An advisor
    excludes tuner and assessor; without an advisor a tuner is mandatory and
    an Exception is raised when it is missing.
    '''
    parse_path(experiment_config, config_path)
    validate_common_content(experiment_config)
    validate_pai_trial_conifg(experiment_config)
    experiment_config['maxExecDuration'] = parse_time(experiment_config['maxExecDuration'])

    if not experiment_config.get('advisor'):
        # Tuner (plus optional assessor) mode.
        if not experiment_config.get('tuner'):
            raise Exception('Please provide tuner spec!')
        parse_tuner_content(experiment_config)
        parse_assessor_content(experiment_config)
        validate_annotation_content(experiment_config, 'tuner', 'builtinTunerName')
        return

    # Advisor mode.
    if experiment_config.get('assessor') or experiment_config.get('tuner'):
        print_error('advisor could not be set with assessor or tuner simultaneously!')
        exit(1)
    parse_advisor_content(experiment_config)
    validate_annotation_content(experiment_config, 'advisor', 'builtinAdvisorName')