launcher_utils.py 14.9 KB
Newer Older
liuzhe-lz's avatar
liuzhe-lz committed
1
2
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
Deshui Yu's avatar
Deshui Yu committed
3
4

import os
5
import json
chicm-ms's avatar
chicm-ms committed
6
7
8
9
10
from schema import SchemaError
from schema import Schema
from .config_schema import LOCAL_CONFIG_SCHEMA, REMOTE_CONFIG_SCHEMA, PAI_CONFIG_SCHEMA, KUBEFLOW_CONFIG_SCHEMA,\
                           FRAMEWORKCONTROLLER_CONFIG_SCHEMA, tuner_schema_dict, advisor_schema_dict, assessor_schema_dict
from .common_utils import print_error, print_warning, print_normal
11
12
13
14
15
16
17
18
19

def expand_path(experiment_config, key):
    '''Expand a leading '~' in experiment_config[key] to the user home directory.

    The entry is rewritten in place; a missing or empty entry is left untouched.
    '''
    raw_path = experiment_config.get(key)
    if not raw_path:
        return
    experiment_config[key] = os.path.expanduser(raw_path)

def parse_relative_path(root_path, experiment_config, key):
    '''Resolve a relative path stored under experiment_config[key] against root_path.

    root_path: directory that relative paths are joined onto
    experiment_config: dict holding the path; updated in place
    key: name of the entry to rewrite

    Missing or empty entries and already-absolute paths are left untouched.
    The new value is os.path.join(root_path, value), so it is only absolute
    when root_path itself is absolute.
    '''
    relative_path = experiment_config.get(key)
    if not relative_path or os.path.isabs(relative_path):
        return
    absolute_path = os.path.join(root_path, relative_path)
    print_normal('expand %s: %s to %s ' % (key, relative_path, absolute_path))
    experiment_config[key] = absolute_path

24
25
26
def parse_time(time):
    '''Convert a duration such as '30s', '10m', '2h' or '1d' to seconds.

    time: a non-negative integer immediately followed by one unit character
          from {s, m, h, d}
    Returns the duration in seconds as an int.
    Prints an error and exits with status 1 when the unit or the number is invalid.
    '''
    seconds_per_unit = {'s': 1, 'm': 60, 'h': 3600, 'd': 86400}
    # Work on the string form and slice (rather than index) the last character,
    # so an empty or non-string value is reported as a format error below
    # instead of raising IndexError/TypeError.
    text = str(time)
    unit = text[-1:]
    if unit not in seconds_per_unit:
        print_error('the unit of time could only from {s, m, h, d}')
        exit(1)
    number = text[:-1]
    # isdecimal() accepts exactly the characters int() can convert; isdigit()
    # also accepts characters such as superscript digits, which make int() raise.
    if not number.isdecimal():
        print_error('time format error!')
        exit(1)
    return int(number) * seconds_per_unit[unit]
Deshui Yu's avatar
Deshui Yu committed
36

37
38
39
40
41
def _path_fields(experiment_config):
    '''Return (section, key) pairs for every path-valued config entry, in processing order.'''
    fields = [(experiment_config, 'searchSpacePath')]
    trial = experiment_config.get('trial')
    if trial:
        fields.append((trial, 'codeDir'))
        fields.append((trial, 'authFile'))
        for role in ('ps', 'master', 'worker'):
            if trial.get(role):
                fields.append((trial[role], 'privateRegistryAuthPath'))
        for task_role in trial.get('taskRoles') or []:
            fields.append((task_role, 'privateRegistryAuthPath'))
    for section_name in ('tuner', 'assessor', 'advisor'):
        if experiment_config.get(section_name):
            fields.append((experiment_config[section_name], 'codeDir'))
    for machine in experiment_config.get('machineList') or []:
        fields.append((machine, 'sshKeyPath'))
    return fields


def parse_path(experiment_config, config_path):
    '''Parse path in config file

    Every path-valued entry of experiment_config is rewritten in place:
    first '~' is expanded to the user home directory, then relative paths
    are resolved against the directory that contains config_path.

    experiment_config: the loaded experiment configuration (dict)
    config_path: path of the config file the configuration was loaded from
    '''
    fields = _path_fields(experiment_config)
    for section, key in fields:
        expand_path(section, key)

    #if users use relative path, convert it to absolute path
    root_path = os.path.dirname(config_path)
    if not os.path.isabs(root_path):
        # A bare or relative config path would otherwise yield a relative (or
        # empty) root, leaving the "absolute" paths relative to the current directory.
        root_path = os.path.abspath(root_path)
    for section, key in fields:
        parse_relative_path(root_path, section, key)
97
98

def validate_search_space_content(experiment_config):
    '''Validate searchspace content.

    The file at experiment_config['searchSpacePath'] must hold a JSON object
    whose values are objects that each carry a non-empty _type and _value.
    Prints an error and exits with status 1 otherwise.
    '''
    try:
        # the context manager closes the file even when parsing fails
        with open(experiment_config.get('searchSpacePath'), 'r') as search_space_file:
            search_space_content = json.load(search_space_file)
        search_space_values = list(search_space_content.values())
    except (OSError, TypeError, ValueError, AttributeError):
        # unreadable or missing path, malformed JSON, or a top-level value
        # that is not a JSON object
        print_error('searchspace file is not a valid json format!')
        exit(1)
    # Checked outside the try block: exit() raises SystemExit, which a
    # catch-all handler around this loop would intercept and report as a
    # second, misleading "not a valid json" error.
    for value in search_space_values:
        if not isinstance(value, dict) or not value.get('_type') or not value.get('_value'):
            print_error('please use _type and _value to specify searchspace!')
            exit(1)

def validate_kubeflow_operators(experiment_config):
    '''Validate whether the kubeflow operators are valid

    Checks, when a kubeflowConfig section is present:
      - tf-operator: trial must not set master and must set worker
      - pytorch-operator: trial must not set ps and must set master
      - storage 'nfs' / 'azureStorage': the section of the same name must be set
      - no storage type: azureStorage must not be set
    Prints an error and exits with status 1 on the first violation.
    '''
    kubeflow_config = experiment_config.get('kubeflowConfig')
    if not kubeflow_config:
        return
    # A missing trial section is treated as empty so the role checks below
    # report a config error instead of raising AttributeError on None.
    trial_config = experiment_config.get('trial') or {}

    operator = kubeflow_config.get('operator')
    if operator == 'tf-operator':
        if trial_config.get('master') is not None:
            print_error('kubeflow with tf-operator can not set master')
            exit(1)
        if trial_config.get('worker') is None:
            print_error('kubeflow with tf-operator must set worker')
            exit(1)
    elif operator == 'pytorch-operator':
        if trial_config.get('ps') is not None:
            print_error('kubeflow with pytorch-operator can not set ps')
            exit(1)
        if trial_config.get('master') is None:
            print_error('kubeflow with pytorch-operator must set master')
            exit(1)

    storage = kubeflow_config.get('storage')
    if storage == 'nfs':
        if kubeflow_config.get('nfs') is None:
            print_error('please set nfs configuration!')
            exit(1)
    elif storage == 'azureStorage':
        if kubeflow_config.get('azureStorage') is None:
            print_error('please set azureStorage configuration!')
            exit(1)
    elif storage is None:
        if kubeflow_config.get('azureStorage'):
            print_error('please set storage type!')
            exit(1)
142

Deshui Yu's avatar
Deshui Yu committed
143
144
def _builtin_name_matches(schema_key, builtin_name):
    '''Return True when builtin_name is one of the names a schema-table key stands for.'''
    if isinstance(schema_key, str):
        # Exact comparison: a substring test would let e.g. 'custom' match
        # the 'customized' entry of the table.
        return schema_key == builtin_name
    return builtin_name in schema_key


def validate_common_content(experiment_config):
    '''Validate whether the common values in experiment_config is valid

    The config is checked against the schema of its trainingServicePlatform,
    then each tuner/advisor/assessor section is checked against the schema of
    its builtin name (or the 'customized' schema when no builtin name is set).
    Prints an error and exits with status 1 when validation fails.
    On success, defaults are filled in place: maxExecDuration, maxTrialNum
    and, for the remote platform, the port of every machineList entry.
    '''
    schema_dict = {
        'local': LOCAL_CONFIG_SCHEMA,
        'remote': REMOTE_CONFIG_SCHEMA,
        'pai': PAI_CONFIG_SCHEMA,
        'kubeflow': KUBEFLOW_CONFIG_SCHEMA,
        'frameworkcontroller': FRAMEWORKCONTROLLER_CONFIG_SCHEMA
        }
    separate_schema_dict = {
        'tuner': tuner_schema_dict,
        'advisor': advisor_schema_dict,
        'assessor': assessor_schema_dict
    }
    separate_builtInName_dict = {
        'tuner': 'builtinTunerName',
        'advisor': 'builtinAdvisorName',
        'assessor': 'builtinAssessorName'
    }
    platform = experiment_config.get('trainingServicePlatform')
    # compared against a list so an unhashable value is rejected rather than raising
    if not platform or platform not in list(schema_dict):
        print_error('Please set correct trainingServicePlatform!')
        exit(1)
    try:
        schema_dict[platform].validate(experiment_config)
        for separate_key, schema_table in separate_schema_dict.items():
            section = experiment_config.get(separate_key)
            if not section:
                continue
            name_field = separate_builtInName_dict[separate_key]
            builtin_name = section.get(name_field)
            if not builtin_name:
                Schema({**schema_table['customized']}).validate(section)
                continue
            for schema_key, schema_content in schema_table.items():
                if _builtin_name_matches(schema_key, builtin_name):
                    Schema({**schema_content}).validate(section)
                    break
            else:
                # no entry of the table covers this builtin name
                print_error('%s %s error!' % (separate_key, name_field))
                exit(1)
    except SchemaError as error:
        print_error('Your config file is not correct, please check your config file content!')
        print_error(error.code)
        exit(1)

    #set default value
    if experiment_config.get('maxExecDuration') is None:
        experiment_config['maxExecDuration'] = '999d'
    if experiment_config.get('maxTrialNum') is None:
        experiment_config['maxTrialNum'] = 99999
    if platform == 'remote':
        for machine in experiment_config['machineList']:
            if machine.get('port') is None:
                machine['port'] = 22
Deshui Yu's avatar
Deshui Yu committed
196

QuanluZhang's avatar
QuanluZhang committed
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
def validate_customized_file(experiment_config, spec_key):
    '''
    Ensure the class file of a customized tuner/assessor/advisor exists.
    spec_key: 'tuner', 'assessor', 'advisor'

    The section must set codeDir, classFileName and className, and
    codeDir/classFileName must exist on disk; otherwise an error is printed
    and the process exits with status 1.
    '''
    spec = experiment_config[spec_key]
    required_fields = ('codeDir', 'classFileName', 'className')
    fields_present = all(spec.get(field) for field in required_fields)
    if fields_present and os.path.exists(os.path.join(spec['codeDir'], spec['classFileName'])):
        return
    print_error('%s file directory is not valid!'%(spec_key))
    exit(1)

214
def parse_tuner_content(experiment_config):
    '''Validate whether tuner in experiment_config is valid

    A tuner with a non-empty builtinTunerName needs no further check here;
    otherwise the class file of the customized tuner must exist.
    '''
    if experiment_config['tuner'].get('builtinTunerName'):
        return
    validate_customized_file(experiment_config, 'tuner')
Deshui Yu's avatar
Deshui Yu committed
218

219
def parse_assessor_content(experiment_config):
    '''Validate whether assessor in experiment_config is valid

    The assessor section is optional. When it is present without a
    builtinAssessorName, the class file of the customized assessor must exist.
    '''
    assessor = experiment_config.get('assessor')
    if not assessor or assessor.get('builtinAssessorName'):
        return
    validate_customized_file(experiment_config, 'assessor')
Deshui Yu's avatar
Deshui Yu committed
224

QuanluZhang's avatar
QuanluZhang committed
225
226
def parse_advisor_content(experiment_config):
    '''Validate whether advisor in experiment_config is valid

    An advisor with a non-empty builtinAdvisorName needs no further check here;
    otherwise the class file of the customized advisor must exist.
    '''
    if experiment_config['advisor'].get('builtinAdvisorName'):
        return
    validate_customized_file(experiment_config, 'advisor')

def validate_annotation_content(experiment_config, spec_key, builtin_name):
    '''
    Validate that useAnnotation and searchSpacePath are not set together, and
    that a builtin tuner/advisor has a valid search space file.
    spec_key: 'advisor' or 'tuner'
    builtin_name: 'builtinAdvisorName' or 'builtinTunerName'

    Prints an error and exits with status 1 when the check fails.
    '''
    if experiment_config.get('useAnnotation'):
        if experiment_config.get('searchSpacePath'):
            print_error('If you set useAnnotation=true, please leave searchSpacePath empty')
            exit(1)
        return
    # validate searchSpaceFile
    builtin = experiment_config[spec_key].get(builtin_name)
    # customized specs and NetworkMorphism are not required to supply a search space file
    if not builtin or builtin == 'NetworkMorphism':
        return
    if experiment_config.get('searchSpacePath') is None:
        print_error('Please set searchSpacePath!')
        exit(1)
    validate_search_space_content(experiment_config)
Deshui Yu's avatar
Deshui Yu committed
249

250
251
252
def validate_machine_list(experiment_config):
    '''Validate machine list

    For the remote platform a non-empty machineList is required; an error is
    printed and the process exits with status 1 when it is missing or empty.
    '''
    if experiment_config.get('trainingServicePlatform') != 'remote':
        return
    # an empty list is rejected as well as a missing one: there would be no machine to run on
    if not experiment_config.get('machineList'):
        print_error('Please set machineList!')
        exit(1)
Deshui Yu's avatar
Deshui Yu committed
255

256
257
258
259
260
261
262
def validate_pai_trial_conifg(experiment_config):
    '''validate the trial config in pai platform

    Does nothing unless trainingServicePlatform is 'pai'. For pai:
      - prints an error and exits with status 1 when shmMB exceeds memoryMB
      - prints a warning for each of the no longer supported dataDir and
        outputDir fields that is still set in the trial section
    '''
    if experiment_config.get('trainingServicePlatform') != 'pai':
        return
    # a missing trial section is treated as empty instead of raising AttributeError
    trial_config = experiment_config.get('trial') or {}
    shm_mb = trial_config.get('shmMB')
    memory_mb = trial_config.get('memoryMB')
    # compare only when both sizes are given, so a missing memoryMB no longer raises KeyError
    if shm_mb and memory_mb is not None and shm_mb > memory_mb:
        print_error('shmMB should be no more than memoryMB!')
        exit(1)
    #backward compatibility
    # Adjacent literals instead of backslash continuations inside one literal:
    # the continuations embedded the indentation of the following lines in the message.
    warning_information = (
        '{0} is not supported in NNI anymore, please remove the field in config file! '
        'please refer https://github.com/microsoft/nni/blob/master/docs/en_US/TrainingService/PaiMode.md#run-an-experiment '
        'for the practices of how to get data and output model in trial code'
    )
    for deprecated_field in ('dataDir', 'outputDir'):
        if trial_config.get(deprecated_field):
            print_warning(warning_information.format(deprecated_field))
271

272
def validate_all_content(experiment_config, config_path):
    '''Validate whether experiment_config is valid

    Runs the individual checks in order and normalizes experiment_config in
    place: paths are expanded, defaults are filled in, and maxExecDuration is
    replaced by its value in seconds.

    experiment_config: the loaded experiment configuration (dict)
    config_path: path of the config file, used to resolve relative paths

    An advisor may not be combined with a tuner or an assessor (an error is
    printed and the process exits with status 1).
    Raises Exception when neither an advisor nor a tuner is configured.
    '''
    parse_path(experiment_config, config_path)
    validate_common_content(experiment_config)
    validate_pai_trial_conifg(experiment_config)
    experiment_config['maxExecDuration'] = parse_time(experiment_config['maxExecDuration'])
    if experiment_config.get('advisor'):
        if experiment_config.get('assessor') or experiment_config.get('tuner'):
            print_error('advisor could not be set with assessor or tuner simultaneously!')
            exit(1)
        parse_advisor_content(experiment_config)
        validate_annotation_content(experiment_config, 'advisor', 'builtinAdvisorName')
    else:
        if not experiment_config.get('tuner'):
            raise Exception('Please provide tuner spec!')
        parse_tuner_content(experiment_config)
        parse_assessor_content(experiment_config)
        validate_annotation_content(experiment_config, 'tuner', 'builtinTunerName')