launcher.py 14.7 KB
Newer Older
Deshui Yu's avatar
Deshui Yu committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge,
# to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


import json
import os
import shutil
25
import string
26
from subprocess import Popen, PIPE, call
Deshui Yu's avatar
Deshui Yu committed
27
import tempfile
28
from nni_annotation import *
Deshui Yu's avatar
Deshui Yu committed
29
from .launcher_utils import validate_all_content
30
from .rest_utils import rest_put, rest_post, check_rest_server, check_rest_server_quick, check_response
Deshui Yu's avatar
Deshui Yu committed
31
from .url_utils import cluster_metadata_url, experiment_url
32
33
from .config_utils import Config, Experiments
from .common_utils import get_yml_content, get_json_content, print_error, print_normal, print_warning, detect_process, detect_port
34
from .constants import *
35
36
from .webui_utils import *
import time
Deshui Yu's avatar
Deshui Yu committed
37

38
def start_rest_server(port, platform, mode, experiment_id=None):
    '''Run nni manager process

    Refuses to start when the experiment recorded for `port` is still running
    or when `port` is occupied, then spawns the NNI manager executable (taken
    from the NNI_MANAGER environment variable, default 'nnimanager') with its
    stdout/stderr appended to the per-port log files under NNICTL_HOME_DIR.

    port: port handed to the manager via --port; also names the log directory
    platform: value handed to the manager via --mode
    mode: value handed to the manager via --start_mode; when it is 'resume'
          the experiment id is passed as well
    experiment_id: id forwarded via --experiment_id, only used for 'resume'

    Returns a tuple (process, start_time): the Popen object of the manager and
    the launch time formatted as '%Y-%m-%d %H:%M:%S'.
    Exits with status 1 if either of the port checks fails.
    '''
    print_normal('Checking environment...')
    nni_config = Config(port)
    rest_port = nni_config.get_config('restServerPort')
    running, _ = check_rest_server_quick(rest_port)
    if rest_port and running:
        print_error(EXPERIMENT_START_FAILED_INFO % port)
        exit(1)

    if detect_port(port):
        print_error('Port %s is used by another process, please reset the port!' % port)
        exit(1)

    print_normal('Starting restful server...')
    manager = os.environ.get('NNI_MANAGER', 'nnimanager')
    cmds = [manager, '--port', str(port), '--mode', platform, '--start_mode', mode]
    if mode == 'resume':
        cmds += ['--experiment_id', experiment_id]

    log_dir = os.path.join(NNICTL_HOME_DIR, str(port))
    stdout_full_path = os.path.join(log_dir, 'stdout')
    stderr_full_path = os.path.join(log_dir, 'stderr')
    time_now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    # add time information in the header of log files
    log_header = LOG_HEADER % time_now

    # The child process receives its own copies of the file descriptors, so
    # the parent can close its handles as soon as Popen returns.
    with open(stdout_full_path, 'a+') as stdout_file, \
            open(stderr_full_path, 'a+') as stderr_file:
        stdout_file.write(log_header)
        stderr_file.write(log_header)
        # Flush before spawning: otherwise the header sits in Python's buffer
        # and may be appended after the child's first output.
        stdout_file.flush()
        stderr_file.flush()
        process = Popen(cmds, stdout=stdout_file, stderr=stderr_file)
    return process, time_now
Deshui Yu's avatar
Deshui Yu committed
68

69
70
def set_trial_config(experiment_config, port):
    '''set trial configuration

    Sends the 'trial' section of the experiment configuration to the cluster
    metadata endpoint of the rest server listening on `port`.

    experiment_config: experiment configuration; its 'trial' entry must hold
                       'command', 'codeDir' and 'gpuNum'
    port: rest server port; also names the log directory of the experiment

    Returns True when the rest server accepted the configuration, otherwise
    False. On failure the error body is printed and appended to the
    experiment's stderr log.
    '''
    trial = experiment_config['trial']
    value_dict = dict()
    # mandatory settings
    for key in ('command', 'codeDir', 'gpuNum'):
        value_dict[key] = trial[key]
    # optional settings are forwarded only when they carry a truthy value
    for key in ('cpuNum', 'memoryMB', 'image', 'dataDir', 'outputDir'):
        if trial.get(key):
            value_dict[key] = trial[key]
    request_data = {'trial_config': value_dict}

    response = rest_put(cluster_metadata_url(port), json.dumps(request_data), 20)
    if response is not None and check_response(response):
        return True

    # set_remote_config treats a None result of rest_put as possible, so do
    # the same here instead of failing on `response.text`.
    if response is None:
        print('Error message is {}'.format('no response from rest server'))
        return False

    print('Error message is {}'.format(response.text))
    stderr_full_path = os.path.join(NNICTL_HOME_DIR, str(port), 'stderr')
    with open(stderr_full_path, 'a+') as fout:
        try:
            detail = json.dumps(json.loads(response.text), indent=4, sort_keys=True, separators=(',', ':'))
        except ValueError:
            # the error body is not JSON; log it as received
            detail = response.text
        fout.write(detail)
    return False
96
97
98
99

def set_local_config(experiment_config, port):
    '''Configure a local experiment; only the trial configuration is needed.

    Returns the result of set_trial_config for the same arguments.
    '''
    succeeded = set_trial_config(experiment_config, port)
    return succeeded
Deshui Yu's avatar
Deshui Yu committed
100
101
102
103
104
105
106

def set_remote_config(experiment_config, port):
    '''Send the machine list, then the trial configuration, to the rest server.

    Returns a tuple (succeeded, err_message). err_message is the response body
    of a rejected machine-list request and '' in every other case.
    '''
    # step 1: machine_list
    payload = {'machine_list': experiment_config['machineList']}
    reply = rest_put(cluster_metadata_url(port), json.dumps(payload), 20)

    if reply and check_response(reply):
        # step 2: trial_config
        return set_trial_config(experiment_config, port), ''

    if reply is None:
        # no response at all, so there is nothing to log
        return False, ''

    error_text = reply.text
    log_path = os.path.join(NNICTL_HOME_DIR, str(port), 'stderr')
    with open(log_path, 'a+') as log_file:
        pretty = json.dumps(json.loads(error_text), indent=4, sort_keys=True, separators=(',', ':'))
        log_file.write(pretty)
    return False, error_text
Deshui Yu's avatar
Deshui Yu committed
118

119
120
121
122
123
def set_pai_config(experiment_config, port):
    '''Send the pai configuration, then the trial configuration, to the rest server.

    Returns a tuple (succeeded, err_message). err_message is the response body
    of a rejected pai-config request and None in every other case.
    '''
    payload = {'pai_config': experiment_config['paiConfig']}
    reply = rest_put(cluster_metadata_url(port), json.dumps(payload), 20)

    if reply and reply.status_code == 200:
        # pai_config accepted, continue with trial_config
        return set_trial_config(experiment_config, port), None

    if reply is None:
        # no response at all, so there is nothing to log
        return False, None

    error_text = reply.text
    log_path = os.path.join(NNICTL_HOME_DIR, str(port), 'stderr')
    with open(log_path, 'a+') as log_file:
        pretty = json.dumps(json.loads(error_text), indent=4, sort_keys=True, separators=(',', ':'))
        log_file.write(pretty)
    return False, error_text

Deshui Yu's avatar
Deshui Yu committed
136
137
138
139
140
141
142
143
def set_experiment(experiment_config, mode, port):
    '''Call startExperiment (rest POST /experiment) with yaml file content

    experiment_config: experiment configuration; must hold 'authorName',
                       'experimentName', 'trialConcurrency', 'maxExecDuration',
                       'maxTrialNum', 'tuner' and 'trainingServicePlatform'
    mode: not used by this function; kept for the existing call signature
    port: rest server port; also names the log directory of the experiment

    Returns the response object when the rest server accepted the request,
    otherwise None. On failure the error body is appended to the experiment's
    stderr log and reported with print_error.
    '''
    request_data = dict()
    for key in ('authorName', 'experimentName', 'trialConcurrency', 'maxExecDuration', 'maxTrialNum'):
        request_data[key] = experiment_config[key]
    request_data['searchSpace'] = experiment_config.get('searchSpace')
    request_data['trainingServicePlatform'] = experiment_config.get('trainingServicePlatform')

    # optional fields are forwarded only when they carry a truthy value
    for key in ('description', 'multiPhase'):
        if experiment_config.get(key):
            request_data[key] = experiment_config[key]

    request_data['tuner'] = experiment_config['tuner']
    if 'assessor' in experiment_config:
        request_data['assessor'] = experiment_config['assessor']

    cluster_metadata = []
    request_data['clusterMetaData'] = cluster_metadata
    platform = experiment_config['trainingServicePlatform']
    if platform == 'local':
        trial = experiment_config['trial']
        cluster_metadata.append({'key': 'codeDir', 'value': trial['codeDir']})
        cluster_metadata.append({'key': 'command', 'value': trial['command']})
    elif platform == 'remote':
        trial = experiment_config['trial']
        cluster_metadata.append({'key': 'machine_list', 'value': experiment_config['machineList']})
        value_dict = dict()
        for key in ('command', 'codeDir', 'gpuNum'):
            value_dict[key] = trial[key]
        cluster_metadata.append({'key': 'trial_config', 'value': value_dict})
    elif platform == 'pai':
        trial = experiment_config['trial']
        cluster_metadata.append({'key': 'pai_config', 'value': experiment_config['paiConfig']})
        value_dict = dict()
        for key in ('command', 'codeDir', 'gpuNum'):
            value_dict[key] = trial[key]
        # pai additionally forwards the optional resource/location settings
        for key in ('cpuNum', 'memoryMB', 'image', 'dataDir', 'outputDir'):
            if trial.get(key):
                value_dict[key] = trial[key]
        cluster_metadata.append({'key': 'trial_config', 'value': value_dict})

    response = rest_post(experiment_url(port), json.dumps(request_data), 20)
    if response is not None and check_response(response):
        return response

    # NOTE(review): rest_post is assumed to be able to return None like
    # rest_put (see the None check in set_remote_config) - confirm.
    if response is None:
        print_error('Setting experiment error, error message is {}'.format('no response from rest server'))
        return None

    stderr_full_path = os.path.join(NNICTL_HOME_DIR, str(port), 'stderr')
    with open(stderr_full_path, 'a+') as fout:
        try:
            detail = json.dumps(json.loads(response.text), indent=4, sort_keys=True, separators=(',', ':'))
        except ValueError:
            # the error body is not JSON; log it as received
            detail = response.text
        fout.write(detail)
    print_error('Setting experiment error, error message is {}'.format(response.text))
    return None
Deshui Yu's avatar
Deshui Yu committed
199

goooxu's avatar
goooxu committed
200
def _kill_rest_server_and_exit(rest_process, stopped_message):
    '''Run `pkill -P <pid of rest_process>` and exit with status 1.

    If running pkill itself raises, an Exception built from ERROR_INFO and
    `stopped_message` is raised instead of exiting.
    '''
    try:
        cmds = ['pkill', '-P', str(rest_process.pid)]
        call(cmds)
    except Exception:
        raise Exception(ERROR_INFO % stopped_message)
    exit(1)


def launch_experiment(args, experiment_config, mode, experiment_id=None):
    '''follow steps to start rest server and start experiment

    args: parsed command line arguments; only `args.port` is read here
    experiment_config: experiment configuration; its 'searchSpace' entry (and,
                       with annotation, trial 'codeDir') is rewritten in place
    mode: start mode forwarded to start_rest_server and set_experiment
    experiment_id: id of the experiment to resume; when None the id returned
                   by the rest server is used

    Exits with status 1 (after running pkill on the rest server process) when
    the rest server does not come up or rejects any configuration step.
    '''
    nni_config = Config(args.port)
    platform = experiment_config['trainingServicePlatform']

    # start rest server
    rest_process, start_time = start_rest_server(args.port, platform, mode, experiment_id)
    nni_config.set_config('restServerPid', rest_process.pid)

    # Deal with annotation
    if experiment_config.get('useAnnotation'):
        path = os.path.join(tempfile.gettempdir(), 'nni', 'annotation')
        if not os.path.isdir(path):
            os.makedirs(path)
        path = tempfile.mkdtemp(dir=path)
        code_dir = expand_annotations(experiment_config['trial']['codeDir'], path)
        experiment_config['trial']['codeDir'] = code_dir
        search_space = generate_search_space(code_dir)
        experiment_config['searchSpace'] = json.dumps(search_space)
        # explicit raise instead of `assert`, so the check survives `python -O`
        if not search_space:
            raise AssertionError(ERROR_INFO % 'Generated search space is empty')
    elif experiment_config.get('searchSpacePath'):
        search_space = get_json_content(experiment_config.get('searchSpacePath'))
        experiment_config['searchSpace'] = json.dumps(search_space)
    else:
        experiment_config['searchSpace'] = json.dumps('')

    # check rest server
    running, _ = check_rest_server(args.port)
    if running:
        print_normal('Successfully started Restful server!')
    else:
        print_error('Restful server start failed!')
        _kill_rest_server_and_exit(rest_process, 'Rest server stopped!')

    # set remote config
    if platform == 'remote':
        print_normal('Setting remote config...')
        config_result, err_msg = set_remote_config(experiment_config, args.port)
        if config_result:
            print_normal('Successfully set remote config!')
        else:
            print_error('Failed! Error is: {}'.format(err_msg))
            _kill_rest_server_and_exit(rest_process, 'Rest server stopped!')

    # set local config
    if platform == 'local':
        print_normal('Setting local config...')
        if set_local_config(experiment_config, args.port):
            print_normal('Successfully set local config!')
        else:
            print_error('Failed!')
            _kill_rest_server_and_exit(rest_process, 'Rest server stopped!')

    # set pai config
    if platform == 'pai':
        print_normal('Setting pai config...')
        config_result, err_msg = set_pai_config(experiment_config, args.port)
        if config_result:
            print_normal('Successfully set pai config!')
        else:
            if err_msg:
                print_error('Failed! Error is: {}'.format(err_msg))
            _kill_rest_server_and_exit(rest_process, 'Restful server stopped!')

    # start a new experiment
    print_normal('Starting experiment...')
    response = set_experiment(experiment_config, mode, args.port)
    if response:
        if experiment_id is None:
            experiment_id = json.loads(response.text).get('experiment_id')
        nni_config.set_config('experimentId', experiment_id)
    else:
        print_error('Failed!')
        _kill_rest_server_and_exit(rest_process, 'Restful server stopped!')
    web_ui_url_list = get_web_ui_urls(args.port)

    # save experiment information (separate name: do not rebind the
    # experiment_config parameter)
    experiments = Experiments()
    experiments.add_experiment(experiment_id, args.port, start_time)

    print_normal(EXPERIMENT_SUCCESS_INFO % (experiment_id, '   '.join(web_ui_url_list)))
Deshui Yu's avatar
Deshui Yu committed
303
304
305

def resume_experiment(args):
    '''Relaunch the experiment stored in the configuration for args.port.

    Reads the saved experiment configuration and experiment id, then starts
    the launch sequence in 'resume' mode.
    '''
    stored = Config(args.port)
    saved_config = stored.get_config('experimentConfig')
    saved_id = stored.get_config('experimentId')
    launch_experiment(args, saved_config, 'resume', saved_id)
Deshui Yu's avatar
Deshui Yu committed
310
311
312

def create_experiment(args):
    '''Create and launch a new experiment from the yml file at args.config.

    Exits with status 1 when the configuration file does not exist. The
    validated configuration is stored before the launch, and the rest server
    port is recorded after it.
    '''
    store = Config(args.port)
    yml_path = os.path.abspath(args.config)
    path_found = os.path.exists(yml_path)
    if not path_found:
        print_error('Please set correct config path!')
        exit(1)

    content = get_yml_content(yml_path)
    validate_all_content(content, yml_path)
    store.set_config('experimentConfig', content)

    launch_experiment(args, content, 'new')
    store.set_config('restServerPort', args.port)