launcher.py 14.3 KB
Newer Older
Deshui Yu's avatar
Deshui Yu committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge,
# to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


import json
import os
import shutil
25
import string
26
from subprocess import Popen, PIPE, call
Deshui Yu's avatar
Deshui Yu committed
27
import tempfile
28
from nni_annotation import *
Deshui Yu's avatar
Deshui Yu committed
29
from .launcher_utils import validate_all_content
30
from .rest_utils import rest_put, rest_post, check_rest_server, check_rest_server_quick, check_response
Deshui Yu's avatar
Deshui Yu committed
31
32
from .url_utils import cluster_metadata_url, experiment_url
from .config_utils import Config
33
34
from .common_utils import get_yml_content, get_json_content, print_error, print_normal, print_warning, detect_process
from .constants import *
Deshui Yu's avatar
Deshui Yu committed
35

36
def start_rest_server(port, platform, mode, experiment_id=None):
    '''Run nni manager process.

    Spawns the manager executable (named by the NNI_MANAGER environment
    variable, 'nnimanager' when unset) and appends its output to the
    'stdout' and 'stderr' files under HOME_DIR/<port>.

    Args:
        port: passed to the manager as --port; also selects the per-port
            directory under HOME_DIR that receives the log files.
        platform: passed to the manager as --mode.
        mode: passed to the manager as --start_mode; when it equals
            'resume', --experiment_id is appended as well.
        experiment_id: id handed to the manager on resume; unused otherwise.

    Returns:
        The subprocess.Popen object of the spawned manager process.
    '''
    print_normal('Checking environment...')
    nni_config = Config(port)
    rest_port = nni_config.get_config('restServerPort')
    running, _ = check_rest_server_quick(rest_port)
    if rest_port and running:
        print_error('There is an experiment running, please stop it first...')
        print_normal('You can use \'nnictl stop\' command to stop an experiment!')
        # This is an error path: report a non-zero status, as
        # launch_experiment does for the same condition.
        exit(1)

    print_normal('Starting restful server...')
    manager = os.environ.get('NNI_MANAGER', 'nnimanager')
    cmds = [manager, '--port', str(port), '--mode', platform, '--start_mode', mode]
    if mode == 'resume':
        cmds += ['--experiment_id', experiment_id]

    log_dir = os.path.join(HOME_DIR, str(port))
    stdout_full_path = os.path.join(log_dir, 'stdout')
    stderr_full_path = os.path.join(log_dir, 'stderr')
    # The child receives its own copies of the descriptors when it is
    # spawned, so the parent's handles can be closed right away instead of
    # leaking for the lifetime of this process.
    with open(stdout_full_path, 'a+') as stdout_file, \
            open(stderr_full_path, 'a+') as stderr_file:
        process = Popen(cmds, stdout=stdout_file, stderr=stderr_file)
    return process

59
60
def set_trial_config(experiment_config, port):
    '''Set trial configuration.

    Sends the 'trial' section of the experiment config to the cluster
    metadata endpoint as the 'trial_config' entry.

    Args:
        experiment_config: experiment configuration mapping; its 'trial'
            entry must provide 'command', 'codeDir' and 'gpuNum'.
        port: rest server port; also selects the per-port stderr log file.

    Returns:
        True when the rest server accepted the configuration, else False.
    '''
    trial = experiment_config['trial']
    value_dict = dict()
    value_dict['command'] = trial['command']
    value_dict['codeDir'] = trial['codeDir']
    value_dict['gpuNum'] = trial['gpuNum']
    # Optional settings are forwarded only when present and truthy.
    for key in ('cpuNum', 'memoryMB', 'image', 'dataDir', 'outputDir'):
        if trial.get(key):
            value_dict[key] = trial[key]

    request_data = {'trial_config': value_dict}
    response = rest_put(cluster_metadata_url(port), json.dumps(request_data), 20)
    if response is not None and check_response(response):
        return True

    # The sibling setters in this file treat a None response as possible
    # (request never completed); in that case there is no body to report.
    if response is not None:
        print('Error message is {}'.format(response.text))
        stderr_full_path = os.path.join(HOME_DIR, str(port), 'stderr')
        with open(stderr_full_path, 'a+') as fout:
            fout.write(json.dumps(json.loads(response.text), indent=4, sort_keys=True, separators=(',', ':')))
    return False
86
87
88
89

def set_local_config(experiment_config, port):
    '''Configure the local platform, which only needs the trial configuration.

    Returns whatever set_trial_config returns for the same arguments.
    '''
    trial_config_ok = set_trial_config(experiment_config, port)
    return trial_config_ok
Deshui Yu's avatar
Deshui Yu committed
90
91
92
93
94
95
96

def set_remote_config(experiment_config, port):
    '''Push the machine list, then the trial configuration, to the rest server.

    Returns a (success, error_message) tuple. error_message is the response
    body when the machine-list request was rejected and '' in every other
    case.
    '''
    machine_payload = {'machine_list': experiment_config['machineList']}
    response = rest_put(cluster_metadata_url(port), json.dumps(machine_payload), 20)

    if response and check_response(response):
        # Machine list accepted: continue with the trial configuration.
        return set_trial_config(experiment_config, port), ''

    if response is None:
        # No response object at all, so there is nothing to log.
        return False, ''

    err_message = response.text
    log_path = os.path.join(HOME_DIR, str(port), 'stderr')
    with open(log_path, 'a+') as fout:
        pretty = json.dumps(json.loads(err_message), indent=4, sort_keys=True, separators=(',', ':'))
        fout.write(pretty)
    return False, err_message
Deshui Yu's avatar
Deshui Yu committed
108

109
110
111
112
113
def set_pai_config(experiment_config, port):
    '''Set pai configuration.

    Sends experiment_config['paiConfig'] to the cluster metadata endpoint
    as 'pai_config' and, when that succeeds, sends the trial configuration.

    Args:
        experiment_config: experiment configuration mapping with a
            'paiConfig' entry.
        port: rest server port; also selects the per-port stderr log file.

    Returns:
        A (success, error_message) tuple. error_message is the response
        body when the pai_config request was rejected, otherwise None.
    '''
    pai_config_data = {'pai_config': experiment_config['paiConfig']}
    response = rest_put(cluster_metadata_url(port), json.dumps(pai_config_data), 20)
    err_message = None
    if not response or response.status_code != 200:
        if response is not None:
            err_message = response.text
            # Log to the per-port stderr file, like the other setters in
            # this file, rather than to a single global path.
            stderr_full_path = os.path.join(HOME_DIR, str(port), 'stderr')
            with open(stderr_full_path, 'a+') as fout:
                fout.write(json.dumps(json.loads(err_message), indent=4, sort_keys=True, separators=(',', ':')))
        return False, err_message

    #set trial_config
    return set_trial_config(experiment_config, port), err_message

Deshui Yu's avatar
Deshui Yu committed
125
126
127
128
129
130
131
132
def set_experiment(experiment_config, mode, port):
    '''Call startExperiment (rest POST /experiment) with yaml file content.

    Args:
        experiment_config: experiment configuration mapping.
        mode: accepted for interface compatibility; not used in the body.
        port: rest server port; also selects the per-port stderr log file.

    Returns:
        The response object when the rest server accepted the request,
        otherwise None (the error is reported before returning).
    '''
    request_data = dict()
    for key in ('authorName', 'experimentName', 'trialConcurrency',
                'maxExecDuration', 'maxTrialNum'):
        request_data[key] = experiment_config[key]
    request_data['searchSpace'] = experiment_config.get('searchSpace')
    request_data['trainingServicePlatform'] = experiment_config.get('trainingServicePlatform')

    # Optional fields are forwarded only when present and truthy.
    for key in ('description', 'multiPhase'):
        if experiment_config.get(key):
            request_data[key] = experiment_config[key]

    request_data['tuner'] = experiment_config['tuner']
    if 'assessor' in experiment_config:
        request_data['assessor'] = experiment_config['assessor']

    cluster_meta = []
    request_data['clusterMetaData'] = cluster_meta
    platform = experiment_config['trainingServicePlatform']
    if platform == 'local':
        trial = experiment_config['trial']
        cluster_meta.append({'key': 'codeDir', 'value': trial['codeDir']})
        cluster_meta.append({'key': 'command', 'value': trial['command']})
    elif platform in ('remote', 'pai'):
        if platform == 'remote':
            cluster_meta.append(
                {'key': 'machine_list', 'value': experiment_config['machineList']})
        else:
            cluster_meta.append(
                {'key': 'pai_config', 'value': experiment_config['paiConfig']})
        trial = experiment_config['trial']
        value_dict = dict()
        value_dict['command'] = trial['command']
        value_dict['codeDir'] = trial['codeDir']
        value_dict['gpuNum'] = trial['gpuNum']
        if platform == 'pai':
            # Only the pai platform forwards these optional trial settings.
            for key in ('cpuNum', 'memoryMB', 'image', 'dataDir', 'outputDir'):
                if trial.get(key):
                    value_dict[key] = trial[key]
        cluster_meta.append({'key': 'trial_config', 'value': value_dict})

    response = rest_post(experiment_url(port), json.dumps(request_data), 20)
    if response is not None and check_response(response):
        return response

    if response is None:
        # Without a response object there is no body to log or print.
        print_error('Setting experiment error, no response from rest server')
        return None

    stderr_full_path = os.path.join(HOME_DIR, str(port), 'stderr')
    with open(stderr_full_path, 'a+') as fout:
        fout.write(json.dumps(json.loads(response.text), indent=4, sort_keys=True, separators=(',', ':')))
    print_error('Setting experiment error, error message is {}'.format(response.text))
    return None
Deshui Yu's avatar
Deshui Yu committed
188

goooxu's avatar
goooxu committed
189
def _abort_launch(rest_process, stopped_message):
    '''Run pkill against the children of the rest server, then exit with status 1.

    `pkill -P <pid>` targets processes whose parent is <pid>. If invoking
    pkill itself raises, an Exception carrying stopped_message is raised
    instead of exiting.
    '''
    try:
        call(['pkill', '-P', str(rest_process.pid)])
    except Exception:
        raise Exception(ERROR_INFO % stopped_message)
    exit(1)


def launch_experiment(args, experiment_config, mode, experiment_id=None):
    '''Follow steps to start rest server and start experiment.

    Steps: refuse to start when a recorded rest server pid is still alive,
    spawn the rest server, prepare the search space, wait for the server,
    push the platform-specific configuration, then start the experiment.
    Any failure after the server was spawned runs the pkill cleanup and
    exits with status 1.

    Args:
        args: parsed command-line arguments; args.port is read.
        experiment_config: experiment configuration mapping. It is modified
            in place: 'searchSpace' is always set, and with 'useAnnotation'
            the trial 'codeDir' is replaced by the expanded copy.
        mode: start mode forwarded to the rest server and set_experiment.
        experiment_id: id of the experiment to resume; when None the id is
            read from the start-experiment response.

    Raises:
        AssertionError: when annotation yields an empty search space.
    '''
    nni_config = Config(args.port)
    #Check if there is an experiment running
    origin_rest_pid = nni_config.get_config('restServerPid')
    if origin_rest_pid and detect_process(origin_rest_pid):
        print_error('There is an experiment running, please stop it first...')
        print_normal('You can use \'nnictl stop\' command to stop an experiment!')
        exit(1)

    # start rest server
    platform = experiment_config['trainingServicePlatform']
    rest_process = start_rest_server(args.port, platform, mode, experiment_id)
    nni_config.set_config('restServerPid', rest_process.pid)

    # Deal with annotation
    if experiment_config.get('useAnnotation'):
        path = os.path.join(tempfile.gettempdir(), 'nni', 'annotation')
        if not os.path.isdir(path):
            os.makedirs(path)
        # Each launch gets its own fresh directory for the expanded code.
        path = tempfile.mkdtemp(dir=path)
        code_dir = expand_annotations(experiment_config['trial']['codeDir'], path)
        experiment_config['trial']['codeDir'] = code_dir
        search_space = generate_search_space(code_dir)
        experiment_config['searchSpace'] = json.dumps(search_space)
        # Explicit raise (same exception type as the former assert) so the
        # check is not stripped when Python runs with -O.
        if not search_space:
            raise AssertionError(ERROR_INFO % 'Generated search space is empty')
    elif experiment_config.get('searchSpacePath'):
        search_space = get_json_content(experiment_config.get('searchSpacePath'))
        experiment_config['searchSpace'] = json.dumps(search_space)
    else:
        experiment_config['searchSpace'] = json.dumps('')

    # check rest server
    running, _ = check_rest_server(args.port)
    if running:
        print_normal('Successfully started Restful server!')
    else:
        print_error('Restful server start failed!')
        _abort_launch(rest_process, 'Rest server stopped!')

    # The platform values are mutually exclusive, so at most one branch runs.
    if platform == 'remote':
        # set remote config
        print_normal('Setting remote config...')
        config_result, err_msg = set_remote_config(experiment_config, args.port)
        if config_result:
            print_normal('Success!')
        else:
            print_error('Failed! Error is: {}'.format(err_msg))
            _abort_launch(rest_process, 'Rest server stopped!')
    elif platform == 'local':
        # set local config
        print_normal('Setting local config...')
        if set_local_config(experiment_config, args.port):
            print_normal('Successfully set local config!')
        else:
            print_error('Failed!')
            _abort_launch(rest_process, 'Rest server stopped!')
    elif platform == 'pai':
        #set pai config
        print_normal('Setting pai config...')
        config_result, err_msg = set_pai_config(experiment_config, args.port)
        if config_result:
            print_normal('Successfully set pai config!')
        else:
            if err_msg:
                print_error('Failed! Error is: {}'.format(err_msg))
            _abort_launch(rest_process, 'Restful server stopped!')

    # start a new experiment
    print_normal('Starting experiment...')
    response = set_experiment(experiment_config, mode, args.port)
    if response:
        if experiment_id is None:
            experiment_id = json.loads(response.text).get('experiment_id')
        nni_config.set_config('experimentId', experiment_id)
    else:
        print_error('Failed!')
        _abort_launch(rest_process, 'Restful server stopped!')
    print_normal(EXPERIMENT_SUCCESS_INFO % (experiment_id, args.port))
Deshui Yu's avatar
Deshui Yu committed
292
293
294

def resume_experiment(args):
    '''Relaunch, in 'resume' mode, the experiment recorded for args.port.

    The experiment configuration and id are read back from the per-port
    Config store and handed to launch_experiment.
    '''
    stored = Config(args.port)
    saved_config = stored.get_config('experimentConfig')
    saved_id = stored.get_config('experimentId')
    launch_experiment(args, saved_config, 'resume', saved_id)
Deshui Yu's avatar
Deshui Yu committed
299
300
301

def create_experiment(args):
    '''Launch a new experiment from the config file named by args.config.

    Exits with status 1 when the config file does not exist. Otherwise the
    file is loaded and validated, stored under 'experimentConfig', the
    experiment is launched in 'new' mode, and args.port is then stored
    under 'restServerPort'.
    '''
    store = Config(args.port)
    yml_path = os.path.abspath(args.config)
    config_found = os.path.exists(yml_path)
    if not config_found:
        print_error('Please set correct config path!')
        exit(1)

    loaded_config = get_yml_content(yml_path)
    validate_all_content(loaded_config, yml_path)
    store.set_config('experimentConfig', loaded_config)

    launch_experiment(args, loaded_config, 'new')
    # The port is recorded only after launch_experiment has returned.
    store.set_config('restServerPort', args.port)