"git@developer.sourcefind.cn:gaoqiong/migraphx.git" did not exist on "c7d194ea568679c95b418a796c07f57271ffd3dc"
Unverified Commit a3f48b8b authored by SparkSnail's avatar SparkSnail Committed by GitHub
Browse files

Fix localTrainingService cancel logic and nnictl logic (#334)

Fix nnictl stop logic
Fix localTrainingService cancelJob logic
Show port information in the output of the "nnictl experiment list" command.
Show more information when config file validation fails.
Add logic to nnictl to check that the adjacent port (port + 1) is free when the platform is PAI.
parent 55493edf
......@@ -120,14 +120,17 @@ class LocalTrainingService implements TrainingService {
while (!this.stopping) {
while (this.jobQueue.length !== 0) {
const trialJobId: string = this.jobQueue[0];
const trialJobDeatil = this.jobMap.get(trialJobId)
if (trialJobDeatil !== undefined && trialJobDeatil.status === 'WAITING'){
const [success, resource] = this.tryGetAvailableResource();
if (!success) {
break;
}
this.occupyResource(resource);
this.jobQueue.shift();
await this.runTrialJob(trialJobId, resource);
}
this.jobQueue.shift();
}
await delay(5000);
}
}
......@@ -249,6 +252,10 @@ class LocalTrainingService implements TrainingService {
if (trialJob === undefined) {
throw new NNIError(NNIErrorNames.NOT_FOUND, 'Trial job not found');
}
if (trialJob.pid === undefined){
this.setTrialJobStatus(trialJob, 'USER_CANCELED');
return;
}
if (trialJob.form.jobType === 'TRIAL') {
await tkill(trialJob.pid, 'SIGKILL');
} else if (trialJob.form.jobType === 'HOST') {
......
......@@ -59,7 +59,7 @@ EXPERIMENT_INFORMATION_FORMAT = '-----------------------------------------------
'%s\n' \
'-----------------------------------------------------------------------\n'
EXPERIMENT_DETAIL_FORMAT = 'Id: %s Status: %s StartTime: %s EndTime: %s \n'
EXPERIMENT_DETAIL_FORMAT = 'Id: %s Status: %s Port: %s StartTime: %s EndTime: %s \n'
PACKAGE_REQUIREMENTS = {
'SMAC': 'smac_tuner'
......
......@@ -62,7 +62,14 @@ def start_rest_server(port, platform, mode, config_file_name, experiment_id=None
'''Run nni manager process'''
nni_config = Config(config_file_name)
if detect_port(port):
print_error('Port %s is used by another process, please reset the port!' % port)
print_error('Port %s is used by another process, please reset the port!\n' \
'You could use \'nnictl create --help\' to get help information' % port)
exit(1)
if platform == 'pai' and detect_port(int(port) + 1):
print_error('PAI mode need an additional adjacent port %d, and the port %d is used by another process!\n' \
'You could set another port to start experiment!\n' \
'You could use \'nnictl create --help\' to get help information' % ((int(port) + 1), (int(port) + 1)))
exit(1)
print_normal('Starting restful server...')
......
......@@ -103,7 +103,8 @@ def validate_common_content(experiment_config):
experiment_config['machineList'][index]['port'] = 22
except Exception as exception:
raise Exception(exception)
print_error('Your config file is not correct, please check your config file content!\n%s' % exception)
exit(1)
def parse_tuner_content(experiment_config):
'''Validate whether tuner in experiment_config is valid'''
......
......@@ -53,7 +53,7 @@ def check_experiment_id(args):
experiment_information = ""
for key in running_experiment_list:
experiment_information += (EXPERIMENT_DETAIL_FORMAT % (key, experiment_dict[key]['status'], \
experiment_dict[key]['startTime'], experiment_dict[key]['endTime']))
experiment_dict[key]['port'], experiment_dict[key]['startTime'], experiment_dict[key]['endTime']))
print(EXPERIMENT_INFORMATION_FORMAT % experiment_information)
exit(1)
elif not running_experiment_list:
......@@ -96,7 +96,7 @@ def parse_ids(args):
experiment_information = ""
for key in running_experiment_list:
experiment_information += (EXPERIMENT_DETAIL_FORMAT % (key, experiment_dict[key]['status'], \
experiment_dict[key]['startTime'], experiment_dict[key]['endTime']))
experiment_dict[key]['port'], experiment_dict[key]['startTime'], experiment_dict[key]['endTime']))
print(EXPERIMENT_INFORMATION_FORMAT % experiment_information)
exit(1)
else:
......@@ -184,7 +184,10 @@ def stop_experiment(args):
if running:
response = rest_delete(experiment_url(rest_port), 20)
if not response or not check_response(response):
print_error('Stop experiment failed!')
if response:
print_error(response.text)
else:
print_error('No response from restful server!')
stop_rest_result = False
#sleep to wait rest handler done
time.sleep(3)
......@@ -365,7 +368,7 @@ def experiment_list(args):
print_warning('There is no experiment running...\nYou can use \'nnictl experiment list all\' to list all stopped experiments!')
experiment_information = ""
for key in experiment_id_list:
experiment_information += (EXPERIMENT_DETAIL_FORMAT % (key, experiment_dict[key]['status'], \
experiment_information += (EXPERIMENT_DETAIL_FORMAT % (key, experiment_dict[key]['status'], experiment_dict[key]['port'],\
experiment_dict[key]['startTime'], experiment_dict[key]['endTime']))
print(EXPERIMENT_INFORMATION_FORMAT % experiment_information)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment