Commit eaf42120 authored by suiguoxin
Browse files

squash commits in v1.0 first round bug bash

parent f721b431
...@@ -103,11 +103,11 @@ def stop_experiment_test(): ...@@ -103,11 +103,11 @@ def stop_experiment_test():
snooze() snooze()
assert not detect_port(8990), '`nnictl stop %s` failed to stop experiments' % experiment_id assert not detect_port(8990), '`nnictl stop %s` failed to stop experiments' % experiment_id
# test cmd `nnictl stop all` # test cmd `nnictl stop --all`
proc = subprocess.run(['nnictl', 'stop', 'all']) proc = subprocess.run(['nnictl', 'stop', '--all'])
assert proc.returncode == 0, '`nnictl stop all` failed with code %d' % proc.returncode assert proc.returncode == 0, '`nnictl stop --all` failed with code %d' % proc.returncode
snooze() snooze()
assert not detect_port(8888) and not detect_port(8989), '`nnictl stop all` failed to stop experiments' assert not detect_port(8888) and not detect_port(8989), '`nnictl stop --all` failed to stop experiments'
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -54,7 +54,7 @@ def start_container(image, name, nnimanager_os): ...@@ -54,7 +54,7 @@ def start_container(image, name, nnimanager_os):
else: else:
return '/tmp/nni/dist/{0}'.format(wheel_name) return '/tmp/nni/dist/{0}'.format(wheel_name)
pip_cmds = ['docker', 'exec', name, 'python3', '-m', 'pip', 'install', '--upgrade', 'pip'] pip_cmds = ['docker', 'exec', name, 'python3', '-m', 'pip', 'install', '--upgrade', 'pip', 'setuptools==39.1.0']
check_call(pip_cmds) check_call(pip_cmds)
sdk_cmds = ['docker', 'exec', name, 'python3', '-m', 'pip', 'install', get_dist(wheel_name)] sdk_cmds = ['docker', 'exec', name, 'python3', '-m', 'pip', 'install', get_dist(wheel_name)]
check_call(sdk_cmds) check_call(sdk_cmds)
......
...@@ -7,7 +7,7 @@ __nnictl_update_searchspace_cmds="--filename" ...@@ -7,7 +7,7 @@ __nnictl_update_searchspace_cmds="--filename"
__nnictl_update_concurrency_cmds="--value" __nnictl_update_concurrency_cmds="--value"
__nnictl_update_duration_cmds="--value" __nnictl_update_duration_cmds="--value"
__nnictl_update_trialnum_cmds="--value" __nnictl_update_trialnum_cmds="--value"
__nnictl_stop_cmds="--port all" __nnictl_stop_cmds="--port --all"
__nnictl_trial_cmds="ls kill codegen" __nnictl_trial_cmds="ls kill codegen"
__nnictl_trial_kill_cmds="--trial_id" __nnictl_trial_kill_cmds="--trial_id"
__nnictl_trial_codegen_cmds="--trial_id" __nnictl_trial_codegen_cmds="--trial_id"
......
...@@ -70,8 +70,8 @@ common_schema = { ...@@ -70,8 +70,8 @@ common_schema = {
} }
} }
tuner_schema_dict = { tuner_schema_dict = {
('TPE', 'Anneal', 'SMAC'): { ('Anneal', 'SMAC'): {
'builtinTunerName': setChoice('builtinTunerName', 'TPE', 'Anneal', 'SMAC'), 'builtinTunerName': setChoice('builtinTunerName', 'Anneal', 'SMAC'),
Optional('classArgs'): { Optional('classArgs'): {
'optimize_mode': setChoice('optimize_mode', 'maximize', 'minimize'), 'optimize_mode': setChoice('optimize_mode', 'maximize', 'minimize'),
}, },
...@@ -94,7 +94,7 @@ tuner_schema_dict = { ...@@ -94,7 +94,7 @@ tuner_schema_dict = {
}, },
'TPE': { 'TPE': {
'builtinTunerName': 'TPE', 'builtinTunerName': 'TPE',
'classArgs': { Optional('classArgs'): {
Optional('optimize_mode'): setChoice('optimize_mode', 'maximize', 'minimize'), Optional('optimize_mode'): setChoice('optimize_mode', 'maximize', 'minimize'),
Optional('parallel_optimize'): setType('parallel_optimize', bool), Optional('parallel_optimize'): setType('parallel_optimize', bool),
Optional('constant_liar_type'): setChoice('constant_liar_type', 'min', 'max', 'mean') Optional('constant_liar_type'): setChoice('constant_liar_type', 'min', 'max', 'mean')
...@@ -104,7 +104,7 @@ tuner_schema_dict = { ...@@ -104,7 +104,7 @@ tuner_schema_dict = {
}, },
'NetworkMorphism': { 'NetworkMorphism': {
'builtinTunerName': 'NetworkMorphism', 'builtinTunerName': 'NetworkMorphism',
'classArgs': { Optional('classArgs'): {
Optional('optimize_mode'): setChoice('optimize_mode', 'maximize', 'minimize'), Optional('optimize_mode'): setChoice('optimize_mode', 'maximize', 'minimize'),
Optional('task'): setChoice('task', 'cv','nlp','common'), Optional('task'): setChoice('task', 'cv','nlp','common'),
Optional('input_width'): setType('input_width', int), Optional('input_width'): setType('input_width', int),
...@@ -116,7 +116,7 @@ tuner_schema_dict = { ...@@ -116,7 +116,7 @@ tuner_schema_dict = {
}, },
'MetisTuner': { 'MetisTuner': {
'builtinTunerName': 'MetisTuner', 'builtinTunerName': 'MetisTuner',
'classArgs': { Optional('classArgs'): {
Optional('optimize_mode'): setChoice('optimize_mode', 'maximize', 'minimize'), Optional('optimize_mode'): setChoice('optimize_mode', 'maximize', 'minimize'),
Optional('no_resampling'): setType('no_resampling', bool), Optional('no_resampling'): setType('no_resampling', bool),
Optional('no_candidates'): setType('no_candidates', bool), Optional('no_candidates'): setType('no_candidates', bool),
...@@ -128,7 +128,7 @@ tuner_schema_dict = { ...@@ -128,7 +128,7 @@ tuner_schema_dict = {
}, },
'GPTuner': { 'GPTuner': {
'builtinTunerName': 'GPTuner', 'builtinTunerName': 'GPTuner',
'classArgs': { Optional('classArgs'): {
Optional('optimize_mode'): setChoice('optimize_mode', 'maximize', 'minimize'), Optional('optimize_mode'): setChoice('optimize_mode', 'maximize', 'minimize'),
Optional('utility'): setChoice('utility', 'ei', 'ucb', 'poi'), Optional('utility'): setChoice('utility', 'ei', 'ucb', 'poi'),
Optional('kappa'): setType('kappa', float), Optional('kappa'): setType('kappa', float),
......
...@@ -519,14 +519,14 @@ def resume_experiment(args): ...@@ -519,14 +519,14 @@ def resume_experiment(args):
#find the latest stopped experiment #find the latest stopped experiment
if not args.id: if not args.id:
print_error('Please set experiment id! \nYou could use \'nnictl resume {id}\' to resume a stopped experiment!\n' \ print_error('Please set experiment id! \nYou could use \'nnictl resume {id}\' to resume a stopped experiment!\n' \
'You could use \'nnictl experiment list all\' to show all of stopped experiments!') 'You could use \'nnictl experiment list --all\' to show all experiments!')
exit(1) exit(1)
else: else:
if experiment_dict.get(args.id) is None: if experiment_dict.get(args.id) is None:
print_error('Id %s not exist!' % args.id) print_error('Id %s not exist!' % args.id)
exit(1) exit(1)
if experiment_dict[args.id]['status'] != 'STOPPED': if experiment_dict[args.id]['status'] != 'STOPPED':
print_error('Experiment %s is running!' % args.id) print_error('Only stopped experiments can be resumed!')
exit(1) exit(1)
experiment_id = args.id experiment_id = args.id
print_normal('Resuming experiment %s...' % experiment_id) print_normal('Resuming experiment %s...' % experiment_id)
......
...@@ -91,6 +91,7 @@ def parse_args(): ...@@ -91,6 +91,7 @@ def parse_args():
parser_stop = subparsers.add_parser('stop', help='stop the experiment') parser_stop = subparsers.add_parser('stop', help='stop the experiment')
parser_stop.add_argument('id', nargs='?', help='the id of experiment, use \'all\' to stop all running experiments') parser_stop.add_argument('id', nargs='?', help='the id of experiment, use \'all\' to stop all running experiments')
parser_stop.add_argument('--port', '-p', dest='port', help='the port of restful server') parser_stop.add_argument('--port', '-p', dest='port', help='the port of restful server')
parser_stop.add_argument('--all', '-a', action='store_true', help='stop all of experiments')
parser_stop.set_defaults(func=stop_experiment) parser_stop.set_defaults(func=stop_experiment)
#parse trial command #parse trial command
......
...@@ -22,7 +22,7 @@ import csv ...@@ -22,7 +22,7 @@ import csv
import os import os
import psutil import psutil
import json import json
import datetime from datetime import datetime, timezone
import time import time
import re import re
from pathlib import Path from pathlib import Path
...@@ -142,6 +142,8 @@ def parse_ids(args): ...@@ -142,6 +142,8 @@ def parse_ids(args):
elif isinstance(experiment_dict[key], list): elif isinstance(experiment_dict[key], list):
# if the config file is old version, remove the configuration from file # if the config file is old version, remove the configuration from file
experiment_config.remove_experiment(key) experiment_config.remove_experiment(key)
if args.all:
return running_experiment_list
if args.port is not None: if args.port is not None:
for key in running_experiment_list: for key in running_experiment_list:
if str(experiment_dict[key]['port']) == args.port: if str(experiment_dict[key]['port']) == args.port:
...@@ -160,8 +162,6 @@ def parse_ids(args): ...@@ -160,8 +162,6 @@ def parse_ids(args):
exit(1) exit(1)
else: else:
result_list = running_experiment_list result_list = running_experiment_list
elif args.id == 'all':
result_list = running_experiment_list
elif args.id.endswith('*'): elif args.id.endswith('*'):
for id in running_experiment_list: for id in running_experiment_list:
if id.startswith(args.id[:-1]): if id.startswith(args.id[:-1]):
...@@ -175,7 +175,7 @@ def parse_ids(args): ...@@ -175,7 +175,7 @@ def parse_ids(args):
if len(result_list) > 1: if len(result_list) > 1:
print_error(args.id + ' is ambiguous, please choose ' + ' '.join(result_list) ) print_error(args.id + ' is ambiguous, please choose ' + ' '.join(result_list) )
return None return None
if not result_list and ((args.id and args.id != 'all') or args.port): if not result_list and (args.id or args.port):
print_error('There are no experiments matched, please set correct experiment id or restful server port') print_error('There are no experiments matched, please set correct experiment id or restful server port')
elif not result_list: elif not result_list:
print_error('There is no experiment running...') print_error('There is no experiment running...')
...@@ -206,10 +206,10 @@ def convert_time_stamp_to_date(content): ...@@ -206,10 +206,10 @@ def convert_time_stamp_to_date(content):
start_time_stamp = content.get('startTime') start_time_stamp = content.get('startTime')
end_time_stamp = content.get('endTime') end_time_stamp = content.get('endTime')
if start_time_stamp: if start_time_stamp:
start_time = datetime.datetime.utcfromtimestamp(start_time_stamp // 1000).strftime("%Y/%m/%d %H:%M:%S") start_time = datetime.fromtimestamp(start_time_stamp // 1000, timezone.utc).astimezone().strftime("%Y/%m/%d %H:%M:%S")
content['startTime'] = str(start_time) content['startTime'] = str(start_time)
if end_time_stamp: if end_time_stamp:
end_time = datetime.datetime.utcfromtimestamp(end_time_stamp // 1000).strftime("%Y/%m/%d %H:%M:%S") end_time = datetime.fromtimestamp(end_time_stamp // 1000, timezone.utc).astimezone().strftime("%Y/%m/%d %H:%M:%S")
content['endTime'] = str(end_time) content['endTime'] = str(end_time)
return content return content
...@@ -225,6 +225,9 @@ def check_rest(args): ...@@ -225,6 +225,9 @@ def check_rest(args):
def stop_experiment(args): def stop_experiment(args):
'''Stop the experiment which is running''' '''Stop the experiment which is running'''
if args.id and args.id == 'all':
print_warning('\'nnictl stop all\' is abolished, please use \'nnictl stop --all\' to stop all of experiments!')
exit(1)
experiment_id_list = parse_ids(args) experiment_id_list = parse_ids(args)
if experiment_id_list: if experiment_id_list:
experiment_config = Experiments() experiment_config = Experiments()
...@@ -568,7 +571,7 @@ def experiment_list(args): ...@@ -568,7 +571,7 @@ def experiment_list(args):
if experiment_dict[key]['status'] != 'STOPPED': if experiment_dict[key]['status'] != 'STOPPED':
experiment_id_list.append(key) experiment_id_list.append(key)
if not experiment_id_list: if not experiment_id_list:
print_warning('There is no experiment running...\nYou can use \'nnictl experiment list --all\' to list all stopped experiments.') print_warning('There is no experiment running...\nYou can use \'nnictl experiment list --all\' to list all experiments.')
experiment_information = "" experiment_information = ""
for key in experiment_id_list: for key in experiment_id_list:
experiment_information += (EXPERIMENT_DETAIL_FORMAT % (key, experiment_dict[key]['status'], experiment_dict[key]['port'],\ experiment_information += (EXPERIMENT_DETAIL_FORMAT % (key, experiment_dict[key]['status'], experiment_dict[key]['port'],\
...@@ -581,7 +584,7 @@ def get_time_interval(time1, time2): ...@@ -581,7 +584,7 @@ def get_time_interval(time1, time2):
#convert time to timestamp #convert time to timestamp
time1 = time.mktime(time.strptime(time1, '%Y/%m/%d %H:%M:%S')) time1 = time.mktime(time.strptime(time1, '%Y/%m/%d %H:%M:%S'))
time2 = time.mktime(time.strptime(time2, '%Y/%m/%d %H:%M:%S')) time2 = time.mktime(time.strptime(time2, '%Y/%m/%d %H:%M:%S'))
seconds = (datetime.datetime.fromtimestamp(time2) - datetime.datetime.fromtimestamp(time1)).seconds seconds = (datetime.fromtimestamp(time2) - datetime.fromtimestamp(time1)).seconds
#convert seconds to day:hour:minute:second #convert seconds to day:hour:minute:second
days = seconds / 86400 days = seconds / 86400
seconds %= 86400 seconds %= 86400
......
...@@ -21,6 +21,7 @@ import os ...@@ -21,6 +21,7 @@ import os
import subprocess import subprocess
import sys import sys
import time import time
import traceback
from xml.dom import minidom from xml.dom import minidom
...@@ -33,7 +34,7 @@ def check_ready_to_run(): ...@@ -33,7 +34,7 @@ def check_ready_to_run():
pidList.remove(os.getpid()) pidList.remove(os.getpid())
return len(pidList) == 0 return len(pidList) == 0
else: else:
pgrep_output =subprocess.check_output('pgrep -fx \'python3 -m nni_gpu_tool.gpu_metrics_collector\'', shell=True) pgrep_output = subprocess.check_output('pgrep -fx \'python3 -m nni_gpu_tool.gpu_metrics_collector\'', shell=True)
pidList = [] pidList = []
for pid in pgrep_output.splitlines(): for pid in pgrep_output.splitlines():
pidList.append(int(pid)) pidList.append(int(pid))
...@@ -45,23 +46,21 @@ def main(argv): ...@@ -45,23 +46,21 @@ def main(argv):
if check_ready_to_run() == False: if check_ready_to_run() == False:
# GPU metrics collector is already running. Exit # GPU metrics collector is already running. Exit
exit(2) exit(2)
with open(os.path.join(metrics_output_dir, "gpu_metrics"), "w") as outputFile: cmd = 'nvidia-smi -q -x'.split()
pass
os.chmod(os.path.join(metrics_output_dir, "gpu_metrics"), 0o777)
cmd = 'nvidia-smi -q -x'
while(True): while(True):
try: try:
smi_output = subprocess.check_output(cmd, shell=True) smi_output = subprocess.check_output(cmd)
except Exception:
traceback.print_exc()
gen_empty_gpu_metric(metrics_output_dir)
break
parse_nvidia_smi_result(smi_output, metrics_output_dir) parse_nvidia_smi_result(smi_output, metrics_output_dir)
except:
exception = sys.exc_info()
for e in exception:
print("job exporter error {}".format(e))
# TODO: change to sleep time configurable via arguments # TODO: change to sleep time configurable via arguments
time.sleep(5) time.sleep(5)
def parse_nvidia_smi_result(smi, outputDir): def parse_nvidia_smi_result(smi, outputDir):
try: try:
old_umask = os.umask(0)
xmldoc = minidom.parseString(smi) xmldoc = minidom.parseString(smi)
gpuList = xmldoc.getElementsByTagName('gpu') gpuList = xmldoc.getElementsByTagName('gpu')
with open(os.path.join(outputDir, "gpu_metrics"), 'a') as outputFile: with open(os.path.join(outputDir, "gpu_metrics"), 'a') as outputFile:
...@@ -85,6 +84,24 @@ def parse_nvidia_smi_result(smi, outputDir): ...@@ -85,6 +84,24 @@ def parse_nvidia_smi_result(smi, outputDir):
except : except :
e_info = sys.exc_info() e_info = sys.exc_info()
print('xmldoc paring error') print('xmldoc paring error')
finally:
os.umask(old_umask)
def gen_empty_gpu_metric(outputDir):
    """Append a placeholder record reporting zero GPUs to the metrics file.

    One JSON line with ``gpuCount`` 0, an empty ``gpuInfos`` list and the
    current local time as ``Timestamp`` is appended to
    ``<outputDir>/gpu_metrics``. The record is also printed to stdout.

    Args:
        outputDir: Directory that holds the ``gpu_metrics`` file.

    Any exception raised while writing is printed via ``traceback`` and
    swallowed; the function never raises for I/O failures.
    """
    # Clear the umask so a newly created metrics file is not restricted by the
    # process umask. Save the previous value *before* entering ``try`` so the
    # ``finally`` clause always has a bound name to restore.
    old_umask = os.umask(0)
    try:
        with open(os.path.join(outputDir, "gpu_metrics"), 'a') as outputFile:
            outPut = {
                "Timestamp": time.asctime(time.localtime()),
                "gpuCount": 0,
                "gpuInfos": [],
            }
            print(outPut)
            outputFile.write("{}\n".format(json.dumps(outPut, sort_keys=True)))
            outputFile.flush()
    except Exception:
        # Best effort: report the failure but keep the caller running.
        traceback.print_exc()
    finally:
        # Always put the process umask back, whether or not the write succeeded.
        os.umask(old_umask)
if __name__ == "__main__": if __name__ == "__main__":
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment