Commit 19173aa4 authored by Guoxin, committed by QuanluZhang
Browse files

merge v1.0(bug bash) back to master (#1462)

* squash commits in v1.0 first round bug bash
parent f721b431
...@@ -519,14 +519,14 @@ def resume_experiment(args): ...@@ -519,14 +519,14 @@ def resume_experiment(args):
#find the latest stopped experiment #find the latest stopped experiment
if not args.id: if not args.id:
print_error('Please set experiment id! \nYou could use \'nnictl resume {id}\' to resume a stopped experiment!\n' \ print_error('Please set experiment id! \nYou could use \'nnictl resume {id}\' to resume a stopped experiment!\n' \
'You could use \'nnictl experiment list all\' to show all of stopped experiments!') 'You could use \'nnictl experiment list --all\' to show all experiments!')
exit(1) exit(1)
else: else:
if experiment_dict.get(args.id) is None: if experiment_dict.get(args.id) is None:
print_error('Id %s not exist!' % args.id) print_error('Id %s not exist!' % args.id)
exit(1) exit(1)
if experiment_dict[args.id]['status'] != 'STOPPED': if experiment_dict[args.id]['status'] != 'STOPPED':
print_error('Experiment %s is running!' % args.id) print_error('Only stopped experiments can be resumed!')
exit(1) exit(1)
experiment_id = args.id experiment_id = args.id
print_normal('Resuming experiment %s...' % experiment_id) print_normal('Resuming experiment %s...' % experiment_id)
......
...@@ -56,12 +56,30 @@ def parse_path(experiment_config, config_path): ...@@ -56,12 +56,30 @@ def parse_path(experiment_config, config_path):
expand_path(experiment_config, 'searchSpacePath') expand_path(experiment_config, 'searchSpacePath')
if experiment_config.get('trial'): if experiment_config.get('trial'):
expand_path(experiment_config['trial'], 'codeDir') expand_path(experiment_config['trial'], 'codeDir')
if experiment_config['trial'].get('authFile'):
expand_path(experiment_config['trial'], 'authFile')
if experiment_config['trial'].get('ps'):
if experiment_config['trial']['ps'].get('privateRegistryAuthPath'):
expand_path(experiment_config['trial']['ps'], 'privateRegistryAuthPath')
if experiment_config['trial'].get('master'):
if experiment_config['trial']['master'].get('privateRegistryAuthPath'):
expand_path(experiment_config['trial']['master'], 'privateRegistryAuthPath')
if experiment_config['trial'].get('worker'):
if experiment_config['trial']['worker'].get('privateRegistryAuthPath'):
expand_path(experiment_config['trial']['worker'], 'privateRegistryAuthPath')
if experiment_config['trial'].get('taskRoles'):
for index in range(len(experiment_config['trial']['taskRoles'])):
if experiment_config['trial']['taskRoles'][index].get('privateRegistryAuthPath'):
expand_path(experiment_config['trial']['taskRoles'][index], 'privateRegistryAuthPath')
if experiment_config.get('tuner'): if experiment_config.get('tuner'):
expand_path(experiment_config['tuner'], 'codeDir') expand_path(experiment_config['tuner'], 'codeDir')
if experiment_config.get('assessor'): if experiment_config.get('assessor'):
expand_path(experiment_config['assessor'], 'codeDir') expand_path(experiment_config['assessor'], 'codeDir')
if experiment_config.get('advisor'): if experiment_config.get('advisor'):
expand_path(experiment_config['advisor'], 'codeDir') expand_path(experiment_config['advisor'], 'codeDir')
if experiment_config.get('machineList'):
for index in range(len(experiment_config['machineList'])):
expand_path(experiment_config['machineList'][index], 'sshKeyPath')
#if users use relative path, convert it to absolute path #if users use relative path, convert it to absolute path
root_path = os.path.dirname(config_path) root_path = os.path.dirname(config_path)
...@@ -69,6 +87,21 @@ def parse_path(experiment_config, config_path): ...@@ -69,6 +87,21 @@ def parse_path(experiment_config, config_path):
parse_relative_path(root_path, experiment_config, 'searchSpacePath') parse_relative_path(root_path, experiment_config, 'searchSpacePath')
if experiment_config.get('trial'): if experiment_config.get('trial'):
parse_relative_path(root_path, experiment_config['trial'], 'codeDir') parse_relative_path(root_path, experiment_config['trial'], 'codeDir')
if experiment_config['trial'].get('authFile'):
parse_relative_path(root_path, experiment_config['trial'], 'authFile')
if experiment_config['trial'].get('ps'):
if experiment_config['trial']['ps'].get('privateRegistryAuthPath'):
parse_relative_path(root_path, experiment_config['trial']['ps'], 'privateRegistryAuthPath')
if experiment_config['trial'].get('master'):
if experiment_config['trial']['master'].get('privateRegistryAuthPath'):
parse_relative_path(root_path, experiment_config['trial']['master'], 'privateRegistryAuthPath')
if experiment_config['trial'].get('worker'):
if experiment_config['trial']['worker'].get('privateRegistryAuthPath'):
parse_relative_path(root_path, experiment_config['trial']['worker'], 'privateRegistryAuthPath')
if experiment_config['trial'].get('taskRoles'):
for index in range(len(experiment_config['trial']['taskRoles'])):
if experiment_config['trial']['taskRoles'][index].get('privateRegistryAuthPath'):
parse_relative_path(root_path, experiment_config['trial']['taskRoles'][index], 'privateRegistryAuthPath')
if experiment_config.get('tuner'): if experiment_config.get('tuner'):
parse_relative_path(root_path, experiment_config['tuner'], 'codeDir') parse_relative_path(root_path, experiment_config['tuner'], 'codeDir')
if experiment_config.get('assessor'): if experiment_config.get('assessor'):
......
...@@ -91,6 +91,7 @@ def parse_args(): ...@@ -91,6 +91,7 @@ def parse_args():
parser_stop = subparsers.add_parser('stop', help='stop the experiment') parser_stop = subparsers.add_parser('stop', help='stop the experiment')
parser_stop.add_argument('id', nargs='?', help='the id of experiment, use \'all\' to stop all running experiments') parser_stop.add_argument('id', nargs='?', help='the id of experiment, use \'all\' to stop all running experiments')
parser_stop.add_argument('--port', '-p', dest='port', help='the port of restful server') parser_stop.add_argument('--port', '-p', dest='port', help='the port of restful server')
parser_stop.add_argument('--all', '-a', action='store_true', help='stop all of experiments')
parser_stop.set_defaults(func=stop_experiment) parser_stop.set_defaults(func=stop_experiment)
#parse trial command #parse trial command
......
...@@ -22,7 +22,7 @@ import csv ...@@ -22,7 +22,7 @@ import csv
import os import os
import psutil import psutil
import json import json
import datetime from datetime import datetime, timezone
import time import time
import re import re
from pathlib import Path from pathlib import Path
...@@ -142,6 +142,8 @@ def parse_ids(args): ...@@ -142,6 +142,8 @@ def parse_ids(args):
elif isinstance(experiment_dict[key], list): elif isinstance(experiment_dict[key], list):
# if the config file is old version, remove the configuration from file # if the config file is old version, remove the configuration from file
experiment_config.remove_experiment(key) experiment_config.remove_experiment(key)
if args.all:
return running_experiment_list
if args.port is not None: if args.port is not None:
for key in running_experiment_list: for key in running_experiment_list:
if str(experiment_dict[key]['port']) == args.port: if str(experiment_dict[key]['port']) == args.port:
...@@ -160,8 +162,6 @@ def parse_ids(args): ...@@ -160,8 +162,6 @@ def parse_ids(args):
exit(1) exit(1)
else: else:
result_list = running_experiment_list result_list = running_experiment_list
elif args.id == 'all':
result_list = running_experiment_list
elif args.id.endswith('*'): elif args.id.endswith('*'):
for id in running_experiment_list: for id in running_experiment_list:
if id.startswith(args.id[:-1]): if id.startswith(args.id[:-1]):
...@@ -175,7 +175,7 @@ def parse_ids(args): ...@@ -175,7 +175,7 @@ def parse_ids(args):
if len(result_list) > 1: if len(result_list) > 1:
print_error(args.id + ' is ambiguous, please choose ' + ' '.join(result_list) ) print_error(args.id + ' is ambiguous, please choose ' + ' '.join(result_list) )
return None return None
if not result_list and ((args.id and args.id != 'all') or args.port): if not result_list and (args.id or args.port):
print_error('There are no experiments matched, please set correct experiment id or restful server port') print_error('There are no experiments matched, please set correct experiment id or restful server port')
elif not result_list: elif not result_list:
print_error('There is no experiment running...') print_error('There is no experiment running...')
...@@ -206,10 +206,10 @@ def convert_time_stamp_to_date(content): ...@@ -206,10 +206,10 @@ def convert_time_stamp_to_date(content):
start_time_stamp = content.get('startTime') start_time_stamp = content.get('startTime')
end_time_stamp = content.get('endTime') end_time_stamp = content.get('endTime')
if start_time_stamp: if start_time_stamp:
start_time = datetime.datetime.utcfromtimestamp(start_time_stamp // 1000).strftime("%Y/%m/%d %H:%M:%S") start_time = datetime.fromtimestamp(start_time_stamp // 1000, timezone.utc).astimezone().strftime("%Y/%m/%d %H:%M:%S")
content['startTime'] = str(start_time) content['startTime'] = str(start_time)
if end_time_stamp: if end_time_stamp:
end_time = datetime.datetime.utcfromtimestamp(end_time_stamp // 1000).strftime("%Y/%m/%d %H:%M:%S") end_time = datetime.fromtimestamp(end_time_stamp // 1000, timezone.utc).astimezone().strftime("%Y/%m/%d %H:%M:%S")
content['endTime'] = str(end_time) content['endTime'] = str(end_time)
return content return content
...@@ -225,6 +225,9 @@ def check_rest(args): ...@@ -225,6 +225,9 @@ def check_rest(args):
def stop_experiment(args): def stop_experiment(args):
'''Stop the experiment which is running''' '''Stop the experiment which is running'''
if args.id and args.id == 'all':
print_warning('\'nnictl stop all\' is abolished, please use \'nnictl stop --all\' to stop all of experiments!')
exit(1)
experiment_id_list = parse_ids(args) experiment_id_list = parse_ids(args)
if experiment_id_list: if experiment_id_list:
experiment_config = Experiments() experiment_config = Experiments()
...@@ -568,7 +571,7 @@ def experiment_list(args): ...@@ -568,7 +571,7 @@ def experiment_list(args):
if experiment_dict[key]['status'] != 'STOPPED': if experiment_dict[key]['status'] != 'STOPPED':
experiment_id_list.append(key) experiment_id_list.append(key)
if not experiment_id_list: if not experiment_id_list:
print_warning('There is no experiment running...\nYou can use \'nnictl experiment list --all\' to list all stopped experiments.') print_warning('There is no experiment running...\nYou can use \'nnictl experiment list --all\' to list all experiments.')
experiment_information = "" experiment_information = ""
for key in experiment_id_list: for key in experiment_id_list:
experiment_information += (EXPERIMENT_DETAIL_FORMAT % (key, experiment_dict[key]['status'], experiment_dict[key]['port'],\ experiment_information += (EXPERIMENT_DETAIL_FORMAT % (key, experiment_dict[key]['status'], experiment_dict[key]['port'],\
...@@ -581,7 +584,7 @@ def get_time_interval(time1, time2): ...@@ -581,7 +584,7 @@ def get_time_interval(time1, time2):
#convert time to timestamp #convert time to timestamp
time1 = time.mktime(time.strptime(time1, '%Y/%m/%d %H:%M:%S')) time1 = time.mktime(time.strptime(time1, '%Y/%m/%d %H:%M:%S'))
time2 = time.mktime(time.strptime(time2, '%Y/%m/%d %H:%M:%S')) time2 = time.mktime(time.strptime(time2, '%Y/%m/%d %H:%M:%S'))
seconds = (datetime.datetime.fromtimestamp(time2) - datetime.datetime.fromtimestamp(time1)).seconds seconds = (datetime.fromtimestamp(time2) - datetime.fromtimestamp(time1)).seconds
#convert seconds to day:hour:minute:second #convert seconds to day:hour:minute:second
days = seconds / 86400 days = seconds / 86400
seconds %= 86400 seconds %= 86400
......
...@@ -21,6 +21,7 @@ import os ...@@ -21,6 +21,7 @@ import os
import subprocess import subprocess
import sys import sys
import time import time
import traceback
from xml.dom import minidom from xml.dom import minidom
...@@ -33,7 +34,7 @@ def check_ready_to_run(): ...@@ -33,7 +34,7 @@ def check_ready_to_run():
pidList.remove(os.getpid()) pidList.remove(os.getpid())
return len(pidList) == 0 return len(pidList) == 0
else: else:
pgrep_output =subprocess.check_output('pgrep -fx \'python3 -m nni_gpu_tool.gpu_metrics_collector\'', shell=True) pgrep_output = subprocess.check_output('pgrep -fx \'python3 -m nni_gpu_tool.gpu_metrics_collector\'', shell=True)
pidList = [] pidList = []
for pid in pgrep_output.splitlines(): for pid in pgrep_output.splitlines():
pidList.append(int(pid)) pidList.append(int(pid))
...@@ -45,23 +46,21 @@ def main(argv): ...@@ -45,23 +46,21 @@ def main(argv):
if check_ready_to_run() == False: if check_ready_to_run() == False:
# GPU metrics collector is already running. Exit # GPU metrics collector is already running. Exit
exit(2) exit(2)
with open(os.path.join(metrics_output_dir, "gpu_metrics"), "w") as outputFile: cmd = 'nvidia-smi -q -x'.split()
pass
os.chmod(os.path.join(metrics_output_dir, "gpu_metrics"), 0o777)
cmd = 'nvidia-smi -q -x'
while(True): while(True):
try: try:
smi_output = subprocess.check_output(cmd, shell=True) smi_output = subprocess.check_output(cmd)
parse_nvidia_smi_result(smi_output, metrics_output_dir) except Exception:
except: traceback.print_exc()
exception = sys.exc_info() gen_empty_gpu_metric(metrics_output_dir)
for e in exception: break
print("job exporter error {}".format(e)) parse_nvidia_smi_result(smi_output, metrics_output_dir)
# TODO: change to sleep time configurable via arguments # TODO: change to sleep time configurable via arguments
time.sleep(5) time.sleep(5)
def parse_nvidia_smi_result(smi, outputDir): def parse_nvidia_smi_result(smi, outputDir):
try: try:
old_umask = os.umask(0)
xmldoc = minidom.parseString(smi) xmldoc = minidom.parseString(smi)
gpuList = xmldoc.getElementsByTagName('gpu') gpuList = xmldoc.getElementsByTagName('gpu')
with open(os.path.join(outputDir, "gpu_metrics"), 'a') as outputFile: with open(os.path.join(outputDir, "gpu_metrics"), 'a') as outputFile:
...@@ -85,6 +84,24 @@ def parse_nvidia_smi_result(smi, outputDir): ...@@ -85,6 +84,24 @@ def parse_nvidia_smi_result(smi, outputDir):
except : except :
e_info = sys.exc_info() e_info = sys.exc_info()
print('xmldoc paring error') print('xmldoc paring error')
finally:
os.umask(old_umask)
def gen_empty_gpu_metric(outputDir):
    """Append a placeholder "no GPUs" record to the metrics file.

    One JSON line is appended to ``<outputDir>/gpu_metrics`` containing the
    current local time under ``Timestamp``, ``gpuCount`` set to 0 and an empty
    ``gpuInfos`` list. The record is also printed to stdout.

    Errors raised while opening or writing the file are reported with
    ``traceback.print_exc()`` and are not propagated to the caller.

    Args:
        outputDir: directory that holds (or will hold) the ``gpu_metrics`` file.
    """
    # Capture the previous umask *before* entering the try block so the
    # ``finally`` clause can always restore it. With the umask cleared, a
    # newly created metrics file is not narrowed by the process umask
    # (presumably so other users can read/append to it -- confirm with callers).
    old_umask = os.umask(0)
    try:
        with open(os.path.join(outputDir, "gpu_metrics"), 'a') as outputFile:
            outPut = {}
            outPut["Timestamp"] = time.asctime(time.localtime())
            outPut["gpuCount"] = 0
            outPut["gpuInfos"] = []
            print(outPut)
            outputFile.write("{}\n".format(json.dumps(outPut, sort_keys=True)))
            outputFile.flush()
    except Exception:
        # Best-effort writer: report the failure but keep the caller running.
        traceback.print_exc()
    finally:
        os.umask(old_umask)
if __name__ == "__main__": if __name__ == "__main__":
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment