Commit 19173aa4, authored by Guoxin and committed by QuanluZhang
Browse files

merge v1.0(bug bash) back to master (#1462)

* squash commits in v1.0 first round bug bash
parent f721b431
......@@ -519,14 +519,14 @@ def resume_experiment(args):
#find the latest stopped experiment
if not args.id:
print_error('Please set experiment id! \nYou could use \'nnictl resume {id}\' to resume a stopped experiment!\n' \
'You could use \'nnictl experiment list all\' to show all of stopped experiments!')
'You could use \'nnictl experiment list --all\' to show all experiments!')
exit(1)
else:
if experiment_dict.get(args.id) is None:
print_error('Id %s not exist!' % args.id)
exit(1)
if experiment_dict[args.id]['status'] != 'STOPPED':
print_error('Experiment %s is running!' % args.id)
print_error('Only stopped experiments can be resumed!')
exit(1)
experiment_id = args.id
print_normal('Resuming experiment %s...' % experiment_id)
......
......@@ -56,12 +56,30 @@ def parse_path(experiment_config, config_path):
expand_path(experiment_config, 'searchSpacePath')
if experiment_config.get('trial'):
expand_path(experiment_config['trial'], 'codeDir')
if experiment_config['trial'].get('authFile'):
expand_path(experiment_config['trial'], 'authFile')
if experiment_config['trial'].get('ps'):
if experiment_config['trial']['ps'].get('privateRegistryAuthPath'):
expand_path(experiment_config['trial']['ps'], 'privateRegistryAuthPath')
if experiment_config['trial'].get('master'):
if experiment_config['trial']['master'].get('privateRegistryAuthPath'):
expand_path(experiment_config['trial']['master'], 'privateRegistryAuthPath')
if experiment_config['trial'].get('worker'):
if experiment_config['trial']['worker'].get('privateRegistryAuthPath'):
expand_path(experiment_config['trial']['worker'], 'privateRegistryAuthPath')
if experiment_config['trial'].get('taskRoles'):
for index in range(len(experiment_config['trial']['taskRoles'])):
if experiment_config['trial']['taskRoles'][index].get('privateRegistryAuthPath'):
expand_path(experiment_config['trial']['taskRoles'][index], 'privateRegistryAuthPath')
if experiment_config.get('tuner'):
expand_path(experiment_config['tuner'], 'codeDir')
if experiment_config.get('assessor'):
expand_path(experiment_config['assessor'], 'codeDir')
if experiment_config.get('advisor'):
expand_path(experiment_config['advisor'], 'codeDir')
if experiment_config.get('machineList'):
for index in range(len(experiment_config['machineList'])):
expand_path(experiment_config['machineList'][index], 'sshKeyPath')
#if users use relative path, convert it to absolute path
root_path = os.path.dirname(config_path)
......@@ -69,6 +87,21 @@ def parse_path(experiment_config, config_path):
parse_relative_path(root_path, experiment_config, 'searchSpacePath')
if experiment_config.get('trial'):
parse_relative_path(root_path, experiment_config['trial'], 'codeDir')
if experiment_config['trial'].get('authFile'):
parse_relative_path(root_path, experiment_config['trial'], 'authFile')
if experiment_config['trial'].get('ps'):
if experiment_config['trial']['ps'].get('privateRegistryAuthPath'):
parse_relative_path(root_path, experiment_config['trial']['ps'], 'privateRegistryAuthPath')
if experiment_config['trial'].get('master'):
if experiment_config['trial']['master'].get('privateRegistryAuthPath'):
parse_relative_path(root_path, experiment_config['trial']['master'], 'privateRegistryAuthPath')
if experiment_config['trial'].get('worker'):
if experiment_config['trial']['worker'].get('privateRegistryAuthPath'):
parse_relative_path(root_path, experiment_config['trial']['worker'], 'privateRegistryAuthPath')
if experiment_config['trial'].get('taskRoles'):
for index in range(len(experiment_config['trial']['taskRoles'])):
if experiment_config['trial']['taskRoles'][index].get('privateRegistryAuthPath'):
parse_relative_path(root_path, experiment_config['trial']['taskRoles'][index], 'privateRegistryAuthPath')
if experiment_config.get('tuner'):
parse_relative_path(root_path, experiment_config['tuner'], 'codeDir')
if experiment_config.get('assessor'):
......
......@@ -91,6 +91,7 @@ def parse_args():
parser_stop = subparsers.add_parser('stop', help='stop the experiment')
parser_stop.add_argument('id', nargs='?', help='the id of experiment, use \'all\' to stop all running experiments')
parser_stop.add_argument('--port', '-p', dest='port', help='the port of restful server')
parser_stop.add_argument('--all', '-a', action='store_true', help='stop all of experiments')
parser_stop.set_defaults(func=stop_experiment)
#parse trial command
......
......@@ -22,7 +22,7 @@ import csv
import os
import psutil
import json
import datetime
from datetime import datetime, timezone
import time
import re
from pathlib import Path
......@@ -142,6 +142,8 @@ def parse_ids(args):
elif isinstance(experiment_dict[key], list):
# if the config file is old version, remove the configuration from file
experiment_config.remove_experiment(key)
if args.all:
return running_experiment_list
if args.port is not None:
for key in running_experiment_list:
if str(experiment_dict[key]['port']) == args.port:
......@@ -160,8 +162,6 @@ def parse_ids(args):
exit(1)
else:
result_list = running_experiment_list
elif args.id == 'all':
result_list = running_experiment_list
elif args.id.endswith('*'):
for id in running_experiment_list:
if id.startswith(args.id[:-1]):
......@@ -175,7 +175,7 @@ def parse_ids(args):
if len(result_list) > 1:
print_error(args.id + ' is ambiguous, please choose ' + ' '.join(result_list) )
return None
if not result_list and ((args.id and args.id != 'all') or args.port):
if not result_list and (args.id or args.port):
print_error('There are no experiments matched, please set correct experiment id or restful server port')
elif not result_list:
print_error('There is no experiment running...')
......@@ -206,10 +206,10 @@ def convert_time_stamp_to_date(content):
start_time_stamp = content.get('startTime')
end_time_stamp = content.get('endTime')
if start_time_stamp:
start_time = datetime.datetime.utcfromtimestamp(start_time_stamp // 1000).strftime("%Y/%m/%d %H:%M:%S")
start_time = datetime.fromtimestamp(start_time_stamp // 1000, timezone.utc).astimezone().strftime("%Y/%m/%d %H:%M:%S")
content['startTime'] = str(start_time)
if end_time_stamp:
end_time = datetime.datetime.utcfromtimestamp(end_time_stamp // 1000).strftime("%Y/%m/%d %H:%M:%S")
end_time = datetime.fromtimestamp(end_time_stamp // 1000, timezone.utc).astimezone().strftime("%Y/%m/%d %H:%M:%S")
content['endTime'] = str(end_time)
return content
......@@ -225,6 +225,9 @@ def check_rest(args):
def stop_experiment(args):
'''Stop the experiment which is running'''
if args.id and args.id == 'all':
print_warning('\'nnictl stop all\' is abolished, please use \'nnictl stop --all\' to stop all of experiments!')
exit(1)
experiment_id_list = parse_ids(args)
if experiment_id_list:
experiment_config = Experiments()
......@@ -568,7 +571,7 @@ def experiment_list(args):
if experiment_dict[key]['status'] != 'STOPPED':
experiment_id_list.append(key)
if not experiment_id_list:
print_warning('There is no experiment running...\nYou can use \'nnictl experiment list --all\' to list all stopped experiments.')
print_warning('There is no experiment running...\nYou can use \'nnictl experiment list --all\' to list all experiments.')
experiment_information = ""
for key in experiment_id_list:
experiment_information += (EXPERIMENT_DETAIL_FORMAT % (key, experiment_dict[key]['status'], experiment_dict[key]['port'],\
......@@ -581,7 +584,7 @@ def get_time_interval(time1, time2):
#convert time to timestamp
time1 = time.mktime(time.strptime(time1, '%Y/%m/%d %H:%M:%S'))
time2 = time.mktime(time.strptime(time2, '%Y/%m/%d %H:%M:%S'))
seconds = (datetime.datetime.fromtimestamp(time2) - datetime.datetime.fromtimestamp(time1)).seconds
seconds = (datetime.fromtimestamp(time2) - datetime.fromtimestamp(time1)).seconds
#convert seconds to day:hour:minute:second
days = seconds / 86400
seconds %= 86400
......
......@@ -21,6 +21,7 @@ import os
import subprocess
import sys
import time
import traceback
from xml.dom import minidom
......@@ -33,7 +34,7 @@ def check_ready_to_run():
pidList.remove(os.getpid())
return len(pidList) == 0
else:
pgrep_output =subprocess.check_output('pgrep -fx \'python3 -m nni_gpu_tool.gpu_metrics_collector\'', shell=True)
pgrep_output = subprocess.check_output('pgrep -fx \'python3 -m nni_gpu_tool.gpu_metrics_collector\'', shell=True)
pidList = []
for pid in pgrep_output.splitlines():
pidList.append(int(pid))
......@@ -45,23 +46,21 @@ def main(argv):
if check_ready_to_run() == False:
# GPU metrics collector is already running. Exit
exit(2)
with open(os.path.join(metrics_output_dir, "gpu_metrics"), "w") as outputFile:
pass
os.chmod(os.path.join(metrics_output_dir, "gpu_metrics"), 0o777)
cmd = 'nvidia-smi -q -x'
cmd = 'nvidia-smi -q -x'.split()
while(True):
try:
smi_output = subprocess.check_output(cmd, shell=True)
parse_nvidia_smi_result(smi_output, metrics_output_dir)
except:
exception = sys.exc_info()
for e in exception:
print("job exporter error {}".format(e))
smi_output = subprocess.check_output(cmd)
except Exception:
traceback.print_exc()
gen_empty_gpu_metric(metrics_output_dir)
break
parse_nvidia_smi_result(smi_output, metrics_output_dir)
# TODO: change to sleep time configurable via arguments
time.sleep(5)
def parse_nvidia_smi_result(smi, outputDir):
try:
old_umask = os.umask(0)
xmldoc = minidom.parseString(smi)
gpuList = xmldoc.getElementsByTagName('gpu')
with open(os.path.join(outputDir, "gpu_metrics"), 'a') as outputFile:
......@@ -85,6 +84,24 @@ def parse_nvidia_smi_result(smi, outputDir):
except :
e_info = sys.exc_info()
print('xmldoc paring error')
finally:
os.umask(old_umask)
def gen_empty_gpu_metric(outputDir):
    """Append a placeholder "zero GPUs" record to ``<outputDir>/gpu_metrics``.

    The record is a single JSON line (keys sorted) holding the current local
    time under ``Timestamp``, ``gpuCount`` set to 0 and an empty ``gpuInfos``
    list. The record is also echoed to stdout.

    Args:
        outputDir: Directory that contains (or will contain) the
            ``gpu_metrics`` file. The file is opened in append mode.

    Returns:
        None. Any exception raised while writing is reported with
        ``traceback.print_exc()`` and then swallowed (best effort).
    """
    # Capture the previous umask *before* entering the try block so the
    # ``finally`` clause always has a value to restore; with the assignment
    # inside the try, a failure there would surface as a NameError instead.
    old_umask = os.umask(0)
    try:
        # The umask is cleared while the file is opened, so no permission
        # bits are masked off if this call creates the file.
        with open(os.path.join(outputDir, "gpu_metrics"), 'a') as outputFile:
            outPut = {
                "Timestamp": time.asctime(time.localtime()),
                "gpuCount": 0,
                "gpuInfos": [],
            }
            print(outPut)
            outputFile.write("{}\n".format(json.dumps(outPut, sort_keys=True)))
            outputFile.flush()
    except Exception:
        # Best effort: report the failure but do not propagate it.
        traceback.print_exc()
    finally:
        # Restore the process-wide umask regardless of the outcome.
        os.umask(old_umask)
if __name__ == "__main__":
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment