Unverified Commit 5b0034e4 authored by SparkSnail's avatar SparkSnail Committed by GitHub
Browse files

Merge pull request #204 from microsoft/master

merge master
parents 704b50e2 19173aa4
...@@ -22,7 +22,7 @@ import csv ...@@ -22,7 +22,7 @@ import csv
import os import os
import psutil import psutil
import json import json
import datetime from datetime import datetime, timezone
import time import time
import re import re
from pathlib import Path from pathlib import Path
...@@ -142,6 +142,8 @@ def parse_ids(args): ...@@ -142,6 +142,8 @@ def parse_ids(args):
elif isinstance(experiment_dict[key], list): elif isinstance(experiment_dict[key], list):
# if the config file is old version, remove the configuration from file # if the config file is old version, remove the configuration from file
experiment_config.remove_experiment(key) experiment_config.remove_experiment(key)
if args.all:
return running_experiment_list
if args.port is not None: if args.port is not None:
for key in running_experiment_list: for key in running_experiment_list:
if str(experiment_dict[key]['port']) == args.port: if str(experiment_dict[key]['port']) == args.port:
...@@ -160,8 +162,6 @@ def parse_ids(args): ...@@ -160,8 +162,6 @@ def parse_ids(args):
exit(1) exit(1)
else: else:
result_list = running_experiment_list result_list = running_experiment_list
elif args.id == 'all':
result_list = running_experiment_list
elif args.id.endswith('*'): elif args.id.endswith('*'):
for id in running_experiment_list: for id in running_experiment_list:
if id.startswith(args.id[:-1]): if id.startswith(args.id[:-1]):
...@@ -175,7 +175,7 @@ def parse_ids(args): ...@@ -175,7 +175,7 @@ def parse_ids(args):
if len(result_list) > 1: if len(result_list) > 1:
print_error(args.id + ' is ambiguous, please choose ' + ' '.join(result_list) ) print_error(args.id + ' is ambiguous, please choose ' + ' '.join(result_list) )
return None return None
if not result_list and ((args.id and args.id != 'all') or args.port): if not result_list and (args.id or args.port):
print_error('There are no experiments matched, please set correct experiment id or restful server port') print_error('There are no experiments matched, please set correct experiment id or restful server port')
elif not result_list: elif not result_list:
print_error('There is no experiment running...') print_error('There is no experiment running...')
...@@ -206,10 +206,10 @@ def convert_time_stamp_to_date(content): ...@@ -206,10 +206,10 @@ def convert_time_stamp_to_date(content):
start_time_stamp = content.get('startTime') start_time_stamp = content.get('startTime')
end_time_stamp = content.get('endTime') end_time_stamp = content.get('endTime')
if start_time_stamp: if start_time_stamp:
start_time = datetime.datetime.utcfromtimestamp(start_time_stamp // 1000).strftime("%Y/%m/%d %H:%M:%S") start_time = datetime.fromtimestamp(start_time_stamp // 1000, timezone.utc).astimezone().strftime("%Y/%m/%d %H:%M:%S")
content['startTime'] = str(start_time) content['startTime'] = str(start_time)
if end_time_stamp: if end_time_stamp:
end_time = datetime.datetime.utcfromtimestamp(end_time_stamp // 1000).strftime("%Y/%m/%d %H:%M:%S") end_time = datetime.fromtimestamp(end_time_stamp // 1000, timezone.utc).astimezone().strftime("%Y/%m/%d %H:%M:%S")
content['endTime'] = str(end_time) content['endTime'] = str(end_time)
return content return content
...@@ -225,6 +225,9 @@ def check_rest(args): ...@@ -225,6 +225,9 @@ def check_rest(args):
def stop_experiment(args): def stop_experiment(args):
'''Stop the experiment which is running''' '''Stop the experiment which is running'''
if args.id and args.id == 'all':
print_warning('\'nnictl stop all\' is abolished, please use \'nnictl stop --all\' to stop all of experiments!')
exit(1)
experiment_id_list = parse_ids(args) experiment_id_list = parse_ids(args)
if experiment_id_list: if experiment_id_list:
experiment_config = Experiments() experiment_config = Experiments()
...@@ -568,7 +571,7 @@ def experiment_list(args): ...@@ -568,7 +571,7 @@ def experiment_list(args):
if experiment_dict[key]['status'] != 'STOPPED': if experiment_dict[key]['status'] != 'STOPPED':
experiment_id_list.append(key) experiment_id_list.append(key)
if not experiment_id_list: if not experiment_id_list:
print_warning('There is no experiment running...\nYou can use \'nnictl experiment list --all\' to list all stopped experiments.') print_warning('There is no experiment running...\nYou can use \'nnictl experiment list --all\' to list all experiments.')
experiment_information = "" experiment_information = ""
for key in experiment_id_list: for key in experiment_id_list:
experiment_information += (EXPERIMENT_DETAIL_FORMAT % (key, experiment_dict[key]['status'], experiment_dict[key]['port'],\ experiment_information += (EXPERIMENT_DETAIL_FORMAT % (key, experiment_dict[key]['status'], experiment_dict[key]['port'],\
...@@ -581,7 +584,7 @@ def get_time_interval(time1, time2): ...@@ -581,7 +584,7 @@ def get_time_interval(time1, time2):
#convert time to timestamp #convert time to timestamp
time1 = time.mktime(time.strptime(time1, '%Y/%m/%d %H:%M:%S')) time1 = time.mktime(time.strptime(time1, '%Y/%m/%d %H:%M:%S'))
time2 = time.mktime(time.strptime(time2, '%Y/%m/%d %H:%M:%S')) time2 = time.mktime(time.strptime(time2, '%Y/%m/%d %H:%M:%S'))
seconds = (datetime.datetime.fromtimestamp(time2) - datetime.datetime.fromtimestamp(time1)).seconds seconds = (datetime.fromtimestamp(time2) - datetime.fromtimestamp(time1)).seconds
#convert seconds to day:hour:minute:second #convert seconds to day:hour:minute:second
days = seconds / 86400 days = seconds / 86400
seconds %= 86400 seconds %= 86400
......
...@@ -21,6 +21,7 @@ import os ...@@ -21,6 +21,7 @@ import os
import subprocess import subprocess
import sys import sys
import time import time
import traceback
from xml.dom import minidom from xml.dom import minidom
...@@ -33,7 +34,7 @@ def check_ready_to_run(): ...@@ -33,7 +34,7 @@ def check_ready_to_run():
pidList.remove(os.getpid()) pidList.remove(os.getpid())
return len(pidList) == 0 return len(pidList) == 0
else: else:
pgrep_output =subprocess.check_output('pgrep -fx \'python3 -m nni_gpu_tool.gpu_metrics_collector\'', shell=True) pgrep_output = subprocess.check_output('pgrep -fx \'python3 -m nni_gpu_tool.gpu_metrics_collector\'', shell=True)
pidList = [] pidList = []
for pid in pgrep_output.splitlines(): for pid in pgrep_output.splitlines():
pidList.append(int(pid)) pidList.append(int(pid))
...@@ -45,23 +46,21 @@ def main(argv): ...@@ -45,23 +46,21 @@ def main(argv):
if check_ready_to_run() == False: if check_ready_to_run() == False:
# GPU metrics collector is already running. Exit # GPU metrics collector is already running. Exit
exit(2) exit(2)
with open(os.path.join(metrics_output_dir, "gpu_metrics"), "w") as outputFile: cmd = 'nvidia-smi -q -x'.split()
pass
os.chmod(os.path.join(metrics_output_dir, "gpu_metrics"), 0o777)
cmd = 'nvidia-smi -q -x'
while(True): while(True):
try: try:
smi_output = subprocess.check_output(cmd, shell=True) smi_output = subprocess.check_output(cmd)
except Exception:
traceback.print_exc()
gen_empty_gpu_metric(metrics_output_dir)
break
parse_nvidia_smi_result(smi_output, metrics_output_dir) parse_nvidia_smi_result(smi_output, metrics_output_dir)
except:
exception = sys.exc_info()
for e in exception:
print("job exporter error {}".format(e))
# TODO: change to sleep time configurable via arguments # TODO: change to sleep time configurable via arguments
time.sleep(5) time.sleep(5)
def parse_nvidia_smi_result(smi, outputDir): def parse_nvidia_smi_result(smi, outputDir):
try: try:
old_umask = os.umask(0)
xmldoc = minidom.parseString(smi) xmldoc = minidom.parseString(smi)
gpuList = xmldoc.getElementsByTagName('gpu') gpuList = xmldoc.getElementsByTagName('gpu')
with open(os.path.join(outputDir, "gpu_metrics"), 'a') as outputFile: with open(os.path.join(outputDir, "gpu_metrics"), 'a') as outputFile:
...@@ -85,6 +84,24 @@ def parse_nvidia_smi_result(smi, outputDir): ...@@ -85,6 +84,24 @@ def parse_nvidia_smi_result(smi, outputDir):
except : except :
e_info = sys.exc_info() e_info = sys.exc_info()
print('xmldoc paring error') print('xmldoc paring error')
finally:
os.umask(old_umask)
def gen_empty_gpu_metric(outputDir):
    """Append a placeholder record reporting zero GPUs to the metrics file.

    One JSON line is appended to ``<outputDir>/gpu_metrics`` containing the
    current local time under ``Timestamp``, ``gpuCount`` set to 0 and an
    empty ``gpuInfos`` list. The record is also echoed to stdout.

    The process umask is cleared while the file is opened, so a newly
    created file is not restricted by the inherited umask, and is restored
    afterwards. Any error raised while writing is printed with its
    traceback and not propagated.

    Args:
        outputDir: Directory that holds the ``gpu_metrics`` file.
    """
    # Change the umask *before* entering the try block: ``old_umask`` is then
    # always bound when the finally clause restores it.
    old_umask = os.umask(0)
    try:
        record = {
            "Timestamp": time.asctime(time.localtime()),
            "gpuCount": 0,
            "gpuInfos": [],
        }
        with open(os.path.join(outputDir, "gpu_metrics"), 'a') as outputFile:
            print(record)
            outputFile.write("{}\n".format(json.dumps(record, sort_keys=True)))
            outputFile.flush()
    except Exception:
        traceback.print_exc()
    finally:
        os.umask(old_umask)
if __name__ == "__main__": if __name__ == "__main__":
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment