"git@developer.sourcefind.cn:gaoqiong/migraphx.git" did not exist on "f0eb4ebf29badee723a1a3a5bda4b49a9bc70556"
Commit c9720e95 authored by chicm-ms, committed by GitHub

Refactor integration tests (#2190)

parent d5e6af27
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import os
import argparse
import glob
import subprocess
import time
import traceback
import json
from utils import setup_experiment, get_experiment_status, get_yml_content, dump_yml_content, \
parse_max_duration_time, get_succeeded_trial_num, deep_update, print_failed_job_log, get_failed_trial_jobs
from utils import GREEN, RED, CLEAR, STATUS_URL, TRIAL_JOBS_URL
def gen_new_config(config_file, training_service='local'):
'''
Generate a temporary config file for the integration test;
the temporary file is deleted after the test finishes.
'''
config = get_yml_content(config_file)
new_config_file = config_file + '.tmp'
it_config = get_yml_content('training_service.yml')
# hack for kubeflow trial config
if training_service == 'kubeflow':
it_config[training_service]['trial']['worker']['command'] = config['trial']['command']
config['trial'].pop('command')
if 'gpuNum' in config['trial']:
config['trial'].pop('gpuNum')
if training_service == 'frameworkcontroller':
it_config[training_service]['trial']['taskRoles'][0]['command'] = config['trial']['command']
config['trial'].pop('command')
if 'gpuNum' in config['trial']:
config['trial'].pop('gpuNum')
deep_update(config, it_config['all'])
deep_update(config, it_config[training_service])
dump_yml_content(new_config_file, config)
return new_config_file, config
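# Editor's note (assumption, for illustration): deep_update from utils is taken to be a recursive
# dict merge, so keys from training_service.yml override matching trial-config keys without
# discarding untouched siblings. A minimal sketch of that assumed behavior, not the utils code:
def deep_update_sketch(base, override):
    for key, value in override.items():
        if isinstance(value, dict) and isinstance(base.get(key), dict):
            deep_update_sketch(base[key], value)  # recurse into nested dicts
        else:
            base[key] = value  # scalars and lists from the override win
    return base
# deep_update_sketch({'trial': {'command': 'python3 mnist.py', 'gpuNum': 1}},
#                    {'trial': {'gpuNum': 0}, 'nniManagerIp': '10.1.1.1'})
# -> {'trial': {'command': 'python3 mnist.py', 'gpuNum': 0}, 'nniManagerIp': '10.1.1.1'}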
def run_test(config_file, training_service, local_gpu=False):
'''run test per configuration file'''
new_config_file, config = gen_new_config(config_file, training_service)
print(json.dumps(config, sort_keys=True, indent=4))
if training_service == 'local' and not local_gpu and config['trial']['gpuNum'] > 0:
print('no gpu, skipping: ', config_file)
return
try:
proc = subprocess.run(['nnictl', 'create', '--config', new_config_file])
assert proc.returncode == 0, '`nnictl create` failed with code %d' % proc.returncode
max_duration, max_trial_num = get_max_values(new_config_file)
sleep_interval = 3
for _ in range(0, max_duration+30, sleep_interval):
time.sleep(sleep_interval)
status = get_experiment_status(STATUS_URL)
if status in ['DONE', 'ERROR'] or get_failed_trial_jobs(TRIAL_JOBS_URL):
break
print_failed_job_log(config['trainingServicePlatform'], TRIAL_JOBS_URL)
if status != 'DONE' or get_succeeded_trial_num(TRIAL_JOBS_URL) < max_trial_num:
raise AssertionError('Failed to finish in maxExecDuration')
finally:
if os.path.exists(new_config_file):
os.remove(new_config_file)
def get_max_values(config_file):
'''Get maxExecDuration and maxTrialNum of experiment'''
experiment_config = get_yml_content(config_file)
return parse_max_duration_time(experiment_config['maxExecDuration']), experiment_config['maxTrialNum']
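# Editor's note (assumption): parse_max_duration_time converts a maxExecDuration string such as
# '1h' into seconds (the utils diff below shows it reading the trailing unit character).
# A hypothetical stand-in for illustration:
def parse_max_duration_time_sketch(max_exec_duration):
    unit = max_exec_duration[-1]
    seconds_per_unit = {'s': 1, 'm': 60, 'h': 3600, 'd': 86400}
    return int(max_exec_duration[:-1]) * seconds_per_unit[unit]
# parse_max_duration_time_sketch('1h') == 3600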
def run(args):
'''test all configuration files'''
if args.config is None:
config_files = glob.glob('./config_test/**/*.test.yml')
else:
config_files = args.config.split(',')
if args.exclude is not None:
exclude_paths = args.exclude.split(',')
if exclude_paths:
for exclude_path in exclude_paths:
config_files = [x for x in config_files if exclude_path not in x]
print(config_files)
for config_file in config_files:
try:
# sleep 5 seconds so the previously stopped experiment has enough time to exit and release its port
time.sleep(5)
print(GREEN + 'Testing:' + config_file + CLEAR)
begin_time = time.time()
run_test(config_file, args.ts, args.local_gpu)
print(GREEN + 'Test %s: TEST PASS IN %d mins' % (config_file, (time.time() - begin_time)/60) + CLEAR)
except Exception as error:
print(RED + 'Test %s: TEST FAIL' % (config_file) + CLEAR)
print('%r' % error)
traceback.print_exc()
raise error
finally:
subprocess.run(['nnictl', 'stop'])
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("--config", type=str, default=None)
parser.add_argument("--exclude", type=str, default=None)
parser.add_argument("--ts", type=str, choices=['local', 'remote', 'pai', 'paiYarn', 'kubeflow', 'frameworkcontroller'], default='local')
parser.add_argument("--local_gpu", action='store_true')
parser.add_argument("--preinstall", action='store_true')
args = parser.parse_args()
setup_experiment(args.preinstall)
run(args)
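Usage note (illustrative, assuming this driver is saved as config_test.py, the name referenced by the old CI pipeline further below): it is launched from the test directory with the argparse flags defined above, for example:

import subprocess
# Run the local config tests roughly the way the CI pipeline used to invoke this driver.
subprocess.run(['python', 'config_test.py', '--ts', 'local', '--local_gpu', '--exclude', 'smac,bohb'], check=True)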
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import sys
import os.path as osp
import subprocess
import time
import traceback
import json
import requests
from utils import get_experiment_status, get_yml_content, parse_max_duration_time, get_succeeded_trial_num, print_failed_job_log
from utils import GREEN, RED, CLEAR, STATUS_URL, TRIAL_JOBS_URL, METRICS_URL
def run_test():
'''run metrics test'''
if sys.platform == 'win32':
config_file = osp.join('metrics_test', 'metrics_win32.test.yml')
else:
config_file = osp.join('metrics_test', 'metrics.test.yml')
print('Testing %s...' % config_file)
proc = subprocess.run(['nnictl', 'create', '--config', config_file])
assert proc.returncode == 0, '`nnictl create` failed with code %d' % proc.returncode
max_duration, max_trial_num = get_max_values(config_file)
sleep_interval = 3
for _ in range(0, max_duration, sleep_interval):
time.sleep(sleep_interval)
status = get_experiment_status(STATUS_URL)
#print('experiment status:', status)
if status == 'DONE':
num_succeeded = get_succeeded_trial_num(TRIAL_JOBS_URL)
print_failed_job_log('local', TRIAL_JOBS_URL)
if sys.platform == "win32":
time.sleep(sleep_interval) # Windows seems to have some issues with updating trial status in time
assert num_succeeded == max_trial_num, 'only %d succeeded trial jobs, there should be %d' % (num_succeeded, max_trial_num)
check_metrics()
break
assert status == 'DONE', 'Failed to finish in maxExecDuration'
def check_metrics():
with open(osp.join('metrics_test', 'expected_metrics.json'), 'r') as f:
expected_metrics = json.load(f)
print(expected_metrics)
metrics = requests.get(METRICS_URL).json()
intermediate_result, final_result = get_metric_results(metrics)
assert len(final_result) == 1, 'there should be 1 final result'
assert final_result[0] == expected_metrics['final_result']
assert set(intermediate_result) == set(expected_metrics['intermediate_result'])
def get_metric_results(metrics):
intermediate_result = []
final_result = []
for metric in metrics:
if metric['type'] == 'PERIODICAL':
intermediate_result.append(json.loads(metric['data']))
elif metric['type'] == 'FINAL':
final_result.append(json.loads(metric['data']))
print(intermediate_result, final_result)
return [round(float(x),6) for x in intermediate_result], [round(float(x), 6) for x in final_result]
def get_max_values(config_file):
experiment_config = get_yml_content(config_file)
return parse_max_duration_time(experiment_config['maxExecDuration']), experiment_config['maxTrialNum']
if __name__ == '__main__':
try:
# sleep 5 seconds so the previously stopped experiment has enough time to exit and release its port
time.sleep(5)
run_test()
print(GREEN + 'TEST PASS' + CLEAR)
except Exception as error:
print(RED + 'TEST FAIL' + CLEAR)
print('%r' % error)
traceback.print_exc()
raise error
finally:
subprocess.run(['nnictl', 'stop'])
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import time
import nni
if __name__ == '__main__':
nni.get_next_parameter()
time.sleep(1)
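# The loop below alternates prints with and without a trailing newline on purpose, so the test
# also covers metric reporting when trial stdout does not end at a line boundary.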
for i in range(10):
if i % 2 == 0:
print('report intermediate result without end of line.', end='')
else:
print('report intermediate result.')
nni.report_intermediate_result(0.1*(i+1))
time.sleep(2)
print('test final metrics not at line start.', end='')
nni.report_final_result(1.0)
print('done')
authorName: nni
experimentName: naive
trialConcurrency: 3
maxExecDuration: 1h
maxTrialNum: 10
#choice: local, remote
trainingServicePlatform: local
searchSpacePath: search_space.json
#choice: true, false
useAnnotation: false
tuner:
codeDir: .
classFileName: naive_tuner.py
className: NaiveTuner
classArgs:
optimize_mode: maximize
assessor:
codeDir: .
classFileName: naive_assessor.py
className: NaiveAssessor
classArgs:
optimize_mode: maximize
trial:
command: python naive_trial.py
codeDir: .
gpuNum: 0
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import subprocess
import argparse
import time
import shlex
import signal
def test_foreground(args):
launch_command = 'nnictl create --config {} --foreground'.format(args.config)
print('nnictl foreground launch command: ', launch_command, flush=True)
proc = subprocess.Popen(shlex.split(launch_command))
time.sleep(args.timeout)
proc.send_signal(signal.SIGINT)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("--config", type=str, required=True)
parser.add_argument("--timeout", type=int, default=45)
args = parser.parse_args()
test_foreground(args)
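test_foreground only sends SIGINT and returns. If the test should also verify that foreground mode exits cleanly after Ctrl-C, a follow-up check along these lines could be added (a sketch under that assumption, not part of the original change):

import subprocess

def check_foreground_exit(proc, grace_period=30):
    '''Sketch: wait for the foreground nnictl process to exit after SIGINT; fail loudly if it hangs.'''
    try:
        returncode = proc.wait(timeout=grace_period)
    except subprocess.TimeoutExpired:
        proc.kill()
        raise AssertionError('foreground process did not exit within %d seconds' % grace_period)
    print('foreground process exited with code', returncode, flush=True)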
@@ -2,11 +2,12 @@
 # Licensed under the MIT license.

 import sys
+import os
 import glob
 import argparse
 from utils import get_yml_content, dump_yml_content

-TRAINING_SERVICE_FILE = 'training_service.yml'
+TRAINING_SERVICE_FILE = os.path.join('config', 'training_service.yml')

 def update_training_service_config(args):
     config = get_yml_content(TRAINING_SERVICE_FILE)
@@ -86,18 +87,6 @@ def update_training_service_config(args):
     dump_yml_content(TRAINING_SERVICE_FILE, config)

-def convert_command():
-    '''convert command by platform'''
-    if sys.platform != 'win32':
-        return None
-    config_files = glob.glob('./**/*.yml') + glob.glob('./**/**/*.yml')
-    for config_file in config_files:
-        print('processing {}'.format(config_file))
-        yml_content = get_yml_content(config_file)
-        if yml_content.get('trial'):
-            if yml_content['trial'].get('command'):
-                yml_content['trial']['command'] = yml_content['trial']['command'].replace('python3', 'python')
-        dump_yml_content(config_file, yml_content)

 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
@@ -130,5 +119,3 @@ if __name__ == '__main__':
     args = parser.parse_args()

     update_training_service_config(args)
-    if args.ts == 'local':
-        convert_command()
@@ -3,6 +3,7 @@
 import sys
 import os.path as osp
+import argparse
 import json
 import subprocess
 import sys
@@ -12,17 +13,16 @@ import traceback
 from utils import is_experiment_done, get_experiment_id, get_nni_log_path, read_last_line, remove_files, setup_experiment, detect_port, snooze
 from utils import GREEN, RED, CLEAR, EXPERIMENT_URL

-def naive_test():
+NNI_SOURCE_DIR = '..'
+NAIVE_TEST_CONFIG_DIR = osp.join(NNI_SOURCE_DIR, 'test', 'config', 'naive_test')
+
+def naive_test(args):
     '''run naive integration test'''
     to_remove = ['tuner_search_space.json', 'tuner_result.txt', 'assessor_result.txt']
-    to_remove = list(map(lambda file: osp.join('naive_test', file), to_remove))
+    to_remove = list(map(lambda file: osp.join(NAIVE_TEST_CONFIG_DIR, file), to_remove))
     remove_files(to_remove)

-    if sys.platform == 'win32':
-        config_file = 'local_win32.yml'
-    else:
-        config_file = 'local.yml'
-    proc = subprocess.run(['nnictl', 'create', '--config', osp.join('naive_test', config_file)])
+    proc = subprocess.run(['nnictl', 'create', '--config', args.config])
     assert proc.returncode == 0, '`nnictl create` failed with code %d' % proc.returncode

     print('Spawning trials...')
@@ -33,8 +33,8 @@ def naive_test():
     for _ in range(120):
         time.sleep(1)
-        tuner_status = read_last_line(osp.join('naive_test', 'tuner_result.txt'))
-        assessor_status = read_last_line(osp.join('naive_test', 'assessor_result.txt'))
+        tuner_status = read_last_line(osp.join(NAIVE_TEST_CONFIG_DIR, 'tuner_result.txt'))
+        assessor_status = read_last_line(osp.join(NAIVE_TEST_CONFIG_DIR, 'assessor_result.txt'))
         experiment_status = is_experiment_done(nnimanager_log_path)

         assert tuner_status != 'ERROR', 'Tuner exited with error'
@@ -44,7 +44,7 @@ def naive_test():
             break

     if tuner_status is not None:
-        for line in open(osp.join('naive_test', 'tuner_result.txt')):
+        for line in open(osp.join(NAIVE_TEST_CONFIG_DIR, 'tuner_result.txt')):
             if line.strip() == 'ERROR':
                 break
             trial = int(line.split(' ')[0])
@@ -54,32 +54,33 @@ def naive_test():
     assert experiment_status, 'Failed to finish in 2 min'

-    ss1 = json.load(open(osp.join('naive_test', 'search_space.json')))
-    ss2 = json.load(open(osp.join('naive_test', 'tuner_search_space.json')))
+    ss1 = json.load(open(osp.join(NAIVE_TEST_CONFIG_DIR, 'search_space.json')))
+    ss2 = json.load(open(osp.join(NAIVE_TEST_CONFIG_DIR, 'tuner_search_space.json')))
     assert ss1 == ss2, 'Tuner got wrong search space'

-    tuner_result = set(open(osp.join('naive_test', 'tuner_result.txt')))
-    expected = set(open(osp.join('naive_test', 'expected_tuner_result.txt')))
+    tuner_result = set(open(osp.join(NAIVE_TEST_CONFIG_DIR, 'tuner_result.txt')))
+    expected = set(open(osp.join(NAIVE_TEST_CONFIG_DIR, 'expected_tuner_result.txt')))
     # Trials may complete before NNI gets assessor's result,
     # so it is possible to have more final result than expected
     print('Tuner result:', tuner_result)
     print('Expected tuner result:', expected)
     assert tuner_result.issuperset(expected), 'Bad tuner result'

-    assessor_result = set(open(osp.join('naive_test', 'assessor_result.txt')))
-    expected = set(open(osp.join('naive_test', 'expected_assessor_result.txt')))
+    assessor_result = set(open(osp.join(NAIVE_TEST_CONFIG_DIR, 'assessor_result.txt')))
+    expected = set(open(osp.join(NAIVE_TEST_CONFIG_DIR, 'expected_assessor_result.txt')))
     assert assessor_result == expected, 'Bad assessor result'

     subprocess.run(['nnictl', 'stop'])
     snooze()

-def stop_experiment_test():
+def stop_experiment_test(args):
+    config_file = args.config
     '''Test `nnictl stop` command, including `nnictl stop exp_id` and `nnictl stop all`.
     Simple `nnictl stop` is not tested here since it is used in all other test code'''
-    subprocess.run(['nnictl', 'create', '--config', osp.join('tuner_test', 'local.yml'), '--port', '8080'], check=True)
-    subprocess.run(['nnictl', 'create', '--config', osp.join('tuner_test', 'local.yml'), '--port', '8888'], check=True)
-    subprocess.run(['nnictl', 'create', '--config', osp.join('tuner_test', 'local.yml'), '--port', '8989'], check=True)
-    subprocess.run(['nnictl', 'create', '--config', osp.join('tuner_test', 'local.yml'), '--port', '8990'], check=True)
+    subprocess.run(['nnictl', 'create', '--config', config_file, '--port', '8080'], check=True)
+    subprocess.run(['nnictl', 'create', '--config', config_file, '--port', '8888'], check=True)
+    subprocess.run(['nnictl', 'create', '--config', config_file, '--port', '8989'], check=True)
+    subprocess.run(['nnictl', 'create', '--config', config_file, '--port', '8990'], check=True)

     # test cmd 'nnictl stop id`
     experiment_id = get_experiment_id(EXPERIMENT_URL)
@@ -102,11 +103,14 @@ def stop_experiment_test():
 if __name__ == '__main__':
-    installed = (sys.argv[-1] != '--preinstall')
-    setup_experiment(installed)
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--config", type=str, required=True)
+    parser.add_argument("--preinstall", action='store_true')
+    args = parser.parse_args()
+    setup_experiment(not args.preinstall)
     try:
-        naive_test()
-        stop_experiment_test()
+        naive_test(args)
+        stop_experiment_test(args)
         # TODO: check the output of rest server
         print(GREEN + 'PASS' + CLEAR)
     except Exception as error:
...
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import sys
import os
import argparse
import subprocess
import time
import datetime
import shlex
import traceback
import json
import ruamel.yaml as yaml
from utils import get_experiment_status, get_yml_content, dump_yml_content, get_experiment_id, \
parse_max_duration_time, get_trial_stats, deep_update, print_trial_job_log, get_failed_trial_jobs, \
get_experiment_dir, print_experiment_log
from utils import GREEN, RED, CLEAR, STATUS_URL, TRIAL_JOBS_URL, EXPERIMENT_URL, REST_ENDPOINT, detect_port
import validators
it_variables = {}
def update_training_service_config(config, training_service):
it_ts_config = get_yml_content(os.path.join('config', 'training_service.yml'))
# hack for kubeflow trial config
if training_service == 'kubeflow':
it_ts_config[training_service]['trial']['worker']['command'] = config['trial']['command']
config['trial'].pop('command')
if 'gpuNum' in config['trial']:
config['trial'].pop('gpuNum')
if training_service == 'frameworkcontroller':
it_ts_config[training_service]['trial']['taskRoles'][0]['command'] = config['trial']['command']
config['trial'].pop('command')
if 'gpuNum' in config['trial']:
config['trial'].pop('gpuNum')
deep_update(config, it_ts_config['all'])
deep_update(config, it_ts_config[training_service])
def prepare_config_file(test_case_config, it_config, args):
config_path = args.nni_source_dir + test_case_config['configFile']
test_yml_config = get_yml_content(config_path)
# apply test case specific config
if test_case_config.get('config') is not None:
deep_update(test_yml_config, test_case_config['config'])
# hack for windows
if sys.platform == 'win32' and args.ts == 'local':
test_yml_config['trial']['command'] = test_yml_config['trial']['command'].replace('python3', 'python')
# apply training service config
# the user's gpuNum and logCollection settings are overwritten by the config in training_service.yml
# the kubeflow hack must be applied as the last step
update_training_service_config(test_yml_config, args.ts)
# generate temporary config yml file to launch experiment
new_config_file = config_path + '.tmp'
dump_yml_content(new_config_file, test_yml_config)
print(yaml.dump(test_yml_config, default_flow_style=False), flush=True)
return new_config_file
def run_test_case(test_case_config, it_config, args):
new_config_file = prepare_config_file(test_case_config, it_config, args)
# set configFile variable
it_variables['$configFile'] = new_config_file
try:
launch_test(new_config_file, args.ts, test_case_config)
invoke_validator(test_case_config, args.nni_source_dir)
finally:
stop_command = get_command(test_case_config, 'stopCommand')
print('Stop command:', stop_command, flush=True)
if stop_command:
subprocess.run(shlex.split(stop_command))
# remove tmp config file
if os.path.exists(new_config_file):
os.remove(new_config_file)
def invoke_validator(test_case_config, nni_source_dir):
validator_config = test_case_config.get('validator')
if validator_config is None or validator_config.get('class') is None:
return
validator = validators.__dict__[validator_config.get('class')]()
kwargs = validator_config.get('kwargs', {})
print('kwargs:', kwargs)
validator(REST_ENDPOINT, get_experiment_dir(EXPERIMENT_URL), nni_source_dir, **kwargs)
def get_max_values(config_file):
experiment_config = get_yml_content(config_file)
return parse_max_duration_time(experiment_config['maxExecDuration']), experiment_config['maxTrialNum']
def get_command(test_case_config, commandKey):
command = test_case_config.get(commandKey)
if commandKey == 'launchCommand':
assert command is not None
if command is None:
return None
# replace variables
for k in it_variables:
command = command.replace(k, it_variables[k])
# hack for windows, not limited to local training service
if sys.platform == 'win32':
command = command.replace('python3', 'python')
return command
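# Editor's note (hypothetical values): the $-prefixed placeholders in it_variables are expanded by
# plain string replacement, so once launch_test records an experiment ID a stop command can use it:
#     it_variables['$expId'] = 'GxA4z9'
#     get_command({'stopCommand': 'nnictl stop $expId'}, 'stopCommand')  # -> 'nnictl stop GxA4z9'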
def launch_test(config_file, training_service, test_case_config):
launch_command = get_command(test_case_config, 'launchCommand')
print('launch command: ', launch_command, flush=True)
proc = subprocess.run(shlex.split(launch_command))
assert proc.returncode == 0, '`nnictl create` failed with code %d' % proc.returncode
# set experiment ID into variable
exp_var_name = test_case_config.get('setExperimentIdtoVar')
if exp_var_name is not None:
assert exp_var_name.startswith('$')
it_variables[exp_var_name] = get_experiment_id(EXPERIMENT_URL)
print('variables:', it_variables)
max_duration, max_trial_num = get_max_values(config_file)
print('max_duration:', max_duration, ' max_trial_num:', max_trial_num)
if not test_case_config.get('experimentStatusCheck'):
return
bg_time = time.time()
print(str(datetime.datetime.now()), ' waiting ...', flush=True)
while True:
time.sleep(3)
waited_time = time.time() - bg_time
if waited_time > max_duration + 10:
print('waited: {}, max_duration: {}'.format(waited_time, max_duration))
break
status = get_experiment_status(STATUS_URL)
if status in ['DONE', 'ERROR']:
print('experiment status:', status)
break
num_failed = len(get_failed_trial_jobs(TRIAL_JOBS_URL))
if num_failed > 0:
print('failed jobs: ', num_failed)
break
print(str(datetime.datetime.now()), ' waiting done', flush=True)
if get_experiment_status(STATUS_URL) == 'ERROR':
print_experiment_log(EXPERIMENT_URL)
trial_stats = get_trial_stats(TRIAL_JOBS_URL)
print(json.dumps(trial_stats, indent=4), flush=True)
if status != 'DONE' or trial_stats['SUCCEEDED'] + trial_stats['EARLY_STOPPED'] < max_trial_num:
print_trial_job_log(training_service, TRIAL_JOBS_URL)
raise AssertionError('Failed to finish in maxExecDuration')
def case_excluded(name, excludes):
if name is None:
return False
if excludes is not None:
excludes = excludes.split(',')
for e in excludes:
if name in e or e in name:
return True
return False
def case_included(name, cases):
assert cases is not None
for case in cases.split(','):
if case in name:
return True
return False
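# Editor's note (hypothetical case names): both helpers match substrings in either direction,
# so a short filter entry catches longer test names:
#     case_excluded('multi-phase-batch', 'multi-phase,smac')   -> True ('multi-phase' is in the name)
#     case_excluded('sklearn-regression', 'multi-phase,smac')  -> False
#     case_included('sklearn-regression', 'sklearn')           -> True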
def wait_for_port_available(port, timeout):
begin_time = time.time()
while True:
if not detect_port(port):
return
if time.time() - begin_time > timeout:
msg = 'port {} is not available in {} seconds.'.format(port, timeout)
raise RuntimeError(msg)
time.sleep(5)
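# Editor's note (assumption): detect_port from utils is taken to mean "something is already
# listening on this port". A minimal local sketch of such a probe, not the utils implementation:
import socket

def detect_port_sketch(port):
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        sock.settimeout(1)
        return sock.connect_ex(('127.0.0.1', port)) == 0  # 0 means the connection succeeded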
def match_platform(test_case_config):
return sys.platform in test_case_config['platform'].split(' ')
def run(args):
it_config = get_yml_content(args.config)
for test_case_config in it_config['testCases']:
name = test_case_config['name']
if case_excluded(name, args.exclude):
print('{} excluded'.format(name))
continue
if args.cases and not case_included(name, args.cases):
continue
# fill test case default config
for k in it_config['defaultTestCaseConfig']:
if k not in test_case_config:
test_case_config[k] = it_config['defaultTestCaseConfig'][k]
print(json.dumps(test_case_config, indent=4))
if not match_platform(test_case_config):
print('skipped {}, platform {} not match [{}]'.format(name, sys.platform, test_case_config['platform']))
continue
wait_for_port_available(8080, 30)
print('{}Testing: {}{}'.format(GREEN, name, CLEAR))
begin_time = time.time()
run_test_case(test_case_config, it_config, args)
print('{}Test {}: TEST PASS IN {} SECONDS{}'.format(GREEN, name, int(time.time()-begin_time), CLEAR), flush=True)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("--config", type=str, required=True)
parser.add_argument("--nni_source_dir", type=str, default='../')
parser.add_argument("--cases", type=str, default=None)
parser.add_argument("--exclude", type=str, default=None)
parser.add_argument("--ts", type=str, choices=['local', 'remote', 'pai', 'kubeflow', 'frameworkcontroller'], default='local')
args = parser.parse_args()
run(args)
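Taken together, run() and run_test_case() read a handful of keys from each test case entry in config/integration_tests.yml. The shape they imply looks roughly like the following, expressed here as a Python dict for brevity; only the keys are taken from the code above, the names and values are hypothetical:

# Hypothetical test case entry covering the keys consumed by run()/run_test_case().
example_test_case = {
    'name': 'sklearn-regression',
    'configFile': 'test/config/examples/sklearn-regression.yml',
    'config': {'maxTrialNum': 2, 'trialConcurrency': 2},   # optional per-case overrides
    'launchCommand': 'nnictl create --config $configFile',
    'stopCommand': 'nnictl stop',
    'experimentStatusCheck': True,
    'platform': 'linux darwin win32',
    'setExperimentIdtoVar': '$expId',                      # optional
    'validator': {'class': 'MetricsValidator',
                  'kwargs': {'expected_result_file': 'expected_metrics.json'}},
}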
@@ -17,11 +17,12 @@ GREEN = '\33[32m'
 RED = '\33[31m'
 CLEAR = '\33[0m'

-REST_ENDPOINT = 'http://localhost:8080/api/v1/nni'
-EXPERIMENT_URL = REST_ENDPOINT + '/experiment'
-STATUS_URL = REST_ENDPOINT + '/check-status'
-TRIAL_JOBS_URL = REST_ENDPOINT + '/trial-jobs'
-METRICS_URL = REST_ENDPOINT + '/metric-data'
+REST_ENDPOINT = 'http://localhost:8080'
+API_ROOT_URL = REST_ENDPOINT + '/api/v1/nni'
+EXPERIMENT_URL = API_ROOT_URL + '/experiment'
+STATUS_URL = API_ROOT_URL + '/check-status'
+TRIAL_JOBS_URL = API_ROOT_URL + '/trial-jobs'
+METRICS_URL = API_ROOT_URL + '/metric-data'

 def read_last_line(file_name):
     '''read last line of a file and return None if file not found'''
@@ -90,40 +91,45 @@ def get_experiment_status(status_url):
     nni_status = requests.get(status_url).json()
     return nni_status['status']

-def get_succeeded_trial_num(trial_jobs_url):
+def get_trial_stats(trial_jobs_url):
     trial_jobs = requests.get(trial_jobs_url).json()
-    num_succeed = 0
+    trial_stats = collections.defaultdict(int)
     for trial_job in trial_jobs:
-        if trial_job['status'] in ['SUCCEEDED', 'EARLY_STOPPED']:
-            num_succeed += 1
-    print('num_succeed:', num_succeed)
-    return num_succeed
+        trial_stats[trial_job['status']] += 1
+    return trial_stats

-def get_failed_trial_jobs(trial_jobs_url):
+def get_trial_jobs(trial_jobs_url, status=None):
     '''Return failed trial jobs'''
     trial_jobs = requests.get(trial_jobs_url).json()
-    failed_jobs = []
+    res = []
     for trial_job in trial_jobs:
-        if trial_job['status'] in ['FAILED']:
-            failed_jobs.append(trial_job)
-    return failed_jobs
+        if status is None or trial_job['status'] == status:
+            res.append(trial_job)
+    return res

+def get_failed_trial_jobs(trial_jobs_url):
+    '''Return failed trial jobs'''
+    return get_trial_jobs(trial_jobs_url, 'FAILED')
+
+def print_file_content(filepath):
+    with open(filepath, 'r') as f:
+        content = f.read()
+    print(filepath, flush=True)
+    print(content, flush=True)
+
-def print_failed_job_log(training_service, trial_jobs_url):
-    '''Print job log of FAILED trial jobs'''
-    trial_jobs = get_failed_trial_jobs(trial_jobs_url)
+def print_trial_job_log(training_service, trial_jobs_url):
+    trial_jobs = get_trial_jobs(trial_jobs_url)
     for trial_job in trial_jobs:
-        if training_service == 'local':
-            if sys.platform == "win32":
-                p = trial_job['stderrPath'].split(':')
-                log_filename = ':'.join([p[-2], p[-1]])
-            else:
-                log_filename = trial_job['stderrPath'].split(':')[-1]
-        else:
-            log_filename = os.path.join(get_experiment_dir(EXPERIMENT_URL), 'trials', trial_job['id'], 'stdout_log_collection.log')
-        with open(log_filename, 'r') as f:
-            log_content = f.read()
-        print(log_filename, flush=True)
-        print(log_content, flush=True)
+        trial_log_dir = os.path.join(get_experiment_dir(EXPERIMENT_URL), 'trials', trial_job['id'])
+        log_files = ['stderr', 'trial.log'] if training_service == 'local' else ['stdout_log_collection.log']
+        for log_file in log_files:
+            print_file_content(os.path.join(trial_log_dir, log_file))
+
+def print_experiment_log(experiment_url):
+    log_dir = get_nni_log_dir(experiment_url)
+    for log_file in ['dispatcher.log', 'nnimanager.log']:
+        filepath = os.path.join(log_dir, log_file)
+        print_file_content(filepath)

 def parse_max_duration_time(max_exec_duration):
     unit = max_exec_duration[-1]
...
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import os.path as osp
import json
import requests
import nnicli as nc
from utils import METRICS_URL
class ITValidator:
def __call__(self, rest_endpoint, experiment_dir, nni_source_dir, **kwargs):
pass
class MetricsValidator(ITValidator):
def __call__(self, rest_endpoint, experiment_dir, nni_source_dir, **kwargs):
self.check_metrics(nni_source_dir, **kwargs)
def check_metrics(self, nni_source_dir, **kwargs):
expected_result_file = kwargs.get('expected_result_file', 'expected_metrics.json')
with open(osp.join(nni_source_dir, 'test', 'config', 'metrics_test', expected_result_file), 'r') as f:
expected_metrics = json.load(f)
print('expected metrics:', expected_metrics)
metrics = requests.get(METRICS_URL).json()
print('RAW METRICS:', json.dumps(metrics, indent=4))
intermediate_result, final_result = self.get_metric_results(metrics)
assert intermediate_result and final_result
for trialjob_id in intermediate_result:
trial_final_result = final_result[trialjob_id]
trial_intermediate_result = intermediate_result[trialjob_id]
print('intermediate result:', trial_intermediate_result)
print('final result:', trial_final_result)
assert len(trial_final_result) == 1, 'there should be 1 final result'
assert trial_final_result[0] == expected_metrics['final_result']
# encode dict/number into json string to compare them in set
assert set([json.dumps(x) for x in trial_intermediate_result]) \
== set([json.dumps(x) for x in expected_metrics['intermediate_result']])
def get_metric_results(self, metrics):
intermediate_result = {}
final_result = {}
for metric in metrics:
# metric values are encoded by the NNI SDK as JSON strings,
# so we decode each value with json.loads twice
metric_value = json.loads(json.loads(metric['data']))
if metric['type'] == 'PERIODICAL':
if metric['trialJobId'] in intermediate_result:
intermediate_result[metric['trialJobId']].append(metric_value)
else:
intermediate_result[metric['trialJobId']] = [metric_value]
elif metric['type'] == 'FINAL':
if metric['trialJobId'] in final_result:
final_result[metric['trialJobId']].append(metric_value)
else:
final_result[metric['trialJobId']] = [metric_value]
return intermediate_result, final_result
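# Editor's note (hypothetical record): the double json.loads above is easiest to see on a concrete
# metric as returned by the REST API, where 'data' is a JSON string whose payload is itself
# JSON-encoded by the NNI SDK:
#     raw_metric = {'trialJobId': 'AbCdE', 'type': 'FINAL', 'data': '"0.981"'}
#     json.loads(raw_metric['data'])              # -> '0.981'  (still a string)
#     json.loads(json.loads(raw_metric['data']))  # -> 0.981    (the actual number)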
class NnicliValidator(ITValidator):
def __call__(self, rest_endpoint, experiment_dir, nni_source_dir, **kwargs):
print(rest_endpoint)
nc.set_endpoint(rest_endpoint)
#print(nc.version())
print(nc.get_job_statistics())
print(nc.get_experiment_status())
print(nc.list_trial_jobs())
from setuptools import setup, find_packages
setup(
name="nnitest",
version="0.0.1",
author = 'Microsoft NNI team',
author_email = 'nni@microsoft.com',
description = 'Neural Network Intelligence package',
license = 'MIT',
url = 'https://github.com/Microsoft/nni',
packages=find_packages('nnitest'),
long_description="",
classifiers = [
'Programming Language :: Python :: 3',
'License :: OSI Approved :: MIT License',
"Operating System :: OS Independent"
],
)
@@ -47,9 +47,9 @@ jobs:
       fi
       echo "TEST_IMG:$TEST_IMG"
       cd test
-      python3 generate_ts_config.py --ts frameworkcontroller --keyvault_vaultname $(keyVault_vaultName) --keyvault_name $(keyVault_name) \
+      python3 nni_test/nnitest/generate_ts_config.py --ts frameworkcontroller --keyvault_vaultname $(keyVault_vaultName) --keyvault_name $(keyVault_name) \
       --azs_account $(azureStorage_accountName) --azs_share $(azureStorage_azureShare) --nni_docker_image $TEST_IMG --nni_manager_ip $(nni_manager_ip)
-      cat training_service.yml
-      PATH=$HOME/.local/bin:$PATH python3 config_test.py --ts frameworkcontroller --exclude multi_phase
+      cat config/training_service.yml
+      PATH=$HOME/.local/bin:$PATH python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts frameworkcontroller --exclude multi-phase
    displayName: 'integration test'
@@ -47,9 +47,9 @@ jobs:
       fi
       echo "TEST_IMG:$TEST_IMG"
       cd test
-      python3 generate_ts_config.py --ts kubeflow --keyvault_vaultname $(keyVault_vaultName) --keyvault_name $(keyVault_name) \
+      python3 nni_test/nnitest/generate_ts_config.py --ts kubeflow --keyvault_vaultname $(keyVault_vaultName) --keyvault_name $(keyVault_name) \
       --azs_account $(azureStorage_accountName) --azs_share $(azureStorage_azureShare) --nni_docker_image $TEST_IMG --nni_manager_ip $(nni_manager_ip)
-      cat training_service.yml
-      PATH=$HOME/.local/bin:$PATH python3 config_test.py --ts kubeflow --exclude multi_phase
+      cat config/training_service.yml
+      PATH=$HOME/.local/bin:$PATH python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts kubeflow --exclude multi-phase
    displayName: 'integration test'
@@ -9,34 +9,14 @@ jobs:
   - script: |
       python -m pip install scikit-learn==0.20.0 --user
       python -m pip install keras==2.1.6 --user
-      python -m pip install torch===1.2.0 torchvision===0.4.1 -f https://download.pytorch.org/whl/torch_stable.html --user
+      python -m pip install torchvision===0.4.1 torch===1.3.1 -f https://download.pytorch.org/whl/torch_stable.html --user
       python -m pip install tensorflow-gpu==1.11.0 --user
    displayName: 'Install dependencies for integration tests'
  - script: |
      cd test
-      python generate_ts_config.py --ts local
-    displayName: 'generate config files'
-  - script: |
-      cd test
-      python config_test.py --ts local --local_gpu --exclude smac,bohb
-    displayName: 'Examples and advanced features tests on local machine'
-  - script: |
-      cd test
-      powershell.exe -file unittest.ps1
+      powershell.exe -file scripts/unittest.ps1
    displayName: 'unit test'
  - script: |
      cd test
-      python naive_test.py
-    displayName: 'Naive test'
-  - script: |
-      cd test
-      python tuner_test.py
-    displayName: 'Built-in tuners / assessors tests'
-  - script: |
-      cd test
-      python metrics_test.py
-    displayName: 'Trial job metrics test'
-  - script: |
-      cd test
-      PATH=$HOME/.local/bin:$PATH python3 cli_test.py
-    displayName: 'nnicli test'
+      python nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts local
+    displayName: 'Integration tests'