Unverified commit a8f86a78, authored by Yuge Zhang, committed by GitHub
Browse files

Refactor integration test (step 4) - refactor test file tree structure (#4895)

parent b0732e01
...@@ -13,7 +13,7 @@ assessor: ...@@ -13,7 +13,7 @@ assessor:
classArgs: classArgs:
optimize_mode: maximize optimize_mode: maximize
trial: trial:
codeDir: ../../../examples/trials/mnist-annotation codeDir: ../../../../examples/trials/mnist-annotation
command: python3 mnist.py --batch_num 100 command: python3 mnist.py --batch_num 100
gpuNum: 0 gpuNum: 0
......
...@@ -13,7 +13,7 @@ assessor: ...@@ -13,7 +13,7 @@ assessor:
classArgs: classArgs:
optimize_mode: maximize optimize_mode: maximize
trial: trial:
codeDir: ../../../examples/trials/mnist-annotation codeDir: ../../../../examples/trials/mnist-annotation
command: python3 mnist.py --batch_num 100 command: python3 mnist.py --batch_num 100
gpuNum: 0 gpuNum: 0
......
...@@ -7,8 +7,8 @@ import glob ...@@ -7,8 +7,8 @@ import glob
import argparse import argparse
from utils import get_yml_content, dump_yml_content from utils import get_yml_content, dump_yml_content
TRAINING_SERVICE_FILE = os.path.join('config', 'training_service.yml') TRAINING_SERVICE_FILE = os.path.join('training_service', 'config', 'training_service.yml')
TRAINING_SERVICE_FILE_V2 = os.path.join('config', 'training_service_v2.yml') TRAINING_SERVICE_FILE_V2 = os.path.join('training_service', 'config', 'training_service_v2.yml')
def update_training_service_config(args): def update_training_service_config(args):
config = get_yml_content(TRAINING_SERVICE_FILE) config = get_yml_content(TRAINING_SERVICE_FILE)
......
...@@ -14,7 +14,7 @@ from utils import is_experiment_done, get_experiment_id, get_nni_log_path, read_ ...@@ -14,7 +14,7 @@ from utils import is_experiment_done, get_experiment_id, get_nni_log_path, read_
from utils import GREEN, RED, CLEAR, EXPERIMENT_URL from utils import GREEN, RED, CLEAR, EXPERIMENT_URL
NNI_SOURCE_DIR = '..' NNI_SOURCE_DIR = '..'
NAIVE_TEST_CONFIG_DIR = osp.join(NNI_SOURCE_DIR, 'test', 'config', 'naive_test') NAIVE_TEST_CONFIG_DIR = osp.join(NNI_SOURCE_DIR, 'test', 'training_service', 'config', 'naive_test')
def naive_test(args): def naive_test(args):
'''run naive integration test''' '''run naive integration test'''
......
...@@ -24,7 +24,7 @@ it_variables = {} ...@@ -24,7 +24,7 @@ it_variables = {}
def update_training_service_config(config, training_service, config_file_path, nni_source_dir, reuse_mode='False'): def update_training_service_config(config, training_service, config_file_path, nni_source_dir, reuse_mode='False'):
it_ts_config = get_yml_content(os.path.join('config', 'training_service.yml')) it_ts_config = get_yml_content(os.path.join('training_service', 'config', 'training_service.yml'))
# hack for kubeflow trial config # hack for kubeflow trial config
if training_service == 'kubeflow' and reuse_mode == 'False': if training_service == 'kubeflow' and reuse_mode == 'False':
it_ts_config[training_service]['trial']['worker']['command'] = config['trial']['command'] it_ts_config[training_service]['trial']['worker']['command'] = config['trial']['command']
...@@ -32,7 +32,7 @@ def update_training_service_config(config, training_service, config_file_path, n ...@@ -32,7 +32,7 @@ def update_training_service_config(config, training_service, config_file_path, n
if 'gpuNum' in config['trial']: if 'gpuNum' in config['trial']:
config['trial'].pop('gpuNum') config['trial'].pop('gpuNum')
elif training_service == 'kubeflow' and reuse_mode == 'True': elif training_service == 'kubeflow' and reuse_mode == 'True':
it_ts_config = get_yml_content(os.path.join('config', 'training_service_v2.yml')) it_ts_config = get_yml_content(os.path.join('training_service', 'config', 'training_service_v2.yml'))
print(it_ts_config) print(it_ts_config)
it_ts_config[training_service]['trainingService']['worker']['command'] = config['trialCommand'] it_ts_config[training_service]['trainingService']['worker']['command'] = config['trialCommand']
it_ts_config[training_service]['trainingService']['worker']['code_directory'] = config['trialCodeDirectory'] it_ts_config[training_service]['trainingService']['worker']['code_directory'] = config['trialCodeDirectory']
...@@ -43,7 +43,7 @@ def update_training_service_config(config, training_service, config_file_path, n ...@@ -43,7 +43,7 @@ def update_training_service_config(config, training_service, config_file_path, n
if 'gpuNum' in config['trial']: if 'gpuNum' in config['trial']:
config['trial'].pop('gpuNum') config['trial'].pop('gpuNum')
elif training_service == 'frameworkcontroller' and reuse_mode == 'True': elif training_service == 'frameworkcontroller' and reuse_mode == 'True':
it_ts_config = get_yml_content(os.path.join('config', 'training_service_v2.yml')) it_ts_config = get_yml_content(os.path.join('training_service', 'config', 'training_service_v2.yml'))
it_ts_config[training_service]['trainingService']['taskRoles'][0]['command'] = config['trialCommand'] it_ts_config[training_service]['trainingService']['taskRoles'][0]['command'] = config['trialCommand']
if training_service == 'adl': if training_service == 'adl':
...@@ -53,7 +53,7 @@ def update_training_service_config(config, training_service, config_file_path, n ...@@ -53,7 +53,7 @@ def update_training_service_config(config, training_service, config_file_path, n
if config['trial']['codeDir'] == '.': if config['trial']['codeDir'] == '.':
containerCodeDir = '/' + config_file_path[:config_file_path.rfind('/')] containerCodeDir = '/' + config_file_path[:config_file_path.rfind('/')]
elif config['trial']['codeDir'] == '../naive_trial': elif config['trial']['codeDir'] == '../naive_trial':
containerCodeDir = '/test/config/naive_trial' containerCodeDir = '/test/training_service/config/naive_trial'
elif '../../../' in config['trial']['codeDir']: elif '../../../' in config['trial']['codeDir']:
# replace example folders to container folder # replace example folders to container folder
containerCodeDir = config['trial']['codeDir'].replace('../../../', '/') containerCodeDir = config['trial']['codeDir'].replace('../../../', '/')
...@@ -74,7 +74,7 @@ def update_training_service_config(config, training_service, config_file_path, n ...@@ -74,7 +74,7 @@ def update_training_service_config(config, training_service, config_file_path, n
it_ts_config[training_service].pop('sharedStorage') it_ts_config[training_service].pop('sharedStorage')
if training_service == 'hybrid': if training_service == 'hybrid':
it_ts_config = get_yml_content(os.path.join('config', 'training_service_v2.yml')) it_ts_config = get_yml_content(os.path.join('training_service', 'config', 'training_service_v2.yml'))
elif reuse_mode != 'True': elif reuse_mode != 'True':
deep_update(config, it_ts_config['all']) deep_update(config, it_ts_config['all'])
deep_update(config, it_ts_config[training_service]) deep_update(config, it_ts_config[training_service])
...@@ -89,7 +89,8 @@ def prepare_config_file(test_case_config, it_config, args): ...@@ -89,7 +89,8 @@ def prepare_config_file(test_case_config, it_config, args):
deep_update(test_yml_config, test_case_config['config']) deep_update(test_yml_config, test_case_config['config'])
# hack for windows # hack for windows
if sys.platform == 'win32' and args.ts == 'local': # We've only got windows local and remote win-to-win now.
if sys.platform == 'win32':
test_yml_config['trial']['command'] = test_yml_config['trial']['command'].replace('python3', 'python') test_yml_config['trial']['command'] = test_yml_config['trial']['command'].replace('python3', 'python')
# apply training service config # apply training service config
...@@ -259,7 +260,7 @@ def match_training_service(test_case_config, cur_training_service): ...@@ -259,7 +260,7 @@ def match_training_service(test_case_config, cur_training_service):
return False return False
def match_remoteConfig(test_case_config, nni_source_dir): def match_remoteConfig(test_case_config, nni_source_dir):
trainingservice_config = get_yml_content(os.path.join('config', 'training_service.yml')) trainingservice_config = get_yml_content(os.path.join('training_service', 'config', 'training_service.yml'))
trainingservice_config_reuse_value = str(trainingservice_config['remote']['remoteConfig']['reuse']).lower() trainingservice_config_reuse_value = str(trainingservice_config['remote']['remoteConfig']['reuse']).lower()
testcase_config = get_yml_content(nni_source_dir + test_case_config['configFile']) testcase_config = get_yml_content(nni_source_dir + test_case_config['configFile'])
if testcase_config.get('remoteConfig') is not None: if testcase_config.get('remoteConfig') is not None:
...@@ -270,9 +271,16 @@ def match_remoteConfig(test_case_config, nni_source_dir): ...@@ -270,9 +271,16 @@ def match_remoteConfig(test_case_config, nni_source_dir):
def run(args): def run(args):
it_config = get_yml_content(args.config) it_config = get_yml_content(args.config)
test_cases = it_config['testCases']
for test_case_config in it_config['testCases']: for test_case_id, test_case_config in enumerate(test_cases, start=1):
name = test_case_config['name'] name = test_case_config['name']
print(GREEN + '=' * 80 + CLEAR)
print('## {}Testing: {}{} ##'.format(GREEN, name, CLEAR))
# Print progress on devops
print(f'##vso[task.setprogress value={int(test_case_id / len(test_cases) * 100)};]{name}')
if case_excluded(name, args.exclude): if case_excluded(name, args.exclude):
print('{} excluded'.format(name)) print('{} excluded'.format(name))
continue continue
...@@ -294,24 +302,24 @@ def run(args): ...@@ -294,24 +302,24 @@ def run(args):
name, args.ts, test_case_config['trainingService'])) name, args.ts, test_case_config['trainingService']))
continue continue
if args.ts == 'remote':
if not match_remoteConfig(test_case_config, args.nni_source_dir):
print('skipped {}, remoteConfig not match.'.format(name))
continue
# remote mode need more time to cleanup # remote mode need more time to cleanup
if args.ts == 'remote' or args.ts == 'hybrid': if args.ts == 'remote' or args.ts == 'hybrid':
if args.ts == 'remote':
if not match_remoteConfig(test_case_config, args.nni_source_dir):
print('skipped {}, remoteConfig not match.'.format(name))
continue
wait_for_port_available(8080, 240) wait_for_port_available(8080, 240)
else: wait_for_port_available(8081, 240) # some training services need one more port to listen metrics
wait_for_port_available(8080, 60)
# adl mode need more time to cleanup PVC # adl mode need more time to cleanup PVC
if args.ts == 'adl' and name == 'nnictl-resume-2': if args.ts == 'adl' and name == 'nnictl-resume-2':
time.sleep(30) time.sleep(30)
print('## {}Testing: {}{} ##'.format(GREEN, name, CLEAR))
begin_time = time.time() begin_time = time.time()
run_test_case(test_case_config, it_config, args) run_test_case(test_case_config, it_config, args)
print('{}Test {}: TEST PASS IN {} SECONDS{}'.format(GREEN, name, int(time.time()-begin_time), CLEAR), flush=True) print('{}Test {}: TEST PASS IN {} SECONDS{}\n\n'.format(GREEN, name, int(time.time()-begin_time), CLEAR), flush=True)
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -11,6 +11,7 @@ import requests ...@@ -11,6 +11,7 @@ import requests
import time import time
import yaml import yaml
import shlex import shlex
import warnings
EXPERIMENT_DONE_SIGNAL = 'Experiment done' EXPERIMENT_DONE_SIGNAL = 'Experiment done'
...@@ -177,11 +178,11 @@ def detect_port(port): ...@@ -177,11 +178,11 @@ def detect_port(port):
def wait_for_port_available(port, timeout): def wait_for_port_available(port, timeout):
begin_time = time.time() for i in range(timeout):
while True:
if not detect_port(port): if not detect_port(port):
return return
if time.time() - begin_time > timeout: warnings.warn("Port isn't available in {} seconds (patience: {})".format(i, timeout), RuntimeWarning)
msg = 'port {} is not available in {} seconds.'.format(port, timeout)
raise RuntimeError(msg)
time.sleep(1) time.sleep(1)
msg = 'Port {} is not available in {} seconds. Maybe the previous experiment fails to stop?'.format(port, timeout)
raise RuntimeError(msg)
...@@ -51,7 +51,7 @@ class MetricsValidator(ITValidator): ...@@ -51,7 +51,7 @@ class MetricsValidator(ITValidator):
def check_metrics(self, nni_source_dir, **kwargs): def check_metrics(self, nni_source_dir, **kwargs):
expected_result_file = kwargs.get('expected_result_file', 'expected_metrics.json') expected_result_file = kwargs.get('expected_result_file', 'expected_metrics.json')
with open(osp.join(nni_source_dir, 'test', 'config', 'metrics_test', expected_result_file), 'r') as f: with open(osp.join(nni_source_dir, 'test', 'training_service', 'config', 'metrics_test', expected_result_file), 'r') as f:
expected_metrics = json.load(f) expected_metrics = json.load(f)
print('expected metrics:', expected_metrics) print('expected metrics:', expected_metrics)
metrics = requests.get(METRICS_URL).json() metrics = requests.get(METRICS_URL).json()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment