Unverified commit a8f86a78, authored by Yuge Zhang and committed by GitHub
Browse files

Refactor integration test (step 4) - refactor test file tree structure (#4895)

parent b0732e01
......@@ -13,7 +13,7 @@ assessor:
classArgs:
optimize_mode: maximize
trial:
codeDir: ../../../examples/trials/mnist-annotation
codeDir: ../../../../examples/trials/mnist-annotation
command: python3 mnist.py --batch_num 100
gpuNum: 0
......
......@@ -13,7 +13,7 @@ assessor:
classArgs:
optimize_mode: maximize
trial:
codeDir: ../../../examples/trials/mnist-annotation
codeDir: ../../../../examples/trials/mnist-annotation
command: python3 mnist.py --batch_num 100
gpuNum: 0
......
......@@ -7,8 +7,8 @@ import glob
import argparse
from utils import get_yml_content, dump_yml_content
TRAINING_SERVICE_FILE = os.path.join('config', 'training_service.yml')
TRAINING_SERVICE_FILE_V2 = os.path.join('config', 'training_service_v2.yml')
TRAINING_SERVICE_FILE = os.path.join('training_service', 'config', 'training_service.yml')
TRAINING_SERVICE_FILE_V2 = os.path.join('training_service', 'config', 'training_service_v2.yml')
def update_training_service_config(args):
config = get_yml_content(TRAINING_SERVICE_FILE)
......
......@@ -14,7 +14,7 @@ from utils import is_experiment_done, get_experiment_id, get_nni_log_path, read_
from utils import GREEN, RED, CLEAR, EXPERIMENT_URL
NNI_SOURCE_DIR = '..'
NAIVE_TEST_CONFIG_DIR = osp.join(NNI_SOURCE_DIR, 'test', 'config', 'naive_test')
NAIVE_TEST_CONFIG_DIR = osp.join(NNI_SOURCE_DIR, 'test', 'training_service', 'config', 'naive_test')
def naive_test(args):
'''run naive integration test'''
......
......@@ -24,7 +24,7 @@ it_variables = {}
def update_training_service_config(config, training_service, config_file_path, nni_source_dir, reuse_mode='False'):
it_ts_config = get_yml_content(os.path.join('config', 'training_service.yml'))
it_ts_config = get_yml_content(os.path.join('training_service', 'config', 'training_service.yml'))
# hack for kubeflow trial config
if training_service == 'kubeflow' and reuse_mode == 'False':
it_ts_config[training_service]['trial']['worker']['command'] = config['trial']['command']
......@@ -32,7 +32,7 @@ def update_training_service_config(config, training_service, config_file_path, n
if 'gpuNum' in config['trial']:
config['trial'].pop('gpuNum')
elif training_service == 'kubeflow' and reuse_mode == 'True':
it_ts_config = get_yml_content(os.path.join('config', 'training_service_v2.yml'))
it_ts_config = get_yml_content(os.path.join('training_service', 'config', 'training_service_v2.yml'))
print(it_ts_config)
it_ts_config[training_service]['trainingService']['worker']['command'] = config['trialCommand']
it_ts_config[training_service]['trainingService']['worker']['code_directory'] = config['trialCodeDirectory']
......@@ -43,7 +43,7 @@ def update_training_service_config(config, training_service, config_file_path, n
if 'gpuNum' in config['trial']:
config['trial'].pop('gpuNum')
elif training_service == 'frameworkcontroller' and reuse_mode == 'True':
it_ts_config = get_yml_content(os.path.join('config', 'training_service_v2.yml'))
it_ts_config = get_yml_content(os.path.join('training_service', 'config', 'training_service_v2.yml'))
it_ts_config[training_service]['trainingService']['taskRoles'][0]['command'] = config['trialCommand']
if training_service == 'adl':
......@@ -53,7 +53,7 @@ def update_training_service_config(config, training_service, config_file_path, n
if config['trial']['codeDir'] == '.':
containerCodeDir = '/' + config_file_path[:config_file_path.rfind('/')]
elif config['trial']['codeDir'] == '../naive_trial':
containerCodeDir = '/test/config/naive_trial'
containerCodeDir = '/test/training_service/config/naive_trial'
elif '../../../' in config['trial']['codeDir']:
# replace example folders to container folder
containerCodeDir = config['trial']['codeDir'].replace('../../../', '/')
......@@ -74,7 +74,7 @@ def update_training_service_config(config, training_service, config_file_path, n
it_ts_config[training_service].pop('sharedStorage')
if training_service == 'hybrid':
it_ts_config = get_yml_content(os.path.join('config', 'training_service_v2.yml'))
it_ts_config = get_yml_content(os.path.join('training_service', 'config', 'training_service_v2.yml'))
elif reuse_mode != 'True':
deep_update(config, it_ts_config['all'])
deep_update(config, it_ts_config[training_service])
......@@ -89,7 +89,8 @@ def prepare_config_file(test_case_config, it_config, args):
deep_update(test_yml_config, test_case_config['config'])
# hack for windows
if sys.platform == 'win32' and args.ts == 'local':
# We've only got windows local and remote win-to-win now.
if sys.platform == 'win32':
test_yml_config['trial']['command'] = test_yml_config['trial']['command'].replace('python3', 'python')
# apply training service config
......@@ -259,7 +260,7 @@ def match_training_service(test_case_config, cur_training_service):
return False
def match_remoteConfig(test_case_config, nni_source_dir):
trainingservice_config = get_yml_content(os.path.join('config', 'training_service.yml'))
trainingservice_config = get_yml_content(os.path.join('training_service', 'config', 'training_service.yml'))
trainingservice_config_reuse_value = str(trainingservice_config['remote']['remoteConfig']['reuse']).lower()
testcase_config = get_yml_content(nni_source_dir + test_case_config['configFile'])
if testcase_config.get('remoteConfig') is not None:
......@@ -270,9 +271,16 @@ def match_remoteConfig(test_case_config, nni_source_dir):
def run(args):
it_config = get_yml_content(args.config)
test_cases = it_config['testCases']
for test_case_config in it_config['testCases']:
for test_case_id, test_case_config in enumerate(test_cases, start=1):
name = test_case_config['name']
print(GREEN + '=' * 80 + CLEAR)
print('## {}Testing: {}{} ##'.format(GREEN, name, CLEAR))
# Print progress on devops
print(f'##vso[task.setprogress value={int(test_case_id / len(test_cases) * 100)};]{name}')
if case_excluded(name, args.exclude):
print('{} excluded'.format(name))
continue
......@@ -294,24 +302,24 @@ def run(args):
name, args.ts, test_case_config['trainingService']))
continue
if args.ts == 'remote':
if not match_remoteConfig(test_case_config, args.nni_source_dir):
print('skipped {}, remoteConfig not match.'.format(name))
continue
# remote mode need more time to cleanup
if args.ts == 'remote' or args.ts == 'hybrid':
if args.ts == 'remote':
if not match_remoteConfig(test_case_config, args.nni_source_dir):
print('skipped {}, remoteConfig not match.'.format(name))
continue
wait_for_port_available(8080, 240)
else:
wait_for_port_available(8080, 60)
wait_for_port_available(8081, 240) # some training services need one more port to listen metrics
# adl mode need more time to cleanup PVC
if args.ts == 'adl' and name == 'nnictl-resume-2':
time.sleep(30)
print('## {}Testing: {}{} ##'.format(GREEN, name, CLEAR))
begin_time = time.time()
run_test_case(test_case_config, it_config, args)
print('{}Test {}: TEST PASS IN {} SECONDS{}'.format(GREEN, name, int(time.time()-begin_time), CLEAR), flush=True)
print('{}Test {}: TEST PASS IN {} SECONDS{}\n\n'.format(GREEN, name, int(time.time()-begin_time), CLEAR), flush=True)
if __name__ == '__main__':
......
......@@ -11,6 +11,7 @@ import requests
import time
import yaml
import shlex
import warnings
EXPERIMENT_DONE_SIGNAL = 'Experiment done'
......@@ -177,11 +178,11 @@ def detect_port(port):
def wait_for_port_available(port, timeout):
begin_time = time.time()
while True:
for i in range(timeout):
if not detect_port(port):
return
if time.time() - begin_time > timeout:
msg = 'port {} is not available in {} seconds.'.format(port, timeout)
raise RuntimeError(msg)
warnings.warn("Port isn't available in {} seconds (patience: {})".format(i, timeout), RuntimeWarning)
time.sleep(1)
msg = 'Port {} is not available in {} seconds. Maybe the previous experiment fails to stop?'.format(port, timeout)
raise RuntimeError(msg)
......@@ -51,7 +51,7 @@ class MetricsValidator(ITValidator):
def check_metrics(self, nni_source_dir, **kwargs):
expected_result_file = kwargs.get('expected_result_file', 'expected_metrics.json')
with open(osp.join(nni_source_dir, 'test', 'config', 'metrics_test', expected_result_file), 'r') as f:
with open(osp.join(nni_source_dir, 'test', 'training_service', 'config', 'metrics_test', expected_result_file), 'r') as f:
expected_metrics = json.load(f)
print('expected metrics:', expected_metrics)
metrics = requests.get(METRICS_URL).json()
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment