"git@developer.sourcefind.cn:gaoqiong/migraphx.git" did not exist on "f0eb4ebf29badee723a1a3a5bda4b49a9bc70556"
Commit c9720e95 authored by chicm-ms, committed by GitHub

Refactor integration tests (#2190)

parent d5e6af27
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import os
import argparse
import glob
import subprocess
import time
import traceback
import json
from utils import setup_experiment, get_experiment_status, get_yml_content, dump_yml_content, \
parse_max_duration_time, get_succeeded_trial_num, deep_update, print_failed_job_log, get_failed_trial_jobs
from utils import GREEN, RED, CLEAR, STATUS_URL, TRIAL_JOBS_URL
def gen_new_config(config_file, training_service='local'):
'''
Generate a temporary config file for the integration test;
the temporary file is deleted after the test finishes.
'''
config = get_yml_content(config_file)
new_config_file = config_file + '.tmp'
it_config = get_yml_content('training_service.yml')
# hack for kubeflow trial config
if training_service == 'kubeflow':
it_config[training_service]['trial']['worker']['command'] = config['trial']['command']
config['trial'].pop('command')
if 'gpuNum' in config['trial']:
config['trial'].pop('gpuNum')
if training_service == 'frameworkcontroller':
it_config[training_service]['trial']['taskRoles'][0]['command'] = config['trial']['command']
config['trial'].pop('command')
if 'gpuNum' in config['trial']:
config['trial'].pop('gpuNum')
deep_update(config, it_config['all'])
deep_update(config, it_config[training_service])
dump_yml_content(new_config_file, config)
return new_config_file, config
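# Editor's note (assumption, for illustration): deep_update from utils is taken to be a recursive
# dict merge, so keys from training_service.yml override matching trial-config keys without
# discarding untouched siblings. A minimal sketch of that assumed behavior, not the utils code:
def deep_update_sketch(base, override):
    for key, value in override.items():
        if isinstance(value, dict) and isinstance(base.get(key), dict):
            deep_update_sketch(base[key], value)  # recurse into nested dicts
        else:
            base[key] = value  # scalars and lists from the override win
    return base
# deep_update_sketch({'trial': {'command': 'python3 mnist.py', 'gpuNum': 1}},
#                    {'trial': {'gpuNum': 0}, 'nniManagerIp': '10.1.1.1'})
# -> {'trial': {'command': 'python3 mnist.py', 'gpuNum': 0}, 'nniManagerIp': '10.1.1.1'}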
def run_test(config_file, training_service, local_gpu=False):
'''run test per configuration file'''
new_config_file, config = gen_new_config(config_file, training_service)
print(json.dumps(config, sort_keys=True, indent=4))
if training_service == 'local' and not local_gpu and config['trial']['gpuNum'] > 0:
print('no gpu, skipping: ', config_file)
return
try:
proc = subprocess.run(['nnictl', 'create', '--config', new_config_file])
assert proc.returncode == 0, '`nnictl create` failed with code %d' % proc.returncode
max_duration, max_trial_num = get_max_values(new_config_file)
sleep_interval = 3
for _ in range(0, max_duration+30, sleep_interval):
time.sleep(sleep_interval)
status = get_experiment_status(STATUS_URL)
if status in ['DONE', 'ERROR'] or get_failed_trial_jobs(TRIAL_JOBS_URL):
break
print_failed_job_log(config['trainingServicePlatform'], TRIAL_JOBS_URL)
if status != 'DONE' or get_succeeded_trial_num(TRIAL_JOBS_URL) < max_trial_num:
raise AssertionError('Failed to finish in maxExecDuration')
finally:
if os.path.exists(new_config_file):
os.remove(new_config_file)
def get_max_values(config_file):
'''Get maxExecDuration and maxTrialNum of experiment'''
experiment_config = get_yml_content(config_file)
return parse_max_duration_time(experiment_config['maxExecDuration']), experiment_config['maxTrialNum']
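# Editor's note (assumption): parse_max_duration_time converts a maxExecDuration string such as
# '1h' into seconds (the utils diff below shows it reading the trailing unit character).
# A hypothetical stand-in for illustration:
def parse_max_duration_time_sketch(max_exec_duration):
    unit = max_exec_duration[-1]
    seconds_per_unit = {'s': 1, 'm': 60, 'h': 3600, 'd': 86400}
    return int(max_exec_duration[:-1]) * seconds_per_unit[unit]
# parse_max_duration_time_sketch('1h') == 3600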
def run(args):
'''test all configuration files'''
if args.config is None:
config_files = glob.glob('./config_test/**/*.test.yml')
else:
config_files = args.config.split(',')
if args.exclude is not None:
exclude_paths = args.exclude.split(',')
if exclude_paths:
for exclude_path in exclude_paths:
config_files = [x for x in config_files if exclude_path not in x]
print(config_files)
for config_file in config_files:
try:
# sleep 5 seconds so the previously stopped experiment has enough time to exit and release its port
time.sleep(5)
print(GREEN + 'Testing:' + config_file + CLEAR)
begin_time = time.time()
run_test(config_file, args.ts, args.local_gpu)
print(GREEN + 'Test %s: TEST PASS IN %d mins' % (config_file, (time.time() - begin_time)/60) + CLEAR)
except Exception as error:
print(RED + 'Test %s: TEST FAIL' % (config_file) + CLEAR)
print('%r' % error)
traceback.print_exc()
raise error
finally:
subprocess.run(['nnictl', 'stop'])
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("--config", type=str, default=None)
parser.add_argument("--exclude", type=str, default=None)
parser.add_argument("--ts", type=str, choices=['local', 'remote', 'pai', 'paiYarn', 'kubeflow', 'frameworkcontroller'], default='local')
parser.add_argument("--local_gpu", action='store_true')
parser.add_argument("--preinstall", action='store_true')
args = parser.parse_args()
setup_experiment(args.preinstall)
run(args)
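Usage note (illustrative, assuming this driver is saved as config_test.py, the name referenced by the old CI pipeline further below): it is launched from the test directory with the argparse flags defined above, for example:

import subprocess
# Run the local config tests roughly the way the CI pipeline used to invoke this driver.
subprocess.run(['python', 'config_test.py', '--ts', 'local', '--local_gpu', '--exclude', 'smac,bohb'], check=True)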
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import sys
import os.path as osp
import subprocess
import time
import traceback
import json
import requests
from utils import get_experiment_status, get_yml_content, parse_max_duration_time, get_succeeded_trial_num, print_failed_job_log
from utils import GREEN, RED, CLEAR, STATUS_URL, TRIAL_JOBS_URL, METRICS_URL
def run_test():
'''run metrics test'''
if sys.platform == 'win32':
config_file = osp.join('metrics_test', 'metrics_win32.test.yml')
else:
config_file = osp.join('metrics_test', 'metrics.test.yml')
print('Testing %s...' % config_file)
proc = subprocess.run(['nnictl', 'create', '--config', config_file])
assert proc.returncode == 0, '`nnictl create` failed with code %d' % proc.returncode
max_duration, max_trial_num = get_max_values(config_file)
sleep_interval = 3
for _ in range(0, max_duration, sleep_interval):
time.sleep(sleep_interval)
status = get_experiment_status(STATUS_URL)
#print('experiment status:', status)
if status == 'DONE':
num_succeeded = get_succeeded_trial_num(TRIAL_JOBS_URL)
print_failed_job_log('local', TRIAL_JOBS_URL)
if sys.platform == "win32":
time.sleep(sleep_interval) # Windows seems to have some issues with updating trial status in time
assert num_succeeded == max_trial_num, 'only %d succeeded trial jobs, there should be %d' % (num_succeeded, max_trial_num)
check_metrics()
break
assert status == 'DONE', 'Failed to finish in maxExecDuration'
def check_metrics():
with open(osp.join('metrics_test', 'expected_metrics.json'), 'r') as f:
expected_metrics = json.load(f)
print(expected_metrics)
metrics = requests.get(METRICS_URL).json()
intermediate_result, final_result = get_metric_results(metrics)
assert len(final_result) == 1, 'there should be 1 final result'
assert final_result[0] == expected_metrics['final_result']
assert set(intermediate_result) == set(expected_metrics['intermediate_result'])
def get_metric_results(metrics):
intermediate_result = []
final_result = []
for metric in metrics:
if metric['type'] == 'PERIODICAL':
intermediate_result.append(json.loads(metric['data']))
elif metric['type'] == 'FINAL':
final_result.append(json.loads(metric['data']))
print(intermediate_result, final_result)
return [round(float(x),6) for x in intermediate_result], [round(float(x), 6) for x in final_result]
def get_max_values(config_file):
experiment_config = get_yml_content(config_file)
return parse_max_duration_time(experiment_config['maxExecDuration']), experiment_config['maxTrialNum']
if __name__ == '__main__':
try:
# sleep 5 seconds so the previously stopped experiment has enough time to exit and release its port
time.sleep(5)
run_test()
print(GREEN + 'TEST PASS' + CLEAR)
except Exception as error:
print(RED + 'TEST FAIL' + CLEAR)
print('%r' % error)
traceback.print_exc()
raise error
finally:
subprocess.run(['nnictl', 'stop'])
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import time
import nni
if __name__ == '__main__':
nni.get_next_parameter()
time.sleep(1)
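# The loop below alternates prints with and without a trailing newline on purpose, so the test
# also covers metric reporting when trial stdout does not end at a line boundary.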
for i in range(10):
if i % 2 == 0:
print('report intermediate result without end of line.', end='')
else:
print('report intermediate result.')
nni.report_intermediate_result(0.1*(i+1))
time.sleep(2)
print('test final metrics not at line start.', end='')
nni.report_final_result(1.0)
print('done')
authorName: nni
experimentName: naive
trialConcurrency: 3
maxExecDuration: 1h
maxTrialNum: 10
#choice: local, remote
trainingServicePlatform: local
searchSpacePath: search_space.json
#choice: true, false
useAnnotation: false
tuner:
codeDir: .
classFileName: naive_tuner.py
className: NaiveTuner
classArgs:
optimize_mode: maximize
assessor:
codeDir: .
classFileName: naive_assessor.py
className: NaiveAssessor
classArgs:
optimize_mode: maximize
trial:
command: python naive_trial.py
codeDir: .
gpuNum: 0
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import subprocess
import argparse
import time
import shlex
import signal
def test_foreground(args):
launch_command = 'nnictl create --config {} --foreground'.format(args.config)
print('nnictl foreground launch command: ', launch_command, flush=True)
proc = subprocess.Popen(shlex.split(launch_command))
time.sleep(args.timeout)
proc.send_signal(signal.SIGINT)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("--config", type=str, required=True)
parser.add_argument("--timeout", type=int, default=45)
args = parser.parse_args()
test_foreground(args)
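test_foreground only sends SIGINT and returns. If the test should also verify that foreground mode exits cleanly after Ctrl-C, a follow-up check along these lines could be added (a sketch under that assumption, not part of the original change):

import subprocess

def check_foreground_exit(proc, grace_period=30):
    '''Sketch: wait for the foreground nnictl process to exit after SIGINT; fail loudly if it hangs.'''
    try:
        returncode = proc.wait(timeout=grace_period)
    except subprocess.TimeoutExpired:
        proc.kill()
        raise AssertionError('foreground process did not exit within %d seconds' % grace_period)
    print('foreground process exited with code', returncode, flush=True)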
@@ -2,11 +2,12 @@
 # Licensed under the MIT license.

 import sys
+import os
 import glob
 import argparse
 from utils import get_yml_content, dump_yml_content

-TRAINING_SERVICE_FILE = 'training_service.yml'
+TRAINING_SERVICE_FILE = os.path.join('config', 'training_service.yml')

 def update_training_service_config(args):
     config = get_yml_content(TRAINING_SERVICE_FILE)
@@ -86,18 +87,6 @@ def update_training_service_config(args):
     dump_yml_content(TRAINING_SERVICE_FILE, config)

-def convert_command():
-    '''convert command by platform'''
-    if sys.platform != 'win32':
-        return None
-    config_files = glob.glob('./**/*.yml') + glob.glob('./**/**/*.yml')
-    for config_file in config_files:
-        print('processing {}'.format(config_file))
-        yml_content = get_yml_content(config_file)
-        if yml_content.get('trial'):
-            if yml_content['trial'].get('command'):
-                yml_content['trial']['command'] = yml_content['trial']['command'].replace('python3', 'python')
-        dump_yml_content(config_file, yml_content)

 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
@@ -130,5 +119,3 @@ if __name__ == '__main__':
     args = parser.parse_args()

     update_training_service_config(args)
-    if args.ts == 'local':
-        convert_command()
@@ -3,6 +3,7 @@
 import sys
 import os.path as osp
+import argparse
 import json
 import subprocess
 import sys
@@ -12,17 +13,16 @@ import traceback
 from utils import is_experiment_done, get_experiment_id, get_nni_log_path, read_last_line, remove_files, setup_experiment, detect_port, snooze
 from utils import GREEN, RED, CLEAR, EXPERIMENT_URL

-def naive_test():
+NNI_SOURCE_DIR = '..'
+NAIVE_TEST_CONFIG_DIR = osp.join(NNI_SOURCE_DIR, 'test', 'config', 'naive_test')
+
+def naive_test(args):
     '''run naive integration test'''
     to_remove = ['tuner_search_space.json', 'tuner_result.txt', 'assessor_result.txt']
-    to_remove = list(map(lambda file: osp.join('naive_test', file), to_remove))
+    to_remove = list(map(lambda file: osp.join(NAIVE_TEST_CONFIG_DIR, file), to_remove))
     remove_files(to_remove)

-    if sys.platform == 'win32':
-        config_file = 'local_win32.yml'
-    else:
-        config_file = 'local.yml'
-    proc = subprocess.run(['nnictl', 'create', '--config', osp.join('naive_test', config_file)])
+    proc = subprocess.run(['nnictl', 'create', '--config', args.config])
     assert proc.returncode == 0, '`nnictl create` failed with code %d' % proc.returncode

     print('Spawning trials...')
@@ -33,8 +33,8 @@ def naive_test():
     for _ in range(120):
         time.sleep(1)
-        tuner_status = read_last_line(osp.join('naive_test', 'tuner_result.txt'))
-        assessor_status = read_last_line(osp.join('naive_test', 'assessor_result.txt'))
+        tuner_status = read_last_line(osp.join(NAIVE_TEST_CONFIG_DIR, 'tuner_result.txt'))
+        assessor_status = read_last_line(osp.join(NAIVE_TEST_CONFIG_DIR, 'assessor_result.txt'))
         experiment_status = is_experiment_done(nnimanager_log_path)

         assert tuner_status != 'ERROR', 'Tuner exited with error'
@@ -44,7 +44,7 @@ def naive_test():
             break

     if tuner_status is not None:
-        for line in open(osp.join('naive_test', 'tuner_result.txt')):
+        for line in open(osp.join(NAIVE_TEST_CONFIG_DIR, 'tuner_result.txt')):
             if line.strip() == 'ERROR':
                 break
             trial = int(line.split(' ')[0])
@@ -54,32 +54,33 @@ def naive_test():
     assert experiment_status, 'Failed to finish in 2 min'

-    ss1 = json.load(open(osp.join('naive_test', 'search_space.json')))
-    ss2 = json.load(open(osp.join('naive_test', 'tuner_search_space.json')))
+    ss1 = json.load(open(osp.join(NAIVE_TEST_CONFIG_DIR, 'search_space.json')))
+    ss2 = json.load(open(osp.join(NAIVE_TEST_CONFIG_DIR, 'tuner_search_space.json')))
     assert ss1 == ss2, 'Tuner got wrong search space'

-    tuner_result = set(open(osp.join('naive_test', 'tuner_result.txt')))
-    expected = set(open(osp.join('naive_test', 'expected_tuner_result.txt')))
+    tuner_result = set(open(osp.join(NAIVE_TEST_CONFIG_DIR, 'tuner_result.txt')))
+    expected = set(open(osp.join(NAIVE_TEST_CONFIG_DIR, 'expected_tuner_result.txt')))
     # Trials may complete before NNI gets assessor's result,
     # so it is possible to have more final result than expected
     print('Tuner result:', tuner_result)
     print('Expected tuner result:', expected)
     assert tuner_result.issuperset(expected), 'Bad tuner result'

-    assessor_result = set(open(osp.join('naive_test', 'assessor_result.txt')))
-    expected = set(open(osp.join('naive_test', 'expected_assessor_result.txt')))
+    assessor_result = set(open(osp.join(NAIVE_TEST_CONFIG_DIR, 'assessor_result.txt')))
+    expected = set(open(osp.join(NAIVE_TEST_CONFIG_DIR, 'expected_assessor_result.txt')))
     assert assessor_result == expected, 'Bad assessor result'

     subprocess.run(['nnictl', 'stop'])
     snooze()

-def stop_experiment_test():
+def stop_experiment_test(args):
+    config_file = args.config
     '''Test `nnictl stop` command, including `nnictl stop exp_id` and `nnictl stop all`.
     Simple `nnictl stop` is not tested here since it is used in all other test code'''
-    subprocess.run(['nnictl', 'create', '--config', osp.join('tuner_test', 'local.yml'), '--port', '8080'], check=True)
-    subprocess.run(['nnictl', 'create', '--config', osp.join('tuner_test', 'local.yml'), '--port', '8888'], check=True)
-    subprocess.run(['nnictl', 'create', '--config', osp.join('tuner_test', 'local.yml'), '--port', '8989'], check=True)
-    subprocess.run(['nnictl', 'create', '--config', osp.join('tuner_test', 'local.yml'), '--port', '8990'], check=True)
+    subprocess.run(['nnictl', 'create', '--config', config_file, '--port', '8080'], check=True)
+    subprocess.run(['nnictl', 'create', '--config', config_file, '--port', '8888'], check=True)
+    subprocess.run(['nnictl', 'create', '--config', config_file, '--port', '8989'], check=True)
+    subprocess.run(['nnictl', 'create', '--config', config_file, '--port', '8990'], check=True)

     # test cmd 'nnictl stop id`
     experiment_id = get_experiment_id(EXPERIMENT_URL)
@@ -102,11 +103,14 @@ def stop_experiment_test():
 if __name__ == '__main__':
-    installed = (sys.argv[-1] != '--preinstall')
-    setup_experiment(installed)
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--config", type=str, required=True)
+    parser.add_argument("--preinstall", action='store_true')
+    args = parser.parse_args()
+    setup_experiment(not args.preinstall)
     try:
-        naive_test()
-        stop_experiment_test()
+        naive_test(args)
+        stop_experiment_test(args)
         # TODO: check the output of rest server
         print(GREEN + 'PASS' + CLEAR)
     except Exception as error:
...
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import sys
import os
import argparse
import subprocess
import time
import datetime
import shlex
import traceback
import json
import ruamel.yaml as yaml
from utils import get_experiment_status, get_yml_content, dump_yml_content, get_experiment_id, \
parse_max_duration_time, get_trial_stats, deep_update, print_trial_job_log, get_failed_trial_jobs, \
get_experiment_dir, print_experiment_log
from utils import GREEN, RED, CLEAR, STATUS_URL, TRIAL_JOBS_URL, EXPERIMENT_URL, REST_ENDPOINT, detect_port
import validators
it_variables = {}
def update_training_service_config(config, training_service):
it_ts_config = get_yml_content(os.path.join('config', 'training_service.yml'))
# hack for kubeflow trial config
if training_service == 'kubeflow':
it_ts_config[training_service]['trial']['worker']['command'] = config['trial']['command']
config['trial'].pop('command')
if 'gpuNum' in config['trial']:
config['trial'].pop('gpuNum')
if training_service == 'frameworkcontroller':
it_ts_config[training_service]['trial']['taskRoles'][0]['command'] = config['trial']['command']
config['trial'].pop('command')
if 'gpuNum' in config['trial']:
config['trial'].pop('gpuNum')
deep_update(config, it_ts_config['all'])
deep_update(config, it_ts_config[training_service])
def prepare_config_file(test_case_config, it_config, args):
config_path = args.nni_source_dir + test_case_config['configFile']
test_yml_config = get_yml_content(config_path)
# apply test case specific config
if test_case_config.get('config') is not None:
deep_update(test_yml_config, test_case_config['config'])
# hack for windows
if sys.platform == 'win32' and args.ts == 'local':
test_yml_config['trial']['command'] = test_yml_config['trial']['command'].replace('python3', 'python')
# apply training service config
# the user's gpuNum and logCollection settings are overwritten by the config in training_service.yml
# the kubeflow hack must be applied as the last step
update_training_service_config(test_yml_config, args.ts)
# generate temporary config yml file to launch experiment
new_config_file = config_path + '.tmp'
dump_yml_content(new_config_file, test_yml_config)
print(yaml.dump(test_yml_config, default_flow_style=False), flush=True)
return new_config_file
def run_test_case(test_case_config, it_config, args):
new_config_file = prepare_config_file(test_case_config, it_config, args)
# set configFile variable
it_variables['$configFile'] = new_config_file
try:
launch_test(new_config_file, args.ts, test_case_config)
invoke_validator(test_case_config, args.nni_source_dir)
finally:
stop_command = get_command(test_case_config, 'stopCommand')
print('Stop command:', stop_command, flush=True)
if stop_command:
subprocess.run(shlex.split(stop_command))
# remove tmp config file
if os.path.exists(new_config_file):
os.remove(new_config_file)
def invoke_validator(test_case_config, nni_source_dir):
validator_config = test_case_config.get('validator')
if validator_config is None or validator_config.get('class') is None:
return
validator = validators.__dict__[validator_config.get('class')]()
kwargs = validator_config.get('kwargs', {})
print('kwargs:', kwargs)
validator(REST_ENDPOINT, get_experiment_dir(EXPERIMENT_URL), nni_source_dir, **kwargs)
def get_max_values(config_file):
experiment_config = get_yml_content(config_file)
return parse_max_duration_time(experiment_config['maxExecDuration']), experiment_config['maxTrialNum']
def get_command(test_case_config, commandKey):
command = test_case_config.get(commandKey)
if commandKey == 'launchCommand':
assert command is not None
if command is None:
return None
# replace variables
for k in it_variables:
command = command.replace(k, it_variables[k])
# hack for windows, not limited to local training service
if sys.platform == 'win32':
command = command.replace('python3', 'python')
return command
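# Editor's note (hypothetical values): the $-prefixed placeholders in it_variables are expanded by
# plain string replacement, so once launch_test records an experiment ID a stop command can use it:
#     it_variables['$expId'] = 'GxA4z9'
#     get_command({'stopCommand': 'nnictl stop $expId'}, 'stopCommand')  # -> 'nnictl stop GxA4z9'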
def launch_test(config_file, training_service, test_case_config):
launch_command = get_command(test_case_config, 'launchCommand')
print('launch command: ', launch_command, flush=True)
proc = subprocess.run(shlex.split(launch_command))
assert proc.returncode == 0, '`nnictl create` failed with code %d' % proc.returncode
# set experiment ID into variable
exp_var_name = test_case_config.get('setExperimentIdtoVar')
if exp_var_name is not None:
assert exp_var_name.startswith('$')
it_variables[exp_var_name] = get_experiment_id(EXPERIMENT_URL)
print('variables:', it_variables)
max_duration, max_trial_num = get_max_values(config_file)
print('max_duration:', max_duration, ' max_trial_num:', max_trial_num)
if not test_case_config.get('experimentStatusCheck'):
return
bg_time = time.time()
print(str(datetime.datetime.now()), ' waiting ...', flush=True)
while True:
time.sleep(3)
waited_time = time.time() - bg_time
if waited_time > max_duration + 10:
print('waited: {}, max_duration: {}'.format(waited_time, max_duration))
break
status = get_experiment_status(STATUS_URL)
if status in ['DONE', 'ERROR']:
print('experiment status:', status)
break
num_failed = len(get_failed_trial_jobs(TRIAL_JOBS_URL))
if num_failed > 0:
print('failed jobs: ', num_failed)
break
print(str(datetime.datetime.now()), ' waiting done', flush=True)
if get_experiment_status(STATUS_URL) == 'ERROR':
print_experiment_log(EXPERIMENT_URL)
trial_stats = get_trial_stats(TRIAL_JOBS_URL)
print(json.dumps(trial_stats, indent=4), flush=True)
if status != 'DONE' or trial_stats['SUCCEEDED'] + trial_stats['EARLY_STOPPED'] < max_trial_num:
print_trial_job_log(training_service, TRIAL_JOBS_URL)
raise AssertionError('Failed to finish in maxExecDuration')
def case_excluded(name, excludes):
if name is None:
return False
if excludes is not None:
excludes = excludes.split(',')
for e in excludes:
if name in e or e in name:
return True
return False
def case_included(name, cases):
assert cases is not None
for case in cases.split(','):
if case in name:
return True
return False
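# Editor's note (hypothetical case names): both helpers match substrings in either direction,
# so a short filter entry catches longer test names:
#     case_excluded('multi-phase-batch', 'multi-phase,smac')   -> True ('multi-phase' is in the name)
#     case_excluded('sklearn-regression', 'multi-phase,smac')  -> False
#     case_included('sklearn-regression', 'sklearn')           -> True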
def wait_for_port_available(port, timeout):
begin_time = time.time()
while True:
if not detect_port(port):
return
if time.time() - begin_time > timeout:
msg = 'port {} is not available in {} seconds.'.format(port, timeout)
raise RuntimeError(msg)
time.sleep(5)
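# Editor's note (assumption): detect_port from utils is taken to mean "something is already
# listening on this port". A minimal local sketch of such a probe, not the utils implementation:
import socket

def detect_port_sketch(port):
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        sock.settimeout(1)
        return sock.connect_ex(('127.0.0.1', port)) == 0  # 0 means the connection succeeded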
def match_platform(test_case_config):
return sys.platform in test_case_config['platform'].split(' ')
def run(args):
it_config = get_yml_content(args.config)
for test_case_config in it_config['testCases']:
name = test_case_config['name']
if case_excluded(name, args.exclude):
print('{} excluded'.format(name))
continue
if args.cases and not case_included(name, args.cases):
continue
# fill test case default config
for k in it_config['defaultTestCaseConfig']:
if k not in test_case_config:
test_case_config[k] = it_config['defaultTestCaseConfig'][k]
print(json.dumps(test_case_config, indent=4))
if not match_platform(test_case_config):
print('skipped {}, platform {} not match [{}]'.format(name, sys.platform, test_case_config['platform']))
continue
wait_for_port_available(8080, 30)
print('{}Testing: {}{}'.format(GREEN, name, CLEAR))
begin_time = time.time()
run_test_case(test_case_config, it_config, args)
print('{}Test {}: TEST PASS IN {} SECONDS{}'.format(GREEN, name, int(time.time()-begin_time), CLEAR), flush=True)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("--config", type=str, required=True)
parser.add_argument("--nni_source_dir", type=str, default='../')
parser.add_argument("--cases", type=str, default=None)
parser.add_argument("--exclude", type=str, default=None)
parser.add_argument("--ts", type=str, choices=['local', 'remote', 'pai', 'kubeflow', 'frameworkcontroller'], default='local')
args = parser.parse_args()
run(args)
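Taken together, run() and run_test_case() read a handful of keys from each test case entry in config/integration_tests.yml. The shape they imply looks roughly like the following, expressed here as a Python dict for brevity; only the keys are taken from the code above, the names and values are hypothetical:

# Hypothetical test case entry covering the keys consumed by run()/run_test_case().
example_test_case = {
    'name': 'sklearn-regression',
    'configFile': 'test/config/examples/sklearn-regression.yml',
    'config': {'maxTrialNum': 2, 'trialConcurrency': 2},   # optional per-case overrides
    'launchCommand': 'nnictl create --config $configFile',
    'stopCommand': 'nnictl stop',
    'experimentStatusCheck': True,
    'platform': 'linux darwin win32',
    'setExperimentIdtoVar': '$expId',                      # optional
    'validator': {'class': 'MetricsValidator',
                  'kwargs': {'expected_result_file': 'expected_metrics.json'}},
}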
@@ -17,11 +17,12 @@ GREEN = '\33[32m'
 RED = '\33[31m'
 CLEAR = '\33[0m'

-REST_ENDPOINT = 'http://localhost:8080/api/v1/nni'
-EXPERIMENT_URL = REST_ENDPOINT + '/experiment'
-STATUS_URL = REST_ENDPOINT + '/check-status'
-TRIAL_JOBS_URL = REST_ENDPOINT + '/trial-jobs'
-METRICS_URL = REST_ENDPOINT + '/metric-data'
+REST_ENDPOINT = 'http://localhost:8080'
+API_ROOT_URL = REST_ENDPOINT + '/api/v1/nni'
+EXPERIMENT_URL = API_ROOT_URL + '/experiment'
+STATUS_URL = API_ROOT_URL + '/check-status'
+TRIAL_JOBS_URL = API_ROOT_URL + '/trial-jobs'
+METRICS_URL = API_ROOT_URL + '/metric-data'

 def read_last_line(file_name):
     '''read last line of a file and return None if file not found'''
@@ -90,40 +91,45 @@ def get_experiment_status(status_url):
     nni_status = requests.get(status_url).json()
     return nni_status['status']

-def get_succeeded_trial_num(trial_jobs_url):
+def get_trial_stats(trial_jobs_url):
     trial_jobs = requests.get(trial_jobs_url).json()
-    num_succeed = 0
+    trial_stats = collections.defaultdict(int)
     for trial_job in trial_jobs:
-        if trial_job['status'] in ['SUCCEEDED', 'EARLY_STOPPED']:
-            num_succeed += 1
-    print('num_succeed:', num_succeed)
-    return num_succeed
+        trial_stats[trial_job['status']] += 1
+    return trial_stats

-def get_failed_trial_jobs(trial_jobs_url):
+def get_trial_jobs(trial_jobs_url, status=None):
     '''Return failed trial jobs'''
     trial_jobs = requests.get(trial_jobs_url).json()
-    failed_jobs = []
+    res = []
     for trial_job in trial_jobs:
-        if trial_job['status'] in ['FAILED']:
-            failed_jobs.append(trial_job)
-    return failed_jobs
+        if status is None or trial_job['status'] == status:
+            res.append(trial_job)
+    return res

+def get_failed_trial_jobs(trial_jobs_url):
+    '''Return failed trial jobs'''
+    return get_trial_jobs(trial_jobs_url, 'FAILED')
+
+def print_file_content(filepath):
+    with open(filepath, 'r') as f:
+        content = f.read()
+    print(filepath, flush=True)
+    print(content, flush=True)
+
-def print_failed_job_log(training_service, trial_jobs_url):
-    '''Print job log of FAILED trial jobs'''
-    trial_jobs = get_failed_trial_jobs(trial_jobs_url)
+def print_trial_job_log(training_service, trial_jobs_url):
+    trial_jobs = get_trial_jobs(trial_jobs_url)
     for trial_job in trial_jobs:
-        if training_service == 'local':
-            if sys.platform == "win32":
-                p = trial_job['stderrPath'].split(':')
-                log_filename = ':'.join([p[-2], p[-1]])
-            else:
-                log_filename = trial_job['stderrPath'].split(':')[-1]
-        else:
-            log_filename = os.path.join(get_experiment_dir(EXPERIMENT_URL), 'trials', trial_job['id'], 'stdout_log_collection.log')
-        with open(log_filename, 'r') as f:
-            log_content = f.read()
-        print(log_filename, flush=True)
-        print(log_content, flush=True)
+        trial_log_dir = os.path.join(get_experiment_dir(EXPERIMENT_URL), 'trials', trial_job['id'])
+        log_files = ['stderr', 'trial.log'] if training_service == 'local' else ['stdout_log_collection.log']
+        for log_file in log_files:
+            print_file_content(os.path.join(trial_log_dir, log_file))
+
+def print_experiment_log(experiment_url):
+    log_dir = get_nni_log_dir(experiment_url)
+    for log_file in ['dispatcher.log', 'nnimanager.log']:
+        filepath = os.path.join(log_dir, log_file)
+        print_file_content(filepath)

 def parse_max_duration_time(max_exec_duration):
     unit = max_exec_duration[-1]
...
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import os.path as osp
import json
import requests
import nnicli as nc
from utils import METRICS_URL
class ITValidator:
def __call__(self, rest_endpoint, experiment_dir, nni_source_dir, **kwargs):
pass
class MetricsValidator(ITValidator):
def __call__(self, rest_endpoint, experiment_dir, nni_source_dir, **kwargs):
self.check_metrics(nni_source_dir, **kwargs)
def check_metrics(self, nni_source_dir, **kwargs):
expected_result_file = kwargs.get('expected_result_file', 'expected_metrics.json')
with open(osp.join(nni_source_dir, 'test', 'config', 'metrics_test', expected_result_file), 'r') as f:
expected_metrics = json.load(f)
print('expected metrics:', expected_metrics)
metrics = requests.get(METRICS_URL).json()
print('RAW METRICS:', json.dumps(metrics, indent=4))
intermediate_result, final_result = self.get_metric_results(metrics)
assert intermediate_result and final_result
for trialjob_id in intermediate_result:
trial_final_result = final_result[trialjob_id]
trial_intermediate_result = intermediate_result[trialjob_id]
print('intermediate result:', trial_intermediate_result)
print('final result:', trial_final_result)
assert len(trial_final_result) == 1, 'there should be 1 final result'
assert trial_final_result[0] == expected_metrics['final_result']
# encode dict/number into json string to compare them in set
assert set([json.dumps(x) for x in trial_intermediate_result]) \
== set([json.dumps(x) for x in expected_metrics['intermediate_result']])
def get_metric_results(self, metrics):
intermediate_result = {}
final_result = {}
for metric in metrics:
# metric values are encoded by the NNI SDK as JSON strings,
# so we decode each value with json.loads twice
metric_value = json.loads(json.loads(metric['data']))
if metric['type'] == 'PERIODICAL':
if metric['trialJobId'] in intermediate_result:
intermediate_result[metric['trialJobId']].append(metric_value)
else:
intermediate_result[metric['trialJobId']] = [metric_value]
elif metric['type'] == 'FINAL':
if metric['trialJobId'] in final_result:
final_result[metric['trialJobId']].append(metric_value)
else:
final_result[metric['trialJobId']] = [metric_value]
return intermediate_result, final_result
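# Editor's note (hypothetical record): the double json.loads above is easiest to see on a concrete
# metric as returned by the REST API, where 'data' is a JSON string whose payload is itself
# JSON-encoded by the NNI SDK:
#     raw_metric = {'trialJobId': 'AbCdE', 'type': 'FINAL', 'data': '"0.981"'}
#     json.loads(raw_metric['data'])              # -> '0.981'  (still a string)
#     json.loads(json.loads(raw_metric['data']))  # -> 0.981    (the actual number)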
class NnicliValidator(ITValidator):
def __call__(self, rest_endpoint, experiment_dir, nni_source_dir, **kwargs):
print(rest_endpoint)
nc.set_endpoint(rest_endpoint)
#print(nc.version())
print(nc.get_job_statistics())
print(nc.get_experiment_status())
print(nc.list_trial_jobs())
from setuptools import setup, find_packages
setup(
name="nnitest",
version="0.0.1",
author = 'Microsoft NNI team',
author_email = 'nni@microsoft.com',
description = 'Neural Network Intelligence package',
license = 'MIT',
url = 'https://github.com/Microsoft/nni',
packages=find_packages('nnitest'),
long_description="",
classifiers = [
'Programming Language :: Python :: 3',
'License :: OSI Approved :: MIT License',
"Operating System :: OS Independent"
],
)
@@ -47,9 +47,9 @@ jobs:
       fi
       echo "TEST_IMG:$TEST_IMG"
       cd test
-      python3 generate_ts_config.py --ts frameworkcontroller --keyvault_vaultname $(keyVault_vaultName) --keyvault_name $(keyVault_name) \
+      python3 nni_test/nnitest/generate_ts_config.py --ts frameworkcontroller --keyvault_vaultname $(keyVault_vaultName) --keyvault_name $(keyVault_name) \
       --azs_account $(azureStorage_accountName) --azs_share $(azureStorage_azureShare) --nni_docker_image $TEST_IMG --nni_manager_ip $(nni_manager_ip)
-      cat training_service.yml
-      PATH=$HOME/.local/bin:$PATH python3 config_test.py --ts frameworkcontroller --exclude multi_phase
+      cat config/training_service.yml
+      PATH=$HOME/.local/bin:$PATH python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts frameworkcontroller --exclude multi-phase
    displayName: 'integration test'
@@ -47,9 +47,9 @@ jobs:
       fi
       echo "TEST_IMG:$TEST_IMG"
       cd test
-      python3 generate_ts_config.py --ts kubeflow --keyvault_vaultname $(keyVault_vaultName) --keyvault_name $(keyVault_name) \
+      python3 nni_test/nnitest/generate_ts_config.py --ts kubeflow --keyvault_vaultname $(keyVault_vaultName) --keyvault_name $(keyVault_name) \
       --azs_account $(azureStorage_accountName) --azs_share $(azureStorage_azureShare) --nni_docker_image $TEST_IMG --nni_manager_ip $(nni_manager_ip)
-      cat training_service.yml
-      PATH=$HOME/.local/bin:$PATH python3 config_test.py --ts kubeflow --exclude multi_phase
+      cat config/training_service.yml
+      PATH=$HOME/.local/bin:$PATH python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts kubeflow --exclude multi-phase
    displayName: 'integration test'
@@ -9,34 +9,14 @@ jobs:
   - script: |
       python -m pip install scikit-learn==0.20.0 --user
       python -m pip install keras==2.1.6 --user
-      python -m pip install torch===1.2.0 torchvision===0.4.1 -f https://download.pytorch.org/whl/torch_stable.html --user
+      python -m pip install torchvision===0.4.1 torch===1.3.1 -f https://download.pytorch.org/whl/torch_stable.html --user
       python -m pip install tensorflow-gpu==1.11.0 --user
    displayName: 'Install dependencies for integration tests'
  - script: |
      cd test
-      python generate_ts_config.py --ts local
-    displayName: 'generate config files'
-  - script: |
-      cd test
-      python config_test.py --ts local --local_gpu --exclude smac,bohb
-    displayName: 'Examples and advanced features tests on local machine'
-  - script: |
-      cd test
-      powershell.exe -file unittest.ps1
+      powershell.exe -file scripts/unittest.ps1
    displayName: 'unit test'
  - script: |
      cd test
-      python naive_test.py
-    displayName: 'Naive test'
-  - script: |
-      cd test
-      python tuner_test.py
-    displayName: 'Built-in tuners / assessors tests'
-  - script: |
-      cd test
-      python metrics_test.py
-    displayName: 'Trial job metrics test'
-  - script: |
-      cd test
-      PATH=$HOME/.local/bin:$PATH python3 cli_test.py
-    displayName: 'nnicli test'
+      python nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts local
+    displayName: 'Integration tests'