Unverified commit 5f8ffcd5 authored by chicm-ms, committed by GitHub

Integration test azure pipelines for PAI/kubeflow training service (#768)

Integration test pipeline for the PAI/kubeflow training services. It builds an nni.it docker image and runs the integration tests on PAI/kubeflow with the built image.
parent b2cdc30d
......@@ -26,12 +26,11 @@ import time
import traceback
from utils import setup_experiment, get_experiment_status, get_yml_content, dump_yml_content, \
-    parse_max_duration_time, get_succeeded_trial_num, print_stderr
+    parse_max_duration_time, get_succeeded_trial_num, print_stderr, deep_update
from utils import GREEN, RED, CLEAR, STATUS_URL, TRIAL_JOBS_URL

def gen_new_config(config_file, training_service='local'):
    '''
    Generates temporary config file for integration test, the file
    should be deleted after testing.
    '''
......@@ -41,7 +40,15 @@ def gen_new_config(config_file, training_service='local'):
    ts = get_yml_content('training_service.yml')[training_service]
    print(config)
    print(ts)
-    config.update(ts)
+    # hack for kubeflow trial config
+    if training_service == 'kubeflow':
+        ts['trial']['worker']['command'] = config['trial']['command']
+        config['trial'].pop('command')
+        if 'gpuNum' in config['trial']:
+            config['trial'].pop('gpuNum')
+    deep_update(config, ts)
    print(config)

    dump_yml_content(new_config_file, config)
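To make the kubeflow special case above concrete, here is a minimal sketch of the transformation (all field values are hypothetical; deep_update is the helper added to utils.py later in this diff). A generic test config declares its command at trial.command, while the kubeflow schema nests it under trial.worker, so the command is moved into the training-service overrides before the deep merge:

    # Hypothetical inputs, for illustration only.
    config = {'trial': {'command': 'python3 mnist.py', 'gpuNum': 1, 'codeDir': '.'}}
    ts = {'trial': {'worker': {'replicas': 1, 'command': None}},
          'trainingServicePlatform': 'kubeflow'}

    ts['trial']['worker']['command'] = config['trial']['command']
    config['trial'].pop('command')
    config['trial'].pop('gpuNum')
    deep_update(config, ts)
    # config == {'trial': {'codeDir': '.',
    #                      'worker': {'replicas': 1, 'command': 'python3 mnist.py'}},
    #            'trainingServicePlatform': 'kubeflow'}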
......@@ -61,7 +68,7 @@ def run_test(config_file, training_service, local_gpu=False):
    proc = subprocess.run(['nnictl', 'create', '--config', new_config_file])
    assert proc.returncode == 0, '`nnictl create` failed with code %d' % proc.returncode

-    max_duration, max_trial_num = get_max_values(config_file)
+    max_duration, max_trial_num = get_max_values(new_config_file)
    sleep_interval = 3

    for _ in range(0, max_duration+30, sleep_interval):
......@@ -90,6 +97,12 @@ def run(args):
        config_files = glob.glob('./config_test/**/*.test.yml')
    else:
        config_files = args.config.split(',')
+    if args.exclude is not None:
+        exclude_paths = args.exclude.split(',')
+        if exclude_paths:
+            for exclude_path in exclude_paths:
+                config_files = [x for x in config_files if exclude_path not in x]
    print(config_files)

    for config_file in config_files:
......@@ -107,11 +120,10 @@ def run(args):
    subprocess.run(['nnictl', 'stop'])

if __name__ == '__main__':
-    import tensorflow as tf
-    print('TF VERSION:', tf.__version__)
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", type=str, default=None)
-    parser.add_argument("--ts", type=str, choices=['local', 'remote', 'pai'], default='local')
+    parser.add_argument("--exclude", type=str, default=None)
+    parser.add_argument("--ts", type=str, choices=['local', 'remote', 'pai', 'kubeflow'], default='local')
    parser.add_argument("--local_gpu", action='store_true')
    parser.add_argument("--preinstall", action='store_true')
    args = parser.parse_args()
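With the new --exclude flag, an invocation such as `python3 config_test.py --ts kubeflow --exclude multi_phase,smac` (the second value is only an illustration) drops every discovered config path containing any of the comma-separated substrings; the pipeline jobs below use it to skip the multi_phase tests.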
......
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge,
# to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
import argparse
from utils import get_yml_content, dump_yml_content
TRAINING_SERVICE_FILE = 'training_service.yml'
def update_training_service_config(args):
    config = get_yml_content(TRAINING_SERVICE_FILE)
    if args.nni_manager_ip is not None:
        config[args.ts]['nniManagerIp'] = args.nni_manager_ip
    if args.ts == 'pai':
        if args.pai_user is not None:
            config[args.ts]['paiConfig']['userName'] = args.pai_user
        if args.pai_pwd is not None:
            config[args.ts]['paiConfig']['passWord'] = args.pai_pwd
        if args.pai_host is not None:
            config[args.ts]['paiConfig']['host'] = args.pai_host
        if args.nni_docker_image is not None:
            config[args.ts]['trial']['image'] = args.nni_docker_image
        if args.data_dir is not None:
            config[args.ts]['trial']['dataDir'] = args.data_dir
        if args.output_dir is not None:
            config[args.ts]['trial']['outputDir'] = args.output_dir
    elif args.ts == 'kubeflow':
        if args.nfs_server is not None:
            config[args.ts]['kubeflowConfig']['nfs']['server'] = args.nfs_server
        if args.nfs_path is not None:
            config[args.ts]['kubeflowConfig']['nfs']['path'] = args.nfs_path
        if args.keyvault_vaultname is not None:
            config[args.ts]['kubeflowConfig']['keyVault']['vaultName'] = args.keyvault_vaultname
        if args.keyvault_name is not None:
            config[args.ts]['kubeflowConfig']['keyVault']['name'] = args.keyvault_name
        if args.azs_account is not None:
            config[args.ts]['kubeflowConfig']['azureStorage']['accountName'] = args.azs_account
        if args.azs_share is not None:
            config[args.ts]['kubeflowConfig']['azureStorage']['azureShare'] = args.azs_share
        if args.nni_docker_image is not None:
            config[args.ts]['trial']['worker']['image'] = args.nni_docker_image
    dump_yml_content(TRAINING_SERVICE_FILE, config)

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--ts", type=str, choices=['pai', 'kubeflow'], default='pai')
    parser.add_argument("--nni_docker_image", type=str)
    parser.add_argument("--nni_manager_ip", type=str)
    # args for PAI
    parser.add_argument("--pai_user", type=str)
    parser.add_argument("--pai_pwd", type=str)
    parser.add_argument("--pai_host", type=str)
    parser.add_argument("--data_dir", type=str)
    parser.add_argument("--output_dir", type=str)
    # args for kubeflow
    parser.add_argument("--nfs_server", type=str)
    parser.add_argument("--nfs_path", type=str)
    parser.add_argument("--keyvault_vaultname", type=str)
    parser.add_argument("--keyvault_name", type=str)
    parser.add_argument("--azs_account", type=str)
    parser.add_argument("--azs_share", type=str)
    args = parser.parse_args()

    update_training_service_config(args)
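This script is driven entirely by CLI flags and is invoked by the pipeline jobs below; for example (values hypothetical), `python3 generate_ts_config.py --ts pai --pai_user someuser --pai_pwd somepwd --pai_host http://10.0.0.1 --nni_docker_image msranni/nni.it.pai:latest` would fill the corresponding blank fields of the training_service.yml template in place.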
jobs:
- job: 'integration_test_kubeflow'
  pool: 'NNI CI KUBE CLI'
  variables:
    new_docker_img: msranni/nni.it.kb:latest
  steps:
  - script: python3 -m pip install --upgrade pip setuptools --user
    displayName: 'Install python tools'
  - script: |
      cd deployment/pypi
      echo 'building prerelease package...'
      make build
      ls $(Build.SourcesDirectory)/deployment/pypi/dist/
    condition: eq( variables['build_docker_img'], 'true' )
    displayName: 'build nni bdist_wheel'
  - script: |
      cd deployment/pypi
      docker login -u $(docker_hub_user) -p $(docker_hub_pwd)
      echo 'updating docker file for installing nni from local...'
      # update Dockerfile to install NNI in docker image from whl file built in last step
      sed -ie 's/RUN python3 -m pip --no-cache-dir install nni/COPY .\/dist\/* .\nRUN python3 -m pip install nni-*.whl/' ../docker/Dockerfile
      cat ../docker/Dockerfile
      echo $IMG_TAG
      docker build -f ../docker/Dockerfile -t $(new_docker_img) .
      docker push $(new_docker_img)
    condition: eq( variables['build_docker_img'], 'true' )
    displayName: 'build and upload nni docker image'
  - script: |
      source install.sh
    displayName: 'Install nni toolkit via source code'
  - script: |
      if [ $(build_docker_img) = 'true' ]
      then
          export TEST_IMG=$(new_docker_img)
      else
          export TEST_IMG=$(existing_docker_img)
      fi
      echo "TEST_IMG:$TEST_IMG"
      cd test
      python3 generate_ts_config.py --ts kubeflow --keyvault_vaultname $(keyVault_vaultName) --keyvault_name $(keyVault_name) \
          --azs_account $(azureStorage_accountName) --azs_share $(azureStorage_azureShare) --nni_docker_image $TEST_IMG --nni_manager_ip $(nni_manager_ip)
      cat training_service.yml
      PATH=$HOME/.local/bin:$PATH python3 config_test.py --ts kubeflow --exclude multi_phase
    displayName: 'integration test'
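Note that this job and the PAI job below branch on the pipeline variable build_docker_img. When it is 'true', the wheel is built, sed rewrites the Dockerfile's `RUN python3 -m pip --no-cache-dir install nni` line to copy and install the local wheel instead, and the freshly built new_docker_img is pushed and used for trials; otherwise the job falls back to the pre-existing existing_docker_img.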
jobs:
- job: 'integration_test_pai'
  pool: 'NNI CI PAI CLI'
  variables:
    new_docker_img: msranni/nni.it.pai:latest
  steps:
  - script: python3 -m pip install --upgrade pip setuptools --user
    displayName: 'Install python tools'
  - script: |
      cd deployment/pypi
      echo 'building prerelease package...'
      make build
      ls $(Build.SourcesDirectory)/deployment/pypi/dist/
    condition: eq( variables['build_docker_img'], 'true' )
    displayName: 'build nni bdist_wheel'
  - script: |
      cd deployment/pypi
      docker login -u $(docker_hub_user) -p $(docker_hub_pwd)
      echo 'updating docker file for installing nni from local...'
      # update Dockerfile to install NNI in docker image from whl file built in last step
      sed -ie 's/RUN python3 -m pip --no-cache-dir install nni/COPY .\/dist\/* .\nRUN python3 -m pip install nni-*.whl/' ../docker/Dockerfile
      cat ../docker/Dockerfile
      echo $IMG_TAG
      docker build -f ../docker/Dockerfile -t $(new_docker_img) .
      docker push $(new_docker_img)
    condition: eq( variables['build_docker_img'], 'true' )
    displayName: 'build and upload nni docker image'
  - script: |
      source install.sh
    displayName: 'Install nni toolkit via source code'
  - script: |
      if [ $(build_docker_img) = 'true' ]
      then
          export TEST_IMG=$(new_docker_img)
      else
          export TEST_IMG=$(existing_docker_img)
      fi
      echo "TEST_IMG:$TEST_IMG"
      cd test
      python3 generate_ts_config.py --ts pai --pai_host $(pai_host) --pai_user $(pai_user) --pai_pwd $(pai_pwd) \
          --nni_docker_image $TEST_IMG --data_dir $(data_dir) --output_dir $(output_dir) --nni_manager_ip $(nni_manager_ip)
      PATH=$HOME/.local/bin:$PATH python3 config_test.py --ts pai --exclude multi_phase
    displayName: 'integration test'
kubeflow:
  maxExecDuration: 15m
  nniManagerIp:
  kubeflowConfig:
    operator: tf-operator
    apiVersion: v1alpha2
    storage: azureStorage
    keyVault:
      vaultName:
      name:
    azureStorage:
      accountName:
      azureShare:
  trial:
    worker:
      replicas: 1
      command:
      gpuNum: 1
      cpuNum: 1
      memoryMB: 8192
      image:
  trainingServicePlatform: kubeflow
local:
  trainingServicePlatform: local
pai:
  trainingServicePlatform: pai
  nniManagerIp:
  maxExecDuration: 15m
  paiConfig:
    userName:
    passWord:
    host:
  trial:
    gpuNum: 1
    cpuNum: 1
    dataDir:
    image:
    memoryMB: 8192
    outputDir:
remote:
  trainingServicePlatform: remote
  machineList:
    - ip:
      port:
      username:
      passwd:
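The blank values above are deliberate placeholders: generate_ts_config.py (earlier in this diff) fills in credentials, storage settings, and the docker image at pipeline run time, and gen_new_config in config_test.py then deep-merges the block for the chosen training service into each *.test.yml config.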
......@@ -19,6 +19,7 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
import contextlib
+import collections
import json
import os
import subprocess
......@@ -118,3 +119,16 @@ def parse_max_duration_time(max_exec_duration):
    time = max_exec_duration[:-1]
    units_dict = {'s':1, 'm':60, 'h':3600, 'd':86400}
    return int(time) * units_dict[unit]
+def deep_update(source, overrides):
+    """Update a nested dictionary or similar mapping.
+
+    Modify ``source`` in place.
+    """
+    for key, value in overrides.items():
+        if isinstance(value, collections.Mapping) and value:
+            returned = deep_update(source.get(key, {}), value)
+            source[key] = returned
+        else:
+            source[key] = overrides[key]
+    return source
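A quick sketch of why the deep merge matters (inputs are hypothetical): dict.update would replace the entire nested trial mapping with the one from the overrides, while deep_update merges leaf keys recursively. Note also that collections.Mapping is a deprecated alias that Python 3.10+ provides only as collections.abc.Mapping.

    # Hypothetical inputs, for illustration only.
    config = {'trial': {'command': 'python3 mnist.py', 'codeDir': '.'}}
    ts = {'trial': {'memoryMB': 8192}, 'trainingServicePlatform': 'pai'}

    deep_update(config, ts)
    # config == {'trial': {'command': 'python3 mnist.py', 'codeDir': '.', 'memoryMB': 8192},
    #            'trainingServicePlatform': 'pai'}

    config.update(ts)  # a shallow update, by contrast, discards the original trial keys:
    # config['trial'] == {'memoryMB': 8192}

    # For reference, parse_max_duration_time('15m') above returns 15 * 60 == 900 seconds.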