Unverified commit 5f8ffcd5 authored by chicm-ms, committed by GitHub

Integration test azure pipelines for PAI/kubeflow training service (#768)

Integration test pipeline for the PAI/kubeflow training services. It builds an nni.it docker image and runs the integration tests on PAI/kubeflow with the built image.
parent b2cdc30d
......@@ -26,12 +26,11 @@ import time
import traceback
from utils import setup_experiment, get_experiment_status, get_yml_content, dump_yml_content, \
-    parse_max_duration_time, get_succeeded_trial_num, print_stderr
+    parse_max_duration_time, get_succeeded_trial_num, print_stderr, deep_update
from utils import GREEN, RED, CLEAR, STATUS_URL, TRIAL_JOBS_URL

def gen_new_config(config_file, training_service='local'):
    '''
    Generates temporary config file for integration test, the file
    should be deleted after testing.
    '''
......@@ -41,7 +40,15 @@ def gen_new_config(config_file, training_service='local'):
    ts = get_yml_content('training_service.yml')[training_service]
    print(config)
    print(ts)
-    config.update(ts)
+    # hack for kubeflow trial config
+    if training_service == 'kubeflow':
+        ts['trial']['worker']['command'] = config['trial']['command']
+        config['trial'].pop('command')
+        if 'gpuNum' in config['trial']:
+            config['trial'].pop('gpuNum')
+    deep_update(config, ts)
    print(config)

    dump_yml_content(new_config_file, config)
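To make the kubeflow special case above concrete, here is a minimal sketch of the transformation (all field values are hypothetical; deep_update is the helper added to utils.py later in this diff). A generic test config declares its command at trial.command, while the kubeflow schema nests it under trial.worker, so the command is moved into the training-service overrides before the deep merge:

    # Hypothetical inputs, for illustration only.
    config = {'trial': {'command': 'python3 mnist.py', 'gpuNum': 1, 'codeDir': '.'}}
    ts = {'trial': {'worker': {'replicas': 1, 'command': None}},
          'trainingServicePlatform': 'kubeflow'}

    ts['trial']['worker']['command'] = config['trial']['command']
    config['trial'].pop('command')
    config['trial'].pop('gpuNum')
    deep_update(config, ts)
    # config == {'trial': {'codeDir': '.',
    #                      'worker': {'replicas': 1, 'command': 'python3 mnist.py'}},
    #            'trainingServicePlatform': 'kubeflow'}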
......@@ -61,7 +68,7 @@ def run_test(config_file, training_service, local_gpu=False):
    proc = subprocess.run(['nnictl', 'create', '--config', new_config_file])
    assert proc.returncode == 0, '`nnictl create` failed with code %d' % proc.returncode

-    max_duration, max_trial_num = get_max_values(config_file)
+    max_duration, max_trial_num = get_max_values(new_config_file)
    sleep_interval = 3

    for _ in range(0, max_duration+30, sleep_interval):
......@@ -90,6 +97,12 @@ def run(args):
        config_files = glob.glob('./config_test/**/*.test.yml')
    else:
        config_files = args.config.split(',')
+    if args.exclude is not None:
+        exclude_paths = args.exclude.split(',')
+        if exclude_paths:
+            for exclude_path in exclude_paths:
+                config_files = [x for x in config_files if exclude_path not in x]
    print(config_files)

    for config_file in config_files:
......@@ -107,11 +120,10 @@ def run(args):
    subprocess.run(['nnictl', 'stop'])

if __name__ == '__main__':
-    import tensorflow as tf
-    print('TF VERSION:', tf.__version__)
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", type=str, default=None)
-    parser.add_argument("--ts", type=str, choices=['local', 'remote', 'pai'], default='local')
+    parser.add_argument("--exclude", type=str, default=None)
+    parser.add_argument("--ts", type=str, choices=['local', 'remote', 'pai', 'kubeflow'], default='local')
    parser.add_argument("--local_gpu", action='store_true')
    parser.add_argument("--preinstall", action='store_true')
    args = parser.parse_args()
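With the new --exclude flag, an invocation such as `python3 config_test.py --ts kubeflow --exclude multi_phase,smac` (the second value is only an illustration) drops every discovered config path containing any of the comma-separated substrings; the pipeline jobs below use it to skip the multi_phase tests.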
......
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge,
# to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
import argparse
from utils import get_yml_content, dump_yml_content
TRAINING_SERVICE_FILE = 'training_service.yml'
def update_training_service_config(args):
    config = get_yml_content(TRAINING_SERVICE_FILE)
    if args.nni_manager_ip is not None:
        config[args.ts]['nniManagerIp'] = args.nni_manager_ip
    if args.ts == 'pai':
        if args.pai_user is not None:
            config[args.ts]['paiConfig']['userName'] = args.pai_user
        if args.pai_pwd is not None:
            config[args.ts]['paiConfig']['passWord'] = args.pai_pwd
        if args.pai_host is not None:
            config[args.ts]['paiConfig']['host'] = args.pai_host
        if args.nni_docker_image is not None:
            config[args.ts]['trial']['image'] = args.nni_docker_image
        if args.data_dir is not None:
            config[args.ts]['trial']['dataDir'] = args.data_dir
        if args.output_dir is not None:
            config[args.ts]['trial']['outputDir'] = args.output_dir
    elif args.ts == 'kubeflow':
        if args.nfs_server is not None:
            config[args.ts]['kubeflowConfig']['nfs']['server'] = args.nfs_server
        if args.nfs_path is not None:
            config[args.ts]['kubeflowConfig']['nfs']['path'] = args.nfs_path
        if args.keyvault_vaultname is not None:
            config[args.ts]['kubeflowConfig']['keyVault']['vaultName'] = args.keyvault_vaultname
        if args.keyvault_name is not None:
            config[args.ts]['kubeflowConfig']['keyVault']['name'] = args.keyvault_name
        if args.azs_account is not None:
            config[args.ts]['kubeflowConfig']['azureStorage']['accountName'] = args.azs_account
        if args.azs_share is not None:
            config[args.ts]['kubeflowConfig']['azureStorage']['azureShare'] = args.azs_share
        if args.nni_docker_image is not None:
            config[args.ts]['trial']['worker']['image'] = args.nni_docker_image
    dump_yml_content(TRAINING_SERVICE_FILE, config)

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--ts", type=str, choices=['pai', 'kubeflow'], default='pai')
    parser.add_argument("--nni_docker_image", type=str)
    parser.add_argument("--nni_manager_ip", type=str)
    # args for PAI
    parser.add_argument("--pai_user", type=str)
    parser.add_argument("--pai_pwd", type=str)
    parser.add_argument("--pai_host", type=str)
    parser.add_argument("--data_dir", type=str)
    parser.add_argument("--output_dir", type=str)
    # args for kubeflow
    parser.add_argument("--nfs_server", type=str)
    parser.add_argument("--nfs_path", type=str)
    parser.add_argument("--keyvault_vaultname", type=str)
    parser.add_argument("--keyvault_name", type=str)
    parser.add_argument("--azs_account", type=str)
    parser.add_argument("--azs_share", type=str)
    args = parser.parse_args()

    update_training_service_config(args)
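This script is driven entirely by CLI flags and is invoked by the pipeline jobs below; for example (values hypothetical), `python3 generate_ts_config.py --ts pai --pai_user someuser --pai_pwd somepwd --pai_host http://10.0.0.1 --nni_docker_image msranni/nni.it.pai:latest` would fill the corresponding blank fields of the training_service.yml template in place.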
jobs:
- job: 'integration_test_kubeflow'
  pool: 'NNI CI KUBE CLI'
  variables:
    new_docker_img: msranni/nni.it.kb:latest
  steps:
  - script: python3 -m pip install --upgrade pip setuptools --user
    displayName: 'Install python tools'
  - script: |
      cd deployment/pypi
      echo 'building prerelease package...'
      make build
      ls $(Build.SourcesDirectory)/deployment/pypi/dist/
    condition: eq( variables['build_docker_img'], 'true' )
    displayName: 'build nni bdist_wheel'
  - script: |
      cd deployment/pypi
      docker login -u $(docker_hub_user) -p $(docker_hub_pwd)
      echo 'updating docker file for installing nni from local...'
      # update Dockerfile to install NNI in docker image from whl file built in last step
      sed -ie 's/RUN python3 -m pip --no-cache-dir install nni/COPY .\/dist\/* .\nRUN python3 -m pip install nni-*.whl/' ../docker/Dockerfile
      cat ../docker/Dockerfile
      echo $IMG_TAG
      docker build -f ../docker/Dockerfile -t $(new_docker_img) .
      docker push $(new_docker_img)
    condition: eq( variables['build_docker_img'], 'true' )
    displayName: 'build and upload nni docker image'
  - script: |
      source install.sh
    displayName: 'Install nni toolkit via source code'
  - script: |
      if [ $(build_docker_img) = 'true' ]
      then
          export TEST_IMG=$(new_docker_img)
      else
          export TEST_IMG=$(existing_docker_img)
      fi
      echo "TEST_IMG:$TEST_IMG"
      cd test
      python3 generate_ts_config.py --ts kubeflow --keyvault_vaultname $(keyVault_vaultName) --keyvault_name $(keyVault_name) \
          --azs_account $(azureStorage_accountName) --azs_share $(azureStorage_azureShare) --nni_docker_image $TEST_IMG --nni_manager_ip $(nni_manager_ip)
      cat training_service.yml
      PATH=$HOME/.local/bin:$PATH python3 config_test.py --ts kubeflow --exclude multi_phase
    displayName: 'integration test'
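Note that this job and the PAI job below branch on the pipeline variable build_docker_img. When it is 'true', the wheel is built, sed rewrites the Dockerfile's `RUN python3 -m pip --no-cache-dir install nni` line to copy and install the local wheel instead, and the freshly built new_docker_img is pushed and used for trials; otherwise the job falls back to the pre-existing existing_docker_img.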
jobs:
- job: 'integration_test_pai'
  pool: 'NNI CI PAI CLI'
  variables:
    new_docker_img: msranni/nni.it.pai:latest
  steps:
  - script: python3 -m pip install --upgrade pip setuptools --user
    displayName: 'Install python tools'
  - script: |
      cd deployment/pypi
      echo 'building prerelease package...'
      make build
      ls $(Build.SourcesDirectory)/deployment/pypi/dist/
    condition: eq( variables['build_docker_img'], 'true' )
    displayName: 'build nni bdist_wheel'
  - script: |
      cd deployment/pypi
      docker login -u $(docker_hub_user) -p $(docker_hub_pwd)
      echo 'updating docker file for installing nni from local...'
      # update Dockerfile to install NNI in docker image from whl file built in last step
      sed -ie 's/RUN python3 -m pip --no-cache-dir install nni/COPY .\/dist\/* .\nRUN python3 -m pip install nni-*.whl/' ../docker/Dockerfile
      cat ../docker/Dockerfile
      echo $IMG_TAG
      docker build -f ../docker/Dockerfile -t $(new_docker_img) .
      docker push $(new_docker_img)
    condition: eq( variables['build_docker_img'], 'true' )
    displayName: 'build and upload nni docker image'
  - script: |
      source install.sh
    displayName: 'Install nni toolkit via source code'
  - script: |
      if [ $(build_docker_img) = 'true' ]
      then
          export TEST_IMG=$(new_docker_img)
      else
          export TEST_IMG=$(existing_docker_img)
      fi
      echo "TEST_IMG:$TEST_IMG"
      cd test
      python3 generate_ts_config.py --ts pai --pai_host $(pai_host) --pai_user $(pai_user) --pai_pwd $(pai_pwd) \
          --nni_docker_image $TEST_IMG --data_dir $(data_dir) --output_dir $(output_dir) --nni_manager_ip $(nni_manager_ip)
      PATH=$HOME/.local/bin:$PATH python3 config_test.py --ts pai --exclude multi_phase
    displayName: 'integration test'
kubeflow:
  maxExecDuration: 15m
  nniManagerIp:
  kubeflowConfig:
    operator: tf-operator
    apiVersion: v1alpha2
    storage: azureStorage
    keyVault:
      vaultName:
      name:
    azureStorage:
      accountName:
      azureShare:
  trial:
    worker:
      replicas: 1
      command:
      gpuNum: 1
      cpuNum: 1
      memoryMB: 8192
      image:
  trainingServicePlatform: kubeflow
local:
  trainingServicePlatform: local
pai:
  trainingServicePlatform: pai
  nniManagerIp:
  maxExecDuration: 15m
  paiConfig:
    userName:
    passWord:
    host:
  trial:
    gpuNum: 1
    cpuNum: 1
    dataDir:
    image:
    memoryMB: 8192
    outputDir:
remote:
  trainingServicePlatform: remote
  machineList:
    - ip:
      port:
      username:
      passwd:
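The blank values above are deliberate placeholders: generate_ts_config.py (earlier in this diff) fills in credentials, storage settings, and the docker image at pipeline run time, and gen_new_config in config_test.py then deep-merges the block for the chosen training service into each *.test.yml config.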
......@@ -19,6 +19,7 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
import contextlib
+import collections
import json
import os
import subprocess
......@@ -118,3 +119,16 @@ def parse_max_duration_time(max_exec_duration):
    time = max_exec_duration[:-1]
    units_dict = {'s':1, 'm':60, 'h':3600, 'd':86400}
    return int(time) * units_dict[unit]
+def deep_update(source, overrides):
+    """Update a nested dictionary or similar mapping.
+
+    Modify ``source`` in place.
+    """
+    for key, value in overrides.items():
+        if isinstance(value, collections.Mapping) and value:
+            returned = deep_update(source.get(key, {}), value)
+            source[key] = returned
+        else:
+            source[key] = overrides[key]
+    return source
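A quick sketch of why the deep merge matters (inputs are hypothetical): dict.update would replace the entire nested trial mapping with the one from the overrides, while deep_update merges leaf keys recursively. Note also that collections.Mapping is a deprecated alias that Python 3.10+ provides only as collections.abc.Mapping.

    # Hypothetical inputs, for illustration only.
    config = {'trial': {'command': 'python3 mnist.py', 'codeDir': '.'}}
    ts = {'trial': {'memoryMB': 8192}, 'trainingServicePlatform': 'pai'}

    deep_update(config, ts)
    # config == {'trial': {'command': 'python3 mnist.py', 'codeDir': '.', 'memoryMB': 8192},
    #            'trainingServicePlatform': 'pai'}

    config.update(ts)  # a shallow update, by contrast, discards the original trial keys:
    # config['trial'] == {'memoryMB': 8192}

    # For reference, parse_max_duration_time('15m') above returns 15 * 60 == 900 seconds.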