Unverified commit 5f8ffcd5 authored by chicm-ms, committed by GitHub

Integration test azure pipelines for PAI/kubeflow training service (#768)

Integration test pipeline for the PAI/kubeflow training services. It builds the nni.it docker image and runs the integration tests on PAI/kubeflow with the built image.
parent b2cdc30d
test/config_test.py:

@@ -26,12 +26,11 @@ import time
 import traceback

 from utils import setup_experiment, get_experiment_status, get_yml_content, dump_yml_content, \
-    parse_max_duration_time, get_succeeded_trial_num, print_stderr
+    parse_max_duration_time, get_succeeded_trial_num, print_stderr, deep_update
 from utils import GREEN, RED, CLEAR, STATUS_URL, TRIAL_JOBS_URL

 def gen_new_config(config_file, training_service='local'):
     '''
     Generates temporary config file for integration test, the file
     should be deleted after testing.
     '''
@@ -41,7 +40,15 @@ def gen_new_config(config_file, training_service='local'):
     ts = get_yml_content('training_service.yml')[training_service]
     print(config)
     print(ts)
-    config.update(ts)
+
+    # hack for kubeflow trial config
+    if training_service == 'kubeflow':
+        ts['trial']['worker']['command'] = config['trial']['command']
+        config['trial'].pop('command')
+        if 'gpuNum' in config['trial']:
+            config['trial'].pop('gpuNum')
+
+    deep_update(config, ts)
     print(config)
     dump_yml_content(new_config_file, config)
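The reason for the hack above: in a kubeflow trial config the per-replica settings live under trial.worker rather than directly under trial, so the flat command (and any top-level gpuNum) from the *.test.yml file has to be relocated before the merge. A minimal sketch with invented values:

    # Invented stand-ins for the dicts read from a *.test.yml file and from
    # training_service.yml; illustration only.
    config = {'trial': {'command': 'python3 mnist.py', 'gpuNum': 0, 'codeDir': '.'}}
    ts = {'trainingServicePlatform': 'kubeflow',
          'trial': {'worker': {'replicas': 1, 'command': None}}}

    # Relocate the command under trial.worker and drop the flat gpuNum.
    ts['trial']['worker']['command'] = config['trial']['command']
    config['trial'].pop('command')
    config['trial'].pop('gpuNum')

    # deep_update(config, ts) -- added to utils.py in this commit -- then merges
    # key by key, leaving config['trial'] as
    #   {'codeDir': '.', 'worker': {'replicas': 1, 'command': 'python3 mnist.py'}}
    # The old config.update(ts) replaced the whole 'trial' dict, losing codeDir.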
@@ -61,7 +68,7 @@ def run_test(config_file, training_service, local_gpu=False):
     proc = subprocess.run(['nnictl', 'create', '--config', new_config_file])
     assert proc.returncode == 0, '`nnictl create` failed with code %d' % proc.returncode

-    max_duration, max_trial_num = get_max_values(config_file)
+    max_duration, max_trial_num = get_max_values(new_config_file)
     sleep_interval = 3

     for _ in range(0, max_duration+30, sleep_interval):
@@ -90,6 +97,12 @@ def run(args):
         config_files = glob.glob('./config_test/**/*.test.yml')
     else:
         config_files = args.config.split(',')
+
+    if args.exclude is not None:
+        exclude_paths = args.exclude.split(',')
+        if exclude_paths:
+            for exclude_path in exclude_paths:
+                config_files = [x for x in config_files if exclude_path not in x]
     print(config_files)

     for config_file in config_files:
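The new --exclude flag does plain substring matching against each discovered config path (the pipelines below pass --exclude multi_phase). A tiny illustration with made-up paths:

    # Made-up paths matching the './config_test/**/*.test.yml' glob above.
    config_files = ['./config_test/multi_phase/batch.test.yml',
                    './config_test/examples/mnist.test.yml']
    for exclude_path in 'multi_phase'.split(','):
        config_files = [x for x in config_files if exclude_path not in x]
    print(config_files)  # ['./config_test/examples/mnist.test.yml']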
@@ -107,11 +120,10 @@ def run(args):
         subprocess.run(['nnictl', 'stop'])

 if __name__ == '__main__':
-    import tensorflow as tf
-    print('TF VERSION:', tf.__version__)
     parser = argparse.ArgumentParser()
     parser.add_argument("--config", type=str, default=None)
-    parser.add_argument("--ts", type=str, choices=['local', 'remote', 'pai'], default='local')
+    parser.add_argument("--exclude", type=str, default=None)
+    parser.add_argument("--ts", type=str, choices=['local', 'remote', 'pai', 'kubeflow'], default='local')
     parser.add_argument("--local_gpu", action='store_true')
     parser.add_argument("--preinstall", action='store_true')
     args = parser.parse_args()
...

test/generate_ts_config.py (new file):
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge,
# to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
import argparse
from utils import get_yml_content, dump_yml_content
TRAINING_SERVICE_FILE = 'training_service.yml'
def update_training_service_config(args):
    config = get_yml_content(TRAINING_SERVICE_FILE)
    if args.nni_manager_ip is not None:
        config[args.ts]['nniManagerIp'] = args.nni_manager_ip
    if args.ts == 'pai':
        if args.pai_user is not None:
            config[args.ts]['paiConfig']['userName'] = args.pai_user
        if args.pai_pwd is not None:
            config[args.ts]['paiConfig']['passWord'] = args.pai_pwd
        if args.pai_host is not None:
            config[args.ts]['paiConfig']['host'] = args.pai_host
        if args.nni_docker_image is not None:
            config[args.ts]['trial']['image'] = args.nni_docker_image
        if args.data_dir is not None:
            config[args.ts]['trial']['dataDir'] = args.data_dir
        if args.output_dir is not None:
            config[args.ts]['trial']['outputDir'] = args.output_dir
    elif args.ts == 'kubeflow':
        if args.nfs_server is not None:
            config[args.ts]['kubeflowConfig']['nfs']['server'] = args.nfs_server
        if args.nfs_path is not None:
            config[args.ts]['kubeflowConfig']['nfs']['path'] = args.nfs_path
        if args.keyvault_vaultname is not None:
            config[args.ts]['kubeflowConfig']['keyVault']['vaultName'] = args.keyvault_vaultname
        if args.keyvault_name is not None:
            config[args.ts]['kubeflowConfig']['keyVault']['name'] = args.keyvault_name
        if args.azs_account is not None:
            config[args.ts]['kubeflowConfig']['azureStorage']['accountName'] = args.azs_account
        if args.azs_share is not None:
            config[args.ts]['kubeflowConfig']['azureStorage']['azureShare'] = args.azs_share
        if args.nni_docker_image is not None:
            config[args.ts]['trial']['worker']['image'] = args.nni_docker_image
    dump_yml_content(TRAINING_SERVICE_FILE, config)

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--ts", type=str, choices=['pai', 'kubeflow'], default='pai')
    parser.add_argument("--nni_docker_image", type=str)
    parser.add_argument("--nni_manager_ip", type=str)
    # args for PAI
    parser.add_argument("--pai_user", type=str)
    parser.add_argument("--pai_pwd", type=str)
    parser.add_argument("--pai_host", type=str)
    parser.add_argument("--data_dir", type=str)
    parser.add_argument("--output_dir", type=str)
    # args for kubeflow
    parser.add_argument("--nfs_server", type=str)
    parser.add_argument("--nfs_path", type=str)
    parser.add_argument("--keyvault_vaultname", type=str)
    parser.add_argument("--keyvault_name", type=str)
    parser.add_argument("--azs_account", type=str)
    parser.add_argument("--azs_share", type=str)
    args = parser.parse_args()

    update_training_service_config(args)
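Only the fields whose flags are supplied get rewritten; every other value in training_service.yml is left untouched. A hypothetical invocation with placeholder values:

    python3 generate_ts_config.py --ts pai --pai_user someuser --pai_host http://10.10.10.10 --nni_docker_image msranni/nni.it.pai:latest

This would fill in paiConfig.userName, paiConfig.host and trial.image under the pai: section of training_service.yml shown further below.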
Azure Pipelines definition for the kubeflow integration test (new file):

jobs:
- job: 'integration_test_kubeflow'
  pool: 'NNI CI KUBE CLI'
  variables:
    new_docker_img: msranni/nni.it.kb:latest
  steps:
  - script: python3 -m pip install --upgrade pip setuptools --user
    displayName: 'Install python tools'
  - script: |
      cd deployment/pypi
      echo 'building prerelease package...'
      make build
      ls $(Build.SourcesDirectory)/deployment/pypi/dist/
    condition: eq( variables['build_docker_img'], 'true' )
    displayName: 'build nni bdist_wheel'
  - script: |
      cd deployment/pypi
      docker login -u $(docker_hub_user) -p $(docker_hub_pwd)
      echo 'updating docker file for installing nni from local...'
      # update Dockerfile to install NNI in the docker image from the whl file built in the last step
      sed -ie 's/RUN python3 -m pip --no-cache-dir install nni/COPY .\/dist\/* .\nRUN python3 -m pip install nni-*.whl/' ../docker/Dockerfile
      cat ../docker/Dockerfile
      echo $IMG_TAG
      docker build -f ../docker/Dockerfile -t $(new_docker_img) .
      docker push $(new_docker_img)
    condition: eq( variables['build_docker_img'], 'true' )
    displayName: 'build and upload nni docker image'
  - script: |
      source install.sh
    displayName: 'Install nni toolkit via source code'
  - script: |
      if [ $(build_docker_img) = 'true' ]
      then
        export TEST_IMG=$(new_docker_img)
      else
        export TEST_IMG=$(existing_docker_img)
      fi
      echo "TEST_IMG:$TEST_IMG"
      cd test
      python3 generate_ts_config.py --ts kubeflow --keyvault_vaultname $(keyVault_vaultName) --keyvault_name $(keyVault_name) \
        --azs_account $(azureStorage_accountName) --azs_share $(azureStorage_azureShare) --nni_docker_image $TEST_IMG --nni_manager_ip $(nni_manager_ip)
      cat training_service.yml
      PATH=$HOME/.local/bin:$PATH python3 config_test.py --ts kubeflow --exclude multi_phase
    displayName: 'integration test'
Azure Pipelines definition for the PAI integration test (new file):

jobs:
- job: 'integration_test_pai'
  pool: 'NNI CI PAI CLI'
  variables:
    new_docker_img: msranni/nni.it.pai:latest
  steps:
  - script: python3 -m pip install --upgrade pip setuptools --user
    displayName: 'Install python tools'
  - script: |
      cd deployment/pypi
      echo 'building prerelease package...'
      make build
      ls $(Build.SourcesDirectory)/deployment/pypi/dist/
    condition: eq( variables['build_docker_img'], 'true' )
    displayName: 'build nni bdist_wheel'
  - script: |
      cd deployment/pypi
      docker login -u $(docker_hub_user) -p $(docker_hub_pwd)
      echo 'updating docker file for installing nni from local...'
      # update Dockerfile to install NNI in the docker image from the whl file built in the last step
      sed -ie 's/RUN python3 -m pip --no-cache-dir install nni/COPY .\/dist\/* .\nRUN python3 -m pip install nni-*.whl/' ../docker/Dockerfile
      cat ../docker/Dockerfile
      echo $IMG_TAG
      docker build -f ../docker/Dockerfile -t $(new_docker_img) .
      docker push $(new_docker_img)
    condition: eq( variables['build_docker_img'], 'true' )
    displayName: 'build and upload nni docker image'
  - script: |
      source install.sh
    displayName: 'Install nni toolkit via source code'
  - script: |
      if [ $(build_docker_img) = 'true' ]
      then
        export TEST_IMG=$(new_docker_img)
      else
        export TEST_IMG=$(existing_docker_img)
      fi
      echo "TEST_IMG:$TEST_IMG"
      cd test
      python3 generate_ts_config.py --ts pai --pai_host $(pai_host) --pai_user $(pai_user) --pai_pwd $(pai_pwd) \
        --nni_docker_image $TEST_IMG --data_dir $(data_dir) --output_dir $(output_dir) --nni_manager_ip $(nni_manager_ip)
      PATH=$HOME/.local/bin:$PATH python3 config_test.py --ts pai --exclude multi_phase
    displayName: 'integration test'
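In both jobs above, the sed in the 'build and upload nni docker image' step rewrites the stock Dockerfile so the image installs the wheel built in the previous step instead of the released package; the pattern and replacement amount to turning

    RUN python3 -m pip --no-cache-dir install nni

into

    COPY ./dist/* .
    RUN python3 -m pip install nni-*.whl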
test/training_service.yml:

+kubeflow:
+  maxExecDuration: 15m
+  nniManagerIp:
+  kubeflowConfig:
+    operator: tf-operator
+    apiVersion: v1alpha2
+    storage: azureStorage
+    keyVault:
+      vaultName:
+      name:
+    azureStorage:
+      accountName:
+      azureShare:
+  trial:
+    worker:
+      replicas: 1
+      command:
+      gpuNum: 1
+      cpuNum: 1
+      memoryMB: 8192
+      image:
+  trainingServicePlatform: kubeflow
 local:
   trainingServicePlatform: local
-remote:
-  trainingServicePlatform: remote
-  machineList:
-  - ip:
-    port:
-    username:
-    passwd:
 pai:
-  trainingServicePlatform: pai
+  nniManagerIp:
+  maxExecDuration: 15m
   paiConfig:
-    userName:
-    passWord:
-    host:
+    host:
+    passWord:
+    userName:
+  trainingServicePlatform: pai
   trial:
-    gpuNum:
-    cpuNum:
-    memoryMB:
-    image: msranni/latest
-    dataDir:
-    outputDir:
+    gpuNum: 1
+    cpuNum: 1
+    dataDir:
+    image:
+    memoryMB: 8192
+    outputDir:
+remote:
+  machineList:
+  - ip:
+    passwd:
+    port:
+    username:
+  trainingServicePlatform: remote
test/utils.py:

@@ -19,6 +19,7 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

 import contextlib
+import collections
 import json
 import os
 import subprocess
@@ -118,3 +119,16 @@ def parse_max_duration_time(max_exec_duration):
     time = max_exec_duration[:-1]
     units_dict = {'s':1, 'm':60, 'h':3600, 'd':86400}
     return int(time) * units_dict[unit]
+
+def deep_update(source, overrides):
+    """Update a nested dictionary or similar mapping.
+    Modify ``source`` in place.
+    """
+    for key, value in overrides.items():
+        if isinstance(value, collections.Mapping) and value:
+            returned = deep_update(source.get(key, {}), value)
+            source[key] = returned
+        else:
+            source[key] = overrides[key]
+    return source
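A quick contrast with dict.update, which gen_new_config used before this change; a sketch with toy dicts, assuming utils.py is importable:

    from utils import deep_update

    merged = deep_update({'trial': {'gpuNum': 1, 'command': 'sleep 1h'}},
                         {'trial': {'gpuNum': 0}})
    print(merged)   # {'trial': {'gpuNum': 0, 'command': 'sleep 1h'}} -- nested keys merged

    shallow = {'trial': {'gpuNum': 1, 'command': 'sleep 1h'}}
    shallow.update({'trial': {'gpuNum': 0}})
    print(shallow)  # {'trial': {'gpuNum': 0}} -- the whole 'trial' dict was replaced

(Note: the Mapping ABC officially lives at collections.abc.Mapping; the bare collections.Mapping alias used above was removed in Python 3.10.)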