Unverified Commit 649a9c38 authored by SparkSnail's avatar SparkSnail Committed by GitHub
Browse files

Add frameworkcontroller pipeline (#1971)

merging to master
parent b49b38f8
......@@ -29,6 +29,12 @@ def gen_new_config(config_file, training_service='local'):
config['trial'].pop('command')
if 'gpuNum' in config['trial']:
config['trial'].pop('gpuNum')
if training_service == 'frameworkcontroller':
it_config[training_service]['trial']['taskRoles'][0]['command'] = config['trial']['command']
config['trial'].pop('command')
if 'gpuNum' in config['trial']:
config['trial'].pop('gpuNum')
deep_update(config, it_config['all'])
deep_update(config, it_config[training_service])
......@@ -106,7 +112,7 @@ if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("--config", type=str, default=None)
parser.add_argument("--exclude", type=str, default=None)
parser.add_argument("--ts", type=str, choices=['local', 'remote', 'pai', 'kubeflow'], default='local')
parser.add_argument("--ts", type=str, choices=['local', 'remote', 'pai', 'kubeflow', 'frameworkcontroller'], default='local')
parser.add_argument("--local_gpu", action='store_true')
parser.add_argument("--preinstall", action='store_true')
args = parser.parse_args()
......
......@@ -42,6 +42,21 @@ def update_training_service_config(args):
config[args.ts]['kubeflowConfig']['azureStorage']['azureShare'] = args.azs_share
if args.nni_docker_image is not None:
config[args.ts]['trial']['worker']['image'] = args.nni_docker_image
elif args.ts == 'frameworkcontroller':
if args.nfs_server is not None:
config[args.ts]['frameworkcontrollerConfig']['nfs']['server'] = args.nfs_server
if args.nfs_path is not None:
config[args.ts]['frameworkcontrollerConfig']['nfs']['path'] = args.nfs_path
if args.keyvault_vaultname is not None:
config[args.ts]['frameworkcontrollerConfig']['keyVault']['vaultName'] = args.keyvault_vaultname
if args.keyvault_name is not None:
config[args.ts]['frameworkcontrollerConfig']['keyVault']['name'] = args.keyvault_name
if args.azs_account is not None:
config[args.ts]['frameworkcontrollerConfig']['azureStorage']['accountName'] = args.azs_account
if args.azs_share is not None:
config[args.ts]['frameworkcontrollerConfig']['azureStorage']['azureShare'] = args.azs_share
if args.nni_docker_image is not None:
config[args.ts]['trial']['taskRoles'][0]['image'] = args.nni_docker_image
elif args.ts == 'remote':
if args.remote_user is not None:
config[args.ts]['machineList'][0]['username'] = args.remote_user
......@@ -69,7 +84,7 @@ def convert_command():
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("--ts", type=str, choices=['pai', 'kubeflow', 'remote', 'local'], default='pai')
parser.add_argument("--ts", type=str, choices=['pai', 'kubeflow', 'remote', 'local', 'frameworkcontroller'], default='pai')
parser.add_argument("--nni_docker_image", type=str)
parser.add_argument("--nni_manager_ip", type=str)
# args for PAI
......@@ -79,7 +94,7 @@ if __name__ == '__main__':
parser.add_argument("--data_dir", type=str)
parser.add_argument("--output_dir", type=str)
parser.add_argument("--vc", type=str)
# args for kubeflow
# args for kubeflow and frameworkController
parser.add_argument("--nfs_server", type=str)
parser.add_argument("--nfs_path", type=str)
parser.add_argument("--keyvault_vaultname", type=str)
......
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
# Integration-test pipeline for the frameworkcontroller training service.
# Builds (or reuses) an NNI docker image, generates the training-service
# config, then runs the config-driven integration tests against it.
jobs:
- job: 'integration_test_frameworkController'
  timeoutInMinutes: 0
  steps:
  - script: python3 -m pip install --upgrade pip setuptools --user
    displayName: 'Install python tools'
  # Only needed when a fresh docker image is built from this checkout.
  - script: |
      cd deployment/pypi
      echo 'building prerelease package...'
      make build
      ls $(Build.SourcesDirectory)/deployment/pypi/dist/
    condition: eq( variables['build_docker_img'], 'true' )
    displayName: 'build nni bdist_wheel'
  - script: |
      source install.sh
    displayName: 'Install nni toolkit via source code'
  - script: |
      sudo apt-get install swig -y
      PATH=$HOME/.local/bin:$PATH nnictl package install --name=SMAC
      PATH=$HOME/.local/bin:$PATH nnictl package install --name=BOHB
    displayName: 'Install dependencies for integration tests in frameworkcontroller mode'
  - script: |
      # Quote the operand so an empty/undefined variable falls through to the
      # else branch instead of producing a malformed `[` expression.
      if [ "$(build_docker_img)" = 'true' ]
      then
        cd deployment/pypi
        # NOTE(review): `-p` puts the password on the command line; consider
        # switching to `docker login --password-stdin`.
        docker login -u $(docker_hub_user) -p $(docker_hub_pwd)
        echo 'updating docker file for installing nni from local...'
        # update Dockerfile to install NNI in docker image from whl file built in last step
        # `-i -e` (not `-ie`): GNU sed treats `-ie` as in-place with backup
        # suffix `e`, which leaves a stray `Dockerfilee` next to the Dockerfile.
        sed -i -e 's/RUN python3 -m pip --no-cache-dir install nni/COPY .\/dist\/* .\nRUN python3 -m pip install nni-*.whl/' ../docker/Dockerfile
        cat ../docker/Dockerfile
        # Timestamp tag (UTC, yymmddHHMM) for the freshly built image.
        export IMG_TAG=`date -u +%y%m%d%H%M`
        docker build -f ../docker/Dockerfile -t $(test_docker_img_name):$IMG_TAG .
        docker push $(test_docker_img_name):$IMG_TAG
        export TEST_IMG=$(test_docker_img_name):$IMG_TAG
        cd ../../
      else
        export TEST_IMG=$(existing_docker_img)
      fi
      echo "TEST_IMG:$TEST_IMG"
      cd test
      python3 generate_ts_config.py --ts frameworkcontroller --keyvault_vaultname $(keyVault_vaultName) --keyvault_name $(keyVault_name) \
        --azs_account $(azureStorage_accountName) --azs_share $(azureStorage_azureShare) --nni_docker_image $TEST_IMG --nni_manager_ip $(nni_manager_ip)
      cat training_service.yml
      PATH=$HOME/.local/bin:$PATH python3 config_test.py --ts frameworkcontroller --exclude multi_phase
    displayName: 'integration test'
......@@ -24,6 +24,32 @@ kubeflow:
image:
trainingServicePlatform: kubeflow
# Template for the frameworkcontroller training service.
# Keys with empty values are placeholders: update_training_service_config
# overwrites them from command-line arguments when the matching argument is
# supplied, and gen_new_config injects the per-test trial command.
frameworkcontroller:
  maxExecDuration: 15m
  # presumably filled from --nni_manager_ip; the assignment is not visible here
  nniManagerIp:
  frameworkcontrollerConfig:
    serviceAccountName: frameworkbarrier
    storage: azureStorage
    keyVault:
      vaultName:            # from --keyvault_vaultname
      name:                 # from --keyvault_name
    azureStorage:
      accountName:          # from --azs_account
      azureShare:           # from --azs_share
  trial:
    taskRoles:
      # taskRoles[0] is the entry the scripts write `command` and `image` into.
      - name: worker
        taskNum: 1
        command:            # set by gen_new_config from the test's trial command
        gpuNum: 1
        cpuNum: 1
        memoryMB: 8192
        image:              # from --nni_docker_image
        frameworkAttemptCompletionPolicy:
          minFailedTaskCount: 1
          minSucceededTaskCount: 1
  trainingServicePlatform: frameworkcontroller
local:
trainingServicePlatform: local
pai:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment