Unverified Commit 263498de authored by SparkSnail's avatar SparkSnail Committed by GitHub
Browse files

Support remote ci (#497)

Add integration test for remoteTrainingService platform
parent 51fbf695
......@@ -55,12 +55,21 @@ def update_training_service_config(args):
config[args.ts]['kubeflowConfig']['azureStorage']['azureShare'] = args.azs_share
if args.nni_docker_image is not None:
config[args.ts]['trial']['worker']['image'] = args.nni_docker_image
elif args.ts == 'remote':
if args.remote_user is not None:
config[args.ts]['machineList'][0]['username'] = args.remote_user
if args.remote_host is not None:
config[args.ts]['machineList'][0]['ip'] = args.remote_host
if args.remote_port is not None:
config[args.ts]['machineList'][0]['port'] = args.remote_port
if args.remote_pwd is not None:
config[args.ts]['machineList'][0]['passwd'] = args.remote_pwd
dump_yml_content(TRAINING_SERVICE_FILE, config)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("--ts", type=str, choices=['pai', 'kubeflow'], default='pai')
parser.add_argument("--ts", type=str, choices=['pai', 'kubeflow', 'remote'], default='pai')
parser.add_argument("--nni_docker_image", type=str)
parser.add_argument("--nni_manager_ip", type=str)
# args for PAI
......@@ -76,6 +85,11 @@ if __name__ == '__main__':
parser.add_argument("--keyvault_name", type=str)
parser.add_argument("--azs_account", type=str)
parser.add_argument("--azs_share", type=str)
# args for remote
parser.add_argument("--remote_user", type=str)
parser.add_argument("--remote_pwd", type=str)
parser.add_argument("--remote_host", type=str)
parser.add_argument("--remote_port", type=int)
args = parser.parse_args()
update_training_service_config(args)
#!/bin/bash
# Reinstall the nni sdk (pynni) on the remote test machine over ssh.
# Uses the private key file "id_rsa" from the current directory and copies
# ../src/sdk/pynni into the remote user's home directory.

# Abort on the first failing command so that a failed cleanup or copy is not
# followed by an install from stale or missing files. pipefail is deliberately
# NOT enabled: `yes` is terminated by SIGPIPE once ssh exits, and that must
# not be reported as a failure of the pipeline.
set -e

ip="nni@104.210.63.241"
key="id_rsa"

# ssh refuses private keys that are readable by other users.
chmod 600 "$key"

echo "Initializing remote machine..."
yes | ssh -i "$key" "$ip" "rm -rf pynni"

echo "Copy nni sdk to remote machine..."
scp -i "$key" -r ../src/sdk/pynni "$ip":~

echo "Install nni sdk in remote machine..."
ssh -i "$key" "$ip" "cd pynni && python3 -m pip install --user ."
\ No newline at end of file
# Integration test job for the remote training service platform.
# Flow: install nni locally, upload sources to the remote machine, start a
# docker container there, point the generated config at that container,
# run the config tests, then stop the container.
jobs:
- job: 'integration_test_remote'
steps:
- script: python3 -m pip install --upgrade pip setuptools
displayName: 'Install python tools'
- script: |
source install.sh
displayName: 'Install nni toolkit via source code'
# The next three tasks upload the sdk, tools and test folders into a
# per-build directory, /tmp/nnitest/$(Build.BuildId), on the remote machine.
- task: CopyFilesOverSSH@0
inputs:
sshEndpoint: remote_nni-ci-gpu-01
sourceFolder: src/sdk/pynni
targetFolder: /tmp/nnitest/$(Build.BuildId)/pynni
overwrite: true
displayName: 'Copy sdk files to remote machine'
- task: CopyFilesOverSSH@0
inputs:
sshEndpoint: remote_nni-ci-gpu-01
sourceFolder: tools
targetFolder: /tmp/nnitest/$(Build.BuildId)/tools
overwrite: true
displayName: 'Copy tool files to remote machine'
- task: CopyFilesOverSSH@0
inputs:
sshEndpoint: remote_nni-ci-gpu-01
sourceFolder: test
targetFolder: /tmp/nnitest/$(Build.BuildId)/test
overwrite: true
displayName: 'Copy test files to remote machine'
# Run the uploaded remote_docker.py in start mode on the remote machine.
# The container is named after the build id and uses the nni/nni image.
- task: SSH@0
inputs:
sshEndpoint: remote_nni-ci-gpu-01
runOptions: commands
commands: python3 /tmp/nnitest/$(Build.BuildId)/test/remote_docker.py --mode start --name $(Build.BuildId) --image nni/nni
displayName: 'Start docker'
# Download the private key, then use it to copy back the "port" file that
# remote_docker.py writes into the per-build directory.
- task: DownloadSecureFile@1
inputs:
secureFile: remote_ci_private_key
- script: |
cp $(Agent.TempDirectory)/remote_ci_private_key test/id_rsa
chmod 600 test/id_rsa
scp -i test/id_rsa $(remote_user)@$(remote_host):/tmp/nnitest/$(Build.BuildId)/port test/port
cat test/port
displayName: 'Get docker port'
# Write the remote machine settings (user, host, port, password) into
# training_service.yml, print it, then run config_test.py for the remote
# platform with cifar10 excluded.
# NOTE(review): the password is passed on the command line and the generated
# file is printed with cat — confirm this cannot expose the secret in logs.
- script: |
cd test
python3 generate_ts_config.py --ts remote --remote_user $(docker_user) --remote_host $(remote_host) \
--remote_port $(cat port) --remote_pwd $(docker_pwd) --nni_manager_ip $(nni_manager_ip)
cat training_service.yml
PATH=$HOME/.local/bin:$PATH python3 config_test.py --ts remote --exclude cifar10
displayName: 'integration test'
# Stop and remove the container that was started for this build.
# NOTE(review): this step has no explicit condition, so presumably it is
# skipped when an earlier step fails and the container keeps running —
# confirm, and consider `condition: always()`.
- task: SSH@0
inputs:
sshEndpoint: remote_nni-ci-gpu-01
runOptions: commands
commands: python3 /tmp/nnitest/$(Build.BuildId)/test/remote_docker.py --mode stop --name $(Build.BuildId)
displayName: 'Stop docker'
import os
import argparse
from subprocess import check_output, check_call
import socket
import random
def detect_port(port):
    '''Detect if the port is used, return True if the port is used.

    A TCP connection to 127.0.0.1:<port> is attempted; the port counts as
    used when that connection succeeds.

    Args:
        port: the port to probe, any value accepted by int().

    Returns:
        True if the connection succeeded, False if it failed or if the
        port value could not be used as a port number.
    '''
    # The context manager closes the socket on every path; previously the
    # socket was left open whenever connect() raised.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as socket_test:
        try:
            socket_test.connect(('127.0.0.1', int(port)))
        except (OSError, TypeError, ValueError, OverflowError):
            # OSError: the connection was refused or otherwise failed.
            # TypeError / ValueError / OverflowError: port is not a usable
            # number. Unlike a bare except, this lets KeyboardInterrupt and
            # SystemExit propagate.
            return False
    return True
def find_port():
    '''Return a random port in 10000..20000 that detect_port() reports as unused.'''
    while True:
        candidate = random.randint(10000, 20000)
        # Keep drawing until a candidate is not in use.
        if not detect_port(candidate):
            return candidate
def start_container(image, name):
    '''Start docker container, generate a port in /tmp/nnitest/{name}/port file.

    The container is started detached with a free host port mapped to
    container port 22 and /tmp/nnitest/{name} bind-mounted at /tmp/nni.
    The sdk (/tmp/nni/pynni/) and the tools (/tmp/nni/tools) are then
    installed inside the container with pip.

    Args:
        image: docker image to run.
        name: container name; also the sub-directory of /tmp/nnitest used
            as the bind-mount source and as the location of the port file.

    Raises:
        ValueError: if image is empty or None.
        subprocess.CalledProcessError: if a docker command exits non-zero.
    '''
    # --image is optional on the command line, so image may be None here.
    # Fail with a clear message instead of handing a non-string argument
    # to subprocess.
    if not image:
        raise ValueError('image is required to start a container')

    port = find_port()
    source_dir = '/tmp/nnitest/' + name
    run_cmds = ['docker', 'run', '-d', '-p', str(port) + ':22', '--name', name, '--mount', 'type=bind,source=' + source_dir + ',target=/tmp/nni', image]
    # check_output captures docker's stdout rather than echoing it; the
    # captured value was never used, so it is simply discarded.
    check_output(run_cmds)

    sdk_cmds = ['docker', 'exec', name, 'python3', '-m', 'pip', 'install', '--user', '--no-cache-dir', '/tmp/nni/pynni/']
    check_call(sdk_cmds)
    tools_cmds = ['docker', 'exec', name, 'python3', '-m', 'pip', 'install', '--user', '--no-cache-dir', '/tmp/nni/tools']
    check_call(tools_cmds)

    # Publish the chosen host port so that the caller can read it back.
    with open(source_dir + '/port', 'w') as port_file:
        port_file.write(str(port))
def stop_container(name):
    '''Stop the named docker container, then remove it.'''
    # Order matters: the container is stopped first and removed second.
    # check_call raises on a non-zero exit, which ends the loop early.
    for action in ('stop', 'rm'):
        check_call(['docker', 'container', action, name])
if __name__ == '__main__':
    # Command-line entry point: start or stop the container named by --name.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        '--mode',
        required=True,
        choices=['start', 'stop'],
        dest='mode',
        help='start or stop a container',
    )
    arg_parser.add_argument(
        '--name',
        required=True,
        dest='name',
        help='the name of container to be used',
    )
    arg_parser.add_argument('--image', dest='image', help='the image to be used')
    options = arg_parser.parse_args()

    # argparse restricts --mode to 'start' or 'stop', so anything that is
    # not 'start' is 'stop'.
    if options.mode != 'start':
        stop_container(options.name)
    else:
        start_container(options.image, options.name)
......@@ -72,6 +72,9 @@ def parse_path(experiment_config, config_path):
parse_relative_path(root_path, experiment_config['assessor'], 'codeDir')
if experiment_config.get('advisor'):
parse_relative_path(root_path, experiment_config['advisor'], 'codeDir')
if experiment_config.get('machineList'):
for index in range(len(experiment_config['machineList'])):
parse_relative_path(root_path, experiment_config['machineList'][index], 'sshKeyPath')
def validate_search_space_content(experiment_config):
'''Validate searchspace content,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment