Unverified Commit 8f01c779 authored by Ni Hao, committed by GitHub

Add shared storage integration test (#3455)

parent 32fdd32b
@@ -46,6 +46,7 @@ class CustomAlgorithmConfig(_AlgorithmConfig):
 class TrainingServiceConfig(ConfigBase):
     platform: str
 
+@dataclass(init=False)
 class SharedStorageConfig(ConfigBase):
     storage_type: str
     local_mount_point: str
...
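The new `@dataclass(init=False)` line means the config class builds its fields through a hand-written constructor instead of the generated one. A minimal sketch of that pattern, assuming NNI's usual convention that camelCase YAML keys map to snake_case attributes; this is illustrative, not the real `ConfigBase` machinery:

```python
import re
from dataclasses import dataclass

@dataclass(init=False)
class SharedStorageConfig:
    storage_type: str
    local_mount_point: str
    remote_mount_point: str

    def __init__(self, **yaml_fields):
        # Map camelCase YAML keys (e.g. storageType) to snake_case attributes.
        for key, value in yaml_fields.items():
            setattr(self, re.sub(r'([A-Z])', r'_\1', key).lower(), value)

cfg = SharedStorageConfig(storageType='NFS',
                          localMountPoint='/tmp/nnimount/testlocalrootpath',
                          remoteMountPoint='/tmp/nnimount/testremoterootpath')
assert cfg.storage_type == 'NFS'
```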
@@ -87,27 +87,30 @@ jobs:
       cd test
       python3 nni_test/nnitest/generate_ts_config.py \
         --ts remote \
-        --remote_reuse false \
+        --remote_reuse true \
         --remote_user nni \
         --remote_host $(worker_ip) \
         --remote_port $(docker_port) \
         --remote_pwd $(password_in_docker) \
-        --nni_manager_ip $(manager_ip)
+        --nni_manager_ip $(manager_ip) \
+        --azurestoragetoken $(azureblob_token_test) \
+        --nfs_server $(NFS_IP)
       python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts remote
-    displayName: Integration test
+    displayName: Integration test (reuse mode)
   - script: |
       cd test
       python3 nni_test/nnitest/generate_ts_config.py \
         --ts remote \
-        --remote_reuse true \
+        --remote_reuse false \
         --remote_user nni \
         --remote_host $(worker_ip) \
         --remote_port $(docker_port) \
         --remote_pwd $(password_in_docker) \
         --nni_manager_ip $(manager_ip)
       python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts remote
-    displayName: Integration test (reuse mode)
+    displayName: Integration test
   - task: SSH@0
     inputs:
...
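The two new flags feed the pipeline secrets `$(azureblob_token_test)` and `$(NFS_IP)` into generate_ts_config.py (see its hunk further down). All three remote flags arrive as plain strings; only `--remote_reuse` needs an explicit string-to-bool conversion. A self-contained sketch of that coercion:

```python
import argparse

# Minimal reproduction of the flag handling in generate_ts_config.py:
# argparse hands back strings, so the reuse flag is compared to 'true'
# rather than cast. Values below are placeholders, not real secrets.
parser = argparse.ArgumentParser()
parser.add_argument("--remote_reuse", type=str)
parser.add_argument("--azurestoragetoken", type=str)
parser.add_argument("--nfs_server", type=str)
args = parser.parse_args(['--remote_reuse', 'true',
                          '--azurestoragetoken', '<token>',
                          '--nfs_server', '10.0.0.4'])
reuse = args.remote_reuse.lower() == 'true'
assert reuse is True
```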
@@ -34,6 +34,34 @@ testCases:
     # check status of experiment before calling validator
     experimentStatusCheck: True
 
+  - name: shared-storage-remote-azureblob
+    configFile: test/config/sharedstorage_test/config_sharedstorage_remote_azureblob.yml
+    config:
+      sharedStorage:
+        localMountPoint: /tmp/nnimount/testlocalrootpath
+        remoteMountPoint: /tmp/nnimount/testremoterootpath
+        storageAccountName: nennistorage
+        storageAccountKey: $(azureblob_token_test)
+        containerName: sharedstorage
+    validator:
+      class: FileExistValidator
+      kwargs:
+        rootpath: /tmp/nnimount/testlocalrootpath
+
+  # TODO: Enable this case after nfs server is ready
+  #- name: shared-storage-remote-nfs
+  #  configFile: test/config/sharedstorage_test/config_sharedstorage_remote_nfs.yml
+  #  config:
+  #    sharedStorage:
+  #      localMountPoint: /tmp/nnimount/testlocalrootpath
+  #      remoteMountPoint: /tmp/nnimount/testremoterootpath
+  #      nfsServer: $(NFS_IP)
+  #      exportedDirectory: /home/nni/mnt/
+  #  validator:
+  #    class: FileExistValidator
+  #    kwargs:
+  #      rootpath: /tmp/nnimount/testlocalrootpath
+
   - name: sklearn-regression
     configFile: test/config/examples/sklearn-regression.yml
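How run_tests.py consumes the `validator` block is not part of this diff; a plausible dispatch looks like the sketch below, where the registry dict stands in for a lookup on validators.py and the stub class for the real `FileExistValidator`. All names here are assumptions.

```python
# Hypothetical sketch of validator dispatch: resolve the class named in the
# YAML, instantiate it, and splat the YAML kwargs into the call.
class FileExistValidator:  # stand-in for the real class added in this commit
    def __call__(self, rest_endpoint, experiment_dir, nni_source_dir, **kwargs):
        print('would check', kwargs.get('rootpath'))

REGISTRY = {'FileExistValidator': FileExistValidator}

def run_validator(validator_config, rest_endpoint, experiment_dir, nni_source_dir):
    cls = REGISTRY[validator_config['class']]
    kwargs = validator_config.get('kwargs', {})
    cls()(rest_endpoint, experiment_dir, nni_source_dir, **kwargs)

run_validator({'class': 'FileExistValidator',
               'kwargs': {'rootpath': '/tmp/nnimount/testlocalrootpath'}},
              'http://localhost:8080', '/tmp/exp', './')
```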
@@ -227,4 +255,3 @@ testCases:
   #########################################################################
   - name: customized-tuners-demotuner
     configFile: test/config/customized_tuners/demotuner-sklearn-classification.yml
...
new file: test/config/sharedstorage_test/config_sharedstorage_remote_azureblob.yml

authorName: default
experimentName: example_mnist
trialConcurrency: 1
maxExecDuration: 1h
maxTrialNum: 1
trainingServicePlatform: remote
searchSpacePath: config_sharedstorage_search_space.json
#choice: true, false
useAnnotation: false
nniManagerIp: 127.0.0.1
tuner:
  #choice: TPE, Random, Anneal, Evolution, BatchTuner, MetisTuner, GPTuner
  #SMAC (SMAC should be installed through nnictl)
  builtinTunerName: TPE
  classArgs:
    #choice: maximize, minimize
    optimize_mode: maximize
trial:
  command: python3 config_sharedstorage_trial.py
  codeDir: .
  gpuNum: 0
sharedStorage:
  storageType: AzureBlob
  localMountPoint: ${your/local/mount/point}
  remoteMountPoint: ${your/remote/mount/point}
  storageAccountName: ${replace_to_your_storageAccountName}
  storageAccountKey: ${replace_to_your_storageAccountKey}
  # If you do not set storageAccountKey, you need to run `az login` with the Azure CLI first and set resourceGroupName.
  # resourceGroupName: ${replace_to_your_resourceGroupName}
  containerName: ${replace_to_your_containerName}
  # usermount means you have already mounted this storage on localMountPoint
  # nnimount means NNI will try to mount this storage on localMountPoint
  # nomount means the storage will not be mounted on the local machine; partial storages will be supported in the future
  localMounted: nnimount
#machineList can be empty if the platform is local
machineList:
  - ip: 10.1.1.1
    username: bob
    passwd: bob123
    #port can be skipped if using the default ssh port 22
    #port: 22
remoteConfig:
  reuse: true
\ No newline at end of file
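The three `localMounted` modes described in the comments above translate to roughly the branching below; `mount_storage` is a hypothetical stand-in for NNI's real mount step (blobfuse for AzureBlob, mount.nfs for NFS), not its actual API.

```python
import os

def mount_storage(mount_point):
    """Hypothetical placeholder for NNI's real mount logic."""
    os.makedirs(mount_point, exist_ok=True)

def ensure_local_mount(local_mounted, local_mount_point):
    if local_mounted == 'usermount':
        # the user guarantees the storage is already mounted here
        assert os.path.ismount(local_mount_point)
    elif local_mounted == 'nnimount':
        mount_storage(local_mount_point)  # NNI performs the mount itself
    elif local_mounted == 'nomount':
        pass                              # storage stays unmounted locally
    else:
        raise ValueError(f'unknown localMounted value: {local_mounted}')
```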
new file: test/config/sharedstorage_test/config_sharedstorage_remote_nfs.yml

authorName: default
experimentName: example_mnist
trialConcurrency: 1
maxExecDuration: 1h
maxTrialNum: 1
trainingServicePlatform: remote
searchSpacePath: config_sharedstorage_search_space.json
#choice: true, false
useAnnotation: false
nniManagerIp: 127.0.0.1
tuner:
  #choice: TPE, Random, Anneal, Evolution, BatchTuner, MetisTuner, GPTuner
  #SMAC (SMAC should be installed through nnictl)
  builtinTunerName: TPE
  classArgs:
    #choice: maximize, minimize
    optimize_mode: maximize
trial:
  command: python3 config_sharedstorage_trial.py
  codeDir: .
  gpuNum: 0
sharedStorage:
  storageType: NFS
  localMountPoint: ${your/local/mount/point}
  remoteMountPoint: ${your/remote/mount/point}
  nfsServer: ${nfs-server-ip}
  exportedDirectory: ${nfs/exported/directory}
  # usermount means you have already mounted this storage on localMountPoint
  # nnimount means NNI will try to mount this storage on localMountPoint
  # nomount means the storage will not be mounted on the local machine; partial storages will be supported in the future
  localMounted: nnimount
#machineList can be empty if the platform is local
machineList:
  - ip: 10.1.1.1
    username: bob
    passwd: bob123
    #port can be skipped if using the default ssh port 22
    #port: 22
remoteConfig:
  reuse: true
\ No newline at end of file
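For reference, the NFS settings above amount to a mount like the following. This is the standard mount invocation, not necessarily the exact command NNI issues:

```python
import subprocess

def mount_nfs(nfs_server, exported_directory, local_mount_point):
    # Equivalent of: sudo mount -t nfs <nfsServer>:<exportedDirectory> <localMountPoint>
    subprocess.run(['sudo', 'mount', '-t', 'nfs',
                    f'{nfs_server}:{exported_directory}', local_mount_point],
                   check=True)

# With the values from the disabled integration test case:
# mount_nfs('<NFS_IP>', '/home/nni/mnt/', '/tmp/nnimount/testlocalrootpath')
```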
new file: test/config/sharedstorage_test/config_sharedstorage_search_space.json

{
    "dropout_rate": {"_type": "uniform", "_value": [0.5, 0.9]},
    "conv_size": {"_type": "choice", "_value": [2, 3, 5, 7]},
    "hidden_size": {"_type": "choice", "_value": [124, 512, 1024]},
    "batch_size": {"_type": "choice", "_value": [16, 32]},
    "learning_rate": {"_type": "choice", "_value": [0.0001, 0.001, 0.01, 0.1]}
}
"""
A deep MNIST classifier using convolutional layers.
This file is a modification of the official pytorch mnist example:
https://github.com/pytorch/examples/blob/master/mnist/main.py
"""
import os
import logging
import nni
logger = logging.getLogger('mnist_AutoML')
if __name__ == '__main__':
try:
logger.debug(os.environ.get('NNI_OUTPUT_DIR'))
filename = os.path.join(os.environ.get('NNI_OUTPUT_DIR'), 'checkingfile.txt')
f = open(filename, "a")
tuner_params = nni.get_next_parameter()
f.write(str(tuner_params))
nni.report_final_result(1)
f.close()
except Exception as exception:
logger.exception(exception)
raise
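The trial writes `checkingfile.txt` into `NNI_OUTPUT_DIR`; with shared storage enabled, that file is expected to surface on the local mount at the path the new `FileExistValidator` (bottom of this commit) asserts on. The layout, reconstructed from the validator code:

```python
import os.path as osp

def expected_checkfile(rootpath, exp_id, trial_id):
    # Mirrors the path FileExistValidator joins together.
    return osp.join(rootpath, 'nni', exp_id, 'trials', trial_id,
                    'nnioutput', 'checkingfile.txt')

# Illustrative ids, not real ones:
print(expected_checkfile('/tmp/nnimount/testlocalrootpath', 'GxT3mnq', 'hT4aZ'))
# -> /tmp/nnimount/testlocalrootpath/nni/GxT3mnq/trials/hT4aZ/nnioutput/checkingfile.txt
```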
file: test/config/training_service.yml

@@ -87,6 +87,9 @@ remote:
   port:
   username:
   trainingServicePlatform: remote
+  sharedStorage:
+    storageAccountKey:
+    nfsServer:
 hybrid:
   maxExecDuration: 15m
   nniManagerIp:
...
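These two empty placeholders are filled in by generate_ts_config.py (next hunk) before a test run. In miniature, assuming PyYAML and placeholder values for the pipeline secrets:

```python
import yaml  # PyYAML

skeleton = yaml.safe_load("""
remote:
  sharedStorage:
    storageAccountKey:
    nfsServer:
""")
# What generate_ts_config.py does with the new flags, in miniature.
skeleton['remote']['sharedStorage']['storageAccountKey'] = '<azureblob_token_test>'
skeleton['remote']['sharedStorage']['nfsServer'] = '<NFS_IP>'
print(yaml.safe_dump(skeleton))
```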
file: test/nni_test/nnitest/generate_ts_config.py

@@ -74,6 +74,10 @@ def update_training_service_config(args):
         config[args.ts]['machineList'][0]['passwd'] = args.remote_pwd
         if args.remote_reuse is not None:
             config[args.ts]['remoteConfig']['reuse'] = args.remote_reuse.lower() == 'true'
+        if args.azurestoragetoken is not None:
+            config[args.ts]['sharedStorage']['storageAccountKey'] = args.azurestoragetoken
+        if args.nfs_server is not None:
+            config[args.ts]['sharedStorage']['nfsServer'] = args.nfs_server
     elif args.ts == 'adl':
         if args.nni_docker_image is not None:
             config[args.ts]['trial']['image'] = args.nni_docker_image
@@ -118,6 +122,8 @@ if __name__ == '__main__':
     parser.add_argument("--config_version", type=str, choices=['v1', 'v2'], default='v1')
     parser.add_argument("--nni_docker_image", type=str)
     parser.add_argument("--nni_manager_ip", type=str)
+    parser.add_argument("--azurestoragetoken", type=str)
+    parser.add_argument("--nfs_server", type=str)
     # args for PAI
     parser.add_argument("--pai_user", type=str)
     parser.add_argument("--pai_pwd", type=str)
@@ -131,7 +137,6 @@ if __name__ == '__main__':
     parser.add_argument("--nni_manager_nfs_mount_path", type=str)
     parser.add_argument("--container_nfs_mount_path", type=str)
     # args for kubeflow and frameworkController
-    parser.add_argument("--nfs_server", type=str)
     parser.add_argument("--nfs_path", type=str)
     parser.add_argument("--keyvault_vaultname", type=str)
     parser.add_argument("--keyvault_name", type=str)
...
file: test/nni_test/nnitest/run_tests.py

@@ -23,7 +23,7 @@ from utils import (CLEAR, EXPERIMENT_URL, GREEN, RED, REST_ENDPOINT,
 it_variables = {}
 
-def update_training_service_config(config, training_service, config_file_path):
+def update_training_service_config(config, training_service, config_file_path, nni_source_dir):
     it_ts_config = get_yml_content(os.path.join('config', 'training_service.yml'))
     # hack for kubeflow trial config
@@ -53,6 +53,18 @@ def update_training_service_config(config, training_service, config_file_path):
         it_ts_config[training_service]['trial']['codeDir'] = containerCodeDir
         it_ts_config[training_service]['trial']['command'] = 'cd {0} && {1}'.format(containerCodeDir, config['trial']['command'])
 
+    if training_service == 'remote':
+        testcase_config = get_yml_content(nni_source_dir + config_file_path)
+        sharedStorage = testcase_config.get('sharedStorage')
+        if sharedStorage is None:
+            it_ts_config[training_service].pop('sharedStorage')
+        elif str(sharedStorage.get('storageType')).lower() == 'nfs':
+            it_ts_config[training_service].get('sharedStorage').pop('storageAccountKey')
+        elif str(sharedStorage.get('storageType')).lower() == 'azureblob':
+            it_ts_config[training_service].get('sharedStorage').pop('nfsServer')
+        else:
+            it_ts_config[training_service].pop('sharedStorage')
+
     if training_service == 'hybrid':
         it_ts_config = get_yml_content(os.path.join('config', 'training_service_v2.yml'))
     else:
@@ -75,7 +87,7 @@ def prepare_config_file(test_case_config, it_config, args):
     # apply training service config
     # user's gpuNum, logCollection config is overwritten by the config in training_service.yml
     # the hack for kubeflow should be applied at last step
-    update_training_service_config(test_yml_config, args.ts, test_case_config['configFile'])
+    update_training_service_config(test_yml_config, args.ts, test_case_config['configFile'], args.nni_source_dir)
 
     # generate temporary config yml file to launch experiment
     new_config_file = config_path + '.tmp'
@@ -238,6 +250,15 @@ def match_training_service(test_case_config, cur_training_service):
             return True
     return False
 
+def match_remoteConfig(test_case_config, nni_source_dir):
+    trainingservice_config = get_yml_content(os.path.join('config', 'training_service.yml'))
+    trainingservice_config_reuse_value = str(trainingservice_config['remote']['remoteConfig']['reuse']).lower()
+    testcase_config = get_yml_content(nni_source_dir + test_case_config['configFile'])
+    if testcase_config.get('remoteConfig') is not None:
+        if testcase_config['remoteConfig'].get('reuse') is not None:
+            return str(testcase_config['remoteConfig']['reuse']).lower() == trainingservice_config_reuse_value
+    return True
+
 def run(args):
     it_config = get_yml_content(args.config)
@@ -264,8 +285,13 @@ def run(args):
             print('skipped {}, training service {} not match [{}]'.format(
                 name, args.ts, test_case_config['trainingService']))
             continue
+        # remote mode need more time to cleanup
         if args.ts == 'remote' or args.ts == 'hybrid':
+            if args.ts == 'remote':
+                if not match_remoteConfig(test_case_config, args.nni_source_dir):
+                    print('skipped {}, remoteConfig not match.'.format(name))
+                    continue
             wait_for_port_available(8080, 240)
         else:
             wait_for_port_available(8080, 60)
...
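To make the new skip logic concrete: with the pipeline's first pass setting `reuse: true` in training_service.yml, a test case pinning `reuse: false` is skipped, one pinning `reuse: true` runs, and a case with no `remoteConfig` section runs in both passes. An offline mimic of `match_remoteConfig`'s comparison:

```python
# Both sides are stringified and lowercased, so YAML booleans and the
# strings 'true'/'false' compare equally.
def reuse_matches(testcase_config, ts_reuse_value):
    remote_config = testcase_config.get('remoteConfig')
    if remote_config is not None and remote_config.get('reuse') is not None:
        return str(remote_config['reuse']).lower() == str(ts_reuse_value).lower()
    return True  # unpinned cases run under either mode

assert reuse_matches({'remoteConfig': {'reuse': True}}, True)
assert not reuse_matches({'remoteConfig': {'reuse': False}}, 'true')
assert reuse_matches({}, False)
```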
file: test/nni_test/nnitest/validators.py

@@ -97,3 +97,17 @@ class NnicliValidator(ITValidator):
         print(exp.get_job_statistics())
         print(exp.get_experiment_status())
         print(exp.list_trial_jobs())
+
+
+class FileExistValidator(ITValidator):
+    def __call__(self, rest_endpoint, experiment_dir, nni_source_dir, **kwargs):
+        print(rest_endpoint)
+        exp_id = osp.split(experiment_dir)[-1]
+        rootpath = kwargs.get('rootpath')
+
+        metrics = requests.get(METRICS_URL).json()
+        for metric in metrics:
+            trial_id = metric['trialJobId']
+            checkpath = osp.join(rootpath, 'nni', exp_id, 'trials', trial_id, 'nnioutput', 'checkingfile.txt')
+            print('Checking shared storage log exists on trial', trial_id)
+            assert osp.exists(checkpath)
...
@@ -25,8 +25,9 @@ container = sys.argv[2]
 password = sys.argv[3]
 
 run_command(f'docker build --build-arg NNI_RELEASE={version} -t nnidev/nni-nightly .')
-run_command(f'docker run -d -t -p {port}:22 --name {container} nnidev/nni-nightly')
+run_command(f'docker run --privileged -d -t -p {port}:22 --name {container} nnidev/nni-nightly')
 run_command(f'docker exec {container} useradd --create-home --password {password} nni')
 run_command(['docker', 'exec', container, 'bash', '-c', f'echo "nni:{password}" | chpasswd'])
+run_command(['docker', 'exec', container, 'bash', '-c', 'echo "nni ALL=(ALL:ALL) NOPASSWD:ALL" >> /etc/sudoers'])
 run_command(f'docker exec {container} service ssh start')
 set_variable('docker_port', port)
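A likely reading of these two additions: FUSE-based mounts (blobfuse for AzureBlob) inside the worker container need device and mount privileges that a default `docker run` does not grant, hence `--privileged`, and the `nni` user gets passwordless sudo so mount commands issued over SSH can run non-interactively.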
@@ -34,13 +34,13 @@ fi
 id=$(lsb_release -i | cut -c16- | sed s/[[:space:]]//g)
 version=$(lsb_release -r | cut -c9- | sed s/[[:space:]]//g)
 
-if [ $id = "Ubuntu" ]
+if [ "$id" = "Ubuntu" ]
 then
     wget https://packages.microsoft.com/config/ubuntu/$version/packages-microsoft-prod.deb
-    sudo dpkg -i packages-microsoft-prod.deb
+    sudo DEBIAN_FRONTEND=noninteractive dpkg -i packages-microsoft-prod.deb
     sudo apt-get update
     sudo apt-get install -y blobfuse fuse
-elif [ $id = "CentOS" ] || [ $id = "RHEL" ]
+elif [ "$id" = "CentOS" ] || [ "$id" = "RHEL" ]
 then
     sudo rpm -Uvh https://packages.microsoft.com/config/rhel/$(echo $version | cut -c1)/packages-microsoft-prod.rpm
     sudo yum install -y blobfuse fuse
...
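Two small robustness fixes here: quoting `$id` keeps the `[` test from becoming a syntax error when the `lsb_release` output is empty, and `DEBIAN_FRONTEND=noninteractive` stops dpkg from blocking the CI run on interactive configuration prompts.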