Unverified Commit e50ca8d3 authored by SparkSnail's avatar SparkSnail Committed by GitHub
Browse files

Support reuse mode for pipeline (#4310)

parent 21256bf9
...@@ -50,3 +50,19 @@ jobs: ...@@ -50,3 +50,19 @@ jobs:
--nni_manager_ip $(manager_ip) --nni_manager_ip $(manager_ip)
python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts frameworkcontroller --exclude multi-phase,multi-thread python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts frameworkcontroller --exclude multi-phase,multi-thread
displayName: Integration test displayName: Integration test
# Reuse-mode variant of the frameworkcontroller integration test step.
# It first regenerates the training-service config via generate_ts_config.py with
# --reuse_mode True and --config_version v2, then runs run_tests.py for the
# frameworkcontroller training service with --reuse_mode True, excluding the
# multi-phase and multi-thread cases. `set -e` aborts the step on the first failing command.
- script: |
set -e
cd test
python3 nni_test/nnitest/generate_ts_config.py \
--ts frameworkcontroller \
--keyvault_vaultname $(keyvault_vaultname) \
--keyvault_name $(keyvault_name) \
--azs_account $(azs_account) \
--azs_share $(azs_share) \
--nni_docker_image nnidev/nni-nightly \
--nni_manager_ip $(manager_ip) \
--reuse_mode True \
--config_version v2
python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts frameworkcontroller --reuse_mode True --exclude multi-phase,multi-thread
displayName: Integration test (reuse mode)
...@@ -61,3 +61,20 @@ jobs: ...@@ -61,3 +61,20 @@ jobs:
--nni_manager_ip $(manager_ip) --nni_manager_ip $(manager_ip)
python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts kubeflow --exclude multi-phase,multi-thread python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts kubeflow --exclude multi-phase,multi-thread
displayName: Integration test displayName: Integration test
# Reuse-mode variant of the kubeflow integration test step.
# It logs in to Azure with the service principal, regenerates the training-service
# config via generate_ts_config.py with --reuse_mode True and --config_version v2,
# then runs run_tests.py for the kubeflow training service with --reuse_mode True,
# excluding the multi-phase and multi-thread cases.
# `set -e` aborts the step on the first failing command.
# Every argument line of the generate_ts_config.py call must end in a bare backslash
# so the shell treats the whole invocation as one command.
- script: |
set -e
cd test
az login --service-principal -u $(client_id) -p $(client_secret) --tenant $(tenant_id)
python3 nni_test/nnitest/generate_ts_config.py \
--ts kubeflow \
--keyvault_vaultname $(keyvault_vaultname) \
--keyvault_name $(keyvault_name) \
--azs_account $(azs_account) \
--azs_share $(azs_share) \
--nni_docker_image nnidev/nni-nightly \
--nni_manager_ip $(manager_ip) \
--reuse_mode True \
--config_version v2
python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts kubeflow --reuse_mode True --exclude multi-phase,multi-thread
displayName: Integration test (reuse mode)
...@@ -12,3 +12,53 @@ hybrid: ...@@ -12,3 +12,53 @@ hybrid:
resourceGroup: resourceGroup:
workspaceName: workspaceName:
computeTarget: computeTarget:
# kubeflow test configuration in the reuse-mode layout (trainingService.reuseMode: true).
# Runs up to 2 trials, 2 at a time, with no GPU; the single worker uses tf-operator
# and Azure storage.
# NOTE(review): the blank values (nniManagerIp, command, code_directory, dockerImage,
# azureAccount, azureShare, keyVaultName, keyVaultKey) look like placeholders that the
# test scripts fill in at run time — confirm against generate_ts_config.py / run_tests.py.
# NOTE(review): generate_ts_config.py assigns config[<ts>]['nni_manager_ip'], which does
# not match the `nniManagerIp` key declared here — confirm which spelling is intended.
kubeflow:
trialGpuNumber: 0
trialConcurrency: 2
maxTrialNumber: 2
nniManagerIp:
trainingService:
reuseMode: true
platform: kubeflow
worker:
command:
code_directory:
dockerImage:
cpuNumber: 1
gpuNumber: 0
memorySize: 8192
replicas: 1
operator: tf-operator
storage:
storageType: azureStorage
azureAccount:
azureShare:
keyVaultName:
keyVaultKey:
apiVersion: v1
# frameworkcontroller test configuration in the reuse-mode layout
# (trainingService.reuseMode: true).
# Runs up to 2 trials, 2 at a time, with no GPU; it declares one task role named
# "worker" and uses Azure storage.
# NOTE(review): the blank values (nniManagerIp, dockerImage, command, azureAccount,
# azureShare, keyVaultName, keyVaultKey) look like placeholders that the test scripts
# fill in at run time — confirm against generate_ts_config.py / run_tests.py.
# NOTE(review): generate_ts_config.py assigns config[<ts>]['nni_manager_ip'], which does
# not match the `nniManagerIp` key declared here — confirm which spelling is intended.
# NOTE(review): the completion-policy keys mix snake_case and camelCase
# (min_failed_task_count vs minSucceedTaskCount) — confirm both are accepted by the loader.
frameworkcontroller:
trialGpuNumber: 0
trialConcurrency: 2
maxTrialNumber: 2
nniManagerIp:
trainingService:
reuseMode: true
platform: frameworkcontroller
serviceAccountName: frameworkcontroller
taskRoles:
- name: worker
dockerImage:
taskNumber: 1
command:
gpuNumber: 0
cpuNumber: 1
memorySize: 8192
framework_attempt_completion_policy:
min_failed_task_count: 1
minSucceedTaskCount: 1
storage:
storageType: azureStorage
azureAccount:
azureShare:
keyVaultName:
keyVaultKey:
\ No newline at end of file
...@@ -35,7 +35,7 @@ def update_training_service_config(args): ...@@ -35,7 +35,7 @@ def update_training_service_config(args):
config[args.ts]['trial']['virtualCluster'] = args.vc config[args.ts]['trial']['virtualCluster'] = args.vc
if args.debug is not None: if args.debug is not None:
config[args.ts]['debug'] = args.debug.lower() == 'true' config[args.ts]['debug'] = args.debug.lower() == 'true'
elif args.ts == 'kubeflow': elif args.ts == 'kubeflow' and args.reuse_mode == 'False':
if args.nfs_server is not None: if args.nfs_server is not None:
config[args.ts]['kubeflowConfig']['nfs']['server'] = args.nfs_server config[args.ts]['kubeflowConfig']['nfs']['server'] = args.nfs_server
if args.nfs_path is not None: if args.nfs_path is not None:
...@@ -50,7 +50,16 @@ def update_training_service_config(args): ...@@ -50,7 +50,16 @@ def update_training_service_config(args):
config[args.ts]['kubeflowConfig']['azureStorage']['azureShare'] = args.azs_share config[args.ts]['kubeflowConfig']['azureStorage']['azureShare'] = args.azs_share
if args.nni_docker_image is not None: if args.nni_docker_image is not None:
config[args.ts]['trial']['worker']['image'] = args.nni_docker_image config[args.ts]['trial']['worker']['image'] = args.nni_docker_image
elif args.ts == 'frameworkcontroller': elif args.ts == 'kubeflow' and args.reuse_mode == 'True':
config = get_yml_content(TRAINING_SERVICE_FILE_V2)
config[args.ts]['trainingService']['worker']['dockerImage'] = args.nni_docker_image
config[args.ts]['trainingService']['storage']['azureAccount'] = args.azs_account
config[args.ts]['trainingService']['storage']['azureShare'] = args.azs_share
config[args.ts]['trainingService']['storage']['keyVaultName'] = args.keyvault_name
config[args.ts]['trainingService']['storage']['keyVaultKey'] = args.keyvault_vaultname
config[args.ts]['nni_manager_ip'] = args.nni_manager_ip
dump_yml_content(TRAINING_SERVICE_FILE_V2, config)
elif args.ts == 'frameworkcontroller' and args.reuse_mode == 'False':
if args.nfs_server is not None: if args.nfs_server is not None:
config[args.ts]['frameworkcontrollerConfig']['nfs']['server'] = args.nfs_server config[args.ts]['frameworkcontrollerConfig']['nfs']['server'] = args.nfs_server
if args.nfs_path is not None: if args.nfs_path is not None:
...@@ -65,6 +74,15 @@ def update_training_service_config(args): ...@@ -65,6 +74,15 @@ def update_training_service_config(args):
config[args.ts]['frameworkcontrollerConfig']['azureStorage']['azureShare'] = args.azs_share config[args.ts]['frameworkcontrollerConfig']['azureStorage']['azureShare'] = args.azs_share
if args.nni_docker_image is not None: if args.nni_docker_image is not None:
config[args.ts]['trial']['taskRoles'][0]['image'] = args.nni_docker_image config[args.ts]['trial']['taskRoles'][0]['image'] = args.nni_docker_image
elif args.ts == 'frameworkcontroller' and args.reuse_mode == 'True':
config = get_yml_content(TRAINING_SERVICE_FILE_V2)
config[args.ts]['trainingService']['taskRoles'][0]['dockerImage'] = args.nni_docker_image
config[args.ts]['trainingService']['storage']['azureAccount'] = args.azs_account
config[args.ts]['trainingService']['storage']['azureShare'] = args.azs_share
config[args.ts]['trainingService']['storage']['keyVaultName'] = args.keyvault_name
config[args.ts]['trainingService']['storage']['keyVaultKey'] = args.keyvault_vaultname
config[args.ts]['nni_manager_ip'] = args.nni_manager_ip
dump_yml_content(TRAINING_SERVICE_FILE_V2, config)
elif args.ts == 'remote': elif args.ts == 'remote':
if args.remote_user is not None: if args.remote_user is not None:
config[args.ts]['machineList'][0]['username'] = args.remote_user config[args.ts]['machineList'][0]['username'] = args.remote_user
...@@ -134,6 +152,7 @@ if __name__ == '__main__': ...@@ -134,6 +152,7 @@ if __name__ == '__main__':
parser.add_argument("--config_version", type=str, choices=['v1', 'v2'], default='v1') parser.add_argument("--config_version", type=str, choices=['v1', 'v2'], default='v1')
parser.add_argument("--nni_docker_image", type=str) parser.add_argument("--nni_docker_image", type=str)
parser.add_argument("--nni_manager_ip", type=str) parser.add_argument("--nni_manager_ip", type=str)
parser.add_argument("--reuse_mode", type=str, default='False')
# args for remote with shared storage # args for remote with shared storage
parser.add_argument("--azurestoragetoken", type=str) parser.add_argument("--azurestoragetoken", type=str)
parser.add_argument("--nfs_server", type=str) parser.add_argument("--nfs_server", type=str)
......
...@@ -23,21 +23,27 @@ from utils import (CLEAR, EXPERIMENT_URL, GREEN, RED, REST_ENDPOINT, ...@@ -23,21 +23,27 @@ from utils import (CLEAR, EXPERIMENT_URL, GREEN, RED, REST_ENDPOINT,
it_variables = {} it_variables = {}
def update_training_service_config(config, training_service, config_file_path, nni_source_dir): def update_training_service_config(config, training_service, config_file_path, nni_source_dir, reuse_mode='False'):
it_ts_config = get_yml_content(os.path.join('config', 'training_service.yml')) it_ts_config = get_yml_content(os.path.join('config', 'training_service.yml'))
# hack for kubeflow trial config # hack for kubeflow trial config
if training_service == 'kubeflow': if training_service == 'kubeflow' and reuse_mode == 'False':
it_ts_config[training_service]['trial']['worker']['command'] = config['trial']['command'] it_ts_config[training_service]['trial']['worker']['command'] = config['trial']['command']
config['trial'].pop('command') config['trial'].pop('command')
if 'gpuNum' in config['trial']: if 'gpuNum' in config['trial']:
config['trial'].pop('gpuNum') config['trial'].pop('gpuNum')
elif training_service == 'kubeflow' and reuse_mode == 'True':
it_ts_config = get_yml_content(os.path.join('config', 'training_service_v2.yml'))
it_ts_config['trainingService']['worker']['command'] = config['trialCommand']
if training_service == 'frameworkcontroller': if training_service == 'frameworkcontroller' and reuse_mode == 'False':
it_ts_config[training_service]['trial']['taskRoles'][0]['command'] = config['trial']['command'] it_ts_config[training_service]['trial']['taskRoles'][0]['command'] = config['trial']['command']
config['trial'].pop('command') config['trial'].pop('command')
if 'gpuNum' in config['trial']: if 'gpuNum' in config['trial']:
config['trial'].pop('gpuNum') config['trial'].pop('gpuNum')
elif training_service == 'frameworkcontroller' and reuse_mode == 'True':
it_ts_config = get_yml_content(os.path.join('config', 'training_service_v2.yml'))
it_ts_config['trainingService']['taskRoles'][0]['command'] = config['trialCommand']
if training_service == 'adl': if training_service == 'adl':
# hack for adl trial config, codeDir in adl mode refers to path in container # hack for adl trial config, codeDir in adl mode refers to path in container
...@@ -88,7 +94,7 @@ def prepare_config_file(test_case_config, it_config, args): ...@@ -88,7 +94,7 @@ def prepare_config_file(test_case_config, it_config, args):
# apply training service config # apply training service config
# user's gpuNum, logCollection config is overwritten by the config in training_service.yml # user's gpuNum, logCollection config is overwritten by the config in training_service.yml
# the hack for kubeflow should be applied at last step # the hack for kubeflow should be applied at last step
update_training_service_config(test_yml_config, args.ts, test_case_config['configFile'], args.nni_source_dir) update_training_service_config(test_yml_config, args.ts, test_case_config['configFile'], args.nni_source_dir, args.reuse_mode)
# generate temporary config yml file to launch experiment # generate temporary config yml file to launch experiment
new_config_file = config_path + '.tmp' new_config_file = config_path + '.tmp'
...@@ -313,6 +319,7 @@ if __name__ == '__main__': ...@@ -313,6 +319,7 @@ if __name__ == '__main__':
parser.add_argument("--nni_source_dir", type=str, default='../') parser.add_argument("--nni_source_dir", type=str, default='../')
parser.add_argument("--cases", type=str, default=None) parser.add_argument("--cases", type=str, default=None)
parser.add_argument("--exclude", type=str, default=None) parser.add_argument("--exclude", type=str, default=None)
parser.add_argument("--reuse_mode", type=str, default='False')
parser.add_argument("--ts", type=str, choices=['local', 'remote', 'pai', parser.add_argument("--ts", type=str, choices=['local', 'remote', 'pai',
'kubeflow', 'frameworkcontroller', 'adl', 'aml', 'hybrid'], default='local') 'kubeflow', 'frameworkcontroller', 'adl', 'aml', 'hybrid'], default='local')
args = parser.parse_args() args = parser.parse_args()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment