Unverified Commit 31fbcf41 authored by Weidan Kong's avatar Weidan Kong Committed by GitHub
Browse files

HPO: DLC mode support nas&oss at same time (#4506)

parent f8d2ab31
...@@ -51,10 +51,10 @@ Use ``examples/trials/mnist-pytorch`` as an example. The NNI config YAML file's ...@@ -51,10 +51,10 @@ Use ``examples/trials/mnist-pytorch`` as an example. The NNI config YAML file's
podCount: 1 podCount: 1
ecsSpec: ecs.c6.large ecsSpec: ecs.c6.large
region: cn-hangzhou region: cn-hangzhou
nasDataSourceId: ${your_nas_data_source_id}
accessKeyId: ${your_ak_id} accessKeyId: ${your_ak_id}
accessKeySecret: ${your_ak_key} accessKeySecret: ${your_ak_key}
nasDataSourceId: ${your_nas_data_source_id} # NAS datasource ID,e.g., datat56by9n1xt0a nasDataSourceId: ${your_nas_data_source_id} # NAS datasource ID, e.g., datat56by9n1xt0a
ossDataSourceId: ${your_oss_data_source_id} # OSS datasource ID, in case your data is on OSS
localStorageMountPoint: /home/admin/workspace/ # default NAS path on DSW localStorageMountPoint: /home/admin/workspace/ # default NAS path on DSW
containerStorageMountPoint: /root/data/ # default NAS path on DLC container, change it according to your setting containerStorageMountPoint: /root/data/ # default NAS path on DLC container, change it according to your setting
......
...@@ -584,6 +584,10 @@ Detailed usage can be found `here <../TrainingService/DlcMode.rst>`__. ...@@ -584,6 +584,10 @@ Detailed usage can be found `here <../TrainingService/DlcMode.rst>`__.
- ``str`` - ``str``
- The NAS datasource id configured on the PAI-DLC side. - The NAS datasource id configured on the PAI-DLC side.
* - ossDataSourceId
- ``str``
- The OSS datasource id configured on the PAI-DLC side; this is optional.
* - accessKeyId * - accessKeyId
- ``str`` - ``str``
- The accessKeyId of your cloud account. - The accessKeyId of your cloud account.
......
...@@ -17,9 +17,9 @@ trainingService: ...@@ -17,9 +17,9 @@ trainingService:
podCount: 1 podCount: 1
ecsSpec: ecs.c6.large ecsSpec: ecs.c6.large
region: cn-hangzhou region: cn-hangzhou
nasDataSourceId: ${your_nas_data_source_id}
accessKeyId: ${your_ak_id} accessKeyId: ${your_ak_id}
accessKeySecret: ${your_ak_key} accessKeySecret: ${your_ak_key}
nasDataSourceId: ${your_nas_data_source_id} # NAS datasource ID, e.g., datat56by9n1xt0a nasDataSourceId: ${your_nas_data_source_id} # NAS datasource ID, e.g., datat56by9n1xt0a
ossDataSourceId: ${your_oss_data_source_id} # optional, OSS data source id.
localStorageMountPoint: /home/admin/workspace/ # default NAS path on DSW, MUST provide full path. localStorageMountPoint: /home/admin/workspace/ # default NAS path on DSW, MUST provide full path.
containerStorageMountPoint: /root/data/ # default NAS path on DLC container, change it according to your setting containerStorageMountPoint: /root/data/ # default NAS path on DLC container, change it according to your setting
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
# Licensed under the MIT license. # Licensed under the MIT license.
from dataclasses import dataclass from dataclasses import dataclass
from typing import Optional
from ..training_service import TrainingServiceConfig from ..training_service import TrainingServiceConfig
...@@ -17,6 +18,7 @@ class DlcConfig(TrainingServiceConfig): ...@@ -17,6 +18,7 @@ class DlcConfig(TrainingServiceConfig):
ecs_spec: str # e.g.,'ecs.c6.large' ecs_spec: str # e.g.,'ecs.c6.large'
region: str region: str
nas_data_source_id: str nas_data_source_id: str
oss_data_source_id: Optional[str] = None
access_key_id: str access_key_id: str
access_key_secret: str access_key_secret: str
local_storage_mount_point: str local_storage_mount_point: str
......
...@@ -93,6 +93,7 @@ export interface DlcConfig extends TrainingServiceConfig { ...@@ -93,6 +93,7 @@ export interface DlcConfig extends TrainingServiceConfig {
ecsSpec: string; ecsSpec: string;
region: string; region: string;
nasDataSourceId: string; nasDataSourceId: string;
ossDataSourceId?: string;
accessKeyId: string; accessKeyId: string;
accessKeySecret: string; accessKeySecret: string;
localStorageMountPoint: string; localStorageMountPoint: string;
......
# Copyright (c) Microsoft Corporation. # Copyright (c) Microsoft Corporation.
# Licensed under the MIT license. # Licensed under the MIT license.
import os import os
import sys import sys
import time import traceback
import json
from argparse import ArgumentParser from argparse import ArgumentParser
# ref: https://help.aliyun.com/document_detail/203290.html?spm=a2c4g.11186623.6.727.6f9b5db6bzJh4x # ref: https://help.aliyun.com/document_detail/203290.html?spm=a2c4g.11186623.6.727.6f9b5db6bzJh4x
from alibabacloud_pai_dlc20201203.client import Client from alibabacloud_pai_dlc20201203.client import Client
...@@ -20,10 +20,12 @@ if __name__ == "__main__": ...@@ -20,10 +20,12 @@ if __name__ == "__main__":
parser.add_argument('--ecs_spec', help='ecs spec') parser.add_argument('--ecs_spec', help='ecs spec')
parser.add_argument('--region', help='region') parser.add_argument('--region', help='region')
parser.add_argument('--nas_data_source_id', help='nas data_source_id of DLC dataset configuration') parser.add_argument('--nas_data_source_id', help='nas data_source_id of DLC dataset configuration')
parser.add_argument('--oss_data_source_id', help='oss data_source_id of DLC dataset configuration')
parser.add_argument('--access_key_id', help='access_key_id') parser.add_argument('--access_key_id', help='access_key_id')
parser.add_argument('--access_key_secret', help='access_key_secret') parser.add_argument('--access_key_secret', help='access_key_secret')
parser.add_argument('--experiment_name', help='the experiment name') parser.add_argument('--experiment_name', help='the experiment name')
parser.add_argument('--user_command', help='user command') parser.add_argument('--user_command', help='user command')
parser.add_argument('--log_dir', help='exception log dir')
args = parser.parse_args() args = parser.parse_args()
# init client # init client
...@@ -37,10 +39,17 @@ if __name__ == "__main__": ...@@ -37,10 +39,17 @@ if __name__ == "__main__":
) )
nas_1 = DataSourceItem( nas_1 = DataSourceItem(
data_source_type = 'nas', data_source_type='nas',
data_source_id=args.nas_data_source_id, data_source_id=args.nas_data_source_id,
) )
oss = None
if args.oss_data_source_id:
oss = DataSourceItem(
data_source_type='oss',
data_source_id=args.oss_data_source_id,
)
# job spec # job spec
spec = JobSpec( spec = JobSpec(
type=args.type, type=args.type,
...@@ -49,15 +58,19 @@ if __name__ == "__main__": ...@@ -49,15 +58,19 @@ if __name__ == "__main__":
ecs_spec=args.ecs_spec, ecs_spec=args.ecs_spec,
) )
data_sources = [nas_1]
if oss:
data_sources = [nas_1, oss]
req = CreateJobRequest( req = CreateJobRequest(
display_name=args.experiment_name, display_name=args.experiment_name,
job_type=args.job_type, job_type=args.job_type,
job_specs=[spec], job_specs=[spec],
data_sources=[nas_1], data_sources=data_sources,
user_command=args.user_command user_command=args.user_command
) )
# DLC submit # DLC submit
try:
response = client.create_job(req) response = client.create_job(req)
job_id = response.body.job_id job_id = response.body.job_id
print('job id: ' + job_id) print('job id: ' + job_id)
...@@ -72,3 +85,7 @@ if __name__ == "__main__": ...@@ -72,3 +85,7 @@ if __name__ == "__main__":
elif line == 'stop': elif line == 'stop':
client.stop_job(job_id) client.stop_job(job_id)
exit(0) exit(0)
except Exception as e:
with open(os.path.join(args.log_dir, 'dlc_exception.log'), 'w') as f:
f.write('DLC submit Exception: \n')
traceback.print_exc(file=f)
...@@ -16,11 +16,14 @@ export class DlcClient { ...@@ -16,11 +16,14 @@ export class DlcClient {
// e.g., data1e6vg1tu0zi7, to generate it, go to 'Dataset Config' page of DLC // e.g., data1e6vg1tu0zi7, to generate it, go to 'Dataset Config' page of DLC
// create a NAS data and copy the 'DataSet ConfigurationID' // create a NAS data and copy the 'DataSet ConfigurationID'
public nasDataSourceId: string; public nasDataSourceId: string;
public ossDataSourceId: string;
public accessKeyId: string; public accessKeyId: string;
public accessKeySecret: string; public accessKeySecret: string;
public experimentId: string; public experimentId: string;
public environmentId: string; public environmentId: string;
public userCommand: string; public userCommand: string;
// dlcUtil exception log dir
public logDir: string;
public pythonShellClient: undefined | PythonShell; public pythonShellClient: undefined | PythonShell;
constructor( constructor(
...@@ -36,6 +39,8 @@ export class DlcClient { ...@@ -36,6 +39,8 @@ export class DlcClient {
accessKeyId: string, accessKeyId: string,
accessKeySecret: string, accessKeySecret: string,
userCommand: string, userCommand: string,
logDir: string,
ossDataSourceId?: string,
) { ) {
this.log = getLogger('DlcClient'); this.log = getLogger('DlcClient');
this.type = type; this.type = type;
...@@ -46,11 +51,17 @@ export class DlcClient { ...@@ -46,11 +51,17 @@ export class DlcClient {
this.image = image; this.image = image;
this.region = region; this.region = region;
this.nasDataSourceId = nasDataSourceId; this.nasDataSourceId = nasDataSourceId;
if (ossDataSourceId !== undefined) {
this.ossDataSourceId = ossDataSourceId;
} else {
this.ossDataSourceId = '';
}
this.accessKeyId = accessKeyId; this.accessKeyId = accessKeyId;
this.accessKeySecret = accessKeySecret this.accessKeySecret = accessKeySecret
this.experimentId = experimentId; this.experimentId = experimentId;
this.environmentId = environmentId; this.environmentId = environmentId;
this.userCommand = userCommand; this.userCommand = userCommand;
this.logDir = logDir;
} }
public submit(): Promise<string> { public submit(): Promise<string> {
...@@ -67,10 +78,12 @@ export class DlcClient { ...@@ -67,10 +78,12 @@ export class DlcClient {
'--ecs_spec', this.ecsSpec, '--ecs_spec', this.ecsSpec,
'--region', this.region, '--region', this.region,
'--nas_data_source_id', this.nasDataSourceId, '--nas_data_source_id', this.nasDataSourceId,
'--oss_data_source_id', this.ossDataSourceId,
'--access_key_id', this.accessKeyId, '--access_key_id', this.accessKeyId,
'--access_key_secret', this.accessKeySecret, '--access_key_secret', this.accessKeySecret,
'--experiment_name', `nni_exp_${this.experimentId}_env_${this.environmentId}`, '--experiment_name', `nni_exp_${this.experimentId}_env_${this.environmentId}`,
'--user_command', this.userCommand, '--user_command', this.userCommand,
'--log_dir', this.logDir,
] ]
}); });
this.log.debug(this.pythonShellClient.command); this.log.debug(this.pythonShellClient.command);
......
...@@ -116,6 +116,8 @@ export class DlcEnvironmentService extends EnvironmentService { ...@@ -116,6 +116,8 @@ export class DlcEnvironmentService extends EnvironmentService {
this.config.accessKeyId, this.config.accessKeyId,
this.config.accessKeySecret, this.config.accessKeySecret,
environment.command, environment.command,
dlcEnvironment.workingFolder,
this.config.ossDataSourceId,
); );
dlcEnvironment.id = await dlcClient.submit(); dlcEnvironment.id = await dlcClient.submit();
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment