Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
31fbcf41
Unverified
Commit
31fbcf41
authored
Feb 15, 2022
by
Weidan Kong
Committed by
GitHub
Feb 16, 2022
Browse files
HPO: DLC mode support nas&oss at same time (#4506)
parent
f8d2ab31
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
59 additions
and
20 deletions
+59
-20
docs/source/TrainingService/DLCMode.rst
docs/source/TrainingService/DLCMode.rst
+2
-2
docs/source/reference/experiment_config.rst
docs/source/reference/experiment_config.rst
+4
-0
examples/trials/mnist-pytorch/config_dlc.yml
examples/trials/mnist-pytorch/config_dlc.yml
+1
-1
nni/experiment/config/training_services/dlc.py
nni/experiment/config/training_services/dlc.py
+2
-0
ts/nni_manager/common/experimentConfig.ts
ts/nni_manager/common/experimentConfig.ts
+1
-0
ts/nni_manager/config/dlc/dlcUtil.py
ts/nni_manager/config/dlc/dlcUtil.py
+34
-17
ts/nni_manager/training_service/reusable/dlc/dlcClient.ts
ts/nni_manager/training_service/reusable/dlc/dlcClient.ts
+13
-0
ts/nni_manager/training_service/reusable/environments/dlcEnvironmentService.ts
...ng_service/reusable/environments/dlcEnvironmentService.ts
+2
-0
No files found.
docs/source/TrainingService/DLCMode.rst
View file @
31fbcf41
...
...
@@ -51,10 +51,10 @@ Use ``examples/trials/mnist-pytorch`` as an example. The NNI config YAML file's
podCount: 1
ecsSpec: ecs.c6.large
region: cn-hangzhou
nasDataSourceId: ${your_nas_data_source_id}
accessKeyId: ${your_ak_id}
accessKeySecret: ${your_ak_key}
nasDataSourceId: ${your_nas_data_source_id} # NAS datasource ID,e.g., datat56by9n1xt0a
nasDataSourceId: ${your_nas_data_source_id} # NAS datasource ID, e.g., datat56by9n1xt0a
ossDataSourceId: ${your_oss_data_source_id} # OSS datasource ID, in case your data is on oss
localStorageMountPoint: /home/admin/workspace/ # default NAS path on DSW
containerStorageMountPoint: /root/data/ # default NAS path on DLC container, change it according to your setting
...
...
docs/source/reference/experiment_config.rst
View file @
31fbcf41
...
...
@@ -584,6 +584,10 @@ Detailed usage can be found `here <../TrainingService/DlcMode.rst>`__.
- ``str``
- The NAS datasource id configured on the PAI-DLC side.
* - ossDataSourceId
- ``str``
- The OSS datasource id configured on the PAI-DLC side; this is optional.
* - accessKeyId
- ``str``
- The accessKeyId of your cloud account.
...
...
examples/trials/mnist-pytorch/config_dlc.yml
View file @
31fbcf41
...
...
@@ -17,9 +17,9 @@ trainingService:
podCount
:
1
ecsSpec
:
ecs.c6.large
region
:
cn-hangzhou
nasDataSourceId
:
${your_nas_data_source_id}
accessKeyId
:
${your_ak_id}
accessKeySecret
:
${your_ak_key}
nasDataSourceId
:
${your_nas_data_source_id}
# NAS datasource ID, e.g., datat56by9n1xt0a
ossDataSourceId
:
${your_oss_data_source_id}
# optional, OSS data source id.
localStorageMountPoint
:
/home/admin/workspace/
# default NAS path on DSW, MUST provide full path.
containerStorageMountPoint
:
/root/data/
# default NAS path on DLC container, change it according to your setting
nni/experiment/config/training_services/dlc.py
View file @
31fbcf41
...
...
@@ -2,6 +2,7 @@
# Licensed under the MIT license.
from
dataclasses
import
dataclass
from
typing
import
Optional
from
..training_service
import
TrainingServiceConfig
...
...
@@ -17,6 +18,7 @@ class DlcConfig(TrainingServiceConfig):
ecs_spec
:
str
# e.g.,'ecs.c6.large'
region
:
str
nas_data_source_id
:
str
oss_data_source_id
:
Optional
[
str
]
=
None
access_key_id
:
str
access_key_secret
:
str
local_storage_mount_point
:
str
...
...
ts/nni_manager/common/experimentConfig.ts
View file @
31fbcf41
...
...
@@ -93,6 +93,7 @@ export interface DlcConfig extends TrainingServiceConfig {
ecsSpec
:
string
;
region
:
string
;
nasDataSourceId
:
string
;
ossDataSourceId
?:
string
;
accessKeyId
:
string
;
accessKeySecret
:
string
;
localStorageMountPoint
:
string
;
...
...
ts/nni_manager/config/dlc/dlcUtil.py
View file @
31fbcf41
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import
os
import
sys
import
time
import
json
import
traceback
from
argparse
import
ArgumentParser
# ref: https://help.aliyun.com/document_detail/203290.html?spm=a2c4g.11186623.6.727.6f9b5db6bzJh4x
from
alibabacloud_pai_dlc20201203.client
import
Client
...
...
@@ -20,10 +20,12 @@ if __name__ == "__main__":
parser
.
add_argument
(
'--ecs_spec'
,
help
=
'ecs spec'
)
parser
.
add_argument
(
'--region'
,
help
=
'region'
)
parser
.
add_argument
(
'--nas_data_source_id'
,
help
=
'nas data_source_id of DLC dataset configuration'
)
parser
.
add_argument
(
'--oss_data_source_id'
,
help
=
'oss data_source_id of DLC dataset configuration'
)
parser
.
add_argument
(
'--access_key_id'
,
help
=
'access_key_id'
)
parser
.
add_argument
(
'--access_key_secret'
,
help
=
'access_key_secret'
)
parser
.
add_argument
(
'--experiment_name'
,
help
=
'the experiment name'
)
parser
.
add_argument
(
'--user_command'
,
help
=
'user command'
)
parser
.
add_argument
(
'--log_dir'
,
help
=
'exception log dir'
)
args
=
parser
.
parse_args
()
# init client
...
...
@@ -37,10 +39,17 @@ if __name__ == "__main__":
)
nas_1
=
DataSourceItem
(
data_source_type
=
'nas'
,
data_source_type
=
'nas'
,
data_source_id
=
args
.
nas_data_source_id
,
)
oss
=
None
if
args
.
oss_data_source_id
:
oss
=
DataSourceItem
(
data_source_type
=
'oss'
,
data_source_id
=
args
.
oss_data_source_id
,
)
# job spec
spec
=
JobSpec
(
type
=
args
.
type
,
...
...
@@ -49,26 +58,34 @@ if __name__ == "__main__":
ecs_spec
=
args
.
ecs_spec
,
)
data_sources
=
[
nas_1
]
if
oss
:
data_sources
=
[
nas_1
,
oss
]
req
=
CreateJobRequest
(
display_name
=
args
.
experiment_name
,
job_type
=
args
.
job_type
,
job_specs
=
[
spec
],
data_sources
=
[
nas_1
]
,
data_sources
=
data_sources
,
user_command
=
args
.
user_command
)
# DLC submit
response
=
client
.
create_job
(
req
)
job_id
=
response
.
body
.
job_id
print
(
'job id: '
+
job_id
)
try
:
response
=
client
.
create_job
(
req
)
job_id
=
response
.
body
.
job_id
print
(
'job id: '
+
job_id
)
while
True
:
line
=
sys
.
stdin
.
readline
().
rstrip
()
if
line
==
'update_status'
:
print
(
'status:'
+
client
.
get_job
(
job_id
).
body
.
status
)
elif
line
==
'tracking_url'
:
#TODO: 1. get this url by api? 2. change this url in private dlc mode.
print
(
'tracking_url:'
+
f
'https://pai-dlc.console.aliyun.com/#/jobs/detail?jobId=
{
job_id
}
&regionId=
{
args
.
region
}
'
)
elif
line
==
'stop'
:
client
.
stop_job
(
job_id
)
exit
(
0
)
while
True
:
line
=
sys
.
stdin
.
readline
().
rstrip
()
if
line
==
'update_status'
:
print
(
'status:'
+
client
.
get_job
(
job_id
).
body
.
status
)
elif
line
==
'tracking_url'
:
#TODO: 1. get this url by api? 2. change this url in private dlc mode.
print
(
'tracking_url:'
+
f
'https://pai-dlc.console.aliyun.com/#/jobs/detail?jobId=
{
job_id
}
&regionId=
{
args
.
region
}
'
)
elif
line
==
'stop'
:
client
.
stop_job
(
job_id
)
exit
(
0
)
except
Exception
as
e
:
with
open
(
os
.
path
.
join
(
args
.
log_dir
,
'dlc_exception.log'
),
'w'
)
as
f
:
f
.
write
(
'DLC submit Exception:
\n
'
)
traceback
.
print_exc
(
file
=
f
)
ts/nni_manager/training_service/reusable/dlc/dlcClient.ts
View file @
31fbcf41
...
...
@@ -16,11 +16,14 @@ export class DlcClient {
// e.g., data1e6vg1tu0zi7, to generate it, go to 'Dataset Config' page of DLC
// create a NAS data and copy the 'DataSet ConfigurationID'
public
nasDataSourceId
:
string
;
public
ossDataSourceId
:
string
;
public
accessKeyId
:
string
;
public
accessKeySecret
:
string
;
public
experimentId
:
string
;
public
environmentId
:
string
;
public
userCommand
:
string
;
// dlcUtil exception log dir
public
logDir
:
string
;
public
pythonShellClient
:
undefined
|
PythonShell
;
constructor
(
...
...
@@ -36,6 +39,8 @@ export class DlcClient {
accessKeyId
:
string
,
accessKeySecret
:
string
,
userCommand
:
string
,
logDir
:
string
,
ossDataSourceId
?:
string
,
)
{
this
.
log
=
getLogger
(
'
DlcClient
'
);
this
.
type
=
type
;
...
...
@@ -46,11 +51,17 @@ export class DlcClient {
this
.
image
=
image
;
this
.
region
=
region
;
this
.
nasDataSourceId
=
nasDataSourceId
;
if
(
ossDataSourceId
!==
undefined
)
{
this
.
ossDataSourceId
=
ossDataSourceId
;
}
else
{
this
.
ossDataSourceId
=
''
;
}
this
.
accessKeyId
=
accessKeyId
;
this
.
accessKeySecret
=
accessKeySecret
this
.
experimentId
=
experimentId
;
this
.
environmentId
=
environmentId
;
this
.
userCommand
=
userCommand
;
this
.
logDir
=
logDir
;
}
public
submit
():
Promise
<
string
>
{
...
...
@@ -67,10 +78,12 @@ export class DlcClient {
'
--ecs_spec
'
,
this
.
ecsSpec
,
'
--region
'
,
this
.
region
,
'
--nas_data_source_id
'
,
this
.
nasDataSourceId
,
'
--oss_data_source_id
'
,
this
.
ossDataSourceId
,
'
--access_key_id
'
,
this
.
accessKeyId
,
'
--access_key_secret
'
,
this
.
accessKeySecret
,
'
--experiment_name
'
,
`nni_exp_
${
this
.
experimentId
}
_env_
${
this
.
environmentId
}
`
,
'
--user_command
'
,
this
.
userCommand
,
'
--log_dir
'
,
this
.
logDir
,
]
});
this
.
log
.
debug
(
this
.
pythonShellClient
.
command
);
...
...
ts/nni_manager/training_service/reusable/environments/dlcEnvironmentService.ts
View file @
31fbcf41
...
...
@@ -116,6 +116,8 @@ export class DlcEnvironmentService extends EnvironmentService {
this
.
config
.
accessKeyId
,
this
.
config
.
accessKeySecret
,
environment
.
command
,
dlcEnvironment
.
workingFolder
,
this
.
config
.
ossDataSourceId
,
);
dlcEnvironment
.
id
=
await
dlcClient
.
submit
();
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment