Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
31fbcf41
Unverified
Commit
31fbcf41
authored
Feb 15, 2022
by
Weidan Kong
Committed by
GitHub
Feb 16, 2022
Browse files
HPO: DLC mode support nas&oss at same time (#4506)
parent
f8d2ab31
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
59 additions
and
20 deletions
+59
-20
docs/source/TrainingService/DLCMode.rst
docs/source/TrainingService/DLCMode.rst
+2
-2
docs/source/reference/experiment_config.rst
docs/source/reference/experiment_config.rst
+4
-0
examples/trials/mnist-pytorch/config_dlc.yml
examples/trials/mnist-pytorch/config_dlc.yml
+1
-1
nni/experiment/config/training_services/dlc.py
nni/experiment/config/training_services/dlc.py
+2
-0
ts/nni_manager/common/experimentConfig.ts
ts/nni_manager/common/experimentConfig.ts
+1
-0
ts/nni_manager/config/dlc/dlcUtil.py
ts/nni_manager/config/dlc/dlcUtil.py
+34
-17
ts/nni_manager/training_service/reusable/dlc/dlcClient.ts
ts/nni_manager/training_service/reusable/dlc/dlcClient.ts
+13
-0
ts/nni_manager/training_service/reusable/environments/dlcEnvironmentService.ts
...ng_service/reusable/environments/dlcEnvironmentService.ts
+2
-0
No files found.
docs/source/TrainingService/DLCMode.rst
View file @
31fbcf41
...
@@ -51,10 +51,10 @@ Use ``examples/trials/mnist-pytorch`` as an example. The NNI config YAML file's
...
@@ -51,10 +51,10 @@ Use ``examples/trials/mnist-pytorch`` as an example. The NNI config YAML file's
podCount: 1
podCount: 1
ecsSpec: ecs.c6.large
ecsSpec: ecs.c6.large
region: cn-hangzhou
region: cn-hangzhou
nasDataSourceId: ${your_nas_data_source_id}
accessKeyId: ${your_ak_id}
accessKeyId: ${your_ak_id}
accessKeySecret: ${your_ak_key}
accessKeySecret: ${your_ak_key}
nasDataSourceId: ${your_nas_data_source_id} # NAS datasource ID,e.g., datat56by9n1xt0a
nasDataSourceId: ${your_nas_data_source_id} # NAS datasource ID, e.g., datat56by9n1xt0a
ossDataSourceId: ${your_oss_data_source_id} # OSS datasource ID, in case your data is on oss
localStorageMountPoint: /home/admin/workspace/ # default NAS path on DSW
localStorageMountPoint: /home/admin/workspace/ # default NAS path on DSW
containerStorageMountPoint: /root/data/ # default NAS path on DLC container, change it according to your setting
containerStorageMountPoint: /root/data/ # default NAS path on DLC container, change it according to your setting
...
...
docs/source/reference/experiment_config.rst
View file @
31fbcf41
...
@@ -584,6 +584,10 @@ Detailed usage can be found `here <../TrainingService/DlcMode.rst>`__.
...
@@ -584,6 +584,10 @@ Detailed usage can be found `here <../TrainingService/DlcMode.rst>`__.
- ``str``
- ``str``
- The NAS datasource id configured on the PAI-DLC side.
- The NAS datasource id configured on the PAI-DLC side.
* - ossDataSourceId
- ``str``
- The OSS datasource id configured on the PAI-DLC side; this is optional.
* - accessKeyId
* - accessKeyId
- ``str``
- ``str``
- The accessKeyId of your cloud account.
- The accessKeyId of your cloud account.
...
...
examples/trials/mnist-pytorch/config_dlc.yml
View file @
31fbcf41
...
@@ -17,9 +17,9 @@ trainingService:
...
@@ -17,9 +17,9 @@ trainingService:
podCount
:
1
podCount
:
1
ecsSpec
:
ecs.c6.large
ecsSpec
:
ecs.c6.large
region
:
cn-hangzhou
region
:
cn-hangzhou
nasDataSourceId
:
${your_nas_data_source_id}
accessKeyId
:
${your_ak_id}
accessKeyId
:
${your_ak_id}
accessKeySecret
:
${your_ak_key}
accessKeySecret
:
${your_ak_key}
nasDataSourceId
:
${your_nas_data_source_id}
# NAS datasource ID,e.g., datat56by9n1xt0a
nasDataSourceId
:
${your_nas_data_source_id}
# NAS datasource ID,e.g., datat56by9n1xt0a
ossDataSourceId
:
${your_oss_data_source_id}
# optional, OSS data source id.
localStorageMountPoint
:
/home/admin/workspace/
# default NAS path on DSW, MUST provide full path.
localStorageMountPoint
:
/home/admin/workspace/
# default NAS path on DSW, MUST provide full path.
containerStorageMountPoint
:
/root/data/
# default NAS path on DLC container, change it according your setting
containerStorageMountPoint
:
/root/data/
# default NAS path on DLC container, change it according your setting
nni/experiment/config/training_services/dlc.py
View file @
31fbcf41
...
@@ -2,6 +2,7 @@
...
@@ -2,6 +2,7 @@
# Licensed under the MIT license.
# Licensed under the MIT license.
from
dataclasses
import
dataclass
from
dataclasses
import
dataclass
from
typing
import
Optional
from
..training_service
import
TrainingServiceConfig
from
..training_service
import
TrainingServiceConfig
...
@@ -17,6 +18,7 @@ class DlcConfig(TrainingServiceConfig):
...
@@ -17,6 +18,7 @@ class DlcConfig(TrainingServiceConfig):
ecs_spec
:
str
# e.g.,'ecs.c6.large'
ecs_spec
:
str
# e.g.,'ecs.c6.large'
region
:
str
region
:
str
nas_data_source_id
:
str
nas_data_source_id
:
str
oss_data_source_id
:
Optional
[
str
]
=
None
access_key_id
:
str
access_key_id
:
str
access_key_secret
:
str
access_key_secret
:
str
local_storage_mount_point
:
str
local_storage_mount_point
:
str
...
...
ts/nni_manager/common/experimentConfig.ts
View file @
31fbcf41
...
@@ -93,6 +93,7 @@ export interface DlcConfig extends TrainingServiceConfig {
...
@@ -93,6 +93,7 @@ export interface DlcConfig extends TrainingServiceConfig {
ecsSpec
:
string
;
ecsSpec
:
string
;
region
:
string
;
region
:
string
;
nasDataSourceId
:
string
;
nasDataSourceId
:
string
;
ossDataSourceId
?:
string
;
accessKeyId
:
string
;
accessKeyId
:
string
;
accessKeySecret
:
string
;
accessKeySecret
:
string
;
localStorageMountPoint
:
string
;
localStorageMountPoint
:
string
;
...
...
ts/nni_manager/config/dlc/dlcUtil.py
View file @
31fbcf41
# Copyright (c) Microsoft Corporation.
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
# Licensed under the MIT license.
import
os
import
os
import
sys
import
sys
import
time
import
traceback
import
json
from
argparse
import
ArgumentParser
from
argparse
import
ArgumentParser
# ref: https://help.aliyun.com/document_detail/203290.html?spm=a2c4g.11186623.6.727.6f9b5db6bzJh4x
# ref: https://help.aliyun.com/document_detail/203290.html?spm=a2c4g.11186623.6.727.6f9b5db6bzJh4x
from
alibabacloud_pai_dlc20201203.client
import
Client
from
alibabacloud_pai_dlc20201203.client
import
Client
...
@@ -20,10 +20,12 @@ if __name__ == "__main__":
...
@@ -20,10 +20,12 @@ if __name__ == "__main__":
parser
.
add_argument
(
'--ecs_spec'
,
help
=
'ecs spec'
)
parser
.
add_argument
(
'--ecs_spec'
,
help
=
'ecs spec'
)
parser
.
add_argument
(
'--region'
,
help
=
'region'
)
parser
.
add_argument
(
'--region'
,
help
=
'region'
)
parser
.
add_argument
(
'--nas_data_source_id'
,
help
=
'nas data_source_id of DLC dataset configuration'
)
parser
.
add_argument
(
'--nas_data_source_id'
,
help
=
'nas data_source_id of DLC dataset configuration'
)
parser
.
add_argument
(
'--oss_data_source_id'
,
help
=
'oss data_source_id of DLC dataset configuration'
)
parser
.
add_argument
(
'--access_key_id'
,
help
=
'access_key_id'
)
parser
.
add_argument
(
'--access_key_id'
,
help
=
'access_key_id'
)
parser
.
add_argument
(
'--access_key_secret'
,
help
=
'access_key_secret'
)
parser
.
add_argument
(
'--access_key_secret'
,
help
=
'access_key_secret'
)
parser
.
add_argument
(
'--experiment_name'
,
help
=
'the experiment name'
)
parser
.
add_argument
(
'--experiment_name'
,
help
=
'the experiment name'
)
parser
.
add_argument
(
'--user_command'
,
help
=
'user command'
)
parser
.
add_argument
(
'--user_command'
,
help
=
'user command'
)
parser
.
add_argument
(
'--log_dir'
,
help
=
'exception log dir'
)
args
=
parser
.
parse_args
()
args
=
parser
.
parse_args
()
# init client
# init client
...
@@ -37,10 +39,17 @@ if __name__ == "__main__":
...
@@ -37,10 +39,17 @@ if __name__ == "__main__":
)
)
nas_1
=
DataSourceItem
(
nas_1
=
DataSourceItem
(
data_source_type
=
'nas'
,
data_source_type
=
'nas'
,
data_source_id
=
args
.
nas_data_source_id
,
data_source_id
=
args
.
nas_data_source_id
,
)
)
oss
=
None
if
args
.
oss_data_source_id
:
oss
=
DataSourceItem
(
data_source_type
=
'oss'
,
data_source_id
=
args
.
oss_data_source_id
,
)
# job spec
# job spec
spec
=
JobSpec
(
spec
=
JobSpec
(
type
=
args
.
type
,
type
=
args
.
type
,
...
@@ -49,26 +58,34 @@ if __name__ == "__main__":
...
@@ -49,26 +58,34 @@ if __name__ == "__main__":
ecs_spec
=
args
.
ecs_spec
,
ecs_spec
=
args
.
ecs_spec
,
)
)
data_sources
=
[
nas_1
]
if
oss
:
data_sources
=
[
nas_1
,
oss
]
req
=
CreateJobRequest
(
req
=
CreateJobRequest
(
display_name
=
args
.
experiment_name
,
display_name
=
args
.
experiment_name
,
job_type
=
args
.
job_type
,
job_type
=
args
.
job_type
,
job_specs
=
[
spec
],
job_specs
=
[
spec
],
data_sources
=
[
nas_1
]
,
data_sources
=
data_sources
,
user_command
=
args
.
user_command
user_command
=
args
.
user_command
)
)
# DLC submit
# DLC submit
response
=
client
.
create_job
(
req
)
try
:
job_id
=
response
.
body
.
job_id
response
=
client
.
create_job
(
req
)
print
(
'job id: '
+
job_id
)
job_id
=
response
.
body
.
job_id
print
(
'job id: '
+
job_id
)
while
True
:
while
True
:
line
=
sys
.
stdin
.
readline
().
rstrip
()
line
=
sys
.
stdin
.
readline
().
rstrip
()
if
line
==
'update_status'
:
if
line
==
'update_status'
:
print
(
'status:'
+
client
.
get_job
(
job_id
).
body
.
status
)
print
(
'status:'
+
client
.
get_job
(
job_id
).
body
.
status
)
elif
line
==
'tracking_url'
:
elif
line
==
'tracking_url'
:
#TODO: 1. get this url by api? 2. change this url in private dlc mode.
#TODO: 1. get this url by api? 2. change this url in private dlc mode.
print
(
'tracking_url:'
+
f
'https://pai-dlc.console.aliyun.com/#/jobs/detail?jobId=
{
job_id
}
&regionId=
{
args
.
region
}
'
)
print
(
'tracking_url:'
+
f
'https://pai-dlc.console.aliyun.com/#/jobs/detail?jobId=
{
job_id
}
&regionId=
{
args
.
region
}
'
)
elif
line
==
'stop'
:
elif
line
==
'stop'
:
client
.
stop_job
(
job_id
)
client
.
stop_job
(
job_id
)
exit
(
0
)
exit
(
0
)
except
Exception
as
e
:
with
open
(
os
.
path
.
join
(
args
.
log_dir
,
'dlc_exception.log'
),
'w'
)
as
f
:
f
.
write
(
'DLC submit Exception:
\n
'
)
traceback
.
print_exc
(
file
=
f
)
ts/nni_manager/training_service/reusable/dlc/dlcClient.ts
View file @
31fbcf41
...
@@ -16,11 +16,14 @@ export class DlcClient {
...
@@ -16,11 +16,14 @@ export class DlcClient {
// e.g., data1e6vg1tu0zi7, to generate it, go to 'Dataset Config' page of DLC
// e.g., data1e6vg1tu0zi7, to generate it, go to 'Dataset Config' page of DLC
// create a NAS data and copy the 'DataSet ConfigurationID'
// create a NAS data and copy the 'DataSet ConfigurationID'
public
nasDataSourceId
:
string
;
public
nasDataSourceId
:
string
;
public
ossDataSourceId
:
string
;
public
accessKeyId
:
string
;
public
accessKeyId
:
string
;
public
accessKeySecret
:
string
;
public
accessKeySecret
:
string
;
public
experimentId
:
string
;
public
experimentId
:
string
;
public
environmentId
:
string
;
public
environmentId
:
string
;
public
userCommand
:
string
;
public
userCommand
:
string
;
// dlcUtil exception log dir
public
logDir
:
string
;
public
pythonShellClient
:
undefined
|
PythonShell
;
public
pythonShellClient
:
undefined
|
PythonShell
;
constructor
(
constructor
(
...
@@ -36,6 +39,8 @@ export class DlcClient {
...
@@ -36,6 +39,8 @@ export class DlcClient {
accessKeyId
:
string
,
accessKeyId
:
string
,
accessKeySecret
:
string
,
accessKeySecret
:
string
,
userCommand
:
string
,
userCommand
:
string
,
logDir
:
string
,
ossDataSourceId
?:
string
,
)
{
)
{
this
.
log
=
getLogger
(
'
DlcClient
'
);
this
.
log
=
getLogger
(
'
DlcClient
'
);
this
.
type
=
type
;
this
.
type
=
type
;
...
@@ -46,11 +51,17 @@ export class DlcClient {
...
@@ -46,11 +51,17 @@ export class DlcClient {
this
.
image
=
image
;
this
.
image
=
image
;
this
.
region
=
region
;
this
.
region
=
region
;
this
.
nasDataSourceId
=
nasDataSourceId
;
this
.
nasDataSourceId
=
nasDataSourceId
;
if
(
ossDataSourceId
!==
undefined
)
{
this
.
ossDataSourceId
=
ossDataSourceId
;
}
else
{
this
.
ossDataSourceId
=
''
;
}
this
.
accessKeyId
=
accessKeyId
;
this
.
accessKeyId
=
accessKeyId
;
this
.
accessKeySecret
=
accessKeySecret
this
.
accessKeySecret
=
accessKeySecret
this
.
experimentId
=
experimentId
;
this
.
experimentId
=
experimentId
;
this
.
environmentId
=
environmentId
;
this
.
environmentId
=
environmentId
;
this
.
userCommand
=
userCommand
;
this
.
userCommand
=
userCommand
;
this
.
logDir
=
logDir
;
}
}
public
submit
():
Promise
<
string
>
{
public
submit
():
Promise
<
string
>
{
...
@@ -67,10 +78,12 @@ export class DlcClient {
...
@@ -67,10 +78,12 @@ export class DlcClient {
'
--ecs_spec
'
,
this
.
ecsSpec
,
'
--ecs_spec
'
,
this
.
ecsSpec
,
'
--region
'
,
this
.
region
,
'
--region
'
,
this
.
region
,
'
--nas_data_source_id
'
,
this
.
nasDataSourceId
,
'
--nas_data_source_id
'
,
this
.
nasDataSourceId
,
'
--oss_data_source_id
'
,
this
.
ossDataSourceId
,
'
--access_key_id
'
,
this
.
accessKeyId
,
'
--access_key_id
'
,
this
.
accessKeyId
,
'
--access_key_secret
'
,
this
.
accessKeySecret
,
'
--access_key_secret
'
,
this
.
accessKeySecret
,
'
--experiment_name
'
,
`nni_exp_
${
this
.
experimentId
}
_env_
${
this
.
environmentId
}
`
,
'
--experiment_name
'
,
`nni_exp_
${
this
.
experimentId
}
_env_
${
this
.
environmentId
}
`
,
'
--user_command
'
,
this
.
userCommand
,
'
--user_command
'
,
this
.
userCommand
,
'
--log_dir
'
,
this
.
logDir
,
]
]
});
});
this
.
log
.
debug
(
this
.
pythonShellClient
.
command
);
this
.
log
.
debug
(
this
.
pythonShellClient
.
command
);
...
...
ts/nni_manager/training_service/reusable/environments/dlcEnvironmentService.ts
View file @
31fbcf41
...
@@ -116,6 +116,8 @@ export class DlcEnvironmentService extends EnvironmentService {
...
@@ -116,6 +116,8 @@ export class DlcEnvironmentService extends EnvironmentService {
this
.
config
.
accessKeyId
,
this
.
config
.
accessKeyId
,
this
.
config
.
accessKeySecret
,
this
.
config
.
accessKeySecret
,
environment
.
command
,
environment
.
command
,
dlcEnvironment
.
workingFolder
,
this
.
config
.
ossDataSourceId
,
);
);
dlcEnvironment
.
id
=
await
dlcClient
.
submit
();
dlcEnvironment
.
id
=
await
dlcClient
.
submit
();
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment