Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
858daf9f
Unverified
Commit
858daf9f
authored
Aug 26, 2022
by
yjjinjie
Committed by
GitHub
Aug 26, 2022
Browse files
[dlc] add resource prepare support (#5065)
parent
bbf54a88
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
78 additions
and
19 deletions
+78
-19
examples/trials/mnist-pytorch/config_dlc_resource.yml
examples/trials/mnist-pytorch/config_dlc_resource.yml
+26
-0
ts/nni_manager/config/dlc/dlcUtil.py
ts/nni_manager/config/dlc/dlcUtil.py
+52
-19
No files found.
examples/trials/mnist-pytorch/config_dlc_resource.yml
0 → 100644
View file @
858daf9f
# working directory on DSW, please provide the FULL path
searchSpaceFile
:
search_space.json
# the command on trial runner(or, DLC container), be aware of data_dir
trialCommand
:
python mnist.py --data_dir /root/data/{your_data_dir}
trialConcurrency
:
1
# NOTE: please provide number <= 3 due to DLC system limit.
maxTrialNumber
:
10
tuner
:
name
:
TPE
classArgs
:
optimize_mode
:
maximize
# ref: https://help.aliyun.com/document_detail/203290.html?spm=a2c4g.11186623.6.727.6f9b5db6bzJh4x
trainingService
:
platform
:
dlc
type
:
Worker
image
:
registry-vpc.cn-beijing.aliyuncs.com/pai-dlc/pytorch-training:1.6.0-gpu-py37-cu101-ubuntu18.04
jobType
:
PyTorchJob
# choices: [TFJob, PyTorchJob]
podCount
:
1
ecsSpec
:
"
{'resource_id':'rg19d2oleg252kke','cpu':2,'memory':8,'gpu':0,'gputype':'','shared_memory':''}"
# resource ID, e.g., rg19d2oleg252kke
region
:
cn-hangzhou
workspaceId
:
${your_workspace_id}
accessKeyId
:
${your_ak_id}
accessKeySecret
:
${your_ak_key}
nasDataSourceId
:
${your_nas_data_source_id}
# NAS data source ID, e.g., datat56by9n1xt0a
ossDataSourceId
:
${your_oss_data_source_id}
# optional, OSS data source id.
localStorageMountPoint
:
/home/admin/workspace/
# default NAS path on DSW, MUST provide full path.
containerStorageMountPoint
:
/root/data/
# default NAS path on DLC container, change it according to your settings
ts/nni_manager/config/dlc/dlcUtil.py
View file @
858daf9f
...
...
@@ -8,6 +8,7 @@ import pathlib
import
sys
import
traceback
import
time
import
ast
from
argparse
import
ArgumentParser
# ref: https://help.aliyun.com/document_detail/203290.html?spm=a2c4g.11186623.6.727.6f9b5db6bzJh4x
from
alibabacloud_pai_dlc20201203.client
import
Client
...
...
@@ -71,15 +72,6 @@ if __name__ == "__main__":
data_source_id
=
args
.
oss_data_source_id
,
)
# job spec
spec
=
JobSpec
(
type
=
args
.
type
,
image
=
args
.
image
,
pod_count
=
args
.
pod_count
,
ecs_spec
=
args
.
ecs_spec
,
)
if
args
.
workspace_id
==
'None'
:
args
.
workspace_id
=
None
logging
.
info
(
"args.workspace_id %s %s"
,
args
.
workspace_id
,
type
(
args
.
workspace_id
))
...
...
@@ -87,14 +79,56 @@ if __name__ == "__main__":
data_sources
=
[
nas_1
]
if
oss
:
data_sources
=
[
nas_1
,
oss
]
req
=
CreateJobRequest
(
display_name
=
args
.
experiment_name
,
job_type
=
args
.
job_type
,
job_specs
=
[
spec
],
data_sources
=
data_sources
,
user_command
=
args
.
user_command
,
workspace_id
=
args
.
workspace_id
,
)
if
args
.
ecs_spec
[
0
]
==
'{'
and
args
.
ecs_spec
[
-
1
]
==
'}'
:
config
=
ast
.
literal_eval
(
args
.
ecs_spec
)
resource_id
=
config
[
'resource_id'
]
cpu
=
config
.
get
(
'cpu'
,
1
)
memory
=
config
.
get
(
'memory'
,
2
)
gpu
=
config
.
get
(
'gpu'
,
0
)
gputype
=
config
.
get
(
'gputype'
,
""
)
shared_memory
=
config
.
get
(
'shared_memory'
,
""
)
spec
=
JobSpec
(
type
=
args
.
type
,
image
=
args
.
image
,
pod_count
=
args
.
pod_count
,
resource_config
=
ResourceConfig
(
cpu
=
str
(
cpu
),
memory
=
str
(
memory
)
+
"Gi"
,
gpu
=
str
(
gpu
),
gputype
=
str
(
gputype
),
shared_memory
=
str
(
shared_memory
)
),
)
# Declare what the job will execute.
req
=
CreateJobRequest
(
display_name
=
args
.
experiment_name
,
job_type
=
args
.
job_type
,
job_specs
=
[
spec
],
data_sources
=
data_sources
,
user_command
=
args
.
user_command
,
workspace_id
=
args
.
workspace_id
,
resource_id
=
str
(
resource_id
),
)
else
:
# job spec
spec
=
JobSpec
(
type
=
args
.
type
,
image
=
args
.
image
,
pod_count
=
args
.
pod_count
,
ecs_spec
=
args
.
ecs_spec
,
)
req
=
CreateJobRequest
(
display_name
=
args
.
experiment_name
,
job_type
=
args
.
job_type
,
job_specs
=
[
spec
],
data_sources
=
data_sources
,
user_command
=
args
.
user_command
,
workspace_id
=
args
.
workspace_id
,
)
response
=
client
.
create_job
(
req
)
job_id
=
response
.
body
.
job_id
...
...
@@ -135,5 +169,4 @@ if __name__ == "__main__":
except
Exception
as
e
:
logging
.
exception
(
'DLC submit Exception:
\n
'
)
logging
.
exception
(
'DLC submit Exception:
\n
'
)
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment