Unverified Commit 858daf9f authored by yjjinjie's avatar yjjinjie Committed by GitHub
Browse files

[dlc] add resource prepare support (#5065)

parent bbf54a88
# working directory on DSW, please provide FULL path
searchSpaceFile: search_space.json
# the command on trial runner(or, DLC container), be aware of data_dir
trialCommand: python mnist.py --data_dir /root/data/{your_data_dir}
trialConcurrency: 1 # NOTE: please provide number <= 3 due to DLC system limit.
maxTrialNumber: 10
tuner:
name: TPE
classArgs:
optimize_mode: maximize
# ref: https://help.aliyun.com/document_detail/203290.html?spm=a2c4g.11186623.6.727.6f9b5db6bzJh4x
trainingService:
platform: dlc
type: Worker
image: registry-vpc.cn-beijing.aliyuncs.com/pai-dlc/pytorch-training:1.6.0-gpu-py37-cu101-ubuntu18.04
jobType: PyTorchJob # choices: [TFJob, PyTorchJob]
podCount: 1
ecsSpec: "{'resource_id':'rg19d2oleg252kke','cpu':2,'memory':8,'gpu':0,'gputype':'','shared_memory':''}" # resource ID,e.g., rg19d2oleg252kke
region: cn-hangzhou
workspaceId: ${your_workspace_id}
accessKeyId: ${your_ak_id}
accessKeySecret: ${your_ak_key}
nasDataSourceId: ${your_nas_data_source_id} # NAS datasource ID,e.g., datat56by9n1xt0a
ossDataSourceId: ${your_oss_data_source_id} # optional, OSS data source id.
localStorageMountPoint: /home/admin/workspace/ # default NAS path on DSW, MUST provide full path.
containerStorageMountPoint: /root/data/ # default NAS path on DLC container, change it according to your setting
......@@ -8,6 +8,7 @@ import pathlib
import sys
import traceback
import time
import ast
from argparse import ArgumentParser
# ref: https://help.aliyun.com/document_detail/203290.html?spm=a2c4g.11186623.6.727.6f9b5db6bzJh4x
from alibabacloud_pai_dlc20201203.client import Client
......@@ -71,6 +72,47 @@ if __name__ == "__main__":
data_source_id=args.oss_data_source_id,
)
if args.workspace_id == 'None':
args.workspace_id = None
logging.info("args.workspace_id %s %s",args.workspace_id,type(args.workspace_id))
data_sources = [nas_1]
if oss:
data_sources = [nas_1, oss]
if args.ecs_spec[0] == '{' and args.ecs_spec[-1] == '}':
config = ast.literal_eval(args.ecs_spec)
resource_id = config['resource_id']
cpu = config.get('cpu',1)
memory = config.get('memory',2)
gpu = config.get('gpu',0)
gputype = config.get('gputype',"")
shared_memory = config.get('shared_memory',"")
spec = JobSpec(
type=args.type,
image=args.image,
pod_count=args.pod_count,
resource_config=ResourceConfig(
cpu=str(cpu),
memory=str(memory) + "Gi",
gpu=str(gpu),
gputype=str(gputype),
shared_memory=str(shared_memory)
),
)
# Declare what the job will execute.
req = CreateJobRequest(
display_name=args.experiment_name,
job_type=args.job_type,
job_specs=[spec],
data_sources=data_sources,
user_command=args.user_command,
workspace_id=args.workspace_id,
resource_id=str(resource_id),
)
else:
# job spec
spec = JobSpec(
type=args.type,
......@@ -79,14 +121,6 @@ if __name__ == "__main__":
ecs_spec=args.ecs_spec,
)
if args.workspace_id == 'None':
args.workspace_id = None
logging.info("args.workspace_id %s %s",args.workspace_id,type(args.workspace_id))
data_sources = [nas_1]
if oss:
data_sources = [nas_1, oss]
req = CreateJobRequest(
display_name=args.experiment_name,
job_type=args.job_type,
......@@ -136,4 +170,3 @@ if __name__ == "__main__":
except Exception as e:
logging.exception('DLC submit Exception: \n')
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment