"vscode:/vscode.git/clone" did not exist on "24fa4619cf01bc4279ff4ba2051bb73f8f049ee3"
Unverified commit 7afe8a71, authored by Weidan Kong, committed by GitHub
Browse files

[DLC]: pai-dlc api update & log folder update (#4909)

parent 2815fb1f
......@@ -51,6 +51,7 @@ Use ``examples/trials/mnist-pytorch`` as an example. The NNI config YAML file's
podCount: 1
ecsSpec: ecs.c6.large
region: cn-hangzhou
workspaceId: ${your_workspace_id}
accessKeyId: ${your_ak_id}
accessKeySecret: ${your_ak_key}
nasDataSourceId: ${your_nas_data_source_id} # NAS datasource ID, e.g., datat56by9n1xt0a
......
......@@ -17,6 +17,7 @@ trainingService:
podCount: 1
ecsSpec: ecs.c6.large
region: cn-hangzhou
workspaceId: ${your_workspace_id}
accessKeyId: ${your_ak_id}
accessKeySecret: ${your_ak_key}
nasDataSourceId: ${your_nas_data_source_id} # NAS datasource ID, e.g., datat56by9n1xt0a
......
......@@ -17,6 +17,7 @@ class DlcConfig(TrainingServiceConfig):
pod_count: int
ecs_spec: str # e.g.,'ecs.c6.large'
region: str
workspace_id: str
nas_data_source_id: str
oss_data_source_id: Optional[str] = None
access_key_id: str
......
......@@ -92,6 +92,7 @@ export interface DlcConfig extends TrainingServiceConfig {
podCount: number;
ecsSpec: string;
region: string;
workspaceId: string;
nasDataSourceId: string;
ossDataSourceId?: string;
accessKeyId: string;
......
......@@ -2,7 +2,9 @@
# Licensed under the MIT license.
import logging
import os
import pathlib
import sys
import traceback
from argparse import ArgumentParser
......@@ -19,6 +21,7 @@ if __name__ == "__main__":
parser.add_argument('--pod_count', type=int, default=1, help='pod count')
parser.add_argument('--ecs_spec', help='ecs spec')
parser.add_argument('--region', help='region')
parser.add_argument('--workspace_id', help='workspace id for your project')
parser.add_argument('--nas_data_source_id', help='nas data_source_id of DLC dataset configuration')
parser.add_argument('--oss_data_source_id', help='oss data_source_id of DLC dataset configuration')
parser.add_argument('--access_key_id', help='access_key_id')
......@@ -28,6 +31,14 @@ if __name__ == "__main__":
parser.add_argument('--log_dir', help='exception log dir')
args = parser.parse_args()
pathlib.Path(args.log_dir).mkdir(parents=True, exist_ok=True)
logging.basicConfig(filename=os.path.join(args.log_dir, 'dlc_exception.log'),
format='%(asctime)s %(message)s',
level=logging.INFO)
# DLC submit
try:
# init client
client = Client(
Config(
......@@ -66,11 +77,10 @@ if __name__ == "__main__":
job_type=args.job_type,
job_specs=[spec],
data_sources=data_sources,
user_command=args.user_command
user_command=args.user_command,
workspace_id=args.workspace_id,
)
# DLC submit
try:
response = client.create_job(req)
job_id = response.body.job_id
print('job id: ' + job_id)
......@@ -86,6 +96,5 @@ if __name__ == "__main__":
client.stop_job(job_id)
exit(0)
except Exception as e:
with open(os.path.join(args.log_dir, 'dlc_exception.log'), 'w') as f:
f.write('DLC submit Exception: \n')
traceback.print_exc(file=f)
logging.error('DLC submit Exception: \n')
logging.error(e, exc_info=1)
......@@ -13,6 +13,7 @@ export class DlcClient {
public podCount: number;
public ecsSpec: string;
public region: string;
public workspaceId: string;
// e.g., data1e6vg1tu0zi7, to generate it, go to 'Dataset Config' page of DLC
// create a NAS data and copy the 'DataSet ConfigurationID'
public nasDataSourceId: string;
......@@ -35,6 +36,7 @@ export class DlcClient {
environmentId: string,
ecsSpec: string,
region: string,
workspaceId: string,
nasDataSourceId: string,
accessKeyId: string,
accessKeySecret: string,
......@@ -50,6 +52,7 @@ export class DlcClient {
this.ecsSpec = ecsSpec;
this.image = image;
this.region = region;
this.workspaceId = workspaceId;
this.nasDataSourceId = nasDataSourceId;
if (ossDataSourceId !== undefined) {
this.ossDataSourceId = ossDataSourceId;
......@@ -77,6 +80,7 @@ export class DlcClient {
'--pod_count', String(this.podCount),
'--ecs_spec', this.ecsSpec,
'--region', this.region,
'--workspace_id', this.workspaceId,
'--nas_data_source_id', this.nasDataSourceId,
'--oss_data_source_id', this.ossDataSourceId,
'--access_key_id', this.accessKeyId,
......
......@@ -15,6 +15,7 @@ import { FileCommandChannel } from '../channels/fileCommandChannel';
import { MountedStorageService } from '../storages/mountedStorageService';
import { Scope } from 'typescript-ioc';
import { StorageService } from '../storageService';
import { getLogDir } from 'common/utils';
/**
* Collector DLC jobs info from DLC cluster, and update dlc job status locally
......@@ -112,11 +113,12 @@ export class DlcEnvironmentService extends EnvironmentService {
environment.id,
this.config.ecsSpec,
this.config.region,
this.config.workspaceId,
this.config.nasDataSourceId,
this.config.accessKeyId,
this.config.accessKeySecret,
environment.command,
dlcEnvironment.workingFolder,
path.join(getLogDir(), `envs/${environment.id}`),
this.config.ossDataSourceId,
);
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment