Unverified commit 7afe8a71, authored by Weidan Kong, committed by GitHub
Browse files

[DLC]: pai-dlc api update & log folder update (#4909)

parent 2815fb1f
...@@ -51,6 +51,7 @@ Use ``examples/trials/mnist-pytorch`` as an example. The NNI config YAML file's ...@@ -51,6 +51,7 @@ Use ``examples/trials/mnist-pytorch`` as an example. The NNI config YAML file's
podCount: 1 podCount: 1
ecsSpec: ecs.c6.large ecsSpec: ecs.c6.large
region: cn-hangzhou region: cn-hangzhou
workspaceId: ${your_workspace_id}
accessKeyId: ${your_ak_id} accessKeyId: ${your_ak_id}
accessKeySecret: ${your_ak_key} accessKeySecret: ${your_ak_key}
nasDataSourceId: ${your_nas_data_source_id} # NAS datasource ID, e.g., datat56by9n1xt0a nasDataSourceId: ${your_nas_data_source_id} # NAS datasource ID, e.g., datat56by9n1xt0a
......
...@@ -17,6 +17,7 @@ trainingService: ...@@ -17,6 +17,7 @@ trainingService:
podCount: 1 podCount: 1
ecsSpec: ecs.c6.large ecsSpec: ecs.c6.large
region: cn-hangzhou region: cn-hangzhou
workspaceId: ${your_workspace_id}
accessKeyId: ${your_ak_id} accessKeyId: ${your_ak_id}
accessKeySecret: ${your_ak_key} accessKeySecret: ${your_ak_key}
nasDataSourceId: ${your_nas_data_source_id} # NAS datasource ID,e.g., datat56by9n1xt0a nasDataSourceId: ${your_nas_data_source_id} # NAS datasource ID,e.g., datat56by9n1xt0a
......
...@@ -17,6 +17,7 @@ class DlcConfig(TrainingServiceConfig): ...@@ -17,6 +17,7 @@ class DlcConfig(TrainingServiceConfig):
pod_count: int pod_count: int
ecs_spec: str # e.g.,'ecs.c6.large' ecs_spec: str # e.g.,'ecs.c6.large'
region: str region: str
workspace_id: str
nas_data_source_id: str nas_data_source_id: str
oss_data_source_id: Optional[str] = None oss_data_source_id: Optional[str] = None
access_key_id: str access_key_id: str
......
...@@ -92,6 +92,7 @@ export interface DlcConfig extends TrainingServiceConfig { ...@@ -92,6 +92,7 @@ export interface DlcConfig extends TrainingServiceConfig {
podCount: number; podCount: number;
ecsSpec: string; ecsSpec: string;
region: string; region: string;
workspaceId: string;
nasDataSourceId: string; nasDataSourceId: string;
ossDataSourceId?: string; ossDataSourceId?: string;
accessKeyId: string; accessKeyId: string;
......
...@@ -2,7 +2,9 @@ ...@@ -2,7 +2,9 @@
# Licensed under the MIT license. # Licensed under the MIT license.
import logging
import os import os
import pathlib
import sys import sys
import traceback import traceback
from argparse import ArgumentParser from argparse import ArgumentParser
...@@ -19,6 +21,7 @@ if __name__ == "__main__": ...@@ -19,6 +21,7 @@ if __name__ == "__main__":
parser.add_argument('--pod_count', type=int, default=1, help='pod count') parser.add_argument('--pod_count', type=int, default=1, help='pod count')
parser.add_argument('--ecs_spec', help='ecs spec') parser.add_argument('--ecs_spec', help='ecs spec')
parser.add_argument('--region', help='region') parser.add_argument('--region', help='region')
parser.add_argument('--workspace_id', help='workspace id for your project')
parser.add_argument('--nas_data_source_id', help='nas data_source_id of DLC dataset configuration') parser.add_argument('--nas_data_source_id', help='nas data_source_id of DLC dataset configuration')
parser.add_argument('--oss_data_source_id', help='oss data_source_id of DLC dataset configuration') parser.add_argument('--oss_data_source_id', help='oss data_source_id of DLC dataset configuration')
parser.add_argument('--access_key_id', help='access_key_id') parser.add_argument('--access_key_id', help='access_key_id')
...@@ -28,6 +31,14 @@ if __name__ == "__main__": ...@@ -28,6 +31,14 @@ if __name__ == "__main__":
parser.add_argument('--log_dir', help='exception log dir') parser.add_argument('--log_dir', help='exception log dir')
args = parser.parse_args() args = parser.parse_args()
pathlib.Path(args.log_dir).mkdir(parents=True, exist_ok=True)
logging.basicConfig(filename=os.path.join(args.log_dir, 'dlc_exception.log'),
format='%(asctime)s %(message)s',
level=logging.INFO)
# DLC submit
try:
# init client # init client
client = Client( client = Client(
Config( Config(
...@@ -66,11 +77,10 @@ if __name__ == "__main__": ...@@ -66,11 +77,10 @@ if __name__ == "__main__":
job_type=args.job_type, job_type=args.job_type,
job_specs=[spec], job_specs=[spec],
data_sources=data_sources, data_sources=data_sources,
user_command=args.user_command user_command=args.user_command,
workspace_id=args.workspace_id,
) )
# DLC submit
try:
response = client.create_job(req) response = client.create_job(req)
job_id = response.body.job_id job_id = response.body.job_id
print('job id: ' + job_id) print('job id: ' + job_id)
...@@ -86,6 +96,5 @@ if __name__ == "__main__": ...@@ -86,6 +96,5 @@ if __name__ == "__main__":
client.stop_job(job_id) client.stop_job(job_id)
exit(0) exit(0)
except Exception as e: except Exception as e:
with open(os.path.join(args.log_dir, 'dlc_exception.log'), 'w') as f: logging.error('DLC submit Exception: \n')
f.write('DLC submit Exception: \n') logging.error(e, exc_info=1)
traceback.print_exc(file=f)
...@@ -13,6 +13,7 @@ export class DlcClient { ...@@ -13,6 +13,7 @@ export class DlcClient {
public podCount: number; public podCount: number;
public ecsSpec: string; public ecsSpec: string;
public region: string; public region: string;
public workspaceId: string;
// e.g., data1e6vg1tu0zi7, to generate it, go to 'Dataset Config' page of DLC // e.g., data1e6vg1tu0zi7, to generate it, go to 'Dataset Config' page of DLC
// create a NAS data and copy the 'DataSet ConfigurationID' // create a NAS data and copy the 'DataSet ConfigurationID'
public nasDataSourceId: string; public nasDataSourceId: string;
...@@ -35,6 +36,7 @@ export class DlcClient { ...@@ -35,6 +36,7 @@ export class DlcClient {
environmentId: string, environmentId: string,
ecsSpec: string, ecsSpec: string,
region: string, region: string,
workspaceId: string,
nasDataSourceId: string, nasDataSourceId: string,
accessKeyId: string, accessKeyId: string,
accessKeySecret: string, accessKeySecret: string,
...@@ -50,6 +52,7 @@ export class DlcClient { ...@@ -50,6 +52,7 @@ export class DlcClient {
this.ecsSpec = ecsSpec; this.ecsSpec = ecsSpec;
this.image = image; this.image = image;
this.region = region; this.region = region;
this.workspaceId = workspaceId;
this.nasDataSourceId = nasDataSourceId; this.nasDataSourceId = nasDataSourceId;
if (ossDataSourceId !== undefined) { if (ossDataSourceId !== undefined) {
this.ossDataSourceId = ossDataSourceId; this.ossDataSourceId = ossDataSourceId;
...@@ -77,6 +80,7 @@ export class DlcClient { ...@@ -77,6 +80,7 @@ export class DlcClient {
'--pod_count', String(this.podCount), '--pod_count', String(this.podCount),
'--ecs_spec', this.ecsSpec, '--ecs_spec', this.ecsSpec,
'--region', this.region, '--region', this.region,
'--workspace_id', this.workspaceId,
'--nas_data_source_id', this.nasDataSourceId, '--nas_data_source_id', this.nasDataSourceId,
'--oss_data_source_id', this.ossDataSourceId, '--oss_data_source_id', this.ossDataSourceId,
'--access_key_id', this.accessKeyId, '--access_key_id', this.accessKeyId,
......
...@@ -15,6 +15,7 @@ import { FileCommandChannel } from '../channels/fileCommandChannel'; ...@@ -15,6 +15,7 @@ import { FileCommandChannel } from '../channels/fileCommandChannel';
import { MountedStorageService } from '../storages/mountedStorageService'; import { MountedStorageService } from '../storages/mountedStorageService';
import { Scope } from 'typescript-ioc'; import { Scope } from 'typescript-ioc';
import { StorageService } from '../storageService'; import { StorageService } from '../storageService';
import { getLogDir } from 'common/utils';
/** /**
* Collector DLC jobs info from DLC cluster, and update dlc job status locally * Collector DLC jobs info from DLC cluster, and update dlc job status locally
...@@ -112,11 +113,12 @@ export class DlcEnvironmentService extends EnvironmentService { ...@@ -112,11 +113,12 @@ export class DlcEnvironmentService extends EnvironmentService {
environment.id, environment.id,
this.config.ecsSpec, this.config.ecsSpec,
this.config.region, this.config.region,
this.config.workspaceId,
this.config.nasDataSourceId, this.config.nasDataSourceId,
this.config.accessKeyId, this.config.accessKeyId,
this.config.accessKeySecret, this.config.accessKeySecret,
environment.command, environment.command,
dlcEnvironment.workingFolder, path.join(getLogDir(), `envs/${environment.id}`),
this.config.ossDataSourceId, this.config.ossDataSourceId,
); );
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment