"...composable_kernel_onnxruntime.git" did not exist on "1f543bfa79de0687f9b6144b5dea10f4190c8892"
Unverified Commit ef15fc81 authored by liuzhe-lz's avatar liuzhe-lz Committed by GitHub
Browse files

Bump node.js version to v16 (#3828)

parent b2225436
...@@ -87,6 +87,10 @@ stages: ...@@ -87,6 +87,10 @@ stages:
variables: variables:
YARN_CACHE_FOLDER: $(Pipeline.Workspace)/.yarn YARN_CACHE_FOLDER: $(Pipeline.Workspace)/.yarn
steps: steps:
- task: NodeTool@0
inputs:
versionSpec: 16.3.0
displayName: Configure Node.js version
- task: Cache@2 - task: Cache@2
inputs: inputs:
key: 'yarn | "$(Agent.OS)" | ts/**/yarn.lock, !**/node_modules/**' key: 'yarn | "$(Agent.OS)" | ts/**/yarn.lock, !**/node_modules/**'
...@@ -123,6 +127,11 @@ stages: ...@@ -123,6 +127,11 @@ stages:
versionSpec: 3.8 versionSpec: 3.8
displayName: Configure Python version displayName: Configure Python version
- task: NodeTool@0
inputs:
versionSpec: 16.3.0
displayName: Configure Node.js version
- script: | - script: |
sudo apt-get install -y pandoc sudo apt-get install -y pandoc
sudo apt-get remove swig -y sudo apt-get remove swig -y
...@@ -201,13 +210,17 @@ stages: ...@@ -201,13 +210,17 @@ stages:
PIP_CACHE_DIR: $(Pipeline.Workspace)/.pip PIP_CACHE_DIR: $(Pipeline.Workspace)/.pip
YARN_CACHE_FOLDER: $(Pipeline.Workspace)/.yarn YARN_CACHE_FOLDER: $(Pipeline.Workspace)/.yarn
# This platform runs integration test first.
steps: steps:
- task: UsePythonVersion@0 - task: UsePythonVersion@0
inputs: inputs:
versionSpec: 3.6 versionSpec: 3.6
displayName: Configure Python version displayName: Configure Python version
- task: NodeTool@0
inputs:
versionSpec: 16.3.0
displayName: Configure Node.js version
- script: | - script: |
sudo apt-get install -y pandoc sudo apt-get install -y pandoc
sudo apt-get remove swig -y sudo apt-get remove swig -y
...@@ -283,14 +296,17 @@ stages: ...@@ -283,14 +296,17 @@ stages:
PIP_CACHE_DIR: $(Pipeline.Workspace)/.pip PIP_CACHE_DIR: $(Pipeline.Workspace)/.pip
YARN_CACHE_FOLDER: $(Pipeline.Workspace)/.yarn YARN_CACHE_FOLDER: $(Pipeline.Workspace)/.yarn
# This platform runs TypeScript unit test first.
steps: steps:
- task: UsePythonVersion@0 - task: UsePythonVersion@0
inputs: inputs:
versionSpec: 3.8 versionSpec: 3.8
displayName: Configure Python version displayName: Configure Python version
- task: NodeTool@0
inputs:
versionSpec: 16.3.0
displayName: Configure Node.js version
- script: | - script: |
brew install swig@3 brew install swig@3
rm -f /usr/local/bin/swig rm -f /usr/local/bin/swig
...@@ -361,14 +377,17 @@ stages: ...@@ -361,14 +377,17 @@ stages:
PIP_CACHE_DIR: $(Pipeline.Workspace)/.pip PIP_CACHE_DIR: $(Pipeline.Workspace)/.pip
YARN_CACHE_FOLDER: $(Pipeline.Workspace)/.yarn YARN_CACHE_FOLDER: $(Pipeline.Workspace)/.yarn
# This platform runs Python unit test first.
steps: steps:
- task: UsePythonVersion@0 - task: UsePythonVersion@0
inputs: inputs:
versionSpec: 3.8 versionSpec: 3.8
displayName: Configure Python version displayName: Configure Python version
- task: NodeTool@0
inputs:
versionSpec: 16.3.0
displayName: Configure Node.js version
- task: Cache@2 - task: Cache@2
inputs: inputs:
key: 'python | "$(Agent.OS)" | dependencies/*.txt' key: 'python | "$(Agent.OS)" | dependencies/*.txt'
......
...@@ -22,7 +22,7 @@ import tarfile ...@@ -22,7 +22,7 @@ import tarfile
from zipfile import ZipFile from zipfile import ZipFile
node_version = 'v10.23.0' node_version = 'v16.3.0'
yarn_version = 'v1.22.10' yarn_version = 'v1.22.10'
......
...@@ -24,14 +24,16 @@ ...@@ -24,14 +24,16 @@
"@typescript-eslint/no-inferrable-types": 0, "@typescript-eslint/no-inferrable-types": 0,
"no-inner-declarations": 0, "no-inner-declarations": 0,
"@typescript-eslint/explicit-function-return-type": "error", "@typescript-eslint/explicit-function-return-type": "error",
"@typescript-eslint/no-var-requires": 0,
"@typescript-eslint/no-non-null-assertion": 0,
"@typescript-eslint/no-unused-vars": [ "@typescript-eslint/no-unused-vars": [
"error", "off",
{ {
"argsIgnorePattern": "^_" "argsIgnorePattern": "^_"
} }
], ],
"@typescript-eslint/no-var-requires": 0, "@typescript-eslint/no-use-before-define": 0
"@typescript-eslint/no-non-null-assertion": 0
}, },
"ignorePatterns": [ "ignorePatterns": [
"node_modules/", "node_modules/",
......
...@@ -56,7 +56,7 @@ function mkDirP(dirPath: string): Promise<void> { ...@@ -56,7 +56,7 @@ function mkDirP(dirPath: string): Promise<void> {
} else { } else {
const parent: string = path.dirname(dirPath); const parent: string = path.dirname(dirPath);
mkDirP(parent).then(() => { mkDirP(parent).then(() => {
fs.mkdir(dirPath, (err: Error) => { fs.mkdir(dirPath, (err: Error | null) => {
if (err) { if (err) {
deferred.reject(err); deferred.reject(err);
} else { } else {
......
...@@ -70,7 +70,7 @@ class NNITensorboardManager implements TensorboardManager { ...@@ -70,7 +70,7 @@ class NNITensorboardManager implements TensorboardManager {
this.log.error(error); this.log.error(error);
const alive: boolean = await isAlive(tensorboardProc.pid); const alive: boolean = await isAlive(tensorboardProc.pid);
if (alive) { if (alive) {
process.kill(-tensorboardProc.pid); process.kill(-tensorboardProc.pid!);
} }
this.setTensorboardTaskStatus(tensorboardTask, 'ERROR'); this.setTensorboardTaskStatus(tensorboardTask, 'ERROR');
}); });
......
...@@ -490,7 +490,7 @@ class NNIManager implements Manager { ...@@ -490,7 +490,7 @@ class NNIManager implements Manager {
}; };
const newEnv = Object.assign({}, process.env, nniEnv); const newEnv = Object.assign({}, process.env, nniEnv);
const tunerProc: ChildProcess = getTunerProc(command, stdio, newCwd, newEnv); const tunerProc: ChildProcess = getTunerProc(command, stdio, newCwd, newEnv);
this.dispatcherPid = tunerProc.pid; this.dispatcherPid = tunerProc.pid!;
this.dispatcher = createDispatcherInterface(tunerProc); this.dispatcher = createDispatcherInterface(tunerProc);
return; return;
......
...@@ -30,7 +30,7 @@ function runProcess(): Promise<Error | null> { ...@@ -30,7 +30,7 @@ function runProcess(): Promise<Error | null> {
if (code !== 0) { if (code !== 0) {
deferred.resolve(new Error(`return code: ${code}`)); deferred.resolve(new Error(`return code: ${code}`));
} else { } else {
let str = proc.stdout.read().toString(); let str = proc.stdout!.read().toString();
if(str.search("\r\n")!=-1){ if(str.search("\r\n")!=-1){
sentCommands = str.split("\r\n"); sentCommands = str.split("\r\n");
} }
......
...@@ -11,79 +11,71 @@ ...@@ -11,79 +11,71 @@
}, },
"license": "MIT", "license": "MIT",
"dependencies": { "dependencies": {
"azure-storage": "^2.10.2", "azure-storage": "^2.10.4",
"child-process-promise": "^2.2.1", "child-process-promise": "^2.2.1",
"express": "^4.16.3", "express": "^4.17.1",
"express-joi-validator": "^2.0.0", "express-joi-validator": "^2.0.1",
"ignore": "^5.1.4", "ignore": "^5.1.8",
"js-base64": "^2.4.9", "js-base64": "^3.6.1",
"kubernetes-client": "^6.5.0", "kubernetes-client": "^6.12.1",
"lockfile": "^1.0.4", "lockfile": "^1.0.4",
"python-shell": "^2.0.1", "python-shell": "^3.0.0",
"rx": "^4.1.0", "rx": "^4.1.0",
"sqlite3": "5.0.0", "sqlite3": "5.0.2",
"ssh2": "^0.8.9", "ssh2": "^1.1.0",
"stream-buffers": "^3.0.2", "stream-buffers": "^3.0.2",
"tail-stream": "^0.3.4", "tail-stream": "^0.3.4",
"tar": "^6.0.2", "tar": "^6.1.0",
"tree-kill": "^1.2.2", "tree-kill": "^1.2.2",
"ts-deferred": "^1.0.4", "ts-deferred": "^1.0.4",
"typescript-ioc": "^1.2.4", "typescript-ioc": "^1.2.6",
"typescript-string-operations": "^1.3.1", "typescript-string-operations": "^1.4.1",
"webhdfs": "^1.2.0",
"ws": "^7.4.6" "ws": "^7.4.6"
}, },
"devDependencies": { "devDependencies": {
"@types/chai": "^4.1.4", "@types/chai": "^4.2.18",
"@types/chai-as-promised": "^7.1.0", "@types/chai-as-promised": "^7.1.0",
"@types/express": "^4.16.0", "@types/express": "^4.17.2",
"@types/glob": "^7.1.1", "@types/glob": "^7.1.3",
"@types/js-base64": "^2.3.1", "@types/js-base64": "^3.3.1",
"@types/js-yaml": "^3.12.5", "@types/js-yaml": "^4.0.1",
"@types/lockfile": "^1.0.0", "@types/lockfile": "^1.0.0",
"@types/mocha": "^8.0.3", "@types/mocha": "^8.2.2",
"@types/node": "10.12.18", "@types/node": "^15.12.1",
"@types/request": "^2.47.1", "@types/request": "^2.48.5",
"@types/rx": "^4.1.1", "@types/rx": "^4.1.2",
"@types/sqlite3": "^3.1.3", "@types/sqlite3": "^3.1.7",
"@types/ssh2": "^0.5.35", "@types/ssh2": "^0.5.46",
"@types/stream-buffers": "^3.0.2", "@types/stream-buffers": "^3.0.3",
"@types/tar": "^4.0.3", "@types/tar": "^4.0.4",
"@types/tmp": "^0.0.33", "@types/tmp": "^0.2.0",
"@types/ws": "^7.2.5", "@types/ws": "^7.4.4",
"@typescript-eslint/eslint-plugin": "^2.10.0", "@typescript-eslint/eslint-plugin": "^2.10.0",
"@typescript-eslint/parser": "^2.10.0", "@typescript-eslint/parser": "^4.26.0",
"chai": "^4.1.2", "chai": "^4.3.4",
"chai-as-promised": "^7.1.1", "chai-as-promised": "^7.1.1",
"eslint": "^6.7.2", "eslint": "^7.28.0",
"glob": "^7.1.3", "glob": "^7.1.7",
"mocha": "^8.1.3", "mocha": "^8.4.0",
"npx": "^10.2.2", "npx": "^10.2.2",
"nyc": "^15.0.0", "nyc": "^15.1.0",
"request": "^2.87.0", "request": "^2.88.2",
"rmdir": "^1.2.0", "rmdir": "^1.2.0",
"tmp": "^0.0.33", "tmp": "^0.2.1",
"ts-node": "^7.0.0", "ts-node": "^10.0.0",
"typescript": "^3.2.2" "typescript": "^4.3.2"
}, },
"resolutions": { "resolutions": {
"mem": "^4.0.0", "acorn": ">=8.3.0",
"lodash": ">=4.17.13", "hoek": ">=6.1.3",
"lodash.merge": ">=4.6.2", "node.extend": ">=1.1.8",
"node.extend": "^1.1.7", "npm": ">=7.16.0",
"hoek": "^4.2.1", "y18n": ">=5.0.8",
"js-yaml": "^3.13.1", "yargs-parser": ">=20.2.7",
"node-forge": ">=0.10.0", "joi": ">=17.4.0"
"dot-prop": "^4.2.1",
"npm": ">=6.14.8",
"yargs": "~16.0.3",
"yargs-parser": ">=20.2.0",
"y18n": ">=5.0.5",
"acorn": ">=8.0.4",
"serialize-javascript": ">=5.0.1"
}, },
"engines": { "engines": {
"node": ">=10.0.0" "node": "^16.3.0"
}, },
"nyc": { "nyc": {
"include": [ "include": [
......
...@@ -17,6 +17,9 @@ import { TensorboardManager, TensorboardTaskInfo } from '../common/tensorboardMa ...@@ -17,6 +17,9 @@ import { TensorboardManager, TensorboardTaskInfo } from '../common/tensorboardMa
import { ValidationSchemas } from './restValidationSchemas'; import { ValidationSchemas } from './restValidationSchemas';
import { NNIRestServer } from './nniRestServer'; import { NNIRestServer } from './nniRestServer';
import { getVersion } from '../common/utils'; import { getVersion } from '../common/utils';
import { MetricType } from '../common/datastore';
import { ProfileUpdateType } from '../common/manager';
import { LogType, TrialJobStatus } from '../common/trainingService';
const expressJoi = require('express-joi-validator'); const expressJoi = require('express-joi-validator');
...@@ -139,7 +142,7 @@ class NNIRestHandler { ...@@ -139,7 +142,7 @@ class NNIRestHandler {
private updateExperimentProfile(router: Router): void { private updateExperimentProfile(router: Router): void {
router.put('/experiment', (req: Request, res: Response) => { router.put('/experiment', (req: Request, res: Response) => {
this.nniManager.updateExperimentProfile(req.body, req.query.update_type).then(() => { this.nniManager.updateExperimentProfile(req.body, req.query.update_type as ProfileUpdateType).then(() => {
res.send(); res.send();
}).catch((err: Error) => { }).catch((err: Error) => {
this.handleError(err, res); this.handleError(err, res);
...@@ -219,7 +222,7 @@ class NNIRestHandler { ...@@ -219,7 +222,7 @@ class NNIRestHandler {
private listTrialJobs(router: Router): void { private listTrialJobs(router: Router): void {
router.get('/trial-jobs', (req: Request, res: Response) => { router.get('/trial-jobs', (req: Request, res: Response) => {
this.nniManager.listTrialJobs(req.query.status).then((jobInfos: TrialJobInfo[]) => { this.nniManager.listTrialJobs(req.query.status as TrialJobStatus).then((jobInfos: TrialJobInfo[]) => {
jobInfos.forEach((trialJob: TrialJobInfo) => { jobInfos.forEach((trialJob: TrialJobInfo) => {
this.setErrorPathForFailedJob(trialJob); this.setErrorPathForFailedJob(trialJob);
}); });
...@@ -263,7 +266,7 @@ class NNIRestHandler { ...@@ -263,7 +266,7 @@ class NNIRestHandler {
private getMetricData(router: Router): void { private getMetricData(router: Router): void {
router.get('/metric-data/:job_id*?', async (req: Request, res: Response) => { router.get('/metric-data/:job_id*?', async (req: Request, res: Response) => {
this.nniManager.getMetricData(req.params.job_id, req.query.type).then((metricsData: MetricDataRecord[]) => { this.nniManager.getMetricData(req.params.job_id, req.query.type as MetricType).then((metricsData: MetricDataRecord[]) => {
res.send(metricsData); res.send(metricsData);
}).catch((err: Error) => { }).catch((err: Error) => {
this.handleError(err, res); this.handleError(err, res);
...@@ -295,7 +298,7 @@ class NNIRestHandler { ...@@ -295,7 +298,7 @@ class NNIRestHandler {
private getTrialLog(router: Router): void { private getTrialLog(router: Router): void {
router.get('/trial-log/:id/:type', async(req: Request, res: Response) => { router.get('/trial-log/:id/:type', async(req: Request, res: Response) => {
this.nniManager.getTrialLog(req.params.id, req.params.type).then((log: string) => { this.nniManager.getTrialLog(req.params.id, req.params.type as LogType).then((log: string) => {
if (log === '') { if (log === '') {
log = 'No logs available.' log = 'No logs available.'
} }
......
...@@ -82,7 +82,7 @@ export namespace ValidationSchemas { ...@@ -82,7 +82,7 @@ export namespace ValidationSchemas {
gpuNum: joi.number().min(0).required(), gpuNum: joi.number().min(0).required(),
command: joi.string().min(1).required() command: joi.string().min(1).required()
}), }),
taskRoles: joi.array({ taskRoles: joi.array().items({
name: joi.string().min(1), name: joi.string().min(1),
taskNum: joi.number().min(1).required(), taskNum: joi.number().min(1).required(),
image: joi.string().min(1), image: joi.string().min(1),
...@@ -98,7 +98,7 @@ export namespace ValidationSchemas { ...@@ -98,7 +98,7 @@ export namespace ValidationSchemas {
minSucceededTaskCount: joi.number() minSucceededTaskCount: joi.number()
}) })
}), }),
imagePullSecrets: joi.array({ imagePullSecrets: joi.array().items({
name: joi.string().min(1).required() name: joi.string().min(1).required()
}), }),
// ############## adl ############### // ############## adl ###############
......
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
/**
 * Connection and identity settings for a DLTS cluster.
 *
 * The DLTS training service parses this object from the cluster-config
 * metadata value and then fills in missing fields (see the per-field notes).
 */
export interface DLTSClusterConfig {
// Base URL of the DLTS dashboard. Request URIs are built by appending
// `api/...` directly to it, so it is presumably expected to end with `/` — TODO confirm.
dashboard: string;
// Cluster identifier used as a URI path segment; the training service
// replaces an empty value with '.default'.
cluster: string;
// Team name; also used as the `vcName` of submitted jobs. Falls back to the
// DLWS_VC_NAME environment variable when empty.
team: string;
// Sent as the `email` query parameter of every request and used as the job's
// `userName`. Falls back to the DLWS_USER_EMAIL environment variable when empty.
email: string;
// Sent as the `password` query parameter of every request. Falls back to the
// DLTS_JOB_TOKEN environment variable when empty.
password: string;
// Not user supplied: fetched by the training service after parsing the config.
// DLTSJobConfig refuses to build a job while this is still undefined.
gpuType?: string;
}
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
/**
 * Shell command template executed inside a DLTS trial job.
 *
 * Placeholders, as filled by `String.Format` in
 * `DLTSTrainingService.submitTrialJobToDLTS`:
 *   {0}  trial folder (NNI_SYS_DIR)          {1}  output folder (NNI_OUTPUT_DIR)
 *   {2}  trial job id                        {3}  experiment id
 *   {4}  trial sequence id                   {5}  multi-phase flag
 *   {6}  trial code directory                {7}  trial command
 *   {8}  NNI manager IP                      {9}  NNI manager REST port
 *   {10} NNI manager version                 {11} log collection mode
 *
 * The trailing backslashes are template-literal line continuations, so the
 * value is a single line; the caller additionally strips any remaining
 * line breaks after formatting.
 */
export const DLTS_TRIAL_COMMAND_FORMAT: string =
`export NNI_PLATFORM=dlts NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} NNI_TRIAL_SEQ_ID={4} MULTI_PHASE={5} \
&& cd $NNI_SYS_DIR && sh install_nni.sh \
&& cd '{6}' && python3 -m nni.tools.trial_tool.trial_keeper --trial_command '{7}' \
--nnimanager_ip '{8}' --nnimanager_port '{9}' --nni_manager_version '{10}' --log_collection '{11}'`;
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
import { DLTSClusterConfig } from "./dltsClusterConfig";
/**
 * Payload describing one DLTS training job.
 *
 * An instance is sent as-is as the JSON body of the job-submission request,
 * so every public property (including the fixed defaults below) ends up in
 * the serialized request.
 */
export class DLTSJobConfig {
    // Identity fields, copied from the cluster configuration in the constructor.
    public readonly team: string;
    public readonly userName: string;
    public readonly vcName: string;
    public readonly gpuType: string;

    // Fixed job settings.
    public readonly jobType = 'training';
    public readonly jobtrainingtype = 'RegularJob';
    public readonly ssh = false;
    public readonly ipython = false;
    public readonly tensorboard = false;
    public readonly workPath = '';
    public readonly enableworkpath = true;
    public readonly dataPath = '';
    public readonly enabledatapath = false;
    public readonly jobPath = '';
    public readonly enablejobpath = true;
    public readonly mountpoints = [];
    public readonly env = [{ name: 'TMPDIR', value: '$HOME/tmp' }];
    public readonly hostNetwork = false;
    public readonly useGPUTopology = false;
    public readonly isPrivileged = false;
    public readonly hostIPC = false;
    public readonly preemptionAllowed = 'False';

    /**
     * @param clusterConfig    cluster settings; `team`, `email` and `gpuType` are read from it
     * @param jobName          name of the job
     * @param resourcegpu      number of GPUs requested
     * @param image            container image of the job
     * @param cmd              command line the job runs
     * @param interactivePorts ports listed in the job description
     * @throws Error when `clusterConfig.gpuType` has not been resolved yet
     */
    public constructor(
        clusterConfig: DLTSClusterConfig,
        public readonly jobName: string,
        public readonly resourcegpu: number,
        public readonly image: string,
        public readonly cmd: string,
        public readonly interactivePorts: number[],
    ) {
        const { team, email, gpuType } = clusterConfig;
        if (gpuType === undefined) {
            throw Error('GPU type not fetched');
        }
        // The team doubles as the virtual-cluster name.
        this.team = team;
        this.vcName = team;
        this.gpuType = gpuType;
        this.userName = email;
    }
}
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
'use strict';
import { Request, Response, Router } from 'express';
import { Inject } from 'typescript-ioc';
import * as component from '../../common/component';
import { ClusterJobRestServer } from '../common/clusterJobRestServer';
import { DLTSTrainingService } from './dltsTrainingService';
/**
 * Declared shape of the entries kept in DLTSJobRestServer's parameter-file
 * metadata list. Entries originate from `POST /parameter-file-meta` request
 * bodies, which are stored without validation.
 */
export interface ParameterFileMeta {
// presumably the experiment the parameter file belongs to — TODO confirm
readonly experimentId: string;
// presumably the trial the parameter file belongs to — TODO confirm
readonly trialId: string;
// presumably the location of the parameter file — TODO confirm
readonly filePath: string;
}
/**
 * DLTS training service REST server.
 *
 * Extends the cluster job REST server with:
 *  - forwarding of trial metrics to the DLTS training service's metrics emitter;
 *  - a `/parameter-file-meta` endpoint that stores (POST) and lists (GET)
 *    parameter-file metadata in memory.
 */
@component.Singleton
export class DLTSJobRestServer extends ClusterJobRestServer {
    /** Metadata objects received through `POST /parameter-file-meta`, in arrival order. */
    private parameterFileMetaList: ParameterFileMeta[] = [];

    @Inject
    private readonly dltsTrainingService: DLTSTrainingService;

    /**
     * Resolves the DLTS training service singleton from the component container.
     */
    constructor() {
        super();
        this.dltsTrainingService = component.get(DLTSTrainingService);
    }

    /**
     * Emits one `metric` event per element of `metrics` on the training
     * service's emitter.
     *
     * @param jobId   id attached to every emitted metric
     * @param metrics metrics reported for that job
     */
    // tslint:disable-next-line:no-any
    protected handleTrialMetrics(jobId: string, metrics: any[]): void {
        // Split metrics array into single metrics, then emit them one by one.
        // Warning: if the array is not split, the behavior will be UNKNOWN.
        for (const singleMetric of metrics) {
            this.dltsTrainingService.MetricsEmitter.emit('metric', {
                id: jobId,
                data: singleMetric
            });
        }
    }

    /**
     * Builds the router of the base class and adds the
     * `/parameter-file-meta` POST and GET handlers to it.
     *
     * @returns the extended router
     */
    protected createRestHandler(): Router {
        const router: Router = super.createRestHandler();

        router.post(`/parameter-file-meta`, (req: Request, res: Response) => {
            try {
                this.log.info(`POST /parameter-file-meta, body is ${JSON.stringify(req.body)}`);
                // NOTE(review): the body is stored as-is; nothing checks that it
                // matches ParameterFileMeta.
                this.parameterFileMetaList.push(req.body);
                res.send();
            } catch (err) {
                this.log.error(`POST parameter-file-meta error: ${err}`);
                res.status(500);
                // A thrown value is not guaranteed to be an Error; narrow before
                // reading `.message` so a meaningful body is always sent.
                res.send(err instanceof Error ? err.message : String(err));
            }
        });

        router.get(`/parameter-file-meta`, (req: Request, res: Response) => {
            try {
                this.log.info(`GET /parameter-file-meta`);
                res.send(this.parameterFileMetaList);
            } catch (err) {
                this.log.error(`GET parameter-file-meta error: ${err}`);
                res.status(500);
                res.send(err instanceof Error ? err.message : String(err));
            }
        });

        return router;
    }
}
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
'use strict';
import * as fs from 'fs';
import * as path from 'path';
import * as request from 'request';
import * as component from '../../common/component';
import { EventEmitter } from 'events';
import { String } from 'typescript-string-operations';
import { getExperimentId } from '../../common/experimentStartupInfo';
import { getLogger, Logger } from '../../common/log';
import { MethodNotImplementedError } from '../../common/errors';
import {
NNIManagerIpConfig, TrainingService,
TrialJobApplicationForm, TrialJobDetail, TrialJobMetric, LogType
} from '../../common/trainingService';
import { DLTS_TRIAL_COMMAND_FORMAT } from './dltsData';
import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData';
import { execMkdir, validateCodeDir } from '../common/util';
import { delay, uniqueString, getIPV4Address, getExperimentRootDir, getVersion, generateParamFileName } from '../../common/utils';
import { DLTSJobRestServer } from './dltsJobRestServer';
import { TrialConfigMetadataKey } from '../../training_service/common/trialConfigMetadataKey';
import { DLTSJobConfig } from './dltsJobConfig';
import { DLTSClusterConfig } from './dltsClusterConfig';
import { DLTSTrialConfig } from './dltsTrialConfig';
import { DLTSTrialJobDetail } from './dltsTrialJobDetail';
@component.Singleton
class DLTSTrainingService implements TrainingService {
private readonly log!: Logger;
private readonly metricsEmitter: EventEmitter;
//private readonly expRootDir: string;
private readonly jobQueue: string[];
private stopping: boolean = false;
private readonly experimentId!: string;
private versionCheck: boolean = true;
private logCollection: string = 'none';
private isMultiPhase: boolean = false;
private dltsRestServerHost: string;
private dltsRestServerPort?: number;
private jobMode: boolean;
private readonly trialJobsMap: Map<string, DLTSTrialJobDetail>;
private nniManagerIpConfig?: NNIManagerIpConfig;
private dltsClusterConfig?: DLTSClusterConfig;
private dltsTrialConfig?: DLTSTrialConfig;
/**
 * Initializes the logger, the in-memory job bookkeeping and the REST host,
 * and detects whether the service itself runs inside a DLTS job
 * (the DLTS_JOB_ID environment variable is present).
 */
constructor() {
    this.log = getLogger();
    this.metricsEmitter = new EventEmitter();
    this.trialJobsMap = new Map<string, DLTSTrialJobDetail>();
    this.jobQueue = [];
    this.experimentId = getExperimentId();
    this.dltsRestServerHost = getIPV4Address();
    this.jobMode = 'DLTS_JOB_ID' in process.env;

    const modeLabel = this.jobMode ? 'job mode' : 'local mode';
    this.log.info(`Construct DLTS training service in ${modeLabel}.`);
}
/**
 * Starts the DLTS REST server, makes its port known to the service and then
 * runs the status-checking and job-submission loops until both finish.
 */
public async run(): Promise<void> {
    this.log.info('Run DLTS training service.');

    const jobRestServer: DLTSJobRestServer = component.get(DLTSJobRestServer);
    await jobRestServer.start();
    jobRestServer.setEnableVersionCheck = this.versionCheck;
    this.log.info(`DLTS Training service rest server listening on: ${jobRestServer.endPoint}`);

    const listeningPort = jobRestServer.clusterRestServerPort;
    if (!this.jobMode) {
        // Local mode: the port is used directly.
        this.dltsRestServerPort = listeningPort;
    } else {
        // Job mode: the port has to be exposed through the DLTS endpoints API first.
        await this.exposeRestServerPort(listeningPort);
    }

    const statusLoop = this.statusCheckingLoop();
    const submitLoop = this.submitJobLoop();
    await Promise.all([statusLoop, submitLoop]);

    this.log.info('DLTS training service exit.');
}
/**
 * Makes the REST server port reachable when the service runs inside a DLTS job.
 *
 * Polls the endpoints of the current job (DLTS_JOB_ID) once per second:
 *  - if no endpoint exists for `port`, asks DLTS to create one;
 *  - once the endpoint reports status `running`, records its node name and
 *    port as the REST server host/port and returns.
 *
 * Polling also ends when the service is being stopped, so a pending
 * `run()` is not kept alive forever after `cleanUp()`.
 *
 * @param port local port of the REST server that has to be exposed
 * @throws Error when the cluster config is not set; rejects when a request fails
 */
private async exposeRestServerPort(port: number): Promise<void> {
    if (this.dltsClusterConfig == null) {
        throw Error('Cluster config is not set');
    }
    const { dashboard, cluster, email, password } = this.dltsClusterConfig;
    const jobId = process.env['DLTS_JOB_ID'] + '';
    const uri = `${dashboard}api/clusters/${cluster}/jobs/${jobId}/endpoints`;
    const qs = { email, password };

    while (!this.stopping) {
        this.log.debug('Checking endpoints');
        const endpoints = await new Promise<unknown>((resolve, reject) => {
            request.get(uri, { qs, json: true }, (error, _response, body) => {
                if (error) {
                    reject(error);
                } else {
                    resolve(body);
                }
            });
        });
        this.log.debug('Endpoints: %o', endpoints);

        // Anything that is not an endpoint list is ignored and retried on the next round.
        if (Array.isArray(endpoints)) {
            const restServerEndpoint = endpoints.find(({ podPort }) => podPort === port);
            if (restServerEndpoint == null) {
                this.log.debug('Exposing %d', port);
                // Explicit <void>: `resolve()` without an argument only type-checks
                // for Promise<void> since TypeScript 4.1.
                await new Promise<void>((resolve, reject) => {
                    request.post(uri, {
                        qs,
                        json: true,
                        body: {
                            endpoints: [{
                                name: "nni-rest-server",
                                podPort: port
                            }]
                        }
                    }, (error) => {
                        if (error) {
                            reject(error);
                        } else {
                            resolve();
                        }
                    });
                });
            } else if (restServerEndpoint['status'] === 'running') {
                // We get an exposed restserver port
                this.dltsRestServerHost = restServerEndpoint['nodeName'];
                this.dltsRestServerPort = restServerEndpoint['port'];
                return;
            }
        }

        await delay(1000);
    }
}
/**
 * Background loop that runs until the service is stopping. Every round it
 * refreshes the status of all known trial jobs, cancels jobs that DLTS has
 * paused while they still count as RUNNING, surfaces REST server errors and
 * then waits three seconds.
 *
 * @throws Error carrying the REST server's error message, if it reported one
 */
private async statusCheckingLoop(): Promise<void> {
    while (!this.stopping) {
        // Refresh every trial job concurrently.
        const statusRefreshes: Promise<void>[] = Array.from(
            this.trialJobsMap.values(),
            (job: DLTSTrialJobDetail) => this.getDLTSTrialJobInfo(job)
        );
        await Promise.all(statusRefreshes);

        // Cancel paused DLTS jobs.
        const pausedJobCancellations: Promise<void>[] = [];
        this.trialJobsMap.forEach((job: DLTSTrialJobDetail, jobId: string) => {
            if (job.dltsPaused && job.status === 'RUNNING') {
                pausedJobCancellations.push(this.cancelTrialJob(jobId));
            }
        });
        await Promise.all(pausedJobCancellations);

        const jobRestServer: DLTSJobRestServer = component.get(DLTSJobRestServer);
        if (jobRestServer.getErrorMessage !== undefined) {
            throw new Error(jobRestServer.getErrorMessage);
        }

        await delay(3000);
    }
}
/**
 * Fetches the current state of one job from DLTS and maps the reported
 * `jobStatus` onto the trial job's NNI status. For a running job the start
 * time and dashboard URL are filled in the first time they are seen.
 *
 * @param dltsTrialJob trial job to update in place
 * @throws Error when the cluster config is not set; rejects when the request fails
 */
private async getDLTSTrialJobInfo(dltsTrialJob: DLTSTrialJobDetail): Promise<void> {
    if (this.dltsClusterConfig == null) {
        throw Error('Cluster config is not set');
    }

    const jobInfoRequest: request.Options = {
        uri: `${this.dltsClusterConfig.dashboard}api/v2/clusters/${this.dltsClusterConfig.cluster}/jobs/${dltsTrialJob.dltsJobId}`,
        qs: {
            email: this.dltsClusterConfig.email,
            password: this.dltsClusterConfig.password
        },
        json: true
    };

    const jobInfo = await new Promise((resolve, reject) => {
        request(jobInfoRequest, (requestError, response, payload) => {
            if (requestError != null) {
                reject(requestError);
            } else {
                resolve(payload);
            }
        });
    }) as any;

    switch (jobInfo['jobStatus']) {
        case 'unapproved':
        case 'queued':
        case 'scheduling':
            dltsTrialJob.status = 'WAITING';
            break;

        case 'running':
            dltsTrialJob.status = 'RUNNING';
            if (dltsTrialJob.startTime === undefined) {
                dltsTrialJob.startTime = Date.parse(jobInfo['jobStatusDetail'][0]['startedAt']);
            }
            if (dltsTrialJob.url === undefined) {
                dltsTrialJob.url = `${this.dltsClusterConfig.dashboard}job/${this.dltsClusterConfig.team}/${this.dltsClusterConfig.cluster}/${dltsTrialJob.dltsJobId}`;
            }
            break;

        case 'finished':
            dltsTrialJob.status = 'SUCCEEDED';
            break;

        case 'failed':
            dltsTrialJob.status = 'FAILED';
            break;

        case 'pausing':
        case 'paused':
            // Still reported as RUNNING; the flag marks the job as paused on the DLTS side.
            dltsTrialJob.status = 'RUNNING';
            dltsTrialJob.dltsPaused = true;
            break;

        case 'killing':
        case 'killed':
            // The early-stop marker tells who asked for the cancellation;
            // without it the kill is attributed to the system.
            if (dltsTrialJob.isEarlyStopped === undefined) {
                dltsTrialJob.status = 'SYS_CANCELED';
            } else if (dltsTrialJob.isEarlyStopped === true) {
                dltsTrialJob.status = 'EARLY_STOPPED';
            } else {
                dltsTrialJob.status = 'USER_CANCELED';
            }
            break;

        default:
            dltsTrialJob.status = 'UNKNOWN';
    }
}
/**
 * Background loop that runs until the service is stopping. Every round it
 * submits queued trial jobs in FIFO order, stopping at the first job whose
 * submission reports failure, and then waits three seconds.
 */
private async submitJobLoop(): Promise<void> {
    while (!this.stopping) {
        let lastSubmissionOk = true;
        while (lastSubmissionOk && !this.stopping && this.jobQueue.length > 0) {
            const nextJobId: string = this.jobQueue[0];
            this.log.info(`Got job ${nextJobId}`);

            lastSubmissionOk = await this.submitTrialJobToDLTS(nextJobId);
            if (lastSubmissionOk) {
                // Submitted: drop the job from the head of the queue.
                this.jobQueue.shift();
            }
            // Otherwise the job stays queued and is retried in the next round.
        }
        await delay(3000);
    }
}
/**
 * Lists every trial job known to this service.
 *
 * @returns a new array holding the details of all tracked trial jobs
 */
public async listTrialJobs(): Promise<TrialJobDetail[]> {
    return [...this.trialJobsMap.values()];
}
/**
 * Looks up a single trial job.
 *
 * @param trialJobId id of the trial job
 * @returns the tracked details of that job
 * @throws Error when no job with that id is tracked
 */
public async getTrialJob(trialJobId: string): Promise<TrialJobDetail> {
    const found = this.trialJobsMap.get(trialJobId);
    if (found !== undefined) {
        return found;
    }
    throw Error(`Trial job ${trialJobId} not found.`);
}
/**
 * Trial log retrieval is not implemented for DLTS: the returned promise
 * always rejects with MethodNotImplementedError. Both parameters are ignored.
 */
public async getTrialLog(_trialJobId: string, _logType: LogType): Promise<string> {
throw new MethodNotImplementedError();
}
/**
 * Subscribes `listener` to the `metric` events of the internal emitter.
 *
 * @param listener callback invoked with each emitted metric
 */
public addTrialJobMetricListener(listener: (metric: TrialJobMetric) => void): void {
this.metricsEmitter.on('metric', listener);
}
/**
 * Unsubscribes `listener` from the `metric` events of the internal emitter.
 *
 * @param listener callback previously passed to addTrialJobMetricListener
 */
public removeTrialJobMetricListener(listener: (metric: TrialJobMetric) => void): void {
this.metricsEmitter.off('metric', listener);
}
/**
 * The emitter on which `metric` events are published. Exposed so that the
 * DLTS REST server can emit the metrics it receives.
 */
public get MetricsEmitter(): EventEmitter {
return this.metricsEmitter;
}
/**
 * Registers a new trial job in WAITING state and queues it for submission.
 * The actual submission to DLTS happens later in the submit loop.
 *
 * @param form application form of the trial
 * @returns the details object created for the new trial job
 */
public async submitTrialJob(form: TrialJobApplicationForm): Promise<TrialJobDetail> {
    const newJobId: string = uniqueString(5);
    const workingDirectory: string = path.join('/nni-experiments', getExperimentId(), '/trials/', newJobId);

    const jobDetail = new DLTSTrialJobDetail(
        newJobId,
        'WAITING',
        Date.now(),
        workingDirectory,
        form,
        `nni_exp_${this.experimentId}_trial_${newJobId}`
    );

    this.trialJobsMap.set(newJobId, jobDetail);
    this.jobQueue.push(newJobId);
    return jobDetail;
}
/**
 * Asks DLTS to kill the job behind a trial by setting its status to `killing`.
 *
 * @param trialJobId     id of the trial job to cancel
 * @param isEarlyStopped recorded on the trial job to mark where the cancellation came from
 * @throws Error when the trial job is unknown or the cluster config is missing;
 *         rejects when the request fails
 */
public async cancelTrialJob(trialJobId: string, isEarlyStopped: boolean = false): Promise<void> {
    const targetJob: DLTSTrialJobDetail | undefined = this.trialJobsMap.get(trialJobId);
    if (targetJob === undefined) {
        throw Error(`cancelTrialJob: trial job id ${trialJobId} not found`);
    }

    const clusterConfig = this.dltsClusterConfig;
    if (clusterConfig === undefined) {
        throw Error('DLTS Cluster config is not initialized');
    }

    const { dashboard, cluster, email, password } = clusterConfig;
    const killRequest: request.Options = {
        method: 'PUT',
        uri: `${dashboard}api/clusters/${cluster}/jobs/${targetJob.dltsJobId}/status`,
        qs: { email: email, password: password },
        body: { status: 'killing' },
        json: true
    };

    // Remember the cancellation source before the request is sent.
    targetJob.isEarlyStopped = isEarlyStopped;

    await new Promise((resolve, reject) => {
        request(killRequest, (requestError: Error, response: request.Response, payload: any) => {
            if (requestError) {
                reject(requestError);
                return;
            }
            resolve(payload);
        });
    });
}
/**
 * Queries DLTS for the team's cluster information and returns the first key
 * of its JSON-encoded `metadata` field as the GPU type.
 *
 * @returns the first key found in the cluster metadata
 * @throws Error when the cluster config is missing; rejects when the request
 *         fails or the metadata cannot be read or parsed
 */
private async getGpuType(): Promise<string> {
    const clusterConfig = this.dltsClusterConfig;
    if (clusterConfig === undefined) {
        throw new Error('DLTS Cluster config is not initialized');
    }

    const { dashboard, team, cluster, email, password } = clusterConfig;
    const clusterInfoRequest: request.Options = {
        method: 'GET',
        qs: { email: email, password: password },
        uri: `${dashboard}api/teams/${team}/clusters/${cluster}`,
        json: true
    };

    return new Promise<string>((resolve, reject) => {
        request(clusterInfoRequest, (requestError, response, data) => {
            if (requestError) {
                reject(requestError);
                return;
            }
            try {
                const gpuTypes = Object.keys(JSON.parse(data['metadata']));
                resolve(gpuTypes[0]);
            } catch (parseError) {
                // Covers both a missing response body and malformed metadata.
                reject(parseError);
            }
        });
    });
}
/**
 * Applies one piece of configuration to the service.
 *
 * For the DLTS cluster config, empty fields are completed before the GPU
 * type is fetched: `cluster` defaults to '.default', while `email`,
 * `password` and `team` fall back to the DLWS_USER_EMAIL, DLTS_JOB_TOKEN and
 * DLWS_VC_NAME environment variables respectively.
 *
 * @param key   metadata key (a TrialConfigMetadataKey value)
 * @param value metadata value; JSON for the config keys, 'true'/'True' for flags
 * @throws Error for an unknown key, for a missing mandatory cluster field, or
 *         when the trial code directory fails validation
 */
public async setClusterMetadata(key: string, value: string): Promise<void> {
    const isTrue = (flag: string): boolean => flag === 'true' || flag === 'True';

    if (key === TrialConfigMetadataKey.NNI_MANAGER_IP) {
        this.nniManagerIpConfig = JSON.parse(value) as NNIManagerIpConfig;
    } else if (key === TrialConfigMetadataKey.DLTS_CLUSTER_CONFIG) {
        const clusterConfig = JSON.parse(value) as DLTSClusterConfig;
        this.dltsClusterConfig = clusterConfig;

        if (!clusterConfig.cluster) {
            clusterConfig.cluster = '.default';
        }
        if (!clusterConfig.email) {
            const envEmail = process.env['DLWS_USER_EMAIL'];
            if (!envEmail) {
                throw Error('`email` field in `dltsConfig` is not configured.');
            }
            clusterConfig.email = envEmail;
        }
        if (!clusterConfig.password) {
            const envToken = process.env['DLTS_JOB_TOKEN'];
            if (!envToken) {
                throw Error('`password` field in `dltsConfig` is not configured.');
            }
            clusterConfig.password = envToken;
        }
        if (!clusterConfig.team) {
            const envTeam = process.env['DLWS_VC_NAME'];
            if (!envTeam) {
                throw Error('`team` field in `dltsConfig` is not configured.');
            }
            clusterConfig.team = envTeam;
        }
        clusterConfig.gpuType = await this.getGpuType();
    } else if (key === TrialConfigMetadataKey.TRIAL_CONFIG) {
        const trialConfig = JSON.parse(value) as DLTSTrialConfig;
        this.dltsTrialConfig = trialConfig;
        // Validate to make sure codeDir doesn't have too many files
        try {
            await validateCodeDir(trialConfig.codeDir);
        } catch (validationError) {
            this.log.error(validationError);
            throw validationError;
        }
    } else if (key === TrialConfigMetadataKey.VERSION_CHECK) {
        this.versionCheck = isTrue(value);
    } else if (key === TrialConfigMetadataKey.LOG_COLLECTION) {
        this.logCollection = value;
    } else if (key === TrialConfigMetadataKey.MULTI_PHASE) {
        this.isMultiPhase = isTrue(value);
    } else {
        // Reject unknown keys
        throw new Error(`Uknown key: ${key}`);
    }
}
/**
 * Metadata read-back is not provided: resolves to an empty string for every key.
 */
public async getClusterMetadata(_key: string): Promise<string> {
return '';
}
/**
 * Flags the service as stopping, which ends the background loops, and shuts
 * down the DLTS REST server.
 *
 * @throws whatever the REST server's `stop()` rejected with, after logging it
 */
public async cleanUp(): Promise<void> {
    this.log.info('Stopping DLTS training service...');
    this.stopping = true;

    const jobRestServer: DLTSJobRestServer = component.get(DLTSJobRestServer);
    try {
        await jobRestServer.stop();
        this.log.info('DLTS Training service rest server stopped successfully.');
    } catch (stopError) {
        // tslint:disable-next-line: no-unsafe-any
        this.log.error(`DLTS Training service rest server stopped failed, error: ${stopError.message}`);
        throw stopError;
    }
}
/**
 * Submits one queued trial job to DLTS.
 *
 * Steps: write the NNI install script and the trial's parameter file into a
 * local trial folder, build the trial command from DLTS_TRIAL_COMMAND_FORMAT,
 * POST the resulting job configuration to the DLTS jobs API and remember the
 * job id found in the response.
 *
 * @param trialJobId id of a trial job tracked in `trialJobsMap`
 * @returns `true` whenever the call completes without throwing
 * @throws Error when the trial job, cluster config or trial config is missing;
 *         rejects when writing the local files or sending the request fails
 */
private async submitTrialJobToDLTS(trialJobId: string): Promise<boolean> {
const trialJobDetail: DLTSTrialJobDetail | undefined = this.trialJobsMap.get(trialJobId);
if (trialJobDetail === undefined) {
throw new Error(`Failed to find DLTSTrialJobDetail for job ${trialJobId}`);
}
if (this.dltsClusterConfig === undefined) {
throw new Error('DLTS Cluster config is not initialized');
}
if (this.dltsTrialConfig === undefined) {
throw new Error('trial config is not initialized');
}
// Fall back to the REST server's own port when no port has been recorded yet.
if (this.dltsRestServerPort === undefined) {
const restServer: DLTSJobRestServer = component.get(DLTSJobRestServer);
this.dltsRestServerPort = restServer.clusterRestServerPort;
}
// Step 1. Prepare DLTS job configuration
const trialLocalFolder = path.join(getExperimentRootDir(), 'trials-local', trialJobId);
// Create the temporary trial working folder locally.
await execMkdir(trialLocalFolder);
const runScriptContent: string = CONTAINER_INSTALL_NNI_SHELL_FORMAT;
// Write NNI installation file to local tmp files
await fs.promises.writeFile(path.join(trialLocalFolder, 'install_nni.sh'), runScriptContent, { encoding: 'utf8' });
// Write file content ( parameter.cfg ) to local tmp folders
// NOTE(review): `form` is guarded here but dereferenced unconditionally
// further down (`trialJobDetail.form.sequenceId`) — confirm it can never be undefined.
if (trialJobDetail.form !== undefined) {
await fs.promises.writeFile(
path.join(trialLocalFolder, generateParamFileName(trialJobDetail.form.hyperParameters)),
trialJobDetail.form.hyperParameters.value, { encoding: 'utf8' }
);
}
// An explicitly configured NNI manager IP wins over the REST server host.
// tslint:disable-next-line: strict-boolean-expressions
const nniManagerIp: string = this.nniManagerIpConfig ? this.nniManagerIpConfig.nniManagerIp : this.dltsRestServerHost;
// The version is only passed along when version checking is enabled.
const version: string = this.versionCheck ? await getVersion() : '';
// Arguments below map to placeholders {0}..{11} of DLTS_TRIAL_COMMAND_FORMAT, in order.
const nniDLTSTrialCommand: string = String.Format(
DLTS_TRIAL_COMMAND_FORMAT,
trialLocalFolder,
path.join(trialLocalFolder, 'nnioutput'),
trialJobId,
this.experimentId,
trialJobDetail.form.sequenceId,
// NOTE(review): MULTI_PHASE is hard-coded to false although `this.isMultiPhase`
// is tracked by the service — confirm this is intentional.
false,
this.dltsTrialConfig.codeDir,
this.dltsTrialConfig.command,
nniManagerIp,
this.dltsRestServerPort,
version,
this.logCollection
)
// Collapse the command to a single line.
.replace(/\r\n|\n|\r/gm, '');
// Step 2. Submit DLTS job via Rest call
const dltsJobConfig: DLTSJobConfig = new DLTSJobConfig(
this.dltsClusterConfig,
trialJobDetail.dltsJobName,
this.dltsTrialConfig.gpuNum,
this.dltsTrialConfig.image,
nniDLTSTrialCommand,
[]
);
const submitJobRequest: request.Options = {
method: 'POST',
uri: `${this.dltsClusterConfig.dashboard}api/clusters/${this.dltsClusterConfig.cluster}/jobs`,
qs: {
email: this.dltsClusterConfig.email,
password: this.dltsClusterConfig.password
},
body: dltsJobConfig,
json: true
}
const responseData = await new Promise<any>((resolve, reject) => {
request(submitJobRequest, function (error, response, data) {
if (error) {
return reject(error)
} else {
return resolve(data)
}
})
});
// NOTE(review): the response is not validated; `jobId` is stored as returned,
// and the HTTP status code is not inspected.
trialJobDetail.dltsJobId = responseData['jobId']
return true;
}
/**
 * Hand a new set of hyper-parameters to an existing trial.
 *
 * Writes the parameters to a file in the trial's local folder
 * (`<experiment root>/trials-local/<trialJobId>`), then POSTs the
 * experiment id and trial id to the local REST server's
 * `parameter-file-meta` endpoint.
 *
 * @param trialJobId id of a trial present in `trialJobsMap`
 * @param form application form carrying the new hyper-parameters
 * @returns the detail record of the updated trial
 * @throws Error when the trial is unknown or the cluster / trial
 *         configuration has not been set; file-system and request errors
 *         propagate unchanged
 */
public async updateTrialJob(trialJobId: string, form: TrialJobApplicationForm): Promise<TrialJobDetail> {
    const trialJobDetail: undefined | TrialJobDetail = this.trialJobsMap.get(trialJobId);
    if (trialJobDetail === undefined) {
        throw new Error(`updateTrialJob failed: ${trialJobId} not found`);
    }
    if (this.dltsClusterConfig === undefined) {
        throw new Error('DLTS Cluster config is not initialized');
    }
    if (this.dltsTrialConfig === undefined) {
        throw new Error('DLTS trial config is not initialized');
    }

    // Persist the new parameters in the trial's local folder.
    const hyperParameters = form.hyperParameters;
    const trialLocalTempFolder: string = path.join(getExperimentRootDir(), 'trials-local', trialJobId);
    const hpFileName: string = generateParamFileName(hyperParameters);
    const localFilepath: string = path.join(trialLocalTempFolder, hpFileName);
    await fs.promises.writeFile(localFilepath, hyperParameters.value, { encoding: 'utf8' });

    // Post the experiment / trial ids to the REST server's parameter-file-meta endpoint.
    const parameterFileMeta = {
        experimentId: this.experimentId,
        trialId: trialJobId
    };
    const restServer: DLTSJobRestServer = component.get(DLTSJobRestServer);
    const req: request.Options = {
        uri: `${restServer.endPoint}${restServer.apiRootUrl}/parameter-file-meta`,
        method: 'POST',
        json: true,
        body: parameterFileMeta
    };

    // Explicit <void>: without it the promise is inferred as Promise<unknown>
    // and the bare `resolve()` below is rejected by TypeScript >= 4.1.
    await new Promise<void>((resolve, reject) => {
        request(req, (err: Error, _res: request.Response) => {
            if (err) {
                reject(err);
            } else {
                resolve();
            }
        });
    });

    return trialJobDetail;
}
/**
 * Whether multi-phase trial jobs are supported.
 *
 * @returns always `false`
 */
public get isMultiPhaseJobSupported(): boolean {
    const supported = false;
    return supported;
}
/**
 * Not implemented: the call throws synchronously instead of returning a promise.
 *
 * @param _trialJobId trial id (unused)
 * @throws MethodNotImplementedError always
 */
public getTrialOutputLocalPath(_trialJobId: string): Promise<string> {
    const notImplemented = new MethodNotImplementedError();
    throw notImplemented;
}
/**
 * Not implemented: the call throws synchronously instead of returning a promise.
 *
 * @param _trialJobId trial id (unused)
 * @param _subpath path below the trial output (unused)
 * @throws MethodNotImplementedError always
 */
public fetchTrialOutput(_trialJobId: string, _subpath: string): Promise<void> {
    const notImplemented = new MethodNotImplementedError();
    throw notImplemented;
}
}
export { DLTSTrainingService };
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
import { TrialConfig } from "training_service/common/trialConfig";
/**
 * Trial configuration that adds an `image` setting to the common
 * `TrialConfig` fields (command, code directory, GPU count).
 */
export class DLTSTrialConfig extends TrialConfig {
    /** Image setting for the trial — presumably the container image name; confirm against the DLTS job config. */
    public readonly image: string;

    /**
     * @param command forwarded to the base TrialConfig
     * @param codeDir forwarded to the base TrialConfig
     * @param gpuNum forwarded to the base TrialConfig
     * @param image stored on this instance as `image`
     */
    public constructor(command: string, codeDir: string, gpuNum: number, image: string) {
        super(command, codeDir, gpuNum);
        this.image = image;
    }
}
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
import {
TrialJobDetail,
TrialJobStatus,
TrialJobApplicationForm
} from "../../common/trainingService";
/**
 * Record describing one trial job, extended with DLTS-specific fields.
 *
 * The constructor arguments become public properties of the same name; all
 * other fields start out unset except `dltsPaused`, which starts as `false`.
 */
export class DLTSTrialJobDetail implements TrialJobDetail {
    // Optional fields, left unset by the constructor.
    public startTime?: number;
    public endTime?: number;
    public tags?: string[];
    public url?: string;
    public isEarlyStopped?: boolean;

    // DLTS-specific fields
    public dltsJobId?: string;
    public dltsPaused = false;

    /**
     * @param id trial job id
     * @param status trial job status
     * @param submitTime submission time
     * @param workingDirectory working directory of the trial
     * @param form application form of the trial
     * @param dltsJobName DLTS-specific job name
     */
    public constructor(
        public id: string,
        public status: TrialJobStatus,
        public submitTime: number,
        public workingDirectory: string,
        public form: TrialJobApplicationForm,
        public dltsJobName: string
    ) {}
}
...@@ -277,7 +277,7 @@ class ShellExecutor { ...@@ -277,7 +277,7 @@ class ShellExecutor {
this.log.debug(`copyFileToRemote(${commandIndex}): localFilePath: ${localFilePath}, remoteFilePath: ${remoteFilePath}`); this.log.debug(`copyFileToRemote(${commandIndex}): localFilePath: ${localFilePath}, remoteFilePath: ${remoteFilePath}`);
const deferred: Deferred<boolean> = new Deferred<boolean>(); const deferred: Deferred<boolean> = new Deferred<boolean>();
this.sshClient.sftp((err: Error, sftp: SFTPWrapper) => { this.sshClient.sftp((err: Error | undefined, sftp: SFTPWrapper) => {
if (err !== undefined && err !== null) { if (err !== undefined && err !== null) {
this.log.error(`copyFileToRemote(${commandIndex}): ${err}`); this.log.error(`copyFileToRemote(${commandIndex}): ${err}`);
deferred.reject(err); deferred.reject(err);
...@@ -328,7 +328,7 @@ class ShellExecutor { ...@@ -328,7 +328,7 @@ class ShellExecutor {
const commandIndex = randomInt(10000); const commandIndex = randomInt(10000);
this.log.debug(`getRemoteFileContent(${commandIndex}): filePath: ${filePath}`); this.log.debug(`getRemoteFileContent(${commandIndex}): filePath: ${filePath}`);
const deferred: Deferred<string> = new Deferred<string>(); const deferred: Deferred<string> = new Deferred<string>();
this.sshClient.sftp((err: Error, sftp: SFTPWrapper) => { this.sshClient.sftp((err: Error | undefined, sftp: SFTPWrapper) => {
if (err !== undefined && err !== null) { if (err !== undefined && err !== null) {
this.log.error(`getRemoteFileContent(${commandIndex}) sftp: ${err}`); this.log.error(`getRemoteFileContent(${commandIndex}) sftp: ${err}`);
deferred.reject(new Error(`SFTP error: ${err}`)); deferred.reject(new Error(`SFTP error: ${err}`));
...@@ -376,7 +376,7 @@ class ShellExecutor { ...@@ -376,7 +376,7 @@ class ShellExecutor {
// Windows always uses shell, and it needs to disable to get it works. // Windows always uses shell, and it needs to disable to get it works.
useShell = useShell && !this.isWindows; useShell = useShell && !this.isWindows;
const callback = (err: Error, channel: ClientChannel): void => { const callback = (err: Error | undefined, channel: ClientChannel): void => {
if (err !== undefined && err !== null) { if (err !== undefined && err !== null) {
this.log.error(`remoteExeCommand(${commandIndex}): ${err.message}`); this.log.error(`remoteExeCommand(${commandIndex}): ${err.message}`);
deferred.reject(err); deferred.reject(err);
......
...@@ -310,7 +310,7 @@ export class OpenPaiEnvironmentService extends EnvironmentService { ...@@ -310,7 +310,7 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
} }
} }
} }
return yaml.safeDump(nniJobConfig); return yaml.dump(nniJobConfig);
} }
protected formatPAIHost(host: string): string { protected formatPAIHost(host: string): string {
......
...@@ -16,14 +16,14 @@ export abstract class StorageService { ...@@ -16,14 +16,14 @@ export abstract class StorageService {
protected logger: Logger; protected logger: Logger;
protected abstract internalConfig(key: string, value: string): void; protected abstract internalConfig(key: string, value: string): void;
protected abstract async internalRemove(remotePath: string, isDirectory: boolean, isRecursive: boolean): Promise<void>; protected abstract internalRemove(remotePath: string, isDirectory: boolean, isRecursive: boolean): Promise<void>;
protected abstract async internalRename(remotePath: string, newName: string): Promise<void>; protected abstract internalRename(remotePath: string, newName: string): Promise<void>;
protected abstract async internalMkdir(remotePath: string): Promise<void>; protected abstract internalMkdir(remotePath: string): Promise<void>;
protected abstract async internalCopy(sourcePath: string, targetPath: string, isDirectory: boolean, isFromRemote: boolean, isToRemote: boolean): Promise<string>; protected abstract internalCopy(sourcePath: string, targetPath: string, isDirectory: boolean, isFromRemote: boolean, isToRemote: boolean): Promise<string>;
protected abstract async internalExists(remotePath: string): Promise<boolean>; protected abstract internalExists(remotePath: string): Promise<boolean>;
protected abstract async internalRead(remotePath: string, offset: number, length: number): Promise<string>; protected abstract internalRead(remotePath: string, offset: number, length: number): Promise<string>;
protected abstract async internalList(remotePath: string): Promise<string[]>; protected abstract internalList(remotePath: string): Promise<string[]>;
protected abstract async internalAttach(remotePath: string, content: string): Promise<boolean>; protected abstract internalAttach(remotePath: string, content: string): Promise<boolean>;
protected abstract internalIsRelativePath(path: string): boolean; protected abstract internalIsRelativePath(path: string): boolean;
protected abstract internalJoin(...paths: string[]): string; protected abstract internalJoin(...paths: string[]): string;
protected abstract internalDirname(...paths: string[]): string; protected abstract internalDirname(...paths: string[]): string;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment