Commit 22993e5d authored by demianzhang's avatar demianzhang Committed by chicm-ms
Browse files

Pass tslint for training service (#1177)

* fix local and remote training services tslint
parent ae7a72bc
...@@ -67,10 +67,10 @@ The tool dispatches and runs trial jobs generated by tuning algorithms to search ...@@ -67,10 +67,10 @@ The tool dispatches and runs trial jobs generated by tuning algorithms to search
<li><a href="docs/en_US/BuiltinTuner.md#MetisTuner">Metis Tuner</a></li> <li><a href="docs/en_US/BuiltinTuner.md#MetisTuner">Metis Tuner</a></li>
<li><a href="docs/en_US/BuiltinTuner.md#BOHB">BOHB</a></li> <li><a href="docs/en_US/BuiltinTuner.md#BOHB">BOHB</a></li>
</ul> </ul>
<a href="docs/en_US/BuiltinAssessors.md">Assessor</a> <a href="docs/en_US/BuiltinAssessor.md">Assessor</a>
<ul> <ul>
<li><a href="docs/en_US/BuiltinAssessors.md#Medianstop">Median Stop</a></li> <li><a href="docs/en_US/BuiltinAssessor.md#Medianstop">Median Stop</a></li>
<li><a href="docs/en_US/BuiltinAssessors.md#Curvefitting">Curve Fitting</a></li> <li><a href="docs/en_US/BuiltinAssessor.md#Curvefitting">Curve Fitting</a></li>
</ul> </ul>
</td> </td>
<td> <td>
...@@ -118,12 +118,6 @@ python3 -m pip install --upgrade nni ...@@ -118,12 +118,6 @@ python3 -m pip install --upgrade nni
Windows Windows
**IMPORTANT** Make sure `ExecutionPolicy` has been set to `Unrestricted` before installation. To set the policy, **run PowerShell as administrator** with the following command:
```bash
Set-ExecutionPolicy -ExecutionPolicy Unrestricted
```
Once ExecutionPolicy is unrestricted, run the following command to install NNI:
```bash ```bash
python -m pip install --upgrade nni python -m pip install --upgrade nni
``` ```
......
...@@ -21,16 +21,6 @@ For other examples you need to change trial command `python3` into `python` in e ...@@ -21,16 +21,6 @@ For other examples you need to change trial command `python3` into `python` in e
Make sure a C++ 14.0 compiler is installed. Make sure a C++ 14.0 compiler is installed.
>building 'simplejson._speedups' extension error: [WinError 3] The system cannot find the path specified >building 'simplejson._speedups' extension error: [WinError 3] The system cannot find the path specified
### Fail to run PowerShell when install NNI from source
If you run a PowerShell script for the first time without having set the execution policy for running scripts, you will see the error below. Run PowerShell as administrator with this command first:
```bash
Set-ExecutionPolicy -ExecutionPolicy Unrestricted
```
>...cannot be loaded because running scripts is disabled on this system.
### Trial failed with missing DLL in command line or PowerShell ### Trial failed with missing DLL in command line or PowerShell
This error is caused by missing LIBIFCOREMD.DLL and LIBMMD.DLL and a failure to install SciPy. Using Anaconda or Miniconda with Python (64-bit) can solve it. This error is caused by missing LIBIFCOREMD.DLL and LIBMMD.DLL and a failure to install SciPy. Using Anaconda or Miniconda with Python (64-bit) can solve it.
...@@ -38,11 +28,7 @@ This error caused by missing LIBIFCOREMD.DLL and LIBMMD.DLL and fail to install ...@@ -38,11 +28,7 @@ This error caused by missing LIBIFCOREMD.DLL and LIBMMD.DLL and fail to install
### Trial failed on webUI ### Trial failed on webUI
Please check the trial log file stderr for more details. If there is no such file and NNI is installed through pip, then you need to run PowerShell as administrator with this command first: Please check the trial log file stderr for more details.
```bash
Set-ExecutionPolicy -ExecutionPolicy Unrestricted
```
If there is a stderr file, please check it. Two possible cases are as follows: If there is a stderr file, please check it. Two possible cases are as follows:
......
...@@ -91,6 +91,7 @@ interface TrialJobMetric { ...@@ -91,6 +91,7 @@ interface TrialJobMetric {
* define TrainingServiceError * define TrainingServiceError
*/ */
class TrainingServiceError extends Error { class TrainingServiceError extends Error {
private errCode: number; private errCode: number;
constructor(errorCode: number, errorMessage: string) { constructor(errorCode: number, errorMessage: string) {
...@@ -136,5 +137,3 @@ export { ...@@ -136,5 +137,3 @@ export {
TrainingServiceMetadata, TrialJobDetail, TrialJobMetric, HyperParameters, TrainingServiceMetadata, TrialJobDetail, TrialJobMetric, HyperParameters,
HostJobApplicationForm, JobApplicationForm, JobType, NNIManagerIpConfig HostJobApplicationForm, JobApplicationForm, JobType, NNIManagerIpConfig
}; };
...@@ -33,6 +33,7 @@ ...@@ -33,6 +33,7 @@
"@types/chai-as-promised": "^7.1.0", "@types/chai-as-promised": "^7.1.0",
"@types/express": "^4.16.0", "@types/express": "^4.16.0",
"@types/glob": "^7.1.1", "@types/glob": "^7.1.1",
"@types/js-base64": "^2.3.1",
"@types/mocha": "^5.2.5", "@types/mocha": "^5.2.5",
"@types/node": "10.12.18", "@types/node": "10.12.18",
"@types/request": "^2.47.1", "@types/request": "^2.47.1",
......
...@@ -20,22 +20,24 @@ ...@@ -20,22 +20,24 @@
'use strict'; 'use strict';
import * as assert from 'assert'; import * as assert from 'assert';
import { Request, Response, Router } from 'express'; // tslint:disable-next-line:no-implicit-dependencies
import * as bodyParser from 'body-parser'; import * as bodyParser from 'body-parser';
import { Request, Response, Router } from 'express';
import * as fs from 'fs';
import * as path from 'path';
import { Writable } from 'stream';
import { String } from 'typescript-string-operations';
import * as component from '../../common/component'; import * as component from '../../common/component';
import * as fs from 'fs'
import * as path from 'path'
import { getBasePort, getExperimentId } from '../../common/experimentStartupInfo'; import { getBasePort, getExperimentId } from '../../common/experimentStartupInfo';
import { RestServer } from '../../common/restServer' import { RestServer } from '../../common/restServer';
import { getLogDir } from '../../common/utils'; import { getLogDir } from '../../common/utils';
import { Writable } from 'stream';
/** /**
* Cluster Job Training service Rest server, provides rest API to support Cluster job metrics update * Cluster Job Training service Rest server, provides rest API to support Cluster job metrics update
* *
*/ */
@component.Singleton @component.Singleton
export abstract class ClusterJobRestServer extends RestServer{ export abstract class ClusterJobRestServer extends RestServer {
private readonly API_ROOT_URL: string = '/api/v1/nni-pai'; private readonly API_ROOT_URL: string = '/api/v1/nni-pai';
private readonly NNI_METRICS_PATTERN: string = `NNISDK_MEb'(?<metrics>.*?)'`; private readonly NNI_METRICS_PATTERN: string = `NNISDK_MEb'(?<metrics>.*?)'`;
...@@ -51,19 +53,20 @@ export abstract class ClusterJobRestServer extends RestServer{ ...@@ -51,19 +53,20 @@ export abstract class ClusterJobRestServer extends RestServer{
constructor() { constructor() {
super(); super();
const basePort: number = getBasePort(); const basePort: number = getBasePort();
assert(basePort && basePort > 1024); assert(basePort !== undefined && basePort > 1024);
this.port = basePort + 1; this.port = basePort + 1;
} }
public get clusterRestServerPort(): number { public get clusterRestServerPort(): number {
if(!this.port) { if (this.port === undefined) {
throw new Error('PAI Rest server port is undefined'); throw new Error('PAI Rest server port is undefined');
} }
return this.port; return this.port;
} }
public get getErrorMessage(): string | undefined{ public get getErrorMessage(): string | undefined {
return this.errorMessage; return this.errorMessage;
} }
...@@ -79,11 +82,15 @@ export abstract class ClusterJobRestServer extends RestServer{ ...@@ -79,11 +82,15 @@ export abstract class ClusterJobRestServer extends RestServer{
this.app.use(this.API_ROOT_URL, this.createRestHandler()); this.app.use(this.API_ROOT_URL, this.createRestHandler());
} }
// Abstract method to handle trial metrics data
// tslint:disable-next-line:no-any
protected abstract handleTrialMetrics(jobId : string, trialMetrics : any[]) : void;
// tslint:disable: no-unsafe-any no-any
private createRestHandler() : Router { private createRestHandler() : Router {
const router: Router = Router(); const router: Router = Router();
// tslint:disable-next-line:typedef router.use((req: Request, res: Response, next: any) => {
router.use((req: Request, res: Response, next) => {
this.log.info(`${req.method}: ${req.url}: body:\n${JSON.stringify(req.body, undefined, 4)}`); this.log.info(`${req.method}: ${req.url}: body:\n${JSON.stringify(req.body, undefined, 4)}`);
res.setHeader('Content-Type', 'application/json'); res.setHeader('Content-Type', 'application/json');
next(); next();
...@@ -92,7 +99,7 @@ export abstract class ClusterJobRestServer extends RestServer{ ...@@ -92,7 +99,7 @@ export abstract class ClusterJobRestServer extends RestServer{
router.post(`/version/${this.expId}/:trialId`, (req: Request, res: Response) => { router.post(`/version/${this.expId}/:trialId`, (req: Request, res: Response) => {
if (this.enableVersionCheck) { if (this.enableVersionCheck) {
try { try {
const checkResultSuccess: boolean = req.body.tag === 'VCSuccess'? true: false; const checkResultSuccess: boolean = req.body.tag === 'VCSuccess' ? true : false;
if (this.versionCheckSuccess !== undefined && this.versionCheckSuccess !== checkResultSuccess) { if (this.versionCheckSuccess !== undefined && this.versionCheckSuccess !== checkResultSuccess) {
this.errorMessage = 'Version check error, version check result is inconsistent!'; this.errorMessage = 'Version check error, version check result is inconsistent!';
this.log.error(this.errorMessage); this.log.error(this.errorMessage);
...@@ -103,7 +110,7 @@ export abstract class ClusterJobRestServer extends RestServer{ ...@@ -103,7 +110,7 @@ export abstract class ClusterJobRestServer extends RestServer{
this.versionCheckSuccess = false; this.versionCheckSuccess = false;
this.errorMessage = req.body.msg; this.errorMessage = req.body.msg;
} }
} catch(err) { } catch (err) {
this.log.error(`json parse metrics error: ${err}`); this.log.error(`json parse metrics error: ${err}`);
res.status(500); res.status(500);
res.send(err.message); res.send(err.message);
...@@ -122,8 +129,7 @@ export abstract class ClusterJobRestServer extends RestServer{ ...@@ -122,8 +129,7 @@ export abstract class ClusterJobRestServer extends RestServer{
this.handleTrialMetrics(req.body.jobId, req.body.metrics); this.handleTrialMetrics(req.body.jobId, req.body.metrics);
res.send(); res.send();
} } catch (err) {
catch(err) {
this.log.error(`json parse metrics error: ${err}`); this.log.error(`json parse metrics error: ${err}`);
res.status(500); res.status(500);
res.send(err.message); res.send(err.message);
...@@ -131,35 +137,37 @@ export abstract class ClusterJobRestServer extends RestServer{ ...@@ -131,35 +137,37 @@ export abstract class ClusterJobRestServer extends RestServer{
}); });
router.post(`/stdout/${this.expId}/:trialId`, (req: Request, res: Response) => { router.post(`/stdout/${this.expId}/:trialId`, (req: Request, res: Response) => {
if(this.enableVersionCheck && !this.versionCheckSuccess && !this.errorMessage) { if (this.enableVersionCheck && (this.versionCheckSuccess === undefined || !this.versionCheckSuccess)
this.errorMessage = `Version check failed, didn't get version check response from trialKeeper, please check your NNI version in ` && this.errorMessage === undefined) {
+ `NNIManager and TrialKeeper!` this.errorMessage = `Version check failed, didn't get version check response from trialKeeper,`
+ ` please check your NNI version in NNIManager and TrialKeeper!`;
} }
const trialLogPath: string = path.join(getLogDir(), `trial_${req.params.trialId}.log`); const trialLogPath: string = path.join(getLogDir(), `trial_${req.params.trialId}.log`);
try { try {
let skipLogging: boolean = false; let skipLogging: boolean = false;
if(req.body.tag === 'trial' && req.body.msg !== undefined) { if (req.body.tag === 'trial' && req.body.msg !== undefined) {
const metricsContent = req.body.msg.match(this.NNI_METRICS_PATTERN); const metricsContent: any = req.body.msg.match(this.NNI_METRICS_PATTERN);
if(metricsContent && metricsContent.groups) { if (metricsContent && metricsContent.groups) {
this.handleTrialMetrics(req.params.trialId, [metricsContent.groups['metrics']]); const key: string = 'metrics';
this.handleTrialMetrics(req.params.trialId, [metricsContent.groups[key]]);
skipLogging = true; skipLogging = true;
} }
} }
if(!skipLogging){ if (!skipLogging) {
// Construct write stream to write remote trial's log into local file // Construct write stream to write remote trial's log into local file
// tslint:disable-next-line:non-literal-fs-path
const writeStream: Writable = fs.createWriteStream(trialLogPath, { const writeStream: Writable = fs.createWriteStream(trialLogPath, {
flags: 'a+', flags: 'a+',
encoding: 'utf8', encoding: 'utf8',
autoClose: true autoClose: true
}); });
writeStream.write(req.body.msg + '\n'); writeStream.write(String.Format('{0}\n', req.body.msg));
writeStream.end(); writeStream.end();
} }
res.send(); res.send();
} } catch (err) {
catch(err) {
this.log.error(`json parse stdout data error: ${err}`); this.log.error(`json parse stdout data error: ${err}`);
res.status(500); res.status(500);
res.send(err.message); res.send(err.message);
...@@ -168,7 +176,5 @@ export abstract class ClusterJobRestServer extends RestServer{ ...@@ -168,7 +176,5 @@ export abstract class ClusterJobRestServer extends RestServer{
return router; return router;
} }
// tslint:enable: no-unsafe-any no-any
/** Abstract method to handle trial metrics data */ }
protected abstract handleTrialMetrics(jobId : string, trialMetrics : any[]) : void;
}
\ No newline at end of file
...@@ -27,4 +27,4 @@ if python3 -c 'import nni' > /dev/null 2>&1; then ...@@ -27,4 +27,4 @@ if python3 -c 'import nni' > /dev/null 2>&1; then
else else
# Install nni # Install nni
python3 -m pip install --user --upgrade nni python3 -m pip install --user --upgrade nni
fi`; fi`;
\ No newline at end of file
...@@ -65,11 +65,11 @@ export const GPU_INFO_COLLECTOR_FORMAT_LINUX: string = ...@@ -65,11 +65,11 @@ export const GPU_INFO_COLLECTOR_FORMAT_LINUX: string =
export METRIC_OUTPUT_DIR={0} export METRIC_OUTPUT_DIR={0}
echo $$ >{1} echo $$ >{1}
python3 -m nni_gpu_tool.gpu_metrics_collector python3 -m nni_gpu_tool.gpu_metrics_collector
` `;
export const GPU_INFO_COLLECTOR_FORMAT_WINDOWS: string = export const GPU_INFO_COLLECTOR_FORMAT_WINDOWS: string =
` `
$env:METRIC_OUTPUT_DIR="{0}" $env:METRIC_OUTPUT_DIR="{0}"
$app = Start-Process "python" -ArgumentList "-m nni_gpu_tool.gpu_metrics_collector" -passthru -NoNewWindow $app = Start-Process "python" -ArgumentList "-m nni_gpu_tool.gpu_metrics_collector" -passthru -NoNewWindow
Write $app.ID | Out-File {1} -NoNewline -encoding utf8 Write $app.ID | Out-File {1} -NoNewline -encoding utf8
` `;
\ No newline at end of file
...@@ -21,7 +21,10 @@ ...@@ -21,7 +21,10 @@
import { TrialJobStatus } from '../../common/trainingService'; import { TrialJobStatus } from '../../common/trainingService';
// tslint:disable-next-line:max-classes-per-file /**
* Trial job metrics class
* Representing trial job metrics properties
*/
export class JobMetrics { export class JobMetrics {
public readonly jobId: string; public readonly jobId: string;
public readonly metrics: string[]; public readonly metrics: string[];
......
...@@ -24,13 +24,13 @@ ...@@ -24,13 +24,13 @@
* Representing trial job configurable properties * Representing trial job configurable properties
*/ */
export class TrialConfig { export class TrialConfig {
/** Trial command */ // Trial command
public readonly command : string; public readonly command : string;
/** Code directory */ // Code directory
public readonly codeDir : string; public readonly codeDir : string;
/** Required GPU number for trial job. The number should be in [0,100] */ // Required GPU number for trial job. The number should be in [0,100]
public readonly gpuNum : number; public readonly gpuNum : number;
/** /**
...@@ -44,4 +44,4 @@ export class TrialConfig { ...@@ -44,4 +44,4 @@ export class TrialConfig {
this.codeDir = codeDir; this.codeDir = codeDir;
this.gpuNum = gpuNum; this.gpuNum = gpuNum;
} }
} }
\ No newline at end of file
import { getLogger } from "common/log";
/** /**
* Copyright (c) Microsoft Corporation * Copyright (c) Microsoft Corporation
* All rights reserved. * All rights reserved.
...@@ -21,16 +19,15 @@ import { getLogger } from "common/log"; ...@@ -21,16 +19,15 @@ import { getLogger } from "common/log";
'use strict'; 'use strict';
import { countFilesRecursively } from '../../common/utils'
import * as cpp from 'child-process-promise'; import * as cpp from 'child-process-promise';
import * as cp from 'child_process'; import * as cp from 'child_process';
import * as os from 'os';
import * as fs from 'fs'; import * as fs from 'fs';
import { getNewLine } from '../../common/utils'; import * as os from 'os';
import { GPU_INFO_COLLECTOR_FORMAT_LINUX, GPU_INFO_COLLECTOR_FORMAT_WINDOWS } from './gpuData';
import * as path from 'path'; import * as path from 'path';
import { String } from 'typescript-string-operations'; import { String } from 'typescript-string-operations';
import { file } from "../../node_modules/@types/tmp"; import { countFilesRecursively, getNewLine } from '../../common/utils';
import { file } from '../../node_modules/@types/tmp';
import { GPU_INFO_COLLECTOR_FORMAT_LINUX, GPU_INFO_COLLECTOR_FORMAT_WINDOWS } from './gpuData';
/** /**
* Validate codeDir, calculate file count recursively under codeDir, and throw error if any rule is broken * Validate codeDir, calculate file count recursively under codeDir, and throw error if any rule is broken
...@@ -38,16 +35,17 @@ import { file } from "../../node_modules/@types/tmp"; ...@@ -38,16 +35,17 @@ import { file } from "../../node_modules/@types/tmp";
* @param codeDir codeDir in nni config file * @param codeDir codeDir in nni config file
* @returns file number under codeDir * @returns file number under codeDir
*/ */
// tslint:disable: no-redundant-jsdoc
export async function validateCodeDir(codeDir: string) : Promise<number> { export async function validateCodeDir(codeDir: string) : Promise<number> {
let fileCount: number | undefined; let fileCount: number | undefined;
try { try {
fileCount = await countFilesRecursively(codeDir); fileCount = await countFilesRecursively(codeDir);
} catch(error) { } catch (error) {
throw new Error(`Call count file error: ${error}`); throw new Error(`Call count file error: ${error}`);
} }
if(fileCount && fileCount > 1000) { if (fileCount !== undefined && fileCount > 1000) {
const errMessage: string = `Too many files(${fileCount} found}) in ${codeDir},` const errMessage: string = `Too many files(${fileCount} found}) in ${codeDir},`
+ ` please check if it's a valid code dir`; + ` please check if it's a valid code dir`;
throw new Error(errMessage); throw new Error(errMessage);
...@@ -66,6 +64,7 @@ export async function execMkdir(directory: string): Promise<void> { ...@@ -66,6 +64,7 @@ export async function execMkdir(directory: string): Promise<void> {
} else { } else {
await cpp.exec(`mkdir -p ${directory}`); await cpp.exec(`mkdir -p ${directory}`);
} }
return Promise.resolve(); return Promise.resolve();
} }
...@@ -80,6 +79,7 @@ export async function execCopydir(source: string, destination: string): Promise< ...@@ -80,6 +79,7 @@ export async function execCopydir(source: string, destination: string): Promise<
} else { } else {
await cpp.exec(`cp -r ${source} ${destination}`); await cpp.exec(`cp -r ${source} ${destination}`);
} }
return Promise.resolve(); return Promise.resolve();
} }
...@@ -93,14 +93,15 @@ export async function execNewFile(filename: string): Promise<void> { ...@@ -93,14 +93,15 @@ export async function execNewFile(filename: string): Promise<void> {
} else { } else {
await cpp.exec(`touch ${filename}`); await cpp.exec(`touch ${filename}`);
} }
return Promise.resolve(); return Promise.resolve();
} }
/** /**
* run script * run script using powershell or bash
* @param filePath * @param filePath
*/ */
export function execScript(filePath: string): cp.ChildProcess { export function runScript(filePath: string): cp.ChildProcess {
if (process.platform === 'win32') { if (process.platform === 'win32') {
return cp.exec(`powershell.exe -ExecutionPolicy Bypass -file ${filePath}`); return cp.exec(`powershell.exe -ExecutionPolicy Bypass -file ${filePath}`);
} else { } else {
...@@ -119,6 +120,7 @@ export async function execTail(filePath: string): Promise<cpp.childProcessPromis ...@@ -119,6 +120,7 @@ export async function execTail(filePath: string): Promise<cpp.childProcessPromis
} else { } else {
cmdresult = await cpp.exec(`tail -n 1 ${filePath}`); cmdresult = await cpp.exec(`tail -n 1 ${filePath}`);
} }
return Promise.resolve(cmdresult); return Promise.resolve(cmdresult);
} }
...@@ -132,6 +134,7 @@ export async function execRemove(directory: string): Promise<void> { ...@@ -132,6 +134,7 @@ export async function execRemove(directory: string): Promise<void> {
} else { } else {
await cpp.exec(`rm -rf ${directory}`); await cpp.exec(`rm -rf ${directory}`);
} }
return Promise.resolve(); return Promise.resolve();
} }
...@@ -145,37 +148,39 @@ export async function execKill(pid: string): Promise<void> { ...@@ -145,37 +148,39 @@ export async function execKill(pid: string): Promise<void> {
} else { } else {
await cpp.exec(`pkill -P ${pid}`); await cpp.exec(`pkill -P ${pid}`);
} }
return Promise.resolve(); return Promise.resolve();
} }
/** /**
* set environment variable * get command of setting environment variable
* @param variable * @param variable
* @returns command string * @returns command string
*/ */
export function setEnvironmentVariable(variable: { key: string; value: string }): string { export function setEnvironmentVariable(variable: { key: string; value: string }): string {
if (process.platform === 'win32') { if (process.platform === 'win32') {
return `$env:${variable.key}="${variable.value}"`; return `$env:${variable.key}="${variable.value}"`;
} } else {
else{
return `export ${variable.key}=${variable.value}`; return `export ${variable.key}=${variable.value}`;
} }
} }
/** /**
* Compress files in directory to tar file * Compress files in directory to tar file
* @param source_path * @param sourcePath
* @param tar_path * @param tarPath
*/ */
export async function tarAdd(tar_path: string, source_path: string): Promise<void> { export async function tarAdd(tarPath: string, sourcePath: string): Promise<void> {
if (process.platform === 'win32') { if (process.platform === 'win32') {
tar_path = tar_path.split('\\').join('\\\\'); const tarFilePath: string = tarPath.split('\\')
source_path = source_path.split('\\').join('\\\\'); .join('\\\\');
let script: string[] = []; const sourceFilePath: string = sourcePath.split('\\')
.join('\\\\');
const script: string[] = [];
script.push( script.push(
`import os`, `import os`,
`import tarfile`, `import tarfile`,
String.Format(`tar = tarfile.open("{0}","w:gz")\r\nfor root,dir,files in os.walk("{1}"):`, tar_path, source_path), String.Format(`tar = tarfile.open("{0}","w:gz")\r\nfor root,dir,files in os.walk("{1}"):`, tarFilePath, sourceFilePath),
` for file in files:`, ` for file in files:`,
` fullpath = os.path.join(root,file)`, ` fullpath = os.path.join(root,file)`,
` tar.add(fullpath, arcname=file)`, ` tar.add(fullpath, arcname=file)`,
...@@ -184,8 +189,9 @@ export async function tarAdd(tar_path: string, source_path: string): Promise<voi ...@@ -184,8 +189,9 @@ export async function tarAdd(tar_path: string, source_path: string): Promise<voi
const tarScript: string = path.join(os.tmpdir(), 'tar.py'); const tarScript: string = path.join(os.tmpdir(), 'tar.py');
await cpp.exec(`python ${tarScript}`); await cpp.exec(`python ${tarScript}`);
} else { } else {
await cpp.exec(`tar -czf ${tar_path} -C ${source_path} .`); await cpp.exec(`tar -czf ${tarPath} -C ${sourcePath} .`);
} }
return Promise.resolve(); return Promise.resolve();
} }
...@@ -195,9 +201,9 @@ export async function tarAdd(tar_path: string, source_path: string): Promise<voi ...@@ -195,9 +201,9 @@ export async function tarAdd(tar_path: string, source_path: string): Promise<voi
*/ */
export function getScriptName(fileNamePrefix: string): string { export function getScriptName(fileNamePrefix: string): string {
if (process.platform === 'win32') { if (process.platform === 'win32') {
return fileNamePrefix + '.ps1'; return String.Format('{0}.ps1', fileNamePrefix);
} else { } else {
return fileNamePrefix + '.sh'; return String.Format('{0}.sh', fileNamePrefix);
} }
} }
...@@ -206,17 +212,17 @@ export function getScriptName(fileNamePrefix: string): string { ...@@ -206,17 +212,17 @@ export function getScriptName(fileNamePrefix: string): string {
* @param gpuMetricCollectorScriptFolder * @param gpuMetricCollectorScriptFolder
*/ */
export function getgpuMetricsCollectorScriptContent(gpuMetricCollectorScriptFolder: string): string { export function getgpuMetricsCollectorScriptContent(gpuMetricCollectorScriptFolder: string): string {
if(process.platform === 'win32') { if (process.platform === 'win32') {
return String.Format( return String.Format(
GPU_INFO_COLLECTOR_FORMAT_WINDOWS, GPU_INFO_COLLECTOR_FORMAT_WINDOWS,
gpuMetricCollectorScriptFolder, gpuMetricCollectorScriptFolder,
path.join(gpuMetricCollectorScriptFolder, 'pid'), path.join(gpuMetricCollectorScriptFolder, 'pid')
); );
} else { } else {
return String.Format( return String.Format(
GPU_INFO_COLLECTOR_FORMAT_LINUX, GPU_INFO_COLLECTOR_FORMAT_LINUX,
gpuMetricCollectorScriptFolder, gpuMetricCollectorScriptFolder,
path.join(gpuMetricCollectorScriptFolder, 'pid'), path.join(gpuMetricCollectorScriptFolder, 'pid')
); );
} }
} }
...@@ -19,12 +19,15 @@ ...@@ -19,12 +19,15 @@
'use strict'; 'use strict';
import * as fs from 'fs' import * as azureStorage from 'azure-storage';
import * as fs from 'fs';
import * as path from 'path'; import * as path from 'path';
import { Deferred } from 'ts-deferred'; import { Deferred } from 'ts-deferred';
import { String } from 'typescript-string-operations';
import { getLogger } from '../../common/log'; import { getLogger } from '../../common/log';
import { mkDirP } from '../../common/utils'; import { mkDirP } from '../../common/utils';
// tslint:disable: no-redundant-jsdoc no-any no-unsafe-any
export namespace AzureStorageClientUtility { export namespace AzureStorageClientUtility {
/** /**
...@@ -32,16 +35,18 @@ export namespace AzureStorageClientUtility { ...@@ -32,16 +35,18 @@ export namespace AzureStorageClientUtility {
* @param fileServerClient * @param fileServerClient
* @param azureShare * @param azureShare
*/ */
export async function createShare(fileServerClient: any, azureShare: any): Promise<void>{ export async function createShare(fileServerClient: any, azureShare: any): Promise<void> {
const deferred: Deferred<void> = new Deferred<void>(); const deferred: Deferred<void> = new Deferred<void>();
fileServerClient.createShareIfNotExists(azureShare, function(error: any, result: any, response: any) { fileServerClient.createShareIfNotExists(azureShare, (error: any, result: any, response: any) => {
if(error){ if (error) {
getLogger().error(`Create share failed:, ${error}`); getLogger()
deferred.reject(error) .error(`Create share failed:, ${error}`);
}else{ deferred.reject(error);
deferred.resolve() } else {
deferred.resolve();
} }
}) });
return deferred.promise; return deferred.promise;
} }
...@@ -51,16 +56,18 @@ export namespace AzureStorageClientUtility { ...@@ -51,16 +56,18 @@ export namespace AzureStorageClientUtility {
* @param azureFoler * @param azureFoler
* @param azureShare * @param azureShare
*/ */
export async function createDirectory(fileServerClient: any, azureFoler: any, azureShare: any): Promise<void>{ export async function createDirectory(fileServerClient: azureStorage.FileService, azureFoler: any, azureShare: any): Promise<void> {
const deferred: Deferred<void> = new Deferred<void>(); const deferred: Deferred<void> = new Deferred<void>();
fileServerClient.createDirectoryIfNotExists(azureShare, azureFoler, function(error: any, result: any, response: any) { fileServerClient.createDirectoryIfNotExists(azureShare, azureFoler, (error: any, result: any, response: any) => {
if(error){ if (error) {
getLogger().error(`Create directory failed:, ${error}`); getLogger()
.error(`Create directory failed:, ${error}`);
deferred.reject(error); deferred.reject(error);
}else{ } else {
deferred.resolve(); deferred.resolve();
} }
}) });
return deferred.promise; return deferred.promise;
} }
...@@ -69,16 +76,18 @@ export namespace AzureStorageClientUtility { ...@@ -69,16 +76,18 @@ export namespace AzureStorageClientUtility {
* @param fileServerClient * @param fileServerClient
* @param azureDirectory * @param azureDirectory
*/ */
export async function createDirectoryRecursive(fileServerClient: any, azureDirectory: any, azureShare: any): Promise<void>{ export async function createDirectoryRecursive(fileServerClient: azureStorage.FileService, azureDirectory: string,
azureShare: any): Promise<void> {
const deferred: Deferred<void> = new Deferred<void>(); const deferred: Deferred<void> = new Deferred<void>();
let directories = azureDirectory.split("/"); const directories: string[] = azureDirectory.split('/');
let rootDirectory = "" let rootDirectory: string = '';
for(let directory of directories){ for (const directory of directories) {
rootDirectory += directory; rootDirectory += directory;
await createDirectory(fileServerClient, rootDirectory, azureShare); await createDirectory(fileServerClient, rootDirectory, azureShare);
rootDirectory += '/'; rootDirectory += '/';
} }
deferred.resolve(); deferred.resolve();
return deferred.promise; return deferred.promise;
} }
...@@ -90,16 +99,20 @@ export namespace AzureStorageClientUtility { ...@@ -90,16 +99,20 @@ export namespace AzureStorageClientUtility {
* @param azureShare * @param azureShare
* @param localFilePath * @param localFilePath
*/ */
async function uploadFileToAzure(fileServerClient: any, azureDirectory: any, azureFileName: any, azureShare: any, localFilePath: any): Promise<void>{ async function uploadFileToAzure(fileServerClient: any, azureDirectory: string, azureFileName: any, azureShare: any,
localFilePath: string): Promise<void> {
const deferred: Deferred<void> = new Deferred<void>(); const deferred: Deferred<void> = new Deferred<void>();
await fileServerClient.createFileFromLocalFile(azureShare, azureDirectory, azureFileName, localFilePath, function(error: any, result: any, response: any) { await fileServerClient.createFileFromLocalFile(azureShare, azureDirectory, azureFileName, localFilePath,
if(error){ (error: any, result: any, response: any) => {
getLogger().error(`Upload file failed:, ${error}`); if (error) {
getLogger()
.error(`Upload file failed:, ${error}`);
deferred.reject(error); deferred.reject(error);
}else{ } else {
deferred.resolve(); deferred.resolve();
} }
}) });
return deferred.promise; return deferred.promise;
} }
...@@ -111,16 +124,21 @@ export namespace AzureStorageClientUtility { ...@@ -111,16 +124,21 @@ export namespace AzureStorageClientUtility {
* @param azureShare * @param azureShare
* @param localFilePath * @param localFilePath
*/ */
async function downloadFile(fileServerClient: any, azureDirectory: any, azureFileName: any, azureShare: any, localFilePath: any): Promise<void>{ async function downloadFile(fileServerClient: any, azureDirectory: string, azureFileName: any, azureShare: any,
localFilePath: string): Promise<void> {
const deferred: Deferred<void> = new Deferred<void>(); const deferred: Deferred<void> = new Deferred<void>();
await fileServerClient.getFileToStream(azureShare, azureDirectory, azureFileName, fs.createWriteStream(localFilePath), function(error: any, result: any, response: any) { // tslint:disable-next-line:non-literal-fs-path
if(error){ await fileServerClient.getFileToStream(azureShare, azureDirectory, azureFileName, fs.createWriteStream(localFilePath),
getLogger().error(`Download file failed:, ${error}`); (error: any, result: any, response: any) => {
if (error) {
getLogger()
.error(`Download file failed:, ${error}`);
deferred.reject(error); deferred.reject(error);
}else{ } else {
deferred.resolve(); deferred.resolve();
} }
}) });
return deferred.promise; return deferred.promise;
} }
...@@ -131,26 +149,31 @@ export namespace AzureStorageClientUtility { ...@@ -131,26 +149,31 @@ export namespace AzureStorageClientUtility {
* @param azureShare : the azure share used * @param azureShare : the azure share used
* @param localDirectory : local directory to be uploaded * @param localDirectory : local directory to be uploaded
*/ */
export async function uploadDirectory(fileServerClient: any, azureDirectory: any, azureShare: any, localDirectory: any): Promise<void>{ // tslint:disable:non-literal-fs-path
export async function uploadDirectory(fileServerClient: azureStorage.FileService, azureDirectory: string, azureShare: any,
localDirectory: string): Promise<void> {
const deferred: Deferred<void> = new Deferred<void>(); const deferred: Deferred<void> = new Deferred<void>();
const fileNameArray: string[] = fs.readdirSync(localDirectory); const fileNameArray: string[] = fs.readdirSync(localDirectory);
await createDirectoryRecursive(fileServerClient, azureDirectory, azureShare); await createDirectoryRecursive(fileServerClient, azureDirectory, azureShare);
for(let fileName of fileNameArray){ for (const fileName of fileNameArray) {
const fullFilePath: string = path.join(localDirectory, fileName); const fullFilePath: string = path.join(localDirectory, fileName);
try { try {
if (fs.lstatSync(fullFilePath).isFile()) { if (fs.lstatSync(fullFilePath)
.isFile()) {
await uploadFileToAzure(fileServerClient, azureDirectory, fileName, azureShare, fullFilePath); await uploadFileToAzure(fileServerClient, azureDirectory, fileName, azureShare, fullFilePath);
} else { } else {
// If filePath is a directory, recursively copy it to azure // If filePath is a directory, recursively copy it to azure
await uploadDirectory(fileServerClient, azureDirectory + '/' + fileName, azureShare, fullFilePath); await uploadDirectory(fileServerClient, String.Format('{0}/{1}', azureDirectory, fileName), azureShare, fullFilePath);
} }
} catch(error) { } catch (error) {
deferred.reject(error); deferred.reject(error);
return deferred.promise; return deferred.promise;
} }
} }
// All files/directories are copied successfully, resolve // All files/directories are copied successfully, resolve
deferred.resolve(); deferred.resolve();
return deferred.promise; return deferred.promise;
} }
...@@ -161,37 +184,44 @@ export namespace AzureStorageClientUtility { ...@@ -161,37 +184,44 @@ export namespace AzureStorageClientUtility {
* @param azureShare * @param azureShare
* @param localDirectory * @param localDirectory
*/ */
export async function downloadDirectory(fileServerClient: any, azureDirectory:any, azureShare: any, localDirectory: any): Promise<void>{ export async function downloadDirectory(fileServerClient: any, azureDirectory: string, azureShare: any, localDirectory: string):
Promise<void> {
const deferred: Deferred<void> = new Deferred<void>(); const deferred: Deferred<void> = new Deferred<void>();
mkDirP(localDirectory); await mkDirP(localDirectory);
fileServerClient.listFilesAndDirectoriesSegmented(azureShare, azureDirectory, 'null', function(error: any, result: any, response: any) { fileServerClient.listFilesAndDirectoriesSegmented(azureShare, azureDirectory, 'null',
if(('entries' in result) === false){ async (error: any, result: any, response: any) => {
getLogger().error(`list files failed, can't get entries in result`); if (('entries' in result) === false) {
getLogger()
.error(`list files failed, can't get entries in result`);
throw new Error(`list files failed, can't get entries in result`); throw new Error(`list files failed, can't get entries in result`);
} }
if(('files' in result['entries']) === false){ if (('files' in result.entries) === false) {
getLogger().error(`list files failed, can't get files in result['entries']`); getLogger()
.error(`list files failed, can't get files in result['entries']`);
throw new Error(`list files failed, can't get files in result['entries']`); throw new Error(`list files failed, can't get files in result['entries']`);
} }
if(('directories' in result['directories']) === false){ if (('directories' in result.directories) === false) {
getLogger().error(`list files failed, can't get directories in result['entries']`); getLogger()
.error(`list files failed, can't get directories in result['entries']`);
throw new Error(`list files failed, can't get directories in result['entries']`); throw new Error(`list files failed, can't get directories in result['entries']`);
} }
for(var fileName of result['entries']['files']){ for (const fileName of result.entries.files) {
const fullFilePath: string = path.join(localDirectory, fileName.name); const fullFilePath: string = path.join(localDirectory, fileName.name);
downloadFile(fileServerClient, azureDirectory, fileName.name, azureShare, fullFilePath) await downloadFile(fileServerClient, azureDirectory, fileName.name, azureShare, fullFilePath);
} }
for(var directoryName of result['entries']['directories']){ for (const directoryName of result.entries.directories) {
const fullDirectoryPath: string = path.join(localDirectory, directoryName.name) const fullDirectoryPath: string = path.join(localDirectory, directoryName.name);
const fullAzureDirectory: string = path.join(azureDirectory, directoryName.name) const fullAzureDirectory: string = path.join(azureDirectory, directoryName.name);
downloadDirectory(fileServerClient, fullAzureDirectory, azureShare, fullDirectoryPath) await downloadDirectory(fileServerClient, fullAzureDirectory, azureShare, fullDirectoryPath);
} }
deferred.resolve(); deferred.resolve();
}) });
return deferred.promise; return deferred.promise;
} }
} }
// tslint:enable: no-redundant-jsdoc no-any no-unsafe-any
/** /**
* Copyright (c) Microsoft Corporation * Copyright (c) Microsoft Corporation
* All rights reserved. * All rights reserved.
...@@ -20,21 +21,29 @@ ...@@ -20,21 +21,29 @@
'use strict'; 'use strict';
import * as fs from 'fs'; import * as fs from 'fs';
import { KubernetesCRDClient, GeneralK8sClient } from '../kubernetesApiClient'; import { GeneralK8sClient, KubernetesCRDClient } from '../kubernetesApiClient';
abstract class FrameworkControllerClient extends KubernetesCRDClient{ /**
* FrameworkController Client
*/
abstract class FrameworkControllerClient extends KubernetesCRDClient {
/** /**
* Factory method to generate operator cliet * Factory method to generate operator client
*/ */
// tslint:disable-next-line:function-name
public static generateFrameworkControllerClient(): KubernetesCRDClient { public static generateFrameworkControllerClient(): KubernetesCRDClient {
return new FrameworkControllerClientV1(); return new FrameworkControllerClientV1();
} }
} }
/**
* FrameworkController ClientV1
*/
class FrameworkControllerClientV1 extends FrameworkControllerClient { class FrameworkControllerClientV1 extends FrameworkControllerClient {
/** /**
* constructor, to initialize frameworkcontroller CRD definition * constructor, to initialize frameworkcontroller CRD definition
*/ */
// tslint:disable: no-unsafe-any no-any
public constructor() { public constructor() {
super(); super();
this.crdSchema = JSON.parse(fs.readFileSync('./config/frameworkcontroller/frameworkcontrollerjob-crd-v1.json', 'utf8')); this.crdSchema = JSON.parse(fs.readFileSync('./config/frameworkcontroller/frameworkcontrollerjob-crd-v1.json', 'utf8'));
...@@ -42,8 +51,9 @@ class FrameworkControllerClientV1 extends FrameworkControllerClient { ...@@ -42,8 +51,9 @@ class FrameworkControllerClientV1 extends FrameworkControllerClient {
} }
protected get operator(): any { protected get operator(): any {
return this.client.apis["frameworkcontroller.microsoft.com"].v1.namespaces('default').frameworks; return this.client.apis['frameworkcontroller.microsoft.com'].v1.namespaces('default').frameworks;
} }
// tslint:enable: no-unsafe-any no-any
public get containerName(): string { public get containerName(): string {
return 'framework'; return 'framework';
...@@ -51,4 +61,3 @@ class FrameworkControllerClientV1 extends FrameworkControllerClient { ...@@ -51,4 +61,3 @@ class FrameworkControllerClientV1 extends FrameworkControllerClient {
} }
export { FrameworkControllerClient, GeneralK8sClient }; export { FrameworkControllerClient, GeneralK8sClient };
...@@ -20,10 +20,11 @@ ...@@ -20,10 +20,11 @@
'use strict'; 'use strict';
import * as assert from 'assert'; import * as assert from 'assert';
import { KubernetesTrialConfig, KubernetesTrialConfigTemplate, KubernetesClusterConfigAzure, import { AzureStorage, KeyVaultConfig, KubernetesClusterConfig, KubernetesClusterConfigAzure, KubernetesClusterConfigNFS,
KubernetesClusterConfigNFS, NFSConfig, KubernetesStorageKind, keyVaultConfig, AzureStorage, KubernetesClusterConfig, KubernetesStorageKind, KubernetesTrialConfig, KubernetesTrialConfigTemplate, NFSConfig, StorageConfig
StorageConfig } from '../kubernetesConfig' } from '../kubernetesConfig';
// tslint:disable:completed-docs
export class FrameworkAttemptCompletionPolicy { export class FrameworkAttemptCompletionPolicy {
public readonly minFailedTaskCount: number; public readonly minFailedTaskCount: number;
public readonly minSucceededTaskCount: number; public readonly minSucceededTaskCount: number;
...@@ -36,13 +37,13 @@ export class FrameworkAttemptCompletionPolicy { ...@@ -36,13 +37,13 @@ export class FrameworkAttemptCompletionPolicy {
/** /**
* Trial job configuration for FrameworkController * Trial job configuration for FrameworkController
*/ */
export class FrameworkControllerTrialConfigTemplate extends KubernetesTrialConfigTemplate{ export class FrameworkControllerTrialConfigTemplate extends KubernetesTrialConfigTemplate {
public readonly frameworkAttemptCompletionPolicy: FrameworkAttemptCompletionPolicy; public readonly frameworkAttemptCompletionPolicy: FrameworkAttemptCompletionPolicy;
public readonly name: string; public readonly name: string;
public readonly taskNum: number; public readonly taskNum: number;
constructor(taskNum: number, command : string, gpuNum : number, constructor(taskNum: number, command : string, gpuNum : number,
cpuNum: number, memoryMB: number, image: string, cpuNum: number, memoryMB: number, image: string,
frameworkAttemptCompletionPolicy: FrameworkAttemptCompletionPolicy) { frameworkAttemptCompletionPolicy: FrameworkAttemptCompletionPolicy) {
super(command, gpuNum, cpuNum, memoryMB, image); super(command, gpuNum, cpuNum, memoryMB, image);
this.frameworkAttemptCompletionPolicy = frameworkAttemptCompletionPolicy; this.frameworkAttemptCompletionPolicy = frameworkAttemptCompletionPolicy;
this.name = name; this.name = name;
...@@ -50,7 +51,7 @@ export class FrameworkControllerTrialConfigTemplate extends KubernetesTrialConfi ...@@ -50,7 +51,7 @@ export class FrameworkControllerTrialConfigTemplate extends KubernetesTrialConfi
} }
} }
export class FrameworkControllerTrialConfig extends KubernetesTrialConfig{ export class FrameworkControllerTrialConfig extends KubernetesTrialConfig {
public readonly taskRoles: FrameworkControllerTrialConfigTemplate[]; public readonly taskRoles: FrameworkControllerTrialConfigTemplate[];
public readonly codeDir: string; public readonly codeDir: string;
constructor(codeDir: string, taskRoles: FrameworkControllerTrialConfigTemplate[]) { constructor(codeDir: string, taskRoles: FrameworkControllerTrialConfigTemplate[]) {
...@@ -68,6 +69,7 @@ export class FrameworkControllerClusterConfig extends KubernetesClusterConfig { ...@@ -68,6 +69,7 @@ export class FrameworkControllerClusterConfig extends KubernetesClusterConfig {
} }
} }
// tslint:disable:function-name
export class FrameworkControllerClusterConfigNFS extends KubernetesClusterConfigNFS { export class FrameworkControllerClusterConfigNFS extends KubernetesClusterConfigNFS {
public readonly serviceAccountName: string; public readonly serviceAccountName: string;
constructor( constructor(
...@@ -81,8 +83,9 @@ export class FrameworkControllerClusterConfigNFS extends KubernetesClusterConfig ...@@ -81,8 +83,9 @@ export class FrameworkControllerClusterConfigNFS extends KubernetesClusterConfig
} }
public static getInstance(jsonObject: object): FrameworkControllerClusterConfigNFS { public static getInstance(jsonObject: object): FrameworkControllerClusterConfigNFS {
let kubeflowClusterConfigObjectNFS = <FrameworkControllerClusterConfigNFS>jsonObject; const kubeflowClusterConfigObjectNFS: FrameworkControllerClusterConfigNFS = <FrameworkControllerClusterConfigNFS>jsonObject;
assert (kubeflowClusterConfigObjectNFS !== undefined) assert (kubeflowClusterConfigObjectNFS !== undefined);
return new FrameworkControllerClusterConfigNFS( return new FrameworkControllerClusterConfigNFS(
kubeflowClusterConfigObjectNFS.serviceAccountName, kubeflowClusterConfigObjectNFS.serviceAccountName,
kubeflowClusterConfigObjectNFS.apiVersion, kubeflowClusterConfigObjectNFS.apiVersion,
...@@ -98,16 +101,17 @@ export class FrameworkControllerClusterConfigAzure extends KubernetesClusterConf ...@@ -98,16 +101,17 @@ export class FrameworkControllerClusterConfigAzure extends KubernetesClusterConf
constructor( constructor(
serviceAccountName: string, serviceAccountName: string,
apiVersion: string, apiVersion: string,
keyVault: keyVaultConfig, keyVault: KeyVaultConfig,
azureStorage: AzureStorage, azureStorage: AzureStorage,
storage?: KubernetesStorageKind storage?: KubernetesStorageKind
) { ) {
super(apiVersion, keyVault, azureStorage,storage); super(apiVersion, keyVault, azureStorage, storage);
this.serviceAccountName = serviceAccountName; this.serviceAccountName = serviceAccountName;
} }
public static getInstance(jsonObject: object): FrameworkControllerClusterConfigAzure { public static getInstance(jsonObject: object): FrameworkControllerClusterConfigAzure {
let kubeflowClusterConfigObjectAzure = <FrameworkControllerClusterConfigAzure>jsonObject; const kubeflowClusterConfigObjectAzure: FrameworkControllerClusterConfigAzure = <FrameworkControllerClusterConfigAzure>jsonObject;
return new FrameworkControllerClusterConfigAzure( return new FrameworkControllerClusterConfigAzure(
kubeflowClusterConfigObjectAzure.serviceAccountName, kubeflowClusterConfigObjectAzure.serviceAccountName,
kubeflowClusterConfigObjectAzure.apiVersion, kubeflowClusterConfigObjectAzure.apiVersion,
...@@ -121,11 +125,11 @@ export class FrameworkControllerClusterConfigAzure extends KubernetesClusterConf ...@@ -121,11 +125,11 @@ export class FrameworkControllerClusterConfigAzure extends KubernetesClusterConf
export class FrameworkControllerClusterConfigFactory { export class FrameworkControllerClusterConfigFactory {
public static generateFrameworkControllerClusterConfig(jsonObject: object): FrameworkControllerClusterConfig { public static generateFrameworkControllerClusterConfig(jsonObject: object): FrameworkControllerClusterConfig {
let storageConfig = <StorageConfig>jsonObject; const storageConfig: StorageConfig = <StorageConfig>jsonObject;
if(!storageConfig) { if (storageConfig === undefined) {
throw new Error("Invalid json object as a StorageConfig instance"); throw new Error('Invalid json object as a StorageConfig instance');
} }
if(storageConfig.storage && storageConfig.storage === 'azureStorage') { if (storageConfig.storage !== undefined && storageConfig.storage === 'azureStorage') {
return FrameworkControllerClusterConfigAzure.getInstance(jsonObject); return FrameworkControllerClusterConfigAzure.getInstance(jsonObject);
} else if (storageConfig.storage === undefined || storageConfig.storage === 'nfs') { } else if (storageConfig.storage === undefined || storageConfig.storage === 'nfs') {
return FrameworkControllerClusterConfigNFS.getInstance(jsonObject); return FrameworkControllerClusterConfigNFS.getInstance(jsonObject);
...@@ -134,6 +138,7 @@ export class FrameworkControllerClusterConfigFactory { ...@@ -134,6 +138,7 @@ export class FrameworkControllerClusterConfigFactory {
} }
} }
export type FrameworkControllerJobStatus = 'AttemptRunning' | 'Completed' | 'AttemptCreationPending' | 'AttemptCreationRequested' | 'AttemptPreparing' | 'AttemptCompleted'; export type FrameworkControllerJobStatus =
'AttemptRunning' | 'Completed' | 'AttemptCreationPending' | 'AttemptCreationRequested' | 'AttemptPreparing' | 'AttemptCompleted';
export type FrameworkControllerJobCompleteStatus = 'Succeeded' | 'Failed'; export type FrameworkControllerJobCompleteStatus = 'Succeeded' | 'Failed';
\ No newline at end of file
...@@ -19,66 +19,74 @@ ...@@ -19,66 +19,74 @@
'use strict'; 'use strict';
import { KubernetesTrialJobDetail} from '../kubernetesData';
import { KubernetesCRDClient } from '../kubernetesApiClient'; import { KubernetesCRDClient } from '../kubernetesApiClient';
import { KubernetesTrialJobDetail} from '../kubernetesData';
import { KubernetesJobInfoCollector } from '../kubernetesJobInfoCollector'; import { KubernetesJobInfoCollector } from '../kubernetesJobInfoCollector';
import { FrameworkControllerJobStatus, FrameworkControllerJobCompleteStatus } from './frameworkcontrollerConfig'; import { FrameworkControllerJobCompleteStatus, FrameworkControllerJobStatus } from './frameworkcontrollerConfig';
/** /**
* Collects frameworkcontroller job info from the Kubernetes cluster and updates frameworkcontroller job status locally * Collects frameworkcontroller job info from the Kubernetes cluster and updates frameworkcontroller job status locally
*/ */
export class FrameworkControllerJobInfoCollector extends KubernetesJobInfoCollector{ export class FrameworkControllerJobInfoCollector extends KubernetesJobInfoCollector {
constructor(jobMap: Map<string, KubernetesTrialJobDetail>) { constructor(jobMap: Map<string, KubernetesTrialJobDetail>) {
super(jobMap); super(jobMap);
} }
protected async retrieveSingleTrialJobInfo(kubernetesCRDClient: KubernetesCRDClient | undefined, protected async retrieveSingleTrialJobInfo(kubernetesCRDClient: KubernetesCRDClient | undefined,
kubernetesTrialJob : KubernetesTrialJobDetail) : Promise<void> { kubernetesTrialJob : KubernetesTrialJobDetail) : Promise<void> {
if (!this.statusesNeedToCheck.includes(kubernetesTrialJob.status)) { if (!this.statusesNeedToCheck.includes(kubernetesTrialJob.status)) {
return Promise.resolve(); return Promise.resolve();
} }
if(kubernetesCRDClient === undefined) { if (kubernetesCRDClient === undefined) {
return Promise.reject('kubernetesCRDClient is undefined'); return Promise.reject('kubernetesCRDClient is undefined');
} }
// tslint:disable-next-line:no-any
let kubernetesJobInfo: any; let kubernetesJobInfo: any;
try { try {
kubernetesJobInfo = await kubernetesCRDClient.getKubernetesJob(kubernetesTrialJob.kubernetesJobName); kubernetesJobInfo = await kubernetesCRDClient.getKubernetesJob(kubernetesTrialJob.kubernetesJobName);
} catch(error) { } catch (error) {
this.log.error(`Get job ${kubernetesTrialJob.kubernetesJobName} info failed, error is ${error}`); this.log.error(`Get job ${kubernetesTrialJob.kubernetesJobName} info failed, error is ${error}`);
// This is not treated as an error status // This is not treated as an error status
return Promise.resolve(); return Promise.resolve();
} }
if(kubernetesJobInfo.status && kubernetesJobInfo.status.state) { // tslint:disable: no-unsafe-any
if (kubernetesJobInfo.status && kubernetesJobInfo.status.state) {
const frameworkJobType: FrameworkControllerJobStatus = <FrameworkControllerJobStatus>kubernetesJobInfo.status.state; const frameworkJobType: FrameworkControllerJobStatus = <FrameworkControllerJobStatus>kubernetesJobInfo.status.state;
switch(frameworkJobType) { switch (frameworkJobType) {
case 'AttemptCreationPending' || 'AttemptCreationRequested' || 'AttemptPreparing': case 'AttemptCreationPending':
case 'AttemptCreationRequested':
case 'AttemptPreparing':
kubernetesTrialJob.status = 'WAITING'; kubernetesTrialJob.status = 'WAITING';
break; break;
case 'AttemptRunning': case 'AttemptRunning':
kubernetesTrialJob.status = 'RUNNING'; kubernetesTrialJob.status = 'RUNNING';
if(!kubernetesTrialJob.startTime) { if (kubernetesTrialJob.startTime === undefined) {
kubernetesTrialJob.startTime = Date.parse(<string>kubernetesJobInfo.status.startTime); kubernetesTrialJob.startTime = Date.parse(<string>kubernetesJobInfo.status.startTime);
} }
break; break;
case 'Completed': case 'Completed':
const completedJobType : FrameworkControllerJobCompleteStatus = <FrameworkControllerJobCompleteStatus>kubernetesJobInfo.status.attemptStatus.completionStatus.type.name; const completedJobType : FrameworkControllerJobCompleteStatus =
switch(completedJobType) { <FrameworkControllerJobCompleteStatus>kubernetesJobInfo.status.attemptStatus.completionStatus.type.name;
switch (completedJobType) {
case 'Succeeded': case 'Succeeded':
kubernetesTrialJob.status = 'SUCCEEDED'; kubernetesTrialJob.status = 'SUCCEEDED';
break; break;
case 'Failed': case 'Failed':
kubernetesTrialJob.status = 'FAILED'; kubernetesTrialJob.status = 'FAILED';
break; break;
default:
} }
kubernetesTrialJob.endTime = Date.parse(<string>kubernetesJobInfo.status.completionTime); kubernetesTrialJob.endTime = Date.parse(<string>kubernetesJobInfo.status.completionTime);
break; break;
default: default:
break;
} }
} }
return Promise.resolve(); return Promise.resolve();
} }
} // tslint:enable: no-unsafe-any
\ No newline at end of file }
...@@ -20,16 +20,16 @@ ...@@ -20,16 +20,16 @@
'use strict'; 'use strict';
import * as component from '../../../common/component'; import * as component from '../../../common/component';
import { KubernetesJobRestServer } from '../kubernetesJobRestServer';
import { FrameworkControllerTrainingService } from './frameworkcontrollerTrainingService'; import { FrameworkControllerTrainingService } from './frameworkcontrollerTrainingService';
import { KubernetesJobRestServer } from '../kubernetesJobRestServer'
/** /**
* frameworkcontroller training service REST server; provides a REST API to support frameworkcontroller job metrics updates * frameworkcontroller training service REST server; provides a REST API to support frameworkcontroller job metrics updates
* *
*/ */
@component.Singleton @component.Singleton
export class FrameworkControllerJobRestServer extends KubernetesJobRestServer{ export class FrameworkControllerJobRestServer extends KubernetesJobRestServer {
constructor() { constructor() {
super(component.get(FrameworkControllerTrainingService)); super(component.get(FrameworkControllerTrainingService));
} }
} }
\ No newline at end of file
...@@ -20,18 +20,22 @@ ...@@ -20,18 +20,22 @@
'use strict'; 'use strict';
import * as fs from 'fs'; import * as fs from 'fs';
import { GeneralK8sClient, KubernetesCRDClient } from '../kubernetesApiClient';
import { KubeflowOperator } from './kubeflowConfig'; import { KubeflowOperator } from './kubeflowConfig';
import { KubernetesCRDClient, GeneralK8sClient } from '../kubernetesApiClient';
abstract class KubeflowOperatorClient extends KubernetesCRDClient{ /**
* KubeflowOperator Client
*/
abstract class KubeflowOperatorClient extends KubernetesCRDClient {
/** /**
* Factory method to generate operator cliet * Factory method to generate operator client
*/ */
// tslint:disable-next-line:function-name
public static generateOperatorClient(kubeflowOperator: KubeflowOperator, public static generateOperatorClient(kubeflowOperator: KubeflowOperator,
operatorApiVersion: string): KubernetesCRDClient { operatorApiVersion: string): KubernetesCRDClient {
switch(kubeflowOperator) { switch (kubeflowOperator) {
case 'tf-operator': { case 'tf-operator': {
switch(operatorApiVersion) { switch (operatorApiVersion) {
case 'v1alpha2': { case 'v1alpha2': {
return new TFOperatorClientV1Alpha2(); return new TFOperatorClientV1Alpha2();
} }
...@@ -41,11 +45,12 @@ abstract class KubeflowOperatorClient extends KubernetesCRDClient{ ...@@ -41,11 +45,12 @@ abstract class KubeflowOperatorClient extends KubernetesCRDClient{
case 'v1beta2': { case 'v1beta2': {
return new TFOperatorClientV1Beta2(); return new TFOperatorClientV1Beta2();
} }
default:
throw new Error(`Invalid tf-operator apiVersion ${operatorApiVersion}`);
} }
break;
} }
case 'pytorch-operator': { case 'pytorch-operator': {
switch(operatorApiVersion) { switch (operatorApiVersion) {
case 'v1alpha2': { case 'v1alpha2': {
return new PyTorchOperatorClientV1Alpha2(); return new PyTorchOperatorClientV1Alpha2();
} }
...@@ -55,13 +60,17 @@ abstract class KubeflowOperatorClient extends KubernetesCRDClient{ ...@@ -55,13 +60,17 @@ abstract class KubeflowOperatorClient extends KubernetesCRDClient{
case 'v1beta2': { case 'v1beta2': {
return new PyTorchOperatorClientV1Beta2(); return new PyTorchOperatorClientV1Beta2();
} }
default:
throw new Error(`Invalid pytorch-operator apiVersion ${operatorApiVersion}`);
} }
} }
default:
throw new Error(`Invalid operator ${kubeflowOperator}`);
} }
throw new Error(`Invalid operator ${kubeflowOperator} or apiVersion ${operatorApiVersion}`);
} }
} }
// tslint:disable: no-unsafe-any no-any completed-docs
class TFOperatorClientV1Alpha2 extends KubeflowOperatorClient { class TFOperatorClientV1Alpha2 extends KubeflowOperatorClient {
/** /**
* constructor, to initialize tfjob CRD definition * constructor, to initialize tfjob CRD definition
...@@ -73,7 +82,7 @@ class TFOperatorClientV1Alpha2 extends KubeflowOperatorClient { ...@@ -73,7 +82,7 @@ class TFOperatorClientV1Alpha2 extends KubeflowOperatorClient {
} }
protected get operator(): any { protected get operator(): any {
return this.client.apis["kubeflow.org"].v1alpha2.namespaces('default').tfjobs; return this.client.apis['kubeflow.org'].v1alpha2.namespaces('default').tfjobs;
} }
public get containerName(): string { public get containerName(): string {
...@@ -92,7 +101,7 @@ class TFOperatorClientV1Beta1 extends KubernetesCRDClient { ...@@ -92,7 +101,7 @@ class TFOperatorClientV1Beta1 extends KubernetesCRDClient {
} }
protected get operator(): any { protected get operator(): any {
return this.client.apis["kubeflow.org"].v1beta1.namespaces('default').tfjobs; return this.client.apis['kubeflow.org'].v1beta1.namespaces('default').tfjobs;
} }
public get containerName(): string { public get containerName(): string {
...@@ -111,12 +120,12 @@ class TFOperatorClientV1Beta2 extends KubernetesCRDClient { ...@@ -111,12 +120,12 @@ class TFOperatorClientV1Beta2 extends KubernetesCRDClient {
} }
protected get operator(): any { protected get operator(): any {
return this.client.apis["kubeflow.org"].v1beta2.namespaces('default').tfjobs; return this.client.apis['kubeflow.org'].v1beta2.namespaces('default').tfjobs;
} }
public get containerName(): string { public get containerName(): string {
return 'tensorflow'; return 'tensorflow';
} }
} }
class PyTorchOperatorClientV1Alpha2 extends KubeflowOperatorClient { class PyTorchOperatorClientV1Alpha2 extends KubeflowOperatorClient {
...@@ -130,7 +139,7 @@ class PyTorchOperatorClientV1Alpha2 extends KubeflowOperatorClient { ...@@ -130,7 +139,7 @@ class PyTorchOperatorClientV1Alpha2 extends KubeflowOperatorClient {
} }
protected get operator(): any { protected get operator(): any {
return this.client.apis["kubeflow.org"].v1alpha2.namespaces('default').pytorchjobs; return this.client.apis['kubeflow.org'].v1alpha2.namespaces('default').pytorchjobs;
} }
public get containerName(): string { public get containerName(): string {
...@@ -149,7 +158,7 @@ class PyTorchOperatorClientV1Beta1 extends KubernetesCRDClient { ...@@ -149,7 +158,7 @@ class PyTorchOperatorClientV1Beta1 extends KubernetesCRDClient {
} }
protected get operator(): any { protected get operator(): any {
return this.client.apis["kubeflow.org"].v1beta1.namespaces('default').pytorchjobs; return this.client.apis['kubeflow.org'].v1beta1.namespaces('default').pytorchjobs;
} }
public get containerName(): string { public get containerName(): string {
...@@ -168,7 +177,7 @@ class PyTorchOperatorClientV1Beta2 extends KubernetesCRDClient { ...@@ -168,7 +177,7 @@ class PyTorchOperatorClientV1Beta2 extends KubernetesCRDClient {
} }
protected get operator(): any { protected get operator(): any {
return this.client.apis["kubeflow.org"].v1beta2.namespaces('default').pytorchjobs; return this.client.apis['kubeflow.org'].v1beta2.namespaces('default').pytorchjobs;
} }
public get containerName(): string { public get containerName(): string {
...@@ -176,5 +185,5 @@ class PyTorchOperatorClientV1Beta2 extends KubernetesCRDClient { ...@@ -176,5 +185,5 @@ class PyTorchOperatorClientV1Beta2 extends KubernetesCRDClient {
} }
} }
// tslint:enable: no-unsafe-any
export { KubeflowOperatorClient, GeneralK8sClient }; export { KubeflowOperatorClient, GeneralK8sClient };
...@@ -20,16 +20,20 @@ ...@@ -20,16 +20,20 @@
'use strict'; 'use strict';
import * as assert from 'assert'; import * as assert from 'assert';
import { KubernetesClusterConfigAzure, KubernetesClusterConfigNFS, KubernetesStorageKind, NFSConfig, AzureStorage, keyVaultConfig,
KubernetesTrialConfig, KubernetesTrialConfigTemplate, StorageConfig, KubernetesClusterConfig } from '../kubernetesConfig'
import { MethodNotImplementedError } from '../../../common/errors'; import { MethodNotImplementedError } from '../../../common/errors';
import { AzureStorage, KeyVaultConfig, KubernetesClusterConfig, KubernetesClusterConfigAzure, KubernetesClusterConfigNFS,
KubernetesStorageKind, KubernetesTrialConfig, KubernetesTrialConfigTemplate, NFSConfig, StorageConfig
} from '../kubernetesConfig';
/** operator types that kubeflow supports */ // operator types that kubeflow supports
export type KubeflowOperator = 'tf-operator' | 'pytorch-operator' ; export type KubeflowOperator = 'tf-operator' | 'pytorch-operator' ;
export type DistTrainRole = 'worker' | 'ps' | 'master'; export type DistTrainRole = 'worker' | 'ps' | 'master';
export type KubeflowJobStatus = 'Created' | 'Running' | 'Failed' | 'Succeeded'; export type KubeflowJobStatus = 'Created' | 'Running' | 'Failed' | 'Succeeded';
export type OperatorApiVersion = 'v1alpha2' | 'v1beta1' | 'v1beta2'; export type OperatorApiVersion = 'v1alpha2' | 'v1beta1' | 'v1beta2';
/**
* Kubeflow Cluster Configuration
*/
export class KubeflowClusterConfig extends KubernetesClusterConfig { export class KubeflowClusterConfig extends KubernetesClusterConfig {
public readonly operator: KubeflowOperator; public readonly operator: KubeflowOperator;
constructor(apiVersion: string, operator: KubeflowOperator) { constructor(apiVersion: string, operator: KubeflowOperator) {
...@@ -38,6 +42,7 @@ export class KubeflowClusterConfig extends KubernetesClusterConfig { ...@@ -38,6 +42,7 @@ export class KubeflowClusterConfig extends KubernetesClusterConfig {
} }
} }
// tslint:disable:completed-docs
export class KubeflowClusterConfigNFS extends KubernetesClusterConfigNFS { export class KubeflowClusterConfigNFS extends KubernetesClusterConfigNFS {
public readonly operator: KubeflowOperator; public readonly operator: KubeflowOperator;
constructor( constructor(
...@@ -54,9 +59,11 @@ export class KubeflowClusterConfigNFS extends KubernetesClusterConfigNFS { ...@@ -54,9 +59,11 @@ export class KubeflowClusterConfigNFS extends KubernetesClusterConfigNFS {
return 'nfs'; return 'nfs';
} }
// tslint:disable-next-line:function-name
public static getInstance(jsonObject: object): KubeflowClusterConfigNFS { public static getInstance(jsonObject: object): KubeflowClusterConfigNFS {
let kubeflowClusterConfigObjectNFS = <KubeflowClusterConfigNFS>jsonObject; const kubeflowClusterConfigObjectNFS: KubeflowClusterConfigNFS = <KubeflowClusterConfigNFS>jsonObject;
assert (kubeflowClusterConfigObjectNFS !== undefined) assert (kubeflowClusterConfigObjectNFS !== undefined);
return new KubeflowClusterConfigNFS( return new KubeflowClusterConfigNFS(
kubeflowClusterConfigObjectNFS.operator, kubeflowClusterConfigObjectNFS.operator,
kubeflowClusterConfigObjectNFS.apiVersion, kubeflowClusterConfigObjectNFS.apiVersion,
...@@ -66,26 +73,28 @@ export class KubeflowClusterConfigNFS extends KubernetesClusterConfigNFS { ...@@ -66,26 +73,28 @@ export class KubeflowClusterConfigNFS extends KubernetesClusterConfigNFS {
} }
} }
export class KubeflowClusterConfigAzure extends KubernetesClusterConfigAzure{ export class KubeflowClusterConfigAzure extends KubernetesClusterConfigAzure {
public readonly operator: KubeflowOperator; public readonly operator: KubeflowOperator;
/**
 * Stores the Kubeflow operator and forwards the remaining arguments to the
 * KubernetesClusterConfigAzure constructor.
 *
 * @param operator Kubeflow operator, kept on this.operator
 * @param apiVersion passed through to the base class
 * @param keyVault passed through to the base class
 * @param azureStorage passed through to the base class
 * @param storage optional storage kind, passed through to the base class
 */
constructor(
    operator: KubeflowOperator,
    apiVersion: string,
    keyVault: KeyVaultConfig,
    azureStorage: AzureStorage,
    storage?: KubernetesStorageKind
) {
    super(apiVersion, keyVault, azureStorage, storage);
    this.operator = operator;
}
/**
 * Storage kind of this cluster configuration; always 'azureStorage'.
 */
public get storageType(): KubernetesStorageKind {
    return 'azureStorage';
}
// tslint:disable-next-line:function-name
public static getInstance(jsonObject: object): KubeflowClusterConfigAzure { public static getInstance(jsonObject: object): KubeflowClusterConfigAzure {
let kubeflowClusterConfigObjectAzure = <KubeflowClusterConfigAzure>jsonObject; const kubeflowClusterConfigObjectAzure: KubeflowClusterConfigAzure = <KubeflowClusterConfigAzure>jsonObject;
return new KubeflowClusterConfigAzure( return new KubeflowClusterConfigAzure(
kubeflowClusterConfigObjectAzure.operator, kubeflowClusterConfigObjectAzure.operator,
kubeflowClusterConfigObjectAzure.apiVersion, kubeflowClusterConfigObjectAzure.apiVersion,
...@@ -98,12 +107,13 @@ export class KubeflowClusterConfigAzure extends KubernetesClusterConfigAzure{ ...@@ -98,12 +107,13 @@ export class KubeflowClusterConfigAzure extends KubernetesClusterConfigAzure{
export class KubeflowClusterConfigFactory { export class KubeflowClusterConfigFactory {
// tslint:disable-next-line:function-name
public static generateKubeflowClusterConfig(jsonObject: object): KubeflowClusterConfig { public static generateKubeflowClusterConfig(jsonObject: object): KubeflowClusterConfig {
let storageConfig = <StorageConfig>jsonObject; const storageConfig: StorageConfig = <StorageConfig>jsonObject;
if(!storageConfig) { if (storageConfig === undefined) {
throw new Error("Invalid json object as a StorageConfig instance"); throw new Error('Invalid json object as a StorageConfig instance');
} }
if(storageConfig.storage && storageConfig.storage === 'azureStorage') { if (storageConfig.storage !== undefined && storageConfig.storage === 'azureStorage') {
return KubeflowClusterConfigAzure.getInstance(jsonObject); return KubeflowClusterConfigAzure.getInstance(jsonObject);
} else if (storageConfig.storage === undefined || storageConfig.storage === 'nfs') { } else if (storageConfig.storage === undefined || storageConfig.storage === 'nfs') {
return KubeflowClusterConfigNFS.getInstance(jsonObject); return KubeflowClusterConfigNFS.getInstance(jsonObject);
...@@ -122,10 +132,10 @@ export class KubeflowTrialConfig extends KubernetesTrialConfig { ...@@ -122,10 +132,10 @@ export class KubeflowTrialConfig extends KubernetesTrialConfig {
} }
} }
export class KubeflowTrialConfigTemplate extends KubernetesTrialConfigTemplate{ export class KubeflowTrialConfigTemplate extends KubernetesTrialConfigTemplate {
public readonly replicas: number; public readonly replicas: number;
/**
 * Stores the replica count and forwards the remaining arguments to the
 * KubernetesTrialConfigTemplate constructor.
 *
 * @param replicas kept on this.replicas
 * @param command passed through to the base class
 * @param gpuNum passed through to the base class
 * @param cpuNum passed through to the base class
 * @param memoryMB passed through to the base class
 * @param image passed through to the base class
 */
constructor(replicas: number, command : string, gpuNum : number,
            cpuNum: number, memoryMB: number, image: string) {
    super(command, gpuNum, cpuNum, memoryMB, image);
    this.replicas = replicas;
}
...@@ -163,22 +173,25 @@ export class KubeflowTrialConfigPytorch extends KubeflowTrialConfig { ...@@ -163,22 +173,25 @@ export class KubeflowTrialConfigPytorch extends KubeflowTrialConfig {
/**
 * Factory that builds the operator-specific Kubeflow trial configuration from a parsed JSON object.
 */
export class KubeflowTrialConfigFactory {

    /**
     * Create the trial configuration that matches the given Kubeflow operator.
     *
     * For 'tf-operator' the codeDir, worker and ps fields of jsonObject are used; for
     * 'pytorch-operator' the codeDir, master and worker fields are used. The object is only
     * cast, not validated, so absent fields are handed to the constructors as undefined.
     *
     * @param jsonObject parsed trial configuration
     * @param operator Kubeflow operator the configuration is built for
     * @returns a KubeflowTrialConfigTensorflow or KubeflowTrialConfigPytorch instance
     * @throws Error when operator is neither 'tf-operator' nor 'pytorch-operator'
     */
    // tslint:disable-next-line:function-name
    public static generateKubeflowTrialConfig(jsonObject: object, operator: KubeflowOperator): KubeflowTrialConfig {
        switch (operator) {
            case 'tf-operator': {
                const tfTrialConfig: KubeflowTrialConfigTensorflow = <KubeflowTrialConfigTensorflow>jsonObject;

                return new KubeflowTrialConfigTensorflow(
                    tfTrialConfig.codeDir,
                    tfTrialConfig.worker,
                    tfTrialConfig.ps
                );
            }
            case 'pytorch-operator': {
                const pytorchTrialConfig: KubeflowTrialConfigPytorch = <KubeflowTrialConfigPytorch>jsonObject;

                return new KubeflowTrialConfigPytorch(
                    pytorchTrialConfig.codeDir,
                    pytorchTrialConfig.master,
                    pytorchTrialConfig.worker
                );
            }
            default:
                // Interpolating the object directly would render as "[object Object]"; serialize it
                // and name the operator, which is the value that is actually unsupported here.
                throw new Error(`Invalid json object ${JSON.stringify(jsonObject)} for kubeflow operator '${String(operator)}'`);
        }
    }
}
...@@ -19,50 +19,52 @@ ...@@ -19,50 +19,52 @@
'use strict'; 'use strict';
import { KubernetesTrialJobDetail} from '../kubernetesData';
import { KubernetesCRDClient } from '../kubernetesApiClient'; import { KubernetesCRDClient } from '../kubernetesApiClient';
import { KubernetesTrialJobDetail} from '../kubernetesData';
import { KubernetesJobInfoCollector } from '../kubernetesJobInfoCollector'; import { KubernetesJobInfoCollector } from '../kubernetesJobInfoCollector';
import { KubeflowJobStatus } from './kubeflowConfig'; import { KubeflowJobStatus } from './kubeflowConfig';
/** /**
* Collector Kubeflow jobs info from Kubernetes cluster, and update kubeflow job status locally * Collector Kubeflow jobs info from Kubernetes cluster, and update kubeflow job status locally
*/ */
export class KubeflowJobInfoCollector extends KubernetesJobInfoCollector{ export class KubeflowJobInfoCollector extends KubernetesJobInfoCollector {
/**
 * Hands the trial job map to the KubernetesJobInfoCollector constructor.
 *
 * @param jobMap KubernetesTrialJobDetail entries keyed by string
 */
constructor(jobMap: Map<string, KubernetesTrialJobDetail>) {
    super(jobMap);
}
protected async retrieveSingleTrialJobInfo(kubernetesCRDClient: KubernetesCRDClient | undefined, protected async retrieveSingleTrialJobInfo(kubernetesCRDClient: KubernetesCRDClient | undefined,
kubernetesTrialJob : KubernetesTrialJobDetail) : Promise<void> { kubernetesTrialJob : KubernetesTrialJobDetail) : Promise<void> {
if (!this.statusesNeedToCheck.includes(kubernetesTrialJob.status)) { if (!this.statusesNeedToCheck.includes(kubernetesTrialJob.status)) {
return Promise.resolve(); return Promise.resolve();
} }
if(kubernetesCRDClient === undefined) { if (kubernetesCRDClient === undefined) {
return Promise.reject('kubernetesCRDClient is undefined'); return Promise.reject('kubernetesCRDClient is undefined');
} }
// tslint:disable:no-any no-unsafe-any
let kubernetesJobInfo: any; let kubernetesJobInfo: any;
try { try {
kubernetesJobInfo = await kubernetesCRDClient.getKubernetesJob(kubernetesTrialJob.kubernetesJobName); kubernetesJobInfo = await kubernetesCRDClient.getKubernetesJob(kubernetesTrialJob.kubernetesJobName);
} catch(error) { } catch (error) {
// Notice: it maynot be a 'real' error since cancel trial job can also cause getKubernetesJob failed. // Notice: it maynot be a 'real' error since cancel trial job can also cause getKubernetesJob failed.
this.log.error(`Get job ${kubernetesTrialJob.kubernetesJobName} info failed, error is ${error}`); this.log.error(`Get job ${kubernetesTrialJob.kubernetesJobName} info failed, error is ${error}`);
//This is not treat as a error status //This is not treat as a error status
return Promise.resolve(); return Promise.resolve();
} }
if(kubernetesJobInfo.status && kubernetesJobInfo.status.conditions) { if (kubernetesJobInfo.status && kubernetesJobInfo.status.conditions) {
const latestCondition = kubernetesJobInfo.status.conditions[kubernetesJobInfo.status.conditions.length - 1]; const latestCondition: any = kubernetesJobInfo.status.conditions[kubernetesJobInfo.status.conditions.length - 1];
const tfJobType : KubeflowJobStatus = <KubeflowJobStatus>latestCondition.type; const tfJobType : KubeflowJobStatus = <KubeflowJobStatus>latestCondition.type;
switch(tfJobType) { switch (tfJobType) {
case 'Created': case 'Created':
kubernetesTrialJob.status = 'WAITING'; kubernetesTrialJob.status = 'WAITING';
kubernetesTrialJob.startTime = Date.parse(<string>latestCondition.lastUpdateTime); kubernetesTrialJob.startTime = Date.parse(<string>latestCondition.lastUpdateTime);
break; break;
case 'Running': case 'Running':
kubernetesTrialJob.status = 'RUNNING'; kubernetesTrialJob.status = 'RUNNING';
if(!kubernetesTrialJob.startTime) { if (kubernetesTrialJob.startTime === undefined) {
kubernetesTrialJob.startTime = Date.parse(<string>latestCondition.lastUpdateTime); kubernetesTrialJob.startTime = Date.parse(<string>latestCondition.lastUpdateTime);
} }
break; break;
...@@ -75,9 +77,10 @@ export class KubeflowJobInfoCollector extends KubernetesJobInfoCollector{ ...@@ -75,9 +77,10 @@ export class KubeflowJobInfoCollector extends KubernetesJobInfoCollector{
kubernetesTrialJob.endTime = Date.parse(<string>latestCondition.lastUpdateTime); kubernetesTrialJob.endTime = Date.parse(<string>latestCondition.lastUpdateTime);
break; break;
default: default:
break;
} }
} }
// tslint:enable:no-any no-unsafe-any
return Promise.resolve(); return Promise.resolve();
} }
} }
\ No newline at end of file
...@@ -20,19 +20,19 @@ ...@@ -20,19 +20,19 @@
'use strict'; 'use strict';
import * as component from '../../../common/component'; import * as component from '../../../common/component';
import { KubernetesJobRestServer } from '../kubernetesJobRestServer';
import { KubeflowTrainingService } from './kubeflowTrainingService'; import { KubeflowTrainingService } from './kubeflowTrainingService';
import { KubernetesJobRestServer } from '../kubernetesJobRestServer'
/**
 * Kubeflow Training service Rest server, provides rest API to support kubeflow job metrics update
 *
 * Registered as a singleton component; all behavior comes from KubernetesJobRestServer.
 */
@component.Singleton
export class KubeflowJobRestServer extends KubernetesJobRestServer {
    /**
     * Resolves the KubeflowTrainingService through the component container and
     * passes it to the KubernetesJobRestServer constructor.
     */
    constructor() {
        super(component.get(KubeflowTrainingService));
    }
}
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment