Commit 22993e5d authored by demianzhang, committed by chicm-ms

Pass tslint for training service (#1177)

* fix local and remote training services tslint
parent ae7a72bc
......@@ -67,10 +67,10 @@ The tool dispatches and runs trial jobs generated by tuning algorithms to search
<li><a href="docs/en_US/BuiltinTuner.md#MetisTuner">Metis Tuner</a></li>
<li><a href="docs/en_US/BuiltinTuner.md#BOHB">BOHB</a></li>
</ul>
<a href="docs/en_US/BuiltinAssessors.md">Assessor</a>
<a href="docs/en_US/BuiltinAssessor.md">Assessor</a>
<ul>
<li><a href="docs/en_US/BuiltinAssessors.md#Medianstop">Median Stop</a></li>
<li><a href="docs/en_US/BuiltinAssessors.md#Curvefitting">Curve Fitting</a></li>
<li><a href="docs/en_US/BuiltinAssessor.md#Medianstop">Median Stop</a></li>
<li><a href="docs/en_US/BuiltinAssessor.md#Curvefitting">Curve Fitting</a></li>
</ul>
</td>
<td>
......@@ -118,12 +118,6 @@ python3 -m pip install --upgrade nni
Windows
**IMPORTANT** Make sure `ExecutionPolicy` has been set to `Unrestricted` before installation. To set the policy, **run PowerShell as administrator** with the following command:
```bash
Set-ExecutionPolicy -ExecutionPolicy Unrestricted
```
Once ExecutionPolicy is unrestricted, run the following command to install NNI:
```bash
python -m pip install --upgrade nni
```
......
......@@ -21,16 +21,6 @@ For other examples you need to change trial command `python3` into `python` in e
Make sure C++ 14.0 compiler installed.
>building 'simplejson._speedups' extension error: [WinError 3] The system cannot find the path specified
### Fail to run PowerShell when install NNI from source
If you run PowerShell script for the first time and did not set the execution policies for executing the script, you will meet this error below. Try to run PowerShell as administrator with this command first:
```bash
Set-ExecutionPolicy -ExecutionPolicy Unrestricted
```
>...cannot be loaded because running scripts is disabled on this system.
### Trial failed with missing DLL in command line or PowerShell
This error caused by missing LIBIFCOREMD.DLL and LIBMMD.DLL and fail to install SciPy. Using Anaconda or Miniconda with Python(64-bit) can solve it.
......@@ -38,11 +28,7 @@ This error caused by missing LIBIFCOREMD.DLL and LIBMMD.DLL and fail to install
### Trial failed on webUI
Please check the trial log file stderr for more details. If there is no such file and NNI is installed through pip, then you need to run PowerShell as administrator with this command first:
```bash
Set-ExecutionPolicy -ExecutionPolicy Unrestricted
```
Please check the trial log file stderr for more details.
If there is a stderr file, please check out. Two possible cases are as follows:
......
......@@ -91,6 +91,7 @@ interface TrialJobMetric {
* define TrainingServiceError
*/
class TrainingServiceError extends Error {
private errCode: number;
constructor(errorCode: number, errorMessage: string) {
......@@ -136,5 +137,3 @@ export {
TrainingServiceMetadata, TrialJobDetail, TrialJobMetric, HyperParameters,
HostJobApplicationForm, JobApplicationForm, JobType, NNIManagerIpConfig
};
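The hunk above adds an explicit `errCode` field to `TrainingServiceError`. Below is a minimal standalone sketch of how such an error class can be defined and consumed; the `errorCode` getter and the sample values are illustrative assumptions, not part of the diff.

```typescript
// Minimal sketch of a training-service error carrying a numeric code.
// The getter name and sample values are assumptions for illustration only.
class TrainingServiceError extends Error {
    private errCode: number;

    constructor(errorCode: number, errorMessage: string) {
        super(errorMessage);
        this.errCode = errorCode;
    }

    get errorCode(): number {
        return this.errCode;
    }
}

try {
    throw new TrainingServiceError(500, 'trial job submission failed');
} catch (err) {
    if (err instanceof TrainingServiceError) {
        console.log(`code ${err.errorCode}: ${err.message}`);
    }
}
```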
......@@ -33,6 +33,7 @@
"@types/chai-as-promised": "^7.1.0",
"@types/express": "^4.16.0",
"@types/glob": "^7.1.1",
"@types/js-base64": "^2.3.1",
"@types/mocha": "^5.2.5",
"@types/node": "10.12.18",
"@types/request": "^2.47.1",
......
......@@ -20,22 +20,24 @@
'use strict';
import * as assert from 'assert';
import { Request, Response, Router } from 'express';
// tslint:disable-next-line:no-implicit-dependencies
import * as bodyParser from 'body-parser';
import { Request, Response, Router } from 'express';
import * as fs from 'fs';
import * as path from 'path';
import { Writable } from 'stream';
import { String } from 'typescript-string-operations';
import * as component from '../../common/component';
import * as fs from 'fs'
import * as path from 'path'
import { getBasePort, getExperimentId } from '../../common/experimentStartupInfo';
import { RestServer } from '../../common/restServer'
import { RestServer } from '../../common/restServer';
import { getLogDir } from '../../common/utils';
import { Writable } from 'stream';
/**
* Cluster Job Training service Rest server, provides rest API to support Cluster job metrics update
*
*/
@component.Singleton
export abstract class ClusterJobRestServer extends RestServer{
export abstract class ClusterJobRestServer extends RestServer {
private readonly API_ROOT_URL: string = '/api/v1/nni-pai';
private readonly NNI_METRICS_PATTERN: string = `NNISDK_MEb'(?<metrics>.*?)'`;
......@@ -51,19 +53,20 @@ export abstract class ClusterJobRestServer extends RestServer{
constructor() {
super();
const basePort: number = getBasePort();
assert(basePort && basePort > 1024);
assert(basePort !== undefined && basePort > 1024);
this.port = basePort + 1;
}
public get clusterRestServerPort(): number {
if(!this.port) {
if (this.port === undefined) {
throw new Error('PAI Rest server port is undefined');
}
return this.port;
}
public get getErrorMessage(): string | undefined{
public get getErrorMessage(): string | undefined {
return this.errorMessage;
}
......@@ -79,11 +82,15 @@ export abstract class ClusterJobRestServer extends RestServer{
this.app.use(this.API_ROOT_URL, this.createRestHandler());
}
// Abstract method to handle trial metrics data
// tslint:disable-next-line:no-any
protected abstract handleTrialMetrics(jobId : string, trialMetrics : any[]) : void;
// tslint:disable: no-unsafe-any no-any
private createRestHandler() : Router {
const router: Router = Router();
// tslint:disable-next-line:typedef
router.use((req: Request, res: Response, next) => {
router.use((req: Request, res: Response, next: any) => {
this.log.info(`${req.method}: ${req.url}: body:\n${JSON.stringify(req.body, undefined, 4)}`);
res.setHeader('Content-Type', 'application/json');
next();
......@@ -92,7 +99,7 @@ export abstract class ClusterJobRestServer extends RestServer{
router.post(`/version/${this.expId}/:trialId`, (req: Request, res: Response) => {
if (this.enableVersionCheck) {
try {
const checkResultSuccess: boolean = req.body.tag === 'VCSuccess'? true: false;
const checkResultSuccess: boolean = req.body.tag === 'VCSuccess' ? true : false;
if (this.versionCheckSuccess !== undefined && this.versionCheckSuccess !== checkResultSuccess) {
this.errorMessage = 'Version check error, version check result is inconsistent!';
this.log.error(this.errorMessage);
......@@ -103,7 +110,7 @@ export abstract class ClusterJobRestServer extends RestServer{
this.versionCheckSuccess = false;
this.errorMessage = req.body.msg;
}
} catch(err) {
} catch (err) {
this.log.error(`json parse metrics error: ${err}`);
res.status(500);
res.send(err.message);
......@@ -122,8 +129,7 @@ export abstract class ClusterJobRestServer extends RestServer{
this.handleTrialMetrics(req.body.jobId, req.body.metrics);
res.send();
}
catch(err) {
} catch (err) {
this.log.error(`json parse metrics error: ${err}`);
res.status(500);
res.send(err.message);
......@@ -131,35 +137,37 @@ export abstract class ClusterJobRestServer extends RestServer{
});
router.post(`/stdout/${this.expId}/:trialId`, (req: Request, res: Response) => {
if(this.enableVersionCheck && !this.versionCheckSuccess && !this.errorMessage) {
this.errorMessage = `Version check failed, didn't get version check response from trialKeeper, please check your NNI version in `
+ `NNIManager and TrialKeeper!`
if (this.enableVersionCheck && (this.versionCheckSuccess === undefined || !this.versionCheckSuccess)
&& this.errorMessage === undefined) {
this.errorMessage = `Version check failed, didn't get version check response from trialKeeper,`
+ ` please check your NNI version in NNIManager and TrialKeeper!`;
}
const trialLogPath: string = path.join(getLogDir(), `trial_${req.params.trialId}.log`);
try {
let skipLogging: boolean = false;
if(req.body.tag === 'trial' && req.body.msg !== undefined) {
const metricsContent = req.body.msg.match(this.NNI_METRICS_PATTERN);
if(metricsContent && metricsContent.groups) {
this.handleTrialMetrics(req.params.trialId, [metricsContent.groups['metrics']]);
if (req.body.tag === 'trial' && req.body.msg !== undefined) {
const metricsContent: any = req.body.msg.match(this.NNI_METRICS_PATTERN);
if (metricsContent && metricsContent.groups) {
const key: string = 'metrics';
this.handleTrialMetrics(req.params.trialId, [metricsContent.groups[key]]);
skipLogging = true;
}
}
if(!skipLogging){
if (!skipLogging) {
// Construct write stream to write remote trial's log into local file
// tslint:disable-next-line:non-literal-fs-path
const writeStream: Writable = fs.createWriteStream(trialLogPath, {
flags: 'a+',
encoding: 'utf8',
autoClose: true
});
writeStream.write(req.body.msg + '\n');
writeStream.write(String.Format('{0}\n', req.body.msg));
writeStream.end();
}
res.send();
}
catch(err) {
} catch (err) {
this.log.error(`json parse stdout data error: ${err}`);
res.status(500);
res.send(err.message);
......@@ -168,7 +176,5 @@ export abstract class ClusterJobRestServer extends RestServer{
return router;
}
/** Abstract method to handle trial metrics data */
protected abstract handleTrialMetrics(jobId : string, trialMetrics : any[]) : void;
// tslint:enable: no-unsafe-any no-any
}
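The `/stdout` handler above decides, per log line, whether to forward a metric to `handleTrialMetrics` or append the line to the trial's log file, based on `NNI_METRICS_PATTERN`. A minimal sketch of that extraction is shown below; the sample log line is invented for illustration and assumes an ES2018+ target for named capture groups.

```typescript
// Sketch of the metrics extraction done in the /stdout handler above.
const NNI_METRICS_PATTERN: RegExp = /NNISDK_MEb'(?<metrics>.*?)'/;

const line: string = "NNISDK_MEb'{\"type\": \"FINAL\", \"value\": 0.97}'";
const match: RegExpMatchArray | null = line.match(NNI_METRICS_PATTERN);

if (match !== null && match.groups !== undefined) {
    // Only the named 'metrics' group is forwarded to handleTrialMetrics.
    console.log(`metric payload: ${match.groups['metrics']}`);
} else {
    // Non-metric lines fall through and are appended to trial_<id>.log instead.
    console.log('not a metric line, append to the trial log file');
}
```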
......@@ -65,11 +65,11 @@ export const GPU_INFO_COLLECTOR_FORMAT_LINUX: string =
export METRIC_OUTPUT_DIR={0}
echo $$ >{1}
python3 -m nni_gpu_tool.gpu_metrics_collector
`
`;
export const GPU_INFO_COLLECTOR_FORMAT_WINDOWS: string =
`
$env:METRIC_OUTPUT_DIR="{0}"
$app = Start-Process "python" -ArgumentList "-m nni_gpu_tool.gpu_metrics_collector" -passthru -NoNewWindow
Write $app.ID | Out-File {1} -NoNewline -encoding utf8
`
\ No newline at end of file
`;
......@@ -21,7 +21,10 @@
import { TrialJobStatus } from '../../common/trainingService';
// tslint:disable-next-line:max-classes-per-file
/**
* Trial job metrics class
* Representing trial job metrics properties
*/
export class JobMetrics {
public readonly jobId: string;
public readonly metrics: string[];
......
......@@ -24,13 +24,13 @@
* Representing trial job configurable properties
*/
export class TrialConfig {
/** Trail command */
// Trail command
public readonly command : string;
/** Code directory */
// Code directory
public readonly codeDir : string;
/** Required GPU number for trial job. The number should be in [0,100] */
// Required GPU number for trial job. The number should be in [0,100]
public readonly gpuNum : number;
/**
......
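The comments above describe the three configurable trial properties (`command`, `codeDir`, `gpuNum`). A minimal sketch of a matching configuration object follows; the interface name and literal values are placeholders, not taken from the NNI source.

```typescript
// Placeholder interface mirroring the documented trial properties.
interface TrialConfigLike {
    readonly command: string;   // command used to launch the trial
    readonly codeDir: string;   // directory containing the trial code
    readonly gpuNum: number;    // requested GPU count, expected to be in [0, 100]
}

const exampleConfig: TrialConfigLike = {
    command: 'python3 mnist.py',
    codeDir: '/home/user/nni/examples/trials/mnist',
    gpuNum: 1
};

console.log(JSON.stringify(exampleConfig, undefined, 2));
```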
import { getLogger } from "common/log";
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
......@@ -21,16 +19,15 @@ import { getLogger } from "common/log";
'use strict';
import { countFilesRecursively } from '../../common/utils'
import * as cpp from 'child-process-promise';
import * as cp from 'child_process';
import * as os from 'os';
import * as fs from 'fs';
import { getNewLine } from '../../common/utils';
import { GPU_INFO_COLLECTOR_FORMAT_LINUX, GPU_INFO_COLLECTOR_FORMAT_WINDOWS } from './gpuData';
import * as os from 'os';
import * as path from 'path';
import { String } from 'typescript-string-operations';
import { file } from "../../node_modules/@types/tmp";
import { countFilesRecursively, getNewLine } from '../../common/utils';
import { file } from '../../node_modules/@types/tmp';
import { GPU_INFO_COLLECTOR_FORMAT_LINUX, GPU_INFO_COLLECTOR_FORMAT_WINDOWS } from './gpuData';
/**
* Validate codeDir, calculate file count recursively under codeDir, and throw error if any rule is broken
......@@ -38,16 +35,17 @@ import { file } from "../../node_modules/@types/tmp";
* @param codeDir codeDir in nni config file
* @returns file number under codeDir
*/
// tslint:disable: no-redundant-jsdoc
export async function validateCodeDir(codeDir: string) : Promise<number> {
let fileCount: number | undefined;
try {
fileCount = await countFilesRecursively(codeDir);
} catch(error) {
} catch (error) {
throw new Error(`Call count file error: ${error}`);
}
if(fileCount && fileCount > 1000) {
if (fileCount !== undefined && fileCount > 1000) {
const errMessage: string = `Too many files(${fileCount} found}) in ${codeDir},`
+ ` please check if it's a valid code dir`;
throw new Error(errMessage);
......@@ -66,6 +64,7 @@ export async function execMkdir(directory: string): Promise<void> {
} else {
await cpp.exec(`mkdir -p ${directory}`);
}
return Promise.resolve();
}
......@@ -80,6 +79,7 @@ export async function execCopydir(source: string, destination: string): Promise<
} else {
await cpp.exec(`cp -r ${source} ${destination}`);
}
return Promise.resolve();
}
......@@ -93,14 +93,15 @@ export async function execNewFile(filename: string): Promise<void> {
} else {
await cpp.exec(`touch ${filename}`);
}
return Promise.resolve();
}
/**
* run script
* run script using powershell or bash
* @param filePath
*/
export function execScript(filePath: string): cp.ChildProcess {
export function runScript(filePath: string): cp.ChildProcess {
if (process.platform === 'win32') {
return cp.exec(`powershell.exe -ExecutionPolicy Bypass -file ${filePath}`);
} else {
......@@ -119,6 +120,7 @@ export async function execTail(filePath: string): Promise<cpp.childProcessPromis
} else {
cmdresult = await cpp.exec(`tail -n 1 ${filePath}`);
}
return Promise.resolve(cmdresult);
}
......@@ -132,6 +134,7 @@ export async function execRemove(directory: string): Promise<void> {
} else {
await cpp.exec(`rm -rf ${directory}`);
}
return Promise.resolve();
}
......@@ -145,37 +148,39 @@ export async function execKill(pid: string): Promise<void> {
} else {
await cpp.exec(`pkill -P ${pid}`);
}
return Promise.resolve();
}
/**
* set environment variable
* get command of setting environment variable
* @param variable
* @returns command string
*/
export function setEnvironmentVariable(variable: { key: string; value: string }): string {
if (process.platform === 'win32') {
return `$env:${variable.key}="${variable.value}"`;
}
else{
} else {
return `export ${variable.key}=${variable.value}`;
}
}
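`setEnvironmentVariable` above only builds a command string; the caller is expected to prepend it to the launched script. A standalone sketch of the same platform switch, assuming a Node.js environment:

```typescript
// Sketch of the per-platform environment-variable command built above.
function envVarCommand(key: string, value: string): string {
    if (process.platform === 'win32') {
        // PowerShell syntax
        return `$env:${key}="${value}"`;
    } else {
        // bash syntax
        return `export ${key}=${value}`;
    }
}

console.log(envVarCommand('METRIC_OUTPUT_DIR', '/tmp/nni/metrics'));
```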
/**
* Compress files in directory to tar file
* @param source_path
* @param tar_path
* @param sourcePath
* @param tarPath
*/
export async function tarAdd(tar_path: string, source_path: string): Promise<void> {
export async function tarAdd(tarPath: string, sourcePath: string): Promise<void> {
if (process.platform === 'win32') {
tar_path = tar_path.split('\\').join('\\\\');
source_path = source_path.split('\\').join('\\\\');
let script: string[] = [];
const tarFilePath: string = tarPath.split('\\')
.join('\\\\');
const sourceFilePath: string = sourcePath.split('\\')
.join('\\\\');
const script: string[] = [];
script.push(
`import os`,
`import tarfile`,
String.Format(`tar = tarfile.open("{0}","w:gz")\r\nfor root,dir,files in os.walk("{1}"):`, tar_path, source_path),
String.Format(`tar = tarfile.open("{0}","w:gz")\r\nfor root,dir,files in os.walk("{1}"):`, tarFilePath, sourceFilePath),
` for file in files:`,
` fullpath = os.path.join(root,file)`,
` tar.add(fullpath, arcname=file)`,
......@@ -184,8 +189,9 @@ export async function tarAdd(tar_path: string, source_path: string): Promise<voi
const tarScript: string = path.join(os.tmpdir(), 'tar.py');
await cpp.exec(`python ${tarScript}`);
} else {
await cpp.exec(`tar -czf ${tar_path} -C ${source_path} .`);
await cpp.exec(`tar -czf ${tarPath} -C ${sourcePath} .`);
}
return Promise.resolve();
}
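On non-Windows platforms `tarAdd` shells out to `tar -czf <tarPath> -C <sourcePath> .`. A minimal sketch of that branch using Node's built-in `child_process` (rather than the `child-process-promise` package used in the diff) is below; the paths are placeholders.

```typescript
// Sketch of the non-Windows branch of tarAdd using built-in child_process.
import { exec } from 'child_process';
import { promisify } from 'util';

const execAsync = promisify(exec);

async function tarDirectory(tarPath: string, sourcePath: string): Promise<void> {
    // -C changes into sourcePath first, so archive entries are relative to it.
    await execAsync(`tar -czf ${tarPath} -C ${sourcePath} .`);
}

tarDirectory('/tmp/nni/code.tar.gz', '/home/user/nni/examples/trials/mnist')
    .catch((err: Error) => console.error(`tar failed: ${err.message}`));
```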
......@@ -195,9 +201,9 @@ export async function tarAdd(tar_path: string, source_path: string): Promise<voi
*/
export function getScriptName(fileNamePrefix: string): string {
if (process.platform === 'win32') {
return fileNamePrefix + '.ps1';
return String.Format('{0}.ps1', fileNamePrefix);
} else {
return fileNamePrefix + '.sh';
return String.Format('{0}.sh', fileNamePrefix);
}
}
......@@ -206,17 +212,17 @@ export function getScriptName(fileNamePrefix: string): string {
* @param gpuMetricCollectorScriptFolder
*/
export function getgpuMetricsCollectorScriptContent(gpuMetricCollectorScriptFolder: string): string {
if(process.platform === 'win32') {
if (process.platform === 'win32') {
return String.Format(
GPU_INFO_COLLECTOR_FORMAT_WINDOWS,
gpuMetricCollectorScriptFolder,
path.join(gpuMetricCollectorScriptFolder, 'pid'),
path.join(gpuMetricCollectorScriptFolder, 'pid')
);
} else {
return String.Format(
GPU_INFO_COLLECTOR_FORMAT_LINUX,
gpuMetricCollectorScriptFolder,
path.join(gpuMetricCollectorScriptFolder, 'pid'),
path.join(gpuMetricCollectorScriptFolder, 'pid')
);
}
}
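`getgpuMetricsCollectorScriptContent` fills the `{0}`/`{1}` slots of the collector templates with the metrics folder and a pid-file path via `String.Format`. A minimal sketch of that substitution for the Linux template, assuming the `typescript-string-operations` package is installed; the folder path is a placeholder.

```typescript
// Sketch of assembling the GPU metrics collector script from the template above.
import * as path from 'path';
import { String } from 'typescript-string-operations';

const GPU_INFO_COLLECTOR_FORMAT_LINUX: string =
`
export METRIC_OUTPUT_DIR={0}
echo $$ >{1}
python3 -m nni_gpu_tool.gpu_metrics_collector
`;

const scriptFolder: string = '/tmp/nni/gpu_metrics';
const scriptContent: string = String.Format(
    GPU_INFO_COLLECTOR_FORMAT_LINUX,
    scriptFolder,                       // {0}: where the collector writes metrics
    path.join(scriptFolder, 'pid')      // {1}: file holding the collector's PID
);

console.log(scriptContent);
```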
......@@ -19,12 +19,15 @@
'use strict';
import * as fs from 'fs'
import * as azureStorage from 'azure-storage';
import * as fs from 'fs';
import * as path from 'path';
import { Deferred } from 'ts-deferred';
import { String } from 'typescript-string-operations';
import { getLogger } from '../../common/log';
import { mkDirP } from '../../common/utils';
// tslint:disable: no-redundant-jsdoc no-any no-unsafe-any
export namespace AzureStorageClientUtility {
/**
......@@ -32,16 +35,18 @@ export namespace AzureStorageClientUtility {
* @param fileServerClient
* @param azureShare
*/
export async function createShare(fileServerClient: any, azureShare: any): Promise<void>{
export async function createShare(fileServerClient: any, azureShare: any): Promise<void> {
const deferred: Deferred<void> = new Deferred<void>();
fileServerClient.createShareIfNotExists(azureShare, function(error: any, result: any, response: any) {
if(error){
getLogger().error(`Create share failed:, ${error}`);
deferred.reject(error)
}else{
deferred.resolve()
}
})
fileServerClient.createShareIfNotExists(azureShare, (error: any, result: any, response: any) => {
if (error) {
getLogger()
.error(`Create share failed:, ${error}`);
deferred.reject(error);
} else {
deferred.resolve();
}
});
return deferred.promise;
}
......@@ -51,16 +56,18 @@ export namespace AzureStorageClientUtility {
* @param azureFoler
* @param azureShare
*/
export async function createDirectory(fileServerClient: any, azureFoler: any, azureShare: any): Promise<void>{
export async function createDirectory(fileServerClient: azureStorage.FileService, azureFoler: any, azureShare: any): Promise<void> {
const deferred: Deferred<void> = new Deferred<void>();
fileServerClient.createDirectoryIfNotExists(azureShare, azureFoler, function(error: any, result: any, response: any) {
if(error){
getLogger().error(`Create directory failed:, ${error}`);
fileServerClient.createDirectoryIfNotExists(azureShare, azureFoler, (error: any, result: any, response: any) => {
if (error) {
getLogger()
.error(`Create directory failed:, ${error}`);
deferred.reject(error);
}else{
} else {
deferred.resolve();
}
})
});
return deferred.promise;
}
......@@ -69,16 +76,18 @@ export namespace AzureStorageClientUtility {
* @param fileServerClient
* @param azureDirectory
*/
export async function createDirectoryRecursive(fileServerClient: any, azureDirectory: any, azureShare: any): Promise<void>{
export async function createDirectoryRecursive(fileServerClient: azureStorage.FileService, azureDirectory: string,
azureShare: any): Promise<void> {
const deferred: Deferred<void> = new Deferred<void>();
let directories = azureDirectory.split("/");
let rootDirectory = ""
for(let directory of directories){
const directories: string[] = azureDirectory.split('/');
let rootDirectory: string = '';
for (const directory of directories) {
rootDirectory += directory;
await createDirectory(fileServerClient, rootDirectory, azureShare);
rootDirectory += '/';
}
deferred.resolve();
return deferred.promise;
}
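`createDirectoryRecursive` walks the Azure path one segment at a time so that each parent exists before its children are created. A standalone sketch of that prefix walk, with the Azure call replaced by a logging callback:

```typescript
// Sketch of the prefix walk used by createDirectoryRecursive above.
// The callback stands in for the real Azure file-service call.
async function createDirectoryRecursiveSketch(
    azureDirectory: string,
    createDirectory: (dir: string) => Promise<void>
): Promise<void> {
    const directories: string[] = azureDirectory.split('/');
    let rootDirectory: string = '';
    for (const directory of directories) {
        rootDirectory += directory;
        await createDirectory(rootDirectory);
        rootDirectory += '/';
    }
}

createDirectoryRecursiveSketch('nni/experiments/abc123', async (dir: string) => {
    console.log(`would create: ${dir}`);
}).catch(console.error);
```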
......@@ -90,16 +99,20 @@ export namespace AzureStorageClientUtility {
* @param azureShare
* @param localFilePath
*/
async function uploadFileToAzure(fileServerClient: any, azureDirectory: any, azureFileName: any, azureShare: any, localFilePath: any): Promise<void>{
async function uploadFileToAzure(fileServerClient: any, azureDirectory: string, azureFileName: any, azureShare: any,
localFilePath: string): Promise<void> {
const deferred: Deferred<void> = new Deferred<void>();
await fileServerClient.createFileFromLocalFile(azureShare, azureDirectory, azureFileName, localFilePath, function(error: any, result: any, response: any) {
if(error){
getLogger().error(`Upload file failed:, ${error}`);
await fileServerClient.createFileFromLocalFile(azureShare, azureDirectory, azureFileName, localFilePath,
(error: any, result: any, response: any) => {
if (error) {
getLogger()
.error(`Upload file failed:, ${error}`);
deferred.reject(error);
}else{
} else {
deferred.resolve();
}
})
});
return deferred.promise;
}
......@@ -111,16 +124,21 @@ export namespace AzureStorageClientUtility {
* @param azureShare
* @param localFilePath
*/
async function downloadFile(fileServerClient: any, azureDirectory: any, azureFileName: any, azureShare: any, localFilePath: any): Promise<void>{
async function downloadFile(fileServerClient: any, azureDirectory: string, azureFileName: any, azureShare: any,
localFilePath: string): Promise<void> {
const deferred: Deferred<void> = new Deferred<void>();
await fileServerClient.getFileToStream(azureShare, azureDirectory, azureFileName, fs.createWriteStream(localFilePath), function(error: any, result: any, response: any) {
if(error){
getLogger().error(`Download file failed:, ${error}`);
// tslint:disable-next-line:non-literal-fs-path
await fileServerClient.getFileToStream(azureShare, azureDirectory, azureFileName, fs.createWriteStream(localFilePath),
(error: any, result: any, response: any) => {
if (error) {
getLogger()
.error(`Download file failed:, ${error}`);
deferred.reject(error);
}else{
} else {
deferred.resolve();
}
})
});
return deferred.promise;
}
......@@ -131,26 +149,31 @@ export namespace AzureStorageClientUtility {
* @param azureShare : the azure share used
* @param localDirectory : local directory to be uploaded
*/
export async function uploadDirectory(fileServerClient: any, azureDirectory: any, azureShare: any, localDirectory: any): Promise<void>{
// tslint:disable:non-literal-fs-path
export async function uploadDirectory(fileServerClient: azureStorage.FileService, azureDirectory: string, azureShare: any,
localDirectory: string): Promise<void> {
const deferred: Deferred<void> = new Deferred<void>();
const fileNameArray: string[] = fs.readdirSync(localDirectory);
await createDirectoryRecursive(fileServerClient, azureDirectory, azureShare);
for(let fileName of fileNameArray){
for (const fileName of fileNameArray) {
const fullFilePath: string = path.join(localDirectory, fileName);
try {
if (fs.lstatSync(fullFilePath).isFile()) {
if (fs.lstatSync(fullFilePath)
.isFile()) {
await uploadFileToAzure(fileServerClient, azureDirectory, fileName, azureShare, fullFilePath);
} else {
// If filePath is a directory, recuisively copy it to azure
await uploadDirectory(fileServerClient, azureDirectory + '/' + fileName, azureShare, fullFilePath);
await uploadDirectory(fileServerClient, String.Format('{0}/{1}', azureDirectory, fileName), azureShare, fullFilePath);
}
} catch(error) {
} catch (error) {
deferred.reject(error);
return deferred.promise;
}
}
// All files/directories are copied successfully, resolve
deferred.resolve();
return deferred.promise;
}
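`uploadDirectory` reads the local directory synchronously, uploads regular files, and recurses into subdirectories with an extended remote path. A minimal sketch of that traversal with the Azure upload replaced by a print callback; the remote path is a placeholder.

```typescript
// Sketch of the local traversal behind uploadDirectory above; no Azure calls are made.
import * as fs from 'fs';
import * as path from 'path';

async function walkAndUpload(
    remoteDirectory: string,
    localDirectory: string,
    uploadFile: (remoteDir: string, fileName: string, localPath: string) => Promise<void>
): Promise<void> {
    for (const fileName of fs.readdirSync(localDirectory)) {
        const fullFilePath: string = path.join(localDirectory, fileName);
        if (fs.lstatSync(fullFilePath).isFile()) {
            await uploadFile(remoteDirectory, fileName, fullFilePath);
        } else {
            // Directories are mirrored by recursing with an extended remote path.
            await walkAndUpload(`${remoteDirectory}/${fileName}`, fullFilePath, uploadFile);
        }
    }
}

walkAndUpload('nni/codeDir', '.', async (remoteDir, name, localPath) => {
    console.log(`${localPath} -> ${remoteDir}/${name}`);
}).catch(console.error);
```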
......@@ -161,37 +184,44 @@ export namespace AzureStorageClientUtility {
* @param azureShare
* @param localDirectory
*/
export async function downloadDirectory(fileServerClient: any, azureDirectory:any, azureShare: any, localDirectory: any): Promise<void>{
export async function downloadDirectory(fileServerClient: any, azureDirectory: string, azureShare: any, localDirectory: string):
Promise<void> {
const deferred: Deferred<void> = new Deferred<void>();
mkDirP(localDirectory);
fileServerClient.listFilesAndDirectoriesSegmented(azureShare, azureDirectory, 'null', function(error: any, result: any, response: any) {
if(('entries' in result) === false){
getLogger().error(`list files failed, can't get entries in result`);
await mkDirP(localDirectory);
fileServerClient.listFilesAndDirectoriesSegmented(azureShare, azureDirectory, 'null',
async (error: any, result: any, response: any) => {
if (('entries' in result) === false) {
getLogger()
.error(`list files failed, can't get entries in result`);
throw new Error(`list files failed, can't get entries in result`);
}
if(('files' in result['entries']) === false){
getLogger().error(`list files failed, can't get files in result['entries']`);
if (('files' in result.entries) === false) {
getLogger()
.error(`list files failed, can't get files in result['entries']`);
throw new Error(`list files failed, can't get files in result['entries']`);
}
if(('directories' in result['directories']) === false){
getLogger().error(`list files failed, can't get directories in result['entries']`);
if (('directories' in result.directories) === false) {
getLogger()
.error(`list files failed, can't get directories in result['entries']`);
throw new Error(`list files failed, can't get directories in result['entries']`);
}
for(var fileName of result['entries']['files']){
for (const fileName of result.entries.files) {
const fullFilePath: string = path.join(localDirectory, fileName.name);
downloadFile(fileServerClient, azureDirectory, fileName.name, azureShare, fullFilePath)
await downloadFile(fileServerClient, azureDirectory, fileName.name, azureShare, fullFilePath);
}
for(var directoryName of result['entries']['directories']){
const fullDirectoryPath: string = path.join(localDirectory, directoryName.name)
const fullAzureDirectory: string = path.join(azureDirectory, directoryName.name)
downloadDirectory(fileServerClient, fullAzureDirectory, azureShare, fullDirectoryPath)
for (const directoryName of result.entries.directories) {
const fullDirectoryPath: string = path.join(localDirectory, directoryName.name);
const fullAzureDirectory: string = path.join(azureDirectory, directoryName.name);
await downloadDirectory(fileServerClient, fullAzureDirectory, azureShare, fullDirectoryPath);
}
deferred.resolve();
})
});
return deferred.promise;
}
}
// tslint:enable: no-redundant-jsdoc no-any no-unsafe-any
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
......@@ -20,21 +21,29 @@
'use strict';
import * as fs from 'fs';
import { KubernetesCRDClient, GeneralK8sClient } from '../kubernetesApiClient';
import { GeneralK8sClient, KubernetesCRDClient } from '../kubernetesApiClient';
abstract class FrameworkControllerClient extends KubernetesCRDClient{
/**
* FrameworkController Client
*/
abstract class FrameworkControllerClient extends KubernetesCRDClient {
/**
* Factory method to generate operator cliet
* Factory method to generate operator client
*/
// tslint:disable-next-line:function-name
public static generateFrameworkControllerClient(): KubernetesCRDClient {
return new FrameworkControllerClientV1();
}
}
/**
* FrameworkController ClientV1
*/
class FrameworkControllerClientV1 extends FrameworkControllerClient {
/**
* constructor, to initialize frameworkcontroller CRD definition
*/
// tslint:disable: no-unsafe-any no-any
public constructor() {
super();
this.crdSchema = JSON.parse(fs.readFileSync('./config/frameworkcontroller/frameworkcontrollerjob-crd-v1.json', 'utf8'));
......@@ -42,8 +51,9 @@ class FrameworkControllerClientV1 extends FrameworkControllerClient {
}
protected get operator(): any {
return this.client.apis["frameworkcontroller.microsoft.com"].v1.namespaces('default').frameworks;
return this.client.apis['frameworkcontroller.microsoft.com'].v1.namespaces('default').frameworks;
}
// tslint:enable: no-unsafe-any no-any
public get containerName(): string {
return 'framework';
......@@ -51,4 +61,3 @@ class FrameworkControllerClientV1 extends FrameworkControllerClient {
}
export { FrameworkControllerClient, GeneralK8sClient };
......@@ -20,10 +20,11 @@
'use strict';
import * as assert from 'assert';
import { KubernetesTrialConfig, KubernetesTrialConfigTemplate, KubernetesClusterConfigAzure,
KubernetesClusterConfigNFS, NFSConfig, KubernetesStorageKind, keyVaultConfig, AzureStorage, KubernetesClusterConfig,
StorageConfig } from '../kubernetesConfig'
import { AzureStorage, KeyVaultConfig, KubernetesClusterConfig, KubernetesClusterConfigAzure, KubernetesClusterConfigNFS,
KubernetesStorageKind, KubernetesTrialConfig, KubernetesTrialConfigTemplate, NFSConfig, StorageConfig
} from '../kubernetesConfig';
// tslint:disable:completed-docs
export class FrameworkAttemptCompletionPolicy {
public readonly minFailedTaskCount: number;
public readonly minSucceededTaskCount: number;
......@@ -36,7 +37,7 @@ export class FrameworkAttemptCompletionPolicy {
/**
* Trial job configuration for FrameworkController
*/
export class FrameworkControllerTrialConfigTemplate extends KubernetesTrialConfigTemplate{
export class FrameworkControllerTrialConfigTemplate extends KubernetesTrialConfigTemplate {
public readonly frameworkAttemptCompletionPolicy: FrameworkAttemptCompletionPolicy;
public readonly name: string;
public readonly taskNum: number;
......@@ -50,7 +51,7 @@ export class FrameworkControllerTrialConfigTemplate extends KubernetesTrialConfi
}
}
export class FrameworkControllerTrialConfig extends KubernetesTrialConfig{
export class FrameworkControllerTrialConfig extends KubernetesTrialConfig {
public readonly taskRoles: FrameworkControllerTrialConfigTemplate[];
public readonly codeDir: string;
constructor(codeDir: string, taskRoles: FrameworkControllerTrialConfigTemplate[]) {
......@@ -68,6 +69,7 @@ export class FrameworkControllerClusterConfig extends KubernetesClusterConfig {
}
}
// tslint:disable:function-name
export class FrameworkControllerClusterConfigNFS extends KubernetesClusterConfigNFS {
public readonly serviceAccountName: string;
constructor(
......@@ -81,8 +83,9 @@ export class FrameworkControllerClusterConfigNFS extends KubernetesClusterConfig
}
public static getInstance(jsonObject: object): FrameworkControllerClusterConfigNFS {
let kubeflowClusterConfigObjectNFS = <FrameworkControllerClusterConfigNFS>jsonObject;
assert (kubeflowClusterConfigObjectNFS !== undefined)
const kubeflowClusterConfigObjectNFS: FrameworkControllerClusterConfigNFS = <FrameworkControllerClusterConfigNFS>jsonObject;
assert (kubeflowClusterConfigObjectNFS !== undefined);
return new FrameworkControllerClusterConfigNFS(
kubeflowClusterConfigObjectNFS.serviceAccountName,
kubeflowClusterConfigObjectNFS.apiVersion,
......@@ -98,16 +101,17 @@ export class FrameworkControllerClusterConfigAzure extends KubernetesClusterConf
constructor(
serviceAccountName: string,
apiVersion: string,
keyVault: keyVaultConfig,
keyVault: KeyVaultConfig,
azureStorage: AzureStorage,
storage?: KubernetesStorageKind
) {
super(apiVersion, keyVault, azureStorage,storage);
super(apiVersion, keyVault, azureStorage, storage);
this.serviceAccountName = serviceAccountName;
}
public static getInstance(jsonObject: object): FrameworkControllerClusterConfigAzure {
let kubeflowClusterConfigObjectAzure = <FrameworkControllerClusterConfigAzure>jsonObject;
const kubeflowClusterConfigObjectAzure: FrameworkControllerClusterConfigAzure = <FrameworkControllerClusterConfigAzure>jsonObject;
return new FrameworkControllerClusterConfigAzure(
kubeflowClusterConfigObjectAzure.serviceAccountName,
kubeflowClusterConfigObjectAzure.apiVersion,
......@@ -121,11 +125,11 @@ export class FrameworkControllerClusterConfigAzure extends KubernetesClusterConf
export class FrameworkControllerClusterConfigFactory {
public static generateFrameworkControllerClusterConfig(jsonObject: object): FrameworkControllerClusterConfig {
let storageConfig = <StorageConfig>jsonObject;
if(!storageConfig) {
throw new Error("Invalid json object as a StorageConfig instance");
const storageConfig: StorageConfig = <StorageConfig>jsonObject;
if (storageConfig === undefined) {
throw new Error('Invalid json object as a StorageConfig instance');
}
if(storageConfig.storage && storageConfig.storage === 'azureStorage') {
if (storageConfig.storage !== undefined && storageConfig.storage === 'azureStorage') {
return FrameworkControllerClusterConfigAzure.getInstance(jsonObject);
} else if (storageConfig.storage === undefined || storageConfig.storage === 'nfs') {
return FrameworkControllerClusterConfigNFS.getInstance(jsonObject);
......@@ -134,6 +138,7 @@ export class FrameworkControllerClusterConfigFactory {
}
}
export type FrameworkControllerJobStatus = 'AttemptRunning' | 'Completed' | 'AttemptCreationPending' | 'AttemptCreationRequested' | 'AttemptPreparing' | 'AttemptCompleted';
export type FrameworkControllerJobStatus =
'AttemptRunning' | 'Completed' | 'AttemptCreationPending' | 'AttemptCreationRequested' | 'AttemptPreparing' | 'AttemptCompleted';
export type FrameworkControllerJobCompleteStatus = 'Succeeded' | 'Failed';
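Both cluster config factories in this file dispatch on the optional `storage` field: `'azureStorage'` selects the Azure config, while `'nfs'` or a missing value selects the NFS config. A simplified, self-contained sketch of that dispatch follows; the interface and return values are stand-ins for the real config classes.

```typescript
// Sketch of the storage-based dispatch used by the cluster config factories above.
type KubernetesStorageKind = 'nfs' | 'azureStorage';

interface StorageConfigLike {
    storage?: KubernetesStorageKind;
}

function pickClusterConfigKind(jsonObject: object): 'azure' | 'nfs' {
    const storageConfig: StorageConfigLike = <StorageConfigLike>jsonObject;
    if (storageConfig === undefined) {
        throw new Error('Invalid json object as a StorageConfig instance');
    }
    if (storageConfig.storage === 'azureStorage') {
        return 'azure';
    } else if (storageConfig.storage === undefined || storageConfig.storage === 'nfs') {
        return 'nfs';
    }
    throw new Error(`Invalid storage type ${storageConfig.storage}`);
}

console.log(pickClusterConfigKind({ storage: 'azureStorage' }));  // azure
console.log(pickClusterConfigKind({}));                           // nfs
```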
......@@ -19,15 +19,15 @@
'use strict';
import { KubernetesTrialJobDetail} from '../kubernetesData';
import { KubernetesCRDClient } from '../kubernetesApiClient';
import { KubernetesTrialJobDetail} from '../kubernetesData';
import { KubernetesJobInfoCollector } from '../kubernetesJobInfoCollector';
import { FrameworkControllerJobStatus, FrameworkControllerJobCompleteStatus } from './frameworkcontrollerConfig';
import { FrameworkControllerJobCompleteStatus, FrameworkControllerJobStatus } from './frameworkcontrollerConfig';
/**
* Collector frameworkcontroller jobs info from Kubernetes cluster, and update frameworkcontroller job status locally
*/
export class FrameworkControllerJobInfoCollector extends KubernetesJobInfoCollector{
export class FrameworkControllerJobInfoCollector extends KubernetesJobInfoCollector {
constructor(jobMap: Map<string, KubernetesTrialJobDetail>) {
super(jobMap);
}
......@@ -38,47 +38,55 @@ export class FrameworkControllerJobInfoCollector extends KubernetesJobInfoCollec
return Promise.resolve();
}
if(kubernetesCRDClient === undefined) {
if (kubernetesCRDClient === undefined) {
return Promise.reject('kubernetesCRDClient is undefined');
}
// tslint:disable-next-line:no-any
let kubernetesJobInfo: any;
try {
kubernetesJobInfo = await kubernetesCRDClient.getKubernetesJob(kubernetesTrialJob.kubernetesJobName);
} catch(error) {
} catch (error) {
this.log.error(`Get job ${kubernetesTrialJob.kubernetesJobName} info failed, error is ${error}`);
//This is not treat as a error status
return Promise.resolve();
}
if(kubernetesJobInfo.status && kubernetesJobInfo.status.state) {
// tslint:disable: no-unsafe-any
if (kubernetesJobInfo.status && kubernetesJobInfo.status.state) {
const frameworkJobType: FrameworkControllerJobStatus = <FrameworkControllerJobStatus>kubernetesJobInfo.status.state;
switch(frameworkJobType) {
case 'AttemptCreationPending' || 'AttemptCreationRequested' || 'AttemptPreparing':
switch (frameworkJobType) {
case 'AttemptCreationPending':
case 'AttemptCreationRequested':
case 'AttemptPreparing':
kubernetesTrialJob.status = 'WAITING';
break;
case 'AttemptRunning':
kubernetesTrialJob.status = 'RUNNING';
if(!kubernetesTrialJob.startTime) {
if (kubernetesTrialJob.startTime === undefined) {
kubernetesTrialJob.startTime = Date.parse(<string>kubernetesJobInfo.status.startTime);
}
break;
case 'Completed':
const completedJobType : FrameworkControllerJobCompleteStatus = <FrameworkControllerJobCompleteStatus>kubernetesJobInfo.status.attemptStatus.completionStatus.type.name;
switch(completedJobType) {
const completedJobType : FrameworkControllerJobCompleteStatus =
<FrameworkControllerJobCompleteStatus>kubernetesJobInfo.status.attemptStatus.completionStatus.type.name;
switch (completedJobType) {
case 'Succeeded':
kubernetesTrialJob.status = 'SUCCEEDED';
break;
case 'Failed':
kubernetesTrialJob.status = 'FAILED';
break;
default:
}
kubernetesTrialJob.endTime = Date.parse(<string>kubernetesJobInfo.status.completionTime);
break;
default:
break;
}
}
return Promise.resolve();
}
// tslint:enable: no-unsafe-any
}
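The collector above maps FrameworkController job states onto NNI trial states; the diff replaces the invalid `case 'A' || 'B' || 'C':` expression with explicit fall-through cases. A condensed sketch of the resulting mapping, using simplified status unions rather than the real NNI types:

```typescript
// Sketch of the FrameworkController-to-trial status mapping above,
// with the pending/requested/preparing states sharing one branch via fall-through.
type FrameworkControllerJobStatus =
    'AttemptRunning' | 'Completed' | 'AttemptCreationPending' | 'AttemptCreationRequested' | 'AttemptPreparing' | 'AttemptCompleted';
type SimplifiedTrialStatus = 'WAITING' | 'RUNNING' | 'SUCCEEDED' | 'FAILED' | 'UNKNOWN';

function mapFrameworkState(state: FrameworkControllerJobStatus, completionType?: string): SimplifiedTrialStatus {
    switch (state) {
        case 'AttemptCreationPending':
        case 'AttemptCreationRequested':
        case 'AttemptPreparing':
            return 'WAITING';
        case 'AttemptRunning':
            return 'RUNNING';
        case 'Completed':
            return completionType === 'Succeeded' ? 'SUCCEEDED' : 'FAILED';
        default:
            return 'UNKNOWN';
    }
}

console.log(mapFrameworkState('AttemptCreationRequested'));   // WAITING
console.log(mapFrameworkState('Completed', 'Succeeded'));     // SUCCEEDED
```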
......@@ -20,15 +20,15 @@
'use strict';
import * as component from '../../../common/component';
import { KubernetesJobRestServer } from '../kubernetesJobRestServer';
import { FrameworkControllerTrainingService } from './frameworkcontrollerTrainingService';
import { KubernetesJobRestServer } from '../kubernetesJobRestServer'
/**
* frameworkcontroller Training service Rest server, provides rest API to support frameworkcontroller job metrics update
*
*/
@component.Singleton
export class FrameworkControllerJobRestServer extends KubernetesJobRestServer{
export class FrameworkControllerJobRestServer extends KubernetesJobRestServer {
constructor() {
super(component.get(FrameworkControllerTrainingService));
}
......
......@@ -20,18 +20,22 @@
'use strict';
import * as fs from 'fs';
import { GeneralK8sClient, KubernetesCRDClient } from '../kubernetesApiClient';
import { KubeflowOperator } from './kubeflowConfig';
import { KubernetesCRDClient, GeneralK8sClient } from '../kubernetesApiClient';
abstract class KubeflowOperatorClient extends KubernetesCRDClient{
/**
* KubeflowOperator Client
*/
abstract class KubeflowOperatorClient extends KubernetesCRDClient {
/**
* Factory method to generate operator cliet
* Factory method to generate operator client
*/
// tslint:disable-next-line:function-name
public static generateOperatorClient(kubeflowOperator: KubeflowOperator,
operatorApiVersion: string): KubernetesCRDClient {
switch(kubeflowOperator) {
switch (kubeflowOperator) {
case 'tf-operator': {
switch(operatorApiVersion) {
switch (operatorApiVersion) {
case 'v1alpha2': {
return new TFOperatorClientV1Alpha2();
}
......@@ -41,11 +45,12 @@ abstract class KubeflowOperatorClient extends KubernetesCRDClient{
case 'v1beta2': {
return new TFOperatorClientV1Beta2();
}
default:
throw new Error(`Invalid tf-operator apiVersion ${operatorApiVersion}`);
}
break;
}
case 'pytorch-operator': {
switch(operatorApiVersion) {
switch (operatorApiVersion) {
case 'v1alpha2': {
return new PyTorchOperatorClientV1Alpha2();
}
......@@ -55,13 +60,17 @@ abstract class KubeflowOperatorClient extends KubernetesCRDClient{
case 'v1beta2': {
return new PyTorchOperatorClientV1Beta2();
}
default:
throw new Error(`Invalid pytorch-operator apiVersion ${operatorApiVersion}`);
}
}
default:
throw new Error(`Invalid operator ${kubeflowOperator}`);
}
throw new Error(`Invalid operator ${kubeflowOperator} or apiVersion ${operatorApiVersion}`);
}
}
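The factory above picks a concrete CRD client from the operator name and its API version via nested switches, now with explicit `default` branches for unsupported versions. A self-contained sketch of the same dispatch, returning class names as strings instead of instantiating the clients:

```typescript
// Sketch of the nested operator/apiVersion dispatch above; strings stand in for client classes.
type KubeflowOperator = 'tf-operator' | 'pytorch-operator';
type OperatorApiVersion = 'v1alpha2' | 'v1beta1' | 'v1beta2';

function pickOperatorClient(operator: KubeflowOperator, apiVersion: OperatorApiVersion): string {
    switch (operator) {
        case 'tf-operator':
            switch (apiVersion) {
                case 'v1alpha2': return 'TFOperatorClientV1Alpha2';
                case 'v1beta1': return 'TFOperatorClientV1Beta1';
                case 'v1beta2': return 'TFOperatorClientV1Beta2';
                default: throw new Error(`Invalid tf-operator apiVersion ${apiVersion}`);
            }
        case 'pytorch-operator':
            switch (apiVersion) {
                case 'v1alpha2': return 'PyTorchOperatorClientV1Alpha2';
                case 'v1beta1': return 'PyTorchOperatorClientV1Beta1';
                case 'v1beta2': return 'PyTorchOperatorClientV1Beta2';
                default: throw new Error(`Invalid pytorch-operator apiVersion ${apiVersion}`);
            }
        default:
            throw new Error(`Invalid operator ${operator}`);
    }
}

console.log(pickOperatorClient('tf-operator', 'v1beta2'));  // TFOperatorClientV1Beta2
```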
// tslint:disable: no-unsafe-any no-any completed-docs
class TFOperatorClientV1Alpha2 extends KubeflowOperatorClient {
/**
* constructor, to initialize tfjob CRD definition
......@@ -73,7 +82,7 @@ class TFOperatorClientV1Alpha2 extends KubeflowOperatorClient {
}
protected get operator(): any {
return this.client.apis["kubeflow.org"].v1alpha2.namespaces('default').tfjobs;
return this.client.apis['kubeflow.org'].v1alpha2.namespaces('default').tfjobs;
}
public get containerName(): string {
......@@ -92,7 +101,7 @@ class TFOperatorClientV1Beta1 extends KubernetesCRDClient {
}
protected get operator(): any {
return this.client.apis["kubeflow.org"].v1beta1.namespaces('default').tfjobs;
return this.client.apis['kubeflow.org'].v1beta1.namespaces('default').tfjobs;
}
public get containerName(): string {
......@@ -111,7 +120,7 @@ class TFOperatorClientV1Beta2 extends KubernetesCRDClient {
}
protected get operator(): any {
return this.client.apis["kubeflow.org"].v1beta2.namespaces('default').tfjobs;
return this.client.apis['kubeflow.org'].v1beta2.namespaces('default').tfjobs;
}
public get containerName(): string {
......@@ -130,7 +139,7 @@ class PyTorchOperatorClientV1Alpha2 extends KubeflowOperatorClient {
}
protected get operator(): any {
return this.client.apis["kubeflow.org"].v1alpha2.namespaces('default').pytorchjobs;
return this.client.apis['kubeflow.org'].v1alpha2.namespaces('default').pytorchjobs;
}
public get containerName(): string {
......@@ -149,7 +158,7 @@ class PyTorchOperatorClientV1Beta1 extends KubernetesCRDClient {
}
protected get operator(): any {
return this.client.apis["kubeflow.org"].v1beta1.namespaces('default').pytorchjobs;
return this.client.apis['kubeflow.org'].v1beta1.namespaces('default').pytorchjobs;
}
public get containerName(): string {
......@@ -168,7 +177,7 @@ class PyTorchOperatorClientV1Beta2 extends KubernetesCRDClient {
}
protected get operator(): any {
return this.client.apis["kubeflow.org"].v1beta2.namespaces('default').pytorchjobs;
return this.client.apis['kubeflow.org'].v1beta2.namespaces('default').pytorchjobs;
}
public get containerName(): string {
......@@ -176,5 +185,5 @@ class PyTorchOperatorClientV1Beta2 extends KubernetesCRDClient {
}
}
// tslint:enable: no-unsafe-any
export { KubeflowOperatorClient, GeneralK8sClient };
......@@ -20,16 +20,20 @@
'use strict';
import * as assert from 'assert';
import { KubernetesClusterConfigAzure, KubernetesClusterConfigNFS, KubernetesStorageKind, NFSConfig, AzureStorage, keyVaultConfig,
KubernetesTrialConfig, KubernetesTrialConfigTemplate, StorageConfig, KubernetesClusterConfig } from '../kubernetesConfig'
import { MethodNotImplementedError } from '../../../common/errors';
import { AzureStorage, KeyVaultConfig, KubernetesClusterConfig, KubernetesClusterConfigAzure, KubernetesClusterConfigNFS,
KubernetesStorageKind, KubernetesTrialConfig, KubernetesTrialConfigTemplate, NFSConfig, StorageConfig
} from '../kubernetesConfig';
/** operator types that kubeflow supported */
// operator types that kubeflow supported
export type KubeflowOperator = 'tf-operator' | 'pytorch-operator' ;
export type DistTrainRole = 'worker' | 'ps' | 'master';
export type KubeflowJobStatus = 'Created' | 'Running' | 'Failed' | 'Succeeded';
export type OperatorApiVersion = 'v1alpha2' | 'v1beta1' | 'v1beta2';
/**
* Kubeflow Cluster Configuration
*/
export class KubeflowClusterConfig extends KubernetesClusterConfig {
public readonly operator: KubeflowOperator;
constructor(apiVersion: string, operator: KubeflowOperator) {
......@@ -38,6 +42,7 @@ export class KubeflowClusterConfig extends KubernetesClusterConfig {
}
}
// tslint:disable:completed-docs
export class KubeflowClusterConfigNFS extends KubernetesClusterConfigNFS {
public readonly operator: KubeflowOperator;
constructor(
......@@ -54,9 +59,11 @@ export class KubeflowClusterConfigNFS extends KubernetesClusterConfigNFS {
return 'nfs';
}
// tslint:disable-next-line:function-name
public static getInstance(jsonObject: object): KubeflowClusterConfigNFS {
let kubeflowClusterConfigObjectNFS = <KubeflowClusterConfigNFS>jsonObject;
assert (kubeflowClusterConfigObjectNFS !== undefined)
const kubeflowClusterConfigObjectNFS: KubeflowClusterConfigNFS = <KubeflowClusterConfigNFS>jsonObject;
assert (kubeflowClusterConfigObjectNFS !== undefined);
return new KubeflowClusterConfigNFS(
kubeflowClusterConfigObjectNFS.operator,
kubeflowClusterConfigObjectNFS.apiVersion,
......@@ -66,26 +73,28 @@ export class KubeflowClusterConfigNFS extends KubernetesClusterConfigNFS {
}
}
export class KubeflowClusterConfigAzure extends KubernetesClusterConfigAzure{
export class KubeflowClusterConfigAzure extends KubernetesClusterConfigAzure {
public readonly operator: KubeflowOperator;
constructor(
operator: KubeflowOperator,
apiVersion: string,
keyVault: keyVaultConfig,
keyVault: KeyVaultConfig,
azureStorage: AzureStorage,
storage?: KubernetesStorageKind
) {
super(apiVersion, keyVault, azureStorage,storage);
super(apiVersion, keyVault, azureStorage, storage);
this.operator = operator;
}
public get storageType(): KubernetesStorageKind{
public get storageType(): KubernetesStorageKind {
return 'azureStorage';
}
// tslint:disable-next-line:function-name
public static getInstance(jsonObject: object): KubeflowClusterConfigAzure {
let kubeflowClusterConfigObjectAzure = <KubeflowClusterConfigAzure>jsonObject;
const kubeflowClusterConfigObjectAzure: KubeflowClusterConfigAzure = <KubeflowClusterConfigAzure>jsonObject;
return new KubeflowClusterConfigAzure(
kubeflowClusterConfigObjectAzure.operator,
kubeflowClusterConfigObjectAzure.apiVersion,
......@@ -98,12 +107,13 @@ export class KubeflowClusterConfigAzure extends KubernetesClusterConfigAzure{
export class KubeflowClusterConfigFactory {
// tslint:disable-next-line:function-name
public static generateKubeflowClusterConfig(jsonObject: object): KubeflowClusterConfig {
let storageConfig = <StorageConfig>jsonObject;
if(!storageConfig) {
throw new Error("Invalid json object as a StorageConfig instance");
const storageConfig: StorageConfig = <StorageConfig>jsonObject;
if (storageConfig === undefined) {
throw new Error('Invalid json object as a StorageConfig instance');
}
if(storageConfig.storage && storageConfig.storage === 'azureStorage') {
if (storageConfig.storage !== undefined && storageConfig.storage === 'azureStorage') {
return KubeflowClusterConfigAzure.getInstance(jsonObject);
} else if (storageConfig.storage === undefined || storageConfig.storage === 'nfs') {
return KubeflowClusterConfigNFS.getInstance(jsonObject);
......@@ -122,7 +132,7 @@ export class KubeflowTrialConfig extends KubernetesTrialConfig {
}
}
export class KubeflowTrialConfigTemplate extends KubernetesTrialConfigTemplate{
export class KubeflowTrialConfigTemplate extends KubernetesTrialConfigTemplate {
public readonly replicas: number;
constructor(replicas: number, command : string, gpuNum : number,
cpuNum: number, memoryMB: number, image: string) {
......@@ -163,16 +173,19 @@ export class KubeflowTrialConfigPytorch extends KubeflowTrialConfig {
export class KubeflowTrialConfigFactory {
// tslint:disable-next-line:function-name
public static generateKubeflowTrialConfig(jsonObject: object, operator: KubeflowOperator): KubeflowTrialConfig {
if(operator === 'tf-operator'){
let kubeflowTrialConfigObject = <KubeflowTrialConfigTensorflow>jsonObject;
if (operator === 'tf-operator') {
const kubeflowTrialConfigObject: KubeflowTrialConfigTensorflow = <KubeflowTrialConfigTensorflow>jsonObject;
return new KubeflowTrialConfigTensorflow(
kubeflowTrialConfigObject.codeDir,
kubeflowTrialConfigObject.worker,
kubeflowTrialConfigObject.ps
);
}else if(operator === 'pytorch-operator'){
let kubeflowTrialConfigObject = <KubeflowTrialConfigPytorch>jsonObject;
} else if (operator === 'pytorch-operator') {
const kubeflowTrialConfigObject: KubeflowTrialConfigPytorch = <KubeflowTrialConfigPytorch>jsonObject;
return new KubeflowTrialConfigPytorch(
kubeflowTrialConfigObject.codeDir,
kubeflowTrialConfigObject.master,
......
......@@ -19,15 +19,15 @@
'use strict';
import { KubernetesTrialJobDetail} from '../kubernetesData';
import { KubernetesCRDClient } from '../kubernetesApiClient';
import { KubernetesTrialJobDetail} from '../kubernetesData';
import { KubernetesJobInfoCollector } from '../kubernetesJobInfoCollector';
import { KubeflowJobStatus } from './kubeflowConfig';
/**
* Collector Kubeflow jobs info from Kubernetes cluster, and update kubeflow job status locally
*/
export class KubeflowJobInfoCollector extends KubernetesJobInfoCollector{
export class KubeflowJobInfoCollector extends KubernetesJobInfoCollector {
constructor(jobMap: Map<string, KubernetesTrialJobDetail>) {
super(jobMap);
}
......@@ -38,31 +38,33 @@ export class KubeflowJobInfoCollector extends KubernetesJobInfoCollector{
return Promise.resolve();
}
if(kubernetesCRDClient === undefined) {
if (kubernetesCRDClient === undefined) {
return Promise.reject('kubernetesCRDClient is undefined');
}
// tslint:disable:no-any no-unsafe-any
let kubernetesJobInfo: any;
try {
kubernetesJobInfo = await kubernetesCRDClient.getKubernetesJob(kubernetesTrialJob.kubernetesJobName);
} catch(error) {
} catch (error) {
// Notice: it maynot be a 'real' error since cancel trial job can also cause getKubernetesJob failed.
this.log.error(`Get job ${kubernetesTrialJob.kubernetesJobName} info failed, error is ${error}`);
//This is not treat as a error status
return Promise.resolve();
}
if(kubernetesJobInfo.status && kubernetesJobInfo.status.conditions) {
const latestCondition = kubernetesJobInfo.status.conditions[kubernetesJobInfo.status.conditions.length - 1];
if (kubernetesJobInfo.status && kubernetesJobInfo.status.conditions) {
const latestCondition: any = kubernetesJobInfo.status.conditions[kubernetesJobInfo.status.conditions.length - 1];
const tfJobType : KubeflowJobStatus = <KubeflowJobStatus>latestCondition.type;
switch(tfJobType) {
switch (tfJobType) {
case 'Created':
kubernetesTrialJob.status = 'WAITING';
kubernetesTrialJob.startTime = Date.parse(<string>latestCondition.lastUpdateTime);
break;
case 'Running':
kubernetesTrialJob.status = 'RUNNING';
if(!kubernetesTrialJob.startTime) {
if (kubernetesTrialJob.startTime === undefined) {
kubernetesTrialJob.startTime = Date.parse(<string>latestCondition.lastUpdateTime);
}
break;
......@@ -75,9 +77,10 @@ export class KubeflowJobInfoCollector extends KubernetesJobInfoCollector{
kubernetesTrialJob.endTime = Date.parse(<string>latestCondition.lastUpdateTime);
break;
default:
break;
}
}
// tslint:enable:no-any no-unsafe-any
return Promise.resolve();
}
}
......@@ -20,15 +20,15 @@
'use strict';
import * as component from '../../../common/component';
import { KubernetesJobRestServer } from '../kubernetesJobRestServer';
import { KubeflowTrainingService } from './kubeflowTrainingService';
import { KubernetesJobRestServer } from '../kubernetesJobRestServer'
/**
* Kubeflow Training service Rest server, provides rest API to support kubeflow job metrics update
*
*/
@component.Singleton
export class KubeflowJobRestServer extends KubernetesJobRestServer{
export class KubeflowJobRestServer extends KubernetesJobRestServer {
/**
* constructor to provide NNIRestServer's own rest property, e.g. port
*/
......