Unverified Commit d8e55165 authored by fishyds's avatar fishyds Committed by GitHub
Browse files

[Kubeflow training service] Use Kubernetes API server to replace kubectl dependency (#472)

[Kubeflow training service] Use Kubernetes API server to replace kubectl dependency
parent 07e19a30
...@@ -24,6 +24,8 @@ trial: ...@@ -24,6 +24,8 @@ trial:
image: msranni/nni:latest image: msranni/nni:latest
kubeflowConfig: kubeflowConfig:
operator: tf-operator operator: tf-operator
apiVersion: v1alpha2
storage: nfs
nfs: nfs:
server: 10.10.10.10 server: 10.10.10.10
path: /var/nfs/general path: /var/nfs/general
\ No newline at end of file
...@@ -37,6 +37,8 @@ trial: ...@@ -37,6 +37,8 @@ trial:
image: msranni/nni:latest image: msranni/nni:latest
kubeflowConfig: kubeflowConfig:
operator: tf-operator operator: tf-operator
apiVersion: v1alpha2
storage: nfs
nfs: nfs:
# Your NFS server IP, like 10.10.10.10 # Your NFS server IP, like 10.10.10.10
server: {your_nfs_server_ip} server: {your_nfs_server_ip}
......
...@@ -24,6 +24,8 @@ trial: ...@@ -24,6 +24,8 @@ trial:
image: msranni/nni:latest image: msranni/nni:latest
kubeflowConfig: kubeflowConfig:
operator: tf-operator operator: tf-operator
apiVersion: v1alpha2
storage: nfs
nfs: nfs:
server: 10.10.10.10 server: 10.10.10.10
path: /var/nfs/general path: /var/nfs/general
\ No newline at end of file
...@@ -25,6 +25,8 @@ trial: ...@@ -25,6 +25,8 @@ trial:
image: msranni/nni:latest image: msranni/nni:latest
kubeflowConfig: kubeflowConfig:
operator: tf-operator operator: tf-operator
apiVersion: v1alpha2
storage: nfs
nfs: nfs:
server: 10.10.10.10 server: 10.10.10.10
path: /var/nfs/general path: /var/nfs/general
\ No newline at end of file
{
"kind": "CustomResourceDefinition",
"spec": {
"scope": "Namespaced",
"version": "v1alpha2",
"group": "kubeflow.org",
"names": {
"kind": "PyTorchJob",
"plural": "pytorchjobs",
"singular": "pytorchjob"
}
},
"apiVersion": "apiextensions.k8s.io/v1beta1",
"metadata": {
"name": "pytorchjobs.kubeflow.org"
}
}
{
"kind": "CustomResourceDefinition",
"spec": {
"scope": "Namespaced",
"version": "v1beta1",
"group": "kubeflow.org",
"names": {
"kind": "PyTorchJob",
"plural": "pytorchjobs",
"singular": "pytorchjob"
}
},
"apiVersion": "apiextensions.k8s.io/v1beta1",
"metadata": {
"name": "pytorchjobs.kubeflow.org"
}
}
{
"kind": "CustomResourceDefinition",
"spec": {
"scope": "Namespaced",
"version": "v1alpha2",
"group": "kubeflow.org",
"names": {
"kind": "TFJob",
"plural": "tfjobs",
"singular": "tfjob"
}
},
"apiVersion": "apiextensions.k8s.io/v1beta1",
"metadata": {
"name": "tfjobs.kubeflow.org"
}
}
{
"kind": "CustomResourceDefinition",
"spec": {
"scope": "Namespaced",
"version": "v1beta1",
"group": "kubeflow.org",
"names": {
"kind": "TFJob",
"plural": "tfjobs",
"singular": "tfjob"
}
},
"apiVersion": "apiextensions.k8s.io/v1beta1",
"metadata": {
"name": "tfjobs.kubeflow.org"
}
}
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
"version": "1.0.0", "version": "1.0.0",
"main": "index.js", "main": "index.js",
"scripts": { "scripts": {
"postbuild": "cp -rf scripts ./dist/", "postbuild": "cp -rf scripts ./dist/ && cp -rf config ./dist/",
"build": "tsc", "build": "tsc",
"test": "mocha -r ts-node/register -t 15000 --recursive **/*.test.ts --colors", "test": "mocha -r ts-node/register -t 15000 --recursive **/*.test.ts --colors",
"start": "node dist/main.js" "start": "node dist/main.js"
...@@ -15,7 +15,6 @@ ...@@ -15,7 +15,6 @@
"express": "^4.16.3", "express": "^4.16.3",
"express-joi-validator": "^2.0.0", "express-joi-validator": "^2.0.0",
"node-nvidia-smi": "^1.0.0", "node-nvidia-smi": "^1.0.0",
"node-yaml": "^3.1.1",
"rx": "^4.1.0", "rx": "^4.1.0",
"sqlite3": "^4.0.2", "sqlite3": "^4.0.2",
"ssh2": "^0.6.1", "ssh2": "^0.6.1",
...@@ -26,7 +25,8 @@ ...@@ -26,7 +25,8 @@
"typescript-ioc": "^1.2.4", "typescript-ioc": "^1.2.4",
"typescript-string-operations": "^1.3.1", "typescript-string-operations": "^1.3.1",
"webhdfs":"^1.2.0", "webhdfs":"^1.2.0",
"azure-storage": "^2.10.2" "azure-storage": "^2.10.2",
"kubernetes-client": "^6.5.0"
}, },
"devDependencies": { "devDependencies": {
"@types/chai": "^4.1.4", "@types/chai": "^4.1.4",
......
...@@ -78,6 +78,7 @@ export namespace ValidationSchemas { ...@@ -78,6 +78,7 @@ export namespace ValidationSchemas {
kubeflow_config: joi.object({ kubeflow_config: joi.object({
operator: joi.string().min(1).required(), operator: joi.string().min(1).required(),
storage: joi.string().min(1), storage: joi.string().min(1),
apiVersion: joi.string().min(1),
nfs: joi.object({ nfs: joi.object({
server: joi.string().min(1).required(), server: joi.string().min(1).required(),
path: joi.string().min(1).required() path: joi.string().min(1).required()
......
import { TrialConfig } from "../common/trialConfig";
/** /**
* Copyright (c) Microsoft Corporation * Copyright (c) Microsoft Corporation
* All rights reserved. * All rights reserved.
...@@ -21,28 +19,11 @@ import { TrialConfig } from "../common/trialConfig"; ...@@ -21,28 +19,11 @@ import { TrialConfig } from "../common/trialConfig";
'use strict'; 'use strict';
/** operator types that kubeflow supported */ /** operator types that kubeflow supported */
export type KubeflowOperator = 'tf-operator' | 'pytorch-operator' ; export type KubeflowOperator = 'tf-operator' | 'pytorch-operator' ;
export type KubeflowOperatorPlural = 'tfjobs' | 'pytorchjobs' ;
export type KubeflowOperatorJobKind = 'TFJob' | 'PyTorchJob';
export type KubeflowStorageKind = 'nfs' | 'azureStorage'; export type KubeflowStorageKind = 'nfs' | 'azureStorage';
export type DistTrainRole = 'worker' | 'ps' | 'master';
/** export type OperatorApiVersion = 'v1alpha2' | 'v1beta1';
* map from Kubeflow operator name to its plural name in K8S
*/
export const kubeflowOperatorMap : Map<KubeflowOperator, KubeflowOperatorPlural> = new Map<KubeflowOperator, KubeflowOperatorPlural>([
['tf-operator' , 'tfjobs'],
['pytorch-operator', 'pytorchjobs']
]);
/**
* map from Kubeflow operator name to its job kind name in K8S
*/
export const kubeflowOperatorJobKindMap : Map<KubeflowOperator, KubeflowOperatorJobKind> = new Map<KubeflowOperator, KubeflowOperatorJobKind>([
['tf-operator' , 'TFJob'],
['pytorch-operator', 'PyTorchJob']
]);
/** /**
* Kuberflow cluster configuration * Kuberflow cluster configuration
...@@ -51,7 +32,8 @@ export const kubeflowOperatorJobKindMap : Map<KubeflowOperator, KubeflowOperator ...@@ -51,7 +32,8 @@ export const kubeflowOperatorJobKindMap : Map<KubeflowOperator, KubeflowOperator
export class KubeflowClusterConfigBase { export class KubeflowClusterConfigBase {
/** Name of Kubeflow operator, like tf-operator */ /** Name of Kubeflow operator, like tf-operator */
public readonly operator: KubeflowOperator; public readonly operator: KubeflowOperator;
public readonly storage?: KubeflowStorageKind; public readonly apiVersion: OperatorApiVersion;
public readonly storage?: KubeflowStorageKind;
/** /**
* Constructor * Constructor
...@@ -59,8 +41,9 @@ export class KubeflowClusterConfigBase { ...@@ -59,8 +41,9 @@ export class KubeflowClusterConfigBase {
* @param passWord password of Kubeflow Cluster * @param passWord password of Kubeflow Cluster
* @param host Host IP of Kubeflow Cluster * @param host Host IP of Kubeflow Cluster
*/ */
constructor(operator: KubeflowOperator, storage?: KubeflowStorageKind) { constructor(operator: KubeflowOperator, apiVersion: OperatorApiVersion, storage?: KubeflowStorageKind) {
this.operator = operator; this.operator = operator;
this.apiVersion = apiVersion;
this.storage = storage; this.storage = storage;
} }
} }
...@@ -68,8 +51,10 @@ export class KubeflowClusterConfigBase { ...@@ -68,8 +51,10 @@ export class KubeflowClusterConfigBase {
export class KubeflowClusterConfigNFS extends KubeflowClusterConfigBase{ export class KubeflowClusterConfigNFS extends KubeflowClusterConfigBase{
public readonly nfs: NFSConfig; public readonly nfs: NFSConfig;
constructor(operator: KubeflowOperator, nfs: NFSConfig, storage?: KubeflowStorageKind) { constructor(operator: KubeflowOperator,
super(operator, storage) apiVersion: OperatorApiVersion,
nfs: NFSConfig, storage?: KubeflowStorageKind) {
super(operator, apiVersion, storage);
this.nfs = nfs; this.nfs = nfs;
} }
} }
...@@ -78,8 +63,12 @@ export class KubeflowClusterConfigAzure extends KubeflowClusterConfigBase{ ...@@ -78,8 +63,12 @@ export class KubeflowClusterConfigAzure extends KubeflowClusterConfigBase{
public readonly keyVault: keyVaultConfig; public readonly keyVault: keyVaultConfig;
public readonly azureStorage: AzureStorage; public readonly azureStorage: AzureStorage;
constructor(operator: KubeflowOperator, keyVault: keyVaultConfig, azureStorage: AzureStorage, storage?: KubeflowStorageKind) { constructor(operator: KubeflowOperator,
super(operator, storage) apiVersion: OperatorApiVersion,
keyVault: keyVaultConfig,
azureStorage: AzureStorage,
storage?: KubeflowStorageKind) {
super(operator, apiVersion, storage);
this.keyVault = keyVault; this.keyVault = keyVault;
this.azureStorage = azureStorage; this.azureStorage = azureStorage;
} }
...@@ -184,10 +173,10 @@ export class KubeflowTrialConfigTensorflow extends KubeflowTrialConfigBase{ ...@@ -184,10 +173,10 @@ export class KubeflowTrialConfigTensorflow extends KubeflowTrialConfigBase{
} }
export class KubeflowTrialConfigPytorch extends KubeflowTrialConfigBase{ export class KubeflowTrialConfigPytorch extends KubeflowTrialConfigBase{
public readonly master?: KubeflowTrialConfigTemplate; public readonly master: KubeflowTrialConfigTemplate;
public readonly worker: KubeflowTrialConfigTemplate; public readonly worker?: KubeflowTrialConfigTemplate;
constructor(codeDir: string, worker: KubeflowTrialConfigTemplate, master?: KubeflowTrialConfigTemplate) { constructor(codeDir: string, master: KubeflowTrialConfigTemplate, worker?: KubeflowTrialConfigTemplate) {
super(codeDir); super(codeDir);
this.master = master; this.master = master;
this.worker = worker; this.worker = worker;
......
...@@ -38,11 +38,10 @@ export class KubeflowTrialJobDetail implements TrialJobDetail { ...@@ -38,11 +38,10 @@ export class KubeflowTrialJobDetail implements TrialJobDetail {
public kubeflowJobName: string; public kubeflowJobName: string;
public sequenceId: number; public sequenceId: number;
public queryJobFailedCount: number; public queryJobFailedCount: number;
public k8sPluralName: string
constructor(id: string, status: TrialJobStatus, submitTime: number, constructor(id: string, status: TrialJobStatus, submitTime: number,
workingDirectory: string, form: JobApplicationForm, workingDirectory: string, form: JobApplicationForm,
kubeflowJobName: string, sequenceId: number, url: string, k8sPluralName: string) { kubeflowJobName: string, sequenceId: number, url: string) {
this.id = id; this.id = id;
this.status = status; this.status = status;
this.submitTime = submitTime; this.submitTime = submitTime;
...@@ -53,7 +52,6 @@ export class KubeflowTrialJobDetail implements TrialJobDetail { ...@@ -53,7 +52,6 @@ export class KubeflowTrialJobDetail implements TrialJobDetail {
this.tags = []; this.tags = [];
this.queryJobFailedCount = 0; this.queryJobFailedCount = 0;
this.url = url; this.url = url;
this.k8sPluralName = k8sPluralName;
} }
} }
......
...@@ -19,11 +19,13 @@ ...@@ -19,11 +19,13 @@
'use strict'; 'use strict';
import * as assert from 'assert';
import * as cpp from 'child-process-promise'; import * as cpp from 'child-process-promise';
import { getLogger, Logger } from '../../common/log'; import { getLogger, Logger } from '../../common/log';
import { KubeflowTrialJobDetail, KubeflowTFJobType} from './kubeflowData'; import { KubeflowTrialJobDetail, KubeflowTFJobType} from './kubeflowData';
import { NNIError, NNIErrorNames } from '../../common/errors'; import { NNIError, NNIErrorNames } from '../../common/errors';
import { TrialJobStatus } from '../../common/trainingService'; import { TrialJobStatus } from '../../common/trainingService';
import { KubeflowOperatorClient } from './kubernetesApiClient';
/** /**
* Collector Kubeflow jobs info from Kubernetes cluster, and update kubeflow job status locally * Collector Kubeflow jobs info from Kubernetes cluster, and update kubeflow job status locally
...@@ -32,14 +34,14 @@ export class KubeflowJobInfoCollector { ...@@ -32,14 +34,14 @@ export class KubeflowJobInfoCollector {
private readonly trialJobsMap : Map<string, KubeflowTrialJobDetail>; private readonly trialJobsMap : Map<string, KubeflowTrialJobDetail>;
private readonly log: Logger = getLogger(); private readonly log: Logger = getLogger();
private readonly statusesNeedToCheck: TrialJobStatus[]; private readonly statusesNeedToCheck: TrialJobStatus[];
private readonly MAX_FAILED_QUERY_JOB_NUMBER: number = 30;
constructor(jobMap: Map<string, KubeflowTrialJobDetail>) { constructor(jobMap: Map<string, KubeflowTrialJobDetail>) {
this.trialJobsMap = jobMap; this.trialJobsMap = jobMap;
this.statusesNeedToCheck = ['RUNNING', 'WAITING']; this.statusesNeedToCheck = ['RUNNING', 'WAITING'];
} }
public async retrieveTrialStatus() : Promise<void> { public async retrieveTrialStatus(operatorClient: KubeflowOperatorClient | undefined) : Promise<void> {
assert(operatorClient !== undefined);
const updateKubeflowTrialJobs : Promise<void>[] = []; const updateKubeflowTrialJobs : Promise<void>[] = [];
for(let [trialJobId, kubeflowTrialJob] of this.trialJobsMap) { for(let [trialJobId, kubeflowTrialJob] of this.trialJobsMap) {
if (!kubeflowTrialJob) { if (!kubeflowTrialJob) {
...@@ -49,33 +51,30 @@ export class KubeflowJobInfoCollector { ...@@ -49,33 +51,30 @@ export class KubeflowJobInfoCollector {
if( Date.now() - kubeflowTrialJob.submitTime < 20 * 1000) { if( Date.now() - kubeflowTrialJob.submitTime < 20 * 1000) {
return Promise.resolve(); return Promise.resolve();
} }
updateKubeflowTrialJobs.push(this.retrieveSingleTrialJobInfo(kubeflowTrialJob)) updateKubeflowTrialJobs.push(this.retrieveSingleTrialJobInfo(operatorClient, kubeflowTrialJob))
} }
await Promise.all(updateKubeflowTrialJobs); await Promise.all(updateKubeflowTrialJobs);
} }
private async retrieveSingleTrialJobInfo(kubeflowTrialJob : KubeflowTrialJobDetail) : Promise<void> { private async retrieveSingleTrialJobInfo(operatorClient: KubeflowOperatorClient | undefined,
kubeflowTrialJob : KubeflowTrialJobDetail) : Promise<void> {
if (!this.statusesNeedToCheck.includes(kubeflowTrialJob.status)) { if (!this.statusesNeedToCheck.includes(kubeflowTrialJob.status)) {
return Promise.resolve(); return Promise.resolve();
} }
let result : cpp.childProcessPromise.Result; if(operatorClient === undefined) {
return Promise.reject('operatorClient is undefined');
}
let kubeflowJobInfo: any;
try { try {
result = await cpp.exec(`kubectl get ${kubeflowTrialJob.k8sPluralName} ${kubeflowTrialJob.kubeflowJobName} -o json`); kubeflowJobInfo = await operatorClient.getKubeflowJob(kubeflowTrialJob.kubeflowJobName);
if(result.stderr) {
this.log.error(`Get ${kubeflowTrialJob.k8sPluralName} ${kubeflowTrialJob.kubeflowJobName} failed. Error is ${result.stderr}, failed checking number is ${kubeflowTrialJob.queryJobFailedCount}`);
kubeflowTrialJob.queryJobFailedCount++;
if(kubeflowTrialJob.queryJobFailedCount >= this.MAX_FAILED_QUERY_JOB_NUMBER) {
kubeflowTrialJob.status = 'UNKNOWN';
}
}
} catch(error) { } catch(error) {
this.log.error(`kubectl get ${kubeflowTrialJob.k8sPluralName} ${kubeflowTrialJob.kubeflowJobName} failed, error is ${error}`); this.log.error(`Get job ${kubeflowTrialJob.kubeflowJobName} info failed, error is ${error}`);
return Promise.resolve(); return Promise.resolve();
} }
const kubeflowJobInfo = JSON.parse(result.stdout);
if(kubeflowJobInfo.status && kubeflowJobInfo.status.conditions) { if(kubeflowJobInfo.status && kubeflowJobInfo.status.conditions) {
const latestCondition = kubeflowJobInfo.status.conditions[kubeflowJobInfo.status.conditions.length - 1]; const latestCondition = kubeflowJobInfo.status.conditions[kubeflowJobInfo.status.conditions.length - 1];
const tfJobType : KubeflowTFJobType = <KubeflowTFJobType>latestCondition.type; const tfJobType : KubeflowTFJobType = <KubeflowTFJobType>latestCondition.type;
......
/**
* Copyright (c) Microsoft Corporation
* All rights reserved.
*
* MIT License
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
* documentation files (the "Software"), to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
* to permit persons to whom the Software is furnished to do so, subject to the following conditions:
* The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
* BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
'use strict';
import * as fs from 'fs';
import * as os from 'os'
import * as path from 'path';
import { getLogger, Logger } from '../../common/log';
import { KubeflowOperator, OperatorApiVersion } from './kubeflowConfig';
var K8SClient = require('kubernetes-client').Client;
var K8SConfig = require('kubernetes-client').config;
/**
 * Generic Kubernetes client, target version >= 1.9.
 *
 * Builds a `kubernetes-client` instance from the current user's kubeconfig
 * (`~/.kube/config`) and exposes the core-API calls used by the training service.
 */
class GeneralK8sClient {
    protected readonly client: any;
    protected readonly log: Logger = getLogger();

    /**
     * Creates the underlying client and starts loading the cluster's API spec.
     */
    constructor() {
        this.client = new K8SClient({ config: K8SConfig.fromKubeconfig(path.join(os.homedir(), '.kube', 'config')), version: '1.9'});
        // A constructor cannot await loadSpec(), so the call stays fire-and-forget.
        // Attach a handler so that a failed spec fetch is logged instead of
        // surfacing as an unhandled promise rejection.
        Promise.resolve(this.client.loadSpec()).catch((error: any) => {
            this.log.error(`GeneralK8sClient: load Kubernetes API spec failed, error is ${error}`);
        });
    }

    /**
     * Posts a secret manifest to the `default` namespace.
     *
     * @param secretManifest request body describing the secret to create
     * @returns a promise resolving to `true` when the response status code is 2xx;
     *          otherwise it rejects with a string message containing the status code
     */
    public async createSecret(secretManifest: any): Promise<boolean> {
        const response: any = await this.client.api.v1.namespaces('default').secrets.post({body: secretManifest});
        if (response.statusCode && (response.statusCode >= 200 && response.statusCode <= 299)) {
            return true;
        }

        return Promise.reject(`Create secrets failed, statusCode is ${response.statusCode}`);
    }
}
/**
 * Base class for clients that manage one Kubeflow operator's custom resource
 * through the Kubernetes API server.
 *
 * Concrete subclasses provide the resource endpoint (`operator`), the container
 * name, and populate `crdSchema` with the CRD definition they register.
 */
abstract class KubeflowOperatorClient {
    protected readonly client: any;
    protected readonly log: Logger = getLogger();
    protected crdSchema: any;

    /**
     * Creates the underlying client from `~/.kube/config` and starts loading
     * the cluster's API spec.
     */
    constructor() {
        this.client = new K8SClient({ config: K8SConfig.fromKubeconfig(path.join(os.homedir(), '.kube', 'config'))});
        // A constructor cannot await loadSpec(), so the call stays fire-and-forget.
        // Attach a handler so that a failed spec fetch is logged instead of
        // surfacing as an unhandled promise rejection.
        Promise.resolve(this.client.loadSpec()).catch((error: any) => {
            this.log.error(`KubeflowOperatorClient: load Kubernetes API spec failed, error is ${error}`);
        });
    }

    /** Endpoint object of the operator's custom resource, as exposed by the client. */
    protected abstract get operator(): any;

    /** Container name associated with this operator. */
    public abstract get containerName(): string;

    /**
     * Factory method to generate an operator client.
     *
     * @param kubeflowOperator operator name, `tf-operator` or `pytorch-operator`
     * @param operatorApiVersion operator API version, `v1alpha2` or `v1beta1`
     * @returns the client matching the operator/version pair
     * @throws Error when the combination is not supported
     */
    public static generateOperatorClient(kubeflowOperator: KubeflowOperator,
                                         operatorApiVersion: OperatorApiVersion): KubeflowOperatorClient {
        if (kubeflowOperator === 'tf-operator') {
            if (operatorApiVersion === 'v1alpha2') {
                return new TFOperatorClientV1Alpha2();
            } else if (operatorApiVersion === 'v1beta1') {
                return new TFOperatorClientV1Beta1();
            }
        } else if (kubeflowOperator === 'pytorch-operator') {
            if (operatorApiVersion === 'v1alpha2') {
                return new PytorchOperatorClientV1Alpha2();
            } else if (operatorApiVersion === 'v1beta1') {
                return new PytorchOperatorClientV1Beta1();
            }
        }
        throw new Error(`Invalid operator ${kubeflowOperator} or apiVersion ${operatorApiVersion}`);
    }

    /**
     * Job kind read from `crdSchema.spec.names.kind`.
     *
     * @throws Error when the schema does not define a kind
     */
    public get jobKind(): string {
        if (this.crdSchema
            && this.crdSchema.spec
            && this.crdSchema.spec.names
            && this.crdSchema.spec.names.kind) {
            return this.crdSchema.spec.names.kind;
        }
        throw new Error('KubeflowOperatorClient: getJobKind failed, kind is undefined in crd schema!');
    }

    /**
     * API version read from `crdSchema.spec.version`.
     *
     * @throws Error when the schema does not define a version
     */
    public get apiVersion(): string {
        if (this.crdSchema
            && this.crdSchema.spec
            && this.crdSchema.spec.version) {
            return this.crdSchema.spec.version;
        }
        throw new Error('KubeflowOperatorClient: get apiVersion failed, version is undefined in crd schema!');
    }

    /**
     * Posts a job manifest to the operator endpoint.
     *
     * @param jobManifest request body describing the job
     * @returns a promise resolving to `true` on a 2xx status code; otherwise it
     *          rejects with a string message containing the status code
     */
    public async createKubeflowJob(jobManifest: any): Promise<boolean> {
        const response: any = await this.operator.post({body: jobManifest});
        if (response.statusCode && (response.statusCode >= 200 && response.statusCode <= 299)) {
            return true;
        }

        return Promise.reject(`KubeflowOperatorClient create tfjobs failed, statusCode is ${response.statusCode}`);
    }

    //TODO : replace any
    /**
     * Fetches a job by name from the operator endpoint.
     *
     * @param kubeflowJobName name of the job resource
     * @returns a promise resolving to the response body on a 2xx status code;
     *          otherwise it rejects with a string message containing the status code
     */
    public async getKubeflowJob(kubeflowJobName: string): Promise<any> {
        const response: any = await this.operator(kubeflowJobName).get();
        if (response.statusCode && (response.statusCode >= 200 && response.statusCode <= 299)) {
            return response.body;
        }

        return Promise.reject(`KubeflowOperatorClient get tfjobs failed, statusCode is ${response.statusCode}`);
    }

    /**
     * Deletes the jobs matching every given label.
     *
     * @param labels label key/value pairs, joined into a `key=value,...` selector
     * @returns a promise resolving to `true` on a 2xx status code; it rejects
     *          when `labels` is empty, when the status code is not 2xx, or when
     *          the request itself fails
     */
    public async deleteKubeflowJob(labels: Map<string, string>): Promise<boolean> {
        // An empty map would produce an empty labelSelector, which would not
        // restrict the delete at all; refuse instead of sending that request.
        if (labels.size === 0) {
            return Promise.reject('KubeflowOperatorClient, delete requires at least one label, got an empty label set');
        }

        // construct match query from labels for deleting the job
        const matchQuery: string = Array.from(labels.entries())
            .map(([labelKey, labelValue]: [string, string]) => `${labelKey}=${labelValue}`)
            .join(',');

        const deleteResult: any = await this.operator().delete({ qs: { labelSelector: matchQuery } });
        if (deleteResult.statusCode && deleteResult.statusCode >= 200 && deleteResult.statusCode <= 299) {
            return true;
        }

        return Promise.reject(`KubeflowOperatorClient, delete labels ${matchQuery} get wrong statusCode ${deleteResult.statusCode}`);
    }
}
/**
 * Operator client for `tfjobs` under the `kubeflow.org` API group, version v1alpha2.
 */
class TFOperatorClientV1Alpha2 extends KubeflowOperatorClient {
    /**
     * Reads the TFJob v1alpha2 CRD definition from disk (path is relative to the
     * process working directory) and registers it with the underlying client.
     */
    public constructor() {
        super();
        const crdText: string = fs.readFileSync('./config/kubeflow/tfjob-crd-v1alpha2.json', 'utf8');
        const crdDefinition: any = JSON.parse(crdText);
        this.crdSchema = crdDefinition;
        this.client.addCustomResourceDefinition(crdDefinition);
    }

    /** Fixed container name for this operator: `tensorflow`. */
    public get containerName(): string {
        return 'tensorflow';
    }

    /** The `tfjobs` endpoint of the `default` namespace for API version v1alpha2. */
    protected get operator(): any {
        const kubeflowApi: any = this.client.apis["kubeflow.org"];

        return kubeflowApi.v1alpha2.namespaces('default').tfjobs;
    }
}
/**
 * Operator client for `tfjobs` under the `kubeflow.org` API group, version v1beta1.
 */
class TFOperatorClientV1Beta1 extends KubeflowOperatorClient {
    /**
     * Reads the TFJob v1beta1 CRD definition from disk (path is relative to the
     * process working directory) and registers it with the underlying client.
     */
    public constructor() {
        super();
        const crdText: string = fs.readFileSync('./config/kubeflow/tfjob-crd-v1beta1.json', 'utf8');
        const crdDefinition: any = JSON.parse(crdText);
        this.crdSchema = crdDefinition;
        this.client.addCustomResourceDefinition(crdDefinition);
    }

    /** Fixed container name for this operator: `tensorflow`. */
    public get containerName(): string {
        return 'tensorflow';
    }

    /** The `tfjobs` endpoint of the `default` namespace for API version v1beta1. */
    protected get operator(): any {
        const kubeflowApi: any = this.client.apis["kubeflow.org"];

        return kubeflowApi.v1beta1.namespaces('default').tfjobs;
    }
}
/**
 * Operator client for `pytorchjobs` under the `kubeflow.org` API group, version v1alpha2.
 */
class PytorchOperatorClientV1Alpha2 extends KubeflowOperatorClient {
    /**
     * Reads the PyTorchJob v1alpha2 CRD definition from disk (path is relative to
     * the process working directory) and registers it with the underlying client.
     */
    public constructor() {
        super();
        const crdText: string = fs.readFileSync('./config/kubeflow/pytorchjob-crd-v1alpha2.json', 'utf8');
        const crdDefinition: any = JSON.parse(crdText);
        this.crdSchema = crdDefinition;
        this.client.addCustomResourceDefinition(crdDefinition);
    }

    /** Fixed container name for this operator: `pytorch`. */
    public get containerName(): string {
        return 'pytorch';
    }

    /** The `pytorchjobs` endpoint of the `default` namespace for API version v1alpha2. */
    protected get operator(): any {
        const kubeflowApi: any = this.client.apis["kubeflow.org"];

        return kubeflowApi.v1alpha2.namespaces('default').pytorchjobs;
    }
}
/**
 * Operator client for `pytorchjobs` under the `kubeflow.org` API group, version v1beta1.
 */
class PytorchOperatorClientV1Beta1 extends KubeflowOperatorClient {
    /**
     * Reads the PyTorchJob v1beta1 CRD definition from disk (path is relative to
     * the process working directory) and registers it with the underlying client.
     */
    public constructor() {
        super();
        const crdText: string = fs.readFileSync('./config/kubeflow/pytorchjob-crd-v1beta1.json', 'utf8');
        const crdDefinition: any = JSON.parse(crdText);
        this.crdSchema = crdDefinition;
        this.client.addCustomResourceDefinition(crdDefinition);
    }

    /** Fixed container name for this operator: `pytorch`. */
    public get containerName(): string {
        return 'pytorch';
    }

    /** The `pytorchjobs` endpoint of the `default` namespace for API version v1beta1. */
    protected get operator(): any {
        const kubeflowApi: any = this.client.apis["kubeflow.org"];

        return kubeflowApi.v1beta1.namespaces('default').pytorchjobs;
    }
}
export { KubeflowOperatorClient, GeneralK8sClient };
...@@ -140,7 +140,7 @@ kubeflow_trial_schema = { ...@@ -140,7 +140,7 @@ kubeflow_trial_schema = {
'memoryMB': int, 'memoryMB': int,
'image': str 'image': str
}, },
'worker':{ Optional('worker'):{
'replicas': int, 'replicas': int,
'command': str, 'command': str,
'gpuNum': And(int, lambda x: 0 <= x <= 99999), 'gpuNum': And(int, lambda x: 0 <= x <= 99999),
...@@ -154,6 +154,7 @@ kubeflow_trial_schema = { ...@@ -154,6 +154,7 @@ kubeflow_trial_schema = {
kubeflow_config_schema = { kubeflow_config_schema = {
'kubeflowConfig':Or({ 'kubeflowConfig':Or({
'operator': Or('tf-operator', 'pytorch-operator'), 'operator': Or('tf-operator', 'pytorch-operator'),
'apiVersion': str,
Optional('storage'): Or('nfs', 'azureStorage'), Optional('storage'): Or('nfs', 'azureStorage'),
'nfs': { 'nfs': {
'server': str, 'server': str,
......
...@@ -94,10 +94,16 @@ def validate_kubeflow_operators(experiment_config): ...@@ -94,10 +94,16 @@ def validate_kubeflow_operators(experiment_config):
if experiment_config.get('trial').get('master') is not None: if experiment_config.get('trial').get('master') is not None:
print_error('kubeflow with tf-operator can not set master') print_error('kubeflow with tf-operator can not set master')
exit(1) exit(1)
if experiment_config.get('trial').get('worker') is None:
print_error('kubeflow with tf-operator must set worker')
exit(1)
elif experiment_config.get('kubeflowConfig').get('operator') == 'pytorch-operator': elif experiment_config.get('kubeflowConfig').get('operator') == 'pytorch-operator':
if experiment_config.get('trial').get('ps') is not None: if experiment_config.get('trial').get('ps') is not None:
print_error('kubeflow with pytorch-operator can not set ps') print_error('kubeflow with pytorch-operator can not set ps')
exit(1) exit(1)
if experiment_config.get('trial').get('master') is None:
print_error('kubeflow with pytorch-operator must set master')
exit(1)
if experiment_config.get('kubeflowConfig').get('storage') == 'nfs': if experiment_config.get('kubeflowConfig').get('storage') == 'nfs':
if experiment_config.get('kubeflowConfig').get('nfs') is None: if experiment_config.get('kubeflowConfig').get('nfs') is None:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment