Unverified Commit 555334de authored by SparkSnail's avatar SparkSnail Committed by GitHub
Browse files

Support private registry in PAI, Kubeflow and FrameworkController mode (#1354)

parent 279dbdef
...@@ -198,6 +198,8 @@ Trial configuration in kubeflow mode have the following configuration keys: ...@@ -198,6 +198,8 @@ Trial configuration in kubeflow mode have the following configuration keys:
* image * image
* Required key. In kubeflow mode, your trial program will be scheduled by Kubernetes to run in [Pod](https://kubernetes.io/docs/concepts/workloads/pods/pod/). This key is used to specify the Docker image used to create the pod where your trail program will run. * Required key. In kubeflow mode, your trial program will be scheduled by Kubernetes to run in [Pod](https://kubernetes.io/docs/concepts/workloads/pods/pod/). This key is used to specify the Docker image used to create the pod where your trail program will run.
* We already build a docker image [msranni/nni](https://hub.docker.com/r/msranni/nni/) on [Docker Hub](https://hub.docker.com/). It contains NNI python packages, Node modules and javascript artifact files required to start experiment, and all of NNI dependencies. The docker file used to build this image can be found at [here](https://github.com/Microsoft/nni/tree/master/deployment/docker/Dockerfile). You can either use this image directly in your config file, or build your own image based on it. * We already build a docker image [msranni/nni](https://hub.docker.com/r/msranni/nni/) on [Docker Hub](https://hub.docker.com/). It contains NNI python packages, Node modules and javascript artifact files required to start experiment, and all of NNI dependencies. The docker file used to build this image can be found at [here](https://github.com/Microsoft/nni/tree/master/deployment/docker/Dockerfile). You can either use this image directly in your config file, or build your own image based on it.
* privateRegistryAuthPath
* Optional field, specify `config.json` file path that holds an authorization token of docker registry, used to pull image from private registry. [Refer](https://kubernetes.io/docs/tasks/configure-pod-container/pull-image-private-registry/).
* apiVersion * apiVersion
* Required key. The API version of your Kubeflow. * Required key. The API version of your Kubeflow.
* ps (optional). This config section is used to configure Tensorflow parameter server role. * ps (optional). This config section is used to configure Tensorflow parameter server role.
......
...@@ -53,6 +53,8 @@ Compared with [LocalMode](LocalMode.md) and [RemoteMachineMode](RemoteMachineMod ...@@ -53,6 +53,8 @@ Compared with [LocalMode](LocalMode.md) and [RemoteMachineMode](RemoteMachineMod
* Optional key. Set the virtualCluster of OpenPAI. If omitted, the job will run on default virtual cluster. * Optional key. Set the virtualCluster of OpenPAI. If omitted, the job will run on default virtual cluster.
* shmMB * shmMB
* Optional key. Set the shmMB configuration of OpenPAI, it set the shared memory for one task in the task role. * Optional key. Set the shmMB configuration of OpenPAI, it set the shared memory for one task in the task role.
* authFile
* Optional key, Set the auth file path for private registry while using PAI mode, [Refer](https://github.com/microsoft/pai/blob/2ea69b45faa018662bc164ed7733f6fdbb4c42b3/docs/faq.md#q-how-to-use-private-docker-registry-job-image-when-submitting-an-openpai-job).
Once complete to fill NNI experiment config file and save (for example, save as exp_pai.yml), then run the following command Once complete to fill NNI experiment config file and save (for example, save as exp_pai.yml), then run the following command
``` ```
......
...@@ -51,10 +51,12 @@ export namespace ValidationSchemas { ...@@ -51,10 +51,12 @@ export namespace ValidationSchemas {
command: joi.string().min(1), command: joi.string().min(1),
virtualCluster: joi.string(), virtualCluster: joi.string(),
shmMB: joi.number(), shmMB: joi.number(),
authFile: joi.string(),
nasMode: joi.string().valid('classic_mode', 'enas_mode', 'oneshot_mode'), nasMode: joi.string().valid('classic_mode', 'enas_mode', 'oneshot_mode'),
worker: joi.object({ worker: joi.object({
replicas: joi.number().min(1).required(), replicas: joi.number().min(1).required(),
image: joi.string().min(1), image: joi.string().min(1),
privateRegistryAuthPath: joi.string().min(1),
outputDir: joi.string(), outputDir: joi.string(),
cpuNum: joi.number().min(1), cpuNum: joi.number().min(1),
memoryMB: joi.number().min(100), memoryMB: joi.number().min(100),
...@@ -64,6 +66,7 @@ export namespace ValidationSchemas { ...@@ -64,6 +66,7 @@ export namespace ValidationSchemas {
ps: joi.object({ ps: joi.object({
replicas: joi.number().min(1).required(), replicas: joi.number().min(1).required(),
image: joi.string().min(1), image: joi.string().min(1),
privateRegistryAuthPath: joi.string().min(1),
outputDir: joi.string(), outputDir: joi.string(),
cpuNum: joi.number().min(1), cpuNum: joi.number().min(1),
memoryMB: joi.number().min(100), memoryMB: joi.number().min(100),
...@@ -73,6 +76,7 @@ export namespace ValidationSchemas { ...@@ -73,6 +76,7 @@ export namespace ValidationSchemas {
master: joi.object({ master: joi.object({
replicas: joi.number().min(1).required(), replicas: joi.number().min(1).required(),
image: joi.string().min(1), image: joi.string().min(1),
privateRegistryAuthPath: joi.string().min(1),
outputDir: joi.string(), outputDir: joi.string(),
cpuNum: joi.number().min(1), cpuNum: joi.number().min(1),
memoryMB: joi.number().min(100), memoryMB: joi.number().min(100),
...@@ -83,6 +87,7 @@ export namespace ValidationSchemas { ...@@ -83,6 +87,7 @@ export namespace ValidationSchemas {
name: joi.string().min(1), name: joi.string().min(1),
taskNum: joi.number().min(1).required(), taskNum: joi.number().min(1).required(),
image: joi.string().min(1), image: joi.string().min(1),
privateRegistryAuthPath: joi.string().min(1),
outputDir: joi.string(), outputDir: joi.string(),
cpuNum: joi.number().min(1), cpuNum: joi.number().min(1),
memoryMB: joi.number().min(100), memoryMB: joi.number().min(100),
......
...@@ -43,8 +43,8 @@ export class FrameworkControllerTrialConfigTemplate extends KubernetesTrialConfi ...@@ -43,8 +43,8 @@ export class FrameworkControllerTrialConfigTemplate extends KubernetesTrialConfi
public readonly taskNum: number; public readonly taskNum: number;
constructor(taskNum: number, command : string, gpuNum : number, constructor(taskNum: number, command : string, gpuNum : number,
cpuNum: number, memoryMB: number, image: string, cpuNum: number, memoryMB: number, image: string,
frameworkAttemptCompletionPolicy: FrameworkAttemptCompletionPolicy) { frameworkAttemptCompletionPolicy: FrameworkAttemptCompletionPolicy, privateRegistryFilePath?: string | undefined) {
super(command, gpuNum, cpuNum, memoryMB, image); super(command, gpuNum, cpuNum, memoryMB, image, privateRegistryFilePath);
this.frameworkAttemptCompletionPolicy = frameworkAttemptCompletionPolicy; this.frameworkAttemptCompletionPolicy = frameworkAttemptCompletionPolicy;
this.name = name; this.name = name;
this.taskNum = taskNum; this.taskNum = taskNum;
......
...@@ -305,7 +305,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -305,7 +305,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
} }
// Generate frameworkcontroller job resource config object // Generate frameworkcontroller job resource config object
const frameworkcontrollerJobConfig: any = const frameworkcontrollerJobConfig: any =
this.generateFrameworkControllerJobConfig(trialJobId, trialWorkingFolder, frameworkcontrollerJobName, podResources); await this.generateFrameworkControllerJobConfig(trialJobId, trialWorkingFolder, frameworkcontrollerJobName, podResources);
return Promise.resolve(frameworkcontrollerJobConfig); return Promise.resolve(frameworkcontrollerJobConfig);
} }
...@@ -329,8 +329,8 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -329,8 +329,8 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
* @param frameworkcontrollerJobName job name * @param frameworkcontrollerJobName job name
* @param podResources pod template * @param podResources pod template
*/ */
private generateFrameworkControllerJobConfig(trialJobId: string, trialWorkingFolder: string, private async generateFrameworkControllerJobConfig(trialJobId: string, trialWorkingFolder: string,
frameworkcontrollerJobName : string, podResources : any) : any { frameworkcontrollerJobName : string, podResources : any) : Promise<any> {
if (this.fcClusterConfig === undefined) { if (this.fcClusterConfig === undefined) {
throw new Error('frameworkcontroller Cluster config is not initialized'); throw new Error('frameworkcontroller Cluster config is not initialized');
} }
...@@ -345,12 +345,14 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -345,12 +345,14 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
if (containerPort === undefined) { if (containerPort === undefined) {
throw new Error('Container port is not initialized'); throw new Error('Container port is not initialized');
} }
const taskRole: any = this.generateTaskRoleConfig( const taskRole: any = this.generateTaskRoleConfig(
trialWorkingFolder, trialWorkingFolder,
this.fcTrialConfig.taskRoles[index].image, this.fcTrialConfig.taskRoles[index].image,
`run_${this.fcTrialConfig.taskRoles[index].name}.sh`, `run_${this.fcTrialConfig.taskRoles[index].name}.sh`,
podResources[index], podResources[index],
containerPort containerPort,
await this.createRegistrySecret(this.fcTrialConfig.taskRoles[index].privateRegistryAuthPath)
); );
taskRoles.push({ taskRoles.push({
name: this.fcTrialConfig.taskRoles[index].name, name: this.fcTrialConfig.taskRoles[index].name,
...@@ -363,7 +365,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -363,7 +365,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
}); });
} }
return { return Promise.resolve({
apiVersion: `frameworkcontroller.microsoft.com/v1`, apiVersion: `frameworkcontroller.microsoft.com/v1`,
kind: 'Framework', kind: 'Framework',
metadata: { metadata: {
...@@ -379,11 +381,11 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -379,11 +381,11 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
executionType: 'Start', executionType: 'Start',
taskRoles: taskRoles taskRoles: taskRoles
} }
}; });
} }
private generateTaskRoleConfig(trialWorkingFolder: string, replicaImage: string, runScriptFile: string, private generateTaskRoleConfig(trialWorkingFolder: string, replicaImage: string, runScriptFile: string,
podResources: any, containerPort: number): any { podResources: any, containerPort: number, privateRegistrySecretName: string | undefined): any {
if (this.fcClusterConfig === undefined) { if (this.fcClusterConfig === undefined) {
throw new Error('frameworkcontroller Cluster config is not initialized'); throw new Error('frameworkcontroller Cluster config is not initialized');
} }
...@@ -451,13 +453,22 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -451,13 +453,22 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
mountPath: '/mnt/frameworkbarrier' mountPath: '/mnt/frameworkbarrier'
}] }]
}]; }];
const spec: any = {
let spec: any = {
containers: containers, containers: containers,
initContainers: initContainers, initContainers: initContainers,
restartPolicy: 'OnFailure', restartPolicy: 'OnFailure',
volumes: volumeSpecMap.get('nniVolumes'), volumes: volumeSpecMap.get('nniVolumes'),
hostNetwork: false hostNetwork: false
}; };
if(privateRegistrySecretName) {
spec.imagePullSecrets = [
{
name: privateRegistrySecretName
}
]
}
if (this.fcClusterConfig.serviceAccountName !== undefined) { if (this.fcClusterConfig.serviceAccountName !== undefined) {
spec.serviceAccountName = this.fcClusterConfig.serviceAccountName; spec.serviceAccountName = this.fcClusterConfig.serviceAccountName;
} }
......
...@@ -135,8 +135,8 @@ export class KubeflowTrialConfig extends KubernetesTrialConfig { ...@@ -135,8 +135,8 @@ export class KubeflowTrialConfig extends KubernetesTrialConfig {
export class KubeflowTrialConfigTemplate extends KubernetesTrialConfigTemplate { export class KubeflowTrialConfigTemplate extends KubernetesTrialConfigTemplate {
public readonly replicas: number; public readonly replicas: number;
constructor(replicas: number, command : string, gpuNum : number, constructor(replicas: number, command : string, gpuNum : number,
cpuNum: number, memoryMB: number, image: string) { cpuNum: number, memoryMB: number, image: string, privateRegistryAuthPath?: string) {
super(command, gpuNum, cpuNum, memoryMB, image); super(command, gpuNum, cpuNum, memoryMB, image, privateRegistryAuthPath);
this.replicas = replicas; this.replicas = replicas;
} }
} }
......
...@@ -347,7 +347,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -347,7 +347,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
} }
// Generate kubeflow job resource config object // Generate kubeflow job resource config object
const kubeflowJobConfig: any = this.generateKubeflowJobConfig(trialJobId, trialWorkingFolder, kubeflowJobName, workerPodResources, const kubeflowJobConfig: any = await this.generateKubeflowJobConfig(trialJobId, trialWorkingFolder, kubeflowJobName, workerPodResources,
nonWorkerResources); nonWorkerResources);
return Promise.resolve(kubeflowJobConfig); return Promise.resolve(kubeflowJobConfig);
...@@ -361,8 +361,8 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -361,8 +361,8 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
* @param workerPodResources worker pod template * @param workerPodResources worker pod template
* @param nonWorkerPodResources non-worker pod template, like ps or master * @param nonWorkerPodResources non-worker pod template, like ps or master
*/ */
private generateKubeflowJobConfig(trialJobId: string, trialWorkingFolder: string, kubeflowJobName : string, workerPodResources : any, private async generateKubeflowJobConfig(trialJobId: string, trialWorkingFolder: string, kubeflowJobName : string, workerPodResources : any,
nonWorkerPodResources?: any) : any { nonWorkerPodResources?: any) : Promise<any> {
if (this.kubeflowClusterConfig === undefined) { if (this.kubeflowClusterConfig === undefined) {
throw new Error('Kubeflow Cluster config is not initialized'); throw new Error('Kubeflow Cluster config is not initialized');
} }
...@@ -377,29 +377,32 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -377,29 +377,32 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
const replicaSpecsObj: any = {}; const replicaSpecsObj: any = {};
const replicaSpecsObjMap: Map<string, object> = new Map<string, object>(); const replicaSpecsObjMap: Map<string, object> = new Map<string, object>();
if (this.kubeflowTrialConfig.operatorType === 'tf-operator') { if (this.kubeflowTrialConfig.operatorType === 'tf-operator') {
const tensorflowTrialConfig: KubeflowTrialConfigTensorflow = <KubeflowTrialConfigTensorflow>this.kubeflowTrialConfig; const tensorflowTrialConfig: KubeflowTrialConfigTensorflow = <KubeflowTrialConfigTensorflow>this.kubeflowTrialConfig;
let privateRegistrySecretName = await this.createRegistrySecret(tensorflowTrialConfig.worker.privateRegistryAuthPath);
replicaSpecsObj.Worker = this.generateReplicaConfig(trialWorkingFolder, tensorflowTrialConfig.worker.replicas, replicaSpecsObj.Worker = this.generateReplicaConfig(trialWorkingFolder, tensorflowTrialConfig.worker.replicas,
tensorflowTrialConfig.worker.image, 'run_worker.sh', workerPodResources); tensorflowTrialConfig.worker.image, 'run_worker.sh', workerPodResources, privateRegistrySecretName);
if (tensorflowTrialConfig.ps !== undefined) { if (tensorflowTrialConfig.ps !== undefined) {
let privateRegistrySecretName: string | undefined = await this.createRegistrySecret(tensorflowTrialConfig.ps.privateRegistryAuthPath);
replicaSpecsObj.Ps = this.generateReplicaConfig(trialWorkingFolder, tensorflowTrialConfig.ps.replicas, replicaSpecsObj.Ps = this.generateReplicaConfig(trialWorkingFolder, tensorflowTrialConfig.ps.replicas,
tensorflowTrialConfig.ps.image, 'run_ps.sh', nonWorkerPodResources); tensorflowTrialConfig.ps.image, 'run_ps.sh', nonWorkerPodResources, privateRegistrySecretName);
} }
replicaSpecsObjMap.set(this.kubernetesCRDClient.jobKind, {tfReplicaSpecs: replicaSpecsObj}); replicaSpecsObjMap.set(this.kubernetesCRDClient.jobKind, {tfReplicaSpecs: replicaSpecsObj});
} else if (this.kubeflowTrialConfig.operatorType === 'pytorch-operator') { } else if (this.kubeflowTrialConfig.operatorType === 'pytorch-operator') {
const pytorchTrialConfig: KubeflowTrialConfigPytorch = <KubeflowTrialConfigPytorch>this.kubeflowTrialConfig; const pytorchTrialConfig: KubeflowTrialConfigPytorch = <KubeflowTrialConfigPytorch>this.kubeflowTrialConfig;
if (pytorchTrialConfig.worker !== undefined) { if (pytorchTrialConfig.worker !== undefined) {
let privateRegistrySecretName: string | undefined = await this.createRegistrySecret(pytorchTrialConfig.worker.privateRegistryAuthPath);
replicaSpecsObj.Worker = this.generateReplicaConfig(trialWorkingFolder, pytorchTrialConfig.worker.replicas, replicaSpecsObj.Worker = this.generateReplicaConfig(trialWorkingFolder, pytorchTrialConfig.worker.replicas,
pytorchTrialConfig.worker.image, 'run_worker.sh', workerPodResources); pytorchTrialConfig.worker.image, 'run_worker.sh', workerPodResources, privateRegistrySecretName);
} }
let privateRegistrySecretName: string | undefined = await this.createRegistrySecret(pytorchTrialConfig.master.privateRegistryAuthPath);
replicaSpecsObj.Master = this.generateReplicaConfig(trialWorkingFolder, pytorchTrialConfig.master.replicas, replicaSpecsObj.Master = this.generateReplicaConfig(trialWorkingFolder, pytorchTrialConfig.master.replicas,
pytorchTrialConfig.master.image, 'run_master.sh', nonWorkerPodResources); pytorchTrialConfig.master.image, 'run_master.sh', nonWorkerPodResources, privateRegistrySecretName);
replicaSpecsObjMap.set(this.kubernetesCRDClient.jobKind, {pytorchReplicaSpecs: replicaSpecsObj}); replicaSpecsObjMap.set(this.kubernetesCRDClient.jobKind, {pytorchReplicaSpecs: replicaSpecsObj});
} }
return { return Promise.resolve({
apiVersion: `kubeflow.org/${this.kubernetesCRDClient.apiVersion}`, apiVersion: `kubeflow.org/${this.kubernetesCRDClient.apiVersion}`,
kind: this.kubernetesCRDClient.jobKind, kind: this.kubernetesCRDClient.jobKind,
metadata: { metadata: {
...@@ -412,7 +415,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -412,7 +415,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
} }
}, },
spec: replicaSpecsObjMap.get(this.kubernetesCRDClient.jobKind) spec: replicaSpecsObjMap.get(this.kubernetesCRDClient.jobKind)
}; });
} }
/** /**
...@@ -424,7 +427,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -424,7 +427,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
* @param podResources pod resource config section * @param podResources pod resource config section
*/ */
private generateReplicaConfig(trialWorkingFolder: string, replicaNumber: number, replicaImage: string, runScriptFile: string, private generateReplicaConfig(trialWorkingFolder: string, replicaNumber: number, replicaImage: string, runScriptFile: string,
podResources: any): any { podResources: any, privateRegistrySecretName: string | undefined): any {
if (this.kubeflowClusterConfig === undefined) { if (this.kubeflowClusterConfig === undefined) {
throw new Error('Kubeflow Cluster config is not initialized'); throw new Error('Kubeflow Cluster config is not initialized');
} }
...@@ -436,7 +439,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -436,7 +439,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
if (this.kubernetesCRDClient === undefined) { if (this.kubernetesCRDClient === undefined) {
throw new Error('Kubeflow operator client is not initialized'); throw new Error('Kubeflow operator client is not initialized');
} }
// The config spec for volume field
const volumeSpecMap: Map<string, object> = new Map<string, object>(); const volumeSpecMap: Map<string, object> = new Map<string, object>();
if (this.kubeflowClusterConfig.storageType === 'azureStorage') { if (this.kubeflowClusterConfig.storageType === 'azureStorage') {
volumeSpecMap.set('nniVolumes', [ volumeSpecMap.set('nniVolumes', [
...@@ -459,16 +462,9 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -459,16 +462,9 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
} }
}]); }]);
} }
// The config spec for container field
return { const containersSpecMap: Map<string, object> = new Map<string, object>();
replicas: replicaNumber, containersSpecMap.set('containers', [
template: {
metadata: {
// tslint:disable-next-line:no-null-keyword
creationTimestamp: null
},
spec: {
containers: [
{ {
// Kubeflow tensorflow operator requires that containers' name must be tensorflow // Kubeflow tensorflow operator requires that containers' name must be tensorflow
// TODO: change the name based on operator's type // TODO: change the name based on operator's type
...@@ -481,12 +477,29 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -481,12 +477,29 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
mountPath: this.CONTAINER_MOUNT_PATH mountPath: this.CONTAINER_MOUNT_PATH
}], }],
resources: podResources resources: podResources
}], }
]);
let spec: any = {
containers: containersSpecMap.get('containers'),
restartPolicy: 'ExitCode', restartPolicy: 'ExitCode',
volumes: volumeSpecMap.get('nniVolumes') volumes: volumeSpecMap.get('nniVolumes')
} }
if (privateRegistrySecretName) {
spec.imagePullSecrets = [
{
name: privateRegistrySecretName
}]
}
return {
replicas: replicaNumber,
template: {
metadata: {
// tslint:disable-next-line:no-null-keyword
creationTimestamp: null
},
spec: spec
}
} }
};
} }
} }
// tslint:enable: no-unsafe-any no-any // tslint:enable: no-unsafe-any no-any
......
...@@ -179,6 +179,9 @@ export class KubernetesTrialConfigTemplate { ...@@ -179,6 +179,9 @@ export class KubernetesTrialConfigTemplate {
// Docker image // Docker image
public readonly image: string; public readonly image: string;
// Private registry config file path to download docker iamge
public readonly privateRegistryAuthPath?: string;
// Trail command // Trail command
public readonly command : string; public readonly command : string;
...@@ -186,12 +189,13 @@ export class KubernetesTrialConfigTemplate { ...@@ -186,12 +189,13 @@ export class KubernetesTrialConfigTemplate {
public readonly gpuNum : number; public readonly gpuNum : number;
constructor(command : string, gpuNum : number, constructor(command : string, gpuNum : number,
cpuNum: number, memoryMB: number, image: string) { cpuNum: number, memoryMB: number, image: string, privateRegistryAuthPath?: string) {
this.command = command; this.command = command;
this.gpuNum = gpuNum; this.gpuNum = gpuNum;
this.cpuNum = cpuNum; this.cpuNum = cpuNum;
this.memoryMB = memoryMB; this.memoryMB = memoryMB;
this.image = image; this.image = image;
this.privateRegistryAuthPath = privateRegistryAuthPath;
} }
} }
......
...@@ -38,6 +38,8 @@ import { KubernetesClusterConfig } from './kubernetesConfig'; ...@@ -38,6 +38,8 @@ import { KubernetesClusterConfig } from './kubernetesConfig';
import { kubernetesScriptFormat, KubernetesTrialJobDetail } from './kubernetesData'; import { kubernetesScriptFormat, KubernetesTrialJobDetail } from './kubernetesData';
import { KubernetesJobRestServer } from './kubernetesJobRestServer'; import { KubernetesJobRestServer } from './kubernetesJobRestServer';
var fs = require('fs');
/** /**
* Training Service implementation for Kubernetes * Training Service implementation for Kubernetes
*/ */
...@@ -327,5 +329,34 @@ abstract class KubernetesTrainingService { ...@@ -327,5 +329,34 @@ abstract class KubernetesTrainingService {
return Promise.resolve(); return Promise.resolve();
} }
protected async createRegistrySecret(filePath: string | undefined): Promise<string | undefined> {
if(filePath === undefined || filePath === '') {
return undefined;
}
let body = fs.readFileSync(filePath).toString('base64');
let registrySecretName = String.Format('nni-secret-{0}', uniqueString(8)
.toLowerCase());
await this.genericK8sClient.createSecret(
{
apiVersion: 'v1',
kind: 'Secret',
metadata: {
name: registrySecretName,
namespace: 'default',
labels: {
app: this.NNI_KUBERNETES_TRIAL_LABEL,
expId: getExperimentId()
}
},
type: 'kubernetes.io/dockerconfigjson',
data: {
'.dockerconfigjson': body
}
}
);
return registrySecretName;
}
} }
export { KubernetesTrainingService }; export { KubernetesTrainingService };
...@@ -71,6 +71,8 @@ export class PAIJobConfig { ...@@ -71,6 +71,8 @@ export class PAIJobConfig {
public readonly image: string; public readonly image: string;
// Code directory on HDFS // Code directory on HDFS
public readonly codeDir: string; public readonly codeDir: string;
//authentication file used for private Docker registry
public readonly authFile?: string;
// List of taskRole, one task role at least // List of taskRole, one task role at least
public taskRoles: PAITaskRole[]; public taskRoles: PAITaskRole[];
...@@ -87,12 +89,13 @@ export class PAIJobConfig { ...@@ -87,12 +89,13 @@ export class PAIJobConfig {
* @param taskRoles List of taskRole, one task role at least * @param taskRoles List of taskRole, one task role at least
*/ */
constructor(jobName: string, image : string, codeDir : string, constructor(jobName: string, image : string, codeDir : string,
taskRoles : PAITaskRole[], virtualCluster: string) { taskRoles : PAITaskRole[], virtualCluster: string, authFile?: string) {
this.jobName = jobName; this.jobName = jobName;
this.image = image; this.image = image;
this.codeDir = codeDir; this.codeDir = codeDir;
this.taskRoles = taskRoles; this.taskRoles = taskRoles;
this.virtualCluster = virtualCluster; this.virtualCluster = virtualCluster;
this.authFile = authFile;
} }
} }
...@@ -129,14 +132,17 @@ export class NNIPAITrialConfig extends TrialConfig { ...@@ -129,14 +132,17 @@ export class NNIPAITrialConfig extends TrialConfig {
public virtualCluster?: string; public virtualCluster?: string;
//Shared memory for one task in the task role //Shared memory for one task in the task role
public shmMB?: number; public shmMB?: number;
//authentication file used for private Docker registry
public authFile?: string;
constructor(command : string, codeDir : string, gpuNum : number, cpuNum: number, memoryMB: number, constructor(command : string, codeDir : string, gpuNum : number, cpuNum: number, memoryMB: number,
image: string, virtualCluster?: string, shmMB?: number) { image: string, virtualCluster?: string, shmMB?: number, authFile?: string) {
super(command, codeDir, gpuNum); super(command, codeDir, gpuNum);
this.cpuNum = cpuNum; this.cpuNum = cpuNum;
this.memoryMB = memoryMB; this.memoryMB = memoryMB;
this.image = image; this.image = image;
this.virtualCluster = virtualCluster; this.virtualCluster = virtualCluster;
this.shmMB = shmMB; this.shmMB = shmMB;
this.authFile = authFile;
} }
} }
...@@ -442,7 +442,7 @@ class PAITrainingService implements TrainingService { ...@@ -442,7 +442,7 @@ class PAITrainingService implements TrainingService {
// Task command // Task command
nniPaiTrialCommand, nniPaiTrialCommand,
// Task shared memory // Task shared memory
this.paiTrialConfig.shmMB this.paiTrialConfig.shmMB,
) )
]; ];
...@@ -456,7 +456,9 @@ class PAITrainingService implements TrainingService { ...@@ -456,7 +456,9 @@ class PAITrainingService implements TrainingService {
// PAI Task roles // PAI Task roles
paiTaskRoles, paiTaskRoles,
// Add Virutal Cluster // Add Virutal Cluster
this.paiTrialConfig.virtualCluster === undefined ? 'default' : this.paiTrialConfig.virtualCluster.toString() this.paiTrialConfig.virtualCluster === undefined ? 'default' : this.paiTrialConfig.virtualCluster.toString(),
//Task auth File
this.paiTrialConfig.authFile
); );
// Step 2. Upload code files in codeDir onto HDFS // Step 2. Upload code files in codeDir onto HDFS
......
...@@ -233,6 +233,8 @@ pai_trial_schema = { ...@@ -233,6 +233,8 @@ pai_trial_schema = {
'cpuNum': setNumberRange('cpuNum', int, 0, 99999), 'cpuNum': setNumberRange('cpuNum', int, 0, 99999),
'memoryMB': setType('memoryMB', int), 'memoryMB': setType('memoryMB', int),
'image': setType('image', str), 'image': setType('image', str),
Optional('authFile'): And(Regex(r'hdfs://(([0-9]{1,3}.){3}[0-9]{1,3})(:[0-9]{2,5})?(/.*)?'),\
error='ERROR: authFile format error, authFile format is hdfs://xxx.xxx.xxx.xxx:xxx'),
Optional('shmMB'): setType('shmMB', int), Optional('shmMB'): setType('shmMB', int),
Optional('dataDir'): And(Regex(r'hdfs://(([0-9]{1,3}.){3}[0-9]{1,3})(:[0-9]{2,5})?(/.*)?'),\ Optional('dataDir'): And(Regex(r'hdfs://(([0-9]{1,3}.){3}[0-9]{1,3})(:[0-9]{2,5})?(/.*)?'),\
error='ERROR: dataDir format error, dataDir format is hdfs://xxx.xxx.xxx.xxx:xxx'), error='ERROR: dataDir format error, dataDir format is hdfs://xxx.xxx.xxx.xxx:xxx'),
...@@ -261,7 +263,8 @@ kubeflow_trial_schema = { ...@@ -261,7 +263,8 @@ kubeflow_trial_schema = {
'gpuNum': setNumberRange('gpuNum', int, 0, 99999), 'gpuNum': setNumberRange('gpuNum', int, 0, 99999),
'cpuNum': setNumberRange('cpuNum', int, 0, 99999), 'cpuNum': setNumberRange('cpuNum', int, 0, 99999),
'memoryMB': setType('memoryMB', int), 'memoryMB': setType('memoryMB', int),
'image': setType('image', str) 'image': setType('image', str),
Optional('privateRegistryAuthPath'): And(os.path.exists, error=SCHEMA_PATH_ERROR % 'privateRegistryAuthPath')
}, },
Optional('master'): { Optional('master'): {
'replicas': setType('replicas', int), 'replicas': setType('replicas', int),
...@@ -269,7 +272,8 @@ kubeflow_trial_schema = { ...@@ -269,7 +272,8 @@ kubeflow_trial_schema = {
'gpuNum': setNumberRange('gpuNum', int, 0, 99999), 'gpuNum': setNumberRange('gpuNum', int, 0, 99999),
'cpuNum': setNumberRange('cpuNum', int, 0, 99999), 'cpuNum': setNumberRange('cpuNum', int, 0, 99999),
'memoryMB': setType('memoryMB', int), 'memoryMB': setType('memoryMB', int),
'image': setType('image', str) 'image': setType('image', str),
Optional('privateRegistryAuthPath'): And(os.path.exists, error=SCHEMA_PATH_ERROR % 'privateRegistryAuthPath')
}, },
Optional('worker'):{ Optional('worker'):{
'replicas': setType('replicas', int), 'replicas': setType('replicas', int),
...@@ -277,7 +281,8 @@ kubeflow_trial_schema = { ...@@ -277,7 +281,8 @@ kubeflow_trial_schema = {
'gpuNum': setNumberRange('gpuNum', int, 0, 99999), 'gpuNum': setNumberRange('gpuNum', int, 0, 99999),
'cpuNum': setNumberRange('cpuNum', int, 0, 99999), 'cpuNum': setNumberRange('cpuNum', int, 0, 99999),
'memoryMB': setType('memoryMB', int), 'memoryMB': setType('memoryMB', int),
'image': setType('image', str) 'image': setType('image', str),
Optional('privateRegistryAuthPath'): And(os.path.exists, error=SCHEMA_PATH_ERROR % 'privateRegistryAuthPath')
} }
} }
} }
...@@ -324,7 +329,8 @@ frameworkcontroller_trial_schema = { ...@@ -324,7 +329,8 @@ frameworkcontroller_trial_schema = {
'gpuNum': setNumberRange('gpuNum', int, 0, 99999), 'gpuNum': setNumberRange('gpuNum', int, 0, 99999),
'cpuNum': setNumberRange('cpuNum', int, 0, 99999), 'cpuNum': setNumberRange('cpuNum', int, 0, 99999),
'memoryMB': setType('memoryMB', int), 'memoryMB': setType('memoryMB', int),
'image': setType('image', str) 'image': setType('image', str),
Optional('privateRegistryAuthPath'): And(os.path.exists, error=SCHEMA_PATH_ERROR % 'privateRegistryAuthPath')
}] }]
} }
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment