Unverified Commit fbffbc7c authored by Markus Bauer's avatar Markus Bauer Committed by GitHub
Browse files

[WIP] Enable optional Pod Spec for FrameworkController platform (#3379)

parent 38c9a734
...@@ -11,7 +11,6 @@ ...@@ -11,7 +11,6 @@
/ts/nni_manager/metrics.json /ts/nni_manager/metrics.json
/ts/nni_manager/trial_jobs.json /ts/nni_manager/trial_jobs.json
# Logs # Logs
logs logs
*.log *.log
......
...@@ -28,6 +28,16 @@ Prerequisite for Azure Kubernetes Service ...@@ -28,6 +28,16 @@ Prerequisite for Azure Kubernetes Service
#. Follow the `guideline <https://docs.microsoft.com/en-us/azure/storage/common/storage-quickstart-create-account?tabs=portal>`__ to create azure file storage account. If you use Azure Kubernetes Service, NNI need Azure Storage Service to store code files and the output files. #. Follow the `guideline <https://docs.microsoft.com/en-us/azure/storage/common/storage-quickstart-create-account?tabs=portal>`__ to create azure file storage account. If you use Azure Kubernetes Service, NNI need Azure Storage Service to store code files and the output files.
#. To access Azure storage service, NNI need the access key of the storage account, and NNI uses `Azure Key Vault <https://azure.microsoft.com/en-us/services/key-vault/>`__ Service to protect your private key. Set up Azure Key Vault Service, add a secret to Key Vault to store the access key of Azure storage account. Follow this `guideline <https://docs.microsoft.com/en-us/azure/key-vault/quick-create-cli>`__ to store the access key. #. To access Azure storage service, NNI need the access key of the storage account, and NNI uses `Azure Key Vault <https://azure.microsoft.com/en-us/services/key-vault/>`__ Service to protect your private key. Set up Azure Key Vault Service, add a secret to Key Vault to store the access key of Azure storage account. Follow this `guideline <https://docs.microsoft.com/en-us/azure/key-vault/quick-create-cli>`__ to store the access key.
Prerequisite for PVC storage mode
-----------------------------------------
In order to use persistent volume claims instead of NFS or Azure storage, the related storage must
be created manually in the namespace where your trials will run later. This restriction exists
because persistent volume claims are hard to recycle and can therefore quickly interfere with a cluster's
storage management. Persistent volume claims can be created, for example, with kubectl. Please refer
to the official Kubernetes documentation for `further information <https://kubernetes.io/docs/concepts/storage/persistent-volumes/#persistentvolumeclaims>`__.
Setup FrameworkController Setup FrameworkController
------------------------- -------------------------
...@@ -116,6 +126,37 @@ Trial configuration in frameworkcontroller mode have the following configuration ...@@ -116,6 +126,37 @@ Trial configuration in frameworkcontroller mode have the following configuration
* image: the docker image used to create pod and run the program. * image: the docker image used to create pod and run the program.
* frameworkAttemptCompletionPolicy: the policy to run framework, please refer the `user-manual <https://github.com/Microsoft/frameworkcontroller/blob/master/doc/user-manual.md#frameworkattemptcompletionpolicy>`__ to get the specific information. Users could use the policy to control the pod, for example, if ps does not stop, only worker stops, The completion policy could helps stop ps. * frameworkAttemptCompletionPolicy: the policy to run framework, please refer the `user-manual <https://github.com/Microsoft/frameworkcontroller/blob/master/doc/user-manual.md#frameworkattemptcompletionpolicy>`__ to get the specific information. Users could use the policy to control the pod, for example, if ps does not stop, only worker stops, The completion policy could helps stop ps.
NNI also offers the possibility to include a customized frameworkcontroller template similar
to the aforementioned tensorflow example. A valid configuration may then look like:
.. code-block:: yaml
experimentName: example_mnist_pytorch
trialConcurrency: 1
maxExecDuration: 1h
maxTrialNum: 2
logLevel: trace
trainingServicePlatform: frameworkcontroller
searchSpacePath: search_space.json
tuner:
builtinTunerName: TPE
classArgs:
optimize_mode: maximize
assessor:
builtinAssessorName: Medianstop
classArgs:
optimize_mode: maximize
trial:
codeDir: .
frameworkcontrollerConfig:
configPath: fc_template.yml
storage: pvc
namespace: twin-pipelines
pvc:
path: /mnt/data
Note that this example uses a persistent volume claim, which must be created manually in the specified namespace beforehand. See the mnist-pytorch example (:githublink:`examples/trials/mnist-pytorch`) for a more detailed config (:githublink:`examples/trials/mnist-pytorch/config_frameworkcontroller_custom.yml`) and frameworkcontroller template (:githublink:`examples/trials/fc_template.yml`).
How to run example How to run example
------------------ ------------------
......
authorName: default
experimentName: example_mnist_pytorch
trialConcurrency: 1
maxExecDuration: 1h
maxTrialNum: 10
logLevel: trace
#choice: local, remote, pai, kubeflow, frameworkcontroller
trainingServicePlatform: frameworkcontroller
searchSpacePath: search_space.json
#choice: true, false
useAnnotation: false
tuner:
#choice: TPE, Random, Anneal, Evolution, BatchTuner, MetisTuner, GPTuner
builtinTunerName: TPE
classArgs:
#choice: maximize, minimize
optimize_mode: maximize
assessor:
builtinAssessorName: Medianstop
classArgs:
optimize_mode: maximize
trial:
codeDir: .
frameworkcontrollerConfig:
configPath: fc_template.yml
storage: pvc
namespace: "default"
pvc:
path: "/tmp/mount"
apiVersion: frameworkcontroller.microsoft.com/v1
kind: Framework
metadata:
name: pytorchcpu
namespace: default
spec:
executionType: Start
retryPolicy:
fancyRetryPolicy: true
maxRetryCount: 2
taskRoles:
- name: worker
taskNumber: 1
frameworkAttemptCompletionPolicy:
minFailedTaskCount: 1
minSucceededTaskCount: 3
task:
retryPolicy:
fancyRetryPolicy: false
maxRetryCount: 0
podGracefulDeletionTimeoutSec: 1800
pod:
spec:
restartPolicy: Never
hostNetwork: false
containers:
- name: mnist-pytorch
image: msranni/nni:latest
command: ["python", "mnist.py"]
ports:
- containerPort: 5001
volumeMounts:
- name: frameworkbarrier-volume
mountPath: /mnt/frameworkbarrier
- name: data-volume
mountPath: /tmp/mount
serviceAccountName: frameworkbarrier
initContainers:
- name: frameworkbarrier
image: frameworkcontroller/frameworkbarrier
volumeMounts:
- name: frameworkbarrier-volume
mountPath: /mnt/frameworkbarrier
volumes:
- name: frameworkbarrier-volume
emptyDir: {}
- name: data-volume
persistentVolumeClaim:
claimName: nni-storage
...@@ -4,11 +4,17 @@ ...@@ -4,11 +4,17 @@
import json import json
import logging import logging
import os import os
import netifaces import netifaces
from schema import Schema, And, Optional, Regex, Or, SchemaError from nni.tools.package_utils import (
from nni.tools.package_utils import create_validator_instance, get_all_builtin_names, get_registered_algo_meta create_validator_instance,
from .constants import SCHEMA_TYPE_ERROR, SCHEMA_RANGE_ERROR, SCHEMA_PATH_ERROR get_all_builtin_names,
get_registered_algo_meta,
)
from schema import And, Optional, Or, Regex, Schema, SchemaError
from .common_utils import get_yml_content, print_warning from .common_utils import get_yml_content, print_warning
from .constants import SCHEMA_PATH_ERROR, SCHEMA_RANGE_ERROR, SCHEMA_TYPE_ERROR
def setType(key, valueType): def setType(key, valueType):
...@@ -183,9 +189,9 @@ pai_yarn_trial_schema = { ...@@ -183,9 +189,9 @@ pai_yarn_trial_schema = {
Optional('virtualCluster'): setType('virtualCluster', str), Optional('virtualCluster'): setType('virtualCluster', str),
Optional('nasMode'): setChoice('nasMode', 'classic_mode', 'enas_mode', 'oneshot_mode', 'darts_mode'), Optional('nasMode'): setChoice('nasMode', 'classic_mode', 'enas_mode', 'oneshot_mode', 'darts_mode'),
Optional('portList'): [{ Optional('portList'): [{
"label": setType('label', str), 'label': setType('label', str),
"beginAt": setType('beginAt', int), 'beginAt': setType('beginAt', int),
"portNumber": setType('portNumber', int) 'portNumber': setType('portNumber', int)
}] }]
} }
} }
...@@ -376,7 +382,7 @@ kubeflow_config_schema = { ...@@ -376,7 +382,7 @@ kubeflow_config_schema = {
frameworkcontroller_trial_schema = { frameworkcontroller_trial_schema = {
'trial': { 'trial': {
'codeDir': setPathCheck('codeDir'), 'codeDir': setPathCheck('codeDir'),
'taskRoles': [{ Optional('taskRoles'): [{
'name': setType('name', str), 'name': setType('name', str),
'taskNum': setType('taskNum', int), 'taskNum': setType('taskNum', int),
'frameworkAttemptCompletionPolicy': { 'frameworkAttemptCompletionPolicy': {
...@@ -395,14 +401,22 @@ frameworkcontroller_trial_schema = { ...@@ -395,14 +401,22 @@ frameworkcontroller_trial_schema = {
frameworkcontroller_config_schema = { frameworkcontroller_config_schema = {
'frameworkcontrollerConfig': Or({ 'frameworkcontrollerConfig': Or({
Optional('storage'): setChoice('storage', 'nfs', 'azureStorage'), Optional('storage'): setChoice('storage', 'nfs', 'azureStorage', 'pvc'),
Optional('serviceAccountName'): setType('serviceAccountName', str), Optional('serviceAccountName'): setType('serviceAccountName', str),
'nfs': { 'nfs': {
'server': setType('server', str), 'server': setType('server', str),
'path': setType('path', str) 'path': setType('path', str)
} },
Optional('namespace'): setType('namespace', str),
Optional('configPath'): setType('configPath', str),
}, { }, {
Optional('storage'): setChoice('storage', 'nfs', 'azureStorage'), Optional('storage'): setChoice('storage', 'nfs', 'azureStorage', 'pvc'),
Optional('serviceAccountName'): setType('serviceAccountName', str),
'configPath': setType('configPath', str),
'pvc': {'path': setType('server', str)},
Optional('namespace'): setType('namespace', str),
}, {
Optional('storage'): setChoice('storage', 'nfs', 'azureStorage', 'pvc'),
Optional('serviceAccountName'): setType('serviceAccountName', str), Optional('serviceAccountName'): setType('serviceAccountName', str),
'keyVault': { 'keyVault': {
'vaultName': And(Regex('([0-9]|[a-z]|[A-Z]|-){1,127}'), 'vaultName': And(Regex('([0-9]|[a-z]|[A-Z]|-){1,127}'),
...@@ -416,7 +430,9 @@ frameworkcontroller_config_schema = { ...@@ -416,7 +430,9 @@ frameworkcontroller_config_schema = {
'azureShare': And(Regex('([0-9]|[a-z]|[A-Z]|-){3,63}'), 'azureShare': And(Regex('([0-9]|[a-z]|[A-Z]|-){3,63}'),
error='ERROR: azureShare format error, azureShare support using (0-9|a-z|A-Z|-)') error='ERROR: azureShare format error, azureShare support using (0-9|a-z|A-Z|-)')
}, },
Optional('uploadRetryCount'): setNumberRange('uploadRetryCount', int, 1, 99999) Optional('uploadRetryCount'): setNumberRange('uploadRetryCount', int, 1, 99999),
Optional('namespace'): setType('namespace', str),
Optional('configPath'): setType('configPath', str),
}) })
} }
...@@ -479,6 +495,7 @@ class NNIConfigSchema: ...@@ -479,6 +495,7 @@ class NNIConfigSchema:
self.validate_kubeflow_operators(experiment_config) self.validate_kubeflow_operators(experiment_config)
self.validate_eth0_device(experiment_config) self.validate_eth0_device(experiment_config)
self.validate_hybrid_platforms(experiment_config) self.validate_hybrid_platforms(experiment_config)
self.validate_frameworkcontroller_trial_config(experiment_config)
def validate_tuner_adivosr_assessor(self, experiment_config): def validate_tuner_adivosr_assessor(self, experiment_config):
if experiment_config.get('advisor'): if experiment_config.get('advisor'):
...@@ -601,3 +618,24 @@ class NNIConfigSchema: ...@@ -601,3 +618,24 @@ class NNIConfigSchema:
if config_name and not experiment_config.get(config_name): if config_name and not experiment_config.get(config_name):
raise SchemaError('Need to set {0} for {1} in hybrid mode!'.format(config_name, platform)) raise SchemaError('Need to set {0} for {1} in hybrid mode!'.format(config_name, platform))
def validate_frameworkcontroller_trial_config(self, experiment_config):
    """Validate the trial definition of a frameworkcontroller experiment.

    Nothing is checked unless ``trainingServicePlatform`` is
    ``'frameworkcontroller'``. When the experiment config carries no
    ``trial.taskRoles``, a custom frameworkcontroller template must be given
    through ``frameworkcontrollerConfig.configPath``; that template is loaded
    with ``get_yml_content`` and must provide:

    * a non-empty ``spec.taskRoles`` list,
    * a ``task`` entry for every task role,
    * a ``name`` for every task role, unique across roles,
    * a ``metadata.name``.

    Parameters
    ----------
    experiment_config : dict
        The parsed experiment configuration.

    Raises
    ------
    SchemaError
        If any of the requirements above is violated. Missing sections
        (``trial``, ``frameworkcontrollerConfig``, ``spec``, ``metadata``)
        are reported as SchemaError instead of failing with AttributeError.
    """
    if experiment_config.get('trainingServicePlatform') != 'frameworkcontroller':
        return

    # Absent sections are treated as empty so that they surface as a
    # SchemaError below rather than an AttributeError on None.
    trial_config = experiment_config.get('trial') or {}
    if trial_config.get('taskRoles'):
        # Task roles are given inline; no custom template is required.
        return

    fc_config = experiment_config.get('frameworkcontrollerConfig') or {}
    config_path = fc_config.get('configPath')
    if not config_path:
        raise SchemaError('If no taskRoles are specified a valid custom frameworkcontroller config should\n'
                          'be set using the configPath attribute in frameworkcontrollerConfig!')

    # NOTE(review): assumes get_yml_content returns the parsed YAML mapping,
    # or a falsy value for an empty file -- confirm against common_utils.
    config_content = get_yml_content(config_path) or {}

    task_roles = (config_content.get('spec') or {}).get('taskRoles')
    if not task_roles:
        raise SchemaError('Invalid frameworkcontroller config! No taskRoles were specified!')

    # Every role needs a task definition, not only the first one.
    for task_role in task_roles:
        if not task_role.get('task'):
            raise SchemaError('Invalid frameworkcontroller config! No task was specified for taskRole!')

    names = []
    for task_role in task_roles:
        if 'name' not in task_role:
            raise SchemaError('Invalid frameworkcontroller config! Name is missing for taskRole!')
        names.append(task_role['name'])
    if len(names) > len(set(names)):
        raise SchemaError('Invalid frameworkcontroller config! Duplicate taskrole names!')

    if not (config_content.get('metadata') or {}).get('name'):
        raise SchemaError('Invalid frameworkcontroller config! No experiment name was specified!')
...@@ -100,6 +100,11 @@ def parse_path(experiment_config, config_path): ...@@ -100,6 +100,11 @@ def parse_path(experiment_config, config_path):
if experiment_config['trial'].get('paiConfigPath'): if experiment_config['trial'].get('paiConfigPath'):
parse_relative_path(root_path, experiment_config['trial'], 'paiConfigPath') parse_relative_path(root_path, experiment_config['trial'], 'paiConfigPath')
# For frameworkcontroller a custom configuration path may be specified
if experiment_config.get('frameworkcontrollerConfig'):
if experiment_config['frameworkcontrollerConfig'].get('configPath'):
parse_relative_path(root_path, experiment_config['frameworkcontrollerConfig'], 'configPath')
def set_default_values(experiment_config): def set_default_values(experiment_config):
if experiment_config.get('maxExecDuration') is None: if experiment_config.get('maxExecDuration') is None:
experiment_config['maxExecDuration'] = '999d' experiment_config['maxExecDuration'] = '999d'
......
...@@ -152,6 +152,10 @@ export namespace ValidationSchemas { ...@@ -152,6 +152,10 @@ export namespace ValidationSchemas {
frameworkcontroller_config: joi.object({ // eslint-disable-line @typescript-eslint/camelcase frameworkcontroller_config: joi.object({ // eslint-disable-line @typescript-eslint/camelcase
storage: joi.string().min(1), storage: joi.string().min(1),
serviceAccountName: joi.string().min(1), serviceAccountName: joi.string().min(1),
pvc: joi.object({
path: joi.string().min(1).required()
}),
configPath: joi.string().min(1),
nfs: joi.object({ nfs: joi.object({
server: joi.string().min(1).required(), server: joi.string().min(1).required(),
path: joi.string().min(1).required() path: joi.string().min(1).required()
...@@ -164,7 +168,8 @@ export namespace ValidationSchemas { ...@@ -164,7 +168,8 @@ export namespace ValidationSchemas {
accountName: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){3,31}$/), accountName: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){3,31}$/),
azureShare: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){3,63}$/) azureShare: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){3,63}$/)
}), }),
uploadRetryCount: joi.number().min(1) uploadRetryCount: joi.number().min(1),
namespace: joi.string().min(1)
}), }),
dlts_config: joi.object({ // eslint-disable-line @typescript-eslint/camelcase dlts_config: joi.object({ // eslint-disable-line @typescript-eslint/camelcase
dashboard: joi.string().min(1), dashboard: joi.string().min(1),
......
...@@ -4,7 +4,7 @@ ...@@ -4,7 +4,7 @@
'use strict'; 'use strict';
import * as fs from 'fs'; import * as fs from 'fs';
import { GeneralK8sClient, KubernetesCRDClient } from '../kubernetesApiClient'; import {GeneralK8sClient, KubernetesCRDClient} from '../kubernetesApiClient';
/** /**
* FrameworkController ClientV1 * FrameworkController ClientV1
...@@ -13,14 +13,16 @@ class FrameworkControllerClientV1 extends KubernetesCRDClient { ...@@ -13,14 +13,16 @@ class FrameworkControllerClientV1 extends KubernetesCRDClient {
/** /**
* constructor, to initialize frameworkcontroller CRD definition * constructor, to initialize frameworkcontroller CRD definition
*/ */
public constructor() { public namespace: string;
public constructor(namespace?: string) {
super(); super();
this.namespace = namespace ? namespace : "default"
this.crdSchema = JSON.parse(fs.readFileSync('./config/frameworkcontroller/frameworkcontrollerjob-crd-v1.json', 'utf8')); this.crdSchema = JSON.parse(fs.readFileSync('./config/frameworkcontroller/frameworkcontrollerjob-crd-v1.json', 'utf8'));
this.client.addCustomResourceDefinition(this.crdSchema); this.client.addCustomResourceDefinition(this.crdSchema);
} }
protected get operator(): any { protected get operator(): any {
return this.client.apis['frameworkcontroller.microsoft.com'].v1.namespaces('default').frameworks; return this.client.apis['frameworkcontroller.microsoft.com'].v1.namespaces(this.namespace).frameworks;
} }
public get containerName(): string { public get containerName(): string {
...@@ -35,9 +37,9 @@ class FrameworkControllerClientFactory { ...@@ -35,9 +37,9 @@ class FrameworkControllerClientFactory {
/** /**
* Factory method to generate operator client * Factory method to generate operator client
*/ */
public static createClient(): KubernetesCRDClient { public static createClient(namespace?: string): KubernetesCRDClient {
return new FrameworkControllerClientV1(); return new FrameworkControllerClientV1(namespace);
} }
} }
export { FrameworkControllerClientFactory, GeneralK8sClient }; export {FrameworkControllerClientFactory, GeneralK8sClient};
...@@ -5,8 +5,10 @@ ...@@ -5,8 +5,10 @@
import * as assert from 'assert'; import * as assert from 'assert';
import { AzureStorage, KeyVaultConfig, KubernetesClusterConfig, KubernetesClusterConfigAzure, KubernetesClusterConfigNFS, import {
KubernetesStorageKind, KubernetesTrialConfig, KubernetesTrialConfigTemplate, NFSConfig, StorageConfig AzureStorage, KeyVaultConfig, KubernetesClusterConfig, KubernetesClusterConfigAzure, KubernetesClusterConfigNFS,
KubernetesStorageKind, KubernetesTrialConfig, KubernetesTrialConfigTemplate, NFSConfig, StorageConfig, KubernetesClusterConfigPVC,
PVCConfig,
} from '../kubernetesConfig'; } from '../kubernetesConfig';
export class FrameworkAttemptCompletionPolicy { export class FrameworkAttemptCompletionPolicy {
...@@ -47,60 +49,97 @@ export class FrameworkControllerTrialConfig extends KubernetesTrialConfig { ...@@ -47,60 +49,97 @@ export class FrameworkControllerTrialConfig extends KubernetesTrialConfig {
export class FrameworkControllerClusterConfig extends KubernetesClusterConfig { export class FrameworkControllerClusterConfig extends KubernetesClusterConfig {
public readonly serviceAccountName: string; public readonly serviceAccountName: string;
constructor(apiVersion: string, serviceAccountName: string) { constructor(apiVersion: string, serviceAccountName: string, configPath?: string, namespace?: string) {
super(apiVersion); super(apiVersion, undefined, namespace);
this.serviceAccountName = serviceAccountName; this.serviceAccountName = serviceAccountName;
} }
} }
/**
 * Cluster configuration for the frameworkcontroller platform when trial files
 * are kept on a persistent volume claim. In addition to the fields handled by
 * the PVC base class it stores the service account name and the path of the
 * custom frameworkcontroller config.
 */
export class FrameworkControllerClusterConfigPVC extends KubernetesClusterConfigPVC {
    public readonly serviceAccountName: string;
    public readonly configPath: string;

    constructor(
        serviceAccountName: string,
        apiVersion: string,
        pvc: PVCConfig,
        configPath: string,
        storage?: KubernetesStorageKind,
        namespace?: string
    ) {
        super(apiVersion, pvc, storage, namespace);
        this.configPath = configPath;
        this.serviceAccountName = serviceAccountName;
    }

    /**
     * Build a typed config instance from a plain JSON object.
     * @param jsonObject object carrying the same fields as this class
     * @returns a new instance populated from the fields of jsonObject
     */
    public static getInstance(jsonObject: object): FrameworkControllerClusterConfigPVC {
        const parsed = jsonObject as FrameworkControllerClusterConfigPVC;
        assert(parsed !== undefined);
        const { serviceAccountName, apiVersion, pvc, configPath, storage, namespace } = parsed;

        return new FrameworkControllerClusterConfigPVC(
            serviceAccountName, apiVersion, pvc, configPath, storage, namespace
        );
    }
}
export class FrameworkControllerClusterConfigNFS extends KubernetesClusterConfigNFS { export class FrameworkControllerClusterConfigNFS extends KubernetesClusterConfigNFS {
public readonly serviceAccountName: string; public readonly serviceAccountName: string;
public readonly configPath?: string;
constructor( constructor(
serviceAccountName: string, serviceAccountName: string,
apiVersion: string, apiVersion: string,
nfs: NFSConfig, nfs: NFSConfig,
storage?: KubernetesStorageKind storage?: KubernetesStorageKind,
namespace?: string,
configPath?: string
) { ) {
super(apiVersion, nfs, storage); super(apiVersion, nfs, storage, namespace);
this.serviceAccountName = serviceAccountName; this.serviceAccountName = serviceAccountName;
this.configPath = configPath
} }
public static getInstance(jsonObject: object): FrameworkControllerClusterConfigNFS { public static getInstance(jsonObject: object): FrameworkControllerClusterConfigNFS {
const kubeflowClusterConfigObjectNFS: FrameworkControllerClusterConfigNFS = <FrameworkControllerClusterConfigNFS>jsonObject; const kubernetesClusterConfigObjectNFS: FrameworkControllerClusterConfigNFS = <FrameworkControllerClusterConfigNFS>jsonObject;
assert (kubeflowClusterConfigObjectNFS !== undefined); assert(kubernetesClusterConfigObjectNFS !== undefined);
return new FrameworkControllerClusterConfigNFS( return new FrameworkControllerClusterConfigNFS(
kubeflowClusterConfigObjectNFS.serviceAccountName, kubernetesClusterConfigObjectNFS.serviceAccountName,
kubeflowClusterConfigObjectNFS.apiVersion, kubernetesClusterConfigObjectNFS.apiVersion,
kubeflowClusterConfigObjectNFS.nfs, kubernetesClusterConfigObjectNFS.nfs,
kubeflowClusterConfigObjectNFS.storage kubernetesClusterConfigObjectNFS.storage,
kubernetesClusterConfigObjectNFS.namespace
); );
} }
} }
export class FrameworkControllerClusterConfigAzure extends KubernetesClusterConfigAzure { export class FrameworkControllerClusterConfigAzure extends KubernetesClusterConfigAzure {
public readonly serviceAccountName: string; public readonly serviceAccountName: string;
public readonly configPath?: string;
constructor( constructor(
serviceAccountName: string, serviceAccountName: string,
apiVersion: string, apiVersion: string,
keyVault: KeyVaultConfig, keyVault: KeyVaultConfig,
azureStorage: AzureStorage, azureStorage: AzureStorage,
storage?: KubernetesStorageKind storage?: KubernetesStorageKind,
uploadRetryCount?: number,
namespace?: string,
configPath?: string
) { ) {
super(apiVersion, keyVault, azureStorage, storage); super(apiVersion, keyVault, azureStorage, storage, uploadRetryCount, namespace);
this.serviceAccountName = serviceAccountName; this.serviceAccountName = serviceAccountName;
this.configPath = configPath
} }
public static getInstance(jsonObject: object): FrameworkControllerClusterConfigAzure { public static getInstance(jsonObject: object): FrameworkControllerClusterConfigAzure {
const kubeflowClusterConfigObjectAzure: FrameworkControllerClusterConfigAzure = <FrameworkControllerClusterConfigAzure>jsonObject; const kubernetesClusterConfigObjectAzure: FrameworkControllerClusterConfigAzure = <FrameworkControllerClusterConfigAzure>jsonObject;
return new FrameworkControllerClusterConfigAzure( return new FrameworkControllerClusterConfigAzure(
kubeflowClusterConfigObjectAzure.serviceAccountName, kubernetesClusterConfigObjectAzure.serviceAccountName,
kubeflowClusterConfigObjectAzure.apiVersion, kubernetesClusterConfigObjectAzure.apiVersion,
kubeflowClusterConfigObjectAzure.keyVault, kubernetesClusterConfigObjectAzure.keyVault,
kubeflowClusterConfigObjectAzure.azureStorage, kubernetesClusterConfigObjectAzure.azureStorage,
kubeflowClusterConfigObjectAzure.storage kubernetesClusterConfigObjectAzure.storage,
kubernetesClusterConfigObjectAzure.uploadRetryCount,
kubernetesClusterConfigObjectAzure.namespace
); );
} }
} }
...@@ -116,6 +155,8 @@ export class FrameworkControllerClusterConfigFactory { ...@@ -116,6 +155,8 @@ export class FrameworkControllerClusterConfigFactory {
return FrameworkControllerClusterConfigAzure.getInstance(jsonObject); return FrameworkControllerClusterConfigAzure.getInstance(jsonObject);
} else if (storageConfig.storage === undefined || storageConfig.storage === 'nfs') { } else if (storageConfig.storage === undefined || storageConfig.storage === 'nfs') {
return FrameworkControllerClusterConfigNFS.getInstance(jsonObject); return FrameworkControllerClusterConfigNFS.getInstance(jsonObject);
} else if (storageConfig.storage !== undefined && storageConfig.storage === 'pvc') {
return FrameworkControllerClusterConfigPVC.getInstance(jsonObject);
} }
throw new Error(`Invalid json object ${jsonObject}`); throw new Error(`Invalid json object ${jsonObject}`);
} }
......
...@@ -202,8 +202,8 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -202,8 +202,8 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
const azureKubeflowClusterConfig: KubeflowClusterConfigAzure = <KubeflowClusterConfigAzure>this.kubeflowClusterConfig; const azureKubeflowClusterConfig: KubeflowClusterConfigAzure = <KubeflowClusterConfigAzure>this.kubeflowClusterConfig;
return await this.uploadFolderToAzureStorage(srcDirectory, destDirectory, azureKubeflowClusterConfig.uploadRetryCount); return await this.uploadFolderToAzureStorage(srcDirectory, destDirectory, azureKubeflowClusterConfig.uploadRetryCount);
} else if (this.kubeflowClusterConfig.storage === 'nfs' || this.kubeflowClusterConfig.storage === undefined) { } else if (this.kubeflowClusterConfig.storage === 'nfs' || this.kubeflowClusterConfig.storage === undefined) {
await cpp.exec(`mkdir -p ${this.trialLocalNFSTempFolder}/${destDirectory}`); await cpp.exec(`mkdir -p ${this.trialLocalTempFolder}/${destDirectory}`);
await cpp.exec(`cp -r ${srcDirectory}/* ${this.trialLocalNFSTempFolder}/${destDirectory}/.`); await cpp.exec(`cp -r ${srcDirectory}/* ${this.trialLocalTempFolder}/${destDirectory}/.`);
const nfsKubeflowClusterConfig: KubeflowClusterConfigNFS = <KubeflowClusterConfigNFS>this.kubeflowClusterConfig; const nfsKubeflowClusterConfig: KubeflowClusterConfigNFS = <KubeflowClusterConfigNFS>this.kubeflowClusterConfig;
const nfsConfig: NFSConfig = nfsKubeflowClusterConfig.nfs; const nfsConfig: NFSConfig = nfsKubeflowClusterConfig.nfs;
return `nfs://${nfsConfig.server}:${destDirectory}`; return `nfs://${nfsConfig.server}:${destDirectory}`;
......
...@@ -4,8 +4,8 @@ ...@@ -4,8 +4,8 @@
'use strict'; 'use strict';
// eslint-disable-next-line @typescript-eslint/camelcase // eslint-disable-next-line @typescript-eslint/camelcase
import { Client1_10, config } from 'kubernetes-client'; import {Client1_10, config} from 'kubernetes-client';
import { getLogger, Logger } from '../../common/log'; import {getLogger, Logger} from '../../common/log';
/** /**
* Generic Kubernetes client, target version >= 1.9 * Generic Kubernetes client, target version >= 1.9
...@@ -16,13 +16,16 @@ class GeneralK8sClient { ...@@ -16,13 +16,16 @@ class GeneralK8sClient {
protected namespace: string = 'default'; protected namespace: string = 'default';
constructor() { constructor() {
this.client = new Client1_10({ config: config.fromKubeconfig(), version: '1.9'}); this.client = new Client1_10({config: config.fromKubeconfig(), version: '1.9'});
this.client.loadSpec(); this.client.loadSpec();
} }
public set setNamespace(namespace: string) { public set setNamespace(namespace: string) {
this.namespace = namespace; this.namespace = namespace;
} }
public get getNamespace(): string {
return this.namespace;
}
private matchStorageClass(response: any): string { private matchStorageClass(response: any): string {
const adlSupportedProvisioners: RegExp[] = [ const adlSupportedProvisioners: RegExp[] = [
...@@ -66,7 +69,7 @@ class GeneralK8sClient { ...@@ -66,7 +69,7 @@ class GeneralK8sClient {
public async createDeployment(deploymentManifest: any): Promise<string> { public async createDeployment(deploymentManifest: any): Promise<string> {
let result: Promise<string>; let result: Promise<string>;
const response: any = await this.client.apis.apps.v1.namespaces(this.namespace) const response: any = await this.client.apis.apps.v1.namespaces(this.namespace)
.deployments.post({ body: deploymentManifest }) .deployments.post({body: deploymentManifest})
if (response.statusCode && (response.statusCode >= 200 && response.statusCode <= 299)) { if (response.statusCode && (response.statusCode >= 200 && response.statusCode <= 299)) {
result = Promise.resolve(response.body.metadata.uid); result = Promise.resolve(response.body.metadata.uid);
} else { } else {
...@@ -136,7 +139,7 @@ abstract class KubernetesCRDClient { ...@@ -136,7 +139,7 @@ abstract class KubernetesCRDClient {
protected crdSchema: any; protected crdSchema: any;
constructor() { constructor() {
this.client = new Client1_10({ config: config.fromKubeconfig() }); this.client = new Client1_10({config: config.fromKubeconfig()});
this.client.loadSpec(); this.client.loadSpec();
} }
...@@ -219,4 +222,4 @@ abstract class KubernetesCRDClient { ...@@ -219,4 +222,4 @@ abstract class KubernetesCRDClient {
} }
} }
export { KubernetesCRDClient, GeneralK8sClient }; export {KubernetesCRDClient, GeneralK8sClient};
...@@ -3,16 +3,18 @@ ...@@ -3,16 +3,18 @@
'use strict'; 'use strict';
export type KubernetesStorageKind = 'nfs' | 'azureStorage'; export type KubernetesStorageKind = 'nfs' | 'azureStorage' | 'pvc';
import { MethodNotImplementedError } from '../../common/errors'; import {MethodNotImplementedError} from '../../common/errors';
export abstract class KubernetesClusterConfig { export abstract class KubernetesClusterConfig {
public readonly storage?: KubernetesStorageKind; public readonly storage?: KubernetesStorageKind;
public readonly apiVersion: string; public readonly apiVersion: string;
public readonly namespace?: string;
constructor(apiVersion: string, storage?: KubernetesStorageKind) { constructor(apiVersion: string, storage?: KubernetesStorageKind, namespace?: string) {
this.storage = storage; this.storage = storage;
this.apiVersion = apiVersion; this.apiVersion = apiVersion;
this.namespace = namespace
} }
public get storageType(): KubernetesStorageKind { public get storageType(): KubernetesStorageKind {
...@@ -34,9 +36,10 @@ export class KubernetesClusterConfigNFS extends KubernetesClusterConfig { ...@@ -34,9 +36,10 @@ export class KubernetesClusterConfigNFS extends KubernetesClusterConfig {
constructor( constructor(
apiVersion: string, apiVersion: string,
nfs: NFSConfig, nfs: NFSConfig,
storage?: KubernetesStorageKind storage?: KubernetesStorageKind,
namespace?: string
) { ) {
super(apiVersion, storage); super(apiVersion, storage, namespace);
this.nfs = nfs; this.nfs = nfs;
} }
...@@ -50,7 +53,8 @@ export class KubernetesClusterConfigNFS extends KubernetesClusterConfig { ...@@ -50,7 +53,8 @@ export class KubernetesClusterConfigNFS extends KubernetesClusterConfig {
return new KubernetesClusterConfigNFS( return new KubernetesClusterConfigNFS(
kubernetesClusterConfigObjectNFS.apiVersion, kubernetesClusterConfigObjectNFS.apiVersion,
kubernetesClusterConfigObjectNFS.nfs, kubernetesClusterConfigObjectNFS.nfs,
kubernetesClusterConfigObjectNFS.storage kubernetesClusterConfigObjectNFS.storage,
kubernetesClusterConfigObjectNFS.namespace
); );
} }
} }
...@@ -65,9 +69,11 @@ export class KubernetesClusterConfigAzure extends KubernetesClusterConfig { ...@@ -65,9 +69,11 @@ export class KubernetesClusterConfigAzure extends KubernetesClusterConfig {
keyVault: KeyVaultConfig, keyVault: KeyVaultConfig,
azureStorage: AzureStorage, azureStorage: AzureStorage,
storage?: KubernetesStorageKind, storage?: KubernetesStorageKind,
uploadRetryCount?: number uploadRetryCount?: number,
namespace?: string,
) { ) {
super(apiVersion, storage); super(apiVersion, storage, namespace);
this.keyVault = keyVault; this.keyVault = keyVault;
this.azureStorage = azureStorage; this.azureStorage = azureStorage;
this.uploadRetryCount = uploadRetryCount; this.uploadRetryCount = uploadRetryCount;
...@@ -85,18 +91,48 @@ export class KubernetesClusterConfigAzure extends KubernetesClusterConfig { ...@@ -85,18 +91,48 @@ export class KubernetesClusterConfigAzure extends KubernetesClusterConfig {
kubernetesClusterConfigObjectAzure.keyVault, kubernetesClusterConfigObjectAzure.keyVault,
kubernetesClusterConfigObjectAzure.azureStorage, kubernetesClusterConfigObjectAzure.azureStorage,
kubernetesClusterConfigObjectAzure.storage, kubernetesClusterConfigObjectAzure.storage,
kubernetesClusterConfigObjectAzure.uploadRetryCount kubernetesClusterConfigObjectAzure.uploadRetryCount,
kubernetesClusterConfigObjectAzure.namespace
); );
} }
} }
/**
 * Kubernetes cluster configuration for the PVC storage mode: the settings
 * handled by the KubernetesClusterConfig base class plus a PVCConfig.
 */
export class KubernetesClusterConfigFactory { export class KubernetesClusterConfigPVC extends KubernetesClusterConfig {
// PVC settings supplied at construction time
public readonly pvc: PVCConfig;
/**
 * @param apiVersion forwarded unchanged to the base-class constructor
 * @param pvc PVC settings stored on this instance
 * @param storage optional storage kind, forwarded to the base-class constructor
 * @param namespace optional namespace, forwarded to the base-class constructor
 */
constructor(
apiVersion: string,
pvc: PVCConfig,
storage?: KubernetesStorageKind,
namespace?: string,
) {
super(apiVersion, storage, namespace);
this.pvc = pvc;
}
/**
 * Storage kind of this configuration.
 *
 * @returns always the literal 'pvc', independent of the `storage` constructor argument
 */
public get storageType(): KubernetesStorageKind {
return 'pvc';
}
/**
 * Builds a KubernetesClusterConfigPVC from a plain object.
 *
 * The object is only cast, not validated: `apiVersion`, `pvc`, `storage` and
 * `namespace` are read from it as-is, so a missing field is passed on as undefined.
 *
 * @param jsonObject object expected to carry the fields listed above
 * @returns a new instance populated from those fields
 */
public static getInstance(jsonObject: object): KubernetesClusterConfigPVC {
const kubernetesClusterConfigObjectPVC: KubernetesClusterConfigPVC =
<KubernetesClusterConfigPVC>jsonObject;
// Re-create a real instance so the getter and base-class behavior are available
return new KubernetesClusterConfigPVC(
kubernetesClusterConfigObjectPVC.apiVersion,
kubernetesClusterConfigObjectPVC.pvc,
kubernetesClusterConfigObjectPVC.storage,
kubernetesClusterConfigObjectPVC.namespace
);
}
}
export class KubernetesClusterConfigFactory {
public static generateKubernetesClusterConfig(jsonObject: object): KubernetesClusterConfig { public static generateKubernetesClusterConfig(jsonObject: object): KubernetesClusterConfig {
const storageConfig: StorageConfig = <StorageConfig>jsonObject; const storageConfig: StorageConfig = <StorageConfig>jsonObject;
switch (storageConfig.storage) { switch (storageConfig.storage) {
case 'azureStorage': case 'azureStorage':
return KubernetesClusterConfigAzure.getInstance(jsonObject); return KubernetesClusterConfigAzure.getInstance(jsonObject);
case 'pvc':
return KubernetesClusterConfigPVC.getInstance(jsonObject);
case 'nfs': case 'nfs':
case undefined: case undefined:
return KubernetesClusterConfigNFS.getInstance(jsonObject); return KubernetesClusterConfigNFS.getInstance(jsonObject);
...@@ -121,6 +157,18 @@ export class NFSConfig { ...@@ -121,6 +157,18 @@ export class NFSConfig {
} }
} }
/**
 * Describes the PVC that stores Kubernetes job related files.
 */
export class PVCConfig {
    /**
     * @param path Path of the mounted pvc; exposed as the read-only `path` property
     */
    constructor(public readonly path: string) {}
}
/** /**
* KeyVault configuration to store the key of Azure Storage Service * KeyVault configuration to store the key of Azure Storage Service
* Refer https://docs.microsoft.com/en-us/azure/key-vault/key-vault-manage-with-cli2 * Refer https://docs.microsoft.com/en-us/azure/key-vault/key-vault-manage-with-cli2
......
...@@ -7,21 +7,21 @@ import * as cpp from 'child-process-promise'; ...@@ -7,21 +7,21 @@ import * as cpp from 'child-process-promise';
import * as path from 'path'; import * as path from 'path';
import * as azureStorage from 'azure-storage'; import * as azureStorage from 'azure-storage';
import { EventEmitter } from 'events'; import {EventEmitter} from 'events';
import { Base64 } from 'js-base64'; import {Base64} from 'js-base64';
import { String } from 'typescript-string-operations'; import {String} from 'typescript-string-operations';
import { getExperimentId } from '../../common/experimentStartupInfo'; import {getExperimentId} from '../../common/experimentStartupInfo';
import { getLogger, Logger } from '../../common/log'; import {getLogger, Logger} from '../../common/log';
import { MethodNotImplementedError } from '../../common/errors'; import {MethodNotImplementedError} from '../../common/errors';
import { import {
NNIManagerIpConfig, TrialJobDetail, TrialJobMetric, LogType NNIManagerIpConfig, TrialJobDetail, TrialJobMetric, LogType
} from '../../common/trainingService'; } from '../../common/trainingService';
import { delay, getExperimentRootDir, getIPV4Address, getJobCancelStatus, getVersion, uniqueString } from '../../common/utils'; import {delay, getExperimentRootDir, getIPV4Address, getJobCancelStatus, getVersion, uniqueString} from '../../common/utils';
import { AzureStorageClientUtility } from './azureStorageClientUtils'; import {AzureStorageClientUtility} from './azureStorageClientUtils';
import { GeneralK8sClient, KubernetesCRDClient } from './kubernetesApiClient'; import {GeneralK8sClient, KubernetesCRDClient} from './kubernetesApiClient';
import { KubernetesClusterConfig } from './kubernetesConfig'; import {KubernetesClusterConfig} from './kubernetesConfig';
import { kubernetesScriptFormat, KubernetesTrialJobDetail } from './kubernetesData'; import {kubernetesScriptFormat, KubernetesTrialJobDetail} from './kubernetesData';
import { KubernetesJobRestServer } from './kubernetesJobRestServer'; import {KubernetesJobRestServer} from './kubernetesJobRestServer';
const fs = require('fs'); const fs = require('fs');
...@@ -34,7 +34,7 @@ abstract class KubernetesTrainingService { ...@@ -34,7 +34,7 @@ abstract class KubernetesTrainingService {
protected readonly metricsEmitter: EventEmitter; protected readonly metricsEmitter: EventEmitter;
protected readonly trialJobsMap: Map<string, KubernetesTrialJobDetail>; protected readonly trialJobsMap: Map<string, KubernetesTrialJobDetail>;
// experiment root dir in NFS // experiment root dir in NFS
protected readonly trialLocalNFSTempFolder: string; protected readonly trialLocalTempFolder: string;
protected stopping: boolean = false; protected stopping: boolean = false;
protected experimentId!: string; protected experimentId!: string;
protected kubernetesRestServerPort?: number; protected kubernetesRestServerPort?: number;
...@@ -57,7 +57,7 @@ abstract class KubernetesTrainingService { ...@@ -57,7 +57,7 @@ abstract class KubernetesTrainingService {
this.log = getLogger(); this.log = getLogger();
this.metricsEmitter = new EventEmitter(); this.metricsEmitter = new EventEmitter();
this.trialJobsMap = new Map<string, KubernetesTrialJobDetail>(); this.trialJobsMap = new Map<string, KubernetesTrialJobDetail>();
this.trialLocalNFSTempFolder = path.join(getExperimentRootDir(), 'trials-nfs-tmp'); this.trialLocalTempFolder = path.join(getExperimentRootDir(), 'trials-nfs-tmp');
this.experimentId = getExperimentId(); this.experimentId = getExperimentId();
this.CONTAINER_MOUNT_PATH = '/tmp/mount'; this.CONTAINER_MOUNT_PATH = '/tmp/mount';
this.expContainerCodeFolder = path.join(this.CONTAINER_MOUNT_PATH, 'nni', this.experimentId, 'nni-code'); this.expContainerCodeFolder = path.join(this.CONTAINER_MOUNT_PATH, 'nni', this.experimentId, 'nni-code');
...@@ -191,9 +191,9 @@ abstract class KubernetesTrainingService { ...@@ -191,9 +191,9 @@ abstract class KubernetesTrainingService {
// Unmount NFS // Unmount NFS
try { try {
await cpp.exec(`sudo umount ${this.trialLocalNFSTempFolder}`); await cpp.exec(`sudo umount ${this.trialLocalTempFolder}`);
} catch (error) { } catch (error) {
this.log.error(`Unmount ${this.trialLocalNFSTempFolder} failed, error is ${error}`); this.log.error(`Unmount ${this.trialLocalTempFolder} failed, error is ${error}`);
} }
// Stop kubernetes rest server // Stop kubernetes rest server
...@@ -231,13 +231,15 @@ abstract class KubernetesTrainingService { ...@@ -231,13 +231,15 @@ abstract class KubernetesTrainingService {
//create storage secret //create storage secret
this.azureStorageSecretName = String.Format('nni-secret-{0}', uniqueString(8) this.azureStorageSecretName = String.Format('nni-secret-{0}', uniqueString(8)
.toLowerCase()); .toLowerCase());
const namespace = this.genericK8sClient.getNamespace ? this.genericK8sClient.getNamespace : "default"
await this.genericK8sClient.createSecret( await this.genericK8sClient.createSecret(
{ {
apiVersion: 'v1', apiVersion: 'v1',
kind: 'Secret', kind: 'Secret',
metadata: { metadata: {
name: this.azureStorageSecretName, name: this.azureStorageSecretName,
namespace: 'default', namespace: namespace,
labels: { labels: {
app: this.NNI_KUBERNETES_TRIAL_LABEL, app: this.NNI_KUBERNETES_TRIAL_LABEL,
expId: getExperimentId() expId: getExperimentId()
...@@ -297,11 +299,11 @@ abstract class KubernetesTrainingService { ...@@ -297,11 +299,11 @@ abstract class KubernetesTrainingService {
return Promise.resolve(runScript); return Promise.resolve(runScript);
} }
protected async createNFSStorage(nfsServer: string, nfsPath: string): Promise<void> { protected async createNFSStorage(nfsServer: string, nfsPath: string): Promise<void> {
await cpp.exec(`mkdir -p ${this.trialLocalNFSTempFolder}`); await cpp.exec(`mkdir -p ${this.trialLocalTempFolder}`);
try { try {
await cpp.exec(`sudo mount ${nfsServer}:${nfsPath} ${this.trialLocalNFSTempFolder}`); await cpp.exec(`sudo mount ${nfsServer}:${nfsPath} ${this.trialLocalTempFolder}`);
} catch (error) { } catch (error) {
const mountError: string = `Mount NFS ${nfsServer}:${nfsPath} to ${this.trialLocalNFSTempFolder} failed, error is ${error}`; const mountError: string = `Mount NFS ${nfsServer}:${nfsPath} to ${this.trialLocalTempFolder} failed, error is ${error}`;
this.log.error(mountError); this.log.error(mountError);
return Promise.reject(mountError); return Promise.reject(mountError);
...@@ -309,21 +311,35 @@ abstract class KubernetesTrainingService { ...@@ -309,21 +311,35 @@ abstract class KubernetesTrainingService {
return Promise.resolve(); return Promise.resolve();
} }
/**
 * Makes the PVC directory reachable through the local trial temp folder.
 *
 * Runs `mkdir -p` on `pvcPath`, then `sudo ln -s` to link it to
 * `this.trialLocalTempFolder`. If either command fails, the failure is logged
 * and the returned promise is rejected with the message string.
 *
 * NOTE(review): `pvcPath` is interpolated into both command lines unquoted —
 * confirm callers only pass trusted paths without whitespace.
 *
 * @param pvcPath Path of the PVC directory to create and link
 * @returns A promise that resolves once both commands have completed
 */
protected async createPVCStorage(pvcPath: string): Promise<void> {
    try {
        const commandLines: string[] = [
            `mkdir -p ${pvcPath}`,
            `sudo ln -s ${pvcPath} ${this.trialLocalTempFolder}`
        ];
        // Strictly sequential: the link is only attempted after mkdir succeeded
        for (const commandLine of commandLines) {
            await cpp.exec(commandLine);
        }
    } catch (error) {
        const failureMessage: string = `Linking ${pvcPath} to ${this.trialLocalTempFolder} failed, error is ${error}`;
        this.log.error(failureMessage);
        return Promise.reject(failureMessage);
    }
}
protected async createRegistrySecret(filePath: string | undefined): Promise<string | undefined> { protected async createRegistrySecret(filePath: string | undefined): Promise<string | undefined> {
if(filePath === undefined || filePath === '') { if (filePath === undefined || filePath === '') {
return undefined; return undefined;
} }
const body = fs.readFileSync(filePath).toString('base64'); const body = fs.readFileSync(filePath).toString('base64');
const registrySecretName = String.Format('nni-secret-{0}', uniqueString(8) const registrySecretName = String.Format('nni-secret-{0}', uniqueString(8)
.toLowerCase()); .toLowerCase());
const namespace = this.genericK8sClient.getNamespace ? this.genericK8sClient.getNamespace : "default"
await this.genericK8sClient.createSecret( await this.genericK8sClient.createSecret(
{ {
apiVersion: 'v1', apiVersion: 'v1',
kind: 'Secret', kind: 'Secret',
metadata: { metadata: {
name: registrySecretName, name: registrySecretName,
namespace: 'default', namespace: namespace,
labels: { labels: {
app: this.NNI_KUBERNETES_TRIAL_LABEL, app: this.NNI_KUBERNETES_TRIAL_LABEL,
expId: getExperimentId() expId: getExperimentId()
...@@ -349,7 +365,7 @@ abstract class KubernetesTrainingService { ...@@ -349,7 +365,7 @@ abstract class KubernetesTrainingService {
throw new Error('azureStorageClient is not initialized'); throw new Error('azureStorageClient is not initialized');
} }
let retryCount: number = 1; let retryCount: number = 1;
if(uploadRetryCount) { if (uploadRetryCount) {
retryCount = uploadRetryCount; retryCount = uploadRetryCount;
} }
let uploadSuccess: boolean = false; let uploadSuccess: boolean = false;
...@@ -378,4 +394,4 @@ abstract class KubernetesTrainingService { ...@@ -378,4 +394,4 @@ abstract class KubernetesTrainingService {
return Promise.resolve(folderUriInAzure); return Promise.resolve(folderUriInAzure);
} }
} }
export { KubernetesTrainingService }; export {KubernetesTrainingService};
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment