"git@developer.sourcefind.cn:OpenDAS/nni.git" did not exist on "fc7ddcd0c83febfbbae76bc5065e1e9d6cd8f8c3"
Unverified commit c265903e authored by SparkSnail, committed by GitHub

Support Kubeflow pytorch-operator (#406)

1. Support pytorch-operator
2. Remove unsupported operators
parent 80624de7
@@ -371,6 +371,10 @@ machineList:
     __operator__ specify the kubeflow's operator to be used, nni support __tf-operator__ in current version.
+* __storage__
+    __storage__ specify the storage type of kubeflow, including {__nfs__, __azureStorage__}. This field is optional, and the default value is __nfs__. If the config use azureStorage, this field must be completed.
 * __nfs__
     __server__ is the host of nfs server
...
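Since `storage` is optional with __nfs__ as the documented default, a consumer of the parsed config can resolve the effective storage type as sketched below; this is an illustrative fragment, not code from this commit.

```python
# Illustrative only: resolve the effective storage type for a parsed
# kubeflowConfig dict, applying the documented default of 'nfs'.
kubeflow_config = {'operator': 'tf-operator'}          # no explicit 'storage' key
storage_type = kubeflow_config.get('storage', 'nfs')   # -> 'nfs' (the default)
assert storage_type in ('nfs', 'azureStorage')
```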
@@ -63,6 +63,7 @@ trial:
   image: {your_docker_image_for_tensorflow_worker}
 kubeflowConfig:
   operator: tf-operator
+  storage: nfs
   nfs:
     server: {your_nfs_server}
     path: {your_nfs_server_exported_path}
@@ -71,6 +72,7 @@ If you use Azure Kubernetes Service, you should set `kubeflowConfig` in your co
 ```
 kubeflowConfig:
   operator: tf-operator
+  storage: azureStorage
   keyVault:
     vaultName: {your_vault_name}
     name: {your_secert_name}
...
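The examples above only cover tf-operator. A pytorch-operator experiment, which this commit enables, instead provides `master`/`worker` sections in `trial` and sets `operator: pytorch-operator`. The sketch below shows such a config as a parsed Python dict, the form the nnictl validators below consume; the image names, commands, paths, and replica counts are hypothetical placeholders, not values from this commit.

```python
# Hypothetical parsed experiment config fragment for a pytorch-operator trial
# (placeholder values only; field names mirror the schemas shown below).
experiment_config = {
    'trial': {
        'codeDir': '/path/to/trial/code',      # hypothetical
        'master': {                            # pytorch-operator uses master, not ps
            'replicas': 1,
            'command': 'python3 mnist.py',     # hypothetical command
            'gpuNum': 1,
            'cpuNum': 1,
            'memoryMB': 8192,
            'image': 'your/pytorch-image',     # hypothetical image
        },
        'worker': {
            'replicas': 1,
            'command': 'python3 mnist.py',
            'gpuNum': 0,
            'cpuNum': 1,
            'memoryMB': 8192,
            'image': 'your/pytorch-image',
        },
    },
    'kubeflowConfig': {
        'operator': 'pytorch-operator',
        'storage': 'nfs',
        'nfs': {'server': '10.0.0.1', 'path': '/exported/path'},   # hypothetical
    },
}
```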
@@ -59,6 +59,15 @@ export namespace ValidationSchemas {
             memoryMB: joi.number().min(100),
             gpuNum: joi.number().min(0).required(),
             command: joi.string().min(1).required()
+        }),
+        master: joi.object({
+            replicas: joi.number().min(1).required(),
+            image: joi.string().min(1),
+            outputDir: joi.string(),
+            cpuNum: joi.number().min(1),
+            memoryMB: joi.number().min(100),
+            gpuNum: joi.number().min(0).required(),
+            command: joi.string().min(1).required()
         })
     }),
     pai_config: joi.object({
@@ -68,6 +77,7 @@ export namespace ValidationSchemas {
     }),
     kubeflow_config: joi.object({
         operator: joi.string().min(1).required(),
+        storage: joi.string().min(1),
         nfs: joi.object({
             server: joi.string().min(1).required(),
             path: joi.string().min(1).required()
...
@@ -23,31 +23,35 @@ import { TrialConfig } from "../common/trialConfig";
 /** operator types that kubeflow supported */
-export type KubeflowOperator = 'tf-operator' | 'pytorch-operator' | 'mxnet-operator' | 'caffe2-operator' | 'chainer-operator' | 'mpi-operator';
-export type KubeflowOperatorPlural = 'tfjobs' | 'pytorchjobs' | 'mxjobs' | 'caffe2jobs' | 'chainerjobs' | 'mpijobs';
+export type KubeflowOperator = 'tf-operator' | 'pytorch-operator';
+export type KubeflowOperatorPlural = 'tfjobs' | 'pytorchjobs';
+export type KubeflowOperatorJobKind = 'TFJob' | 'PyTorchJob';
+export type KubeflowStorageKind = 'nfs' | 'azureStorage';
 
 /**
  * map from Kubeflow operator name to its plural name in K8S
  */
 export const kubeflowOperatorMap : Map<KubeflowOperator, KubeflowOperatorPlural> = new Map<KubeflowOperator, KubeflowOperatorPlural>([
     ['tf-operator' , 'tfjobs'],
-    ['pytorch-operator', 'pytorchjobs'],
-    ['mxnet-operator', 'mxjobs'],
-    ['caffe2-operator', 'caffe2jobs'],
-    ['chainer-operator', 'chainerjobs'],
-    ['mpi-operator', 'mpijobs']
+    ['pytorch-operator', 'pytorchjobs']
+]);
+
+/**
+ * map from Kubeflow operator name to its job kind name in K8S
+ */
+export const kubeflowOperatorJobKindMap : Map<KubeflowOperator, KubeflowOperatorJobKind> = new Map<KubeflowOperator, KubeflowOperatorJobKind>([
+    ['tf-operator' , 'TFJob'],
+    ['pytorch-operator', 'PyTorchJob']
 ]);
 
 /**
  * Kuberflow cluster configuration
  *
  */
-export class KubeflowClusterConfig {
+export class KubeflowClusterConfigBase {
     /** Name of Kubeflow operator, like tf-operator */
     public readonly operator: KubeflowOperator;
-    public readonly nfs?: NFSConfig;
-    public readonly keyVault?: keyVaultConfig;
-    public readonly azureStorage?: AzureStorage;
+    public readonly storage?: KubeflowStorageKind;
 
     /**
      * Constructor
@@ -55,9 +59,27 @@ export class KubeflowClusterConfig {
      * @param passWord password of Kubeflow Cluster
      * @param host Host IP of Kubeflow Cluster
      */
-    constructor(operator: KubeflowOperator, nfs?: NFSConfig, keyVault?: keyVaultConfig, azureStorage ?: AzureStorage) {
+    constructor(operator: KubeflowOperator, storage?: KubeflowStorageKind) {
         this.operator = operator;
-        this.nfs = nfs;
-        this.keyVault = keyVault;
-        this.azureStorage = azureStorage;
+        this.storage = storage;
+    }
+}
+
+export class KubeflowClusterConfigNFS extends KubeflowClusterConfigBase{
+    public readonly nfs: NFSConfig;
+
+    constructor(operator: KubeflowOperator, nfs: NFSConfig, storage?: KubeflowStorageKind) {
+        super(operator, storage)
+        this.nfs = nfs;
+    }
+}
+
+export class KubeflowClusterConfigAzure extends KubeflowClusterConfigBase{
+    public readonly keyVault: keyVaultConfig;
+    public readonly azureStorage: AzureStorage;
+
+    constructor(operator: KubeflowOperator, keyVault: keyVaultConfig, azureStorage: AzureStorage, storage?: KubeflowStorageKind) {
+        super(operator, storage)
+        this.keyVault = keyVault;
+        this.azureStorage = azureStorage;
     }
 }
@@ -142,15 +164,33 @@ export class KubeflowTrialConfigTemplate {
     }
 }
 
-export class KubeflowTrialConfig {
+export class KubeflowTrialConfigBase {
     public readonly codeDir: string;
+
+    constructor(codeDir: string) {
+        this.codeDir = codeDir;
+    }
+}
+
+export class KubeflowTrialConfigTensorflow extends KubeflowTrialConfigBase{
     public readonly ps?: KubeflowTrialConfigTemplate;
     public readonly worker: KubeflowTrialConfigTemplate;
 
     constructor(codeDir: string, worker: KubeflowTrialConfigTemplate, ps?: KubeflowTrialConfigTemplate) {
-        this.codeDir = codeDir;
-        this.worker = worker;
+        super(codeDir);
         this.ps = ps;
+        this.worker = worker;
+    }
+}
+
+export class KubeflowTrialConfigPytorch extends KubeflowTrialConfigBase{
+    public readonly master?: KubeflowTrialConfigTemplate;
+    public readonly worker: KubeflowTrialConfigTemplate;
+
+    constructor(codeDir: string, worker: KubeflowTrialConfigTemplate, master?: KubeflowTrialConfigTemplate) {
+        super(codeDir);
+        this.master = master;
+        this.worker = worker;
    }
 }
@@ -122,6 +122,14 @@ kubeflow_trial_schema = {
         'memoryMB': int,
         'image': str
         },
+        Optional('master'): {
+            'replicas': int,
+            'command': str,
+            'gpuNum': And(int, lambda x: 0 <= x <= 99999),
+            'cpuNum': And(int, lambda x: 0 <= x <= 99999),
+            'memoryMB': int,
+            'image': str
+        },
         'worker':{
             'replicas': int,
             'command': str,
@@ -135,13 +143,15 @@ kubeflow_trial_schema = {
 kubeflow_config_schema = {
     'kubeflowConfig':Or({
-        'operator': Or('tf-operator', 'mxnet-operator', 'pytorch-operator'),
+        'operator': Or('tf-operator', 'pytorch-operator'),
+        Optional('storage'): Or('nfs', 'azureStorage'),
         'nfs': {
             'server': str,
             'path': str
         }
     },{
-        'operator': Or('tf-operator', 'mxnet-operator', 'pytorch-operator'),
+        'operator': Or('tf-operator', 'pytorch-operator'),
+        Optional('storage'): Or('nfs', 'azureStorage'),
        'keyVault': {
            'vaultName': Regex('([0-9]|[a-z]|[A-Z]|-){1,127}'),
            'name': Regex('([0-9]|[a-z]|[A-Z]|-){1,127}')
...
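For illustration, here is a minimal, self-contained sketch of how a kubeflowConfig dict is checked with the `schema` package, using the same Or/Optional constructs as the fragment above. The schema is re-declared locally rather than imported from NNI, the keyVault/azureStorage branch is simplified, and the sample config values are placeholders.

```python
# Minimal sketch using the `schema` package (pip install schema); re-declares
# the kubeflowConfig rules shown above for illustration, not imported from NNI.
from schema import Schema, Or, Optional, SchemaError

kubeflow_config_schema = Schema(Or(
    {
        'operator': Or('tf-operator', 'pytorch-operator'),
        Optional('storage'): Or('nfs', 'azureStorage'),
        'nfs': {'server': str, 'path': str},
    },
    {
        'operator': Or('tf-operator', 'pytorch-operator'),
        Optional('storage'): Or('nfs', 'azureStorage'),
        'keyVault': {'vaultName': str, 'name': str},   # Regex constraints omitted for brevity
        'azureStorage': dict,                          # simplified; the real schema lists its fields
    },
))

good = {'operator': 'pytorch-operator', 'storage': 'nfs',
        'nfs': {'server': '10.0.0.1', 'path': '/exported/path'}}   # placeholders
bad = {'operator': 'mxnet-operator',                               # operator removed by this commit
       'nfs': {'server': '10.0.0.1', 'path': '/exported/path'}}

kubeflow_config_schema.validate(good)      # passes
try:
    kubeflow_config_schema.validate(bad)
except SchemaError as err:
    print('rejected:', err)
```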
@@ -35,7 +35,6 @@ def parse_relative_path(root_path, experiment_config, key):
         print_warning('expand %s: %s to %s ' % (key, experiment_config[key], absolute_path))
         experiment_config[key] = absolute_path
 
-
 def parse_time(experiment_config):
     '''Parse time format'''
     unit = experiment_config['maxExecDuration'][-1]
@@ -85,7 +84,33 @@ def validate_search_space_content(experiment_config):
                 print_error('please use _type and _value to specify searchspace!')
                 exit(1)
     except:
-        raise Exception('searchspace file is not a valid json format!')
+        print_error('searchspace file is not a valid json format!')
+        exit(1)
+
+def validate_kubeflow_operators(experiment_config):
+    '''Validate whether the kubeflow operators are valid'''
+    if experiment_config.get('kubeflowConfig'):
+        if experiment_config.get('kubeflowConfig').get('operator') == 'tf-operator':
+            if experiment_config.get('trial').get('master') is not None:
+                print_error('kubeflow with tf-operator can not set master')
+                exit(1)
+        elif experiment_config.get('kubeflowConfig').get('operator') == 'pytorch-operator':
+            if experiment_config.get('trial').get('ps') is not None:
+                print_error('kubeflow with pytorch-operator can not set ps')
+                exit(1)
+
+        if experiment_config.get('kubeflowConfig').get('storage') == 'nfs':
+            if experiment_config.get('kubeflowConfig').get('nfs') is None:
+                print_error('please set nfs configuration!')
+                exit(1)
+        elif experiment_config.get('kubeflowConfig').get('storage') == 'azureStorage':
+            if experiment_config.get('kubeflowConfig').get('azureStorage') is None:
+                print_error('please set azureStorage configuration!')
+                exit(1)
+        elif experiment_config.get('kubeflowConfig').get('storage') is None:
+            if experiment_config.get('kubeflowConfig').get('azureStorage'):
+                print_error('please set storage type!')
+                exit(1)
+
 def validate_common_content(experiment_config):
     '''Validate whether the common values in experiment_config is valid'''
@@ -168,7 +193,7 @@ def validate_annotation_content(experiment_config, spec_key, builtin_name):
     # validate searchSpaceFile
     if experiment_config[spec_key].get(builtin_name):
         if experiment_config.get('searchSpacePath') is None:
-            print_error('Please set searchSpace!')
+            print_error('Please set searchSpacePath!')
             exit(1)
         validate_search_space_content(experiment_config)
...
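To make the cross-checks added in validate_kubeflow_operators concrete, the sketch below restates the two rules it enforces on a parsed config (operator vs. role section in `trial`, and declared storage type vs. storage block). It re-implements the logic inline instead of importing the nnictl helper, and the config values are placeholders.

```python
# Illustration of the rules enforced by validate_kubeflow_operators above,
# restated inline (not imported from nnictl); config values are placeholders.

def kubeflow_config_errors(experiment_config):
    '''Return the same error messages validate_kubeflow_operators would print.'''
    errors = []
    kf = experiment_config.get('kubeflowConfig', {})
    trial = experiment_config.get('trial', {})
    # Rule 1: the role sections in `trial` must match the operator.
    if kf.get('operator') == 'tf-operator' and trial.get('master') is not None:
        errors.append('kubeflow with tf-operator can not set master')
    if kf.get('operator') == 'pytorch-operator' and trial.get('ps') is not None:
        errors.append('kubeflow with pytorch-operator can not set ps')
    # Rule 2: the declared storage type must come with its configuration block.
    if kf.get('storage') == 'nfs' and kf.get('nfs') is None:
        errors.append('please set nfs configuration!')
    if kf.get('storage') == 'azureStorage' and kf.get('azureStorage') is None:
        errors.append('please set azureStorage configuration!')
    if kf.get('storage') is None and kf.get('azureStorage'):
        errors.append('please set storage type!')
    return errors

# A tf-operator experiment that wrongly declares a pytorch-style `master` section.
bad_config = {
    'kubeflowConfig': {'operator': 'tf-operator', 'storage': 'nfs',
                       'nfs': {'server': '10.0.0.1', 'path': '/exported/path'}},
    'trial': {'master': {'replicas': 1}, 'worker': {'replicas': 1}},
}
print(kubeflow_config_errors(bad_config))  # ['kubeflow with tf-operator can not set master']
```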