Unverified Commit c265903e authored by SparkSnail's avatar SparkSnail Committed by GitHub
Browse files

Support kubeflow pytorch-operator (#406)

1.Support pytorch-operator
2.remove unsupported operator
parent 80624de7
......@@ -371,6 +371,10 @@ machineList:
__operator__ specifies the kubeflow operator to be used. NNI supports __tf-operator__ and __pytorch-operator__ in the current version.
* __storage__
__storage__ specifies the storage type used by kubeflow, one of {__nfs__, __azureStorage__}. This field is optional and defaults to __nfs__. If the config uses azureStorage, this field must be set.
* __nfs__
__server__ is the host of the NFS server
......
......@@ -63,6 +63,7 @@ trial:
image: {your_docker_image_for_tensorflow_worker}
kubeflowConfig:
operator: tf-operator
storage: nfs
nfs:
server: {your_nfs_server}
path: {your_nfs_server_exported_path}
......@@ -71,6 +72,7 @@ If you use Azure Kubernetes Service, you should set `kubeflowConfig` in your co
```
kubeflowConfig:
operator: tf-operator
storage: azureStorage
keyVault:
vaultName: {your_vault_name}
name: {your_secret_name}
......
......@@ -59,6 +59,15 @@ export namespace ValidationSchemas {
memoryMB: joi.number().min(100),
gpuNum: joi.number().min(0).required(),
command: joi.string().min(1).required()
}),
master: joi.object({
replicas: joi.number().min(1).required(),
image: joi.string().min(1),
outputDir: joi.string(),
cpuNum: joi.number().min(1),
memoryMB: joi.number().min(100),
gpuNum: joi.number().min(0).required(),
command: joi.string().min(1).required()
})
}),
pai_config: joi.object({
......@@ -68,6 +77,7 @@ export namespace ValidationSchemas {
}),
kubeflow_config: joi.object({
operator: joi.string().min(1).required(),
storage: joi.string().min(1),
nfs: joi.object({
server: joi.string().min(1).required(),
path: joi.string().min(1).required()
......
......@@ -23,31 +23,35 @@ import { TrialConfig } from "../common/trialConfig";
/** Operator types of kubeflow that are supported. */
export type KubeflowOperator = 'tf-operator' | 'pytorch-operator';
/** Plural names in K8S of the supported operators. */
export type KubeflowOperatorPlural = 'tfjobs' | 'pytorchjobs';
/** Job kind names in K8S of the supported operators. */
export type KubeflowOperatorJobKind = 'TFJob' | 'PyTorchJob';
/** Storage types accepted for a kubeflow cluster configuration. */
export type KubeflowStorageKind = 'nfs' | 'azureStorage';

/**
 * Map from Kubeflow operator name to its plural name in K8S.
 *
 * Holds exactly one entry per member of KubeflowOperator, so it stays in
 * step with kubeflowOperatorJobKindMap.
 */
export const kubeflowOperatorMap: Map<KubeflowOperator, KubeflowOperatorPlural> = new Map<KubeflowOperator, KubeflowOperatorPlural>([
    ['tf-operator', 'tfjobs'],
    ['pytorch-operator', 'pytorchjobs']
]);

/**
 * Map from Kubeflow operator name to its job kind name in K8S.
 */
export const kubeflowOperatorJobKindMap: Map<KubeflowOperator, KubeflowOperatorJobKind> = new Map<KubeflowOperator, KubeflowOperatorJobKind>([
    ['tf-operator', 'TFJob'],
    ['pytorch-operator', 'PyTorchJob']
]);
/**
* Kubeflow cluster configuration
*
*/
export class KubeflowClusterConfig {
export class KubeflowClusterConfigBase {
/** Name of Kubeflow operator, like tf-operator */
public readonly operator: KubeflowOperator;
public readonly nfs?: NFSConfig;
public readonly keyVault?: keyVaultConfig;
public readonly azureStorage?: AzureStorage;
public readonly storage?: KubeflowStorageKind;
/**
* Constructor
......@@ -55,9 +59,27 @@ export class KubeflowClusterConfig {
* @param passWord password of Kubeflow Cluster
* @param host Host IP of Kubeflow Cluster
*/
constructor(operator: KubeflowOperator, nfs?: NFSConfig, keyVault?: keyVaultConfig, azureStorage ?: AzureStorage) {
constructor(operator: KubeflowOperator, storage?: KubeflowStorageKind) {
this.operator = operator;
this.storage = storage;
}
}
/**
 * Kubeflow cluster configuration that additionally carries an NFS section.
 */
export class KubeflowClusterConfigNFS extends KubeflowClusterConfigBase {
    /**
     * @param operator name of the Kubeflow operator, forwarded to the base class
     * @param nfs NFS configuration, stored on the readonly `nfs` property
     * @param storage optional storage kind, forwarded to the base class
     */
    constructor(
        operator: KubeflowOperator,
        public readonly nfs: NFSConfig,
        storage?: KubeflowStorageKind
    ) {
        super(operator, storage);
    }
}
export class KubeflowClusterConfigAzure extends KubeflowClusterConfigBase{
public readonly keyVault: keyVaultConfig;
public readonly azureStorage: AzureStorage;
constructor(operator: KubeflowOperator, keyVault: keyVaultConfig, azureStorage: AzureStorage, storage?: KubeflowStorageKind) {
super(operator, storage)
this.keyVault = keyVault;
this.azureStorage = azureStorage;
}
......@@ -142,15 +164,33 @@ export class KubeflowTrialConfigTemplate {
}
}
export class KubeflowTrialConfig {
/**
 * Part shared by every Kubeflow trial configuration: the `codeDir` value.
 */
export class KubeflowTrialConfigBase {
    /**
     * @param codeDir stored unchanged on the readonly `codeDir` property
     */
    constructor(public readonly codeDir: string) {}
}
/**
 * Kubeflow trial configuration with a required `worker` template and an
 * optional `ps` template (presumably the parameter-server role of
 * tf-operator; confirm against the operator documentation).
 */
export class KubeflowTrialConfigTensorflow extends KubeflowTrialConfigBase {
    public readonly ps?: KubeflowTrialConfigTemplate;
    public readonly worker: KubeflowTrialConfigTemplate;

    /**
     * @param codeDir forwarded to the base class, which owns the `codeDir` property
     * @param worker template stored on `worker`
     * @param ps optional template stored on `ps`
     */
    constructor(codeDir: string, worker: KubeflowTrialConfigTemplate, ps?: KubeflowTrialConfigTemplate) {
        // super() must run before any use of `this` in a derived class;
        // codeDir is assigned by the base constructor only.
        super(codeDir);
        this.ps = ps;
        this.worker = worker;
    }
}
/**
 * Kubeflow trial configuration with a required `worker` template and an
 * optional `master` template.
 */
export class KubeflowTrialConfigPytorch extends KubeflowTrialConfigBase{
/** Optional template for the master role; undefined when not supplied. */
public readonly master?: KubeflowTrialConfigTemplate;
/** Template for the worker role. */
public readonly worker: KubeflowTrialConfigTemplate;
/**
 * @param codeDir forwarded to the base class constructor
 * @param worker template stored on `worker`
 * @param master optional template stored on `master`
 */
constructor(codeDir: string, worker: KubeflowTrialConfigTemplate, master?: KubeflowTrialConfigTemplate) {
super(codeDir);
// master is assigned before worker, so it precedes worker in own-property order.
this.master = master;
this.worker = worker;
}
}
......@@ -122,6 +122,14 @@ kubeflow_trial_schema = {
'memoryMB': int,
'image': str
},
Optional('master'): {
'replicas': int,
'command': str,
'gpuNum': And(int, lambda x: 0 <= x <= 99999),
'cpuNum': And(int, lambda x: 0 <= x <= 99999),
'memoryMB': int,
'image': str
},
'worker':{
'replicas': int,
'command': str,
......@@ -135,13 +143,15 @@ kubeflow_trial_schema = {
kubeflow_config_schema = {
'kubeflowConfig':Or({
'operator': Or('tf-operator', 'mxnet-operator', 'pytorch-operator'),
'operator': Or('tf-operator', 'pytorch-operator'),
Optional('storage'): Or('nfs', 'azureStorage'),
'nfs': {
'server': str,
'path': str
}
},{
'operator': Or('tf-operator', 'mxnet-operator', 'pytorch-operator'),
'operator': Or('tf-operator', 'pytorch-operator'),
Optional('storage'): Or('nfs', 'azureStorage'),
'keyVault': {
'vaultName': Regex('([0-9]|[a-z]|[A-Z]|-){1,127}'),
'name': Regex('([0-9]|[a-z]|[A-Z]|-){1,127}')
......
......@@ -35,7 +35,6 @@ def parse_relative_path(root_path, experiment_config, key):
print_warning('expand %s: %s to %s ' % (key, experiment_config[key], absolute_path))
experiment_config[key] = absolute_path
def parse_time(experiment_config):
'''Parse time format'''
unit = experiment_config['maxExecDuration'][-1]
......@@ -85,7 +84,33 @@ def validate_search_space_content(experiment_config):
print_error('please use _type and _value to specify searchspace!')
exit(1)
except:
raise Exception('searchspace file is not a valid json format!')
print_error('searchspace file is not a valid json format!')
exit(1)
def validate_kubeflow_operators(experiment_config):
    '''Validate the kubeflow operator and storage settings of an experiment.

    Does nothing when ``experiment_config`` has no ``kubeflowConfig`` section.
    Otherwise prints an error and exits with status 1 when:

    * ``tf-operator`` is used and ``trial`` sets ``master``;
    * ``pytorch-operator`` is used and ``trial`` sets ``ps``;
    * ``storage`` is ``nfs`` but no ``nfs`` section is given;
    * ``storage`` is ``azureStorage`` but no ``azureStorage`` section is given;
    * ``storage`` is omitted although an ``azureStorage`` section is given.
    '''
    kubeflow_config = experiment_config.get('kubeflowConfig')
    if not kubeflow_config:
        return

    # A missing (or None) trial section has no roles to reject; treating it as
    # an empty mapping avoids an AttributeError on None.
    trial_config = experiment_config.get('trial') or {}

    operator = kubeflow_config.get('operator')
    if operator == 'tf-operator':
        if trial_config.get('master') is not None:
            print_error('kubeflow with tf-operator can not set master')
            exit(1)
    elif operator == 'pytorch-operator':
        if trial_config.get('ps') is not None:
            print_error('kubeflow with pytorch-operator can not set ps')
            exit(1)

    # The storage check is independent of the operator check above.
    storage = kubeflow_config.get('storage')
    if storage == 'nfs':
        if kubeflow_config.get('nfs') is None:
            print_error('please set nfs configuration!')
            exit(1)
    elif storage == 'azureStorage':
        if kubeflow_config.get('azureStorage') is None:
            print_error('please set azureStorage configuration!')
            exit(1)
    elif storage is None:
        # Without an explicit storage type an azureStorage section is rejected.
        if kubeflow_config.get('azureStorage'):
            print_error('please set storage type!')
            exit(1)
def validate_common_content(experiment_config):
'''Validate whether the common values in experiment_config is valid'''
......@@ -168,7 +193,7 @@ def validate_annotation_content(experiment_config, spec_key, builtin_name):
# validate searchSpaceFile
if experiment_config[spec_key].get(builtin_name):
if experiment_config.get('searchSpacePath') is None:
print_error('Please set searchSpace!')
print_error('Please set searchSpacePath!')
exit(1)
validate_search_space_content(experiment_config)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment