Unverified Commit 88ef6c04 authored by SparkSnail's avatar SparkSnail Committed by GitHub
Browse files

Merge pull request #197 from microsoft/master

merge master
parents 5f3c5ffd 555334de
This diff is collapsed.
**Automatic Feature Engineering in nni**
===
We now have an [example](https://github.com/SpongebBob/tabular_automl_NNI) that automatically performs feature engineering in NNI.
This code comes from our contributors. Many thanks to them!
We welcome more people to join us!
...@@ -7,7 +7,9 @@ def random_archi_generator(nas_ss, random_state): ...@@ -7,7 +7,9 @@ def random_archi_generator(nas_ss, random_state):
''' '''
chosen_archi = {} chosen_archi = {}
print("zql: nas search space: ", nas_ss) print("zql: nas search space: ", nas_ss)
for block_name, block in nas_ss.items(): for block_name, block_value in nas_ss.items():
assert block_value['_type'] == "mutable_layer", "Random NAS Tuner only receives NAS search space whose _type is 'mutable_layer'"
block = block_value['_value']
tmp_block = {} tmp_block = {}
for layer_name, layer in block.items(): for layer_name, layer in block.items():
tmp_layer = {} tmp_layer = {}
......
...@@ -35,9 +35,10 @@ setup( ...@@ -35,9 +35,10 @@ setup(
license = 'MIT', license = 'MIT',
url = 'https://github.com/Microsoft/nni', url = 'https://github.com/Microsoft/nni',
packages = find_packages('src/sdk/pynni', exclude=['tests']) + find_packages('tools'), packages = find_packages('src/sdk/pynni', exclude=['tests']) + find_packages('src/sdk/pycli') + find_packages('tools'),
package_dir = { package_dir = {
'nni': 'src/sdk/pynni/nni', 'nni': 'src/sdk/pynni/nni',
'nnicli': 'src/sdk/pycli/nnicli',
'nni_annotation': 'tools/nni_annotation', 'nni_annotation': 'tools/nni_annotation',
'nni_cmd': 'tools/nni_cmd', 'nni_cmd': 'tools/nni_cmd',
'nni_trial_tool':'tools/nni_trial_tool', 'nni_trial_tool':'tools/nni_trial_tool',
......
...@@ -51,10 +51,12 @@ export namespace ValidationSchemas { ...@@ -51,10 +51,12 @@ export namespace ValidationSchemas {
command: joi.string().min(1), command: joi.string().min(1),
virtualCluster: joi.string(), virtualCluster: joi.string(),
shmMB: joi.number(), shmMB: joi.number(),
authFile: joi.string(),
nasMode: joi.string().valid('classic_mode', 'enas_mode', 'oneshot_mode'), nasMode: joi.string().valid('classic_mode', 'enas_mode', 'oneshot_mode'),
worker: joi.object({ worker: joi.object({
replicas: joi.number().min(1).required(), replicas: joi.number().min(1).required(),
image: joi.string().min(1), image: joi.string().min(1),
privateRegistryAuthPath: joi.string().min(1),
outputDir: joi.string(), outputDir: joi.string(),
cpuNum: joi.number().min(1), cpuNum: joi.number().min(1),
memoryMB: joi.number().min(100), memoryMB: joi.number().min(100),
...@@ -64,6 +66,7 @@ export namespace ValidationSchemas { ...@@ -64,6 +66,7 @@ export namespace ValidationSchemas {
ps: joi.object({ ps: joi.object({
replicas: joi.number().min(1).required(), replicas: joi.number().min(1).required(),
image: joi.string().min(1), image: joi.string().min(1),
privateRegistryAuthPath: joi.string().min(1),
outputDir: joi.string(), outputDir: joi.string(),
cpuNum: joi.number().min(1), cpuNum: joi.number().min(1),
memoryMB: joi.number().min(100), memoryMB: joi.number().min(100),
...@@ -73,6 +76,7 @@ export namespace ValidationSchemas { ...@@ -73,6 +76,7 @@ export namespace ValidationSchemas {
master: joi.object({ master: joi.object({
replicas: joi.number().min(1).required(), replicas: joi.number().min(1).required(),
image: joi.string().min(1), image: joi.string().min(1),
privateRegistryAuthPath: joi.string().min(1),
outputDir: joi.string(), outputDir: joi.string(),
cpuNum: joi.number().min(1), cpuNum: joi.number().min(1),
memoryMB: joi.number().min(100), memoryMB: joi.number().min(100),
...@@ -83,6 +87,7 @@ export namespace ValidationSchemas { ...@@ -83,6 +87,7 @@ export namespace ValidationSchemas {
name: joi.string().min(1), name: joi.string().min(1),
taskNum: joi.number().min(1).required(), taskNum: joi.number().min(1).required(),
image: joi.string().min(1), image: joi.string().min(1),
privateRegistryAuthPath: joi.string().min(1),
outputDir: joi.string(), outputDir: joi.string(),
cpuNum: joi.number().min(1), cpuNum: joi.number().min(1),
memoryMB: joi.number().min(100), memoryMB: joi.number().min(100),
......
...@@ -43,8 +43,8 @@ export class FrameworkControllerTrialConfigTemplate extends KubernetesTrialConfi ...@@ -43,8 +43,8 @@ export class FrameworkControllerTrialConfigTemplate extends KubernetesTrialConfi
public readonly taskNum: number; public readonly taskNum: number;
constructor(taskNum: number, command : string, gpuNum : number, constructor(taskNum: number, command : string, gpuNum : number,
cpuNum: number, memoryMB: number, image: string, cpuNum: number, memoryMB: number, image: string,
frameworkAttemptCompletionPolicy: FrameworkAttemptCompletionPolicy) { frameworkAttemptCompletionPolicy: FrameworkAttemptCompletionPolicy, privateRegistryFilePath?: string | undefined) {
super(command, gpuNum, cpuNum, memoryMB, image); super(command, gpuNum, cpuNum, memoryMB, image, privateRegistryFilePath);
this.frameworkAttemptCompletionPolicy = frameworkAttemptCompletionPolicy; this.frameworkAttemptCompletionPolicy = frameworkAttemptCompletionPolicy;
this.name = name; this.name = name;
this.taskNum = taskNum; this.taskNum = taskNum;
......
...@@ -305,7 +305,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -305,7 +305,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
} }
// Generate frameworkcontroller job resource config object // Generate frameworkcontroller job resource config object
const frameworkcontrollerJobConfig: any = const frameworkcontrollerJobConfig: any =
this.generateFrameworkControllerJobConfig(trialJobId, trialWorkingFolder, frameworkcontrollerJobName, podResources); await this.generateFrameworkControllerJobConfig(trialJobId, trialWorkingFolder, frameworkcontrollerJobName, podResources);
return Promise.resolve(frameworkcontrollerJobConfig); return Promise.resolve(frameworkcontrollerJobConfig);
} }
...@@ -329,8 +329,8 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -329,8 +329,8 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
* @param frameworkcontrollerJobName job name * @param frameworkcontrollerJobName job name
* @param podResources pod template * @param podResources pod template
*/ */
private generateFrameworkControllerJobConfig(trialJobId: string, trialWorkingFolder: string, private async generateFrameworkControllerJobConfig(trialJobId: string, trialWorkingFolder: string,
frameworkcontrollerJobName : string, podResources : any) : any { frameworkcontrollerJobName : string, podResources : any) : Promise<any> {
if (this.fcClusterConfig === undefined) { if (this.fcClusterConfig === undefined) {
throw new Error('frameworkcontroller Cluster config is not initialized'); throw new Error('frameworkcontroller Cluster config is not initialized');
} }
...@@ -345,12 +345,14 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -345,12 +345,14 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
if (containerPort === undefined) { if (containerPort === undefined) {
throw new Error('Container port is not initialized'); throw new Error('Container port is not initialized');
} }
const taskRole: any = this.generateTaskRoleConfig( const taskRole: any = this.generateTaskRoleConfig(
trialWorkingFolder, trialWorkingFolder,
this.fcTrialConfig.taskRoles[index].image, this.fcTrialConfig.taskRoles[index].image,
`run_${this.fcTrialConfig.taskRoles[index].name}.sh`, `run_${this.fcTrialConfig.taskRoles[index].name}.sh`,
podResources[index], podResources[index],
containerPort containerPort,
await this.createRegistrySecret(this.fcTrialConfig.taskRoles[index].privateRegistryAuthPath)
); );
taskRoles.push({ taskRoles.push({
name: this.fcTrialConfig.taskRoles[index].name, name: this.fcTrialConfig.taskRoles[index].name,
...@@ -363,7 +365,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -363,7 +365,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
}); });
} }
return { return Promise.resolve({
apiVersion: `frameworkcontroller.microsoft.com/v1`, apiVersion: `frameworkcontroller.microsoft.com/v1`,
kind: 'Framework', kind: 'Framework',
metadata: { metadata: {
...@@ -379,11 +381,11 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -379,11 +381,11 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
executionType: 'Start', executionType: 'Start',
taskRoles: taskRoles taskRoles: taskRoles
} }
}; });
} }
private generateTaskRoleConfig(trialWorkingFolder: string, replicaImage: string, runScriptFile: string, private generateTaskRoleConfig(trialWorkingFolder: string, replicaImage: string, runScriptFile: string,
podResources: any, containerPort: number): any { podResources: any, containerPort: number, privateRegistrySecretName: string | undefined): any {
if (this.fcClusterConfig === undefined) { if (this.fcClusterConfig === undefined) {
throw new Error('frameworkcontroller Cluster config is not initialized'); throw new Error('frameworkcontroller Cluster config is not initialized');
} }
...@@ -451,13 +453,22 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -451,13 +453,22 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
mountPath: '/mnt/frameworkbarrier' mountPath: '/mnt/frameworkbarrier'
}] }]
}]; }];
const spec: any = {
containers: containers, let spec: any = {
initContainers: initContainers, containers: containers,
restartPolicy: 'OnFailure', initContainers: initContainers,
volumes: volumeSpecMap.get('nniVolumes'), restartPolicy: 'OnFailure',
hostNetwork: false volumes: volumeSpecMap.get('nniVolumes'),
hostNetwork: false
}; };
if(privateRegistrySecretName) {
spec.imagePullSecrets = [
{
name: privateRegistrySecretName
}
]
}
if (this.fcClusterConfig.serviceAccountName !== undefined) { if (this.fcClusterConfig.serviceAccountName !== undefined) {
spec.serviceAccountName = this.fcClusterConfig.serviceAccountName; spec.serviceAccountName = this.fcClusterConfig.serviceAccountName;
} }
......
...@@ -135,8 +135,8 @@ export class KubeflowTrialConfig extends KubernetesTrialConfig { ...@@ -135,8 +135,8 @@ export class KubeflowTrialConfig extends KubernetesTrialConfig {
export class KubeflowTrialConfigTemplate extends KubernetesTrialConfigTemplate { export class KubeflowTrialConfigTemplate extends KubernetesTrialConfigTemplate {
public readonly replicas: number; public readonly replicas: number;
constructor(replicas: number, command : string, gpuNum : number, constructor(replicas: number, command : string, gpuNum : number,
cpuNum: number, memoryMB: number, image: string) { cpuNum: number, memoryMB: number, image: string, privateRegistryAuthPath?: string) {
super(command, gpuNum, cpuNum, memoryMB, image); super(command, gpuNum, cpuNum, memoryMB, image, privateRegistryAuthPath);
this.replicas = replicas; this.replicas = replicas;
} }
} }
......
...@@ -347,7 +347,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -347,7 +347,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
} }
// Generate kubeflow job resource config object // Generate kubeflow job resource config object
const kubeflowJobConfig: any = this.generateKubeflowJobConfig(trialJobId, trialWorkingFolder, kubeflowJobName, workerPodResources, const kubeflowJobConfig: any = await this.generateKubeflowJobConfig(trialJobId, trialWorkingFolder, kubeflowJobName, workerPodResources,
nonWorkerResources); nonWorkerResources);
return Promise.resolve(kubeflowJobConfig); return Promise.resolve(kubeflowJobConfig);
...@@ -361,8 +361,8 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -361,8 +361,8 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
* @param workerPodResources worker pod template * @param workerPodResources worker pod template
* @param nonWorkerPodResources non-worker pod template, like ps or master * @param nonWorkerPodResources non-worker pod template, like ps or master
*/ */
private generateKubeflowJobConfig(trialJobId: string, trialWorkingFolder: string, kubeflowJobName : string, workerPodResources : any, private async generateKubeflowJobConfig(trialJobId: string, trialWorkingFolder: string, kubeflowJobName : string, workerPodResources : any,
nonWorkerPodResources?: any) : any { nonWorkerPodResources?: any) : Promise<any> {
if (this.kubeflowClusterConfig === undefined) { if (this.kubeflowClusterConfig === undefined) {
throw new Error('Kubeflow Cluster config is not initialized'); throw new Error('Kubeflow Cluster config is not initialized');
} }
...@@ -377,29 +377,32 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -377,29 +377,32 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
const replicaSpecsObj: any = {}; const replicaSpecsObj: any = {};
const replicaSpecsObjMap: Map<string, object> = new Map<string, object>(); const replicaSpecsObjMap: Map<string, object> = new Map<string, object>();
if (this.kubeflowTrialConfig.operatorType === 'tf-operator') { if (this.kubeflowTrialConfig.operatorType === 'tf-operator') {
const tensorflowTrialConfig: KubeflowTrialConfigTensorflow = <KubeflowTrialConfigTensorflow>this.kubeflowTrialConfig; const tensorflowTrialConfig: KubeflowTrialConfigTensorflow = <KubeflowTrialConfigTensorflow>this.kubeflowTrialConfig;
let privateRegistrySecretName = await this.createRegistrySecret(tensorflowTrialConfig.worker.privateRegistryAuthPath);
replicaSpecsObj.Worker = this.generateReplicaConfig(trialWorkingFolder, tensorflowTrialConfig.worker.replicas, replicaSpecsObj.Worker = this.generateReplicaConfig(trialWorkingFolder, tensorflowTrialConfig.worker.replicas,
tensorflowTrialConfig.worker.image, 'run_worker.sh', workerPodResources); tensorflowTrialConfig.worker.image, 'run_worker.sh', workerPodResources, privateRegistrySecretName);
if (tensorflowTrialConfig.ps !== undefined) { if (tensorflowTrialConfig.ps !== undefined) {
let privateRegistrySecretName: string | undefined = await this.createRegistrySecret(tensorflowTrialConfig.ps.privateRegistryAuthPath);
replicaSpecsObj.Ps = this.generateReplicaConfig(trialWorkingFolder, tensorflowTrialConfig.ps.replicas, replicaSpecsObj.Ps = this.generateReplicaConfig(trialWorkingFolder, tensorflowTrialConfig.ps.replicas,
tensorflowTrialConfig.ps.image, 'run_ps.sh', nonWorkerPodResources); tensorflowTrialConfig.ps.image, 'run_ps.sh', nonWorkerPodResources, privateRegistrySecretName);
} }
replicaSpecsObjMap.set(this.kubernetesCRDClient.jobKind, {tfReplicaSpecs: replicaSpecsObj}); replicaSpecsObjMap.set(this.kubernetesCRDClient.jobKind, {tfReplicaSpecs: replicaSpecsObj});
} else if (this.kubeflowTrialConfig.operatorType === 'pytorch-operator') { } else if (this.kubeflowTrialConfig.operatorType === 'pytorch-operator') {
const pytorchTrialConfig: KubeflowTrialConfigPytorch = <KubeflowTrialConfigPytorch>this.kubeflowTrialConfig; const pytorchTrialConfig: KubeflowTrialConfigPytorch = <KubeflowTrialConfigPytorch>this.kubeflowTrialConfig;
if (pytorchTrialConfig.worker !== undefined) { if (pytorchTrialConfig.worker !== undefined) {
let privateRegistrySecretName: string | undefined = await this.createRegistrySecret(pytorchTrialConfig.worker.privateRegistryAuthPath);
replicaSpecsObj.Worker = this.generateReplicaConfig(trialWorkingFolder, pytorchTrialConfig.worker.replicas, replicaSpecsObj.Worker = this.generateReplicaConfig(trialWorkingFolder, pytorchTrialConfig.worker.replicas,
pytorchTrialConfig.worker.image, 'run_worker.sh', workerPodResources); pytorchTrialConfig.worker.image, 'run_worker.sh', workerPodResources, privateRegistrySecretName);
} }
let privateRegistrySecretName: string | undefined = await this.createRegistrySecret(pytorchTrialConfig.master.privateRegistryAuthPath);
replicaSpecsObj.Master = this.generateReplicaConfig(trialWorkingFolder, pytorchTrialConfig.master.replicas, replicaSpecsObj.Master = this.generateReplicaConfig(trialWorkingFolder, pytorchTrialConfig.master.replicas,
pytorchTrialConfig.master.image, 'run_master.sh', nonWorkerPodResources); pytorchTrialConfig.master.image, 'run_master.sh', nonWorkerPodResources, privateRegistrySecretName);
replicaSpecsObjMap.set(this.kubernetesCRDClient.jobKind, {pytorchReplicaSpecs: replicaSpecsObj}); replicaSpecsObjMap.set(this.kubernetesCRDClient.jobKind, {pytorchReplicaSpecs: replicaSpecsObj});
} }
return { return Promise.resolve({
apiVersion: `kubeflow.org/${this.kubernetesCRDClient.apiVersion}`, apiVersion: `kubeflow.org/${this.kubernetesCRDClient.apiVersion}`,
kind: this.kubernetesCRDClient.jobKind, kind: this.kubernetesCRDClient.jobKind,
metadata: { metadata: {
...@@ -412,7 +415,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -412,7 +415,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
} }
}, },
spec: replicaSpecsObjMap.get(this.kubernetesCRDClient.jobKind) spec: replicaSpecsObjMap.get(this.kubernetesCRDClient.jobKind)
}; });
} }
/** /**
...@@ -424,7 +427,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -424,7 +427,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
* @param podResources pod resource config section * @param podResources pod resource config section
*/ */
private generateReplicaConfig(trialWorkingFolder: string, replicaNumber: number, replicaImage: string, runScriptFile: string, private generateReplicaConfig(trialWorkingFolder: string, replicaNumber: number, replicaImage: string, runScriptFile: string,
podResources: any): any { podResources: any, privateRegistrySecretName: string | undefined): any {
if (this.kubeflowClusterConfig === undefined) { if (this.kubeflowClusterConfig === undefined) {
throw new Error('Kubeflow Cluster config is not initialized'); throw new Error('Kubeflow Cluster config is not initialized');
} }
...@@ -436,7 +439,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -436,7 +439,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
if (this.kubernetesCRDClient === undefined) { if (this.kubernetesCRDClient === undefined) {
throw new Error('Kubeflow operator client is not initialized'); throw new Error('Kubeflow operator client is not initialized');
} }
// The config spec for volume field
const volumeSpecMap: Map<string, object> = new Map<string, object>(); const volumeSpecMap: Map<string, object> = new Map<string, object>();
if (this.kubeflowClusterConfig.storageType === 'azureStorage') { if (this.kubeflowClusterConfig.storageType === 'azureStorage') {
volumeSpecMap.set('nniVolumes', [ volumeSpecMap.set('nniVolumes', [
...@@ -459,7 +462,34 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -459,7 +462,34 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
} }
}]); }]);
} }
// The config spec for container field
const containersSpecMap: Map<string, object> = new Map<string, object>();
containersSpecMap.set('containers', [
{
// Kubeflow tensorflow operator requires that containers' name must be tensorflow
// TODO: change the name based on operator's type
name: this.kubernetesCRDClient.containerName,
image: replicaImage,
args: ['sh', `${path.join(trialWorkingFolder, runScriptFile)}`],
volumeMounts: [
{
name: 'nni-vol',
mountPath: this.CONTAINER_MOUNT_PATH
}],
resources: podResources
}
]);
let spec: any = {
containers: containersSpecMap.get('containers'),
restartPolicy: 'ExitCode',
volumes: volumeSpecMap.get('nniVolumes')
}
if (privateRegistrySecretName) {
spec.imagePullSecrets = [
{
name: privateRegistrySecretName
}]
}
return { return {
replicas: replicaNumber, replicas: replicaNumber,
template: { template: {
...@@ -467,26 +497,9 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -467,26 +497,9 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
// tslint:disable-next-line:no-null-keyword // tslint:disable-next-line:no-null-keyword
creationTimestamp: null creationTimestamp: null
}, },
spec: { spec: spec
containers: [
{
// Kubeflow tensorflow operator requires that containers' name must be tensorflow
// TODO: change the name based on operator's type
name: this.kubernetesCRDClient.containerName,
image: replicaImage,
args: ['sh', `${path.join(trialWorkingFolder, runScriptFile)}`],
volumeMounts: [
{
name: 'nni-vol',
mountPath: this.CONTAINER_MOUNT_PATH
}],
resources: podResources
}],
restartPolicy: 'ExitCode',
volumes: volumeSpecMap.get('nniVolumes')
}
} }
}; }
} }
} }
// tslint:enable: no-unsafe-any no-any // tslint:enable: no-unsafe-any no-any
......
...@@ -179,6 +179,9 @@ export class KubernetesTrialConfigTemplate { ...@@ -179,6 +179,9 @@ export class KubernetesTrialConfigTemplate {
// Docker image // Docker image
public readonly image: string; public readonly image: string;
// Private registry config file path used to download the docker image
public readonly privateRegistryAuthPath?: string;
// Trial command // Trial command
public readonly command : string; public readonly command : string;
...@@ -186,12 +189,13 @@ export class KubernetesTrialConfigTemplate { ...@@ -186,12 +189,13 @@ export class KubernetesTrialConfigTemplate {
public readonly gpuNum : number; public readonly gpuNum : number;
constructor(command : string, gpuNum : number, constructor(command : string, gpuNum : number,
cpuNum: number, memoryMB: number, image: string) { cpuNum: number, memoryMB: number, image: string, privateRegistryAuthPath?: string) {
this.command = command; this.command = command;
this.gpuNum = gpuNum; this.gpuNum = gpuNum;
this.cpuNum = cpuNum; this.cpuNum = cpuNum;
this.memoryMB = memoryMB; this.memoryMB = memoryMB;
this.image = image; this.image = image;
this.privateRegistryAuthPath = privateRegistryAuthPath;
} }
} }
......
...@@ -38,6 +38,8 @@ import { KubernetesClusterConfig } from './kubernetesConfig'; ...@@ -38,6 +38,8 @@ import { KubernetesClusterConfig } from './kubernetesConfig';
import { kubernetesScriptFormat, KubernetesTrialJobDetail } from './kubernetesData'; import { kubernetesScriptFormat, KubernetesTrialJobDetail } from './kubernetesData';
import { KubernetesJobRestServer } from './kubernetesJobRestServer'; import { KubernetesJobRestServer } from './kubernetesJobRestServer';
var fs = require('fs');
/** /**
* Training Service implementation for Kubernetes * Training Service implementation for Kubernetes
*/ */
...@@ -327,5 +329,34 @@ abstract class KubernetesTrainingService { ...@@ -327,5 +329,34 @@ abstract class KubernetesTrainingService {
return Promise.resolve(); return Promise.resolve();
} }
/**
 * Create a Kubernetes secret of type `kubernetes.io/dockerconfigjson` from a local auth file.
 *
 * The file is read synchronously and stored base64-encoded under the
 * `.dockerconfigjson` key. The secret is created in the `default` namespace and
 * labelled with the NNI trial label and the current experiment id.
 *
 * @param filePath path of the registry auth file; `undefined` or `''` means nothing is created
 * @returns the name of the created secret, or `undefined` when no file path was given
 */
protected async createRegistrySecret(filePath: string | undefined): Promise<string | undefined> {
    // No auth file configured: there is no secret to create.
    if (filePath === undefined || filePath === '') {
        return undefined;
    }

    // Secret data values must be base64-encoded.
    const encodedAuthFile = fs.readFileSync(filePath).toString('base64');
    const registrySecretName = String.Format('nni-secret-{0}', uniqueString(8).toLowerCase());

    await this.genericK8sClient.createSecret({
        apiVersion: 'v1',
        kind: 'Secret',
        metadata: {
            name: registrySecretName,
            namespace: 'default',
            labels: {
                app: this.NNI_KUBERNETES_TRIAL_LABEL,
                expId: getExperimentId()
            }
        },
        type: 'kubernetes.io/dockerconfigjson',
        data: {
            '.dockerconfigjson': encodedAuthFile
        }
    });

    return registrySecretName;
}
} }
export { KubernetesTrainingService }; export { KubernetesTrainingService };
...@@ -71,6 +71,8 @@ export class PAIJobConfig { ...@@ -71,6 +71,8 @@ export class PAIJobConfig {
public readonly image: string; public readonly image: string;
// Code directory on HDFS // Code directory on HDFS
public readonly codeDir: string; public readonly codeDir: string;
//authentication file used for private Docker registry
public readonly authFile?: string;
// List of taskRole, one task role at least // List of taskRole, one task role at least
public taskRoles: PAITaskRole[]; public taskRoles: PAITaskRole[];
...@@ -87,12 +89,13 @@ export class PAIJobConfig { ...@@ -87,12 +89,13 @@ export class PAIJobConfig {
* @param taskRoles List of taskRole, one task role at least * @param taskRoles List of taskRole, one task role at least
*/ */
constructor(jobName: string, image : string, codeDir : string, constructor(jobName: string, image : string, codeDir : string,
taskRoles : PAITaskRole[], virtualCluster: string) { taskRoles : PAITaskRole[], virtualCluster: string, authFile?: string) {
this.jobName = jobName; this.jobName = jobName;
this.image = image; this.image = image;
this.codeDir = codeDir; this.codeDir = codeDir;
this.taskRoles = taskRoles; this.taskRoles = taskRoles;
this.virtualCluster = virtualCluster; this.virtualCluster = virtualCluster;
this.authFile = authFile;
} }
} }
...@@ -129,14 +132,17 @@ export class NNIPAITrialConfig extends TrialConfig { ...@@ -129,14 +132,17 @@ export class NNIPAITrialConfig extends TrialConfig {
public virtualCluster?: string; public virtualCluster?: string;
//Shared memory for one task in the task role //Shared memory for one task in the task role
public shmMB?: number; public shmMB?: number;
//authentication file used for private Docker registry
public authFile?: string;
constructor(command : string, codeDir : string, gpuNum : number, cpuNum: number, memoryMB: number, constructor(command : string, codeDir : string, gpuNum : number, cpuNum: number, memoryMB: number,
image: string, virtualCluster?: string, shmMB?: number) { image: string, virtualCluster?: string, shmMB?: number, authFile?: string) {
super(command, codeDir, gpuNum); super(command, codeDir, gpuNum);
this.cpuNum = cpuNum; this.cpuNum = cpuNum;
this.memoryMB = memoryMB; this.memoryMB = memoryMB;
this.image = image; this.image = image;
this.virtualCluster = virtualCluster; this.virtualCluster = virtualCluster;
this.shmMB = shmMB; this.shmMB = shmMB;
this.authFile = authFile;
} }
} }
...@@ -442,7 +442,7 @@ class PAITrainingService implements TrainingService { ...@@ -442,7 +442,7 @@ class PAITrainingService implements TrainingService {
// Task command // Task command
nniPaiTrialCommand, nniPaiTrialCommand,
// Task shared memory // Task shared memory
this.paiTrialConfig.shmMB this.paiTrialConfig.shmMB,
) )
]; ];
...@@ -456,7 +456,9 @@ class PAITrainingService implements TrainingService { ...@@ -456,7 +456,9 @@ class PAITrainingService implements TrainingService {
// PAI Task roles // PAI Task roles
paiTaskRoles, paiTaskRoles,
// Add Virtual Cluster // Add Virtual Cluster
this.paiTrialConfig.virtualCluster === undefined ? 'default' : this.paiTrialConfig.virtualCluster.toString() this.paiTrialConfig.virtualCluster === undefined ? 'default' : this.paiTrialConfig.virtualCluster.toString(),
//Task auth File
this.paiTrialConfig.authFile
); );
// Step 2. Upload code files in codeDir onto HDFS // Step 2. Upload code files in codeDir onto HDFS
......
# Copyright (c) Microsoft Corporation. All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
# associated documentation files (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge, publish, distribute,
# sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or
# substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
# NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT
# OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# ==================================================================================================
from .nni_client import *
# Copyright (c) Microsoft Corporation. All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
# associated documentation files (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge, publish, distribute,
# sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or
# substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
# NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT
# OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# ==================================================================================================
""" A python wrapper for nni rest api
Example:
import nnicli as nc
nc.start_nni('../../../../examples/trials/mnist/config.yml')
nc.set_endpoint('http://localhost:8080')
print(nc.version())
print(nc.get_experiment_status())
print(nc.get_job_statistics())
print(nc.list_trial_jobs())
nc.stop_nni()
"""
import sys
import os
import subprocess
import requests
# Public API of the nnicli module: the names exported by ``from nnicli import *``.
__all__ = [
    # experiment lifecycle
    'start_nni',
    'stop_nni',
    # REST client configuration
    'set_endpoint',
    # read-only REST queries
    'version',
    'get_experiment_status',
    'get_experiment_profile',
    'get_trial_job',
    'list_trial_jobs',
    'get_job_statistics',
    'get_job_metrics',
    'export_data',
]
# Root under which every REST resource below is addressed.
API_ROOT_PATH = 'api/v1/nni'

# Resource paths, relative to API_ROOT_PATH.
EXPERIMENT_PATH = 'experiment'
VERSION_PATH = 'version'
STATUS_PATH = 'check-status'
JOB_STATISTICS_PATH = 'job-statistics'
TRIAL_JOBS_PATH = 'trial-jobs'
METRICS_PATH = 'metric-data'
EXPORT_DATA_PATH = 'export-data'

# Base URL of the REST server; stays None until set_endpoint() is called.
_api_endpoint = None
def set_endpoint(endpoint):
    """Remember the base URL of the NNI REST server for later requests.

    The value is stored in the module-level ``_api_endpoint`` variable,
    for example ``http://localhost:8080``.
    """
    global _api_endpoint
    _api_endpoint = endpoint
def _check_endpoint():
    """Raise AssertionError if no endpoint has been configured yet."""
    endpoint_missing = _api_endpoint is None
    if endpoint_missing:
        raise AssertionError("Please call set_endpoint to specify nni endpoint")
def _nni_rest_get(api_path, response_type='json'):
    """Send a GET request to the NNI REST API and decode the response.

    Parameters
    ----------
    api_path : str
        Resource path appended to ``<endpoint>/<API_ROOT_PATH>/``.
    response_type : str
        'json' to return the parsed JSON body, 'text' to return the raw text.

    Returns
    -------
    The decoded body when the status code is accepted by ``_http_succeed``,
    otherwise None.

    Raises
    ------
    AssertionError
        If no endpoint has been set, or if the request succeeded but
        ``response_type`` is neither 'json' nor 'text'.
    """
    _check_endpoint()
    url = '{}/{}/{}'.format(_api_endpoint, API_ROOT_PATH, api_path)
    response = requests.get(url)

    # A failed request is reported as None, whatever response_type was asked for.
    if not _http_succeed(response.status_code):
        return None

    if response_type == 'json':
        return response.json()
    if response_type == 'text':
        return response.text
    raise AssertionError('Incorrect response_type')
def _http_succeed(status_code):
return status_code // 100 == 2
def _create_process(cmd):
if sys.platform == 'win32':
process = subprocess.Popen(cmd, stdout=subprocess.PIPE, creationflags=subprocess.CREATE_NEW_PROCESS_GROUP)
else:
process = subprocess.Popen(cmd, stdout=subprocess.PIPE)
while process.poll() is None:
output = process.stdout.readline()
if output:
print(output.decode('utf-8').strip())
return process.returncode
def start_nni(config_file):
    """Start an NNI experiment with the specified configuration file.

    Parameters
    ----------
    config_file : str
        Path of the experiment configuration file passed to
        ``nnictl create --config``.

    Raises
    ------
    RuntimeError
        If ``nnictl`` exits with a non-zero status.
    """
    # Build the argument vector directly. Formatting a string and splitting it
    # on spaces would break a config path that itself contains a space.
    cmd = ['nnictl', 'create', '--config', str(config_file)]
    if _create_process(cmd) != 0:
        raise RuntimeError('Failed to start nni.')
def stop_nni():
    """Stop the NNI experiment by running ``nnictl stop``.

    Raises RuntimeError when the command exits with a non-zero status.
    """
    exit_code = _create_process(['nnictl', 'stop'])
    if exit_code != 0:
        raise RuntimeError('Failed to stop nni.')
def version():
    """Return the nni version reported by the rest server.

    The response body is returned as text; None is returned when the
    server does not answer with a 2xx status.
    """
    nni_version = _nni_rest_get(VERSION_PATH, response_type='text')
    return nni_version
def get_experiment_status():
    """Return the experiment status.

    The value is the JSON-decoded response of the status resource
    (expected to be a dict), or None on a non-2xx response.
    """
    status = _nni_rest_get(STATUS_PATH)
    return status
def get_experiment_profile():
    """Return the experiment profile.

    The value is the JSON-decoded response of the experiment resource
    (expected to be a dict), or None on a non-2xx response.
    """
    profile = _nni_rest_get(EXPERIMENT_PATH)
    return profile
def get_trial_job(trial_job_id):
    """Return information about one trial job.

    Parameters
    ----------
    trial_job_id : str
        Identifier of the trial job; must not be None.

    Returns
    -------
    The JSON-decoded response (expected to be a dict), or None on a
    non-2xx response.

    Raises
    ------
    AssertionError
        If trial_job_id is None.
    """
    # Raise explicitly rather than with an assert statement so the check
    # still runs when Python is started with -O.
    if trial_job_id is None:
        raise AssertionError('trial_job_id must not be None')
    # URL segments are always separated by '/'; os.path.join would use the
    # platform separator and produce a backslash on Windows.
    return _nni_rest_get('{}/{}'.format(TRIAL_JOBS_PATH, trial_job_id))
def list_trial_jobs():
    """Return information for all trial jobs.

    The value is the JSON-decoded response of the trial-jobs resource
    (expected to be a list), or None on a non-2xx response.
    """
    trial_jobs = _nni_rest_get(TRIAL_JOBS_PATH)
    return trial_jobs
def get_job_statistics():
    """Return trial job statistics.

    The value is the JSON-decoded response of the job-statistics resource
    (expected to be a dict), or None on a non-2xx response.
    """
    statistics = _nni_rest_get(JOB_STATISTICS_PATH)
    return statistics
def get_job_metrics(trial_job_id=None):
    """Return trial job metrics.

    Parameters
    ----------
    trial_job_id : str, optional
        When given, only the metrics resource of that trial job is
        requested; when None, the whole metrics resource is requested.

    Returns
    -------
    The JSON-decoded response, or None on a non-2xx response.
    """
    if trial_job_id is None:
        api_path = METRICS_PATH
    else:
        # URL segments are always separated by '/'; os.path.join would use
        # the platform separator and produce a backslash on Windows.
        api_path = '{}/{}'.format(METRICS_PATH, trial_job_id)
    return _nni_rest_get(api_path)
def export_data():
    """Return the exported information for all trial jobs.

    The value is the JSON-decoded response of the export-data resource,
    or None on a non-2xx response.
    """
    exported = _nni_rest_get(EXPORT_DATA_PATH)
    return exported
import setuptools

# Packaging metadata for the standalone nnicli distribution.
setuptools.setup(
    name='nnicli',
    # NOTE(review): looks like a development placeholder version,
    # presumably replaced at release time - confirm.
    version='999.0.0-developing',
    packages=setuptools.find_packages(),

    # Runtime requirements.
    python_requires='>=3.5',
    install_requires=[
        'requests',
    ],

    # Project information.
    author='Microsoft NNI Team',
    author_email='nni@microsoft.com',
    description='nnicli for Neural Network Intelligence project',
    license='MIT',
    url='https://github.com/Microsoft/nni',
)
...@@ -190,13 +190,19 @@ class HyperoptTuner(Tuner): ...@@ -190,13 +190,19 @@ class HyperoptTuner(Tuner):
HyperoptTuner is a tuner which using hyperopt algorithm. HyperoptTuner is a tuner which using hyperopt algorithm.
""" """
def __init__(self, algorithm_name, optimize_mode='minimize'): def __init__(self, algorithm_name, optimize_mode='minimize',
parallel_optimize=False, constant_liar_type='min'):
""" """
Parameters Parameters
---------- ----------
algorithm_name : str algorithm_name : str
algorithm_name includes "tpe", "random_search" and anneal". algorithm_name includes "tpe", "random_search" and anneal".
optimize_mode : str optimize_mode : str
parallel_optimize : bool
More detail could reference: docs/en_US/Tuner/HyperoptTuner.md
constant_liar_type : str
constant_liar_type including "min", "max" and "mean"
More detail could reference: docs/en_US/Tuner/HyperoptTuner.md
""" """
self.algorithm_name = algorithm_name self.algorithm_name = algorithm_name
self.optimize_mode = OptimizeMode(optimize_mode) self.optimize_mode = OptimizeMode(optimize_mode)
...@@ -205,6 +211,13 @@ class HyperoptTuner(Tuner): ...@@ -205,6 +211,13 @@ class HyperoptTuner(Tuner):
self.rval = None self.rval = None
self.supplement_data_num = 0 self.supplement_data_num = 0
self.parallel = parallel_optimize
if self.parallel:
self.CL_rval = None
self.constant_liar_type = constant_liar_type
self.running_data = []
self.optimal_y = None
def _choose_tuner(self, algorithm_name): def _choose_tuner(self, algorithm_name):
""" """
Parameters Parameters
...@@ -266,6 +279,10 @@ class HyperoptTuner(Tuner): ...@@ -266,6 +279,10 @@ class HyperoptTuner(Tuner):
# but it can cause deplicate parameter rarely # but it can cause deplicate parameter rarely
total_params = self.get_suggestion(random_search=True) total_params = self.get_suggestion(random_search=True)
self.total_data[parameter_id] = total_params self.total_data[parameter_id] = total_params
if self.parallel:
self.running_data.append(parameter_id)
params = split_index(total_params) params = split_index(total_params)
return params return params
...@@ -287,10 +304,39 @@ class HyperoptTuner(Tuner): ...@@ -287,10 +304,39 @@ class HyperoptTuner(Tuner):
raise RuntimeError('Received parameter_id not in total_data.') raise RuntimeError('Received parameter_id not in total_data.')
params = self.total_data[parameter_id] params = self.total_data[parameter_id]
# code for parallel
if self.parallel:
constant_liar = kwargs.get('constant_liar', False)
if constant_liar:
rval = self.CL_rval
else:
rval = self.rval
self.running_data.remove(parameter_id)
# update the reward of optimal_y
if self.optimal_y is None:
if self.constant_liar_type == 'mean':
self.optimal_y = [reward, 1]
else:
self.optimal_y = reward
else:
if self.constant_liar_type == 'mean':
_sum = self.optimal_y[0] + reward
_number = self.optimal_y[1] + 1
self.optimal_y = [_sum, _number]
elif self.constant_liar_type == 'min':
self.optimal_y = min(self.optimal_y, reward)
elif self.constant_liar_type == 'max':
self.optimal_y = max(self.optimal_y, reward)
logger.debug("Update optimal_y with reward, optimal_y = %s", self.optimal_y)
else:
rval = self.rval
if self.optimize_mode is OptimizeMode.Maximize: if self.optimize_mode is OptimizeMode.Maximize:
reward = -reward reward = -reward
rval = self.rval
domain = rval.domain domain = rval.domain
trials = rval.trials trials = rval.trials
...@@ -375,13 +421,26 @@ class HyperoptTuner(Tuner): ...@@ -375,13 +421,26 @@ class HyperoptTuner(Tuner):
total_params : dict total_params : dict
parameter suggestion parameter suggestion
""" """
if self.parallel and len(self.total_data)>20 and len(self.running_data) and self.optimal_y is not None:
self.CL_rval = copy.deepcopy(self.rval)
if self.constant_liar_type == 'mean':
_constant_liar_y = self.optimal_y[0] / self.optimal_y[1]
else:
_constant_liar_y = self.optimal_y
for _parameter_id in self.running_data:
self.receive_trial_result(parameter_id=_parameter_id, parameters=None, value=_constant_liar_y, constant_liar=True)
rval = self.CL_rval
rval = self.rval random_state = np.random.randint(2**31 - 1)
else:
rval = self.rval
random_state = rval.rstate.randint(2**31 - 1)
trials = rval.trials trials = rval.trials
algorithm = rval.algo algorithm = rval.algo
new_ids = rval.trials.new_trial_ids(1) new_ids = rval.trials.new_trial_ids(1)
rval.trials.refresh() rval.trials.refresh()
random_state = rval.rstate.randint(2**31 - 1)
if random_search: if random_search:
new_trials = hp.rand.suggest(new_ids, rval.domain, trials, new_trials = hp.rand.suggest(new_ids, rval.domain, trials,
random_state) random_state)
......
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge,
# to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
import sys
import time
import traceback
from utils import GREEN, RED, CLEAR, setup_experiment
def test_nni_cli():
    """Smoke-test nnicli against the mnist example experiment.

    Starts the experiment, points nnicli at http://localhost:8080, calls a
    handful of query functions, and always stops the experiment afterwards.
    Any exception is reported and then re-raised.
    """
    import nnicli as nc

    config_file = 'config_test/examples/mnist.test.yml'
    try:
        # Give a previously stopped experiment time to exit so the port
        # is free again.
        time.sleep(6)
        print(''.join((GREEN, 'Testing nnicli:', config_file, CLEAR)))

        nc.start_nni(config_file)
        time.sleep(3)
        nc.set_endpoint('http://localhost:8080')

        nni_version = nc.version()
        print(nni_version)
        job_statistics = nc.get_job_statistics()
        print(job_statistics)
        experiment_status = nc.get_experiment_status()
        print(experiment_status)
        # The trial job list is fetched but not printed.
        nc.list_trial_jobs()

        passed = 'Test nnicli {}: TEST PASS'.format(config_file)
        print(GREEN + passed + CLEAR)
    except Exception as failure:
        failed = 'Test nnicli {}: TEST FAIL'.format(config_file)
        print(RED + failed + CLEAR)
        print('%r' % failure)
        traceback.print_exc()
        raise failure
    finally:
        # Runs on success and on failure alike.
        nc.stop_nni()
if __name__ == '__main__':
    # The experiment is set up as "installed" unless the last command-line
    # argument is --preinstall.
    setup_experiment(sys.argv[-1] != '--preinstall')
    test_nni_cli()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment