Commit 05913424 authored by suiguoxin's avatar suiguoxin
Browse files

Merge branch 'master' into quniform-tuners

parents e3c8552f 1dab3118
...@@ -23,10 +23,6 @@ trial: ...@@ -23,10 +23,6 @@ trial:
memoryMB: 8196 memoryMB: 8196
#The docker image to run nni job on pai #The docker image to run nni job on pai
image: msranni/nni:latest image: msranni/nni:latest
#The hdfs directory to store data on pai, format 'hdfs://host:port/directory'
dataDir: hdfs://10.10.10.10:9000/username/nni
#The hdfs directory to store output data generated by nni, format 'hdfs://host:port/directory'
outputDir: hdfs://10.10.10.10:9000/username/nni
paiConfig: paiConfig:
#The username to login pai #The username to login pai
userName: username userName: username
......
...@@ -7,7 +7,9 @@ def random_archi_generator(nas_ss, random_state): ...@@ -7,7 +7,9 @@ def random_archi_generator(nas_ss, random_state):
''' '''
chosen_archi = {} chosen_archi = {}
print("zql: nas search space: ", nas_ss) print("zql: nas search space: ", nas_ss)
for block_name, block in nas_ss.items(): for block_name, block_value in nas_ss.items():
assert block_value['_type'] == "mutable_layer", "Random NAS Tuner only receives NAS search space whose _type is 'mutable_layer'"
block = block_value['_value']
tmp_block = {} tmp_block = {}
for layer_name, layer in block.items(): for layer_name, layer in block.items():
tmp_layer = {} tmp_layer = {}
......
...@@ -35,9 +35,10 @@ setup( ...@@ -35,9 +35,10 @@ setup(
license = 'MIT', license = 'MIT',
url = 'https://github.com/Microsoft/nni', url = 'https://github.com/Microsoft/nni',
packages = find_packages('src/sdk/pynni', exclude=['tests']) + find_packages('tools'), packages = find_packages('src/sdk/pynni', exclude=['tests']) + find_packages('src/sdk/pycli') + find_packages('tools'),
package_dir = { package_dir = {
'nni': 'src/sdk/pynni/nni', 'nni': 'src/sdk/pynni/nni',
'nnicli': 'src/sdk/pycli/nnicli',
'nni_annotation': 'tools/nni_annotation', 'nni_annotation': 'tools/nni_annotation',
'nni_cmd': 'tools/nni_cmd', 'nni_cmd': 'tools/nni_cmd',
'nni_trial_tool':'tools/nni_trial_tool', 'nni_trial_tool':'tools/nni_trial_tool',
......
...@@ -51,10 +51,12 @@ export namespace ValidationSchemas { ...@@ -51,10 +51,12 @@ export namespace ValidationSchemas {
command: joi.string().min(1), command: joi.string().min(1),
virtualCluster: joi.string(), virtualCluster: joi.string(),
shmMB: joi.number(), shmMB: joi.number(),
authFile: joi.string(),
nasMode: joi.string().valid('classic_mode', 'enas_mode', 'oneshot_mode'), nasMode: joi.string().valid('classic_mode', 'enas_mode', 'oneshot_mode'),
worker: joi.object({ worker: joi.object({
replicas: joi.number().min(1).required(), replicas: joi.number().min(1).required(),
image: joi.string().min(1), image: joi.string().min(1),
privateRegistryAuthPath: joi.string().min(1),
outputDir: joi.string(), outputDir: joi.string(),
cpuNum: joi.number().min(1), cpuNum: joi.number().min(1),
memoryMB: joi.number().min(100), memoryMB: joi.number().min(100),
...@@ -64,6 +66,7 @@ export namespace ValidationSchemas { ...@@ -64,6 +66,7 @@ export namespace ValidationSchemas {
ps: joi.object({ ps: joi.object({
replicas: joi.number().min(1).required(), replicas: joi.number().min(1).required(),
image: joi.string().min(1), image: joi.string().min(1),
privateRegistryAuthPath: joi.string().min(1),
outputDir: joi.string(), outputDir: joi.string(),
cpuNum: joi.number().min(1), cpuNum: joi.number().min(1),
memoryMB: joi.number().min(100), memoryMB: joi.number().min(100),
...@@ -73,6 +76,7 @@ export namespace ValidationSchemas { ...@@ -73,6 +76,7 @@ export namespace ValidationSchemas {
master: joi.object({ master: joi.object({
replicas: joi.number().min(1).required(), replicas: joi.number().min(1).required(),
image: joi.string().min(1), image: joi.string().min(1),
privateRegistryAuthPath: joi.string().min(1),
outputDir: joi.string(), outputDir: joi.string(),
cpuNum: joi.number().min(1), cpuNum: joi.number().min(1),
memoryMB: joi.number().min(100), memoryMB: joi.number().min(100),
...@@ -83,6 +87,7 @@ export namespace ValidationSchemas { ...@@ -83,6 +87,7 @@ export namespace ValidationSchemas {
name: joi.string().min(1), name: joi.string().min(1),
taskNum: joi.number().min(1).required(), taskNum: joi.number().min(1).required(),
image: joi.string().min(1), image: joi.string().min(1),
privateRegistryAuthPath: joi.string().min(1),
outputDir: joi.string(), outputDir: joi.string(),
cpuNum: joi.number().min(1), cpuNum: joi.number().min(1),
memoryMB: joi.number().min(100), memoryMB: joi.number().min(100),
......
...@@ -43,8 +43,8 @@ export class FrameworkControllerTrialConfigTemplate extends KubernetesTrialConfi ...@@ -43,8 +43,8 @@ export class FrameworkControllerTrialConfigTemplate extends KubernetesTrialConfi
public readonly taskNum: number; public readonly taskNum: number;
constructor(taskNum: number, command : string, gpuNum : number, constructor(taskNum: number, command : string, gpuNum : number,
cpuNum: number, memoryMB: number, image: string, cpuNum: number, memoryMB: number, image: string,
frameworkAttemptCompletionPolicy: FrameworkAttemptCompletionPolicy) { frameworkAttemptCompletionPolicy: FrameworkAttemptCompletionPolicy, privateRegistryFilePath?: string | undefined) {
super(command, gpuNum, cpuNum, memoryMB, image); super(command, gpuNum, cpuNum, memoryMB, image, privateRegistryFilePath);
this.frameworkAttemptCompletionPolicy = frameworkAttemptCompletionPolicy; this.frameworkAttemptCompletionPolicy = frameworkAttemptCompletionPolicy;
this.name = name; this.name = name;
this.taskNum = taskNum; this.taskNum = taskNum;
......
...@@ -305,7 +305,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -305,7 +305,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
} }
// Generate frameworkcontroller job resource config object // Generate frameworkcontroller job resource config object
const frameworkcontrollerJobConfig: any = const frameworkcontrollerJobConfig: any =
this.generateFrameworkControllerJobConfig(trialJobId, trialWorkingFolder, frameworkcontrollerJobName, podResources); await this.generateFrameworkControllerJobConfig(trialJobId, trialWorkingFolder, frameworkcontrollerJobName, podResources);
return Promise.resolve(frameworkcontrollerJobConfig); return Promise.resolve(frameworkcontrollerJobConfig);
} }
...@@ -329,8 +329,8 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -329,8 +329,8 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
* @param frameworkcontrollerJobName job name * @param frameworkcontrollerJobName job name
* @param podResources pod template * @param podResources pod template
*/ */
private generateFrameworkControllerJobConfig(trialJobId: string, trialWorkingFolder: string, private async generateFrameworkControllerJobConfig(trialJobId: string, trialWorkingFolder: string,
frameworkcontrollerJobName : string, podResources : any) : any { frameworkcontrollerJobName : string, podResources : any) : Promise<any> {
if (this.fcClusterConfig === undefined) { if (this.fcClusterConfig === undefined) {
throw new Error('frameworkcontroller Cluster config is not initialized'); throw new Error('frameworkcontroller Cluster config is not initialized');
} }
...@@ -345,12 +345,14 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -345,12 +345,14 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
if (containerPort === undefined) { if (containerPort === undefined) {
throw new Error('Container port is not initialized'); throw new Error('Container port is not initialized');
} }
const taskRole: any = this.generateTaskRoleConfig( const taskRole: any = this.generateTaskRoleConfig(
trialWorkingFolder, trialWorkingFolder,
this.fcTrialConfig.taskRoles[index].image, this.fcTrialConfig.taskRoles[index].image,
`run_${this.fcTrialConfig.taskRoles[index].name}.sh`, `run_${this.fcTrialConfig.taskRoles[index].name}.sh`,
podResources[index], podResources[index],
containerPort containerPort,
await this.createRegistrySecret(this.fcTrialConfig.taskRoles[index].privateRegistryAuthPath)
); );
taskRoles.push({ taskRoles.push({
name: this.fcTrialConfig.taskRoles[index].name, name: this.fcTrialConfig.taskRoles[index].name,
...@@ -363,7 +365,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -363,7 +365,7 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
}); });
} }
return { return Promise.resolve({
apiVersion: `frameworkcontroller.microsoft.com/v1`, apiVersion: `frameworkcontroller.microsoft.com/v1`,
kind: 'Framework', kind: 'Framework',
metadata: { metadata: {
...@@ -379,11 +381,11 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -379,11 +381,11 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
executionType: 'Start', executionType: 'Start',
taskRoles: taskRoles taskRoles: taskRoles
} }
}; });
} }
private generateTaskRoleConfig(trialWorkingFolder: string, replicaImage: string, runScriptFile: string, private generateTaskRoleConfig(trialWorkingFolder: string, replicaImage: string, runScriptFile: string,
podResources: any, containerPort: number): any { podResources: any, containerPort: number, privateRegistrySecretName: string | undefined): any {
if (this.fcClusterConfig === undefined) { if (this.fcClusterConfig === undefined) {
throw new Error('frameworkcontroller Cluster config is not initialized'); throw new Error('frameworkcontroller Cluster config is not initialized');
} }
...@@ -451,13 +453,22 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -451,13 +453,22 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
mountPath: '/mnt/frameworkbarrier' mountPath: '/mnt/frameworkbarrier'
}] }]
}]; }];
const spec: any = {
let spec: any = {
containers: containers, containers: containers,
initContainers: initContainers, initContainers: initContainers,
restartPolicy: 'OnFailure', restartPolicy: 'OnFailure',
volumes: volumeSpecMap.get('nniVolumes'), volumes: volumeSpecMap.get('nniVolumes'),
hostNetwork: false hostNetwork: false
}; };
if(privateRegistrySecretName) {
spec.imagePullSecrets = [
{
name: privateRegistrySecretName
}
]
}
if (this.fcClusterConfig.serviceAccountName !== undefined) { if (this.fcClusterConfig.serviceAccountName !== undefined) {
spec.serviceAccountName = this.fcClusterConfig.serviceAccountName; spec.serviceAccountName = this.fcClusterConfig.serviceAccountName;
} }
......
...@@ -135,8 +135,8 @@ export class KubeflowTrialConfig extends KubernetesTrialConfig { ...@@ -135,8 +135,8 @@ export class KubeflowTrialConfig extends KubernetesTrialConfig {
export class KubeflowTrialConfigTemplate extends KubernetesTrialConfigTemplate { export class KubeflowTrialConfigTemplate extends KubernetesTrialConfigTemplate {
public readonly replicas: number; public readonly replicas: number;
constructor(replicas: number, command : string, gpuNum : number, constructor(replicas: number, command : string, gpuNum : number,
cpuNum: number, memoryMB: number, image: string) { cpuNum: number, memoryMB: number, image: string, privateRegistryAuthPath?: string) {
super(command, gpuNum, cpuNum, memoryMB, image); super(command, gpuNum, cpuNum, memoryMB, image, privateRegistryAuthPath);
this.replicas = replicas; this.replicas = replicas;
} }
} }
......
...@@ -347,7 +347,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -347,7 +347,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
} }
// Generate kubeflow job resource config object // Generate kubeflow job resource config object
const kubeflowJobConfig: any = this.generateKubeflowJobConfig(trialJobId, trialWorkingFolder, kubeflowJobName, workerPodResources, const kubeflowJobConfig: any = await this.generateKubeflowJobConfig(trialJobId, trialWorkingFolder, kubeflowJobName, workerPodResources,
nonWorkerResources); nonWorkerResources);
return Promise.resolve(kubeflowJobConfig); return Promise.resolve(kubeflowJobConfig);
...@@ -361,8 +361,8 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -361,8 +361,8 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
* @param workerPodResources worker pod template * @param workerPodResources worker pod template
* @param nonWorkerPodResources non-worker pod template, like ps or master * @param nonWorkerPodResources non-worker pod template, like ps or master
*/ */
private generateKubeflowJobConfig(trialJobId: string, trialWorkingFolder: string, kubeflowJobName : string, workerPodResources : any, private async generateKubeflowJobConfig(trialJobId: string, trialWorkingFolder: string, kubeflowJobName : string, workerPodResources : any,
nonWorkerPodResources?: any) : any { nonWorkerPodResources?: any) : Promise<any> {
if (this.kubeflowClusterConfig === undefined) { if (this.kubeflowClusterConfig === undefined) {
throw new Error('Kubeflow Cluster config is not initialized'); throw new Error('Kubeflow Cluster config is not initialized');
} }
...@@ -377,29 +377,32 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -377,29 +377,32 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
const replicaSpecsObj: any = {}; const replicaSpecsObj: any = {};
const replicaSpecsObjMap: Map<string, object> = new Map<string, object>(); const replicaSpecsObjMap: Map<string, object> = new Map<string, object>();
if (this.kubeflowTrialConfig.operatorType === 'tf-operator') { if (this.kubeflowTrialConfig.operatorType === 'tf-operator') {
const tensorflowTrialConfig: KubeflowTrialConfigTensorflow = <KubeflowTrialConfigTensorflow>this.kubeflowTrialConfig; const tensorflowTrialConfig: KubeflowTrialConfigTensorflow = <KubeflowTrialConfigTensorflow>this.kubeflowTrialConfig;
let privateRegistrySecretName = await this.createRegistrySecret(tensorflowTrialConfig.worker.privateRegistryAuthPath);
replicaSpecsObj.Worker = this.generateReplicaConfig(trialWorkingFolder, tensorflowTrialConfig.worker.replicas, replicaSpecsObj.Worker = this.generateReplicaConfig(trialWorkingFolder, tensorflowTrialConfig.worker.replicas,
tensorflowTrialConfig.worker.image, 'run_worker.sh', workerPodResources); tensorflowTrialConfig.worker.image, 'run_worker.sh', workerPodResources, privateRegistrySecretName);
if (tensorflowTrialConfig.ps !== undefined) { if (tensorflowTrialConfig.ps !== undefined) {
let privateRegistrySecretName: string | undefined = await this.createRegistrySecret(tensorflowTrialConfig.ps.privateRegistryAuthPath);
replicaSpecsObj.Ps = this.generateReplicaConfig(trialWorkingFolder, tensorflowTrialConfig.ps.replicas, replicaSpecsObj.Ps = this.generateReplicaConfig(trialWorkingFolder, tensorflowTrialConfig.ps.replicas,
tensorflowTrialConfig.ps.image, 'run_ps.sh', nonWorkerPodResources); tensorflowTrialConfig.ps.image, 'run_ps.sh', nonWorkerPodResources, privateRegistrySecretName);
} }
replicaSpecsObjMap.set(this.kubernetesCRDClient.jobKind, {tfReplicaSpecs: replicaSpecsObj}); replicaSpecsObjMap.set(this.kubernetesCRDClient.jobKind, {tfReplicaSpecs: replicaSpecsObj});
} else if (this.kubeflowTrialConfig.operatorType === 'pytorch-operator') { } else if (this.kubeflowTrialConfig.operatorType === 'pytorch-operator') {
const pytorchTrialConfig: KubeflowTrialConfigPytorch = <KubeflowTrialConfigPytorch>this.kubeflowTrialConfig; const pytorchTrialConfig: KubeflowTrialConfigPytorch = <KubeflowTrialConfigPytorch>this.kubeflowTrialConfig;
if (pytorchTrialConfig.worker !== undefined) { if (pytorchTrialConfig.worker !== undefined) {
let privateRegistrySecretName: string | undefined = await this.createRegistrySecret(pytorchTrialConfig.worker.privateRegistryAuthPath);
replicaSpecsObj.Worker = this.generateReplicaConfig(trialWorkingFolder, pytorchTrialConfig.worker.replicas, replicaSpecsObj.Worker = this.generateReplicaConfig(trialWorkingFolder, pytorchTrialConfig.worker.replicas,
pytorchTrialConfig.worker.image, 'run_worker.sh', workerPodResources); pytorchTrialConfig.worker.image, 'run_worker.sh', workerPodResources, privateRegistrySecretName);
} }
let privateRegistrySecretName: string | undefined = await this.createRegistrySecret(pytorchTrialConfig.master.privateRegistryAuthPath);
replicaSpecsObj.Master = this.generateReplicaConfig(trialWorkingFolder, pytorchTrialConfig.master.replicas, replicaSpecsObj.Master = this.generateReplicaConfig(trialWorkingFolder, pytorchTrialConfig.master.replicas,
pytorchTrialConfig.master.image, 'run_master.sh', nonWorkerPodResources); pytorchTrialConfig.master.image, 'run_master.sh', nonWorkerPodResources, privateRegistrySecretName);
replicaSpecsObjMap.set(this.kubernetesCRDClient.jobKind, {pytorchReplicaSpecs: replicaSpecsObj}); replicaSpecsObjMap.set(this.kubernetesCRDClient.jobKind, {pytorchReplicaSpecs: replicaSpecsObj});
} }
return { return Promise.resolve({
apiVersion: `kubeflow.org/${this.kubernetesCRDClient.apiVersion}`, apiVersion: `kubeflow.org/${this.kubernetesCRDClient.apiVersion}`,
kind: this.kubernetesCRDClient.jobKind, kind: this.kubernetesCRDClient.jobKind,
metadata: { metadata: {
...@@ -412,7 +415,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -412,7 +415,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
} }
}, },
spec: replicaSpecsObjMap.get(this.kubernetesCRDClient.jobKind) spec: replicaSpecsObjMap.get(this.kubernetesCRDClient.jobKind)
}; });
} }
/** /**
...@@ -424,7 +427,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -424,7 +427,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
* @param podResources pod resource config section * @param podResources pod resource config section
*/ */
private generateReplicaConfig(trialWorkingFolder: string, replicaNumber: number, replicaImage: string, runScriptFile: string, private generateReplicaConfig(trialWorkingFolder: string, replicaNumber: number, replicaImage: string, runScriptFile: string,
podResources: any): any { podResources: any, privateRegistrySecretName: string | undefined): any {
if (this.kubeflowClusterConfig === undefined) { if (this.kubeflowClusterConfig === undefined) {
throw new Error('Kubeflow Cluster config is not initialized'); throw new Error('Kubeflow Cluster config is not initialized');
} }
...@@ -436,7 +439,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -436,7 +439,7 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
if (this.kubernetesCRDClient === undefined) { if (this.kubernetesCRDClient === undefined) {
throw new Error('Kubeflow operator client is not initialized'); throw new Error('Kubeflow operator client is not initialized');
} }
// The config spec for volume field
const volumeSpecMap: Map<string, object> = new Map<string, object>(); const volumeSpecMap: Map<string, object> = new Map<string, object>();
if (this.kubeflowClusterConfig.storageType === 'azureStorage') { if (this.kubeflowClusterConfig.storageType === 'azureStorage') {
volumeSpecMap.set('nniVolumes', [ volumeSpecMap.set('nniVolumes', [
...@@ -459,16 +462,9 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -459,16 +462,9 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
} }
}]); }]);
} }
// The config spec for container field
return { const containersSpecMap: Map<string, object> = new Map<string, object>();
replicas: replicaNumber, containersSpecMap.set('containers', [
template: {
metadata: {
// tslint:disable-next-line:no-null-keyword
creationTimestamp: null
},
spec: {
containers: [
{ {
// Kubeflow tensorflow operator requires that containers' name must be tensorflow // Kubeflow tensorflow operator requires that containers' name must be tensorflow
// TODO: change the name based on operator's type // TODO: change the name based on operator's type
...@@ -481,12 +477,29 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -481,12 +477,29 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
mountPath: this.CONTAINER_MOUNT_PATH mountPath: this.CONTAINER_MOUNT_PATH
}], }],
resources: podResources resources: podResources
}], }
]);
let spec: any = {
containers: containersSpecMap.get('containers'),
restartPolicy: 'ExitCode', restartPolicy: 'ExitCode',
volumes: volumeSpecMap.get('nniVolumes') volumes: volumeSpecMap.get('nniVolumes')
} }
if (privateRegistrySecretName) {
spec.imagePullSecrets = [
{
name: privateRegistrySecretName
}]
}
return {
replicas: replicaNumber,
template: {
metadata: {
// tslint:disable-next-line:no-null-keyword
creationTimestamp: null
},
spec: spec
}
} }
};
} }
} }
// tslint:enable: no-unsafe-any no-any // tslint:enable: no-unsafe-any no-any
......
...@@ -179,6 +179,9 @@ export class KubernetesTrialConfigTemplate { ...@@ -179,6 +179,9 @@ export class KubernetesTrialConfigTemplate {
// Docker image // Docker image
public readonly image: string; public readonly image: string;
// Path to the private registry auth config file used to pull the docker image
public readonly privateRegistryAuthPath?: string;
// Trial command // Trial command
public readonly command : string; public readonly command : string;
...@@ -186,12 +189,13 @@ export class KubernetesTrialConfigTemplate { ...@@ -186,12 +189,13 @@ export class KubernetesTrialConfigTemplate {
public readonly gpuNum : number; public readonly gpuNum : number;
constructor(command : string, gpuNum : number, constructor(command : string, gpuNum : number,
cpuNum: number, memoryMB: number, image: string) { cpuNum: number, memoryMB: number, image: string, privateRegistryAuthPath?: string) {
this.command = command; this.command = command;
this.gpuNum = gpuNum; this.gpuNum = gpuNum;
this.cpuNum = cpuNum; this.cpuNum = cpuNum;
this.memoryMB = memoryMB; this.memoryMB = memoryMB;
this.image = image; this.image = image;
this.privateRegistryAuthPath = privateRegistryAuthPath;
} }
} }
......
...@@ -38,6 +38,8 @@ import { KubernetesClusterConfig } from './kubernetesConfig'; ...@@ -38,6 +38,8 @@ import { KubernetesClusterConfig } from './kubernetesConfig';
import { kubernetesScriptFormat, KubernetesTrialJobDetail } from './kubernetesData'; import { kubernetesScriptFormat, KubernetesTrialJobDetail } from './kubernetesData';
import { KubernetesJobRestServer } from './kubernetesJobRestServer'; import { KubernetesJobRestServer } from './kubernetesJobRestServer';
var fs = require('fs');
/** /**
* Training Service implementation for Kubernetes * Training Service implementation for Kubernetes
*/ */
...@@ -327,5 +329,34 @@ abstract class KubernetesTrainingService { ...@@ -327,5 +329,34 @@ abstract class KubernetesTrainingService {
return Promise.resolve(); return Promise.resolve();
} }
/**
 * Create a Kubernetes image-pull secret from a local registry auth file.
 *
 * The file content is read synchronously, base64-encoded and stored under the
 * '.dockerconfigjson' key of a 'kubernetes.io/dockerconfigjson' secret in the
 * 'default' namespace. The secret is labelled with the NNI trial label and the
 * current experiment id.
 *
 * @param filePath path of the registry auth config file; undefined or '' means no private registry is configured
 * @returns the name of the created secret, or undefined when no file path was given
 */
protected async createRegistrySecret(filePath: string | undefined): Promise<string | undefined> {
    // Nothing to create when no auth file is configured.
    if (filePath === '' || filePath === undefined) {
        return undefined;
    }

    // Secret data values must be base64-encoded.
    const encodedAuth = fs.readFileSync(filePath).toString('base64');
    // Name is derived from a fresh unique string, lower-cased.
    const secretName = String.Format('nni-secret-{0}', uniqueString(8).toLowerCase());

    const secretLabels = {
        app: this.NNI_KUBERNETES_TRIAL_LABEL,
        expId: getExperimentId()
    };
    const secretManifest = {
        apiVersion: 'v1',
        kind: 'Secret',
        metadata: {
            name: secretName,
            namespace: 'default',
            labels: secretLabels
        },
        type: 'kubernetes.io/dockerconfigjson',
        data: {
            '.dockerconfigjson': encodedAuth
        }
    };
    await this.genericK8sClient.createSecret(secretManifest);

    return secretName;
}
} }
export { KubernetesTrainingService }; export { KubernetesTrainingService };
...@@ -307,10 +307,12 @@ class LocalTrainingService implements TrainingService { ...@@ -307,10 +307,12 @@ class LocalTrainingService implements TrainingService {
if (this.localTrailConfig === undefined) { if (this.localTrailConfig === undefined) {
throw new Error('trial config parsed failed'); throw new Error('trial config parsed failed');
} }
if (this.localTrailConfig.gpuNum !== undefined) {
this.log.info(`required GPU number is ${this.localTrailConfig.gpuNum}`); this.log.info(`required GPU number is ${this.localTrailConfig.gpuNum}`);
if (this.gpuScheduler === undefined && this.localTrailConfig.gpuNum > 0) { if (this.gpuScheduler === undefined && this.localTrailConfig.gpuNum > 0) {
this.gpuScheduler = new GPUScheduler(); this.gpuScheduler = new GPUScheduler();
} }
}
break; break;
case TrialConfigMetadataKey.LOCAL_CONFIG: case TrialConfigMetadataKey.LOCAL_CONFIG:
this.localConfig = <LocalConfig>JSON.parse(value); this.localConfig = <LocalConfig>JSON.parse(value);
...@@ -399,7 +401,8 @@ class LocalTrainingService implements TrainingService { ...@@ -399,7 +401,8 @@ class LocalTrainingService implements TrainingService {
private getEnvironmentVariables( private getEnvironmentVariables(
trialJobDetail: TrialJobDetail, trialJobDetail: TrialJobDetail,
resource: { gpuIndices: number[] }): { key: string; value: string }[] { resource: { gpuIndices: number[] },
gpuNum: number | undefined): { key: string; value: string }[] {
const envVariables: { key: string; value: string }[] = [ const envVariables: { key: string; value: string }[] = [
{ key: 'NNI_PLATFORM', value: 'local' }, { key: 'NNI_PLATFORM', value: 'local' },
{ key: 'NNI_EXP_ID', value: this.experimentId }, { key: 'NNI_EXP_ID', value: this.experimentId },
...@@ -409,11 +412,12 @@ class LocalTrainingService implements TrainingService { ...@@ -409,11 +412,12 @@ class LocalTrainingService implements TrainingService {
{ key: 'NNI_TRIAL_SEQ_ID', value: trialJobDetail.sequenceId.toString() }, { key: 'NNI_TRIAL_SEQ_ID', value: trialJobDetail.sequenceId.toString() },
{ key: 'MULTI_PHASE', value: this.isMultiPhase.toString() } { key: 'MULTI_PHASE', value: this.isMultiPhase.toString() }
]; ];
if (gpuNum !== undefined) {
envVariables.push({ envVariables.push({
key: 'CUDA_VISIBLE_DEVICES', key: 'CUDA_VISIBLE_DEVICES',
value: this.gpuScheduler === undefined ? '-1' : resource.gpuIndices.join(',') value: this.gpuScheduler === undefined ? '-1' : resource.gpuIndices.join(',')
}); });
}
return envVariables; return envVariables;
} }
...@@ -490,6 +494,7 @@ class LocalTrainingService implements TrainingService { ...@@ -490,6 +494,7 @@ class LocalTrainingService implements TrainingService {
if (!success) { if (!success) {
break; break;
} }
this.occupyResource(resource); this.occupyResource(resource);
await this.runTrialJob(trialJobId, resource); await this.runTrialJob(trialJobId, resource);
} }
...@@ -526,7 +531,10 @@ class LocalTrainingService implements TrainingService { ...@@ -526,7 +531,10 @@ class LocalTrainingService implements TrainingService {
private async runTrialJob(trialJobId: string, resource: {gpuIndices: number[]}): Promise<void> { private async runTrialJob(trialJobId: string, resource: {gpuIndices: number[]}): Promise<void> {
const trialJobDetail: LocalTrialJobDetail = <LocalTrialJobDetail>this.jobMap.get(trialJobId); const trialJobDetail: LocalTrialJobDetail = <LocalTrialJobDetail>this.jobMap.get(trialJobId);
const variables: { key: string; value: string }[] = this.getEnvironmentVariables(trialJobDetail, resource); if (this.localTrailConfig === undefined) {
throw new Error(`localTrialConfig not initialized!`);
}
const variables: { key: string; value: string }[] = this.getEnvironmentVariables(trialJobDetail, resource, this.localTrailConfig.gpuNum);
if (this.localTrailConfig === undefined) { if (this.localTrailConfig === undefined) {
throw new Error('trial config is not initialized'); throw new Error('trial config is not initialized');
......
...@@ -69,12 +69,10 @@ export class PAIJobConfig { ...@@ -69,12 +69,10 @@ export class PAIJobConfig {
public readonly jobName: string; public readonly jobName: string;
// URL pointing to the Docker image for all tasks in the job // URL pointing to the Docker image for all tasks in the job
public readonly image: string; public readonly image: string;
// Data directory existing on HDFS
public readonly dataDir: string;
// Output directory on HDFS
public readonly outputDir: string;
// Code directory on HDFS // Code directory on HDFS
public readonly codeDir: string; public readonly codeDir: string;
//authentication file used for private Docker registry
public readonly authFile?: string;
// List of taskRole, one task role at least // List of taskRole, one task role at least
public taskRoles: PAITaskRole[]; public taskRoles: PAITaskRole[];
...@@ -90,15 +88,14 @@ export class PAIJobConfig { ...@@ -90,15 +88,14 @@ export class PAIJobConfig {
* @param outputDir Output directory on HDFS * @param outputDir Output directory on HDFS
* @param taskRoles List of taskRole, one task role at least * @param taskRoles List of taskRole, one task role at least
*/ */
constructor(jobName: string, image : string, dataDir : string, outputDir : string, codeDir : string, constructor(jobName: string, image : string, codeDir : string,
taskRoles : PAITaskRole[], virtualCluster: string) { taskRoles : PAITaskRole[], virtualCluster: string, authFile?: string) {
this.jobName = jobName; this.jobName = jobName;
this.image = image; this.image = image;
this.dataDir = dataDir;
this.outputDir = outputDir;
this.codeDir = codeDir; this.codeDir = codeDir;
this.taskRoles = taskRoles; this.taskRoles = taskRoles;
this.virtualCluster = virtualCluster; this.virtualCluster = virtualCluster;
this.authFile = authFile;
} }
} }
...@@ -130,23 +127,22 @@ export class NNIPAITrialConfig extends TrialConfig { ...@@ -130,23 +127,22 @@ export class NNIPAITrialConfig extends TrialConfig {
public readonly cpuNum: number; public readonly cpuNum: number;
public readonly memoryMB: number; public readonly memoryMB: number;
public readonly image: string; public readonly image: string;
public readonly dataDir: string;
public outputDir: string;
//The virtual cluster job runs on. If omitted, the job will run on default virtual cluster //The virtual cluster job runs on. If omitted, the job will run on default virtual cluster
public virtualCluster?: string; public virtualCluster?: string;
//Shared memory for one task in the task role //Shared memory for one task in the task role
public shmMB?: number; public shmMB?: number;
//authentication file used for private Docker registry
public authFile?: string;
constructor(command : string, codeDir : string, gpuNum : number, cpuNum: number, memoryMB: number, constructor(command : string, codeDir : string, gpuNum : number, cpuNum: number, memoryMB: number,
image: string, dataDir: string, outputDir: string, virtualCluster?: string, shmMB?: number) { image: string, virtualCluster?: string, shmMB?: number, authFile?: string) {
super(command, codeDir, gpuNum); super(command, codeDir, gpuNum);
this.cpuNum = cpuNum; this.cpuNum = cpuNum;
this.memoryMB = memoryMB; this.memoryMB = memoryMB;
this.image = image; this.image = image;
this.dataDir = dataDir;
this.outputDir = outputDir;
this.virtualCluster = virtualCluster; this.virtualCluster = virtualCluster;
this.shmMB = shmMB; this.shmMB = shmMB;
this.authFile = authFile;
} }
} }
...@@ -70,9 +70,6 @@ export const PAI_TRIAL_COMMAND_FORMAT: string = ...@@ -70,9 +70,6 @@ export const PAI_TRIAL_COMMAND_FORMAT: string =
--pai_hdfs_output_dir '{9}' --pai_hdfs_host '{10}' --pai_user_name {11} --nni_hdfs_exp_dir '{12}' --webhdfs_path '/webhdfs/api/v1' \ --pai_hdfs_output_dir '{9}' --pai_hdfs_host '{10}' --pai_user_name {11} --nni_hdfs_exp_dir '{12}' --webhdfs_path '/webhdfs/api/v1' \
--nni_manager_version '{13}' --log_collection '{14}'`; --nni_manager_version '{13}' --log_collection '{14}'`;
export const PAI_OUTPUT_DIR_FORMAT: string =
`hdfs://{0}:9000/`;
// tslint:disable:no-http-string // tslint:disable:no-http-string
export const PAI_LOG_PATH_FORMAT: string = export const PAI_LOG_PATH_FORMAT: string =
`http://{0}/webhdfs/explorer.html#{1}`; `http://{0}/webhdfs/explorer.html#{1}`;
...@@ -43,7 +43,7 @@ import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey'; ...@@ -43,7 +43,7 @@ import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey';
import { execMkdir, validateCodeDir } from '../common/util'; import { execMkdir, validateCodeDir } from '../common/util';
import { HDFSClientUtility } from './hdfsClientUtility'; import { HDFSClientUtility } from './hdfsClientUtility';
import { NNIPAITrialConfig, PAIClusterConfig, PAIJobConfig, PAITaskRole } from './paiConfig'; import { NNIPAITrialConfig, PAIClusterConfig, PAIJobConfig, PAITaskRole } from './paiConfig';
import { PAI_LOG_PATH_FORMAT, PAI_OUTPUT_DIR_FORMAT, PAI_TRIAL_COMMAND_FORMAT, PAITrialJobDetail } from './paiData'; import { PAI_LOG_PATH_FORMAT, PAI_TRIAL_COMMAND_FORMAT, PAITrialJobDetail } from './paiData';
import { PAIJobInfoCollector } from './paiJobInfoCollector'; import { PAIJobInfoCollector } from './paiJobInfoCollector';
import { PAIJobRestServer, ParameterFileMeta } from './paiJobRestServer'; import { PAIJobRestServer, ParameterFileMeta } from './paiJobRestServer';
...@@ -70,9 +70,6 @@ class PAITrainingService implements TrainingService { ...@@ -70,9 +70,6 @@ class PAITrainingService implements TrainingService {
private readonly paiTokenUpdateInterval: number; private readonly paiTokenUpdateInterval: number;
private readonly experimentId! : string; private readonly experimentId! : string;
private readonly paiJobCollector : PAIJobInfoCollector; private readonly paiJobCollector : PAIJobInfoCollector;
private readonly hdfsDirPattern: string;
private hdfsBaseDir: string | undefined;
private hdfsOutputHost: string | undefined;
private nextTrialSequenceId: number; private nextTrialSequenceId: number;
private paiRestServerPort?: number; private paiRestServerPort?: number;
private nniManagerIpConfig?: NNIManagerIpConfig; private nniManagerIpConfig?: NNIManagerIpConfig;
...@@ -80,6 +77,8 @@ class PAITrainingService implements TrainingService { ...@@ -80,6 +77,8 @@ class PAITrainingService implements TrainingService {
private versionCheck: boolean = true; private versionCheck: boolean = true;
private logCollection: string; private logCollection: string;
private isMultiPhase: boolean = false; private isMultiPhase: boolean = false;
private hdfsCodeDir?: string;
private hdfsOutputDir?: string;
constructor() { constructor() {
this.log = getLogger(); this.log = getLogger();
...@@ -90,7 +89,6 @@ class PAITrainingService implements TrainingService { ...@@ -90,7 +89,6 @@ class PAITrainingService implements TrainingService {
this.expRootDir = path.join('/nni', 'experiments', getExperimentId()); this.expRootDir = path.join('/nni', 'experiments', getExperimentId());
this.experimentId = getExperimentId(); this.experimentId = getExperimentId();
this.paiJobCollector = new PAIJobInfoCollector(this.trialJobsMap); this.paiJobCollector = new PAIJobInfoCollector(this.trialJobsMap);
this.hdfsDirPattern = 'hdfs://(?<host>([0-9]{1,3}.){3}[0-9]{1,3})(:[0-9]{2,5})?(?<baseDir>/.*)?';
this.nextTrialSequenceId = -1; this.nextTrialSequenceId = -1;
this.paiTokenUpdateInterval = 7200000; //2hours this.paiTokenUpdateInterval = 7200000; //2hours
this.logCollection = 'none'; this.logCollection = 'none';
...@@ -144,10 +142,10 @@ class PAITrainingService implements TrainingService { ...@@ -144,10 +142,10 @@ class PAITrainingService implements TrainingService {
} }
public async submitTrialJob(form: JobApplicationForm): Promise<TrialJobDetail> { public async submitTrialJob(form: JobApplicationForm): Promise<TrialJobDetail> {
const deferred : Deferred<PAITrialJobDetail> = new Deferred<PAITrialJobDetail>(); if (this.paiClusterConfig === undefined) {
if (this.hdfsBaseDir === undefined) { throw new Error(`paiClusterConfig not initialized!`);
throw new Error('hdfsBaseDir is not initialized');
} }
const deferred : Deferred<PAITrialJobDetail> = new Deferred<PAITrialJobDetail>();
this.log.info(`submitTrialJob: form: ${JSON.stringify(form)}`); this.log.info(`submitTrialJob: form: ${JSON.stringify(form)}`);
...@@ -156,12 +154,14 @@ class PAITrainingService implements TrainingService { ...@@ -156,12 +154,14 @@ class PAITrainingService implements TrainingService {
//TODO: use HDFS working folder instead //TODO: use HDFS working folder instead
const trialWorkingFolder: string = path.join(this.expRootDir, 'trials', trialJobId); const trialWorkingFolder: string = path.join(this.expRootDir, 'trials', trialJobId);
const paiJobName: string = `nni_exp_${this.experimentId}_trial_${trialJobId}`; const paiJobName: string = `nni_exp_${this.experimentId}_trial_${trialJobId}`;
this.hdfsCodeDir = HDFSClientUtility.getHdfsTrialWorkDir(this.paiClusterConfig.userName, trialJobId);
this.hdfsOutputDir = unixPathJoin(this.hdfsCodeDir, 'nnioutput');
const hdfsOutputDir : string = path.join(this.hdfsBaseDir, this.experimentId, trialJobId);
const hdfsLogPath : string = String.Format( const hdfsLogPath : string = String.Format(
PAI_LOG_PATH_FORMAT, PAI_LOG_PATH_FORMAT,
this.hdfsOutputHost, this.paiClusterConfig.host,
hdfsOutputDir); this.hdfsOutputDir
);
const trialJobDetail: PAITrialJobDetail = new PAITrialJobDetail( const trialJobDetail: PAITrialJobDetail = new PAITrialJobDetail(
trialJobId, trialJobId,
...@@ -278,14 +278,6 @@ class PAITrainingService implements TrainingService { ...@@ -278,14 +278,6 @@ class PAITrainingService implements TrainingService {
break; break;
} }
this.paiTrialConfig = <NNIPAITrialConfig>JSON.parse(value); this.paiTrialConfig = <NNIPAITrialConfig>JSON.parse(value);
//paiTrialConfig.outputDir could be null if it is not set in nnictl
if (this.paiTrialConfig.outputDir === undefined || this.paiTrialConfig.outputDir === null) {
this.paiTrialConfig.outputDir = String.Format(
PAI_OUTPUT_DIR_FORMAT,
this.paiClusterConfig.host
)
.replace(/\r\n|\n|\r/gm, '');
}
// Validate to make sure codeDir doesn't have too many files // Validate to make sure codeDir doesn't have too many files
try { try {
...@@ -296,42 +288,6 @@ class PAITrainingService implements TrainingService { ...@@ -296,42 +288,6 @@ class PAITrainingService implements TrainingService {
break; break;
} }
const hdfsDirContent: any = this.paiTrialConfig.outputDir.match(this.hdfsDirPattern);
if (hdfsDirContent === null) {
throw new Error('Trial outputDir format Error');
}
const groups: any = hdfsDirContent.groups;
if (groups === undefined) {
throw new Error('Trial outputDir format Error');
}
this.hdfsOutputHost = groups.host;
//TODO: choose to use /${username} as baseDir
this.hdfsBaseDir = groups.baseDir;
if (this.hdfsBaseDir === undefined) {
this.hdfsBaseDir = '/';
}
let dataOutputHdfsClient: any;
if (this.paiClusterConfig.host === this.hdfsOutputHost && this.hdfsClient) {
dataOutputHdfsClient = this.hdfsClient;
} else {
dataOutputHdfsClient = WebHDFS.createClient({
user: this.paiClusterConfig.userName,
port: 50070,
host: this.hdfsOutputHost
});
}
try {
const exist : boolean = await HDFSClientUtility.pathExists('/', dataOutputHdfsClient);
if (!exist) {
deferred.reject(new Error(`Please check hdfsOutputDir host!`));
}
} catch (error) {
deferred.reject(new Error(`HDFS encounters problem, error is ${error}. Please check hdfsOutputDir host!`));
}
// Copy experiment files from local folder to HDFS // Copy experiment files from local folder to HDFS
this.copyExpCodeDirPromise = HDFSClientUtility.copyDirectoryToHdfs( this.copyExpCodeDirPromise = HDFSClientUtility.copyDirectoryToHdfs(
this.paiTrialConfig.codeDir, this.paiTrialConfig.codeDir,
...@@ -409,12 +365,12 @@ class PAITrainingService implements TrainingService { ...@@ -409,12 +365,12 @@ class PAITrainingService implements TrainingService {
throw new Error('PAI token is not initialized'); throw new Error('PAI token is not initialized');
} }
if (this.hdfsBaseDir === undefined) { if (this.hdfsCodeDir === undefined) {
throw new Error('hdfsBaseDir is not initialized'); throw new Error('hdfsCodeDir is not initialized');
} }
if (this.hdfsOutputHost === undefined) { if (this.hdfsOutputDir === undefined) {
throw new Error('hdfsOutputHost is not initialized'); throw new Error('hdfsOutputDir is not initialized');
} }
if (this.paiRestServerPort === undefined) { if (this.paiRestServerPort === undefined) {
...@@ -428,8 +384,6 @@ class PAITrainingService implements TrainingService { ...@@ -428,8 +384,6 @@ class PAITrainingService implements TrainingService {
} }
// Step 1. Prepare PAI job configuration // Step 1. Prepare PAI job configuration
const hdfsOutputDir : string = unixPathJoin(this.hdfsBaseDir, this.experimentId, trialJobId);
const hdfsCodeDir: string = HDFSClientUtility.getHdfsTrialWorkDir(this.paiClusterConfig.userName, trialJobId);
const trialLocalTempFolder: string = path.join(getExperimentRootDir(), 'trials-local', trialJobId); const trialLocalTempFolder: string = path.join(getExperimentRootDir(), 'trials-local', trialJobId);
//create tmp trial working folder locally. //create tmp trial working folder locally.
...@@ -463,8 +417,8 @@ class PAITrainingService implements TrainingService { ...@@ -463,8 +417,8 @@ class PAITrainingService implements TrainingService {
this.paiTrialConfig.command, this.paiTrialConfig.command,
nniManagerIp, nniManagerIp,
this.paiRestServerPort, this.paiRestServerPort,
hdfsOutputDir, this.hdfsOutputDir,
this.hdfsOutputHost, this.paiClusterConfig.host,
this.paiClusterConfig.userName, this.paiClusterConfig.userName,
HDFSClientUtility.getHdfsExpCodeDir(this.paiClusterConfig.userName), HDFSClientUtility.getHdfsExpCodeDir(this.paiClusterConfig.userName),
version, version,
...@@ -488,7 +442,7 @@ class PAITrainingService implements TrainingService { ...@@ -488,7 +442,7 @@ class PAITrainingService implements TrainingService {
// Task command // Task command
nniPaiTrialCommand, nniPaiTrialCommand,
// Task shared memory // Task shared memory
this.paiTrialConfig.shmMB this.paiTrialConfig.shmMB,
) )
]; ];
...@@ -497,23 +451,21 @@ class PAITrainingService implements TrainingService { ...@@ -497,23 +451,21 @@ class PAITrainingService implements TrainingService {
trialJobDetail.paiJobName, trialJobDetail.paiJobName,
// Docker image // Docker image
this.paiTrialConfig.image, this.paiTrialConfig.image,
// dataDir
this.paiTrialConfig.dataDir,
// outputDir
this.paiTrialConfig.outputDir,
// codeDir // codeDir
`$PAI_DEFAULT_FS_URI${hdfsCodeDir}`, `$PAI_DEFAULT_FS_URI${this.hdfsCodeDir}`,
// PAI Task roles // PAI Task roles
paiTaskRoles, paiTaskRoles,
// Add Virutal Cluster // Add Virutal Cluster
this.paiTrialConfig.virtualCluster === undefined ? 'default' : this.paiTrialConfig.virtualCluster.toString() this.paiTrialConfig.virtualCluster === undefined ? 'default' : this.paiTrialConfig.virtualCluster.toString(),
//Task auth File
this.paiTrialConfig.authFile
); );
// Step 2. Upload code files in codeDir onto HDFS // Step 2. Upload code files in codeDir onto HDFS
try { try {
await HDFSClientUtility.copyDirectoryToHdfs(trialLocalTempFolder, hdfsCodeDir, this.hdfsClient); await HDFSClientUtility.copyDirectoryToHdfs(trialLocalTempFolder, this.hdfsCodeDir, this.hdfsClient);
} catch (error) { } catch (error) {
this.log.error(`PAI Training service: copy ${this.paiTrialConfig.codeDir} to HDFS ${hdfsCodeDir} failed, error is ${error}`); this.log.error(`PAI Training service: copy ${this.paiTrialConfig.codeDir} to HDFS ${this.hdfsCodeDir} failed, error is ${error}`);
trialJobDetail.status = 'FAILED'; trialJobDetail.status = 'FAILED';
deferred.resolve(true); deferred.resolve(true);
......
...@@ -48,14 +48,17 @@ export class GPUScheduler { ...@@ -48,14 +48,17 @@ export class GPUScheduler {
* Schedule a machine according to the constraints (requiredGPUNum) * Schedule a machine according to the constraints (requiredGPUNum)
* @param requiredGPUNum required GPU number * @param requiredGPUNum required GPU number
*/ */
public scheduleMachine(requiredGPUNum: number, trialJobDetail : RemoteMachineTrialJobDetail) : RemoteMachineScheduleResult { public scheduleMachine(requiredGPUNum: number | undefined, trialJobDetail : RemoteMachineTrialJobDetail) : RemoteMachineScheduleResult {
if(requiredGPUNum === undefined) {
requiredGPUNum = 0;
}
assert(requiredGPUNum >= 0); assert(requiredGPUNum >= 0);
const allRMs: RemoteMachineMeta[] = Array.from(this.machineSSHClientMap.keys()); const allRMs: RemoteMachineMeta[] = Array.from(this.machineSSHClientMap.keys());
assert(allRMs.length > 0); assert(allRMs.length > 0);
// Step 1: Check if required GPU number not exceeds the total GPU number in all machines // Step 1: Check if required GPU number not exceeds the total GPU number in all machines
const eligibleRM: RemoteMachineMeta[] = allRMs.filter((rmMeta : RemoteMachineMeta) => const eligibleRM: RemoteMachineMeta[] = allRMs.filter((rmMeta : RemoteMachineMeta) =>
rmMeta.gpuSummary === undefined || requiredGPUNum === 0 || rmMeta.gpuSummary.gpuCount >= requiredGPUNum); rmMeta.gpuSummary === undefined || requiredGPUNum === 0 || (requiredGPUNum !== undefined && rmMeta.gpuSummary.gpuCount >= requiredGPUNum));
if (eligibleRM.length === 0) { if (eligibleRM.length === 0) {
// If the required gpu number exceeds the upper limit of all machine's GPU number // If the required gpu number exceeds the upper limit of all machine's GPU number
// Return REQUIRE_EXCEED_TOTAL directly // Return REQUIRE_EXCEED_TOTAL directly
......
...@@ -601,12 +601,16 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -601,12 +601,16 @@ class RemoteMachineTrainingService implements TrainingService {
let command: string; let command: string;
// Set CUDA_VISIBLE_DEVICES environment variable based on cuda_visible_device // Set CUDA_VISIBLE_DEVICES environment variable based on cuda_visible_device
// If no valid cuda_visible_device is defined, set CUDA_VISIBLE_DEVICES to empty string to hide GPU device // If no valid cuda_visible_device is defined, set CUDA_VISIBLE_DEVICES to empty string to hide GPU device
// If gpuNum is undefined, will not set CUDA_VISIBLE_DEVICES in script
if (this.trialConfig.gpuNum === undefined) {
command = this.trialConfig.command;
} else {
if (typeof cuda_visible_device === 'string' && cuda_visible_device.length > 0) { if (typeof cuda_visible_device === 'string' && cuda_visible_device.length > 0) {
command = `CUDA_VISIBLE_DEVICES=${cuda_visible_device} ${this.trialConfig.command}`; command = `CUDA_VISIBLE_DEVICES=${cuda_visible_device} ${this.trialConfig.command}`;
} else { } else {
command = `CUDA_VISIBLE_DEVICES=" " ${this.trialConfig.command}`; command = `CUDA_VISIBLE_DEVICES=" " ${this.trialConfig.command}`;
} }
}
// tslint:disable-next-line: strict-boolean-expressions // tslint:disable-next-line: strict-boolean-expressions
const nniManagerIp: string = this.nniManagerIpConfig ? this.nniManagerIpConfig.nniManagerIp : getIPV4Address(); const nniManagerIp: string = this.nniManagerIpConfig ? this.nniManagerIpConfig.nniManagerIp : getIPV4Address();
if (this.remoteRestServerPort === undefined) { if (this.remoteRestServerPort === undefined) {
......
# Copyright (c) Microsoft Corporation. All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
# associated documentation files (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge, publish, distribute,
# sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or
# substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
# NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT
# OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# ==================================================================================================
from .nni_client import *
# Copyright (c) Microsoft Corporation. All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
# associated documentation files (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge, publish, distribute,
# sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or
# substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
# NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT
# OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# ==================================================================================================
""" A python wrapper for nni rest api
Example:
import nnicli as nc
nc.start_nni('../../../../examples/trials/mnist/config.yml')
nc.set_endpoint('http://localhost:8080')
print(nc.version())
print(nc.get_experiment_status())
print(nc.get_job_statistics())
print(nc.list_trial_jobs())
nc.stop_nni()
"""
import sys
import os
import subprocess
import requests
# Public API of this module: the names exported by a star-import.
__all__ = [
'start_nni',
'stop_nni',
'set_endpoint',
'version',
'get_experiment_status',
'get_experiment_profile',
'get_trial_job',
'list_trial_jobs',
'get_job_statistics',
'get_job_metrics',
'export_data'
]
# Resource paths of the individual REST calls; _nni_rest_get places each one
# after the endpoint and API_ROOT_PATH when building the request URL.
EXPERIMENT_PATH = 'experiment'
VERSION_PATH = 'version'
STATUS_PATH = 'check-status'
JOB_STATISTICS_PATH = 'job-statistics'
TRIAL_JOBS_PATH = 'trial-jobs'
METRICS_PATH = 'metric-data'
EXPORT_DATA_PATH = 'export-data'
# Path prefix shared by every REST resource.
API_ROOT_PATH = 'api/v1/nni'
# Base URL of the REST server; stays None until set_endpoint() is called.
_api_endpoint = None
def set_endpoint(endpoint):
    """Record the base URL of the nni rest server that later calls will query.

    endpoint: base URL of the server, for example http://localhost:8080
    """
    # The endpoint is module-level state shared by every REST helper.
    global _api_endpoint
    _api_endpoint = endpoint
def _check_endpoint():
    """Raise AssertionError unless an endpoint has been configured via set_endpoint."""
    endpoint_missing = _api_endpoint is None
    if endpoint_missing:
        raise AssertionError("Please call set_endpoint to specify nni endpoint")
def _nni_rest_get(api_path, response_type='json'):
    """Issue a GET request against the configured nni rest server.

    api_path: resource path below API_ROOT_PATH, e.g. 'trial-jobs'
    response_type: 'json' to return the parsed JSON body, 'text' to return
        the raw response text

    Returns the decoded body, or None when the server answers with a
    non-2xx status code.

    Raises AssertionError when no endpoint has been set, or when the request
    succeeded but response_type is neither 'json' nor 'text'.
    """
    _check_endpoint()
    # Tolerate an endpoint given with a trailing slash ("http://host:8080/");
    # otherwise the URL would contain an empty path segment ("//api/...").
    base_url = str(_api_endpoint).rstrip('/')
    uri = '{}/{}/{}'.format(base_url, API_ROOT_PATH, api_path)
    res = requests.get(uri)
    if not _http_succeed(res.status_code):
        return None
    if response_type == 'json':
        return res.json()
    if response_type == 'text':
        return res.text
    raise AssertionError('Incorrect response_type')
def _http_succeed(status_code):
    """Tell whether an HTTP status code belongs to the 2xx (success) class."""
    return 200 <= status_code < 300
def _create_process(cmd):
    """Run a command, echo its standard output line by line, and return its exit code.

    cmd: argument list passed to subprocess.Popen

    Each output line is decoded as UTF-8 (undecodable bytes are replaced
    instead of aborting the run), stripped, and printed.
    """
    popen_kwargs = {'stdout': subprocess.PIPE}
    if sys.platform == 'win32':
        # On Windows, start the child in its own process group.
        popen_kwargs['creationflags'] = subprocess.CREATE_NEW_PROCESS_GROUP
    process = subprocess.Popen(cmd, **popen_kwargs)
    try:
        # Read until EOF rather than until the process exits: output can still
        # be buffered in the pipe after the child has terminated, and a
        # poll()-driven loop would silently drop it.
        for raw_line in iter(process.stdout.readline, b''):
            print(raw_line.decode('utf-8', errors='replace').strip())
    finally:
        # Release the pipe's file descriptor even if decoding/printing failed.
        process.stdout.close()
    # wait() reaps the child and yields its final return code.
    return process.wait()
def start_nni(config_file):
    """Start an nni experiment with the specified configuration file.

    config_file: path of the experiment configuration file handed to
        `nnictl create --config`

    Raises RuntimeError when nnictl exits with a non-zero return code.
    """
    # Build the argument list explicitly: formatting the path into a string
    # and splitting on spaces would break any path that contains a space.
    cmd = ['nnictl', 'create', '--config', str(config_file)]
    if _create_process(cmd) != 0:
        raise RuntimeError('Failed to start nni.')
def stop_nni():
    """Stop the nni experiment by running `nnictl stop`.

    Raises RuntimeError when the command exits with a non-zero return code.
    """
    return_code = _create_process(['nnictl', 'stop'])
    if return_code != 0:
        raise RuntimeError('Failed to stop nni.')
def version():
    """Fetch the nni version from the rest server as plain text."""
    nni_version = _nni_rest_get(VERSION_PATH, response_type='text')
    return nni_version
def get_experiment_status():
    """Query the status resource of the rest server and return the parsed response."""
    status = _nni_rest_get(STATUS_PATH)
    return status
def get_experiment_profile():
    """Query the experiment resource of the rest server and return the parsed response."""
    profile = _nni_rest_get(EXPERIMENT_PATH)
    return profile
def get_trial_job(trial_job_id):
    """Return the information of one trial job as reported by the rest server.

    trial_job_id: identifier of the trial job; must not be None

    Raises AssertionError when trial_job_id is None.
    """
    # Explicit raise instead of an assert statement so the check survives
    # running Python with -O.
    if trial_job_id is None:
        raise AssertionError('trial_job_id must not be None')
    # URL path segments are always joined with '/'; os.path.join would insert
    # a backslash on Windows and produce an invalid resource path.
    return _nni_rest_get('{}/{}'.format(TRIAL_JOBS_PATH, trial_job_id))
def list_trial_jobs():
    """Query the trial-jobs resource of the rest server and return the parsed response."""
    trial_jobs = _nni_rest_get(TRIAL_JOBS_PATH)
    return trial_jobs
def get_job_statistics():
    """Query the job-statistics resource of the rest server and return the parsed response."""
    statistics = _nni_rest_get(JOB_STATISTICS_PATH)
    return statistics
def get_job_metrics(trial_job_id=None):
    """Return trial job metrics as reported by the rest server.

    trial_job_id: when given, only the metrics resource of that trial job is
        requested; when None, the metrics resource itself is requested
    """
    if trial_job_id is None:
        api_path = METRICS_PATH
    else:
        # URL path segments are always joined with '/'; os.path.join would
        # insert a backslash on Windows and produce an invalid resource path.
        api_path = '{}/{}'.format(METRICS_PATH, trial_job_id)
    return _nni_rest_get(api_path)
def export_data():
    """Query the export-data resource of the rest server and return the parsed response."""
    exported = _nni_rest_get(EXPORT_DATA_PATH)
    return exported
import setuptools
# Packaging metadata for the `nnicli` distribution.
setuptools.setup(
name = 'nnicli',
# NOTE(review): looks like a development placeholder version — confirm how release builds set it.
version = '999.0.0-developing',
# Discover every package below the directory this script is run from.
packages = setuptools.find_packages(),
python_requires = '>=3.5',
# Runtime dependencies installed together with the package.
install_requires = [
'requests'
],
author = 'Microsoft NNI Team',
author_email = 'nni@microsoft.com',
description = 'nnicli for Neural Network Intelligence project',
license = 'MIT',
url = 'https://github.com/Microsoft/nni',
)
...@@ -39,16 +39,14 @@ logger = logging.getLogger('grid_search_AutoML') ...@@ -39,16 +39,14 @@ logger = logging.getLogger('grid_search_AutoML')
class GridSearchTuner(Tuner): class GridSearchTuner(Tuner):
''' '''
GridSearchTuner will search all the possible configures that the user define in the searchSpace. GridSearchTuner will search all the possible configures that the user define in the searchSpace.
The only acceptable types of search space are 'quniform', 'qloguniform' and 'choice' The only acceptable types of search space are 'choice', 'quniform', 'randint'
Type 'choice' will select one of the options. Note that it can also be nested. Type 'choice' will select one of the options. Note that it can also be nested.
Type 'quniform' will receive three values [low, high, q], where [low, high] specifies a range and 'q' specifies the number of values that will be sampled evenly. Type 'quniform' will receive three values [low, high, q], where [low, high] specifies a range and 'q' specifies the interval
Note that q should be at least 2. It will be sampled in a way that the first sampled value is 'low', and each of the following values is 'interval' larger than the value in front of it.
It will be sampled in a way that the first sampled value is 'low', and each of the following values is (high-low)/q larger that the value in front of it.
Type 'qloguniform' behaves like 'quniform' except that it will first change the range to [log(low), log(high)] Type 'randint' gives all possible intergers in range[low, high). Note that 'high' is not included.
and sample and then change the sampled value back.
''' '''
def __init__(self): def __init__(self):
...@@ -73,8 +71,12 @@ class GridSearchTuner(Tuner): ...@@ -73,8 +71,12 @@ class GridSearchTuner(Tuner):
chosen_params.extend(choice) chosen_params.extend(choice)
else: else:
chosen_params.append(choice) chosen_params.append(choice)
elif _type == 'quniform':
chosen_params = self._parse_quniform(_value)
elif _type == 'randint':
chosen_params = self._parse_randint(_value)
else: else:
chosen_params = self.parse_qtype(_type, _value) raise RuntimeError("Not supported type: %s" % _type)
else: else:
chosen_params = dict() chosen_params = dict()
for key in ss_spec.keys(): for key in ss_spec.keys():
...@@ -95,21 +97,13 @@ class GridSearchTuner(Tuner): ...@@ -95,21 +97,13 @@ class GridSearchTuner(Tuner):
def _parse_quniform(self, param_value): def _parse_quniform(self, param_value):
'''parse type of quniform parameter and return a list''' '''parse type of quniform parameter and return a list'''
if param_value[2] < 2: low, high, interval = param_value[0], param_value[1], param_value[2]
raise RuntimeError("The number of values sampled (q) should be at least 2") count = int(np.floor((high - low) / interval)) + 1
low, high, count = param_value[0], param_value[1], param_value[2] return [low + interval * i for i in range(count)]
interval = (high - low) / (count - 1)
return [float(low + interval * i) for i in range(count)] def _parse_randint(self, param_value):
'''parse type of randint parameter and return a list'''
def parse_qtype(self, param_type, param_value): return np.arange(param_value[0], param_value[1]).tolist()
'''parse type of quniform or qloguniform'''
if param_type == 'quniform':
return self._parse_quniform(param_value)
if param_type == 'qloguniform':
param_value[:2] = np.log(param_value[:2])
return list(np.exp(self._parse_quniform(param_value)))
raise RuntimeError("Not supported type: %s" % param_type)
def expand_parameters(self, para): def expand_parameters(self, para):
''' '''
...@@ -133,7 +127,7 @@ class GridSearchTuner(Tuner): ...@@ -133,7 +127,7 @@ class GridSearchTuner(Tuner):
def update_search_space(self, search_space): def update_search_space(self, search_space):
''' '''
Check if the search space is valid and expand it: only contains 'choice' type or other types beginnning with the letter 'q' Check if the search space is valid and expand it: support only 'choice', 'quniform', randint'
''' '''
self.expanded_search_space = self.json2parameter(search_space) self.expanded_search_space = self.json2parameter(search_space)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment