Unverified Commit 871f031d authored by Guoxin's avatar Guoxin Committed by GitHub
Browse files

Merge pull request #1520 from suiguoxin/v1.0-conf-resolve

merge v1.0 back to master (conflicts resolved)
parents 8f71479e b75a2914
...@@ -53,6 +53,11 @@ export namespace ValidationSchemas { ...@@ -53,6 +53,11 @@ export namespace ValidationSchemas {
shmMB: joi.number(), shmMB: joi.number(),
authFile: joi.string(), authFile: joi.string(),
nasMode: joi.string().valid('classic_mode', 'enas_mode', 'oneshot_mode', 'darts_mode'), nasMode: joi.string().valid('classic_mode', 'enas_mode', 'oneshot_mode', 'darts_mode'),
portList: joi.array().items(joi.object({
label: joi.string().required(),
beginAt: joi.number().required(),
portNumber: joi.number().required(),
})),
worker: joi.object({ worker: joi.object({
replicas: joi.number().min(1).required(), replicas: joi.number().min(1).required(),
image: joi.string().min(1), image: joi.string().min(1),
...@@ -120,7 +125,8 @@ export namespace ValidationSchemas { ...@@ -120,7 +125,8 @@ export namespace ValidationSchemas {
azureStorage: joi.object({ azureStorage: joi.object({
accountName: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){3,31}$/), accountName: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){3,31}$/),
azureShare: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){3,63}$/) azureShare: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){3,63}$/)
}) }),
uploadRetryCount: joi.number().min(1)
}), }),
frameworkcontroller_config: joi.object({ frameworkcontroller_config: joi.object({
storage: joi.string().min(1), storage: joi.string().min(1),
...@@ -136,7 +142,8 @@ export namespace ValidationSchemas { ...@@ -136,7 +142,8 @@ export namespace ValidationSchemas {
azureStorage: joi.object({ azureStorage: joi.object({
accountName: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){3,31}$/), accountName: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){3,31}$/),
azureShare: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){3,63}$/) azureShare: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){3,63}$/)
}) }),
uploadRetryCount: joi.number().min(1)
}), }),
nni_manager_ip: joi.object({ nni_manager_ip: joi.object({
nniManagerIp: joi.string().min(1) nniManagerIp: joi.string().min(1)
......
...@@ -35,15 +35,15 @@ export namespace AzureStorageClientUtility { ...@@ -35,15 +35,15 @@ export namespace AzureStorageClientUtility {
* @param fileServerClient * @param fileServerClient
* @param azureShare * @param azureShare
*/ */
export async function createShare(fileServerClient: any, azureShare: any): Promise<void> { export async function createShare(fileServerClient: any, azureShare: any): Promise<boolean> {
const deferred: Deferred<void> = new Deferred<void>(); const deferred: Deferred<boolean> = new Deferred<boolean>();
fileServerClient.createShareIfNotExists(azureShare, (error: any, result: any, response: any) => { fileServerClient.createShareIfNotExists(azureShare, (error: any, result: any, response: any) => {
if (error) { if (error) {
getLogger() getLogger()
.error(`Create share failed:, ${error}`); .error(`Create share failed:, ${error}`);
deferred.reject(error); deferred.resolve(false);
} else { } else {
deferred.resolve(); deferred.resolve(true);
} }
}); });
...@@ -56,18 +56,17 @@ export namespace AzureStorageClientUtility { ...@@ -56,18 +56,17 @@ export namespace AzureStorageClientUtility {
* @param azureFoler * @param azureFoler
* @param azureShare * @param azureShare
*/ */
export async function createDirectory(fileServerClient: azureStorage.FileService, azureFoler: any, azureShare: any): Promise<void> { export async function createDirectory(fileServerClient: azureStorage.FileService, azureFoler: any, azureShare: any): Promise<boolean> {
const deferred: Deferred<void> = new Deferred<void>(); const deferred: Deferred<boolean> = new Deferred<boolean>();
fileServerClient.createDirectoryIfNotExists(azureShare, azureFoler, (error: any, result: any, response: any) => { fileServerClient.createDirectoryIfNotExists(azureShare, azureFoler, (error: any, result: any, response: any) => {
if (error) { if (error) {
getLogger() getLogger()
.error(`Create directory failed:, ${error}`); .error(`Create directory failed:, ${error}`);
deferred.reject(error); deferred.resolve(false);
} else { } else {
deferred.resolve(); deferred.resolve(true);
} }
}); });
return deferred.promise; return deferred.promise;
} }
...@@ -77,16 +76,20 @@ export namespace AzureStorageClientUtility { ...@@ -77,16 +76,20 @@ export namespace AzureStorageClientUtility {
* @param azureDirectory * @param azureDirectory
*/ */
export async function createDirectoryRecursive(fileServerClient: azureStorage.FileService, azureDirectory: string, export async function createDirectoryRecursive(fileServerClient: azureStorage.FileService, azureDirectory: string,
azureShare: any): Promise<void> { azureShare: any): Promise<boolean> {
const deferred: Deferred<void> = new Deferred<void>(); const deferred: Deferred<boolean> = new Deferred<boolean>();
const directories: string[] = azureDirectory.split('/'); const directories: string[] = azureDirectory.split('/');
let rootDirectory: string = ''; let rootDirectory: string = '';
for (const directory of directories) { for (const directory of directories) {
rootDirectory += directory; rootDirectory += directory;
await createDirectory(fileServerClient, rootDirectory, azureShare); let result:boolean = await createDirectory(fileServerClient, rootDirectory, azureShare);
if (!result) {
deferred.resolve(false);
return deferred.promise;
}
rootDirectory += '/'; rootDirectory += '/';
} }
deferred.resolve(); deferred.resolve(true);
return deferred.promise; return deferred.promise;
} }
...@@ -100,16 +103,16 @@ export namespace AzureStorageClientUtility { ...@@ -100,16 +103,16 @@ export namespace AzureStorageClientUtility {
* @param localFilePath * @param localFilePath
*/ */
async function uploadFileToAzure(fileServerClient: any, azureDirectory: string, azureFileName: any, azureShare: any, async function uploadFileToAzure(fileServerClient: any, azureDirectory: string, azureFileName: any, azureShare: any,
localFilePath: string): Promise<void> { localFilePath: string): Promise<boolean> {
const deferred: Deferred<void> = new Deferred<void>(); const deferred: Deferred<boolean> = new Deferred<boolean>();
await fileServerClient.createFileFromLocalFile(azureShare, azureDirectory, azureFileName, localFilePath, await fileServerClient.createFileFromLocalFile(azureShare, azureDirectory, azureFileName, localFilePath,
(error: any, result: any, response: any) => { (error: any, result: any, response: any) => {
if (error) { if (error) {
getLogger() getLogger()
.error(`Upload file failed:, ${error}`); .error(`Upload file failed:, ${error}`);
deferred.reject(error); deferred.resolve(false);
} else { } else {
deferred.resolve(); deferred.resolve(true);
} }
}); });
...@@ -125,17 +128,17 @@ export namespace AzureStorageClientUtility { ...@@ -125,17 +128,17 @@ export namespace AzureStorageClientUtility {
* @param localFilePath * @param localFilePath
*/ */
async function downloadFile(fileServerClient: any, azureDirectory: string, azureFileName: any, azureShare: any, async function downloadFile(fileServerClient: any, azureDirectory: string, azureFileName: any, azureShare: any,
localFilePath: string): Promise<void> { localFilePath: string): Promise<boolean> {
const deferred: Deferred<void> = new Deferred<void>(); const deferred: Deferred<boolean> = new Deferred<boolean>();
// tslint:disable-next-line:non-literal-fs-path // tslint:disable-next-line:non-literal-fs-path
await fileServerClient.getFileToStream(azureShare, azureDirectory, azureFileName, fs.createWriteStream(localFilePath), await fileServerClient.getFileToStream(azureShare, azureDirectory, azureFileName, fs.createWriteStream(localFilePath),
(error: any, result: any, response: any) => { (error: any, result: any, response: any) => {
if (error) { if (error) {
getLogger() getLogger()
.error(`Download file failed:, ${error}`); .error(`Download file failed:, ${error}`);
deferred.reject(error); deferred.resolve(false);
} else { } else {
deferred.resolve(); deferred.resolve(true);
} }
}); });
...@@ -151,28 +154,38 @@ export namespace AzureStorageClientUtility { ...@@ -151,28 +154,38 @@ export namespace AzureStorageClientUtility {
*/ */
// tslint:disable:non-literal-fs-path // tslint:disable:non-literal-fs-path
export async function uploadDirectory(fileServerClient: azureStorage.FileService, azureDirectory: string, azureShare: any, export async function uploadDirectory(fileServerClient: azureStorage.FileService, azureDirectory: string, azureShare: any,
localDirectory: string): Promise<void> { localDirectory: string): Promise<boolean> {
const deferred: Deferred<void> = new Deferred<void>(); const deferred: Deferred<boolean> = new Deferred<boolean>();
const fileNameArray: string[] = fs.readdirSync(localDirectory); const fileNameArray: string[] = fs.readdirSync(localDirectory);
await createDirectoryRecursive(fileServerClient, azureDirectory, azureShare); let result: boolean = await createDirectoryRecursive(fileServerClient, azureDirectory, azureShare);
if (!result) {
deferred.resolve(false);
return deferred.promise;
}
for (const fileName of fileNameArray) { for (const fileName of fileNameArray) {
const fullFilePath: string = path.join(localDirectory, fileName); const fullFilePath: string = path.join(localDirectory, fileName);
try { try {
let resultUploadFile: boolean = true;
let resultUploadDir: boolean = true;
if (fs.lstatSync(fullFilePath) if (fs.lstatSync(fullFilePath)
.isFile()) { .isFile()) {
await uploadFileToAzure(fileServerClient, azureDirectory, fileName, azureShare, fullFilePath); resultUploadFile = await uploadFileToAzure(fileServerClient, azureDirectory, fileName, azureShare, fullFilePath);
} else { } else {
// If filePath is a directory, recursively copy it to azure // If filePath is a directory, recursively copy it to azure
await uploadDirectory(fileServerClient, String.Format('{0}/{1}', azureDirectory, fileName), azureShare, fullFilePath); resultUploadDir = await uploadDirectory(fileServerClient, String.Format('{0}/{1}', azureDirectory, fileName), azureShare, fullFilePath);
}
if (!(resultUploadFile && resultUploadDir)) {
deferred.resolve(false);
return deferred.promise;
} }
} catch (error) { } catch (error) {
deferred.reject(error); deferred.resolve(false);
return deferred.promise; return deferred.promise;
} }
} }
// All files/directories are copied successfully, resolve // All files/directories are copied successfully, resolve
deferred.resolve(); deferred.resolve(true);
return deferred.promise; return deferred.promise;
} }
......
...@@ -25,7 +25,7 @@ import * as path from 'path'; ...@@ -25,7 +25,7 @@ import * as path from 'path';
import * as component from '../../../common/component'; import * as component from '../../../common/component';
import { getExperimentId } from '../../../common/experimentStartupInfo'; import { getExperimentId } from '../../../common/experimentStartupInfo';
import { import {
JobApplicationForm, NNIManagerIpConfig, TrialJobApplicationForm, TrialJobDetail JobApplicationForm, NNIManagerIpConfig, TrialJobApplicationForm, TrialJobDetail, TrialJobStatus
} from '../../../common/trainingService'; } from '../../../common/trainingService';
import { delay, generateParamFileName, getExperimentRootDir, uniqueString } from '../../../common/utils'; import { delay, generateParamFileName, getExperimentRootDir, uniqueString } from '../../../common/utils';
import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../../common/containerJobData'; import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../../common/containerJobData';
...@@ -102,10 +102,13 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -102,10 +102,13 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
//upload code files //upload code files
const trialJobOutputUrl: string = await this.uploadCodeFiles(trialJobId, trialLocalTempFolder); const trialJobOutputUrl: string = await this.uploadCodeFiles(trialJobId, trialLocalTempFolder);
let initStatus: TrialJobStatus = 'WAITING';
if (!trialJobOutputUrl) {
initStatus = 'FAILED';
}
const trialJobDetail: KubernetesTrialJobDetail = new KubernetesTrialJobDetail( const trialJobDetail: KubernetesTrialJobDetail = new KubernetesTrialJobDetail(
trialJobId, trialJobId,
'WAITING', initStatus,
Date.now(), Date.now(),
trialWorkingFolder, trialWorkingFolder,
form, form,
...@@ -208,24 +211,10 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -208,24 +211,10 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
let trialJobOutputUrl: string = ''; let trialJobOutputUrl: string = '';
if (this.fcClusterConfig.storageType === 'azureStorage') { if (this.fcClusterConfig.storageType === 'azureStorage') {
if (this.azureStorageClient === undefined) { const azureFrameworkControllerClusterConfig: FrameworkControllerClusterConfigAzure =
throw new Error('azureStorageClient is not initialized'); <FrameworkControllerClusterConfigAzure>this.fcClusterConfig;
} trialJobOutputUrl = await this.uploadFilesToAzureStorage(trialJobId, trialLocalTempFolder, this.fcTrialConfig.codeDir,
try { azureFrameworkControllerClusterConfig.uploadRetryCount);
//upload local files, including scripts for running the trial and configuration (e.g., hyperparameters) for the trial, to azure storage
await AzureStorageClientUtility.uploadDirectory(
this.azureStorageClient, `nni/${getExperimentId()}/${trialJobId}`, this.azureStorageShare, `${trialLocalTempFolder}`);
//upload code files to azure storage
await AzureStorageClientUtility.uploadDirectory(
this.azureStorageClient, `nni/${getExperimentId()}/${trialJobId}`, this.azureStorageShare, `${this.fcTrialConfig.codeDir}`);
trialJobOutputUrl = `https://${this.azureStorageAccountName}.file.core.windows.net/` +
`${this.azureStorageShare}/${path.join('nni', getExperimentId(), trialJobId, 'output')}`;
} catch (error) {
this.log.error(error);
return Promise.reject(error);
}
} else if (this.fcClusterConfig.storageType === 'nfs') { } else if (this.fcClusterConfig.storageType === 'nfs') {
const nfsFrameworkControllerClusterConfig: FrameworkControllerClusterConfigNFS = const nfsFrameworkControllerClusterConfig: FrameworkControllerClusterConfigNFS =
<FrameworkControllerClusterConfigNFS>this.fcClusterConfig; <FrameworkControllerClusterConfigNFS>this.fcClusterConfig;
......
...@@ -27,7 +27,7 @@ import * as component from '../../../common/component'; ...@@ -27,7 +27,7 @@ import * as component from '../../../common/component';
import { getExperimentId } from '../../../common/experimentStartupInfo'; import { getExperimentId } from '../../../common/experimentStartupInfo';
import { import {
JobApplicationForm, NNIManagerIpConfig, TrialJobApplicationForm, TrialJobDetail JobApplicationForm, NNIManagerIpConfig, TrialJobApplicationForm, TrialJobDetail, TrialJobStatus
} from '../../../common/trainingService'; } from '../../../common/trainingService';
import { delay, generateParamFileName, getExperimentRootDir, uniqueString } from '../../../common/utils'; import { delay, generateParamFileName, getExperimentRootDir, uniqueString } from '../../../common/utils';
import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../../common/containerJobData'; import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../../common/containerJobData';
...@@ -102,9 +102,13 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -102,9 +102,13 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
await this.prepareRunScript(trialLocalTempFolder, trialJobId, trialWorkingFolder, curTrialSequenceId, form); await this.prepareRunScript(trialLocalTempFolder, trialJobId, trialWorkingFolder, curTrialSequenceId, form);
//upload files to storage //upload files to storage
const trialJobOutputUrl: string = await this.uploadCodeFiles(trialJobId, trialLocalTempFolder); const trialJobOutputUrl: string = await this.uploadCodeFiles(trialJobId, trialLocalTempFolder);
let initStatus: TrialJobStatus = 'WAITING';
if (!trialJobOutputUrl) {
initStatus = 'FAILED';
}
const trialJobDetail: KubernetesTrialJobDetail = new KubernetesTrialJobDetail( const trialJobDetail: KubernetesTrialJobDetail = new KubernetesTrialJobDetail(
trialJobId, trialJobId,
'WAITING', initStatus,
Date.now(), Date.now(),
trialWorkingFolder, trialWorkingFolder,
form, form,
...@@ -215,23 +219,8 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -215,23 +219,8 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
if (this.azureStorageClient === undefined) { if (this.azureStorageClient === undefined) {
throw new Error('azureStorageClient is not initialized'); throw new Error('azureStorageClient is not initialized');
} }
try { const azureKubeflowClusterConfig: KubeflowClusterConfigAzure = <KubeflowClusterConfigAzure>this.kubeflowClusterConfig;
//upload local files, including scripts for running the trial and configuration (e.g., hyperparameters) for the trial, to azure storage trialJobOutputUrl = await this.uploadFilesToAzureStorage(trialJobId, trialLocalTempFolder, this.kubeflowTrialConfig.codeDir, azureKubeflowClusterConfig.uploadRetryCount);
await AzureStorageClientUtility.uploadDirectory(this.azureStorageClient,
`nni/${getExperimentId()}/${trialJobId}`, this.azureStorageShare,
`${trialLocalTempFolder}`);
//upload code files to azure storage
await AzureStorageClientUtility.uploadDirectory(this.azureStorageClient,
`nni/${getExperimentId()}/${trialJobId}`, this.azureStorageShare,
`${this.kubeflowTrialConfig.codeDir}`);
trialJobOutputUrl = `https://${this.azureStorageAccountName}.file.core.windows.net/${this.azureStorageShare}` +
`/${path.join('nni', getExperimentId(), trialJobId, 'output')}`;
} catch (error) {
this.log.error(error);
return Promise.reject(error);
}
} else if (this.kubeflowClusterConfig.storage === 'nfs' || this.kubeflowClusterConfig.storage === undefined) { } else if (this.kubeflowClusterConfig.storage === 'nfs' || this.kubeflowClusterConfig.storage === undefined) {
const nfsKubeflowClusterConfig: KubeflowClusterConfigNFS = <KubeflowClusterConfigNFS>this.kubeflowClusterConfig; const nfsKubeflowClusterConfig: KubeflowClusterConfigNFS = <KubeflowClusterConfigNFS>this.kubeflowClusterConfig;
// Create work dir for current trial in NFS directory // Create work dir for current trial in NFS directory
......
...@@ -75,16 +75,19 @@ export class KubernetesClusterConfigNFS extends KubernetesClusterConfig { ...@@ -75,16 +75,19 @@ export class KubernetesClusterConfigNFS extends KubernetesClusterConfig {
export class KubernetesClusterConfigAzure extends KubernetesClusterConfig { export class KubernetesClusterConfigAzure extends KubernetesClusterConfig {
public readonly keyVault: KeyVaultConfig; public readonly keyVault: KeyVaultConfig;
public readonly azureStorage: AzureStorage; public readonly azureStorage: AzureStorage;
public readonly uploadRetryCount: number | undefined;
constructor( constructor(
apiVersion: string, apiVersion: string,
keyVault: KeyVaultConfig, keyVault: KeyVaultConfig,
azureStorage: AzureStorage, azureStorage: AzureStorage,
storage?: KubernetesStorageKind storage?: KubernetesStorageKind,
uploadRetryCount?: number
) { ) {
super(apiVersion, storage); super(apiVersion, storage);
this.keyVault = keyVault; this.keyVault = keyVault;
this.azureStorage = azureStorage; this.azureStorage = azureStorage;
this.uploadRetryCount = uploadRetryCount;
} }
public get storageType(): KubernetesStorageKind { public get storageType(): KubernetesStorageKind {
...@@ -98,7 +101,8 @@ export class KubernetesClusterConfigAzure extends KubernetesClusterConfig { ...@@ -98,7 +101,8 @@ export class KubernetesClusterConfigAzure extends KubernetesClusterConfig {
kubernetesClusterConfigObjectAzure.apiVersion, kubernetesClusterConfigObjectAzure.apiVersion,
kubernetesClusterConfigObjectAzure.keyVault, kubernetesClusterConfigObjectAzure.keyVault,
kubernetesClusterConfigObjectAzure.azureStorage, kubernetesClusterConfigObjectAzure.azureStorage,
kubernetesClusterConfigObjectAzure.storage kubernetesClusterConfigObjectAzure.storage,
kubernetesClusterConfigObjectAzure.uploadRetryCount
); );
} }
} }
......
...@@ -31,13 +31,14 @@ import { getLogger, Logger } from '../../common/log'; ...@@ -31,13 +31,14 @@ import { getLogger, Logger } from '../../common/log';
import { import {
NNIManagerIpConfig, TrialJobDetail, TrialJobMetric NNIManagerIpConfig, TrialJobDetail, TrialJobMetric
} from '../../common/trainingService'; } from '../../common/trainingService';
import { getExperimentRootDir, getIPV4Address, getJobCancelStatus, getVersion, uniqueString } from '../../common/utils'; import { delay, getExperimentRootDir, getIPV4Address, getJobCancelStatus, getVersion, uniqueString } from '../../common/utils';
import { AzureStorageClientUtility } from './azureStorageClientUtils'; import { AzureStorageClientUtility } from './azureStorageClientUtils';
import { GeneralK8sClient, KubernetesCRDClient } from './kubernetesApiClient'; import { GeneralK8sClient, KubernetesCRDClient } from './kubernetesApiClient';
import { KubernetesClusterConfig } from './kubernetesConfig'; import { KubernetesClusterConfig } from './kubernetesConfig';
import { kubernetesScriptFormat, KubernetesTrialJobDetail } from './kubernetesData'; import { kubernetesScriptFormat, KubernetesTrialJobDetail } from './kubernetesData';
import { KubernetesJobRestServer } from './kubernetesJobRestServer'; import { KubernetesJobRestServer } from './kubernetesJobRestServer';
var yaml = require('js-yaml');
var fs = require('fs'); var fs = require('fs');
/** /**
...@@ -357,6 +358,52 @@ abstract class KubernetesTrainingService { ...@@ -357,6 +358,52 @@ abstract class KubernetesTrainingService {
); );
return registrySecretName; return registrySecretName;
} }
/**
 * Upload a trial's generated files and the user's code directory to the Azure file share,
 * retrying when an upload reports failure.
 *
 * Both local directories are uploaded to `nni/<experimentId>/<trialJobId>` on
 * `this.azureStorageShare`. A directory that has already been uploaded successfully
 * is not uploaded again on later attempts.
 *
 * @param trialJobId id of the trial whose files are uploaded
 * @param trialLocalTempFolder local folder uploaded first (trial run scripts / parameter files)
 * @param codeDir local folder holding the trial code, uploaded second
 * @param uploadRetryCount number of re-upload attempts allowed after the first attempt fails;
 *                         falls back to 1 when undefined or 0
 * @returns URL of the trial's `output` folder on the share, or an empty string when the
 *          upload did not succeed (retries exhausted, or an exception was raised while uploading)
 * @throws Error when `azureStorageClient` is not initialized
 */
protected async uploadFilesToAzureStorage(trialJobId: string, trialLocalTempFolder: String, codeDir: String, uploadRetryCount: number | undefined): Promise<string> {
    if (this.azureStorageClient === undefined) {
        throw new Error('azureStorageClient is not initialized');
    }
    const storageClient = this.azureStorageClient;
    const remoteDirectory: string = `nni/${getExperimentId()}/${trialJobId}`;

    // One initial attempt plus the configured number of retries. The previous
    // `do { } while (retryCount-- >= 0)` loop performed retryCount + 2 attempts.
    // Always make at least one attempt, even if a non-validated negative count slips through.
    const retryCount: number = uploadRetryCount ? uploadRetryCount : 1;
    const maxAttempts: number = Math.max(1, 1 + retryCount);

    let resultUploadNNIScript: boolean = false;
    let resultUploadCodeFile: boolean = false;
    try {
        for (let attempt: number = 1; attempt <= maxAttempts; attempt++) {
            //upload local files, including scripts for running the trial and configuration (e.g., hyperparameters) for the trial, to azure storage
            if (!resultUploadNNIScript) {
                resultUploadNNIScript = await AzureStorageClientUtility.uploadDirectory(
                    storageClient, remoteDirectory, this.azureStorageShare, `${trialLocalTempFolder}`);
            }
            //upload code files to azure storage
            if (!resultUploadCodeFile) {
                resultUploadCodeFile = await AzureStorageClientUtility.uploadDirectory(
                    storageClient, remoteDirectory, this.azureStorageShare, `${codeDir}`);
            }
            if (resultUploadNNIScript && resultUploadCodeFile) {
                // Use POSIX joining: this is a URL, so the separator must be '/' on every platform.
                return `https://${this.azureStorageAccountName}.file.core.windows.net/${this.azureStorageShare}` +
                    `/${path.posix.join('nni', getExperimentId(), trialJobId, 'output')}`;
            }
            // Only wait (and announce a retry) when another attempt will actually follow.
            if (attempt < maxAttempts) {
                this.log.info('Upload failed, Retry: upload files to azure-storage');
                //wait for 5 seconds to re-upload files
                await delay(5000);
            }
        }
    } catch (error) {
        this.log.error(error);
        //return an empty url when an error is raised
        return '';
    }

    this.log.info(`Retry-count is used up, upload files to azureStorage for trial ${trialJobId} failed!`);
    return '';
}
} }
export { KubernetesTrainingService }; export { KubernetesTrainingService };
...@@ -39,6 +39,8 @@ export class PAITaskRole { ...@@ -39,6 +39,8 @@ export class PAITaskRole {
public readonly command: string; public readonly command: string;
//Shared memory for one task in the task role //Shared memory for one task in the task role
public readonly shmMB?: number; public readonly shmMB?: number;
//portList to specify the port used in container
public portList?: portListMetaData[];
/** /**
* Constructor * Constructor
...@@ -50,7 +52,7 @@ export class PAITaskRole { ...@@ -50,7 +52,7 @@ export class PAITaskRole {
* @param command Executable command for tasks in the task role, can not be empty * @param command Executable command for tasks in the task role, can not be empty
*/ */
constructor(name : string, taskNumber : number, cpuNumber : number, memoryMB : number, gpuNumber : number, constructor(name : string, taskNumber : number, cpuNumber : number, memoryMB : number, gpuNumber : number,
command : string, shmMB?: number) { command : string, shmMB?: number, portList?: portListMetaData[]) {
this.name = name; this.name = name;
this.taskNumber = taskNumber; this.taskNumber = taskNumber;
this.cpuNumber = cpuNumber; this.cpuNumber = cpuNumber;
...@@ -58,6 +60,7 @@ export class PAITaskRole { ...@@ -58,6 +60,7 @@ export class PAITaskRole {
this.gpuNumber = gpuNumber; this.gpuNumber = gpuNumber;
this.command = command; this.command = command;
this.shmMB = shmMB; this.shmMB = shmMB;
this.portList = portList;
} }
} }
...@@ -120,6 +123,16 @@ export class PAIClusterConfig { ...@@ -120,6 +123,16 @@ export class PAIClusterConfig {
} }
} }
/**
 * One entry of the `portList` setting of a PAI task role.
 *
 * A freshly constructed instance carries the defaults below: an empty label
 * and zero for both numeric fields.
 */
export class portListMetaData {
    // Name identifying this port entry
    public readonly label: string;
    // NOTE(review): presumably the first port of the range — confirm against the PAI job spec
    public readonly beginAt: number;
    // NOTE(review): presumably how many ports are requested — confirm against the PAI job spec
    public readonly portNumber: number;

    constructor() {
        this.label = '';
        this.beginAt = 0;
        this.portNumber = 0;
    }
}
/** /**
* PAI trial configuration * PAI trial configuration
*/ */
...@@ -134,9 +147,11 @@ export class NNIPAITrialConfig extends TrialConfig { ...@@ -134,9 +147,11 @@ export class NNIPAITrialConfig extends TrialConfig {
public shmMB?: number; public shmMB?: number;
//authentication file used for private Docker registry //authentication file used for private Docker registry
public authFile?: string; public authFile?: string;
//portList to specify the port used in container
public portList?: portListMetaData[];
constructor(command : string, codeDir : string, gpuNum : number, cpuNum: number, memoryMB: number, constructor(command : string, codeDir : string, gpuNum : number, cpuNum: number, memoryMB: number,
image: string, virtualCluster?: string, shmMB?: number, authFile?: string) { image: string, virtualCluster?: string, shmMB?: number, authFile?: string, portList?: portListMetaData[]) {
super(command, codeDir, gpuNum); super(command, codeDir, gpuNum);
this.cpuNum = cpuNum; this.cpuNum = cpuNum;
this.memoryMB = memoryMB; this.memoryMB = memoryMB;
...@@ -144,5 +159,6 @@ export class NNIPAITrialConfig extends TrialConfig { ...@@ -144,5 +159,6 @@ export class NNIPAITrialConfig extends TrialConfig {
this.virtualCluster = virtualCluster; this.virtualCluster = virtualCluster;
this.shmMB = shmMB; this.shmMB = shmMB;
this.authFile = authFile; this.authFile = authFile;
this.portList = portList;
} }
} }
...@@ -79,6 +79,7 @@ class PAITrainingService implements TrainingService { ...@@ -79,6 +79,7 @@ class PAITrainingService implements TrainingService {
private logCollection: string; private logCollection: string;
private isMultiPhase: boolean = false; private isMultiPhase: boolean = false;
private authFileHdfsPath: string | undefined = undefined; private authFileHdfsPath: string | undefined = undefined;
private portList?: string | undefined;
constructor() { constructor() {
this.log = getLogger(); this.log = getLogger();
...@@ -446,6 +447,8 @@ class PAITrainingService implements TrainingService { ...@@ -446,6 +447,8 @@ class PAITrainingService implements TrainingService {
nniPaiTrialCommand, nniPaiTrialCommand,
// Task shared memory // Task shared memory
this.paiTrialConfig.shmMB, this.paiTrialConfig.shmMB,
// Task portList
this.paiTrialConfig.portList
) )
]; ];
......
...@@ -1410,14 +1410,7 @@ js-tokens@^4.0.0: ...@@ -1410,14 +1410,7 @@ js-tokens@^4.0.0:
version "4.0.0" version "4.0.0"
resolved "https://registry.yarnpkg.com/js-tokens/-/js-tokens-4.0.0.tgz#19203fb59991df98e3a287050d4647cdeaf32499" resolved "https://registry.yarnpkg.com/js-tokens/-/js-tokens-4.0.0.tgz#19203fb59991df98e3a287050d4647cdeaf32499"
js-yaml@^3.10.0: js-yaml@^3.10.0, js-yaml@^3.13.1:
version "3.12.0"
resolved "https://registry.yarnpkg.com/js-yaml/-/js-yaml-3.12.0.tgz#eaed656ec8344f10f527c6bfa1b6e2244de167d1"
dependencies:
argparse "^1.0.7"
esprima "^4.0.0"
js-yaml@^3.13.1:
version "3.13.1" version "3.13.1"
resolved "https://registry.yarnpkg.com/js-yaml/-/js-yaml-3.13.1.tgz#aff151b30bfdfa8e49e05da22e7415e9dfa37847" resolved "https://registry.yarnpkg.com/js-yaml/-/js-yaml-3.13.1.tgz#aff151b30bfdfa8e49e05da22e7415e9dfa37847"
dependencies: dependencies:
......
...@@ -32,7 +32,7 @@ def classic_mode( ...@@ -32,7 +32,7 @@ def classic_mode(
'''Execute the chosen function and inputs directly. '''Execute the chosen function and inputs directly.
In this mode, the trial code is only running the chosen subgraph (i.e., the chosen ops and inputs), In this mode, the trial code is only running the chosen subgraph (i.e., the chosen ops and inputs),
without touching the full model graph.''' without touching the full model graph.'''
if trial._params is None: if trial.get_current_parameter() is None:
trial.get_next_parameter() trial.get_next_parameter()
mutable_block = trial.get_current_parameter(mutable_id) mutable_block = trial.get_current_parameter(mutable_id)
chosen_layer = mutable_block[mutable_layer_id]["chosen_layer"] chosen_layer = mutable_block[mutable_layer_id]["chosen_layer"]
...@@ -118,7 +118,7 @@ def oneshot_mode( ...@@ -118,7 +118,7 @@ def oneshot_mode(
The difference is that oneshot mode does not receive subgraph. The difference is that oneshot mode does not receive subgraph.
Instead, it uses dropout to randomly dropout inputs and ops.''' Instead, it uses dropout to randomly dropout inputs and ops.'''
# NNI requires calling get_next_parameter before reporting a result, but the parameter is not used in this mode # NNI requires calling get_next_parameter before reporting a result, but the parameter is not used in this mode
if trial._params is None: if trial.get_current_parameter() is None:
trial.get_next_parameter() trial.get_next_parameter()
optional_inputs = list(optional_inputs.values()) optional_inputs = list(optional_inputs.values())
inputs_num = len(optional_inputs) inputs_num = len(optional_inputs)
......
...@@ -189,6 +189,6 @@ else: ...@@ -189,6 +189,6 @@ else:
raise RuntimeError('Unrecognized mode: %s' % mode) raise RuntimeError('Unrecognized mode: %s' % mode)
def _get_param(key): def _get_param(key):
if trial._params is None: if trial.get_current_parameter() is None:
trial.get_next_parameter() trial.get_next_parameter()
return trial.get_current_parameter(key) return trial.get_current_parameter(key)
...@@ -50,10 +50,12 @@ def get_next_parameter(): ...@@ -50,10 +50,12 @@ def get_next_parameter():
return None return None
return _params['parameters'] return _params['parameters']
def get_current_parameter(tag): def get_current_parameter(tag=None):
global _params global _params
if _params is None: if _params is None:
return None return None
if tag is None:
return _params['parameters']
return _params['parameters'][tag] return _params['parameters'][tag]
def get_experiment_id(): def get_experiment_id():
......
...@@ -85,7 +85,9 @@ class DefaultPoint extends React.Component<DefaultPointProps, DefaultPointState> ...@@ -85,7 +85,9 @@ class DefaultPoint extends React.Component<DefaultPointProps, DefaultPointState>
}); });
// deal with best metric line // deal with best metric line
const bestCurve: Array<number | object>[] = []; // best curve data source const bestCurve: Array<number | object>[] = []; // best curve data source
bestCurve.push([lineListDefault[0][0], lineListDefault[0][1], accSource[0].searchSpace]); if (lineListDefault[0] !== undefined) {
bestCurve.push([lineListDefault[0][0], lineListDefault[0][1], accSource[0].searchSpace]);
}
if (optimize === 'maximize') { if (optimize === 'maximize') {
for (let i = 1; i < lineListDefault.length; i++) { for (let i = 1; i < lineListDefault.length; i++) {
const val = lineListDefault[i][1]; const val = lineListDefault[i][1];
......
...@@ -115,6 +115,7 @@ ...@@ -115,6 +115,7 @@
} }
#detail-button{ #detail-button{
margin: 2px 0;
.common-style, .common-style:visited, .common-style:focus{ .common-style, .common-style:visited, .common-style:focus{
height: 26px; height: 26px;
border: none; border: none;
...@@ -131,7 +132,7 @@ ...@@ -131,7 +132,7 @@
.common-style:disabled{ .common-style:disabled{
background-color: #f4f4f4; background-color: #f4f4f4;
} }
.special, .special:visited, .special:focus{ .special, .special:visited, .special:focus, .special button{
height: 26px; height: 26px;
border: none; border: none;
border-radius: 0; border-radius: 0;
...@@ -146,7 +147,7 @@ ...@@ -146,7 +147,7 @@
background-color: #c8c8c8; background-color: #c8c8c8;
outline: 0; outline: 0;
} }
.special:disabled{ .special:disabled, .special button:disabled{
background-color: #f4f4f4; background-color: #f4f4f4;
color: #d9d9d9; color: #d9d9d9;
} }
......
...@@ -240,7 +240,12 @@ pai_trial_schema = { ...@@ -240,7 +240,12 @@ pai_trial_schema = {
Optional('outputDir'): And(Regex(r'hdfs://(([0-9]{1,3}.){3}[0-9]{1,3})(:[0-9]{2,5})?(/.*)?'),\ Optional('outputDir'): And(Regex(r'hdfs://(([0-9]{1,3}.){3}[0-9]{1,3})(:[0-9]{2,5})?(/.*)?'),\
error='ERROR: outputDir format error, outputDir format is hdfs://xxx.xxx.xxx.xxx:xxx'), error='ERROR: outputDir format error, outputDir format is hdfs://xxx.xxx.xxx.xxx:xxx'),
Optional('virtualCluster'): setType('virtualCluster', str), Optional('virtualCluster'): setType('virtualCluster', str),
Optional('nasMode'): setChoice('nasMode', 'classic_mode', 'enas_mode', 'oneshot_mode', 'darts_mode') Optional('nasMode'): setChoice('nasMode', 'classic_mode', 'enas_mode', 'oneshot_mode', 'darts_mode'),
Optional('portList'): [{
"label": setType('label', str),
"beginAt": setType('beginAt', int),
"portNumber": setType('portNumber', int)
}]
} }
} }
...@@ -310,7 +315,8 @@ kubeflow_config_schema = { ...@@ -310,7 +315,8 @@ kubeflow_config_schema = {
error='ERROR: accountName format error, accountName support using (0-9|a-z|A-Z|-)'), error='ERROR: accountName format error, accountName support using (0-9|a-z|A-Z|-)'),
'azureShare': And(Regex('([0-9]|[a-z]|[A-Z]|-){3,63}'),\ 'azureShare': And(Regex('([0-9]|[a-z]|[A-Z]|-){3,63}'),\
error='ERROR: azureShare format error, azureShare support using (0-9|a-z|A-Z|-)') error='ERROR: azureShare format error, azureShare support using (0-9|a-z|A-Z|-)')
} },
Optional('uploadRetryCount'): setNumberRange('uploadRetryCount', int, 1, 99999)
}) })
} }
...@@ -356,7 +362,8 @@ frameworkcontroller_config_schema = { ...@@ -356,7 +362,8 @@ frameworkcontroller_config_schema = {
error='ERROR: accountName format error, accountName support using (0-9|a-z|A-Z|-)'), error='ERROR: accountName format error, accountName support using (0-9|a-z|A-Z|-)'),
'azureShare': And(Regex('([0-9]|[a-z]|[A-Z]|-){3,63}'),\ 'azureShare': And(Regex('([0-9]|[a-z]|[A-Z]|-){3,63}'),\
error='ERROR: azureShare format error, azureShare support using (0-9|a-z|A-Z|-)') error='ERROR: azureShare format error, azureShare support using (0-9|a-z|A-Z|-)')
} },
Optional('uploadRetryCount'): setNumberRange('uploadRetryCount', int, 1, 99999)
}) })
} }
......
...@@ -198,10 +198,7 @@ def validate_common_content(experiment_config): ...@@ -198,10 +198,7 @@ def validate_common_content(experiment_config):
Schema({**separate_schema_dict[separate_key]['customized']}).validate(experiment_config[separate_key]) Schema({**separate_schema_dict[separate_key]['customized']}).validate(experiment_config[separate_key])
except SchemaError as error: except SchemaError as error:
print_error('Your config file is not correct, please check your config file content!') print_error('Your config file is not correct, please check your config file content!')
if error.__str__().__contains__('Wrong key'): print_error(error.code)
print_error(' '.join(error.__str__().split()[:3]))
else:
print_error(error)
exit(1) exit(1)
#set default value #set default value
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment