"src/include/gridwise_direct_convolution_1.hip.hpp" did not exist on "39775d484c4d15a5b895edfc9d2323f05ab2d3d4"
Unverified Commit 2d252c9e authored by SparkSnail's avatar SparkSnail Committed by GitHub
Browse files

Add retry policy for azureStorage (#1480)

parent a224f4f2
...@@ -519,6 +519,10 @@ machineList: ...@@ -519,6 +519,10 @@ machineList:
__azureShare__ is the share of the azure file storage. __azureShare__ is the share of the azure file storage.
* __uploadRetryCount__
If uploading files to Azure Storage fails, NNI retries the upload; this field specifies the number of attempts to re-upload the files.
* __paiConfig__ * __paiConfig__
* __userName__ * __userName__
......
...@@ -125,7 +125,8 @@ export namespace ValidationSchemas { ...@@ -125,7 +125,8 @@ export namespace ValidationSchemas {
azureStorage: joi.object({ azureStorage: joi.object({
accountName: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){3,31}$/), accountName: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){3,31}$/),
azureShare: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){3,63}$/) azureShare: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){3,63}$/)
}) }),
uploadRetryCount: joi.number().min(1)
}), }),
frameworkcontroller_config: joi.object({ frameworkcontroller_config: joi.object({
storage: joi.string().min(1), storage: joi.string().min(1),
...@@ -141,7 +142,8 @@ export namespace ValidationSchemas { ...@@ -141,7 +142,8 @@ export namespace ValidationSchemas {
azureStorage: joi.object({ azureStorage: joi.object({
accountName: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){3,31}$/), accountName: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){3,31}$/),
azureShare: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){3,63}$/) azureShare: joi.string().regex(/^([0-9]|[a-z]|[A-Z]|-){3,63}$/)
}) }),
uploadRetryCount: joi.number().min(1)
}), }),
nni_manager_ip: joi.object({ nni_manager_ip: joi.object({
nniManagerIp: joi.string().min(1) nniManagerIp: joi.string().min(1)
......
...@@ -35,15 +35,15 @@ export namespace AzureStorageClientUtility { ...@@ -35,15 +35,15 @@ export namespace AzureStorageClientUtility {
* @param fileServerClient * @param fileServerClient
* @param azureShare * @param azureShare
*/ */
export async function createShare(fileServerClient: any, azureShare: any): Promise<void> { export async function createShare(fileServerClient: any, azureShare: any): Promise<boolean> {
const deferred: Deferred<void> = new Deferred<void>(); const deferred: Deferred<boolean> = new Deferred<boolean>();
fileServerClient.createShareIfNotExists(azureShare, (error: any, result: any, response: any) => { fileServerClient.createShareIfNotExists(azureShare, (error: any, result: any, response: any) => {
if (error) { if (error) {
getLogger() getLogger()
.error(`Create share failed:, ${error}`); .error(`Create share failed:, ${error}`);
deferred.reject(error); deferred.resolve(false);
} else { } else {
deferred.resolve(); deferred.resolve(true);
} }
}); });
...@@ -56,18 +56,17 @@ export namespace AzureStorageClientUtility { ...@@ -56,18 +56,17 @@ export namespace AzureStorageClientUtility {
* @param azureFoler * @param azureFoler
* @param azureShare * @param azureShare
*/ */
export async function createDirectory(fileServerClient: azureStorage.FileService, azureFoler: any, azureShare: any): Promise<void> { export async function createDirectory(fileServerClient: azureStorage.FileService, azureFoler: any, azureShare: any): Promise<boolean> {
const deferred: Deferred<void> = new Deferred<void>(); const deferred: Deferred<boolean> = new Deferred<boolean>();
fileServerClient.createDirectoryIfNotExists(azureShare, azureFoler, (error: any, result: any, response: any) => { fileServerClient.createDirectoryIfNotExists(azureShare, azureFoler, (error: any, result: any, response: any) => {
if (error) { if (error) {
getLogger() getLogger()
.error(`Create directory failed:, ${error}`); .error(`Create directory failed:, ${error}`);
deferred.reject(error); deferred.resolve(false);
} else { } else {
deferred.resolve(); deferred.resolve(true);
} }
}); });
return deferred.promise; return deferred.promise;
} }
...@@ -77,16 +76,20 @@ export namespace AzureStorageClientUtility { ...@@ -77,16 +76,20 @@ export namespace AzureStorageClientUtility {
* @param azureDirectory * @param azureDirectory
*/ */
export async function createDirectoryRecursive(fileServerClient: azureStorage.FileService, azureDirectory: string, export async function createDirectoryRecursive(fileServerClient: azureStorage.FileService, azureDirectory: string,
azureShare: any): Promise<void> { azureShare: any): Promise<boolean> {
const deferred: Deferred<void> = new Deferred<void>(); const deferred: Deferred<boolean> = new Deferred<boolean>();
const directories: string[] = azureDirectory.split('/'); const directories: string[] = azureDirectory.split('/');
let rootDirectory: string = ''; let rootDirectory: string = '';
for (const directory of directories) { for (const directory of directories) {
rootDirectory += directory; rootDirectory += directory;
await createDirectory(fileServerClient, rootDirectory, azureShare); let result:boolean = await createDirectory(fileServerClient, rootDirectory, azureShare);
if (!result) {
deferred.resolve(false);
return deferred.promise;
}
rootDirectory += '/'; rootDirectory += '/';
} }
deferred.resolve(); deferred.resolve(true);
return deferred.promise; return deferred.promise;
} }
...@@ -100,16 +103,16 @@ export namespace AzureStorageClientUtility { ...@@ -100,16 +103,16 @@ export namespace AzureStorageClientUtility {
* @param localFilePath * @param localFilePath
*/ */
async function uploadFileToAzure(fileServerClient: any, azureDirectory: string, azureFileName: any, azureShare: any, async function uploadFileToAzure(fileServerClient: any, azureDirectory: string, azureFileName: any, azureShare: any,
localFilePath: string): Promise<void> { localFilePath: string): Promise<boolean> {
const deferred: Deferred<void> = new Deferred<void>(); const deferred: Deferred<boolean> = new Deferred<boolean>();
await fileServerClient.createFileFromLocalFile(azureShare, azureDirectory, azureFileName, localFilePath, await fileServerClient.createFileFromLocalFile(azureShare, azureDirectory, azureFileName, localFilePath,
(error: any, result: any, response: any) => { (error: any, result: any, response: any) => {
if (error) { if (error) {
getLogger() getLogger()
.error(`Upload file failed:, ${error}`); .error(`Upload file failed:, ${error}`);
deferred.reject(error); deferred.resolve(false);
} else { } else {
deferred.resolve(); deferred.resolve(true);
} }
}); });
...@@ -125,17 +128,17 @@ export namespace AzureStorageClientUtility { ...@@ -125,17 +128,17 @@ export namespace AzureStorageClientUtility {
* @param localFilePath * @param localFilePath
*/ */
async function downloadFile(fileServerClient: any, azureDirectory: string, azureFileName: any, azureShare: any, async function downloadFile(fileServerClient: any, azureDirectory: string, azureFileName: any, azureShare: any,
localFilePath: string): Promise<void> { localFilePath: string): Promise<boolean> {
const deferred: Deferred<void> = new Deferred<void>(); const deferred: Deferred<boolean> = new Deferred<boolean>();
// tslint:disable-next-line:non-literal-fs-path // tslint:disable-next-line:non-literal-fs-path
await fileServerClient.getFileToStream(azureShare, azureDirectory, azureFileName, fs.createWriteStream(localFilePath), await fileServerClient.getFileToStream(azureShare, azureDirectory, azureFileName, fs.createWriteStream(localFilePath),
(error: any, result: any, response: any) => { (error: any, result: any, response: any) => {
if (error) { if (error) {
getLogger() getLogger()
.error(`Download file failed:, ${error}`); .error(`Download file failed:, ${error}`);
deferred.reject(error); deferred.resolve(false);
} else { } else {
deferred.resolve(); deferred.resolve(true);
} }
}); });
...@@ -151,28 +154,38 @@ export namespace AzureStorageClientUtility { ...@@ -151,28 +154,38 @@ export namespace AzureStorageClientUtility {
*/ */
// tslint:disable:non-literal-fs-path // tslint:disable:non-literal-fs-path
export async function uploadDirectory(fileServerClient: azureStorage.FileService, azureDirectory: string, azureShare: any, export async function uploadDirectory(fileServerClient: azureStorage.FileService, azureDirectory: string, azureShare: any,
localDirectory: string): Promise<void> { localDirectory: string): Promise<boolean> {
const deferred: Deferred<void> = new Deferred<void>(); const deferred: Deferred<boolean> = new Deferred<boolean>();
const fileNameArray: string[] = fs.readdirSync(localDirectory); const fileNameArray: string[] = fs.readdirSync(localDirectory);
await createDirectoryRecursive(fileServerClient, azureDirectory, azureShare); let result: boolean = await createDirectoryRecursive(fileServerClient, azureDirectory, azureShare);
if (!result) {
deferred.resolve(false);
return deferred.promise;
}
for (const fileName of fileNameArray) { for (const fileName of fileNameArray) {
const fullFilePath: string = path.join(localDirectory, fileName); const fullFilePath: string = path.join(localDirectory, fileName);
try { try {
let resultUploadFile: boolean = true;
let resultUploadDir: boolean = true;
if (fs.lstatSync(fullFilePath) if (fs.lstatSync(fullFilePath)
.isFile()) { .isFile()) {
await uploadFileToAzure(fileServerClient, azureDirectory, fileName, azureShare, fullFilePath); resultUploadFile = await uploadFileToAzure(fileServerClient, azureDirectory, fileName, azureShare, fullFilePath);
} else { } else {
// If filePath is a directory, recuisively copy it to azure // If filePath is a directory, recuisively copy it to azure
await uploadDirectory(fileServerClient, String.Format('{0}/{1}', azureDirectory, fileName), azureShare, fullFilePath); resultUploadDir = await uploadDirectory(fileServerClient, String.Format('{0}/{1}', azureDirectory, fileName), azureShare, fullFilePath);
}
if (!(resultUploadFile && resultUploadDir)) {
deferred.resolve(false);
return deferred.promise;
} }
} catch (error) { } catch (error) {
deferred.reject(error); deferred.resolve(false);
return deferred.promise; return deferred.promise;
} }
} }
// All files/directories are copied successfully, resolve // All files/directories are copied successfully, resolve
deferred.resolve(); deferred.resolve(true);
return deferred.promise; return deferred.promise;
} }
......
...@@ -25,7 +25,7 @@ import * as path from 'path'; ...@@ -25,7 +25,7 @@ import * as path from 'path';
import * as component from '../../../common/component'; import * as component from '../../../common/component';
import { getExperimentId } from '../../../common/experimentStartupInfo'; import { getExperimentId } from '../../../common/experimentStartupInfo';
import { import {
JobApplicationForm, NNIManagerIpConfig, TrialJobApplicationForm, TrialJobDetail JobApplicationForm, NNIManagerIpConfig, TrialJobApplicationForm, TrialJobDetail, TrialJobStatus
} from '../../../common/trainingService'; } from '../../../common/trainingService';
import { delay, generateParamFileName, getExperimentRootDir, uniqueString } from '../../../common/utils'; import { delay, generateParamFileName, getExperimentRootDir, uniqueString } from '../../../common/utils';
import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../../common/containerJobData'; import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../../common/containerJobData';
...@@ -102,10 +102,13 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -102,10 +102,13 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
//upload code files //upload code files
const trialJobOutputUrl: string = await this.uploadCodeFiles(trialJobId, trialLocalTempFolder); const trialJobOutputUrl: string = await this.uploadCodeFiles(trialJobId, trialLocalTempFolder);
let initStatus: TrialJobStatus = 'WAITING';
if (!trialJobOutputUrl) {
initStatus = 'FAILED';
}
const trialJobDetail: KubernetesTrialJobDetail = new KubernetesTrialJobDetail( const trialJobDetail: KubernetesTrialJobDetail = new KubernetesTrialJobDetail(
trialJobId, trialJobId,
'WAITING', initStatus,
Date.now(), Date.now(),
trialWorkingFolder, trialWorkingFolder,
form, form,
...@@ -208,24 +211,10 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -208,24 +211,10 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
let trialJobOutputUrl: string = ''; let trialJobOutputUrl: string = '';
if (this.fcClusterConfig.storageType === 'azureStorage') { if (this.fcClusterConfig.storageType === 'azureStorage') {
if (this.azureStorageClient === undefined) { const azureFrameworkControllerClusterConfig: FrameworkControllerClusterConfigAzure =
throw new Error('azureStorageClient is not initialized'); <FrameworkControllerClusterConfigAzure>this.fcClusterConfig;
} trialJobOutputUrl = await this.uploadFilesToAzureStorage(trialJobId, trialLocalTempFolder, this.fcTrialConfig.codeDir,
try { azureFrameworkControllerClusterConfig.uploadRetryCount);
//upload local files, including scripts for running the trial and configuration (e.g., hyperparameters) for the trial, to azure storage
await AzureStorageClientUtility.uploadDirectory(
this.azureStorageClient, `nni/${getExperimentId()}/${trialJobId}`, this.azureStorageShare, `${trialLocalTempFolder}`);
//upload code files to azure storage
await AzureStorageClientUtility.uploadDirectory(
this.azureStorageClient, `nni/${getExperimentId()}/${trialJobId}`, this.azureStorageShare, `${this.fcTrialConfig.codeDir}`);
trialJobOutputUrl = `https://${this.azureStorageAccountName}.file.core.windows.net/` +
`${this.azureStorageShare}/${path.join('nni', getExperimentId(), trialJobId, 'output')}`;
} catch (error) {
this.log.error(error);
return Promise.reject(error);
}
} else if (this.fcClusterConfig.storageType === 'nfs') { } else if (this.fcClusterConfig.storageType === 'nfs') {
const nfsFrameworkControllerClusterConfig: FrameworkControllerClusterConfigNFS = const nfsFrameworkControllerClusterConfig: FrameworkControllerClusterConfigNFS =
<FrameworkControllerClusterConfigNFS>this.fcClusterConfig; <FrameworkControllerClusterConfigNFS>this.fcClusterConfig;
......
...@@ -27,7 +27,7 @@ import * as component from '../../../common/component'; ...@@ -27,7 +27,7 @@ import * as component from '../../../common/component';
import { getExperimentId } from '../../../common/experimentStartupInfo'; import { getExperimentId } from '../../../common/experimentStartupInfo';
import { import {
JobApplicationForm, NNIManagerIpConfig, TrialJobApplicationForm, TrialJobDetail JobApplicationForm, NNIManagerIpConfig, TrialJobApplicationForm, TrialJobDetail, TrialJobStatus
} from '../../../common/trainingService'; } from '../../../common/trainingService';
import { delay, generateParamFileName, getExperimentRootDir, uniqueString } from '../../../common/utils'; import { delay, generateParamFileName, getExperimentRootDir, uniqueString } from '../../../common/utils';
import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../../common/containerJobData'; import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../../common/containerJobData';
...@@ -102,9 +102,13 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -102,9 +102,13 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
await this.prepareRunScript(trialLocalTempFolder, trialJobId, trialWorkingFolder, curTrialSequenceId, form); await this.prepareRunScript(trialLocalTempFolder, trialJobId, trialWorkingFolder, curTrialSequenceId, form);
//upload files to sotrage //upload files to sotrage
const trialJobOutputUrl: string = await this.uploadCodeFiles(trialJobId, trialLocalTempFolder); const trialJobOutputUrl: string = await this.uploadCodeFiles(trialJobId, trialLocalTempFolder);
let initStatus: TrialJobStatus = 'WAITING';
if (!trialJobOutputUrl) {
initStatus = 'FAILED';
}
const trialJobDetail: KubernetesTrialJobDetail = new KubernetesTrialJobDetail( const trialJobDetail: KubernetesTrialJobDetail = new KubernetesTrialJobDetail(
trialJobId, trialJobId,
'WAITING', initStatus,
Date.now(), Date.now(),
trialWorkingFolder, trialWorkingFolder,
form, form,
...@@ -215,23 +219,8 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -215,23 +219,8 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
if (this.azureStorageClient === undefined) { if (this.azureStorageClient === undefined) {
throw new Error('azureStorageClient is not initialized'); throw new Error('azureStorageClient is not initialized');
} }
try { const azureKubeflowClusterConfig: KubeflowClusterConfigAzure = <KubeflowClusterConfigAzure>this.kubeflowClusterConfig;
//upload local files, including scripts for running the trial and configuration (e.g., hyperparameters) for the trial, to azure storage trialJobOutputUrl = await this.uploadFilesToAzureStorage(trialJobId, trialLocalTempFolder, this.kubeflowTrialConfig.codeDir, azureKubeflowClusterConfig.uploadRetryCount);
await AzureStorageClientUtility.uploadDirectory(this.azureStorageClient,
`nni/${getExperimentId()}/${trialJobId}`, this.azureStorageShare,
`${trialLocalTempFolder}`);
//upload code files to azure storage
await AzureStorageClientUtility.uploadDirectory(this.azureStorageClient,
`nni/${getExperimentId()}/${trialJobId}`, this.azureStorageShare,
`${this.kubeflowTrialConfig.codeDir}`);
trialJobOutputUrl = `https://${this.azureStorageAccountName}.file.core.windows.net/${this.azureStorageShare}` +
`/${path.join('nni', getExperimentId(), trialJobId, 'output')}`;
} catch (error) {
this.log.error(error);
return Promise.reject(error);
}
} else if (this.kubeflowClusterConfig.storage === 'nfs' || this.kubeflowClusterConfig.storage === undefined) { } else if (this.kubeflowClusterConfig.storage === 'nfs' || this.kubeflowClusterConfig.storage === undefined) {
const nfsKubeflowClusterConfig: KubeflowClusterConfigNFS = <KubeflowClusterConfigNFS>this.kubeflowClusterConfig; const nfsKubeflowClusterConfig: KubeflowClusterConfigNFS = <KubeflowClusterConfigNFS>this.kubeflowClusterConfig;
// Creat work dir for current trial in NFS directory // Creat work dir for current trial in NFS directory
......
...@@ -75,16 +75,19 @@ export class KubernetesClusterConfigNFS extends KubernetesClusterConfig { ...@@ -75,16 +75,19 @@ export class KubernetesClusterConfigNFS extends KubernetesClusterConfig {
export class KubernetesClusterConfigAzure extends KubernetesClusterConfig { export class KubernetesClusterConfigAzure extends KubernetesClusterConfig {
public readonly keyVault: KeyVaultConfig; public readonly keyVault: KeyVaultConfig;
public readonly azureStorage: AzureStorage; public readonly azureStorage: AzureStorage;
public readonly uploadRetryCount: number | undefined;
constructor( constructor(
apiVersion: string, apiVersion: string,
keyVault: KeyVaultConfig, keyVault: KeyVaultConfig,
azureStorage: AzureStorage, azureStorage: AzureStorage,
storage?: KubernetesStorageKind storage?: KubernetesStorageKind,
uploadRetryCount?: number
) { ) {
super(apiVersion, storage); super(apiVersion, storage);
this.keyVault = keyVault; this.keyVault = keyVault;
this.azureStorage = azureStorage; this.azureStorage = azureStorage;
this.uploadRetryCount = uploadRetryCount;
} }
public get storageType(): KubernetesStorageKind { public get storageType(): KubernetesStorageKind {
...@@ -98,7 +101,8 @@ export class KubernetesClusterConfigAzure extends KubernetesClusterConfig { ...@@ -98,7 +101,8 @@ export class KubernetesClusterConfigAzure extends KubernetesClusterConfig {
kubernetesClusterConfigObjectAzure.apiVersion, kubernetesClusterConfigObjectAzure.apiVersion,
kubernetesClusterConfigObjectAzure.keyVault, kubernetesClusterConfigObjectAzure.keyVault,
kubernetesClusterConfigObjectAzure.azureStorage, kubernetesClusterConfigObjectAzure.azureStorage,
kubernetesClusterConfigObjectAzure.storage kubernetesClusterConfigObjectAzure.storage,
kubernetesClusterConfigObjectAzure.uploadRetryCount
); );
} }
} }
......
...@@ -31,13 +31,14 @@ import { getLogger, Logger } from '../../common/log'; ...@@ -31,13 +31,14 @@ import { getLogger, Logger } from '../../common/log';
import { import {
NNIManagerIpConfig, TrialJobDetail, TrialJobMetric NNIManagerIpConfig, TrialJobDetail, TrialJobMetric
} from '../../common/trainingService'; } from '../../common/trainingService';
import { getExperimentRootDir, getIPV4Address, getJobCancelStatus, getVersion, uniqueString } from '../../common/utils'; import { delay, getExperimentRootDir, getIPV4Address, getJobCancelStatus, getVersion, uniqueString } from '../../common/utils';
import { AzureStorageClientUtility } from './azureStorageClientUtils'; import { AzureStorageClientUtility } from './azureStorageClientUtils';
import { GeneralK8sClient, KubernetesCRDClient } from './kubernetesApiClient'; import { GeneralK8sClient, KubernetesCRDClient } from './kubernetesApiClient';
import { KubernetesClusterConfig } from './kubernetesConfig'; import { KubernetesClusterConfig } from './kubernetesConfig';
import { kubernetesScriptFormat, KubernetesTrialJobDetail } from './kubernetesData'; import { kubernetesScriptFormat, KubernetesTrialJobDetail } from './kubernetesData';
import { KubernetesJobRestServer } from './kubernetesJobRestServer'; import { KubernetesJobRestServer } from './kubernetesJobRestServer';
var yaml = require('js-yaml');
var fs = require('fs'); var fs = require('fs');
/** /**
...@@ -357,6 +358,52 @@ abstract class KubernetesTrainingService { ...@@ -357,6 +358,52 @@ abstract class KubernetesTrainingService {
); );
return registrySecretName; return registrySecretName;
} }
/**
 * Upload the trial's local temp folder and the code directory to the azure file share,
 * re-trying the parts that failed.
 *
 * Both directories are uploaded into the same remote directory
 * `nni/<experimentId>/<trialJobId>`. A directory that was already uploaded
 * successfully is not uploaded again on a later attempt.
 *
 * @param trialJobId id of the trial job; part of the remote directory and of the returned url
 * @param trialLocalTempFolder local folder holding the scripts/configuration generated for the trial
 * @param codeDir local folder holding the code files of the trial
 * @param uploadRetryCount number of re-upload attempts made after the first attempt failed;
 *        a falsy value (undefined, 0) falls back to 1
 * @returns the output url of the trial on azure storage, or an empty string when the upload
 *          still failed after all retries or an exception was raised while uploading
 * @throws Error when azureStorageClient is not initialized
 */
protected async uploadFilesToAzureStorage(trialJobId: string, trialLocalTempFolder: String, codeDir: String, uploadRetryCount: number | undefined): Promise<string> {
    if (this.azureStorageClient === undefined) {
        throw new Error('azureStorageClient is not initialized');
    }
    // NOTE(review): the boxed `String` parameter types are kept so the signature stays unchanged for callers.
    const retryCount: number = uploadRetryCount ? uploadRetryCount : 1;
    const azureDirectory: string = `nni/${getExperimentId()}/${trialJobId}`;
    let resultUploadNNIScript: boolean = false;
    let resultUploadCodeFile: boolean = false;
    try {
        // attempt 0 is the initial upload, attempts 1..retryCount are the retries,
        // so exactly `retryCount` re-uploads are made at most.
        for (let attempt: number = 0; attempt <= retryCount; attempt++) {
            if (attempt > 0) {
                // Only wait when another attempt really follows
                this.log.info('Upload failed, Retry: upload files to azure-storage');
                //wait for 5 seconds to re-upload files
                await delay(5000);
            }
            //upload local files, including scripts for running the trial and configuration (e.g., hyperparameters) for the trial, to azure storage
            if (!resultUploadNNIScript) {
                resultUploadNNIScript = await AzureStorageClientUtility.uploadDirectory(
                    this.azureStorageClient, azureDirectory, this.azureStorageShare, `${trialLocalTempFolder}`);
            }
            //upload code files to azure storage
            if (!resultUploadCodeFile) {
                resultUploadCodeFile = await AzureStorageClientUtility.uploadDirectory(
                    this.azureStorageClient, azureDirectory, this.azureStorageShare, `${codeDir}`);
            }
            if (resultUploadNNIScript && resultUploadCodeFile) {
                return `https://${this.azureStorageAccountName}.file.core.windows.net/${this.azureStorageShare}` +
                    `/${path.join('nni', getExperimentId(), trialJobId, 'output')}`;
            }
        }
    } catch (error) {
        this.log.error(error);
        //return a empty url when got error
        return '';
    }
    // Every attempt failed: callers treat the empty url as an upload failure
    this.log.info(`Retry-count is used up, upload files to azureStorage for trial ${trialJobId} failed!`);

    return '';
}
} }
export { KubernetesTrainingService }; export { KubernetesTrainingService };
...@@ -315,7 +315,8 @@ kubeflow_config_schema = { ...@@ -315,7 +315,8 @@ kubeflow_config_schema = {
error='ERROR: accountName format error, accountName support using (0-9|a-z|A-Z|-)'), error='ERROR: accountName format error, accountName support using (0-9|a-z|A-Z|-)'),
'azureShare': And(Regex('([0-9]|[a-z]|[A-Z]|-){3,63}'),\ 'azureShare': And(Regex('([0-9]|[a-z]|[A-Z]|-){3,63}'),\
error='ERROR: azureShare format error, azureShare support using (0-9|a-z|A-Z|-)') error='ERROR: azureShare format error, azureShare support using (0-9|a-z|A-Z|-)')
} },
Optional('uploadRetryCount'): setNumberRange('uploadRetryCount', int, 1, 99999)
}) })
} }
...@@ -361,7 +362,8 @@ frameworkcontroller_config_schema = { ...@@ -361,7 +362,8 @@ frameworkcontroller_config_schema = {
error='ERROR: accountName format error, accountName support using (0-9|a-z|A-Z|-)'), error='ERROR: accountName format error, accountName support using (0-9|a-z|A-Z|-)'),
'azureShare': And(Regex('([0-9]|[a-z]|[A-Z]|-){3,63}'),\ 'azureShare': And(Regex('([0-9]|[a-z]|[A-Z]|-){3,63}'),\
error='ERROR: azureShare format error, azureShare support using (0-9|a-z|A-Z|-)') error='ERROR: azureShare format error, azureShare support using (0-9|a-z|A-Z|-)')
} },
Optional('uploadRetryCount'): setNumberRange('uploadRetryCount', int, 1, 99999)
}) })
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment