Unverified Commit a5fa2351 authored by SparkSnail's avatar SparkSnail Committed by GitHub
Browse files

Do not copy codeDir when submit trial in Kubeflow and Frameworkcontroller mode (#1309)

parent 251a439d
...@@ -201,6 +201,10 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -201,6 +201,10 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
throw new Error('Kubeflow Cluster config is not initialized'); throw new Error('Kubeflow Cluster config is not initialized');
} }
if (this.fcTrialConfig === undefined) {
throw new Error('Kubeflow trial config is not initialized');
}
let trialJobOutputUrl: string = ''; let trialJobOutputUrl: string = '';
if (this.fcClusterConfig.storageType === 'azureStorage') { if (this.fcClusterConfig.storageType === 'azureStorage') {
...@@ -208,12 +212,15 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -208,12 +212,15 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
throw new Error('azureStorageClient is not initialized'); throw new Error('azureStorageClient is not initialized');
} }
try { try {
//upload local files to azure storage //upload local files, including scripts for running the trial and configuration (e.g., hyperparameters) for the trial, to azure storage
await AzureStorageClientUtility.uploadDirectory( await AzureStorageClientUtility.uploadDirectory(
this.azureStorageClient, `nni/${getExperimentId()}/${trialJobId}`, this.azureStorageShare, `${trialLocalTempFolder}`); this.azureStorageClient, `nni/${getExperimentId()}/${trialJobId}`, this.azureStorageShare, `${trialLocalTempFolder}`);
//upload code files to azure storage
await AzureStorageClientUtility.uploadDirectory(
this.azureStorageClient, `nni/${getExperimentId()}/${trialJobId}`, this.azureStorageShare, `${this.fcTrialConfig.codeDir}`);
trialJobOutputUrl = `https://${this.azureStorageAccountName}.file.core.windows.net/\ trialJobOutputUrl = `https://${this.azureStorageAccountName}.file.core.windows.net/` +
${this.azureStorageShare}/${path.join('nni', getExperimentId(), trialJobId, 'output')}`; `${this.azureStorageShare}/${path.join('nni', getExperimentId(), trialJobId, 'output')}`;
} catch (error) { } catch (error) {
this.log.error(error); this.log.error(error);
...@@ -226,7 +233,8 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -226,7 +233,8 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
await cpp.exec(`mkdir -p ${this.trialLocalNFSTempFolder}/nni/${getExperimentId()}/${trialJobId}`); await cpp.exec(`mkdir -p ${this.trialLocalNFSTempFolder}/nni/${getExperimentId()}/${trialJobId}`);
// Copy code files from local dir to NFS mounted dir // Copy code files from local dir to NFS mounted dir
await cpp.exec(`cp -r ${trialLocalTempFolder}/* ${this.trialLocalNFSTempFolder}/nni/${getExperimentId()}/${trialJobId}/.`); await cpp.exec(`cp -r ${trialLocalTempFolder}/* ${this.trialLocalNFSTempFolder}/nni/${getExperimentId()}/${trialJobId}/.`);
// Copy codeDir to NFS mounted dir
await cpp.exec(`cp -r ${this.fcTrialConfig.codeDir}/* ${this.trialLocalNFSTempFolder}/nni/${getExperimentId()}/${trialJobId}/.`);
const nfsConfig: NFSConfig = nfsFrameworkControllerClusterConfig.nfs; const nfsConfig: NFSConfig = nfsFrameworkControllerClusterConfig.nfs;
trialJobOutputUrl = `nfs://${nfsConfig.server}:${path.join(nfsConfig.path, 'nni', getExperimentId(), trialJobId, 'output')}`; trialJobOutputUrl = `nfs://${nfsConfig.server}:${path.join(nfsConfig.path, 'nni', getExperimentId(), trialJobId, 'output')}`;
} }
...@@ -257,13 +265,12 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -257,13 +265,12 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
throw new Error('frameworkcontroller trial config is not initialized'); throw new Error('frameworkcontroller trial config is not initialized');
} }
await cpp.exec(`mkdir -p ${path.dirname(trialLocalTempFolder)}`); await cpp.exec(`mkdir -p ${trialLocalTempFolder}`);
await cpp.exec(`cp -r ${this.fcTrialConfig.codeDir} ${trialLocalTempFolder}`);
const installScriptContent : string = CONTAINER_INSTALL_NNI_SHELL_FORMAT; const installScriptContent : string = CONTAINER_INSTALL_NNI_SHELL_FORMAT;
// Write NNI installation file to local tmp files // Write NNI installation file to local tmp files
await fs.promises.writeFile(path.join(trialLocalTempFolder, 'install_nni.sh'), installScriptContent, { encoding: 'utf8' }); await fs.promises.writeFile(path.join(trialLocalTempFolder, 'install_nni.sh'), installScriptContent, { encoding: 'utf8' });
// Create tmp trial working folder locally. // Create tmp trial working folder locally.
await cpp.exec(`mkdir -p ${trialLocalTempFolder}`);
for (const taskRole of this.fcTrialConfig.taskRoles) { for (const taskRole of this.fcTrialConfig.taskRoles) {
const runScriptContent: string = const runScriptContent: string =
......
...@@ -201,6 +201,10 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -201,6 +201,10 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
throw new Error('Kubeflow Cluster config is not initialized'); throw new Error('Kubeflow Cluster config is not initialized');
} }
if (this.kubeflowTrialConfig === undefined) {
throw new Error('Kubeflow Trial config is not initialized');
}
let trialJobOutputUrl: string = ''; let trialJobOutputUrl: string = '';
assert(this.kubeflowClusterConfig.storage === undefined assert(this.kubeflowClusterConfig.storage === undefined
...@@ -212,13 +216,17 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -212,13 +216,17 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
throw new Error('azureStorageClient is not initialized'); throw new Error('azureStorageClient is not initialized');
} }
try { try {
//upload local files to azure storage //upload local files, including scripts for running the trial and configuration (e.g., hyperparameters) for the trial, to azure storage
await AzureStorageClientUtility.uploadDirectory(this.azureStorageClient, await AzureStorageClientUtility.uploadDirectory(this.azureStorageClient,
`nni/${getExperimentId()}/${trialJobId}`, this.azureStorageShare, `nni/${getExperimentId()}/${trialJobId}`, this.azureStorageShare,
`${trialLocalTempFolder}`); `${trialLocalTempFolder}`);
//upload code files to azure storage
await AzureStorageClientUtility.uploadDirectory(this.azureStorageClient,
`nni/${getExperimentId()}/${trialJobId}`, this.azureStorageShare,
`${this.kubeflowTrialConfig.codeDir}`);
trialJobOutputUrl = `https://${this.azureStorageAccountName}.file.core.windows.net/${this.azureStorageShare}\ trialJobOutputUrl = `https://${this.azureStorageAccountName}.file.core.windows.net/${this.azureStorageShare}` +
/${path.join('nni', getExperimentId(), trialJobId, 'output')}`; `/${path.join('nni', getExperimentId(), trialJobId, 'output')}`;
} catch (error) { } catch (error) {
this.log.error(error); this.log.error(error);
...@@ -228,9 +236,10 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -228,9 +236,10 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
const nfsKubeflowClusterConfig: KubeflowClusterConfigNFS = <KubeflowClusterConfigNFS>this.kubeflowClusterConfig; const nfsKubeflowClusterConfig: KubeflowClusterConfigNFS = <KubeflowClusterConfigNFS>this.kubeflowClusterConfig;
// Creat work dir for current trial in NFS directory // Creat work dir for current trial in NFS directory
await cpp.exec(`mkdir -p ${this.trialLocalNFSTempFolder}/nni/${getExperimentId()}/${trialJobId}`); await cpp.exec(`mkdir -p ${this.trialLocalNFSTempFolder}/nni/${getExperimentId()}/${trialJobId}`);
// Copy code files from local dir to NFS mounted dir // Copy script files from local dir to NFS mounted dir
await cpp.exec(`cp -r ${trialLocalTempFolder}/* ${this.trialLocalNFSTempFolder}/nni/${getExperimentId()}/${trialJobId}/.`); await cpp.exec(`cp -r ${trialLocalTempFolder}/* ${this.trialLocalNFSTempFolder}/nni/${getExperimentId()}/${trialJobId}/.`);
// Copy codeDir to NFS mounted dir
await cpp.exec(`cp -r ${this.kubeflowTrialConfig.codeDir}/* ${this.trialLocalNFSTempFolder}/nni/${getExperimentId()}/${trialJobId}/.`);
const nfsConfig: NFSConfig = nfsKubeflowClusterConfig.nfs; const nfsConfig: NFSConfig = nfsKubeflowClusterConfig.nfs;
trialJobOutputUrl = `nfs://${nfsConfig.server}:${path.join(nfsConfig.path, 'nni', getExperimentId(), trialJobId, 'output')}`; trialJobOutputUrl = `nfs://${nfsConfig.server}:${path.join(nfsConfig.path, 'nni', getExperimentId(), trialJobId, 'output')}`;
} }
...@@ -255,13 +264,10 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -255,13 +264,10 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
} }
//create tmp trial working folder locally. //create tmp trial working folder locally.
await cpp.exec(`mkdir -p ${path.dirname(trialLocalTempFolder)}`); await cpp.exec(`mkdir -p ${trialLocalTempFolder}`);
await cpp.exec(`cp -r ${kubeflowTrialConfig.codeDir} ${trialLocalTempFolder}`);
const runScriptContent : string = CONTAINER_INSTALL_NNI_SHELL_FORMAT; const runScriptContent : string = CONTAINER_INSTALL_NNI_SHELL_FORMAT;
// Write NNI installation file to local tmp files // Write NNI installation file to local tmp files
await fs.promises.writeFile(path.join(trialLocalTempFolder, 'install_nni.sh'), runScriptContent, { encoding: 'utf8' }); await fs.promises.writeFile(path.join(trialLocalTempFolder, 'install_nni.sh'), runScriptContent, { encoding: 'utf8' });
// Create tmp trial working folder locally.
await cpp.exec(`mkdir -p ${trialLocalTempFolder}`);
// Write worker file content run_worker.sh to local tmp folders // Write worker file content run_worker.sh to local tmp folders
if (kubeflowTrialConfig.worker !== undefined) { if (kubeflowTrialConfig.worker !== undefined) {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment