Unverified Commit 76f39903 authored by J-shang's avatar J-shang Committed by GitHub
Browse files

sharedstorage support remote umount and fix bug (#3456)

parent 200a1086
...@@ -124,6 +124,12 @@ class LinuxCommands extends OsCommands { ...@@ -124,6 +124,12 @@ class LinuxCommands extends OsCommands {
command = `bash '${script}'`; command = `bash '${script}'`;
} else { } else {
script = script.replace(/"/g, '\\"'); script = script.replace(/"/g, '\\"');
const result = script.match(/[^\\]\\\\"/g);
if (result) {
result.forEach((res) => {
script = script.replace(res, res.replace(/"$/g, '\\"'));
})
}
command = `bash -c "${script}"`; command = `bash -c "${script}"`;
} }
return command; return command;
......
...@@ -181,7 +181,7 @@ export class RemoteEnvironmentService extends EnvironmentService { ...@@ -181,7 +181,7 @@ export class RemoteEnvironmentService extends EnvironmentService {
} else { } else {
environment.setStatus('FAILED'); environment.setStatus('FAILED');
} }
this.releaseEnvironmentResource(environment); await this.releaseEnvironmentResource(environment);
} }
} }
} }
...@@ -194,7 +194,16 @@ export class RemoteEnvironmentService extends EnvironmentService { ...@@ -194,7 +194,16 @@ export class RemoteEnvironmentService extends EnvironmentService {
* If a environment is finished, release the connection resource * If a environment is finished, release the connection resource
* @param environment remote machine environment job detail * @param environment remote machine environment job detail
*/ */
private releaseEnvironmentResource(environment: EnvironmentInformation): void { private async releaseEnvironmentResource(environment: EnvironmentInformation): Promise<void> {
if (environment.useSharedStorage) {
const executor = await this.getExecutor(environment.id);
const remoteUmountCommand = component.get<SharedStorageService>(SharedStorageService).remoteUmountCommand;
const result = await executor.executeScript(remoteUmountCommand, false, false);
if (result.exitCode !== 0) {
this.log.error(`Umount shared storage on remote machine failed.\n ERROR: ${result.stderr}`);
}
}
const executorManager = this.environmentExecutorManagerMap.get(environment.id); const executorManager = this.environmentExecutorManagerMap.get(environment.id);
if (executorManager === undefined) { if (executorManager === undefined) {
throw new Error(`ExecutorManager is not assigned for environment ${environment.id}`); throw new Error(`ExecutorManager is not assigned for environment ${environment.id}`);
...@@ -251,8 +260,11 @@ export class RemoteEnvironmentService extends EnvironmentService { ...@@ -251,8 +260,11 @@ export class RemoteEnvironmentService extends EnvironmentService {
const executor = await this.getExecutor(environment.id); const executor = await this.getExecutor(environment.id);
if (environment.useSharedStorage) { if (environment.useSharedStorage) {
this.remoteExperimentRootDir = component.get<SharedStorageService>(SharedStorageService).remoteWorkingRoot; this.remoteExperimentRootDir = component.get<SharedStorageService>(SharedStorageService).remoteWorkingRoot;
const remoteMountCommand = component.get<SharedStorageService>(SharedStorageService).remoteMountCommand; const remoteMountCommand = component.get<SharedStorageService>(SharedStorageService).remoteMountCommand.replace(/echo -e /g, `echo `).replace(/echo /g, `echo -e `);
await executor.executeScript(remoteMountCommand, false, false); const result = await executor.executeScript(remoteMountCommand, false, false);
if (result.exitCode !== 0) {
throw new Error(`Mount shared storage on remote machine failed.\n ERROR: ${result.stderr}`);
}
} else { } else {
this.remoteExperimentRootDir = executor.getRemoteExperimentRootDir(getExperimentId()); this.remoteExperimentRootDir = executor.getRemoteExperimentRootDir(getExperimentId());
} }
...@@ -304,14 +316,14 @@ export class RemoteEnvironmentService extends EnvironmentService { ...@@ -304,14 +316,14 @@ export class RemoteEnvironmentService extends EnvironmentService {
if (environment.status === 'UNKNOWN') { if (environment.status === 'UNKNOWN') {
environment.status = 'USER_CANCELED'; environment.status = 'USER_CANCELED';
this.releaseEnvironmentResource(environment); await this.releaseEnvironmentResource(environment);
return return
} }
const jobpidPath: string = `${environment.runnerWorkingFolder}/pid`; const jobpidPath: string = `${environment.runnerWorkingFolder}/pid`;
try { try {
await executor.killChildProcesses(jobpidPath); await executor.killChildProcesses(jobpidPath);
this.releaseEnvironmentResource(environment); await this.releaseEnvironmentResource(environment);
} catch (error) { } catch (error) {
this.log.error(`stopEnvironment: ${error}`); this.log.error(`stopEnvironment: ${error}`);
} }
......
...@@ -20,6 +20,8 @@ export abstract class SharedStorageService { ...@@ -20,6 +20,8 @@ export abstract class SharedStorageService {
public abstract get storageService(): StorageService; public abstract get storageService(): StorageService;
public abstract get localMountCommand(): string; public abstract get localMountCommand(): string;
public abstract get remoteMountCommand(): string; public abstract get remoteMountCommand(): string;
public abstract get remoteUmountCommand(): string;
public abstract get localWorkingRoot(): string; public abstract get localWorkingRoot(): string;
public abstract get remoteWorkingRoot(): string; public abstract get remoteWorkingRoot(): string;
public abstract cleanUp(): Promise<void>;
} }
...@@ -79,6 +79,7 @@ export class AzureBlobSharedStorageService extends SharedStorageService { ...@@ -79,6 +79,7 @@ export class AzureBlobSharedStorageService extends SharedStorageService {
private log: Logger; private log: Logger;
private internalStorageService: MountedStorageService; private internalStorageService: MountedStorageService;
private experimentId: string; private experimentId: string;
private localMounted?: string;
private storageType?: SharedStorageType; private storageType?: SharedStorageType;
private storageAccountName?: string; private storageAccountName?: string;
...@@ -113,10 +114,10 @@ export class AzureBlobSharedStorageService extends SharedStorageService { ...@@ -113,10 +114,10 @@ export class AzureBlobSharedStorageService extends SharedStorageService {
this.log.error(errorMessage); this.log.error(errorMessage);
return Promise.reject(errorMessage); return Promise.reject(errorMessage);
} }
this.localMounted = azureblobConfig.localMounted;
if (azureblobConfig.localMounted === 'nnimount') { if (this.localMounted === 'nnimount') {
await this.helpLocalMount(); await this.helpLocalMount();
} else if (azureblobConfig.localMounted === 'nomount') { } else if (this.localMounted === 'nomount') {
const errorMessage = `${this.storageType} Shared Storage: ${this.storageType} not Support 'nomount' yet.`; const errorMessage = `${this.storageType} Shared Storage: ${this.storageType} not Support 'nomount' yet.`;
this.log.error(errorMessage); this.log.error(errorMessage);
return Promise.reject(errorMessage); return Promise.reject(errorMessage);
...@@ -154,6 +155,15 @@ export class AzureBlobSharedStorageService extends SharedStorageService { ...@@ -154,6 +155,15 @@ export class AzureBlobSharedStorageService extends SharedStorageService {
} }
} }
public get remoteUmountCommand(): string {
if (this.remoteMountPoint) {
return `sudo umount -l ${this.remoteMountPoint}`;
} else {
this.log.error(`${this.storageType} Shared Storage: remoteMountPoint is not initialized.`);
return '';
}
}
private getCommand(mountPoint: string): string { private getCommand(mountPoint: string): string {
const install = `rm -f nni_install_fuseblob.sh && touch nni_install_fuseblob.sh && echo "${INSTALL_BLOBFUSE.replace(/\$/g, `\\$`).replace(/\n/g, `\\n`).replace(/"/g, `\\"`)}" >> nni_install_fuseblob.sh && bash nni_install_fuseblob.sh`; const install = `rm -f nni_install_fuseblob.sh && touch nni_install_fuseblob.sh && echo "${INSTALL_BLOBFUSE.replace(/\$/g, `\\$`).replace(/\n/g, `\\n`).replace(/"/g, `\\"`)}" >> nni_install_fuseblob.sh && bash nni_install_fuseblob.sh`;
const prepare = `sudo mkdir /mnt/resource/nniblobfusetmp -p && rm -f nni_fuse_connection.cfg && touch nni_fuse_connection.cfg && echo "accountName ${this.storageAccountName}\\naccountKey ${this.storageAccountKey}\\ncontainerName ${this.containerName}" >> nni_fuse_connection.cfg`; const prepare = `sudo mkdir /mnt/resource/nniblobfusetmp -p && rm -f nni_fuse_connection.cfg && touch nni_fuse_connection.cfg && echo "accountName ${this.storageAccountName}\\naccountKey ${this.storageAccountKey}\\ncontainerName ${this.containerName}" >> nni_fuse_connection.cfg`;
...@@ -206,4 +216,21 @@ export class AzureBlobSharedStorageService extends SharedStorageService { ...@@ -206,4 +216,21 @@ export class AzureBlobSharedStorageService extends SharedStorageService {
return Promise.reject(errorMessage); return Promise.reject(errorMessage);
} }
} }
public async cleanUp(): Promise<void> {
if (this.localMounted !== 'nnimount') {
return Promise.resolve();
}
try {
const result = await cpp.exec(`sudo umount -l ${this.localMountPoint}`);
if (result.stderr) {
throw new Error(result.stderr);
}
} catch (error) {
const errorMessage: string = `${this.storageType} Shared Storage: Umount ${this.localMountPoint} failed, error is ${error}`;
this.log.error(errorMessage);
return Promise.reject(errorMessage);
}
return Promise.resolve();
}
} }
...@@ -59,6 +59,7 @@ export class NFSSharedStorageService extends SharedStorageService { ...@@ -59,6 +59,7 @@ export class NFSSharedStorageService extends SharedStorageService {
private log: Logger; private log: Logger;
private internalStorageService: MountedStorageService; private internalStorageService: MountedStorageService;
private experimentId: string; private experimentId: string;
private localMounted?: string;
private storageType?: SharedStorageType; private storageType?: SharedStorageType;
private nfsServer?: string; private nfsServer?: string;
...@@ -83,9 +84,10 @@ export class NFSSharedStorageService extends SharedStorageService { ...@@ -83,9 +84,10 @@ export class NFSSharedStorageService extends SharedStorageService {
this.storageType = nfsConfig.storageType; this.storageType = nfsConfig.storageType;
this.nfsServer = nfsConfig.nfsServer; this.nfsServer = nfsConfig.nfsServer;
this.exportedDirectory = nfsConfig.exportedDirectory; this.exportedDirectory = nfsConfig.exportedDirectory;
if (nfsConfig.localMounted === 'nnimount') { this.localMounted = nfsConfig.localMounted;
if (this.localMounted === 'nnimount') {
await this.helpLocalMount(); await this.helpLocalMount();
} else if (nfsConfig.localMounted === 'nomount') { } else if (this.localMounted === 'nomount') {
const errorMessage = `${this.storageType} Shared Storage: ${this.storageType} not Support 'nomount'.`; const errorMessage = `${this.storageType} Shared Storage: ${this.storageType} not Support 'nomount'.`;
this.log.error(errorMessage); this.log.error(errorMessage);
return Promise.reject(errorMessage); return Promise.reject(errorMessage);
...@@ -122,6 +124,15 @@ export class NFSSharedStorageService extends SharedStorageService { ...@@ -122,6 +124,15 @@ export class NFSSharedStorageService extends SharedStorageService {
} }
} }
public get remoteUmountCommand(): string {
if (this.remoteMountPoint) {
return `sudo umount -f -l ${this.remoteMountPoint}`;
} else {
this.log.error(`${this.storageType} Shared Storage: remoteMountPoint is not initialized.`);
return '';
}
}
private getCommand(mountPoint: string): string { private getCommand(mountPoint: string): string {
const install = `rm -f nni_install_nfsclient.sh && touch nni_install_nfsclient.sh && echo "${INSTALL_NFS_CLIENT.replace(/\$/g, `\\$`).replace(/\n/g, `\\n`).replace(/"/g, `\\"`)}" >> nni_install_nfsclient.sh && bash nni_install_nfsclient.sh`; const install = `rm -f nni_install_nfsclient.sh && touch nni_install_nfsclient.sh && echo "${INSTALL_NFS_CLIENT.replace(/\$/g, `\\$`).replace(/\n/g, `\\n`).replace(/"/g, `\\"`)}" >> nni_install_nfsclient.sh && bash nni_install_nfsclient.sh`;
const mount = `mkdir -p ${mountPoint} && sudo mount ${this.nfsServer}:${this.exportedDirectory} ${mountPoint}`; const mount = `mkdir -p ${mountPoint} && sudo mount ${this.nfsServer}:${this.exportedDirectory} ${mountPoint}`;
...@@ -157,4 +168,21 @@ export class NFSSharedStorageService extends SharedStorageService { ...@@ -157,4 +168,21 @@ export class NFSSharedStorageService extends SharedStorageService {
return Promise.resolve(); return Promise.resolve();
} }
public async cleanUp(): Promise<void> {
if (this.localMounted !== 'nnimount') {
return Promise.resolve();
}
try {
const result = await cpp.exec(`sudo umount -f -l ${this.localMountPoint}`);
if (result.stderr) {
throw new Error(result.stderr);
}
} catch (error) {
const errorMessage: string = `${this.storageType} Shared Storage: Umount ${this.localMountPoint} failed, error is ${error}`;
this.log.error(errorMessage);
return Promise.reject(errorMessage);
}
return Promise.resolve();
}
} }
...@@ -348,6 +348,11 @@ class TrialDispatcher implements TrainingService { ...@@ -348,6 +348,11 @@ class TrialDispatcher implements TrainingService {
for (const commandChannel of this.commandChannelSet) { for (const commandChannel of this.commandChannelSet) {
await commandChannel.stop(); await commandChannel.stop();
} }
if (this.useSharedStorage) {
this.log.info(`stopping shared storage...`)
await component.get<SharedStorageService>(SharedStorageService).cleanUp();
this.log.info(`shared storage stopped.`)
}
} }
private async environmentMaintenanceLoop(): Promise<void> { private async environmentMaintenanceLoop(): Promise<void> {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment