"src/targets/gpu/max.cpp" did not exist on "d918b57f9af767d550a203aca5dec3521149b4a7"
Unverified Commit 1338c512 authored by J-shang's avatar J-shang Committed by GitHub
Browse files

support shared storage for reusable mode (#3354)

parent 715b1899
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
'use strict';
import * as cpp from 'child-process-promise';
import * as path from 'path';
import { SharedStorageService, SharedStorageConfig, SharedStorageType, LocalMountedType } from '../sharedStorage'
import { MountedStorageService } from '../storages/mountedStorageService';
import { TrialConfigMetadataKey } from '../../common/trialConfigMetadataKey';
import { getLogger, Logger } from '../../../common/log';
import { getExperimentId } from '../../../common/experimentStartupInfo';
// Bash script that makes sure an NFS client is available on the machine.
//   - If `nfsstat` is already on the PATH, the script exits 0 without doing anything.
//   - Otherwise it installs the client with the first package manager it finds:
//     `nfs-common` via apt-get (after `apt-get update`), or `nfs-utils` via yum or dnf.
//     Every install command is run through `sudo`.
//   - If none of those package managers is found, it prints a message and exits 1.
// The literal starts with a newline, so the `#!/bin/bash` line is not the first line of
// the text; getCommand() runs the generated file explicitly with `bash`.
const INSTALL_NFS_CLIENT = `
#!/bin/bash
if [ -n "$(command -v nfsstat)" ]
then
exit 0
fi
if [ -n "$(command -v apt-get)" ]
then
sudo apt-get update
sudo apt-get install -y nfs-common
elif [ -n "$(command -v yum)" ]
then
sudo yum install -y nfs-utils
elif [ -n "$(command -v dnf)" ]
then
sudo dnf install -y nfs-utils
else
echo "Unknown package management."
exit 1
fi
`
/**
 * Shape of the NFS shared storage configuration.
 *
 * Holds the storage type, the local and remote mount points, the NFS server with
 * its exported directory, and the local mount mode. All fields are plain public
 * properties populated directly from the constructor arguments.
 */
class NFSSharedStorageConfig implements SharedStorageConfig {
    constructor(
        public storageType: SharedStorageType,
        public localMountPoint: string,
        public remoteMountPoint: string,
        public nfsServer: string,
        public exportedDirectory: string,
        public localMounted: LocalMountedType
    ) {}
}
/**
 * Shared storage service backed by an NFS export.
 *
 * After `config()` receives the shared storage configuration, the service
 * exposes shell commands that mount `<nfsServer>:<exportedDirectory>` on a local
 * or remote mount point, and a `MountedStorageService` rooted at
 * `<localMountPoint>/nni/<experimentId>`.
 */
export class NFSSharedStorageService extends SharedStorageService {
    private readonly log: Logger;
    private readonly internalStorageService: MountedStorageService;
    private readonly experimentId: string;

    // Populated by config(); undefined until the shared storage config has been received.
    private storageType?: SharedStorageType;
    private nfsServer?: string;
    private exportedDirectory?: string;
    private localMountPoint?: string;
    private remoteMountPoint?: string;

    constructor() {
        super();
        this.log = getLogger();
        this.internalStorageService = new MountedStorageService();
        this.experimentId = getExperimentId();
    }

    /**
     * Applies a configuration entry. Only `SHARED_STORAGE_CONFIG` is handled;
     * every other key is ignored.
     *
     * @param key metadata key of the entry
     * @param value JSON-encoded NFS shared storage configuration
     * @returns a promise that rejects with a message string when the config asks
     *          for 'nomount', when `localMountPoint` is missing, or when the
     *          automatic local mount fails
     */
    public async config(key: string, value: string): Promise<void> {
        if (key !== TrialConfigMetadataKey.SHARED_STORAGE_CONFIG) {
            return;
        }

        const nfsConfig = <NFSSharedStorageConfig>JSON.parse(value);
        this.localMountPoint = nfsConfig.localMountPoint;
        this.remoteMountPoint = nfsConfig.remoteMountPoint;
        this.storageType = nfsConfig.storageType;
        this.nfsServer = nfsConfig.nfsServer;
        this.exportedDirectory = nfsConfig.exportedDirectory;

        if (nfsConfig.localMounted === 'nomount') {
            const errorMessage = `${this.storageType} Shared Storage: ${this.storageType} not Support 'nomount'.`;
            this.log.error(errorMessage);
            return Promise.reject(errorMessage);
        }

        // Without a local mount point neither the mount command nor the storage root
        // can be built; fail with a clear message instead of a TypeError from path.join.
        if (typeof this.localMountPoint !== 'string' || this.localMountPoint.length === 0) {
            const errorMessage = `${this.storageType} Shared Storage: localMountPoint is not configured.`;
            this.log.error(errorMessage);
            return Promise.reject(errorMessage);
        }

        if (nfsConfig.localMounted === 'nnimount') {
            await this.helpLocalMount();
        }

        this.internalStorageService.initialize(
            this.localMountPoint,
            path.join(this.localMountPoint, 'nni', this.experimentId)
        );
    }

    /** NFS storage can always be mounted on the local machine. */
    public get canLocalMounted(): boolean {
        return true;
    }

    /** Storage service operating on the locally mounted directory. */
    public get storageService(): MountedStorageService {
        return this.internalStorageService;
    }

    /**
     * Shell command that installs the NFS client if needed and mounts the export
     * on the local mount point. Logs an error and returns '' when the local
     * mount point has not been configured.
     */
    public get localMountCommand(): string {
        if (this.localMountPoint) {
            return this.getCommand(this.localMountPoint);
        } else {
            this.log.error(`${this.storageType} Shared Storage: localMountPoint is not initialized.`);
            return '';
        }
    }

    /**
     * Shell command that installs the NFS client if needed and mounts the export
     * on the remote mount point. Logs an error and returns '' when the remote
     * mount point has not been configured.
     */
    public get remoteMountCommand(): string {
        if (this.remoteMountPoint) {
            return this.getCommand(this.remoteMountPoint);
        } else {
            this.log.error(`${this.storageType} Shared Storage: remoteMountPoint is not initialized.`);
            return '';
        }
    }

    /**
     * Builds the `&&`-chained command line that writes the client install script
     * to `nni_install_nfsclient.sh`, runs it, mounts the export on `mountPoint`
     * and finally removes the script.
     */
    private getCommand(mountPoint: string): string {
        // Escape `$`, newlines and `"` so the script survives being embedded in a
        // double-quoted echo argument.
        // NOTE(review): newlines are emitted as a literal backslash-n, so this relies on
        // the executing shell's `echo` expanding that escape - confirm on target shells.
        const install = `rm -f nni_install_nfsclient.sh && touch nni_install_nfsclient.sh && echo "${INSTALL_NFS_CLIENT.replace(/\$/g, `\\$`).replace(/\n/g, `\\n`).replace(/"/g, `\\"`)}" >> nni_install_nfsclient.sh && bash nni_install_nfsclient.sh`;
        const mount = `mkdir -p ${mountPoint} && sudo mount ${this.nfsServer}:${this.exportedDirectory} ${mountPoint}`;
        const clean = `rm -f nni_install_nfsclient.sh`;
        return `${install} && ${mount} && ${clean}`;
    }

    /** Experiment working directory below the local mount point. */
    public get localWorkingRoot(): string {
        return `${this.localMountPoint}/nni/${this.experimentId}`;
    }

    /** Experiment working directory below the remote mount point. */
    public get remoteWorkingRoot(): string {
        return `${this.remoteMountPoint}/nni/${this.experimentId}`;
    }

    /**
     * Mounts the NFS export on the local machine by executing `localMountCommand`.
     *
     * @returns a promise that rejects with a message string on Windows, when the
     *          server or exported directory is missing, or when the command fails
     */
    private async helpLocalMount(): Promise<void> {
        if (process.platform === 'win32') {
            const errorMessage = `${this.storageType} Shared Storage: NNI not support auto mount ${this.storageType} under Windows yet.`;
            this.log.error(errorMessage);
            return Promise.reject(errorMessage);
        }

        try {
            if (!this.nfsServer || !this.exportedDirectory) {
                // Would otherwise run `mount undefined:undefined ...`.
                throw new Error('nfsServer and exportedDirectory must be configured');
            }
            const result = await cpp.exec(this.localMountCommand);
            // exec() rejects when the command chain exits with a non-zero status, so a
            // resolved result means the mount succeeded. Package managers and sudo often
            // write notices to stderr on success; record them instead of failing.
            if (result.stderr) {
                this.log.info(`${this.storageType} Shared Storage: mount command wrote to stderr: ${result.stderr}`);
            }
        } catch (error) {
            const errorMessage = `${this.storageType} Shared Storage: Mount ${this.nfsServer}:${this.exportedDirectory} to ${this.localMountPoint} failed, error is ${error}`;
            this.log.error(errorMessage);
            return Promise.reject(errorMessage);
        }
    }
}
......@@ -17,6 +17,11 @@ export class UtEnvironmentService extends EnvironmentService {
// storage service is tested by integration testing.
return false;
}
// This environment service never uses shared storage: the getter always returns false.
public get useSharedStorage(): boolean {
return false;
}
// Fixed maintenance-loop interval for this environment service; always 1.
// NOTE(review): the unit is not visible here - confirm against the code that reads this value.
public get environmentMaintenceLoopInterval(): number {
return 1;
}
......
......@@ -7,6 +7,7 @@ import { EventEmitter } from 'events';
import * as fs from 'fs';
import * as path from 'path';
import { Writable } from 'stream';
import { Container, Scope } from 'typescript-ioc';
import { String } from 'typescript-string-operations';
import * as component from '../../common/component';
import { NNIError, NNIErrorNames, MethodNotImplementedError } from '../../common/errors';
......@@ -26,6 +27,9 @@ import { EnvironmentServiceFactory } from './environments/environmentServiceFact
import { GpuScheduler } from './gpuScheduler';
import { MountedStorageService } from './storages/mountedStorageService';
import { StorageService } from './storageService';
import { SharedStorageService, SharedStorageConfig } from './sharedStorage';
import { NFSSharedStorageService } from './shared_storages/nfsStorageService'
import { AzureBlobSharedStorageService } from './shared_storages/azureblobStorageService'
import { TrialDetail } from './trial';
......@@ -74,6 +78,10 @@ class TrialDispatcher implements TrainingService {
private isLoggedNoMoreEnvironment: boolean = false;
private isLoggedNoGpuAvailable: boolean = false;
// marks whether shared storage is in use
private useSharedStorage: boolean = false;
private fileCopyCompleted: boolean = false;
constructor() {
this.log = getLogger();
this.trials = new Map<string, TrialDetail>();
......@@ -195,7 +203,14 @@ class TrialDispatcher implements TrainingService {
this.log.info(`TrialDispatcher: copying code and settings.`);
let storageService: StorageService;
if (environmentService.hasStorageService) {
if (this.useSharedStorage) {
if (this.fileCopyCompleted) {
this.log.debug(`TrialDispatcher: file already copy to shared storage.`);
continue;
}
this.log.debug(`TrialDispatcher: use shared storage service.`);
storageService = component.get<SharedStorageService>(SharedStorageService).storageService;
} else if (environmentService.hasStorageService) {
this.log.debug(`TrialDispatcher: use existing storage service.`);
storageService = component.get<StorageService>(StorageService);
} else {
......@@ -223,6 +238,10 @@ class TrialDispatcher implements TrainingService {
}
await storageService.copyDirectory(trialToolsPath, envDir, true);
}
if (this.useSharedStorage) {
this.fileCopyCompleted = true;
}
}
// start channel
this.commandEmitter.on("command", (command: Command): void => {
......@@ -260,7 +279,6 @@ class TrialDispatcher implements TrainingService {
break;
case TrialConfigMetadataKey.VERSION_CHECK:
this.enableVersionCheck = (value === 'true' || value === 'True');
break;
case TrialConfigMetadataKey.LOG_COLLECTION:
this.logCollection = value;
......@@ -289,7 +307,16 @@ class TrialDispatcher implements TrainingService {
this.commandChannelSet.add(environmentService.getCommandChannel);
this.environmentServiceList.push(environmentService);
}
break;
}
case TrialConfigMetadataKey.SHARED_STORAGE_CONFIG:
if (this.useSharedStorage === false) {
await this.initializeSharedStorage(key, value);
} else {
const errorMessage = `Already has set shared storage.`;
this.log.error(errorMessage);
}
break;
}
for(const environmentService of this.environmentServiceList) {
await environmentService.config(key, value);
......@@ -618,7 +645,7 @@ class TrialDispatcher implements TrainingService {
}
}
// Select an environment service (platform) on which to schedule an environment
private selectEnvironmentService(): EnvironmentService | undefined {
const validEnvironmentServiceList = [];
......@@ -633,7 +660,7 @@ class TrialDispatcher implements TrainingService {
// Random scheduler
return randomSelect(validEnvironmentServiceList);
}
private async prefetchEnvironments (): Promise<void> {
for (const environmentService of this.environmentServiceList) {
const number = environmentService.prefetchedEnvironmentCount;
......@@ -658,6 +685,8 @@ class TrialDispatcher implements TrainingService {
environment.command = `mkdir -p envs/${envId} && cd envs/${envId} && ${environment.command}`;
environment.useSharedStorage = this.useSharedStorage;
await environmentService.startEnvironment(environment);
this.environments.set(environment.id, environment);
......@@ -881,6 +910,30 @@ class TrialDispatcher implements TrainingService {
}
this.shouldUpdateTrials = true;
}
/**
 * Registers the shared storage implementation named by the configuration and
 * configures it.
 *
 * The `storageType` field of the JSON `value` selects the implementation
 * ('NFS' or 'AzureBlob'), which is bound to `SharedStorageService` as a
 * singleton. The bound service then receives `key`/`value` through its own
 * `config()`, and `useSharedStorage` is switched on once that succeeds.
 *
 * @param key metadata key forwarded to the storage service
 * @param value JSON-encoded shared storage configuration
 * @returns a promise that rejects with a message string for an unsupported storage type
 */
private async initializeSharedStorage(key: string, value: string): Promise<void> {
    const parsedConfig = <SharedStorageConfig>JSON.parse(value);
    const storageType = parsedConfig.storageType;

    // Pick the implementation first, then bind it in one place.
    let serviceImplementation: typeof NFSSharedStorageService | typeof AzureBlobSharedStorageService;
    if (storageType === 'NFS') {
        serviceImplementation = NFSSharedStorageService;
    } else if (storageType === 'AzureBlob') {
        serviceImplementation = AzureBlobSharedStorageService;
    } else {
        const errorMessage = `Shared storage type '${storageType}' not support.`;
        this.log.error(errorMessage);
        return Promise.reject(errorMessage);
    }
    Container.bind(SharedStorageService)
        .to(serviceImplementation)
        .scope(Scope.Singleton);

    const sharedStorage = component.get<SharedStorageService>(SharedStorageService);
    await sharedStorage.config(key, value);
    // Only flag shared storage as active after the service accepted its config.
    this.useSharedStorage = true;
}
}
export { TrialDispatcher };
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment