Commit 800c675f (unverified), authored by SparkSnail, committed by GitHub
Browse files

Fix pai reuse mode (#4027)

parent cc5a4fc5
......@@ -16,7 +16,9 @@ _logger = logging.getLogger(__name__)
def to_v2(v1) -> ExperimentConfig:
v1 = copy.deepcopy(v1)
platform = v1.pop('trainingServicePlatform')
assert platform in ['local', 'remote', 'openpai', 'aml']
assert platform in ['local', 'remote', 'pai', 'aml']
if platform == 'pai':
platform = 'openpai'
v2 = ExperimentConfig(platform)
_drop_field(v1, 'authorName')
......@@ -88,7 +90,7 @@ def to_v2(v1) -> ExperimentConfig:
if 'memoryMB' in v1_trial:
ts.trial_memory_size = str(v1_trial.pop('memoryMB')) + 'mb'
_move_field(v1_trial, ts, 'image', 'docker_image')
_deprecate(v1_trial, v2, 'virtualCluster')
_move_field(v1_trial, ts, 'virtualCluster', 'virtual_cluster')
_move_field(v1_trial, ts, 'paiStorageConfigName', 'storage_config_name')
_move_field(v1_trial, ts, 'paiConfigPath', 'openpaiConfigFile')
......
......@@ -21,6 +21,7 @@ class OpenpaiConfig(TrainingServiceConfig):
trial_memory_size: str
storage_config_name: str
docker_image: str = 'msranni/nni:latest'
virtual_cluster: Optional[str]
local_storage_mount_point: PathLike
container_storage_mount_point: str
reuse_mode: bool = True
......
......@@ -58,6 +58,7 @@ export interface OpenpaiConfig extends TrainingServiceConfig {
containerStorageMountPoint: string;
reuseMode: boolean;
openpaiConfig?: object;
virtualCluster?: string;
}
/* AML */
......@@ -198,7 +199,7 @@ export function toSeconds(time: string): number {
throw new Error(`Bad time string "${time}"`);
}
const sizeUnits = { tb: 1024 * 1024, gb: 1024 * 1024, mb: 1, kb: 1 / 1024 };
const sizeUnits = { tb: 1024 * 1024, gb: 1024, mb: 1, kb: 1 / 1024 };
export function toMegaBytes(size: string): number {
for (const [unit, factor] of Object.entries(sizeUnits)) {
......
......@@ -5,6 +5,7 @@
import * as yaml from 'js-yaml';
import * as request from 'request';
import { Container, Scope } from 'typescript-ioc';
import { Deferred } from 'ts-deferred';
import * as component from '../../../common/component';
import { ExperimentConfig, OpenpaiConfig, flattenConfig, toMegaBytes } from '../../../common/experimentConfig';
......@@ -15,6 +16,7 @@ import { NNIPAITrialConfig } from '../../pai/paiConfig';
import { EnvironmentInformation, EnvironmentService } from '../environment';
import { SharedStorageService } from '../sharedStorage';
import { MountedStorageService } from '../storages/mountedStorageService';
import { StorageService } from '../storageService';
interface FlattenOpenpaiConfig extends ExperimentConfig, OpenpaiConfig { }
......@@ -38,9 +40,10 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
this.config = flattenConfig(config, 'openpai');
this.paiToken = this.config.token;
this.protocol = this.config.host.toLowerCase().startsWith('https://') ? 'https' : 'http';
// FIXME: only support MountedStorageService
const storageService = new MountedStorageService();
Container.bind(StorageService)
.to(MountedStorageService)
.scope(Scope.Singleton);
const storageService = component.get<StorageService>(StorageService)
const remoteRoot = storageService.joinPath(this.config.localStorageMountPoint, this.experimentId);
storageService.initialize(this.config.localStorageMountPoint, remoteRoot);
}
......@@ -286,7 +289,7 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
taskRetryCount: 0,
dockerImage: 'docker_image_0',
resourcePerInstance: {
gpu: this.config.trialGpuNumber,
gpu: this.config.trialGpuNumber === undefined? 0: this.config.trialGpuNumber,
cpu: this.config.trialCpuNumber,
memoryMB: toMegaBytes(this.config.trialMemorySize)
},
......@@ -304,9 +307,9 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
submitFrom: 'submit-job-v2'
}
}
if (this.config.deprecated && this.config.deprecated.virtualCluster) {
if (this.config.virtualCluster) {
nniJobConfig.defaults = {
virtualCluster: this.config.deprecated.virtualCluster
virtualCluster: this.config.virtualCluster
}
}
}
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment