experimentConfig.ts 6.59 KB
Newer Older
1
2
3
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

4
import assert from 'assert';
5

6
7
8
import { KubeflowOperator, OperatorApiVersion } from '../training_service/kubernetes/kubeflow/kubeflowConfig'
import { KubernetesStorageKind } from '../training_service/kubernetes/kubernetesConfig';

9
10
/**
 * Settings shared by every training service.
 *
 * The platform-specific interfaces in this file extend this one and narrow
 * `platform` to a single string literal (for example 'local' or 'remote').
 */
export interface TrainingServiceConfig {
    /** Training service identifier; each sub-interface pins it to one literal. */
    platform: string;
    trialCommand: string;
    trialCodeDirectory: string;
    trialGpuNumber?: number;
    nniManagerIp?: string;

    // FIXME
    // "debug" is only used by openpai to decide whether to check remote nni version
    // it should be better to check when local nni version is not "dev"
    // it should be even better to check version before launching the experiment and let the user confirm
    // log level is currently handled by global logging module and has nothing to do with this
    debug?: boolean;
}

/* Local */

/**
 * Configuration of the 'local' training service.
 *
 * Extends the common settings with GPU-related options and the reuse-mode switch.
 */
export interface LocalConfig extends TrainingServiceConfig {
    platform: 'local';
    useActiveGpu?: boolean;
    maxTrialNumberPerGpu: number;
    gpuIndices?: number[];
    reuseMode: boolean;
}

/* Remote */

/**
 * Connection and GPU settings of one machine; `RemoteConfig.machineList` is an
 * array of these.
 */
export interface RemoteMachineConfig {
    host: string;
    port: number;
    user: string;
    // Optional, whereas sshKeyFile below is required by this type.
    // NOTE(review): presumably password and key file are alternative ways to log in - confirm.
    password?: string;
    sshKeyFile: string;
    sshPassphrase?: string;
    // Required here, while LocalConfig declares the same field as optional.
    useActiveGpu: boolean;
    maxTrialNumberPerGpu: number;
    gpuIndices?: number[];
    pythonPath?: string;
}

/**
 * Configuration of the 'remote' training service: the common settings plus the
 * list of machines and the reuse-mode switch.
 */
export interface RemoteConfig extends TrainingServiceConfig {
    platform: 'remote';
    /** One entry per machine. */
    machineList: RemoteMachineConfig[];
    reuseMode: boolean;
}

/* OpenPAI */

/**
 * Configuration of the 'openpai' training service.
 */
export interface OpenpaiConfig extends TrainingServiceConfig {
    platform: 'openpai';
    host: string;
    username: string;
    token: string;
    trialCpuNumber: number;
    /** Typed as a string. NOTE(review): presumably a size with a unit suffix - confirm at the call site. */
    trialMemorySize: string;
    storageConfigName: string;
    dockerImage: string;
    virtualCluster?: string;
    localStorageMountPoint: string;
    containerStorageMountPoint: string;
    reuseMode: boolean;
    /** Free-form object; its shape is not constrained by this type. */
    openpaiConfig?: object;
}

/* AML */

/**
 * Configuration of the 'aml' training service.
 */
export interface AmlConfig extends TrainingServiceConfig {
    platform: 'aml';
    subscriptionId: string;
    resourceGroup: string;
    workspaceName: string;
    computeTarget: string;
    dockerImage: string;
    maxTrialNumberPerGpu: number;
}

85
86
87
88
89
90
91
92
93
94
95

/*  Alibaba PAI DLC  */
/**
 * Configuration of the 'dlc' (Alibaba PAI DLC) training service.
 */
export interface DlcConfig extends TrainingServiceConfig {
    /** Discriminator, narrowed from the base interface like every other platform config. */
    platform: 'dlc';
    type: string;
    image: string;
    jobType: string;
    podCount: number;
    ecsSpec: string;
    region: string;
    nasDataSourceId: string;
    ossDataSourceId?: string;
    accessKeyId: string;
    accessKeySecret: string;
    localStorageMountPoint: string;
    containerStorageMountPoint: string;
}
/* Kubeflow */

104
/**
 * Storage settings used by the `storage` field of KubeflowConfig and
 * FrameworkControllerConfig.
 *
 * Only `storageType` is required; every other field is optional.
 * NOTE(review): presumably `server`/`path` and the `azure*`/`keyVault*` fields
 * belong to different storage types - confirm which are needed for each.
 */
export interface KubernetesStorageConfig {
    storageType: string;
    server?: string;
    path?: string;
    azureAccount?: string;
    azureShare?: string;
    keyVaultName?: string;
    keyVaultKey?: string;
}

/**
 * Settings of one Kubeflow role; KubeflowConfig uses this type for its
 * `worker`, `ps` and `master` fields.
 */
export interface KubeflowRoleConfig {
    replicas: number;
    command: string;
    gpuNumber: number;
    cpuNumber: number;
    /** NOTE(review): same `string | number` shape that toMegaBytes() accepts - confirm it is parsed that way. */
    memorySize: string | number;
    dockerImage: string;
    codeDirectory: string;
    privateRegistryAuthPath?: string;
}

/**
 * Configuration of the 'kubeflow' training service.
 *
 * The three role fields (`worker`, `ps`, `master`) are all optional in this type.
 */
export interface KubeflowConfig extends TrainingServiceConfig {
    platform: 'kubeflow';
    operator: KubeflowOperator;
    apiVersion: OperatorApiVersion;
    storage: KubernetesStorageConfig;
    worker?: KubeflowRoleConfig;
    ps?: KubeflowRoleConfig;
    master?: KubeflowRoleConfig;
    reuseMode: boolean;
    maxTrialNumberPerGpu?: number;
    namespace?: string;
}

138
/**
 * Settings of one task role; `FrameworkControllerConfig.taskRoles` is an array of these.
 */
export interface FrameworkControllerTaskRoleConfig {
    name: string;
    dockerImage: string;
    taskNumber: number;
    command: string;
    gpuNumber: number;
    cpuNumber: number;
    /** NOTE(review): same `string | number` shape that toMegaBytes() accepts - confirm it is parsed that way. */
    memorySize: string | number;
    frameworkAttemptCompletionPolicy: {
        minFailedTaskCount: number;
        minSucceedTaskCount: number;
    };
    privateRegistryAuthPath?: string;
}

/**
 * Configuration of the 'frameworkcontroller' training service.
 */
export interface FrameworkControllerConfig extends TrainingServiceConfig {
    platform: 'frameworkcontroller';
    storage: KubernetesStorageConfig;
    serviceAccountName: string;
    taskRoles: FrameworkControllerTaskRoleConfig[];
    reuseMode: boolean;
    maxTrialNumberPerGpu?: number;
    namespace?: string;
    /** Optional plain string here, unlike KubeflowConfig.apiVersion, which is a required OperatorApiVersion. */
    apiVersion?: string;
}

/* shared storage */

/**
 * Fields common to every shared storage configuration; NfsConfig and
 * AzureBlobConfig extend this interface.
 */
export interface SharedStorageConfig {
    // Storage kind; NfsConfig narrows it to the literal 'NFS'.
    storageType: string;
    localMountPoint: string;
    remoteMountPoint: string;
    // Typed as a string, not a boolean.
    // NOTE(review): the accepted values are not visible in this file - confirm.
    localMounted: string;
}

/**
 * Shared storage configuration whose `storageType` is 'NFS'.
 */
export interface NfsConfig extends SharedStorageConfig {
    storageType: 'NFS';
    nfsServer: string;
    exportedDirectory: string;
}

/**
 * Shared storage configuration with Azure Blob account and container fields.
 *
 * Unlike NfsConfig, this interface does not narrow `storageType` to a literal,
 * so the type alone cannot be used to discriminate it.
 */
export interface AzureBlobConfig extends SharedStorageConfig {
    storageAccountName: string;
    // Optional in this type.
    storageAccountKey?: string;
    containerName: string;
}

/* common */

/**
 * Description of one algorithm; ExperimentConfig uses this type for its
 * `tuner`, `assessor` and `advisor` fields.
 *
 * Every field is optional in this type.
 * NOTE(review): presumably either `name` or `className` (with `codeDirectory`)
 * is expected to be set - confirm against the code that validates the config.
 */
export interface AlgorithmConfig {
    name?: string;
    className?: string;
    codeDirectory?: string;
    // Free-form object; its shape is not constrained by this type.
    classArgs?: object;
}

/**
 * Top-level experiment configuration.
 *
 * Two fields of the user-facing config (`searchSpaceFile`, `useAnnotation`) are
 * deliberately absent: the inline comments below note they are handled in the
 * Python part.
 */
export interface ExperimentConfig {
    experimentName?: string;
    // searchSpaceFile  (handled in python part)
    searchSpace: any;
    trialCommand: string;
    trialCodeDirectory: string;
    trialConcurrency: number;
    trialGpuNumber?: number;
    /** NOTE(review): same `string | number` shape that toSeconds() accepts - confirm it is parsed that way. */
    maxExperimentDuration?: string | number;
    maxTrialNumber?: number;
    /** NOTE(review): same `string | number` shape that toSeconds() accepts - confirm it is parsed that way. */
    maxTrialDuration?: string | number;
    nniManagerIp?: string;
    // useAnnotation  (handled in python part)
    /** Required here, whereas TrainingServiceConfig.debug is optional. */
    debug: boolean;
    logLevel?: string;
    experimentWorkingDirectory?: string;
    tunerGpuIndices?: number[];
    tuner?: AlgorithmConfig;
    assessor?: AlgorithmConfig;
    advisor?: AlgorithmConfig;
    /** Either a single training service configuration or an array of them. */
    trainingService: TrainingServiceConfig | TrainingServiceConfig[];
    sharedStorage?: SharedStorageConfig;
    deprecated?: any;  // configs that are not yet natively supported by v2 (workaround)
}

/* util functions */

// Number of seconds in each time-unit suffix; passed to toUnit() by toSeconds().
const timeUnits = { d: 24 * 3600, h: 3600, m: 60, s: 1 };
// Number of bytes in each size-unit suffix; passed to toUnit() by toMegaBytes().
// Keep the bare 'b' last: toUnit() takes the first suffix that matches, and every
// other suffix here also ends in 'b'.
const sizeUnits = { tb: 1024 ** 4, gb: 1024 ** 3, mb: 1024 ** 2, kb: 1024, b: 1 };
223

liuzhe-lz's avatar
liuzhe-lz committed
224
225
226
227
228
229
230
231
232
233
/**
 * Convert a quantity written with a unit suffix (e.g. "1.5h", "512MB") into a
 * whole number of `targetUnit`, rounding up.
 *
 * @param value - A string made of a number followed by one of the suffixes in
 *   `allUnits` (case-insensitive), or a plain number. A plain number is returned
 *   unchanged, without conversion or rounding.
 * @param targetUnit - Key of `allUnits` naming the unit of the result.
 * @param allUnits - Maps each unit suffix to its size in a common base unit.
 *   Suffixes are tried in the object's own key order and the first one that
 *   `value` ends with wins, so a suffix that is the tail of another one
 *   (such as 'b' and 'mb') must come after it.
 * @returns The quantity expressed in `targetUnit`, rounded up to an integer.
 * @throws Error if `targetUnit` is not a key of `allUnits`, if `value` ends with
 *   none of the suffixes, or if the part before the suffix is empty or not a
 *   finite number.
 */
function toUnit(value: string | number, targetUnit: string, allUnits: Record<string, number>): number {
    if (typeof value === 'number') {
        return value;
    }

    const targetFactor = allUnits[targetUnit];
    if (targetFactor === undefined) {
        // Dividing by a missing factor would quietly produce NaN.
        throw new Error(`Unknown target unit "${targetUnit}"`);
    }

    const text = value.toLowerCase();
    for (const [unit, factor] of Object.entries(allUnits)) {
        if (text.endsWith(unit)) {
            const digits = text.slice(0, -unit.length);
            const amount = Number(digits);
            // Number('') is 0 and Number('abc') is NaN; neither is a usable quantity.
            if (digits.trim() === '' || !Number.isFinite(amount)) {
                throw new Error(`Bad number in "${text}"`);
            }
            return Math.ceil(amount * factor / targetFactor);
        }
    }
    throw new Error(`Bad unit in "${text}"`);
}

liuzhe-lz's avatar
liuzhe-lz committed
239
240
/**
 * Express a duration in seconds.
 *
 * @param time - A string ending in one of the `timeUnits` suffixes
 *   (d, h, m, s), or a number, which toUnit() returns unchanged.
 * @returns The duration in seconds; string inputs are rounded up by toUnit().
 */
export function toSeconds(time: string | number): number {
    const seconds = toUnit(time, 's', timeUnits);
    return seconds;
}

liuzhe-lz's avatar
liuzhe-lz committed
243
244
/**
 * Express a size in megabytes.
 *
 * @param size - A string ending in one of the `sizeUnits` suffixes
 *   (tb, gb, mb, kb, b), or a number, which toUnit() returns unchanged.
 * @returns The size in megabytes; string inputs are rounded up by toUnit().
 */
export function toMegaBytes(size: string | number): number {
    const megaBytes = toUnit(size, 'mb', sizeUnits);
    return megaBytes;
}

liuzhe-lz's avatar
liuzhe-lz committed
247
248
/**
 * Format a list of GPU indices as a comma-separated string.
 * NOTE(review): going by the name, the result is meant for the
 * CUDA_VISIBLE_DEVICES environment variable - confirm at the call site.
 *
 * @param gpuIndices - GPU indices to list. `undefined` (or `null`, which can
 *   reach this function through untyped data) means no list was given.
 * @returns The indices joined with ','. The result is the empty string both
 *   when no list was given and when the list is empty.
 */
export function toCudaVisibleDevices(gpuIndices?: number[]): string {
    if (gpuIndices === undefined || gpuIndices === null) {
        return '';
    }
    return gpuIndices.join(',');
}