experimentConfig.ts 6.57 KB
Newer Older
1
2
3
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

4
import assert from 'assert';
5

6
7
8
import { KubeflowOperator, OperatorApiVersion } from '../training_service/kubernetes/kubeflow/kubeflowConfig'
import { KubernetesStorageKind } from '../training_service/kubernetes/kubernetesConfig';

9
10
/**
 * Fields common to the training-service configurations declared in this file.
 * Concrete configs (e.g. `LocalConfig`, `RemoteConfig`) extend this interface
 * and narrow `platform` to a string literal.
 */
export interface TrainingServiceConfig {
    platform: string;
    trialCommand: string;
    trialCodeDirectory: string;
    trialGpuNumber?: number;
    nniManagerIp?: string;

    // FIXME
    // "debug" is only used by OpenPAI to decide whether to check the remote NNI version.
    // It would be better to check whenever the local NNI version is not "dev",
    // and better still to check the version before launching the experiment and let the user confirm.
    // Log level is currently handled by the global logging module and has nothing to do with this flag.
    debug?: boolean;
}

/* Local */

/**
 * Training-service configuration selected by `platform: 'local'`.
 */
export interface LocalConfig extends TrainingServiceConfig {
    platform: 'local';
    useActiveGpu?: boolean;
    maxTrialNumberPerGpu: number;
    /** Optional list of GPU indices; see `toCudaVisibleDevices` for the string form. */
    gpuIndices?: number[];
    reuseMode: boolean;
}

/* Remote */

/**
 * Description of one machine in `RemoteConfig.machineList`.
 */
export interface RemoteMachineConfig {
    // --- address and login ---
    host: string;
    port: number;
    user: string;

    // --- credentials (field names suggest SSH password / key-file login; confirm against the consumer) ---
    password?: string;
    sshKeyFile: string;
    sshPassphrase?: string;

    // --- GPU settings ---
    useActiveGpu: boolean;
    maxTrialNumberPerGpu: number;
    gpuIndices?: number[];

    // --- environment ---
    pythonPath?: string;
}

/**
 * Training-service configuration selected by `platform: 'remote'`.
 */
export interface RemoteConfig extends TrainingServiceConfig {
    platform: 'remote';
    /** The machines available to this training service. */
    machineList: RemoteMachineConfig[];
    reuseMode: boolean;
}

/* OpenPAI */

/**
 * Training-service configuration selected by `platform: 'openpai'`.
 */
export interface OpenpaiConfig extends TrainingServiceConfig {
    platform: 'openpai';
    host: string;
    username: string;
    token: string;
    trialCpuNumber: number;
    /** Kept as a string here; presumably a size with a unit suffix (see `toMegaBytes`) -- confirm with the consumer. */
    trialMemorySize: string;
    storageConfigName: string;
    dockerImage: string;
    virtualCluster?: string;
    localStorageMountPoint: string;
    containerStorageMountPoint: string;
    reuseMode: boolean;
    /** Free-form object; its schema is not defined in this file. */
    openpaiConfig?: object;
}

/* AML */

/**
 * Training-service configuration selected by `platform: 'aml'`.
 */
export interface AmlConfig extends TrainingServiceConfig {
    platform: 'aml';
    subscriptionId: string;
    resourceGroup: string;
    workspaceName: string;
    computeTarget: string;
    dockerImage: string;
    maxTrialNumberPerGpu: number;
}

85
86
87
88
89
90
91
92
93
94
95

/*  Alibaba PAI DLC  */
/**
 * Training-service configuration selected by `platform: 'dlc'`.
 */
export interface DlcConfig extends TrainingServiceConfig {
    // Narrows the inherited `platform` field, as the other training-service configs
    // in this file do. (This property was previously misspelled `platfrom`, which left
    // `platform` typed as plain `string` and introduced a stray required property.)
    platform: 'dlc';
    type: string;
    image: string;
    jobType: string;
    podCount: number;
    ecsSpec: string;
    region: string;
    nasDataSourceId: string;
    ossDataSourceId?: string;
    accessKeyId: string;
    accessKeySecret: string;
    localStorageMountPoint: string;
    containerStorageMountPoint: string;
}
/* Kubeflow */

104
/**
 * Storage settings referenced by `KubeflowConfig.storage` and
 * `FrameworkControllerConfig.storage`.
 *
 * Only `storageType` is required; which of the optional fields apply to which
 * storage type is not expressed in this type.
 */
export interface KubernetesStorageConfig {
    storageType: string;
    server?: string;
    path?: string;
    azureAccount?: string;
    azureShare?: string;
    keyVaultName?: string;
    keyVaultKey?: string;
}

/**
 * Settings for one Kubeflow role; used for the `worker`, `ps` and `master`
 * fields of `KubeflowConfig`.
 */
export interface KubeflowRoleConfig {
    replicas: number;
    command: string;
    gpuNumber: number;
    cpuNumber: number;
    /** Either a plain number or a string; presumably a string carries a unit suffix (see `toMegaBytes`) -- confirm. */
    memorySize: string | number;
    dockerImage: string;
    codeDirectory: string;
    privateRegistryAuthPath?: string;
}

/**
 * Training-service configuration selected by `platform: 'kubeflow'`.
 */
export interface KubeflowConfig extends TrainingServiceConfig {
    platform: 'kubeflow';
    operator: KubeflowOperator;
    apiVersion: OperatorApiVersion;
    storage: KubernetesStorageConfig;
    // Every role is optional at the type level.
    worker?: KubeflowRoleConfig;
    ps?: KubeflowRoleConfig;
    master?: KubeflowRoleConfig;
    reuseMode: boolean;
    maxTrialNumberPerGpu?: number;
}

137
/**
 * One entry of `FrameworkControllerConfig.taskRoles`.
 */
export interface FrameworkControllerTaskRoleConfig {
    name: string;
    dockerImage: string;
    taskNumber: number;
    command: string;
    gpuNumber: number;
    cpuNumber: number;
    /** Either a plain number or a string; presumably a string carries a unit suffix (see `toMegaBytes`) -- confirm. */
    memorySize: string | number;
    frameworkAttemptCompletionPolicy: {
        minFailedTaskCount: number;
        minSucceedTaskCount: number;
    };
    privateRegistryAuthPath?: string;
}

/**
 * Training-service configuration selected by `platform: 'frameworkcontroller'`.
 */
export interface FrameworkControllerConfig extends TrainingServiceConfig {
    platform: 'frameworkcontroller';
    storage: KubernetesStorageConfig;
    serviceAccountName: string;
    taskRoles: FrameworkControllerTaskRoleConfig[];
    reuseMode: boolean;
    maxTrialNumberPerGpu?: number;
    // NOTE(review): typed as the literal 'default', so no other namespace type-checks.
    // Confirm whether this restriction is intended or `string` was meant.
    namespace?: 'default';
    apiVersion?: string;
}

/* shared storage */

/**
 * Base shape for shared-storage configurations; `NfsConfig` and
 * `AzureBlobConfig` extend it.
 */
export interface SharedStorageConfig {
    /** Kind of storage; `NfsConfig` narrows this to `'NFS'`. */
    storageType: string;

    // Mount locations.
    localMountPoint: string;
    remoteMountPoint: string;

    /** A string, not a boolean; the accepted values are not defined in this file. */
    localMounted: string;
}

/**
 * Shared-storage configuration selected by `storageType: 'NFS'`.
 */
export interface NfsConfig extends SharedStorageConfig {
    /** Discriminant: always the literal `'NFS'`. */
    storageType: 'NFS';

    nfsServer: string;
    exportedDirectory: string;
}

/**
 * Shared-storage configuration holding Azure storage account details.
 *
 * NOTE(review): unlike `NfsConfig`, this interface does not narrow
 * `storageType` to a literal, so it cannot be discriminated by that field.
 * Confirm whether that is intentional.
 */
export interface AzureBlobConfig extends SharedStorageConfig {
    storageAccountName: string;
    /** Optional at the type level. */
    storageAccountKey?: string;

    containerName: string;
}

/* common */

/**
 * Algorithm description used for the `tuner`, `assessor` and `advisor`
 * fields of `ExperimentConfig`. Every field is optional at the type level.
 */
export interface AlgorithmConfig {
    name?: string;

    // Presumably the alternative to `name` for a user-provided class -- confirm with the consumer.
    className?: string;
    codeDirectory?: string;

    /** Free-form object; its schema is not defined in this file. */
    classArgs?: object;
}

/**
 * Top-level experiment configuration.
 *
 * Duration fields accept either a number or a string; `toSeconds` in this file
 * converts such values (strings with a d/h/m/s suffix) to seconds.
 */
export interface ExperimentConfig {
    experimentName?: string;
    // searchSpaceFile  (handled in python part)
    searchSpace: any;
    trialCommand: string;
    trialCodeDirectory: string;
    trialConcurrency: number;
    trialGpuNumber?: number;
    maxExperimentDuration?: string | number;
    maxTrialNumber?: number;
    maxTrialDuration?: string | number;
    nniManagerIp?: string;
    // useAnnotation  (handled in python part)
    debug: boolean;
    logLevel?: string;
    experimentWorkingDirectory?: string;
    tunerGpuIndices?: number[];
    tuner?: AlgorithmConfig;
    assessor?: AlgorithmConfig;
    advisor?: AlgorithmConfig;
    /** Either a single training service or a list of them. */
    trainingService: TrainingServiceConfig | TrainingServiceConfig[];
    sharedStorage?: SharedStorageConfig;
    deprecated?: any;  // configs that are not yet natively supported by v2 (workaround)
}

/* util functions */

// Time suffixes mapped to their length in seconds.
const timeUnits = { d: 24 * 3600, h: 3600, m: 60, s: 1 };

// Size suffixes mapped to their length in bytes (powers of 1024).
// `b` is deliberately listed last: every other suffix also ends with "b", so a
// suffix scan in declaration order must try the longer suffixes first.
const sizeUnits = { tb: 1024 ** 4, gb: 1024 ** 3, mb: 1024 ** 2, kb: 1024, b: 1 };
222

liuzhe-lz's avatar
liuzhe-lz committed
223
224
225
226
227
228
229
230
231
232
/**
 * Convert a quantity written with a unit suffix (e.g. "2h", "512MB") into a
 * whole number of `targetUnit`s, rounding up.
 *
 * @param value - A number, which is returned unchanged (it is taken to be in the
 *     target unit already), or a string made of a numeric magnitude followed by
 *     one of the suffixes in `allUnits`. Matching is case-insensitive.
 * @param targetUnit - Key of `allUnits` naming the unit of the result.
 * @param allUnits - Table mapping each unit suffix to its size in a common base unit.
 * @returns The converted quantity, rounded up to an integer for string input.
 * @throws Error if the string has no known suffix, if its magnitude is missing
 *     or not a finite number, or if `targetUnit` is not a key of `allUnits`.
 */
function toUnit(value: string | number, targetUnit: string, allUnits: Record<string, number>): number {
    if (typeof value === 'number') {
        return value;
    }

    const targetFactor = allUnits[targetUnit];
    if (targetFactor === undefined) {
        // Dividing by a missing factor would silently yield NaN.
        throw new Error(`Bad target unit "${targetUnit}"`);
    }

    const text = value.toLowerCase();
    // Try longer suffixes first so that "kb" is never mistaken for "b",
    // regardless of the key order of the table that was passed in.
    const unitsByLength = Object.entries(allUnits).sort(([a], [b]) => b.length - a.length);
    for (const [unit, factor] of unitsByLength) {
        if (text.endsWith(unit)) {
            const digits = text.slice(0, -unit.length).trim();
            const magnitude = Number(digits);
            // Number('') is 0 and Number('abc') is NaN; neither is a usable magnitude.
            if (digits === '' || !Number.isFinite(magnitude)) {
                throw new Error(`Bad number in "${text}"`);
            }
            return Math.ceil(magnitude * factor / targetFactor);
        }
    }
    throw new Error(`Bad unit in "${text}"`);
}

liuzhe-lz's avatar
liuzhe-lz committed
238
239
/**
 * Convert a duration to seconds.
 *
 * @param time - A number, which is returned as-is, or a string with a
 *     `d`, `h`, `m` or `s` suffix (case-insensitive), e.g. "2h".
 * @returns The duration in seconds; string input is rounded up to a whole second.
 * @throws Error if a string value does not end with a known time suffix.
 */
export function toSeconds(time: string | number): number {
    return toUnit(time, 's', timeUnits);
}

liuzhe-lz's avatar
liuzhe-lz committed
242
243
/**
 * Convert a size to megabytes (1 MB = 1024 * 1024 bytes here).
 *
 * @param size - A number, which is returned as-is, or a string with a
 *     `tb`, `gb`, `mb`, `kb` or `b` suffix (case-insensitive), e.g. "4GB".
 * @returns The size in megabytes; string input is rounded up to a whole megabyte.
 * @throws Error if a string value does not end with a known size suffix.
 */
export function toMegaBytes(size: string | number): number {
    return toUnit(size, 'mb', sizeUnits);
}

liuzhe-lz's avatar
liuzhe-lz committed
246
247
/**
 * Format a list of GPU indices as a comma-separated string.
 * The name suggests the result is meant for `CUDA_VISIBLE_DEVICES`; this
 * function only builds the string.
 *
 * @param gpuIndices - GPU indices, or `undefined` when none are configured.
 * @returns The indices joined by "," without spaces; an empty string when
 *     `gpuIndices` is `undefined` or empty.
 */
export function toCudaVisibleDevices(gpuIndices?: number[]): string {
    return gpuIndices === undefined ? '' : gpuIndices.join(',');
}