experimentConfig.ts 6.54 KB
Newer Older
1
2
3
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

4
import assert from 'assert';
5

6
7
8
import { KubeflowOperator, OperatorApiVersion } from '../training_service/kubernetes/kubeflow/kubeflowConfig'
import { KubernetesStorageKind } from '../training_service/kubernetes/kubernetesConfig';

9
10
/**
 * Fields common to every training-service configuration.
 *
 * The platform-specific interfaces in this file extend this one and narrow
 * `platform` to a string literal.
 */
export interface TrainingServiceConfig {
    /** Training-service identifier; narrowed to a literal by each sub-interface. */
    platform: string;
    trialCommand: string;
    trialCodeDirectory: string;
    trialGpuNumber?: number;
    nniManagerIp?: string;

    // FIXME
    // "debug" is only used by openpai to decide whether to check remote nni version
    // it should be better to check when local nni version is not "dev"
    // it should be even better to check version before launching the experiment and let user to confirm
    // log level is currently handled by global logging module and has nothing to do with this
    debug?: boolean;
}

/* Local */

/**
 * Training-service configuration whose `platform` is 'local'.
 */
export interface LocalConfig extends TrainingServiceConfig {
    platform: 'local';
    useActiveGpu?: boolean;
    maxTrialNumberPerGpu: number;
    gpuIndices?: number[];
    reuseMode: boolean;
}

/* Remote */

/**
 * Description of one machine listed in `RemoteConfig.machineList`.
 */
export interface RemoteMachineConfig {
    // --- connection ---
    host: string;
    port: number;
    user: string;

    // --- credentials ---
    sshKeyFile: string;
    sshPassphrase?: string;
    password?: string;

    // --- GPU usage ---
    useActiveGpu: boolean;
    maxTrialNumberPerGpu: number;
    gpuIndices?: number[];

    // --- environment ---
    pythonPath?: string;
}

/**
 * Training-service configuration whose `platform` is 'remote'.
 */
export interface RemoteConfig extends TrainingServiceConfig {
    platform: 'remote';
    /** One entry per remote machine. */
    machineList: RemoteMachineConfig[];
    reuseMode: boolean;
}

/* OpenPAI */

/**
 * Training-service configuration whose `platform` is 'openpai'.
 */
export interface OpenpaiConfig extends TrainingServiceConfig {
    platform: 'openpai';
    host: string;
    username: string;
    token: string;
    trialCpuNumber: number;
    trialMemorySize: string;
    storageConfigName: string;
    dockerImage: string;
    virtualCluster?: string;
    localStorageMountPoint: string;
    containerStorageMountPoint: string;
    reuseMode: boolean;
    openpaiConfig?: object;
}

/* AML */

/**
 * Training-service configuration whose `platform` is 'aml'.
 */
export interface AmlConfig extends TrainingServiceConfig {
    platform: 'aml';
    subscriptionId: string;
    resourceGroup: string;
    workspaceName: string;
    computeTarget: string;
    dockerImage: string;
    maxTrialNumberPerGpu: number;
}

85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102

/*  Alibaba PAI DLC  */
/**
 * Training-service configuration whose `platform` is 'dlc'.
 */
export interface DlcConfig extends TrainingServiceConfig {
    /** Discriminator, narrowed to 'dlc' like the other platform interfaces in this file. */
    platform: 'dlc';
    /**
     * @deprecated Misspelling of `platform`. Kept as an optional member only so that
     * existing code which still sets it continues to type-check; use `platform` instead.
     */
    platfrom?: 'dlc';
    type: string;
    image: string;
    jobType: string;
    podCount: number;
    ecsSpec: string;
    region: string;
    nasDataSourceId: string;
    accessKeyId: string;
    accessKeySecret: string;
    localStorageMountPoint: string;
    containerStorageMountPoint: string;
}
/* Kubeflow */

103
/**
 * Storage settings referenced by `KubeflowConfig.storage` and
 * `FrameworkControllerConfig.storage`.
 *
 * Only `storageType` is required; every other field is optional.
 * NOTE(review): which optional fields apply presumably depends on `storageType` — confirm.
 */
export interface KubernetesStorageConfig {
    storageType: string;
    server?: string;
    path?: string;
    azureAccount?: string;
    azureShare?: string;
    keyVaultName?: string;
    keyVaultKey?: string;
}

/**
 * Settings of a single role (`worker`, `ps` or `master`) in a `KubeflowConfig`.
 */
export interface KubeflowRoleConfig {
    replicas: number;
    command: string;
    gpuNumber: number;
    cpuNumber: number;
    /** Either a plain number or a string. */
    memorySize: string | number;
    dockerImage: string;
    codeDirectory: string;
    privateRegistryAuthPath?: string;
}

/**
 * Training-service configuration whose `platform` is 'kubeflow'.
 */
export interface KubeflowConfig extends TrainingServiceConfig {
    platform: 'kubeflow';
    operator: KubeflowOperator;
    apiVersion: OperatorApiVersion;
    storage: KubernetesStorageConfig;
    // All three roles are optional at the type level.
    worker?: KubeflowRoleConfig;
    ps?: KubeflowRoleConfig;
    master?: KubeflowRoleConfig;
    reuseMode: boolean;
    maxTrialNumberPerGpu?: number;
}

136
/**
 * One entry of `FrameworkControllerConfig.taskRoles`.
 */
export interface FrameworkControllerTaskRoleConfig {
    name: string;
    dockerImage: string;
    taskNumber: number;
    command: string;
    gpuNumber: number;
    cpuNumber: number;
    /** Either a plain number or a string. */
    memorySize: string | number;
    frameworkAttemptCompletionPolicy: {
        minFailedTaskCount: number;
        minSucceedTaskCount: number;
    };
    privateRegistryAuthPath?: string;
}

/**
 * Training-service configuration whose `platform` is 'frameworkcontroller'.
 */
export interface FrameworkControllerConfig extends TrainingServiceConfig {
    platform: 'frameworkcontroller';
    storage: KubernetesStorageConfig;
    serviceAccountName: string;
    taskRoles: FrameworkControllerTaskRoleConfig[];
    reuseMode: boolean;
    maxTrialNumberPerGpu?: number;
    // NOTE(review): the type is the literal 'default', so no other namespace can be
    // expressed. This looks like a default value written as a type; confirm whether
    // `string` was intended before widening it.
    namespace?: 'default';
    apiVersion?: string;
}

/* shared storage */

/**
 * Base shape of a shared-storage configuration; extended by `NfsConfig`
 * and `AzureBlobConfig`.
 */
export interface SharedStorageConfig {
    /** Storage kind; `NfsConfig` narrows it to 'NFS'. */
    storageType: string;
    localMounted: string;
    localMountPoint: string;
    remoteMountPoint: string;
}

/**
 * Shared-storage configuration whose `storageType` is 'NFS'.
 */
export interface NfsConfig extends SharedStorageConfig {
    storageType: 'NFS';

    nfsServer: string;
    exportedDirectory: string;
}

/**
 * Shared-storage configuration carrying a storage account and container name.
 *
 * NOTE(review): unlike `NfsConfig`, this interface does not narrow `storageType`
 * to a literal — confirm whether that is intentional.
 */
export interface AzureBlobConfig extends SharedStorageConfig {
    storageAccountName: string;
    containerName: string;
    /** Optional. */
    storageAccountKey?: string;
}

/* common */

/**
 * Shape shared by the `tuner`, `assessor` and `advisor` entries of
 * `ExperimentConfig`. Every field is optional at the type level.
 */
export interface AlgorithmConfig {
    name?: string;

    className?: string;
    classArgs?: object;
    codeDirectory?: string;
}

/**
 * Top-level experiment configuration.
 *
 * The inline comments mark settings that are handled in the Python part and
 * therefore have no field here.
 */
export interface ExperimentConfig {
    experimentName?: string;
    // searchSpaceFile  (handled in python part)
    searchSpace: any;
    trialCommand: string;
    trialCodeDirectory: string;
    trialConcurrency: number;
    trialGpuNumber?: number;
    /** Either a plain number or a string. */
    maxExperimentDuration?: string | number;
    maxTrialNumber?: number;
    /** Either a plain number or a string. */
    maxTrialDuration?: string | number;
    nniManagerIp?: string;
    // useAnnotation  (handled in python part)
    debug: boolean;
    logLevel?: string;
    experimentWorkingDirectory?: string;
    tunerGpuIndices?: number[];
    tuner?: AlgorithmConfig;
    assessor?: AlgorithmConfig;
    advisor?: AlgorithmConfig;
    /** Either a single training-service configuration or an array of them. */
    trainingService: TrainingServiceConfig | TrainingServiceConfig[];
    sharedStorage?: SharedStorageConfig;
    deprecated?: any;  // configs that are not yet natively supported by v2 (workaround)
}

/* util functions */

// Number of seconds in each time-unit suffix accepted by toSeconds().
const timeUnits = {
    d: 24 * 60 * 60,
    h: 60 * 60,
    m: 60,
    s: 1,
};
liuzhe-lz's avatar
liuzhe-lz committed
220
// Number of bytes in each size-unit suffix accepted by toMegaBytes().
// Key order matters: toUnit() tests suffixes in this order and every other
// suffix also ends with "b", so "b" has to stay last.
const sizeUnits = {
    tb: 2 ** 40,
    gb: 2 ** 30,
    mb: 2 ** 20,
    kb: 2 ** 10,
    b: 1,
};
221

liuzhe-lz's avatar
liuzhe-lz committed
222
223
224
225
226
227
228
229
230
231
/**
 * Convert a quantity into `targetUnit`, rounded up to an integer.
 *
 * A `number` is returned as-is, without conversion or rounding.
 * A `string` has to be a number followed by one of the keys of `allUnits`
 * (matched case-insensitively), for example "1.5h" or "512MB".
 *
 * Suffixes are tried in the key order of `allUnits`; when one unit is a suffix
 * of another ("b" and "mb"), the longer one must be listed first.
 *
 * @param value - Plain number, or a "<number><unit>" string.
 * @param targetUnit - Key of `allUnits` to convert to.
 * @param allUnits - Maps each unit suffix to its size in a common base unit.
 * @returns The converted amount, rounded up (string input) or unchanged (number input).
 * @throws Error if the string has no known unit suffix, if the part before the
 *         suffix is empty or not a finite number, or if `targetUnit` is unknown.
 */
function toUnit(value: string | number, targetUnit: string, allUnits: Record<string, number>): number {
    if (typeof value === 'number') {
        return value;
    }

    const targetFactor = allUnits[targetUnit];
    if (targetFactor === undefined) {
        // Dividing by undefined below would silently produce NaN.
        throw new Error(`Bad target unit "${targetUnit}"`);
    }

    const text = value.toLowerCase();
    for (const [unit, factor] of Object.entries(allUnits)) {
        if (!text.endsWith(unit)) {
            continue;
        }
        const digits = text.slice(0, -unit.length);
        const amount = Number(digits);
        // Number('') is 0 and Number('abc') is NaN; neither is a valid amount.
        if (digits.trim() === '' || !Number.isFinite(amount)) {
            throw new Error(`Bad number in "${text}"`);
        }
        return Math.ceil((amount * factor) / targetFactor);
    }
    throw new Error(`Bad unit in "${text}"`);
}

liuzhe-lz's avatar
liuzhe-lz committed
237
238
/**
 * Convert a duration to seconds.
 *
 * @param time - A number, which toUnit() returns unchanged, or a string ending
 *               in one of the `timeUnits` suffixes ("d", "h", "m", "s").
 * @returns The duration in seconds; string input is rounded up to an integer.
 * @throws Error when toUnit() rejects the string.
 */
export function toSeconds(time: string | number): number {
    return toUnit(time, 's', timeUnits);
}

liuzhe-lz's avatar
liuzhe-lz committed
241
242
/**
 * Convert a size to megabytes.
 *
 * @param size - A number, which toUnit() returns unchanged, or a string ending
 *               in one of the `sizeUnits` suffixes ("tb", "gb", "mb", "kb", "b").
 * @returns The size in megabytes; string input is rounded up to an integer.
 * @throws Error when toUnit() rejects the string.
 */
export function toMegaBytes(size: string | number): number {
    return toUnit(size, 'mb', sizeUnits);
}

liuzhe-lz's avatar
liuzhe-lz committed
245
246
/**
 * Join GPU indices into a comma-separated string.
 *
 * @param gpuIndices - Indices to join; may be omitted.
 * @returns The indices joined by "," with no spaces, or an empty string when
 *          `gpuIndices` is undefined or empty.
 */
export function toCudaVisibleDevices(gpuIndices?: number[]): string {
    if (gpuIndices === undefined) {
        return '';
    }
    return gpuIndices.join(',');
}