experimentConfig.ts 6.61 KB
Newer Older
1
2
3
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

4
import assert from 'assert';
5

6
7
8
import { KubeflowOperator, OperatorApiVersion } from '../training_service/kubernetes/kubeflow/kubeflowConfig'
import { KubernetesStorageKind } from '../training_service/kubernetes/kubernetesConfig';

9
10
/**
 * Fields shared by the training service configurations declared in this file.
 *
 * Platform-specific interfaces extend this one and typically narrow
 * `platform` to a string literal.
 */
export interface TrainingServiceConfig {
    platform: string;
    trialCommand: string;
    trialCodeDirectory: string;
    trialGpuNumber?: number;
    nniManagerIp?: string;

    // FIXME
    // "debug" is only used by openpai to decide whether to check remote nni version
    // it should be better to check when local nni version is not "dev"
    // it should be even better to check version before launching the experiment and let user to confirm
    // log level is currently handled by global logging module and has nothing to do with this
    debug?: boolean;
}

/* Local */

/**
 * Training service settings for the `'local'` platform.
 */
export interface LocalConfig extends TrainingServiceConfig {
    platform: 'local';
    useActiveGpu?: boolean;
    maxTrialNumberPerGpu: number;
    gpuIndices?: number[];
    reuseMode: boolean;
}

/* Remote */

/**
 * Settings for a single machine; `RemoteConfig.machineList` holds an array of these.
 *
 * NOTE(review): `password`, `sshKeyFile` and `sshPassphrase` look like SSH
 * credentials; `sshKeyFile` is required while `password` is optional — confirm
 * this is intended for password-only logins.
 */
export interface RemoteMachineConfig {
    host: string;
    port: number;
    user: string;
    password?: string;
    sshKeyFile: string;
    sshPassphrase?: string;
    useActiveGpu: boolean;
    maxTrialNumberPerGpu: number;
    gpuIndices?: number[];
    pythonPath?: string;
}

/**
 * Training service settings for the `'remote'` platform, which carries a
 * list of per-machine settings.
 */
export interface RemoteConfig extends TrainingServiceConfig {
    platform: 'remote';
    machineList: RemoteMachineConfig[];
    reuseMode: boolean;
}

/* OpenPAI */

/**
 * Training service settings for the `'openpai'` platform.
 */
export interface OpenpaiConfig extends TrainingServiceConfig {
    platform: 'openpai';
    host: string;
    username: string;
    token: string;
    trialCpuNumber: number;
    trialMemorySize: string;
    storageConfigName: string;
    dockerImage: string;
    virtualCluster?: string;
    localStorageMountPoint: string;
    containerStorageMountPoint: string;
    reuseMode: boolean;
    // Untyped pass-through object; its schema is not visible in this file.
    openpaiConfig?: object;
}

/* AML */

/**
 * Training service settings for the `'aml'` platform.
 */
export interface AmlConfig extends TrainingServiceConfig {
    platform: 'aml';
    subscriptionId: string;
    resourceGroup: string;
    workspaceName: string;
    computeTarget: string;
    dockerImage: string;
    maxTrialNumberPerGpu: number;
}

85
86
87
88
89
90
91
92
93
94

/*  Alibaba PAI DLC  */
/**
 * Training service settings for the `'dlc'` (Alibaba PAI DLC) platform.
 */
export interface DlcConfig extends TrainingServiceConfig {
    /** Discriminator literal, spelled like the `platform` field of the other platform configs. */
    platform: 'dlc';
    /**
     * @deprecated Misspelling of `platform`. Kept as an optional alias so code
     * that still sets it keeps compiling; use `platform` instead.
     */
    platfrom?: 'dlc';
    type: string;
    image: string;
    jobType: string;
    podCount: number;
    ecsSpec: string;
    region: string;
    workspaceId: string;
    nasDataSourceId: string;
    ossDataSourceId?: string;
    accessKeyId: string;
    accessKeySecret: string;
    localStorageMountPoint: string;
    containerStorageMountPoint: string;
}
/* Kubeflow */

105
/**
 * Storage settings used by the `storage` field of `KubeflowConfig` and
 * `FrameworkControllerConfig`.
 *
 * Only `storageType` is required.
 * NOTE(review): presumably `server`/`path` apply to one storage type and the
 * `azure*`/`keyVault*` fields to another — confirm against the consumers.
 */
export interface KubernetesStorageConfig {
    storageType: string;
    server?: string;
    path?: string;
    azureAccount?: string;
    azureShare?: string;
    keyVaultName?: string;
    keyVaultKey?: string;
}

/**
 * Settings for one Kubeflow role; `KubeflowConfig` uses this type for its
 * `worker`, `ps` and `master` fields.
 */
export interface KubeflowRoleConfig {
    replicas: number;
    command: string;
    gpuNumber: number;
    cpuNumber: number;
    // Accepts either a number or a string (presumably with a size-unit suffix — confirm against the consumer).
    memorySize: string | number;
    dockerImage: string;
    codeDirectory: string;
    privateRegistryAuthPath?: string;
}

/**
 * Training service settings for the `'kubeflow'` platform.
 *
 * The `worker`, `ps` and `master` role settings are each optional.
 */
export interface KubeflowConfig extends TrainingServiceConfig {
    platform: 'kubeflow';
    operator: KubeflowOperator;
    apiVersion: OperatorApiVersion;
    storage: KubernetesStorageConfig;
    worker?: KubeflowRoleConfig;
    ps?: KubeflowRoleConfig;
    master?: KubeflowRoleConfig;
    reuseMode: boolean;
    maxTrialNumberPerGpu?: number;
    namespace?: string;
}

139
/**
 * Settings for one task role; `FrameworkControllerConfig.taskRoles` holds an
 * array of these.
 */
export interface FrameworkControllerTaskRoleConfig {
    name: string;
    dockerImage: string;
    taskNumber: number;
    command: string;
    gpuNumber: number;
    cpuNumber: number;
    // Accepts either a number or a string (presumably with a size-unit suffix — confirm against the consumer).
    memorySize: string | number;
    frameworkAttemptCompletionPolicy: {
        minFailedTaskCount: number;
        minSucceedTaskCount: number;
    };
    privateRegistryAuthPath?: string;
}

/**
 * Training service settings for the `'frameworkcontroller'` platform.
 */
export interface FrameworkControllerConfig extends TrainingServiceConfig {
    platform: 'frameworkcontroller';
    storage: KubernetesStorageConfig;
    serviceAccountName: string;
    taskRoles: FrameworkControllerTaskRoleConfig[];
    reuseMode: boolean;
    maxTrialNumberPerGpu?: number;
    namespace?: string;
    apiVersion?: string;
}

/* shared storage */

/**
 * Fields common to the shared storage configurations; this is the type of
 * `ExperimentConfig.sharedStorage`.
 */
export interface SharedStorageConfig {
    storageType: string;
    localMountPoint: string;
    remoteMountPoint: string;
    // NOTE(review): typed as a free-form string; the accepted values are not visible in this file.
    localMounted: string;
}

/**
 * Shared storage settings whose `storageType` is narrowed to `'NFS'`.
 */
export interface NfsConfig extends SharedStorageConfig {
    storageType: 'NFS';
    nfsServer: string;
    exportedDirectory: string;
}

/**
 * Shared storage settings that add a storage account and a container name.
 *
 * NOTE(review): unlike `NfsConfig`, this interface does not narrow
 * `storageType` to a literal — confirm whether that is intentional.
 */
export interface AzureBlobConfig extends SharedStorageConfig {
    storageAccountName: string;
    storageAccountKey?: string;
    containerName: string;
}

/* common */

/**
 * Description of an algorithm; `ExperimentConfig` uses this type for its
 * `tuner`, `assessor` and `advisor` fields. Every field is optional.
 *
 * NOTE(review): presumably `name` selects a registered algorithm while
 * `className`/`codeDirectory` select a custom one — confirm against the loader.
 */
export interface AlgorithmConfig {
    name?: string;
    className?: string;
    codeDirectory?: string;
    classArgs?: object;
}

/**
 * Top-level experiment configuration.
 *
 * `trainingService` accepts either a single training service configuration
 * or an array of them.
 */
export interface ExperimentConfig {
    experimentName?: string;
    // searchSpaceFile  (handled in python part)
    searchSpace: any;
    trialCommand: string;
    trialCodeDirectory: string;
    trialConcurrency: number;
    trialGpuNumber?: number;
    maxExperimentDuration?: string | number;
    maxTrialNumber?: number;
    maxTrialDuration?: string | number;
    nniManagerIp?: string;
    // useAnnotation  (handled in python part)
    debug: boolean;
    logLevel?: string;
    experimentWorkingDirectory?: string;
    tunerGpuIndices?: number[];
    tuner?: AlgorithmConfig;
    assessor?: AlgorithmConfig;
    advisor?: AlgorithmConfig;
    trainingService: TrainingServiceConfig | TrainingServiceConfig[];
    sharedStorage?: SharedStorageConfig;
    deprecated?: any;  // configs that are not yet natively supported by v2 (workaround)
}

/* util functions */

/** Size of each time unit in seconds, keyed by its lower-case suffix. */
const timeUnits = { d: 24 * 3600, h: 3600, m: 60, s: 1 };

/**
 * Size of each storage unit in bytes, keyed by its lower-case suffix.
 * `b` is listed last because suffix lookup takes the first key the input
 * ends with, and every other suffix here also ends with "b".
 */
const sizeUnits = { tb: 1024 ** 4, gb: 1024 ** 3, mb: 1024 ** 2, kb: 1024, b: 1 };
224

liuzhe-lz's avatar
liuzhe-lz committed
225
226
227
228
229
230
231
232
233
234
/**
 * Convert a quantity written with a unit suffix into a whole number of `targetUnit`s.
 *
 * A numeric `value` is returned unchanged: no conversion or rounding is applied.
 * A string `value` is lower-cased and matched against the keys of `allUnits` in
 * the table's key order; the first key the string ends with is taken as its unit,
 * so a suffix must be listed before any shorter suffix it ends with.
 *
 * @param value - A number, or a string made of a number followed by a unit suffix (e.g. "1.5h").
 * @param targetUnit - Key of `allUnits` naming the unit of the result.
 * @param allUnits - Maps each lower-case unit suffix to its size in a common base unit.
 * @returns The quantity expressed in `targetUnit`, rounded up with `Math.ceil`.
 * @throws Error if `targetUnit` is not a key of `allUnits`, if no suffix matches,
 *         or if the text before the suffix is empty or not a number.
 */
function toUnit(value: string | number, targetUnit: string, allUnits: Record<string, number>): number {
    if (typeof value === 'number') {
        return value;
    }
    const targetFactor = allUnits[targetUnit];
    if (targetFactor === undefined) {
        // Dividing by a missing factor would silently produce NaN.
        throw new Error(`Bad unit "${targetUnit}"`);
    }
    const text = value.toLowerCase();
    for (const [unit, factor] of Object.entries(allUnits)) {
        if (!text.endsWith(unit)) {
            continue;
        }
        const digits = text.slice(0, -unit.length);
        const amount = Number(digits);
        // Number('') is 0 and Number('abc') is NaN; reject both rather than returning 0 or NaN.
        if (digits.trim() === '' || Number.isNaN(amount)) {
            throw new Error(`Bad number in "${text}"`);
        }
        return Math.ceil((amount * factor) / targetFactor);
    }
    throw new Error(`Bad unit in "${text}"`);
}

liuzhe-lz's avatar
liuzhe-lz committed
240
241
/**
 * Convert a duration to seconds using the `timeUnits` table.
 *
 * @param time - A number (returned as-is by `toUnit`) or a string with a time-unit suffix.
 * @returns The duration in seconds.
 */
export function toSeconds(time: string | number): number {
    return toUnit(time, 's', timeUnits);
}

liuzhe-lz's avatar
liuzhe-lz committed
244
245
/**
 * Convert a size to megabytes using the `sizeUnits` table.
 *
 * @param size - A number (returned as-is by `toUnit`) or a string with a size-unit suffix.
 * @returns The size in megabytes.
 */
export function toMegaBytes(size: string | number): number {
    return toUnit(size, 'mb', sizeUnits);
}

liuzhe-lz's avatar
liuzhe-lz committed
248
249
/**
 * Format a list of GPU indices as a comma-separated string.
 *
 * @param gpuIndices - GPU indices to join; may be omitted.
 * @returns The indices joined with ",", or an empty string when `gpuIndices` is `undefined`.
 */
export function toCudaVisibleDevices(gpuIndices?: number[]): string {
    if (gpuIndices === undefined) {
        return '';
    }
    return gpuIndices.join(',');
}