main.ts 7.89 KB
Newer Older
liuzhe-lz's avatar
liuzhe-lz committed
1
2
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
Deshui Yu's avatar
Deshui Yu committed
3
4
5
6
7

'use strict';

import { Container, Scope } from 'typescript-ioc';

8
import * as fs from 'fs';
9
import * as path from 'path';
10
import * as component from './common/component';
Deshui Yu's avatar
Deshui Yu committed
11
12
import { Database, DataStore } from './common/datastore';
import { setExperimentStartupInfo } from './common/experimentStartupInfo';
chicm-ms's avatar
chicm-ms committed
13
import { getLogger, Logger, logLevelNameMap } from './common/log';
SparkSnail's avatar
SparkSnail committed
14
import { Manager, ExperimentStartUpMode } from './common/manager';
15
import { ExperimentManager } from './common/experimentManager';
Deshui Yu's avatar
Deshui Yu committed
16
import { TrainingService } from './common/trainingService';
17
import { getLogDir, mkDirP, parseArg } from './common/utils';
Deshui Yu's avatar
Deshui Yu committed
18
19
20
import { NNIDataStore } from './core/nniDataStore';
import { NNIManager } from './core/nnimanager';
import { SqlDB } from './core/sqlDatabase';
21
import { NNIExperimentsManager } from './core/nniExperimentsManager';
22
import { NNIRestServer } from './rest_server/nniRestServer';
23
import { FrameworkControllerTrainingService } from './training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService';
24
import { AdlTrainingService } from './training_service/kubernetes/adl/adlTrainingService';
25
26
import { KubeflowTrainingService } from './training_service/kubernetes/kubeflow/kubeflowTrainingService';
import { LocalTrainingService } from './training_service/local/localTrainingService';
27
import { RouterTrainingService } from './training_service/reusable/routerTrainingService';
28
import { PAIYarnTrainingService } from './training_service/pai/paiYarn/paiYarnTrainingService';
George Cheng's avatar
George Cheng committed
29
import { DLTSTrainingService } from './training_service/dlts/dltsTrainingService';
Deshui Yu's avatar
Deshui Yu committed
30

31
/**
 * Forwards the parsed command-line settings to `setExperimentStartupInfo`.
 *
 * The start-up mode string is reduced to a single "create new experiment" flag:
 * it is `true` only when `startExpMode` equals `ExperimentStartUpMode.NEW`.
 * All other arguments are passed through unchanged.
 */
function initStartupInfo(
    startExpMode: string, experimentId: string, basePort: number, platform: string,
    logDirectory: string, experimentLogLevel: string, readonly: boolean): void {
    setExperimentStartupInfo(
        startExpMode === ExperimentStartUpMode.NEW,
        experimentId,
        basePort,
        platform,
        logDirectory,
        experimentLogLevel,
        readonly
    );
}

38
/**
 * Registers the IoC bindings used by the manager and initialises the data store.
 *
 * @param foreground   when true, the logger is created without a log file name
 * @param platformMode training platform name; selects the TrainingService implementation
 * @param logFileName  optional log file; defaults to `nnimanager.log` under `getLogDir()`
 *                     when not running in foreground
 * @throws Error when `platformMode` is not one of the supported platforms
 */
async function initContainer(foreground: boolean, platformMode: string, logFileName?: string): Promise<void> {
    // Bind the TrainingService implementation for the requested platform.
    // 'remote', 'pai' and 'aml' all go through RouterTrainingService.
    switch (platformMode) {
        case 'adl':
            Container.bind(TrainingService).to(AdlTrainingService).scope(Scope.Singleton);
            break;
        case 'local':
            Container.bind(TrainingService).to(LocalTrainingService).scope(Scope.Singleton);
            break;
        case 'remote':
        case 'pai':
        case 'aml':
            Container.bind(TrainingService).to(RouterTrainingService).scope(Scope.Singleton);
            break;
        case 'paiYarn':
            Container.bind(TrainingService).to(PAIYarnTrainingService).scope(Scope.Singleton);
            break;
        case 'kubeflow':
            Container.bind(TrainingService).to(KubeflowTrainingService).scope(Scope.Singleton);
            break;
        case 'frameworkcontroller':
            Container.bind(TrainingService).to(FrameworkControllerTrainingService).scope(Scope.Singleton);
            break;
        case 'dlts':
            Container.bind(TrainingService).to(DLTSTrainingService).scope(Scope.Singleton);
            break;
        default:
            throw new Error(`Error: unsupported mode: ${platformMode}`);
    }

    // Platform-independent singletons.
    Container.bind(Manager).to(NNIManager).scope(Scope.Singleton);
    Container.bind(Database).to(SqlDB).scope(Scope.Singleton);
    Container.bind(DataStore).to(NNIDataStore).scope(Scope.Singleton);
    Container.bind(ExperimentManager).to(NNIExperimentsManager).scope(Scope.Singleton);

    // Decide which file (if any) the logger is constructed with:
    // foreground -> none, otherwise the caller's file or the default one.
    const DEFAULT_LOGFILE: string = path.join(getLogDir(), 'nnimanager.log');
    let loggerTarget: string | undefined;
    if (foreground) {
        loggerTarget = undefined;
    } else {
        loggerTarget = (logFileName === undefined) ? DEFAULT_LOGFILE : logFileName;
    }
    Container.bind(Logger).provider({
        get: (): Logger => new Logger(loggerTarget)
    });

    // Resolve the data store from the container and initialise it before returning.
    const dataStore: DataStore = component.get(DataStore);
    await dataStore.init();
}

/**
 * Prints the command-line synopsis to the console (via `console.info`).
 *
 * The mode list mirrors the platforms accepted by the `--mode` validation in this
 * file, and the message is assembled from separate pieces so that no source
 * indentation leaks into the printed text.
 */
function usage(): void {
    const supportedModes: string = [
        'adl', 'local', 'remote', 'pai', 'kubeflow', 'frameworkcontroller', 'paiYarn', 'dlts', 'aml'
    ].join('/');
    console.info(
        `usage: node main.js --port <port> --mode <${supportedModes}> ` +
        '--start_mode <new/resume> --experiment_id <id> --foreground <true/false> ' +
        '[--log_dir <dir>] [--log_level <level>] [--readonly <true/false>]'
    );
}

// --port / -p is mandatory: print the synopsis and exit when it is missing or empty.
const strPort: string = parseArg(['--port', '-p']);
if (!strPort) {
    usage();
    process.exit(1);
}

115
116
117
118
119
120
121
122
// --foreground / -f: optional boolean flag.
// An empty value is accepted and treated as false; anything other than
// "true"/"false" (case-insensitive) is rejected.
// The check uses an explicit allow-list: `('true' || 'false')` evaluates to the
// single string 'true', which turned the old test into a substring match that
// rejected "false" and accepted fragments such as "ru".
const foregroundArg: string = parseArg(['--foreground', '-f']);
if (!['', 'true', 'false'].includes(foregroundArg.toLowerCase())) {
    console.log(`FATAL: foreground property should only be true or false`);
    usage();
    process.exit(1);
}
const foreground: boolean = foregroundArg.toLowerCase() === 'true';

goooxu's avatar
goooxu committed
123
124
// Convert the --port value to a number and make sure it is a usable TCP port;
// parseInt yields NaN for non-numeric input, which must not reach the server start-up.
const port: number = parseInt(strPort, 10);
if (Number.isNaN(port) || port < 0 || port > 65535) {
    console.log(`FATAL: invalid port: ${strPort}`);
    usage();
    process.exit(1);
}

Deshui Yu's avatar
Deshui Yu committed
125
// --mode / -m: training platform; must be one of the names listed below.
const mode: string = parseArg(['--mode', '-m']);
if ([
    'adl', 'local', 'remote', 'pai', 'kubeflow', 'frameworkcontroller', 'paiYarn', 'dlts', 'aml'
].indexOf(mode) === -1) {
    console.log(`FATAL: unknown mode: ${mode}`);
    usage();
    process.exit(1);
}

// --start_mode / -s: must be either the NEW or the RESUME start-up mode.
const startMode: string = parseArg(['--start_mode', '-s']);
if (startMode !== ExperimentStartUpMode.NEW && startMode !== ExperimentStartUpMode.RESUME) {
    console.log(`FATAL: unknown start_mode: ${startMode}`);
    usage();
    process.exit(1);
}

// --experiment_id / -id: required; a blank or whitespace-only id is rejected.
const experimentId: string = parseArg(['--experiment_id', '-id']);
if (experimentId.trim() === '') {
    console.log(`FATAL: cannot resume the experiment, invalid experiment_id: ${experimentId}`);
    usage();
    process.exit(1);
}

146
147
148
149
150
151
152
153
// --log_dir / -ld: optional. A non-empty path that does not exist is only reported;
// start-up continues.
// NOTE(review): presumably the directory is created later via mkDirP(getLogDir()),
// which would explain why this does not exit — confirm.
const logDir: string = parseArg(['--log_dir', '-ld']);
if (logDir.length > 0 && !fs.existsSync(logDir)) {
    console.log(`FATAL: log_dir ${logDir} does not exist`);
}

// --log_level / -ll: optional. A non-empty value missing from logLevelNameMap is
// reported, but start-up continues.
const logLevel: string = parseArg(['--log_level', '-ll']);
if (!(logLevel.length === 0 || logLevelNameMap.has(logLevel))) {
    console.log(`FATAL: invalid log_level: ${logLevel}`);
}

SparkSnail's avatar
SparkSnail committed
158
159
160
161
162
163
164
165
// --readonly / -r: optional boolean flag.
// An empty value is accepted and treated as false; anything other than
// "true"/"false" (case-insensitive) is rejected.
// An explicit allow-list replaces `('true' || 'false').includes(...)`, which
// collapsed to a substring test against 'true' and therefore rejected "false".
const readonlyArg: string = parseArg(['--readonly', '-r']);
if (!['', 'true', 'false'].includes(readonlyArg.toLowerCase())) {
    console.log(`FATAL: readonly property should only be true or false`);
    usage();
    process.exit(1);
}
const readonly = readonlyArg.toLowerCase() === 'true';

166
// Record the validated command-line settings before the container is initialised.
initStartupInfo(startMode, experimentId, port, mode, logDir, logLevel, readonly);

// Create the log directory, then initialise the container and start the REST server.
// The two failure sources are handled separately so that a start-up error is no
// longer reported as "Failed to create log dir".
mkDirP(getLogDir())
    .then(
        async (): Promise<void> => {
            try {
                await initContainer(foreground, mode);
                const restServer: NNIRestServer = component.get(NNIRestServer);
                await restServer.start();
                getLogger().info(`Rest server listening on: ${restServer.endPoint}`);
            } catch (err) {
                // Rejections are not guaranteed to be Error instances; fall back to String().
                const detail: string = err instanceof Error ? `${err.stack || err.message}` : String(err);
                getLogger().error(detail);
                console.error(`Failed to start NNI manager: ${detail}`);
            }
        },
        (err: unknown): void => {
            // Only reached when mkDirP itself rejects.
            const detail: string = err instanceof Error ? `${err.stack || err.message}` : String(err);
            console.error(`Failed to create log dir: ${detail}`);
        }
    );
185

186
/**
 * Shuts the manager down and terminates the process.
 *
 * Stops, in order: the experiment (via Manager), the experiment manager, the data
 * store and the REST server. Any failure is logged and turns the exit code into 1;
 * the logger is always closed and `process.exit` is always called.
 */
async function cleanUp(): Promise<void> {
    const log: Logger = getLogger();
    let hasError: boolean = false;
    try {
        const nniManager: Manager = component.get(Manager);
        await nniManager.stopExperiment();
        const experimentManager: ExperimentManager = component.get(ExperimentManager);
        await experimentManager.stop();
        const ds: DataStore = component.get(DataStore);
        await ds.close();
        const restServer: NNIRestServer = component.get(NNIRestServer);
        await restServer.stop();
    } catch (err) {
        hasError = true;
        // A rejection may carry a non-Error value; avoid logging "undefined" in that case.
        log.error(err instanceof Error ? `${err.stack || err.message}` : String(err));
    } finally {
        log.close();
        process.exit(hasError ? 1 : 0);
    }
}

// Run cleanUp when the process receives SIGTERM, SIGBREAK or SIGINT.
(['SIGTERM', 'SIGBREAK', 'SIGINT'] as NodeJS.Signals[]).forEach((signal: NodeJS.Signals): void => {
    process.on(signal, cleanUp);
});