main.ts 7.23 KB
Newer Older
Deshui Yu's avatar
Deshui Yu committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
/**
 * Copyright (c) Microsoft Corporation
 * All rights reserved.
 *
 * MIT License
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
 * documentation files (the "Software"), to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
 * to permit persons to whom the Software is furnished to do so, subject to the following conditions:
 * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
 * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

'use strict';

import { Container, Scope } from 'typescript-ioc';

24
import * as fs from 'fs';
25
import * as component from './common/component';
Deshui Yu's avatar
Deshui Yu committed
26
27
import { Database, DataStore } from './common/datastore';
import { setExperimentStartupInfo } from './common/experimentStartupInfo';
chicm-ms's avatar
chicm-ms committed
28
import { getLogger, Logger, logLevelNameMap } from './common/log';
SparkSnail's avatar
SparkSnail committed
29
import { Manager, ExperimentStartUpMode } from './common/manager';
Deshui Yu's avatar
Deshui Yu committed
30
import { TrainingService } from './common/trainingService';
31
import { getLogDir, mkDirP, parseArg, uniqueString } from './common/utils';
Deshui Yu's avatar
Deshui Yu committed
32
33
34
import { NNIDataStore } from './core/nniDataStore';
import { NNIManager } from './core/nnimanager';
import { SqlDB } from './core/sqlDatabase';
35
import { NNIRestServer } from './rest_server/nniRestServer';
36
37
38
39
import { FrameworkControllerTrainingService } from './training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService';
import { KubeflowTrainingService } from './training_service/kubernetes/kubeflow/kubeflowTrainingService';
import { LocalTrainingService } from './training_service/local/localTrainingService';
import { PAITrainingService } from './training_service/pai/paiTrainingService';
Deshui Yu's avatar
Deshui Yu committed
40
41
42
43
import {
    RemoteMachineTrainingService
} from './training_service/remote_machine/remoteMachineTrainingService';

44
45
/**
 * Derives the experiment identity from the start mode and records the
 * startup settings via setExperimentStartupInfo.
 *
 * @param startExpMode start mode string; a new experiment is created only
 *        when it equals ExperimentStartUpMode.NEW
 * @param resumeExperimentId id used as the experiment id when the start mode
 *        is not NEW
 * @param basePort port value forwarded unchanged to setExperimentStartupInfo
 * @param logDirectory log directory forwarded unchanged
 * @param experimentLogLevel log level name forwarded unchanged
 * @param readonly read-only flag forwarded unchanged
 */
function initStartupInfo(
    startExpMode: string, resumeExperimentId: string, basePort: number,
    logDirectory: string, experimentLogLevel: string, readonly: boolean): void {
    const createNew: boolean = (startExpMode === ExperimentStartUpMode.NEW);
    // A new experiment gets an id from uniqueString(8) (presumably an
    // 8-character id — confirm against ./common/utils); otherwise the
    // caller-supplied id is used as-is.
    const expId: string = createNew ? uniqueString(8) : resumeExperimentId;
    setExperimentStartupInfo(createNew, expId, basePort, logDirectory, experimentLogLevel, readonly);
}

SparkSnail's avatar
SparkSnail committed
52
/**
 * Registers the IoC bindings used by the process and initializes the data store.
 *
 * Binds TrainingService to the implementation selected by `platformMode`,
 * then binds Manager, Database and DataStore as singletons, installs a
 * Logger provider, and finally awaits `DataStore.init()`.
 *
 * @param platformMode one of 'local', 'remote', 'pai', 'kubeflow',
 *        'frameworkcontroller'
 * @param logFileName optional value passed to the Logger constructor by the
 *        Logger provider
 * @throws Error when `platformMode` is not one of the supported values
 */
async function initContainer(platformMode: string, logFileName?: string): Promise<void> {
    switch (platformMode) {
        case 'local':
            Container.bind(TrainingService)
                .to(LocalTrainingService)
                .scope(Scope.Singleton);
            break;
        case 'remote':
            Container.bind(TrainingService)
                .to(RemoteMachineTrainingService)
                .scope(Scope.Singleton);
            break;
        case 'pai':
            Container.bind(TrainingService)
                .to(PAITrainingService)
                .scope(Scope.Singleton);
            break;
        case 'kubeflow':
            Container.bind(TrainingService)
                .to(KubeflowTrainingService)
                .scope(Scope.Singleton);
            break;
        case 'frameworkcontroller':
            Container.bind(TrainingService)
                .to(FrameworkControllerTrainingService)
                .scope(Scope.Singleton);
            break;
        default:
            // Report the value this function was actually given, not the
            // module-level `mode` constant.
            throw new Error(`Error: unsupported mode: ${platformMode}`);
    }

    Container.bind(Manager)
        .to(NNIManager)
        .scope(Scope.Singleton);
    Container.bind(Database)
        .to(SqlDB)
        .scope(Scope.Singleton);
    Container.bind(DataStore)
        .to(NNIDataStore)
        .scope(Scope.Singleton);

    // The provider constructs a Logger from logFileName each time `get` is invoked.
    Container.bind(Logger).provider({
        get: (): Logger => new Logger(logFileName)
    });

    // The data store must finish initializing before the caller proceeds.
    const ds: DataStore = component.get(DataStore);
    await ds.init();
}

/**
 * Prints the one-line command-line synopsis to the console via console.info.
 *
 * The message is assembled by concatenation rather than a backslash line
 * continuation, so source indentation is not embedded in the printed text.
 */
function usage(): void {
    console.info(
        'usage: node main.js --port <port> --mode ' +
        '<local/remote/pai/kubeflow/frameworkcontroller> --start_mode <new/resume> --experiment_id <id>'
    );
}

// ---- Command-line argument parsing and validation ----
// Each option is read with parseArg (long and short flag names). The length
// checks below presumably rely on parseArg returning '' for an absent flag —
// confirm against ./common/utils.

const strPort: string = parseArg(['--port', '-p']);
if (!strPort || strPort.length === 0) {
    usage();
    process.exit(1);
}

const port: number = parseInt(strPort, 10);
// parseInt yields NaN for non-numeric input; reject it here instead of
// passing NaN on as the base port.
if (Number.isNaN(port)) {
    console.log(`FATAL: invalid port: ${strPort}`);
    usage();
    process.exit(1);
}

const mode: string = parseArg(['--mode', '-m']);
if (!['local', 'remote', 'pai', 'kubeflow', 'frameworkcontroller'].includes(mode)) {
    console.log(`FATAL: unknown mode: ${mode}`);
    usage();
    process.exit(1);
}

const startMode: string = parseArg(['--start_mode', '-s']);
if (![ExperimentStartUpMode.NEW, ExperimentStartUpMode.RESUME].includes(startMode)) {
    console.log(`FATAL: unknown start_mode: ${startMode}`);
    usage();
    process.exit(1);
}

// Resuming requires a non-blank experiment id.
const experimentId: string = parseArg(['--experiment_id', '-id']);
if ((startMode === ExperimentStartUpMode.RESUME) && experimentId.trim().length < 1) {
    console.log(`FATAL: cannot resume the experiment, invalid experiment_id: ${experimentId}`);
    usage();
    process.exit(1);
}

// NOTE(review): a missing log_dir is reported as FATAL but startup continues;
// kept as-is in case this is intentional — confirm.
const logDir: string = parseArg(['--log_dir', '-ld']);
if (logDir.length > 0 && !fs.existsSync(logDir)) {
    console.log(`FATAL: log_dir ${logDir} does not exist`);
}

// NOTE(review): an unknown log_level is likewise reported without exiting — confirm.
const logLevel: string = parseArg(['--log_level', '-ll']);
if (logLevel.length > 0 && !logLevelNameMap.has(logLevel)) {
    console.log(`FATAL: invalid log_level: ${logLevel}`);
}

// Accept exactly 'true' or 'false' (case-insensitive). An empty value is still
// accepted and treated as false, matching the previous behavior when the flag
// is omitted. The former check, ('true' || 'false').includes(x), reduced to a
// substring test against 'true' and therefore rejected 'false'.
const readonlyArg: string = parseArg(['--readonly', '-r']);
if (!['', 'true', 'false'].includes(readonlyArg.toLowerCase())) {
    console.log(`FATAL: readonly property should only be true or false`);
    usage();
    process.exit(1);
}
const readonly = readonlyArg.toLowerCase() === 'true';

initStartupInfo(startMode, experimentId, port, logDir, logLevel, readonly);
Deshui Yu's avatar
Deshui Yu committed
148

149
150
// ---- Bootstrap ----
// Create the log directory, then initialize the IoC container and start the
// REST server. `logDirReady` records that mkDirP resolved, so the final
// handler can tell a directory-creation failure from a startup failure
// instead of labelling every error "Failed to create log dir".
let logDirReady: boolean = false;
mkDirP(getLogDir())
    .then(async (): Promise<void> => {
        logDirReady = true;
        try {
            await initContainer(mode);
            const restServer: NNIRestServer = component.get(NNIRestServer);
            await restServer.start();
            const log: Logger = getLogger();
            log.info(`Rest server listening on: ${restServer.endPoint}`);
        } catch (err) {
            // Record the failure through the logger, then rethrow so the
            // final handler also reports it on the console.
            const log: Logger = getLogger();
            log.error(`${err.stack}`);
            throw err;
        }
    })
    .catch((err: Error) => {
        if (logDirReady) {
            console.error(`Failed to start NNI manager: ${err.stack}`);
        } else {
            console.error(`Failed to create log dir: ${err.stack}`);
        }
    });
166

167
168
169
170
171
172
173
174
175
176
/**
 * Selects the signal name this process listens on for a stop request.
 *
 * @returns 'SIGBREAK' when process.platform is "win32", otherwise 'SIGTERM'
 */
function getStopSignal(): 'SIGBREAK' | 'SIGTERM' {
    return process.platform === 'win32' ? 'SIGBREAK' : 'SIGTERM';
}

// ---- Shutdown on stop signal ----
// On the platform's stop signal: stop the experiment, close the data store,
// stop the REST server, then close the logger and exit. The exit code is 1
// if any shutdown step threw, 0 otherwise.
process.on(getStopSignal(), async (): Promise<void> => {
    const log: Logger = getLogger();
    let hasError: boolean = false;
    try {
        const nniManager: Manager = component.get(Manager);
        await nniManager.stopExperiment();
        const ds: DataStore = component.get(DataStore);
        await ds.close();
        const restServer: NNIRestServer = component.get(NNIRestServer);
        await restServer.stop();
    } catch (err) {
        hasError = true;
        log.error(`${err.stack}`);
    } finally {
        try {
            await log.close();
        } finally {
            // Exit even if closing the logger fails, so the process cannot
            // linger after a stop request.
            process.exit(hasError ? 1 : 0);
        }
    }
});