main.ts 5.78 KB
Newer Older
Deshui Yu's avatar
Deshui Yu committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
/**
 * Copyright (c) Microsoft Corporation
 * All rights reserved.
 *
 * MIT License
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
 * documentation files (the "Software"), to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
 * to permit persons to whom the Software is furnished to do so, subject to the following conditions:
 * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
 * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

'use strict';

import { Container, Scope } from 'typescript-ioc';

import * as component from './common/component';
import { Database, DataStore } from './common/datastore';
import { setExperimentStartupInfo } from './common/experimentStartupInfo';
import { getLogger, Logger } from './common/log';
import { Manager } from './common/manager';
import { TrainingService } from './common/trainingService';
import { parseArg, uniqueString, mkDirP, getLogDir } from './common/utils';
import { NNIDataStore } from './core/nniDataStore';
import { NNIManager } from './core/nnimanager';
import { SqlDB } from './core/sqlDatabase';
34
import { NNIRestServer } from './rest_server/nniRestServer';
35
import { LocalTrainingServiceForGPU } from './training_service/local/localTrainingServiceForGPU';
Deshui Yu's avatar
Deshui Yu committed
36
37
38
import {
    RemoteMachineTrainingService
} from './training_service/remote_machine/remoteMachineTrainingService';
39
import { PAITrainingService } from './training_service/pai/paiTrainingService';
40
41
import { KubeflowTrainingService } from './training_service/kubernetes/kubeflow/kubeflowTrainingService';
import { FrameworkControllerTrainingService } from './training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService';
Deshui Yu's avatar
Deshui Yu committed
42

43
/**
 * Record the startup parameters for this run via setExperimentStartupInfo.
 *
 * @param startExpMode start mode; the value 'new' creates a new experiment, any other value reuses resumeExperimentId
 * @param resumeExperimentId experiment id used when startExpMode is not 'new'
 * @param basePort forwarded unchanged to setExperimentStartupInfo
 */
function initStartupInfo(startExpMode: string, resumeExperimentId: string, basePort: number): void {
    const createNew: boolean = (startExpMode === 'new');
    // New experiments get an id from uniqueString(8) (presumably an 8-character id -- confirm in common/utils);
    // otherwise the caller-supplied id is kept.
    const expId: string = createNew ? uniqueString(8) : resumeExperimentId;
    setExperimentStartupInfo(createNew, expId, basePort);
}

/**
 * Register the IoC bindings for the selected platform and initialize the data store.
 *
 * Binds TrainingService to the implementation matching platformMode, then binds
 * Manager, Database and DataStore (all as singletons), and finally awaits
 * DataStore.init().
 *
 * @param platformMode one of 'local', 'remote', 'pai', 'kubeflow', 'frameworkcontroller'
 * @throws Error when platformMode is none of the supported values
 */
async function initContainer(platformMode: string): Promise<void> {
    if (platformMode === 'local') {
        Container.bind(TrainingService).to(LocalTrainingServiceForGPU).scope(Scope.Singleton);
    } else if (platformMode === 'remote') {
        Container.bind(TrainingService).to(RemoteMachineTrainingService).scope(Scope.Singleton);
    } else if (platformMode === 'pai') {
        Container.bind(TrainingService).to(PAITrainingService).scope(Scope.Singleton);
    } else if (platformMode === 'kubeflow') {
        Container.bind(TrainingService).to(KubeflowTrainingService).scope(Scope.Singleton);
    } else if (platformMode === 'frameworkcontroller') {
        Container.bind(TrainingService).to(FrameworkControllerTrainingService).scope(Scope.Singleton);
    } else {
        // Report the value this function actually received, not the module-level `mode` constant.
        throw new Error(`Error: unsupported mode: ${platformMode}`);
    }
    Container.bind(Manager).to(NNIManager).scope(Scope.Singleton);
    Container.bind(Database).to(SqlDB).scope(Scope.Singleton);
    Container.bind(DataStore).to(NNIDataStore).scope(Scope.Singleton);

    // The data store is resolved only after its bindings are registered above.
    const ds: DataStore = component.get(DataStore);
    await ds.init();
}

/**
 * Print the command-line synopsis of this entry point with console.info.
 */
function usage(): void {
    console.info('usage: node main.js --port <port> --mode <local/remote/pai/kubeflow/frameworkcontroller> --start_mode <new/resume> --experiment_id <id>');
}

// ---- Command-line parsing -------------------------------------------------
// Each argument is validated right after it is read; on any invalid value the
// usage text is printed and the process exits with status 1.

const strPort: string = parseArg(['--port', '-p']);
if (!strPort || strPort.length === 0) {
    usage();
    process.exit(1);
}

const port: number = parseInt(strPort, 10);
// parseInt yields NaN for non-numeric input; reject it (and out-of-range values)
// here instead of handing an unusable port to the startup info.
if (Number.isNaN(port) || port < 0 || port > 65535) {
    console.log(`FATAL: invalid port: ${strPort}`);
    usage();
    process.exit(1);
}

const mode: string = parseArg(['--mode', '-m']);
if (!['local', 'remote', 'pai', 'kubeflow', 'frameworkcontroller'].includes(mode)) {
    console.log(`FATAL: unknown mode: ${mode}`);
    usage();
    process.exit(1);
}

const startMode: string = parseArg(['--start_mode', '-s']);
if (!['new', 'resume'].includes(startMode)) {
    console.log(`FATAL: unknown start_mode: ${startMode}`);
    usage();
    process.exit(1);
}

const experimentId: string = parseArg(['--experiment_id', '-id']);
// An experiment id is only mandatory when resuming; a blank id cannot be resumed.
if (startMode === 'resume' && experimentId.trim().length < 1) {
    console.log(`FATAL: cannot resume experiment, invalid experiment_id: ${experimentId}`);
    usage();
    process.exit(1);
}

initStartupInfo(startMode, experimentId, port);
Deshui Yu's avatar
Deshui Yu committed
106
107
108
109
110

// ---- Startup ----------------------------------------------------------------
// Create the log directory first, then set up the container and start the REST
// server. Directory-creation failures and startup failures are reported by
// separate handlers so that each console message names the real cause.
mkDirP(getLogDir()).then(
    async () => {
        const log: Logger = getLogger();
        try {
            await initContainer(mode);
            const restServer: NNIRestServer = component.get(NNIRestServer);
            await restServer.start();
            log.info(`Rest server listening on: ${restServer.endPoint}`);
        } catch (err) {
            log.error(`${err.stack}`);
            // Re-throw so the failure also reaches the console handler below.
            throw err;
        }
    },
    (err: Error) => {
        // Only reached when mkDirP itself rejects.
        console.error(`Failed to create log dir: ${err.stack}`);
    }
).catch((err: Error) => {
    // Only reached when the startup callback above throws or rejects.
    console.error(`Failed to start NNI manager: ${err.stack}`);
});
121
122
123

// ---- Shutdown ---------------------------------------------------------------
// On SIGTERM: stop the experiment, close the data store, stop the REST server,
// close the logger, then exit. The exit status is 1 if any step failed, else 0.
process.on('SIGTERM', async () => {
    const log: Logger = getLogger();
    let hasError: boolean = false;
    try {
        const nniManager: Manager = component.get(Manager);
        await nniManager.stopExperiment();
        const ds: DataStore = component.get(DataStore);
        await ds.close();
        const restServer: NNIRestServer = component.get(NNIRestServer);
        await restServer.stop();
    } catch (err) {
        hasError = true;
        log.error(`${err.stack}`);
    } finally {
        try {
            await log.close();
        } catch (closeErr) {
            // The logger is unusable at this point, so report on the console instead.
            hasError = true;
            console.error(`Failed to close logger: ${closeErr.stack}`);
        } finally {
            // Always exit, even if closing the logger failed; otherwise the
            // process would linger after receiving SIGTERM.
            process.exit(hasError ? 1 : 0);
        }
    }
});