/**
 * Copyright (c) Microsoft Corporation
 * All rights reserved.
 *
 * MIT License
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
 * documentation files (the "Software"), to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
 * to permit persons to whom the Software is furnished to do so, subject to the following conditions:
 * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
 * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

'use strict';

import * as fs from 'fs';
import { Client, ConnectConfig } from 'ssh2';
import { Deferred } from 'ts-deferred';
import { TrialJobApplicationForm, TrialJobDetail, TrialJobStatus  } from '../../common/trainingService';
import { GPUInfo, GPUSummary } from '../common/gpuData';

/**
 * Metadata of a remote machine, used for configuration and status queries.
 *
 * The SSH-related fields (ip, port, username, passwd, sshKeyPath, passphrase)
 * are the ones read when a connection to the machine is opened.
 */
export class RemoteMachineMeta {
    // SSH endpoint and login
    public readonly ip: string = '';
    public readonly port: number = 22;
    public readonly username: string = '';
    public readonly passwd: string = '';
    // Private-key authentication (path to the key file and its optional passphrase)
    public readonly sshKeyPath?: string;
    public readonly passphrase?: string;
    // GPU summary of this machine; undefined until one has been assigned
    public gpuSummary: GPUSummary | undefined;
    // Comma-separated list of GPU indices (see parseGpuIndices for the expected format)
    public readonly gpuIndices?: string;
    public readonly maxTrialNumPerGpu?: number;
    //TODO: initialize variable in constructor
    // NOTE(review): presumably maps a GPU index to the number of trials occupying it - confirm against the scheduler
    public occupiedGpuIndexMap?: Map<number, number>;
    public readonly useActiveGpu?: boolean = false;
}

/**
 * Parse a comma-separated list of GPU indices (for example `"0,1,3"`) into a set.
 *
 * Whitespace around entries and empty entries (such as a trailing comma) are ignored.
 *
 * @param gpuIndices comma-separated GPU indices, or undefined when nothing was specified
 * @returns the set of parsed indices, or undefined when `gpuIndices` is undefined
 * @throws Error when `gpuIndices` is specified but contains no index, or contains
 *         an entry that cannot be parsed as an integer
 */
export function parseGpuIndices(gpuIndices?: string): Set<number> | undefined {
    if (gpuIndices === undefined) {
        return undefined;
    }

    // ''.split(',') yields [''], so blank tokens must be dropped before the emptiness check
    const tokens: string[] = gpuIndices.split(',')
        .map((token: string) => token.trim())
        .filter((token: string) => token.length > 0);
    if (tokens.length === 0) {
        throw new Error('gpuIndices can not be empty if specified.');
    }

    const indices: number[] = tokens.map((token: string) => {
        const index: number = parseInt(token, 10);
        if (Number.isNaN(index)) {
            // Never let NaN leak into the index set
            throw new Error(`Invalid GPU index '${token}' in gpuIndices '${gpuIndices}'.`);
        }

        return index;
    });

    return new Set(indices);
}

/**
 * Immutable record of what a command run on a remote machine produced:
 * its standard output, its standard error and its exit code.
 */
export class RemoteCommandResult {
    /**
     * @param stdout text captured from the command's standard output
     * @param stderr text captured from the command's standard error
     * @param exitCode exit code reported for the command
     */
    constructor(
        public readonly stdout: string,
        public readonly stderr: string,
        public readonly exitCode: number
    ) {}
}

/**
 * Trial job detail for a job run on a remote machine.
 *
 * Extends the generic TrialJobDetail contract with the remote machine the job
 * is bound to (`rmMeta`) and the GPUs associated with it (`gpuIndices`).
 */
export class RemoteMachineTrialJobDetail implements TrialJobDetail {
    public id: string;
    public status: TrialJobStatus;
    public submitTime: number;
    public startTime?: number;
    public endTime?: number;
    public tags?: string[];
    public url?: string;
    public workingDirectory: string;
    public form: TrialJobApplicationForm;
    // Remote machine this job is bound to; not set by the constructor
    public rmMeta?: RemoteMachineMeta;
    public isEarlyStopped?: boolean;
    public gpuIndices: GPUInfo[];

    /**
     * @param id trial job id
     * @param status initial job status
     * @param submitTime submission timestamp
     * @param workingDirectory working directory of the job
     * @param form application form the job was created from
     */
    constructor(id: string, status: TrialJobStatus, submitTime: number,
                workingDirectory: string, form: TrialJobApplicationForm) {
        this.id = id;
        this.status = status;
        this.submitTime = submitTime;
        this.workingDirectory = workingDirectory;
        this.form = form;
        // Both collections start empty and are filled in later by the owner of this object
        this.tags = [];
        this.gpuIndices = [];
    }
}

/**
 * The remote machine ssh client used for trial and gpu detector.
 *
 * Wraps an ssh2 Client together with a counter of how many users currently
 * share that connection.
 */
export class SSHClient {
    private readonly sshClient: Client;
    private usedConnectionNumber: number; //count the connection number of every client

    /**
     * @param sshClient the underlying ssh2 client
     * @param usedConnectionNumber initial value of the usage counter
     */
    constructor(sshClient: Client, usedConnectionNumber: number) {
        this.sshClient = sshClient;
        this.usedConnectionNumber = usedConnectionNumber;
    }

    /** The wrapped ssh2 client (property getter, read without parentheses) */
    public get getSSHClientInstance(): Client {
        return this.sshClient;
    }

    /** Current value of the usage counter (property getter, read without parentheses) */
    public get getUsedConnectionNumber(): number {
        return this.usedConnectionNumber;
    }

    /** Increase the usage counter by one */
    public addUsedConnectionNumber(): void {
        this.usedConnectionNumber += 1;
    }

    /** Decrease the usage counter by one */
    public minusUsedConnectionNumber(): void {
        this.usedConnectionNumber -= 1;
    }
}

/**
 * The remote machine ssh client manager.
 *
 * Keeps a pool of SSHClient wrappers for one remote machine, hands out a
 * connection whose usage counter is below `maxTrialNumberPerConnection`, and
 * opens a new connection when every pooled one is at its limit.
 */
export class SSHClientManager {
    private readonly sshClientArray: SSHClient[];
    private readonly maxTrialNumberPerConnection: number;
    private readonly rmMeta: RemoteMachineMeta;

    /**
     * @param sshClientArray initial pool of ssh clients (shared with the caller, new clients are pushed into it)
     * @param maxTrialNumberPerConnection upper bound of the usage counter for a client to still be handed out
     * @param rmMeta metadata of the remote machine used to open new connections
     */
    constructor(sshClientArray: SSHClient[], maxTrialNumberPerConnection: number, rmMeta: RemoteMachineMeta) {
        this.rmMeta = rmMeta;
        this.sshClientArray = sshClientArray;
        this.maxTrialNumberPerConnection = maxTrialNumberPerConnection;
    }

    /**
     * Find an available ssh client in the pool and mark it as used once more;
     * if no pooled client is available, open a new connection instead.
     *
     * @returns promise resolved with the ssh2 client; rejected when a new
     *          connection is needed but cannot be configured or established
     */
    public async getAvailableSSHClient(): Promise<Client> {
        for (const pooled of this.sshClientArray) {
            if (pooled.getUsedConnectionNumber < this.maxTrialNumberPerConnection) {
                pooled.addUsedConnectionNumber();

                return pooled.getSSHClientInstance;
            }
        }

        //init a new ssh client if could not get an available one
        return this.initNewSSHClient();
    }

    /**
     * add a new ssh client to sshClientArray, with its usage counter starting at 1
     * @param client SSH Client
     */
    public addNewSSHClient(client: Client): void {
        this.sshClientArray.push(new SSHClient(client, 1));
    }

    /**
     * first ssh client instance is used for gpu collector and host job
     */
    public getFirstSSHClient(): Client {
        return this.sshClientArray[0].getSSHClientInstance;
    }

    /**
     * close all of ssh client
     */
    public closeAllSSHClient(): void {
        for (const sshClient of this.sshClientArray) {
            sshClient.getSSHClientInstance.end();
        }
    }

    /**
     * retrieve resource, minus a number for given ssh client
     * (a client that is not in the pool is silently ignored)
     * @param client SSH Client
     * @throws Error when client is undefined
     */
    public releaseConnection(client: Client | undefined): void {
        if (client === undefined) {
            throw new Error(`could not release a undefined ssh client`);
        }
        const owner: SSHClient | undefined = this.sshClientArray.find(
            (pooled: SSHClient) => pooled.getSSHClientInstance === client);
        if (owner !== undefined) {
            owner.minusUsedConnectionNumber();
        }
    }

    /**
     * Create a new ssh connection client and initialize it.
     *
     * The connection settings are validated first, so a missing key file or
     * missing credentials reject the returned promise without any connection
     * attempt being made.
     */
    private initNewSSHClient(): Promise<Client> {
        let connectConfig: ConnectConfig;
        try {
            connectConfig = this.createConnectConfig();
        } catch (error) {
            // Invalid configuration: report through the promise, never connect
            return Promise.reject(error);
        }

        const deferred: Deferred<Client> = new Deferred<Client>();
        const conn: Client = new Client();
        conn.on('ready', () => {
            // Only a successfully established connection joins the pool
            this.addNewSSHClient(conn);
            deferred.resolve(conn);
        }).on('error', (err: Error) => {
            // SSH connection error, reject with error message
            deferred.reject(new Error(err.message));
        }).on("keyboard-interactive", (name, instructions, lang, prompts, finish) => {
            // Answer the keyboard-interactive challenge with the configured password
            finish([this.rmMeta.passwd]);
        }).connect(connectConfig);

        return deferred.promise;
    }

    /**
     * Build the ssh2 connect configuration from the remote machine metadata.
     * A configured password takes precedence over the ssh key.
     *
     * @throws Error when the ssh key file does not exist, or when neither a
     *         password nor an ssh key path is configured
     */
    // tslint:disable:non-literal-fs-path
    private createConnectConfig(): ConnectConfig {
        const connectConfig: ConnectConfig = {
            host: this.rmMeta.ip,
            port: this.rmMeta.port,
            username: this.rmMeta.username,
            tryKeyboard: true };
        if (this.rmMeta.passwd !== undefined) {
            connectConfig.password = this.rmMeta.passwd;
        } else if (this.rmMeta.sshKeyPath !== undefined) {
            if (!fs.existsSync(this.rmMeta.sshKeyPath)) {
                //SSh key path is not a valid file, reject
                throw new Error(`${this.rmMeta.sshKeyPath} does not exist.`);
            }
            connectConfig.privateKey = fs.readFileSync(this.rmMeta.sshKeyPath, 'utf8');
            connectConfig.passphrase = this.rmMeta.passphrase;
        } else {
            throw new Error(`No valid passwd or sshKeyPath is configed.`);
        }

        return connectConfig;
    }
}
/**
 * Outcome of a scheduling attempt: the result type plus, when available,
 * the schedule information that was selected.
 */
export type RemoteMachineScheduleResult = { scheduleInfo : RemoteMachineScheduleInfo | undefined; resultType : ScheduleResultType};

/**
 * A selected remote machine together with a `cuda_visible_device` string.
 * NOTE(review): presumably the value exported as CUDA_VISIBLE_DEVICES for the trial - confirm against the scheduler.
 */
export type RemoteMachineScheduleInfo = { rmMeta : RemoteMachineMeta; cuda_visible_device : string};

/**
 * Possible outcomes of a scheduling attempt.
 * Numeric enum: members take the values 0, 1, 2 in declaration order.
 */
export enum ScheduleResultType {
    // Schedule succeeded
    SUCCEED,

    // Temporarily, no enough available GPU right now
    TMP_NO_AVAILABLE_GPU,

    // Cannot match requirement even if all GPU are available
    // NOTE(review): original comment was truncated after "are a"; "available" is assumed
    REQUIRE_EXCEED_TOTAL
}

/**
 * Bash script template for running a trial on a remote machine.
 * `{0}` .. `{12}` are positional placeholders to be substituted by the caller.
 * A backslash at the end of a template line is a line continuation, so the
 * following line is joined to it in the resulting string.
 */
export const REMOTEMACHINE_TRIAL_COMMAND_FORMAT: string =
`#!/bin/bash
export NNI_PLATFORM=remote NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} \
NNI_TRIAL_SEQ_ID={4} export MULTI_PHASE={5}
cd $NNI_SYS_DIR
sh install_nni.sh
echo $$ >{6}
python3 -m nni_trial_tool.trial_keeper --trial_command '{7}' --nnimanager_ip '{8}' --nnimanager_port '{9}' \
--nni_manager_version '{10}' --log_collection '{11}' 1>$NNI_OUTPUT_DIR/trialkeeper_stdout 2>$NNI_OUTPUT_DIR/trialkeeper_stderr
echo $? \`date +%s%3N\` >{12}`;
/**
 * Bash script template for a host job.
 * `{0}` .. `{3}` are positional placeholders to be substituted by the caller:
 * the script changes into `{0}`, writes its own pid to `{1}`, evaluates `{2}`
 * with stdout/stderr redirected to files, then writes the exit code and a
 * millisecond timestamp to `{3}`.
 */
export const HOST_JOB_SHELL_FORMAT: string =
`#!/bin/bash
cd {0}
echo $$ >{1}
eval {2} >stdout 2>stderr
echo $? \`date +%s%3N\` >{3}`;