// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
Deshui Yu's avatar
Deshui Yu committed
3
4
5

'use strict';

6
import * as fs from 'fs';
SparkSnail's avatar
SparkSnail committed
7
8
import { Client, ConnectConfig } from 'ssh2';
import { Deferred } from 'ts-deferred';
9
import { TrialJobApplicationForm, TrialJobDetail, TrialJobStatus  } from '../../common/trainingService';
10
import { GPUInfo, GPUSummary } from '../common/gpuData';
Deshui Yu's avatar
Deshui Yu committed
11
12
13
14
15

/**
 * Metadata of a remote machine, used for configuration and status queries.
 */
export class RemoteMachineMeta {
    /** Host address of the remote machine (used as the SSH `host`). */
    public readonly ip: string = '';
    /** SSH port of the remote machine; defaults to 22. */
    public readonly port: number = 22;
    /** User name used to log in through SSH. */
    public readonly username: string = '';
    /** Password used for SSH password / keyboard-interactive authentication. */
    public readonly passwd: string = '';
    /** Path of the private key file used for SSH key authentication. */
    public readonly sshKeyPath?: string;
    /** Passphrase protecting the private key at `sshKeyPath`. */
    public readonly passphrase?: string;
    /** GPU summary of the machine; undefined until it has been assigned. */
    public gpuSummary: GPUSummary | undefined;
    /** GPU indices configured for this machine (presumably comma separated, see parseGpuIndices - confirm). */
    public readonly gpuIndices?: string;
    /** NOTE(review): presumably the maximum number of trials sharing one GPU - confirm against the scheduler. */
    public readonly maxTrialNumPerGpu?: number;
    //TODO: initialize variable in constructor
    /** NOTE(review): presumably maps a GPU index to the number of trials occupying it - confirm. */
    public occupiedGpuIndexMap?: Map<number, number>;
    /** NOTE(review): presumably allows scheduling on GPUs that already run other processes - confirm. */
    public readonly useActiveGpu?: boolean = false;
}

/**
 * Parse a comma separated list of GPU indices into a set of numbers.
 *
 * Whitespace around entries and empty entries (e.g. a trailing comma) are ignored.
 *
 * @param gpuIndices comma separated GPU indices, e.g. `"0,1,3"`
 * @returns the set of parsed indices, or `undefined` when `gpuIndices` is not specified
 * @throws Error when `gpuIndices` is specified but holds no index,
 *         or when an entry is not a non-negative decimal integer
 */
export function parseGpuIndices(gpuIndices?: string): Set<number> | undefined {
    if (gpuIndices === undefined) {
        return undefined;
    }

    // `split` always yields at least one element (possibly ''), so emptiness
    // has to be checked after blank entries are dropped.
    const tokens: string[] = gpuIndices.split(',')
        .map((x: string) => x.trim())
        .filter((x: string) => x.length > 0);
    if (tokens.length === 0) {
        throw new Error('gpuIndices can not be empty if specified.');
    }

    const indices: Set<number> = new Set<number>();
    for (const token of tokens) {
        // Reject malformed entries instead of silently storing NaN or a truncated number.
        if (!/^\d+$/.test(token)) {
            throw new Error(`Invalid GPU index '${token}' in gpuIndices '${gpuIndices}'.`);
        }
        indices.add(parseInt(token, 10));
    }

    return indices;
}

/**
 * The execution result of a command executed on a remote machine.
 */
export class RemoteCommandResult {
    /** Text captured from the command's standard output. */
    public readonly stdout: string;
    /** Text captured from the command's standard error. */
    public readonly stderr: string;
    /** Exit code reported for the command. */
    public readonly exitCode: number;

    /**
     * @param stdout text captured from standard output
     * @param stderr text captured from standard error
     * @param exitCode exit code of the command
     */
    constructor(stdout: string, stderr: string, exitCode: number) {
        this.stdout = stdout;
        this.stderr = stderr;
        this.exitCode = exitCode;
    }
}

/**
 * Trial job detail for a job handled by the remote machine training service.
 */
export class RemoteMachineTrialJobDetail implements TrialJobDetail {
    /** Identifier of the trial job. */
    public id: string;
    /** Current status of the trial job. */
    public status: TrialJobStatus;
    /** Submission time (NOTE(review): presumably a millisecond timestamp - confirm with callers). */
    public submitTime: number;
    /** Start time; unset until assigned by the caller. */
    public startTime?: number;
    /** End time; unset until assigned by the caller. */
    public endTime?: number;
    /** Tags attached to the job; initialized to an empty array. */
    public tags?: string[];
    /** URL associated with the job, if any. */
    public url?: string;
    /** Working directory of the trial job. */
    public workingDirectory: string;
    /** Application form the job was submitted with. */
    public form: TrialJobApplicationForm;
    /** Remote machine associated with the job; unset until assigned by the caller. */
    public rmMeta?: RemoteMachineMeta;
    /** Early-stop flag; unset until assigned by the caller. */
    public isEarlyStopped?: boolean;
    /** GPUs associated with the job; initialized to an empty array. */
    public gpuIndices: GPUInfo[];

    /**
     * @param id identifier of the trial job
     * @param status initial status of the job
     * @param submitTime time the job was submitted
     * @param workingDirectory working directory of the job
     * @param form application form the job was submitted with
     */
    constructor(id: string, status: TrialJobStatus, submitTime: number,
                workingDirectory: string, form: TrialJobApplicationForm) {
        this.id = id;
        this.status = status;
        this.submitTime = submitTime;
        this.workingDirectory = workingDirectory;
        this.form = form;
        this.tags = [];
        this.gpuIndices = [];
    }
}

SparkSnail's avatar
SparkSnail committed
86
87
88
89
90
91
92
93
94
95
/**
 * The remote machine ssh client used for trial and gpu detector.
 * Wraps an ssh2 client together with a counter of how many users currently hold it.
 */
export class SSHClient {
    private readonly sshClient: Client;
    private usedConnectionNumber: number; //count the connection number of every client

    /**
     * @param sshClient the wrapped ssh2 client
     * @param usedConnectionNumber initial value of the usage counter
     */
    constructor(sshClient: Client, usedConnectionNumber: number) {
        this.sshClient = sshClient;
        this.usedConnectionNumber = usedConnectionNumber;
    }

    /** The wrapped ssh2 client (property accessor, not a method). */
    public get getSSHClientInstance(): Client {
        return this.sshClient;
    }

    /** Current value of the usage counter (property accessor, not a method). */
    public get getUsedConnectionNumber(): number {
        return this.usedConnectionNumber;
    }

    /** Increase the usage counter by one. */
    public addUsedConnectionNumber(): void {
        this.usedConnectionNumber += 1;
    }

    /** Decrease the usage counter by one. */
    public minusUsedConnectionNumber(): void {
        this.usedConnectionNumber -= 1;
    }
}

114
115
116
/**
 * The remote machine ssh client manager.
 * Keeps a pool of ssh clients for one remote machine and hands them out while
 * limiting how many users share a single connection.
 */
export class SSHClientManager {
    private readonly sshClientArray: SSHClient[];
    private readonly maxTrialNumberPerConnection: number;
    private readonly rmMeta: RemoteMachineMeta;

    /**
     * @param sshClientArray initial pool of ssh clients (new clients are appended to this array)
     * @param maxTrialNumberPerConnection upper bound of concurrent users per ssh client
     * @param rmMeta metadata of the remote machine new connections are opened to
     */
    constructor(sshClientArray: SSHClient[], maxTrialNumberPerConnection: number, rmMeta: RemoteMachineMeta) {
        this.rmMeta = rmMeta;
        this.sshClientArray = sshClientArray;
        this.maxTrialNumberPerConnection = maxTrialNumberPerConnection;
    }

    /**
     * Find an available ssh client in the pool and mark it as used once more.
     * If every pooled client already reached the per-connection limit, a new
     * ssh connection is opened, added to the pool and returned.
     *
     * @returns a promise of a connected ssh client; rejected when a new connection cannot be set up
     */
    public async getAvailableSSHClient(): Promise<Client> {
        for (const sshClient of this.sshClientArray) {
            if (sshClient.getUsedConnectionNumber < this.maxTrialNumberPerConnection) {
                sshClient.addUsedConnectionNumber();

                return sshClient.getSSHClientInstance;
            }
        }

        //init a new ssh client if could not get an available one
        return this.initNewSSHClient();
    }

    /**
     * add a new ssh client to sshClientArray; its usage counter starts at 1
     * @param client SSH Client
     */
    public addNewSSHClient(client: Client): void {
        this.sshClientArray.push(new SSHClient(client, 1));
    }

    /**
     * first ssh client instance is used for gpu collector and host job
     * @throws Error when the pool holds no ssh client
     */
    public getFirstSSHClient(): Client {
        if (this.sshClientArray.length === 0) {
            throw new Error('No ssh client is available in sshClientArray.');
        }

        return this.sshClientArray[0].getSSHClientInstance;
    }

    /**
     * close all of ssh client
     */
    public closeAllSSHClient(): void {
        for (const sshClient of this.sshClientArray) {
            sshClient.getSSHClientInstance.end();
        }
    }

    /**
     * retrieve resource, minus a number for given ssh client
     * @param client SSH Client
     * @throws Error when `client` is undefined
     */
    public releaseConnection(client: Client | undefined): void {
        if (client === undefined) {
            throw new Error(`could not release a undefined ssh client`);
        }
        for (const sshClient of this.sshClientArray) {
            if (sshClient.getSSHClientInstance === client) {
                sshClient.minusUsedConnectionNumber();
                break;
            }
        }
    }

    /**
     * Create a new ssh connection client and initialize it.
     * The returned promise resolves once the connection is ready (the client is
     * then already part of the pool) and rejects on invalid authentication
     * settings or on a connection error.
     */
    private initNewSSHClient(): Promise<Client> {
        let connectConfig: ConnectConfig;
        try {
            connectConfig = this.buildConnectConfig();
        } catch (error) {
            // Stop here: without valid credentials there is nothing to connect with.
            return Promise.reject(error);
        }

        const deferred: Deferred<Client> = new Deferred<Client>();
        const conn: Client = new Client();
        conn.on('ready', () => {
            this.addNewSSHClient(conn);
            deferred.resolve(conn);
        })
          .on('error', (err: Error) => {
            // SSH connection error, reject with error message
            deferred.reject(new Error(err.message));
        }).on("keyboard-interactive", (name, instructions, lang, prompts, finish) => {
            // Answer the keyboard-interactive challenge with the configured password.
            finish([this.rmMeta.passwd]);
        })
          .connect(connectConfig);

        return deferred.promise;
    }

    /**
     * Build the ssh2 connect configuration from the remote machine metadata.
     * A configured password takes precedence over the private key.
     *
     * @throws Error when the key file does not exist or neither password nor key path is configured
     */
    // tslint:disable:non-literal-fs-path
    private buildConnectConfig(): ConnectConfig {
        const connectConfig: ConnectConfig = {
            host: this.rmMeta.ip,
            port: this.rmMeta.port,
            username: this.rmMeta.username,
            tryKeyboard: true };
        if (this.rmMeta.passwd !== undefined) {
            connectConfig.password = this.rmMeta.passwd;
        } else if (this.rmMeta.sshKeyPath !== undefined) {
            if (!fs.existsSync(this.rmMeta.sshKeyPath)) {
                //SSh key path is not a valid file, reject
                throw new Error(`${this.rmMeta.sshKeyPath} does not exist.`);
            }
            connectConfig.privateKey = fs.readFileSync(this.rmMeta.sshKeyPath, 'utf8');
            connectConfig.passphrase = this.rmMeta.passphrase;
        } else {
            throw new Error(`No valid passwd or sshKeyPath is configed.`);
        }

        return connectConfig;
    }
}
SparkSnail's avatar
SparkSnail committed
227

chicm-ms's avatar
chicm-ms committed
228
/**
 * Result of a scheduling attempt: the outcome category in `resultType` and,
 * optionally, the scheduling details in `scheduleInfo`.
 * NOTE(review): `scheduleInfo` is presumably undefined unless `resultType` is SUCCEED - confirm with the scheduler.
 */
export type RemoteMachineScheduleResult = { scheduleInfo: RemoteMachineScheduleInfo | undefined; resultType: ScheduleResultType};
Deshui Yu's avatar
Deshui Yu committed
229

chicm-ms's avatar
chicm-ms committed
230
/**
 * Scheduling details: the remote machine involved (`rmMeta`) and a `cudaVisibleDevice` string.
 * NOTE(review): `cudaVisibleDevice` is presumably the value for CUDA_VISIBLE_DEVICES - confirm with the scheduler.
 */
export type RemoteMachineScheduleInfo = { rmMeta: RemoteMachineMeta; cudaVisibleDevice: string};
Deshui Yu's avatar
Deshui Yu committed
231
232

/**
 * Possible outcomes of a scheduling attempt.
 */
export enum ScheduleResultType {
    // Schedule succeeded
    SUCCEED,

    // Temporarily, not enough GPUs are available right now
    TMP_NO_AVAILABLE_GPU,

    // Cannot match the requirement even if all GPUs are available
    REQUIRE_EXCEED_TOTAL
}

SparkSnail's avatar
SparkSnail committed
243
/**
 * Bash script template for running a trial on a remote machine.
 * `{0}` .. `{12}` are positional placeholders to be filled in by the caller:
 * {0} NNI_SYS_DIR, {1} NNI_OUTPUT_DIR, {2} NNI_TRIAL_JOB_ID, {3} NNI_EXP_ID,
 * {4} NNI_TRIAL_SEQ_ID, {5} MULTI_PHASE, {6} file receiving the shell's pid (`$$`),
 * {7} trial command, {8} nnimanager ip, {9} nnimanager port, {10} nni manager version,
 * {11} log collection, {12} file receiving the exit code and a millisecond timestamp.
 * A backslash at the end of a line inside the template literal is a line
 * continuation, so those lines are joined in the resulting string.
 */
export const REMOTEMACHINE_TRIAL_COMMAND_FORMAT: string =
`#!/bin/bash
export NNI_PLATFORM=remote NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} \
NNI_TRIAL_SEQ_ID={4} export MULTI_PHASE={5}
cd $NNI_SYS_DIR
sh install_nni.sh
echo $$ >{6}
python3 -m nni_trial_tool.trial_keeper --trial_command '{7}' --nnimanager_ip '{8}' --nnimanager_port '{9}' \
--nni_manager_version '{10}' --log_collection '{11}' 1>$NNI_OUTPUT_DIR/trialkeeper_stdout 2>$NNI_OUTPUT_DIR/trialkeeper_stderr
echo $? \`date +%s%3N\` >{12}`;
Deshui Yu's avatar
Deshui Yu committed
253

254
/**
 * Bash script template for running a host job.
 * Positional placeholders to be filled in by the caller:
 * {0} directory to `cd` into, {1} file receiving the shell's pid (`$$`),
 * {2} command passed to `eval` (its output goes to `stdout` / `stderr` files in that directory),
 * {3} file receiving the exit code and a millisecond timestamp.
 */
export const HOST_JOB_SHELL_FORMAT: string =
`#!/bin/bash
cd {0}
echo $$ >{1}
eval {2} >stdout 2>stderr
echo $? \`date +%s%3N\` >{3}`;