remoteMachineData.ts 6.17 KB
Newer Older
liuzhe-lz's avatar
liuzhe-lz committed
1
2
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
Deshui Yu's avatar
Deshui Yu committed
3
4
5

'use strict';

6
import { TrialJobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../common/trainingService';
7
import { GPUInfo, GPUSummary } from '../common/gpuData';
8
import { ShellExecutor } from './shellExecutor';
Deshui Yu's avatar
Deshui Yu committed
9
10
11
12
13

/**
 * Metadata of remote machine for configuration and status query
 */
export class RemoteMachineMeta {
    /** Address of the remote machine; defaults to an empty string. */
    public readonly ip: string = '';
    /** Port used to reach the machine; defaults to 22 (presumably the SSH port - confirm). */
    public readonly port: number = 22;
    /** Login user name; defaults to an empty string. */
    public readonly username: string = '';
    /** Login password; defaults to an empty string. */
    public readonly passwd: string = '';
    /** Optional path of an SSH key file. */
    public readonly sshKeyPath?: string;
    /** Optional passphrase (presumably protecting the key at `sshKeyPath` - confirm). */
    public readonly passphrase?: string;
    /** Most recent GPU summary for this machine; `undefined` until one is assigned. */
    public gpuSummary: GPUSummary | undefined;
    /** Optional GPU index list in string form (presumably the input of `parseGpuIndices` - confirm). */
    public readonly gpuIndices?: string;
    /** Optional upper bound of trials per GPU. */
    public readonly maxTrialNumPerGpu?: number;
    //TODO: initialize variable in constructor
    /** NOTE(review): looks like GPU index -> number of trials occupying it; verify against the scheduler. */
    public occupiedGpuIndexMap?: Map<number, number>;
    /** Defaults to `false`. */
    public readonly useActiveGpu?: boolean = false;
}

/**
 * Parse a comma separated GPU index list such as `"0,1,3"`.
 *
 * Whitespace around each entry is ignored and empty entries (for example the one
 * produced by a trailing comma) are skipped.
 *
 * @param gpuIndices comma separated list of integer indices
 * @returns the set of parsed indices, or `undefined` when `gpuIndices` is not specified
 * @throws Error when `gpuIndices` is specified but holds no index at all, or when an
 *         entry cannot be parsed as an integer
 */
export function parseGpuIndices(gpuIndices?: string): Set<number> | undefined {
    if (gpuIndices === undefined) {
        return undefined;
    }

    // String.prototype.split never returns an empty array (''.split(',') is ['']),
    // so emptiness must be decided on the tokens, not on the array length.
    const tokens: string[] = gpuIndices.split(',')
        .map((token: string) => token.trim())
        .filter((token: string) => token.length > 0);
    if (tokens.length === 0) {
        throw new Error('gpuIndices can not be empty if specified.');
    }

    const indices: Set<number> = new Set<number>();
    for (const token of tokens) {
        const index: number = parseInt(token, 10);
        // Reject unparsable entries instead of silently storing NaN in the set.
        if (Number.isNaN(index)) {
            throw new Error(`Invalid GPU index '${token}' in gpuIndices '${gpuIndices}'.`);
        }
        indices.add(index);
    }

    return indices;
}

/**
 * The execution result for command executed on remote machine
 */
export class RemoteCommandResult {
    /**
     * @param stdout text captured from the command's standard output
     * @param stderr text captured from the command's standard error
     * @param exitCode exit code reported for the command
     */
    constructor(
        public readonly stdout: string,
        public readonly stderr: string,
        public readonly exitCode: number
    ) { }
}

/**
 * RemoteMachineTrialJobDetail
 *
 * TrialJobDetail implementation that additionally carries the remote machine
 * metadata and GPU information attached to the trial.
 */
export class RemoteMachineTrialJobDetail implements TrialJobDetail {
    public id: string;
    public status: TrialJobStatus;
    public submitTime: number;
    public startTime?: number;
    public endTime?: number;
    /** Initialized to an empty array by the constructor. */
    public tags?: string[];
    public url?: string;
    public workingDirectory: string;
    public form: TrialJobApplicationForm;
    /** Remote machine metadata; not set by the constructor. */
    public rmMeta?: RemoteMachineMeta;
    public isEarlyStopped?: boolean;
    /** Initialized to an empty array by the constructor. */
    public gpuIndices: GPUInfo[];

    /**
     * @param id trial job id
     * @param status initial job status
     * @param submitTime submission time (NOTE(review): unit not visible here - confirm against callers)
     * @param workingDirectory working directory of the trial job
     * @param form application form the job was submitted with
     */
    constructor(id: string, status: TrialJobStatus, submitTime: number,
                workingDirectory: string, form: TrialJobApplicationForm) {
        this.id = id;
        this.status = status;
        this.submitTime = submitTime;
        this.workingDirectory = workingDirectory;
        this.form = form;
        this.tags = [];
        this.gpuIndices = [];
    }
}

SparkSnail's avatar
SparkSnail committed
84
/**
 * The remote machine executor manager
 *
 * Hands out ShellExecutor instances from a caller supplied array, using each
 * executor's used-connection counter to limit how many consumers share it.
 */
export class ExecutorManager {
    private readonly executorArray: ShellExecutor[];
    private readonly maxTrialNumberPerConnection: number;
    private readonly rmMeta: RemoteMachineMeta;

    /**
     * @param executorArray executors managed by this instance; the array is used as is (not copied)
     * @param maxTrialNumberPerConnection an executor is handed out only while its used-connection
     *        number is below this value
     * @param rmMeta metadata passed to `ShellExecutor.initialize` when a new executor is created
     */
    constructor(executorArray: ShellExecutor[], maxTrialNumberPerConnection: number, rmMeta: RemoteMachineMeta) {
        this.rmMeta = rmMeta;
        this.executorArray = executorArray;
        this.maxTrialNumberPerConnection = maxTrialNumberPerConnection;
    }

    /**
     * find a available executor, if no executor available, return a new one
     *
     * The first managed executor whose used-connection number is below the limit gets
     * its counter incremented and is returned.
     */
    public async getAvailableExecutor(): Promise<ShellExecutor> {
        for (const executor of this.executorArray) {
            if (executor.getUsedConnectionNumber < this.maxTrialNumberPerConnection) {
                executor.addUsedConnectionNumber();

                return executor;
            }
        }

        //init a new executor if could not get an available one
        // NOTE(review): the new executor is returned without being pushed onto executorArray
        // and without its used-connection number being incremented here; presumably the
        // caller registers it through addNewShellExecutor - confirm.
        return await this.initNewShellExecutor();
    }

    /**
     * add a new executor to executorArray
     * @param executor ShellExecutor
     */
    public addNewShellExecutor(executor: ShellExecutor): void {
        this.executorArray.push(executor);
    }

    /**
     * first executor instance is used for gpu collector and host job
     *
     * Returns element 0 of the managed array, which is `undefined` at runtime when the
     * array is empty.
     */
    public getFirstExecutor(): ShellExecutor {
        return this.executorArray[0];
    }

    /**
     * close all of executor
     */
    public closeAllExecutor(): void {
        for (const executor of this.executorArray) {
            executor.close();
        }
    }

    /**
     * retrieve resource, minus a number for given executor
     *
     * Executors that are not part of the managed array are ignored.
     * @param executor executor
     * @throws Error when `executor` is undefined
     */
    public releaseConnection(executor: ShellExecutor | undefined): void {
        if (executor === undefined) {
            throw new Error(`could not release a undefined executor`);
        }
        if (this.executorArray.includes(executor)) {
            executor.minusUsedConnectionNumber();
        }
    }

    /**
     * Create a new connection executor and initialize it
     */
    private async initNewShellExecutor(): Promise<ShellExecutor> {
        const executor = new ShellExecutor();
        await executor.initialize(this.rmMeta);

        return executor;
    }
}
SparkSnail's avatar
SparkSnail committed
163

164
/**
 * Result of a scheduling attempt: the outcome category plus the schedule
 * information, which may be `undefined` (presumably whenever the attempt did
 * not succeed - confirm against the scheduler).
 */
export type RemoteMachineScheduleResult = {
    scheduleInfo: RemoteMachineScheduleInfo | undefined;
    resultType: ScheduleResultType;
};
Deshui Yu's avatar
Deshui Yu committed
165

166
/**
 * Schedule information: the remote machine metadata together with a
 * `cudaVisibleDevice` string (presumably the value exported as
 * CUDA_VISIBLE_DEVICES for the trial - confirm against the scheduler).
 */
export type RemoteMachineScheduleInfo = {
    rmMeta: RemoteMachineMeta;
    cudaVisibleDevice: string;
};
Deshui Yu's avatar
Deshui Yu committed
167
168

/**
 * Outcome categories of a scheduling attempt.
 * Members keep their implicit numeric values (0, 1, 2).
 */
export enum ScheduleResultType {
    // Schedule succeeded
    SUCCEED,

    // Temporarily, not enough GPUs are available right now
    TMP_NO_AVAILABLE_GPU,

    // Cannot match requirement even if all GPUs are available
    REQUIRE_EXCEED_TOTAL
}

SparkSnail's avatar
SparkSnail committed
179
/**
 * Bash script template for running a trial on a remote machine.
 *
 * Positional placeholders, as they appear in the template:
 * {0} NNI_SYS_DIR, {1} NNI_OUTPUT_DIR, {2} NNI_TRIAL_JOB_ID, {3} NNI_EXP_ID,
 * {4} NNI_TRIAL_SEQ_ID, {5} MULTI_PHASE, {6} NNI_CODE_DIR,
 * {7} file that receives the script's shell pid (`$$`),
 * {8} --trial_command, {9} --nnimanager_ip, {10} --nnimanager_port,
 * {11} --nni_manager_version, {12} --log_collection,
 * {13} file that receives trial_keeper's exit status followed by `date +%s%3N`.
 *
 * A backslash at the end of a template line is a line continuation, so the
 * `export` statement and the `python3` command each form a single script line.
 * NOTE(review): placeholders are presumably filled by a String.Format style helper
 * at the call site - not visible in this file.
 */
export const REMOTEMACHINE_TRIAL_COMMAND_FORMAT: string =
    `#!/bin/bash
export NNI_PLATFORM=remote NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} \
NNI_TRIAL_SEQ_ID={4} MULTI_PHASE={5} NNI_CODE_DIR={6} 
cp -r $NNI_CODE_DIR/. $NNI_SYS_DIR
cd $NNI_SYS_DIR
sh install_nni.sh
echo $$ >{7}
python3 -m nni_trial_tool.trial_keeper --trial_command '{8}' --nnimanager_ip '{9}' --nnimanager_port '{10}' \
--nni_manager_version '{11}' --log_collection '{12}' 1>$NNI_OUTPUT_DIR/trialkeeper_stdout 2>$NNI_OUTPUT_DIR/trialkeeper_stderr
echo $? \`date +%s%3N\` >{13}`;
Deshui Yu's avatar
Deshui Yu committed
190

191
/**
 * Bash script template for a host job.
 *
 * Positional placeholders, as they appear in the template:
 * {0} directory to `cd` into,
 * {1} file that receives the script's shell pid (`$$`),
 * {2} command passed to `eval`; its output goes to `stdout` / `stderr` files in {0},
 * {3} file that receives the command's exit status followed by `date +%s%3N`.
 *
 * NOTE(review): placeholders are presumably filled by a String.Format style helper
 * at the call site - not visible in this file.
 */
export const HOST_JOB_SHELL_FORMAT: string =
    `#!/bin/bash
cd {0}
echo $$ >{1}
eval {2} >stdout 2>stderr
echo $? \`date +%s%3N\` >{3}`;