Refactor local gpu scheduler (#943)

f05e685f · SparkSnail · GitHub · f075aab0 · f05e685f · f05e685f
Unverified Commit f05e685f authored Apr 01, 2019 by SparkSnail Committed by GitHub Apr 01, 2019
7 changed files
--- a/src/nni_manager/package.json
+++ b/src/nni_manager/package.json
@@ -18,7 +18,6 @@
    "express-joi-validator": "^2.0.0",
    "js-base64": "^2.4.9",
    "kubernetes-client": "^6.5.0",
-    "node-nvidia-smi": "^1.0.0",
    "rx": "^4.1.0",
    "sqlite3": "^4.0.2",
    "ssh2": "^0.6.1",

--- a/src/nni_manager/training_service/common/gpuData.ts
+++ b/src/nni_manager/training_service/common/gpuData.ts
@@ -58,3 +58,11 @@ export class GPUSummary {
        this.gpuInfos = gpuInfos;
    }
 }
+export const GPU_INFO_COLLECTOR_FORMAT: string = 
+`
+#!/bin/bash
+export METRIC_OUTPUT_DIR={0}
+echo $$ >{1}
+python3 -m nni_gpu_tool.gpu_metrics_collector
+`
--- a/src/nni_manager/training_service/local/gpuScheduler.ts
+++ b/src/nni_manager/training_service/local/gpuScheduler.ts
@@ -19,268 +19,16 @@
 'use strict';
-import * as assert from 'assert';
-import * as nodeNvidiaSmi from 'node-nvidia-smi';
 import { delay } from '../../common/utils';
 import { GPUInfo, GPUSummary } from '../common/gpuData';
 import { getLogger, Logger } from '../../common/log';
+import * as cp from 'child_process';
-/* Example of nvidia-smi result
+import * as cpp from 'child-process-promise';
-{
+import * as path from 'path';
-    "nvidia_smi_log": {
+import * as os from 'os';
-        "timestamp": "Fri Jul 13 15:17:27 2018",
+import * as fs from 'fs';
-        "driver_version": "396.26",
+import { String } from 'typescript-string-operations';
-        "attached_gpus": "8",
+import { GPU_INFO_COLLECTOR_FORMAT } from '../common/gpuData'
-        "gpu": [
-            ...,
-            {
-                ...
-                "minor_number": "5",
-                "utilization": {
-                    "gpu_util": "100 %",
-                    "memory_util": "27 %",
-                    "encoder_util": "0 %",
-                    "decoder_util": "0 %"
-                },
-                ...
-                "processes": {
-                    "process_info": {
-                        "pid": "39943",
-                        "type": "C",
-                        "process_name": "python",
-                        "used_memory": "16229 MiB"
-                    }
-                },
-                ...
-            },
-            {
-                "$": {
-                    "id": "00000000:8E:00.0"
-                },
-                "product_name": "Tesla P100-PCIE-16GB",
-                "product_brand": "Tesla",
-                "display_mode": "Enabled",
-                "display_active": "Disabled",
-                "persistence_mode": "Disabled",
-                "accounting_mode": "Disabled",
-                "accounting_mode_buffer_size": "4000",
-                "driver_model": {
-                    "current_dm": "N/A",
-                    "pending_dm": "N/A"
-                },
-                "serial": "0321017108732",
-                "uuid": "GPU-df3e8a0a-ce99-350c-b196-c3775eb32309",
-                "minor_number": "6",
-                "vbios_version": "86.00.40.00.01",
-                "multigpu_board": "No",
-                "board_id": "0x8e00",
-                "gpu_part_number": "900-2H400-0300-031",
-                "inforom_version": {
-                    "img_version": "H400.0201.00.08",
-                    "oem_object": "1.1",
-                    "ecc_object": "4.1",
-                    "pwr_object": "N/A"
-                },
-                "gpu_operation_mode": {
-                    "current_gom": "N/A",
-                    "pending_gom": "N/A"
-                },
-                "gpu_virtualization_mode": {
-                    "virtualization_mode": "None"
-                },
-                "ibmnpu": {
-                    "relaxed_ordering_mode": "N/A"
-                },
-                "pci": {
-                    "pci_bus": "8E",
-                    "pci_device": "00",
-                    "pci_domain": "0000",
-                    "pci_device_id": "15F810DE",
-                    "pci_bus_id": "00000000:8E:00.0",
-                    "pci_sub_system_id": "118F10DE",
-                    "pci_gpu_link_info": {
-                        "pcie_gen": {
-                            "max_link_gen": "3",
-                            "current_link_gen": "3"
-                        },
-                        "link_widths": {
-                            "max_link_width": "16x",
-                            "current_link_width": "16x"
-                        }
-                    },
-                    "pci_bridge_chip": {
-                        "bridge_chip_type": "N/A",
-                        "bridge_chip_fw": "N/A"
-                    },
-                    "replay_counter": "0",
-                    "tx_util": "0 KB/s",
-                    "rx_util": "0 KB/s"
-                },
-                "fan_speed": "N/A",
-                "performance_state": "P0",
-                "clocks_throttle_reasons": {
-                    "clocks_throttle_reason_gpu_idle": "Not Active",
-                    "clocks_throttle_reason_applications_clocks_setting": "Not Active",
-                    "clocks_throttle_reason_sw_power_cap": "Not Active",
-                    "clocks_throttle_reason_hw_slowdown": "Not Active",
-                    "clocks_throttle_reason_hw_thermal_slowdown": "Not Active",
-                    "clocks_throttle_reason_hw_power_brake_slowdown": "Not Active",
-                    "clocks_throttle_reason_sync_boost": "Not Active",
-                    "clocks_throttle_reason_sw_thermal_slowdown": "Not Active"
-                },
-                "fb_memory_usage": {
-                    "total": "16280 MiB",
-                    "used": "16239 MiB",
-                    "free": "41 MiB"
-                },
-                "bar1_memory_usage": {
-                    "total": "16384 MiB",
-                    "used": "2 MiB",
-                    "free": "16382 MiB"
-                },
-                "compute_mode": "Default",
-                "utilization": {
-                    "gpu_util": "0 %",
-                    "memory_util": "0 %",
-                    "encoder_util": "0 %",
-                    "decoder_util": "0 %"
-                },
-                "encoder_stats": {
-                    "session_count": "0",
-                    "average_fps": "0",
-                    "average_latency": "0"
-                },
-                "ecc_mode": {
-                    "current_ecc": "Enabled",
-                    "pending_ecc": "Enabled"
-                },
-                "ecc_errors": {
-                    "volatile": {
-                        "single_bit": {
-                            "device_memory": "0",
-                            "register_file": "0",
-                            "l1_cache": "N/A",
-                            "l2_cache": "0",
-                            "texture_memory": "0",
-                            "texture_shm": "0",
-                            "cbu": "N/A",
-                            "total": "0"
-                        },
-                        "double_bit": {
-                            "device_memory": "0",
-                            "register_file": "0",
-                            "l1_cache": "N/A",
-                            "l2_cache": "0",
-                            "texture_memory": "0",
-                            "texture_shm": "0",
-                            "cbu": "N/A",
-                            "total": "0"
-                        }
-                    },
-                    "aggregate": {
-                        "single_bit": {
-                            "device_memory": "0",
-                            "register_file": "0",
-                            "l1_cache": "N/A",
-                            "l2_cache": "0",
-                            "texture_memory": "0",
-                            "texture_shm": "0",
-                            "cbu": "N/A",
-                            "total": "0"
-                        },
-                        "double_bit": {
-                            "device_memory": "0",
-                            "register_file": "0",
-                            "l1_cache": "N/A",
-                            "l2_cache": "0",
-                            "texture_memory": "0",
-                            "texture_shm": "0",
-                            "cbu": "N/A",
-                            "total": "0"
-                        }
-                    }
-                },
-                "retired_pages": {
-                    "multiple_single_bit_retirement": {
-                        "retired_count": "0",
-                        "retired_page_addresses": "\n\t\t\t\t"
-                    },
-                    "double_bit_retirement": {
-                        "retired_count": "0",
-                        "retired_page_addresses": "\n\t\t\t\t"
-                    },
-                    "pending_retirement": "No"
-                },
-                "temperature": {
-                    "gpu_temp": "33 C",
-                    "gpu_temp_max_threshold": "85 C",
-                    "gpu_temp_slow_threshold": "82 C",
-                    "gpu_temp_max_gpu_threshold": "N/A",
-                    "memory_temp": "N/A",
-                    "gpu_temp_max_mem_threshold": "N/A"
-                },
-                "power_readings": {
-                    "power_state": "P0",
-                    "power_management": "Supported",
-                    "power_draw": "37.29 W",
-                    "power_limit": "250.00 W",
-                    "default_power_limit": "250.00 W",
-                    "enforced_power_limit": "250.00 W",
-                    "min_power_limit": "125.00 W",
-                    "max_power_limit": "250.00 W"
-                },
-                "clocks": {
-                    "graphics_clock": "1328 MHz",
-                    "sm_clock": "1328 MHz",
-                    "mem_clock": "715 MHz",
-                    "video_clock": "1189 MHz"
-                },
-                "applications_clocks": {
-                    "graphics_clock": "1189 MHz",
-                    "mem_clock": "715 MHz"
-                },
-                "default_applications_clocks": {
-                    "graphics_clock": "1189 MHz",
-                    "mem_clock": "715 MHz"
-                },
-                "max_clocks": {
-                    "graphics_clock": "1328 MHz",
-                    "sm_clock": "1328 MHz",
-                    "mem_clock": "715 MHz",
-                    "video_clock": "1328 MHz"
-                },
-                "max_customer_boost_clocks": {
-                    "graphics_clock": "1328 MHz"
-                },
-                "clock_policy": {
-                    "auto_boost": "N/A",
-                    "auto_boost_default": "N/A"
-                },
-                "supported_clocks": {
-                    "supported_mem_clock": {
-                        "value": "715 MHz",
-                        "supported_graphics_clock": [
-                            "1328 MHz",
-                            "1316 MHz",
-                            "1303 MHz",
-                            ...
-                        ]
-                    }
-                },
-                "processes": {
-                    "process_info": {
-                        "pid": "40788",
-                        "type": "C",
-                        "process_name": "python",
-                        "used_memory": "16229 MiB"
-                    }
-                },
-                "accounted_processes": "\n\t\t"
-            },
-            ...
-        ]
-    }
-}*/
 /**
 * GPUScheduler
@@ -290,29 +38,43 @@ class GPUScheduler {
    private gpuSummary!: GPUSummary;
    private stopping: boolean;
    private log: Logger;
-    private nvdmNotFoundRegex: RegExp;
+    private gpuMetricCollectorScriptFolder: string;
    constructor() {
        this.stopping = false;
        this.log = getLogger();
-        this.nvdmNotFoundRegex = /nvidia-smi: not found/gi;
+        this.gpuMetricCollectorScriptFolder = `${os.tmpdir()}/nni/script`;
    }
    public async run(): Promise<void> {
+        await this.runGpuMetricsCollectorScript();
        while (!this.stopping) {
            try {
-                this.gpuSummary = await this.readGPUSummary();
+                await this.updateGPUSummary();
            } catch (error) {
                this.log.error('Read GPU summary failed with error: ', error);
-                // If nvidia-smi command is not found, break the gpu summary reading loop to avoid unnecessary periodically checking
-                if(this.nvdmNotFoundRegex.test(error)) {
-                    break;
-                }
            }
            await delay(5000);
        }
    }
+    /**
+     * Generate the gpu metric collector shell script on the local machine
+     * and launch it with bash in the background, without waiting for it to exit.
+     */
+    private async runGpuMetricsCollectorScript(): Promise<void> {
+        await cpp.exec(`mkdir -p ${this.gpuMetricCollectorScriptFolder}`);
+        //generate gpu_metrics_collector.sh
+        let gpuMetricsCollectorScriptPath: string = path.join(this.gpuMetricCollectorScriptFolder, 'gpu_metrics_collector.sh');
+        const gpuMetricsCollectorScriptContent: string = String.Format(
+            GPU_INFO_COLLECTOR_FORMAT,
+            this.gpuMetricCollectorScriptFolder,
+            path.join(this.gpuMetricCollectorScriptFolder, 'pid'),
+        );
+        await fs.promises.writeFile(gpuMetricsCollectorScriptPath, gpuMetricsCollectorScriptContent, { encoding: 'utf8' });
+        cp.exec(`bash ${gpuMetricsCollectorScriptPath}`);
+    }
    public getAvailableGPUIndices(): number[] {
        if (this.gpuSummary !== undefined) {
            return this.gpuSummary.gpuInfos.filter((info: GPUInfo) => info.activeProcessNum === 0).map((info: GPUInfo) => info.index);
@@ -321,51 +83,20 @@ class GPUScheduler {
        return [];
    }
-    public stop(): void {
+    public async stop() {
        this.stopping = true;
+        const pid: string = await fs.promises.readFile(path.join(this.gpuMetricCollectorScriptFolder, 'pid'), 'utf8');
+        await cpp.exec(`pkill -P ${pid}`);
+        await cpp.exec(`rm -rf ${this.gpuMetricCollectorScriptFolder}`);
    }
+    private async updateGPUSummary() {
-    private generateEmbededGPUSummary(data: nodeNvidiaSmi.GPUInfo) : GPUInfo[] {
+        const cmdresult = await cpp.exec(`tail -n 1 ${path.join(this.gpuMetricCollectorScriptFolder, 'gpu_metrics')}`);
-        let gpuInfos : GPUInfo[] = [];
+        if(cmdresult && cmdresult.stdout) {
-        const gpuNumber : number = parseInt(data.nvidia_smi_log.attached_gpus, 10);
+            this.gpuSummary = <GPUSummary>JSON.parse(cmdresult.stdout);
-        assert(gpuNumber > 0);
-        if(gpuNumber == 1) {
-            const embededGPUSummary = <nodeNvidiaSmi.EmbededGPUSummary>data.nvidia_smi_log.gpu;
-            gpuInfos.push(this.convertGPUSummaryToInfo(embededGPUSummary));
        } else {
-            const embededGPUSummaryArray = <nodeNvidiaSmi.EmbededGPUSummary[]>data.nvidia_smi_log.gpu;
+            this.log.error('Could not get gpu metrics information!');
-            gpuInfos = embededGPUSummaryArray.map(embededGPUSummary => this.convertGPUSummaryToInfo(embededGPUSummary));
        }
-        return gpuInfos;
-    }
-    private convertGPUSummaryToInfo(embededGPUSummary : nodeNvidiaSmi.EmbededGPUSummary) : GPUInfo {
-        return new GPUInfo(
-            typeof embededGPUSummary.process === 'object' ? 1 : 0,
-            parseFloat(embededGPUSummary.utilization.memory_util),
-            parseFloat(embededGPUSummary.utilization.gpu_util),
-            parseInt(embededGPUSummary.minor_number, 10));
-    }
-    private readGPUSummary(): Promise<GPUSummary> {
-        return new Promise((resolve: Function, reject: Function): void => {
-            nodeNvidiaSmi((error: Error, data: nodeNvidiaSmi.GPUInfo) => {
-                if (error) {
-                    reject(error);
-                } else {
-                    const gpuNumber : number = parseInt(data.nvidia_smi_log.attached_gpus, 10);
-                    const gpuSummary: GPUSummary = new GPUSummary(
-                        gpuNumber,
-                        Date().toString(),
-                        this.generateEmbededGPUSummary(data)
-                    );
-                    resolve(gpuSummary);
-                }
-            });
-        });
    }
 }

--- a/src/nni_manager/training_service/local/localTrainingServiceForGPU.ts
+++ b/src/nni_manager/training_service/local/localTrainingServiceForGPU.ts
@@ -69,9 +69,9 @@ class LocalTrainingServiceForGPU extends LocalTrainingService {
        }
    }
-    public cleanUp(): Promise<void> {
+    public async cleanUp(): Promise<void> {
        if (this.gpuScheduler !== undefined) {
-            this.gpuScheduler.stop();
+            await this.gpuScheduler.stop();
        }
        return super.cleanUp();

--- a/src/nni_manager/training_service/remote_machine/remoteMachineData.ts
+++ b/src/nni_manager/training_service/remote_machine/remoteMachineData.ts
@@ -259,11 +259,3 @@ cd {0}
 echo $$ >{1}
 eval {2} >stdout 2>stderr
 echo $? \`date +%s%3N\` >{3}`;
-export const GPU_COLLECTOR_FORMAT: string = 
-`
-#!/bin/bash
-export METRIC_OUTPUT_DIR={0}
-echo $$ >{1}
-python3 -m nni_gpu_tool.gpu_metrics_collector
-`
--- a/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
+++ b/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
@@ -44,9 +44,9 @@ import { GPUScheduler } from './gpuScheduler';
 import {
    HOST_JOB_SHELL_FORMAT, RemoteCommandResult, RemoteMachineMeta,
    RemoteMachineScheduleInfo, RemoteMachineScheduleResult, SSHClient, SSHClientManager,
-    RemoteMachineTrialJobDetail, ScheduleResultType, REMOTEMACHINE_TRIAL_COMMAND_FORMAT, 
+    RemoteMachineTrialJobDetail, ScheduleResultType, REMOTEMACHINE_TRIAL_COMMAND_FORMAT
-    GPU_COLLECTOR_FORMAT
 } from './remoteMachineData';
+import { GPU_INFO_COLLECTOR_FORMAT } from '../common/gpuData';
 import { SSHClientUtility } from './sshClientUtility';
 import { validateCodeDir } from '../common/util';
 import { RemoteMachineJobRestServer } from './remoteMachineJobRestServer';
@@ -452,7 +452,7 @@ class RemoteMachineTrainingService implements TrainingService {
        let gpuMetricsCollectorScriptPath: string = path.join(gpuMetricCollectorScriptFolder, userName, 'gpu_metrics_collector.sh');
        const remoteGPUScriptsDir: string = this.getRemoteScriptsPath(userName); // This directory is used to store gpu_metrics and pid created by script
        const gpuMetricsCollectorScriptContent: string = String.Format(
-            GPU_COLLECTOR_FORMAT, 
+            GPU_INFO_COLLECTOR_FORMAT, 
            remoteGPUScriptsDir, 
            path.join(remoteGPUScriptsDir, 'pid'), 
        );

--- a/src/nni_manager/types/node-nvidia-smi/index.d.ts
+++ b/src/nni_manager/types/node-nvidia-smi/index.d.ts
-declare module 'node-nvidia-smi' {   
-    function smi(callback: (error: Error, data: smi.GPUInfo) => void): void;
-    namespace smi {
-        interface EmbededGPUSummary {
-            minor_number: string;
-            utilization: {
-                gpu_util: string;
-                memory_util: string;
-            };
-            process: string | object;
-        }
-        interface GPUInfo {
-            nvidia_smi_log: {
-                attached_gpus: string;
-                gpu: EmbededGPUSummary[] | EmbededGPUSummary;
-            };
-        }
-    }
-    export = smi;
-}
\ No newline at end of file