Unverified Commit f05e685f authored by SparkSnail's avatar SparkSnail Committed by GitHub
Browse files

Refactor local gpu scheduler (#943)

parent f075aab0
...@@ -18,7 +18,6 @@ ...@@ -18,7 +18,6 @@
"express-joi-validator": "^2.0.0", "express-joi-validator": "^2.0.0",
"js-base64": "^2.4.9", "js-base64": "^2.4.9",
"kubernetes-client": "^6.5.0", "kubernetes-client": "^6.5.0",
"node-nvidia-smi": "^1.0.0",
"rx": "^4.1.0", "rx": "^4.1.0",
"sqlite3": "^4.0.2", "sqlite3": "^4.0.2",
"ssh2": "^0.6.1", "ssh2": "^0.6.1",
......
...@@ -58,3 +58,11 @@ export class GPUSummary { ...@@ -58,3 +58,11 @@ export class GPUSummary {
this.gpuInfos = gpuInfos; this.gpuInfos = gpuInfos;
} }
} }
/**
 * Template of the shell script that starts the GPU metrics collector.
 *
 * Placeholders (filled by a positional string formatter):
 *   {0} - value exported as METRIC_OUTPUT_DIR before the collector starts
 *   {1} - file that receives the PID of the shell running the script (`echo $$`)
 *
 * The text deliberately begins and ends with a newline, so `#!/bin/bash` sits on the
 * second line of the generated script.
 */
export const GPU_INFO_COLLECTOR_FORMAT: string = [
    '',
    '#!/bin/bash',
    'export METRIC_OUTPUT_DIR={0}',
    'echo $$ >{1}',
    'python3 -m nni_gpu_tool.gpu_metrics_collector',
    ''
].join('\n');
...@@ -19,268 +19,16 @@ ...@@ -19,268 +19,16 @@
'use strict'; 'use strict';
import * as assert from 'assert';
import * as nodeNvidiaSmi from 'node-nvidia-smi';
import { delay } from '../../common/utils'; import { delay } from '../../common/utils';
import { GPUInfo, GPUSummary } from '../common/gpuData'; import { GPUInfo, GPUSummary } from '../common/gpuData';
import { getLogger, Logger } from '../../common/log'; import { getLogger, Logger } from '../../common/log';
import * as cp from 'child_process';
/* Example of nvidia-smi result import * as cpp from 'child-process-promise';
{ import * as path from 'path';
"nvidia_smi_log": { import * as os from 'os';
"timestamp": "Fri Jul 13 15:17:27 2018", import * as fs from 'fs';
"driver_version": "396.26", import { String } from 'typescript-string-operations';
"attached_gpus": "8", import { GPU_INFO_COLLECTOR_FORMAT } from '../common/gpuData'
"gpu": [
...,
{
...
"minor_number": "5",
"utilization": {
"gpu_util": "100 %",
"memory_util": "27 %",
"encoder_util": "0 %",
"decoder_util": "0 %"
},
...
"processes": {
"process_info": {
"pid": "39943",
"type": "C",
"process_name": "python",
"used_memory": "16229 MiB"
}
},
...
},
{
"$": {
"id": "00000000:8E:00.0"
},
"product_name": "Tesla P100-PCIE-16GB",
"product_brand": "Tesla",
"display_mode": "Enabled",
"display_active": "Disabled",
"persistence_mode": "Disabled",
"accounting_mode": "Disabled",
"accounting_mode_buffer_size": "4000",
"driver_model": {
"current_dm": "N/A",
"pending_dm": "N/A"
},
"serial": "0321017108732",
"uuid": "GPU-df3e8a0a-ce99-350c-b196-c3775eb32309",
"minor_number": "6",
"vbios_version": "86.00.40.00.01",
"multigpu_board": "No",
"board_id": "0x8e00",
"gpu_part_number": "900-2H400-0300-031",
"inforom_version": {
"img_version": "H400.0201.00.08",
"oem_object": "1.1",
"ecc_object": "4.1",
"pwr_object": "N/A"
},
"gpu_operation_mode": {
"current_gom": "N/A",
"pending_gom": "N/A"
},
"gpu_virtualization_mode": {
"virtualization_mode": "None"
},
"ibmnpu": {
"relaxed_ordering_mode": "N/A"
},
"pci": {
"pci_bus": "8E",
"pci_device": "00",
"pci_domain": "0000",
"pci_device_id": "15F810DE",
"pci_bus_id": "00000000:8E:00.0",
"pci_sub_system_id": "118F10DE",
"pci_gpu_link_info": {
"pcie_gen": {
"max_link_gen": "3",
"current_link_gen": "3"
},
"link_widths": {
"max_link_width": "16x",
"current_link_width": "16x"
}
},
"pci_bridge_chip": {
"bridge_chip_type": "N/A",
"bridge_chip_fw": "N/A"
},
"replay_counter": "0",
"tx_util": "0 KB/s",
"rx_util": "0 KB/s"
},
"fan_speed": "N/A",
"performance_state": "P0",
"clocks_throttle_reasons": {
"clocks_throttle_reason_gpu_idle": "Not Active",
"clocks_throttle_reason_applications_clocks_setting": "Not Active",
"clocks_throttle_reason_sw_power_cap": "Not Active",
"clocks_throttle_reason_hw_slowdown": "Not Active",
"clocks_throttle_reason_hw_thermal_slowdown": "Not Active",
"clocks_throttle_reason_hw_power_brake_slowdown": "Not Active",
"clocks_throttle_reason_sync_boost": "Not Active",
"clocks_throttle_reason_sw_thermal_slowdown": "Not Active"
},
"fb_memory_usage": {
"total": "16280 MiB",
"used": "16239 MiB",
"free": "41 MiB"
},
"bar1_memory_usage": {
"total": "16384 MiB",
"used": "2 MiB",
"free": "16382 MiB"
},
"compute_mode": "Default",
"utilization": {
"gpu_util": "0 %",
"memory_util": "0 %",
"encoder_util": "0 %",
"decoder_util": "0 %"
},
"encoder_stats": {
"session_count": "0",
"average_fps": "0",
"average_latency": "0"
},
"ecc_mode": {
"current_ecc": "Enabled",
"pending_ecc": "Enabled"
},
"ecc_errors": {
"volatile": {
"single_bit": {
"device_memory": "0",
"register_file": "0",
"l1_cache": "N/A",
"l2_cache": "0",
"texture_memory": "0",
"texture_shm": "0",
"cbu": "N/A",
"total": "0"
},
"double_bit": {
"device_memory": "0",
"register_file": "0",
"l1_cache": "N/A",
"l2_cache": "0",
"texture_memory": "0",
"texture_shm": "0",
"cbu": "N/A",
"total": "0"
}
},
"aggregate": {
"single_bit": {
"device_memory": "0",
"register_file": "0",
"l1_cache": "N/A",
"l2_cache": "0",
"texture_memory": "0",
"texture_shm": "0",
"cbu": "N/A",
"total": "0"
},
"double_bit": {
"device_memory": "0",
"register_file": "0",
"l1_cache": "N/A",
"l2_cache": "0",
"texture_memory": "0",
"texture_shm": "0",
"cbu": "N/A",
"total": "0"
}
}
},
"retired_pages": {
"multiple_single_bit_retirement": {
"retired_count": "0",
"retired_page_addresses": "\n\t\t\t\t"
},
"double_bit_retirement": {
"retired_count": "0",
"retired_page_addresses": "\n\t\t\t\t"
},
"pending_retirement": "No"
},
"temperature": {
"gpu_temp": "33 C",
"gpu_temp_max_threshold": "85 C",
"gpu_temp_slow_threshold": "82 C",
"gpu_temp_max_gpu_threshold": "N/A",
"memory_temp": "N/A",
"gpu_temp_max_mem_threshold": "N/A"
},
"power_readings": {
"power_state": "P0",
"power_management": "Supported",
"power_draw": "37.29 W",
"power_limit": "250.00 W",
"default_power_limit": "250.00 W",
"enforced_power_limit": "250.00 W",
"min_power_limit": "125.00 W",
"max_power_limit": "250.00 W"
},
"clocks": {
"graphics_clock": "1328 MHz",
"sm_clock": "1328 MHz",
"mem_clock": "715 MHz",
"video_clock": "1189 MHz"
},
"applications_clocks": {
"graphics_clock": "1189 MHz",
"mem_clock": "715 MHz"
},
"default_applications_clocks": {
"graphics_clock": "1189 MHz",
"mem_clock": "715 MHz"
},
"max_clocks": {
"graphics_clock": "1328 MHz",
"sm_clock": "1328 MHz",
"mem_clock": "715 MHz",
"video_clock": "1328 MHz"
},
"max_customer_boost_clocks": {
"graphics_clock": "1328 MHz"
},
"clock_policy": {
"auto_boost": "N/A",
"auto_boost_default": "N/A"
},
"supported_clocks": {
"supported_mem_clock": {
"value": "715 MHz",
"supported_graphics_clock": [
"1328 MHz",
"1316 MHz",
"1303 MHz",
...
]
}
},
"processes": {
"process_info": {
"pid": "40788",
"type": "C",
"process_name": "python",
"used_memory": "16229 MiB"
}
},
"accounted_processes": "\n\t\t"
},
...
]
}
}*/
/** /**
* GPUScheduler * GPUScheduler
...@@ -290,29 +38,43 @@ class GPUScheduler { ...@@ -290,29 +38,43 @@ class GPUScheduler {
private gpuSummary!: GPUSummary; private gpuSummary!: GPUSummary;
private stopping: boolean; private stopping: boolean;
private log: Logger; private log: Logger;
private nvdmNotFoundRegex: RegExp; private gpuMetricCollectorScriptFolder: string;
constructor() { constructor() {
this.stopping = false; this.stopping = false;
this.log = getLogger(); this.log = getLogger();
this.nvdmNotFoundRegex = /nvidia-smi: not found/gi; this.gpuMetricCollectorScriptFolder = `${os.tmpdir()}/nni/script`;
} }
public async run(): Promise<void> { public async run(): Promise<void> {
await this.runGpuMetricsCollectorScript();
while (!this.stopping) { while (!this.stopping) {
try { try {
this.gpuSummary = await this.readGPUSummary(); await this.updateGPUSummary();
} catch (error) { } catch (error) {
this.log.error('Read GPU summary failed with error: ', error); this.log.error('Read GPU summary failed with error: ', error);
// If nvidia-smi command is not found, break the gpu summary reading loop to avoid unnecessary periodically checking
if(this.nvdmNotFoundRegex.test(error)) {
break;
}
} }
await delay(5000); await delay(5000);
} }
} }
/**
 * Generate the gpu metrics collector shell script inside the local script folder
 * and start it with bash on this machine.
 *
 * The script is rendered from GPU_INFO_COLLECTOR_FORMAT with the script folder as the
 * metric output directory and `<script folder>/pid` as the PID file.
 *
 * The returned promise resolves once the script file has been written and the bash
 * process has been spawned; it does not wait for the script to finish.
 * Rejects if creating the folder or writing the script file fails.
 */
private async runGpuMetricsCollectorScript(): Promise<void> {
    await cpp.exec(`mkdir -p ${this.gpuMetricCollectorScriptFolder}`);
    // generate gpu_metrics_collector.sh
    const gpuMetricsCollectorScriptPath: string = path.join(this.gpuMetricCollectorScriptFolder, 'gpu_metrics_collector.sh');
    const gpuMetricsCollectorScriptContent: string = String.Format(
        GPU_INFO_COLLECTOR_FORMAT,
        this.gpuMetricCollectorScriptFolder,
        path.join(this.gpuMetricCollectorScriptFolder, 'pid')
    );
    await fs.promises.writeFile(gpuMetricsCollectorScriptPath, gpuMetricsCollectorScriptContent, { encoding: 'utf8' });
    // Intentionally not awaited: the script keeps running in the background.
    // The callback surfaces launch/runtime failures (e.g. python3 or the collector module
    // missing) that would otherwise be dropped silently. Errors reported after stop() has
    // been requested are ignored, because stop() kills the script's child processes.
    cp.exec(`bash ${gpuMetricsCollectorScriptPath}`, (error: Error | null): void => {
        if (error !== null && !this.stopping) {
            this.log.error('GPU metrics collector script exited with error: ', error);
        }
    });
}
public getAvailableGPUIndices(): number[] { public getAvailableGPUIndices(): number[] {
if (this.gpuSummary !== undefined) { if (this.gpuSummary !== undefined) {
return this.gpuSummary.gpuInfos.filter((info: GPUInfo) => info.activeProcessNum === 0).map((info: GPUInfo) => info.index); return this.gpuSummary.gpuInfos.filter((info: GPUInfo) => info.activeProcessNum === 0).map((info: GPUInfo) => info.index);
...@@ -321,51 +83,20 @@ class GPUScheduler { ...@@ -321,51 +83,20 @@ class GPUScheduler {
return []; return [];
} }
public stop(): void { public async stop() {
this.stopping = true; this.stopping = true;
const pid: string = await fs.promises.readFile(path.join(this.gpuMetricCollectorScriptFolder, 'pid'), 'utf8');
await cpp.exec(`pkill -P ${pid}`);
await cpp.exec(`rm -rf ${this.gpuMetricCollectorScriptFolder}`);
} }
private async updateGPUSummary() {
private generateEmbededGPUSummary(data: nodeNvidiaSmi.GPUInfo) : GPUInfo[] { const cmdresult = await cpp.exec(`tail -n 1 ${path.join(this.gpuMetricCollectorScriptFolder, 'gpu_metrics')}`);
let gpuInfos : GPUInfo[] = []; if(cmdresult && cmdresult.stdout) {
const gpuNumber : number = parseInt(data.nvidia_smi_log.attached_gpus, 10); this.gpuSummary = <GPUSummary>JSON.parse(cmdresult.stdout);
assert(gpuNumber > 0);
if(gpuNumber == 1) {
const embededGPUSummary = <nodeNvidiaSmi.EmbededGPUSummary>data.nvidia_smi_log.gpu;
gpuInfos.push(this.convertGPUSummaryToInfo(embededGPUSummary));
} else { } else {
const embededGPUSummaryArray = <nodeNvidiaSmi.EmbededGPUSummary[]>data.nvidia_smi_log.gpu; this.log.error('Could not get gpu metrics information!');
gpuInfos = embededGPUSummaryArray.map(embededGPUSummary => this.convertGPUSummaryToInfo(embededGPUSummary));
} }
return gpuInfos;
}
private convertGPUSummaryToInfo(embededGPUSummary : nodeNvidiaSmi.EmbededGPUSummary) : GPUInfo {
return new GPUInfo(
typeof embededGPUSummary.process === 'object' ? 1 : 0,
parseFloat(embededGPUSummary.utilization.memory_util),
parseFloat(embededGPUSummary.utilization.gpu_util),
parseInt(embededGPUSummary.minor_number, 10));
}
private readGPUSummary(): Promise<GPUSummary> {
return new Promise((resolve: Function, reject: Function): void => {
nodeNvidiaSmi((error: Error, data: nodeNvidiaSmi.GPUInfo) => {
if (error) {
reject(error);
} else {
const gpuNumber : number = parseInt(data.nvidia_smi_log.attached_gpus, 10);
const gpuSummary: GPUSummary = new GPUSummary(
gpuNumber,
Date().toString(),
this.generateEmbededGPUSummary(data)
);
resolve(gpuSummary);
}
});
});
} }
} }
......
...@@ -69,9 +69,9 @@ class LocalTrainingServiceForGPU extends LocalTrainingService { ...@@ -69,9 +69,9 @@ class LocalTrainingServiceForGPU extends LocalTrainingService {
} }
} }
public cleanUp(): Promise<void> { public async cleanUp(): Promise<void> {
if (this.gpuScheduler !== undefined) { if (this.gpuScheduler !== undefined) {
this.gpuScheduler.stop(); await this.gpuScheduler.stop();
} }
return super.cleanUp(); return super.cleanUp();
......
...@@ -259,11 +259,3 @@ cd {0} ...@@ -259,11 +259,3 @@ cd {0}
echo $$ >{1} echo $$ >{1}
eval {2} >stdout 2>stderr eval {2} >stdout 2>stderr
echo $? \`date +%s%3N\` >{3}`; echo $? \`date +%s%3N\` >{3}`;
export const GPU_COLLECTOR_FORMAT: string =
`
#!/bin/bash
export METRIC_OUTPUT_DIR={0}
echo $$ >{1}
python3 -m nni_gpu_tool.gpu_metrics_collector
`
...@@ -44,9 +44,9 @@ import { GPUScheduler } from './gpuScheduler'; ...@@ -44,9 +44,9 @@ import { GPUScheduler } from './gpuScheduler';
import { import {
HOST_JOB_SHELL_FORMAT, RemoteCommandResult, RemoteMachineMeta, HOST_JOB_SHELL_FORMAT, RemoteCommandResult, RemoteMachineMeta,
RemoteMachineScheduleInfo, RemoteMachineScheduleResult, SSHClient, SSHClientManager, RemoteMachineScheduleInfo, RemoteMachineScheduleResult, SSHClient, SSHClientManager,
RemoteMachineTrialJobDetail, ScheduleResultType, REMOTEMACHINE_TRIAL_COMMAND_FORMAT, RemoteMachineTrialJobDetail, ScheduleResultType, REMOTEMACHINE_TRIAL_COMMAND_FORMAT
GPU_COLLECTOR_FORMAT
} from './remoteMachineData'; } from './remoteMachineData';
import { GPU_INFO_COLLECTOR_FORMAT } from '../common/gpuData';
import { SSHClientUtility } from './sshClientUtility'; import { SSHClientUtility } from './sshClientUtility';
import { validateCodeDir } from '../common/util'; import { validateCodeDir } from '../common/util';
import { RemoteMachineJobRestServer } from './remoteMachineJobRestServer'; import { RemoteMachineJobRestServer } from './remoteMachineJobRestServer';
...@@ -452,7 +452,7 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -452,7 +452,7 @@ class RemoteMachineTrainingService implements TrainingService {
let gpuMetricsCollectorScriptPath: string = path.join(gpuMetricCollectorScriptFolder, userName, 'gpu_metrics_collector.sh'); let gpuMetricsCollectorScriptPath: string = path.join(gpuMetricCollectorScriptFolder, userName, 'gpu_metrics_collector.sh');
const remoteGPUScriptsDir: string = this.getRemoteScriptsPath(userName); // This directory is used to store gpu_metrics and pid created by script const remoteGPUScriptsDir: string = this.getRemoteScriptsPath(userName); // This directory is used to store gpu_metrics and pid created by script
const gpuMetricsCollectorScriptContent: string = String.Format( const gpuMetricsCollectorScriptContent: string = String.Format(
GPU_COLLECTOR_FORMAT, GPU_INFO_COLLECTOR_FORMAT,
remoteGPUScriptsDir, remoteGPUScriptsDir,
path.join(remoteGPUScriptsDir, 'pid'), path.join(remoteGPUScriptsDir, 'pid'),
); );
......
// Ambient type declarations for the untyped 'node-nvidia-smi' package.
// Only the fields listed here are declared; other fields of the nvidia-smi output are not typed.
declare module 'node-nvidia-smi' {
// Callback-style entry point: `callback` receives an error and the parsed nvidia-smi data.
function smi(callback: (error: Error, data: smi.GPUInfo) => void): void;
namespace smi {
// Per-GPU entry of the nvidia-smi log.
interface EmbededGPUSummary {
// Declared as a string (e.g. "5" in sample output); callers must parse it themselves.
minor_number: string;
utilization: {
// Strings that carry a unit suffix in sample output, e.g. "100 %".
gpu_util: string;
memory_util: string;
};
// NOTE(review): sample nvidia-smi output uses the key `processes`, not `process` — confirm this field name.
process: string | object;
}
// Top-level shape of the data handed to the callback.
interface GPUInfo {
nvidia_smi_log: {
// GPU count as a string (e.g. "8" in sample output).
attached_gpus: string;
// Either a list of GPU entries or a single entry; consumers must handle both shapes.
gpu: EmbededGPUSummary[] | EmbededGPUSummary;
};
}
}
export = smi;
}
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment