Unverified Commit afce6d4a authored by fishyds's avatar fishyds Committed by GitHub
Browse files

Merge pull request #950 from Microsoft/v0.6

Merge V0.6 branch to master
parents 6545540d 29a23335
...@@ -98,3 +98,6 @@ Trial configuration in frameworkcontroller mode have the following configuration ...@@ -98,3 +98,6 @@ Trial configuration in frameworkcontroller mode have the following configuration
## How to run example ## How to run example
After you prepare a config file, you could run your experiment by nnictl. The way to start an experiment on frameworkcontroller is similar to kubeflow, please refer the [document](./KubeflowMode.md) for more information. After you prepare a config file, you could run your experiment by nnictl. The way to start an experiment on frameworkcontroller is similar to kubeflow, please refer the [document](./KubeflowMode.md) for more information.
## version check
NNI supports the version check feature since version 0.6, see [reference](PAIMode.md)
\ No newline at end of file
...@@ -196,4 +196,7 @@ Notice: In kubeflow mode, NNIManager will start a rest server and listen on a po ...@@ -196,4 +196,7 @@ Notice: In kubeflow mode, NNIManager will start a rest server and listen on a po
Once a trial job is completed, you can goto NNI WebUI's overview page (like http://localhost:8080/oview) to check trial's information. Once a trial job is completed, you can goto NNI WebUI's overview page (like http://localhost:8080/oview) to check trial's information.
## version check
NNI supports the version check feature since version 0.6, see [reference](PAIMode.md)
Any problems when using NNI in kubeflow mode, please create issues on [NNI Github repo](https://github.com/Microsoft/nni). Any problems when using NNI in kubeflow mode, please create issues on [NNI Github repo](https://github.com/Microsoft/nni).
...@@ -83,3 +83,13 @@ You can see there're three fils in output folder: stderr, stdout, and trial.log ...@@ -83,3 +83,13 @@ You can see there're three fils in output folder: stderr, stdout, and trial.log
If you also want to save trial's other output into HDFS, like model files, you can use environment variable `NNI_OUTPUT_DIR` in your trial code to save your own output files, and NNI SDK will copy all the files in `NNI_OUTPUT_DIR` from trial's container to HDFS. If you also want to save trial's other output into HDFS, like model files, you can use environment variable `NNI_OUTPUT_DIR` in your trial code to save your own output files, and NNI SDK will copy all the files in `NNI_OUTPUT_DIR` from trial's container to HDFS.
Any problems when using NNI in pai mode, please create issues on [NNI github repo](https://github.com/Microsoft/nni). Any problems when using NNI in pai mode, please create issues on [NNI github repo](https://github.com/Microsoft/nni).
## version check
NNI supports the version check feature since version 0.6. It is a policy to ensure that the version of NNIManager is consistent with that of trialKeeper, and to avoid errors caused by version incompatibility.
Check policy:
1. NNIManager before v0.6 could run any version of trialKeeper; trialKeeper supports backward compatibility.
2. Since version 0.6, the NNIManager version should be kept the same as the trialKeeper version. For example, if the NNIManager version is 0.6, the trialKeeper version should be 0.6 too.
3. Note that the version check feature only checks the first two digits of the version. For example, NNIManager v0.6.1 could use trialKeeper v0.6 or trialKeeper v0.6.2, but could not use trialKeeper v0.5.1 or trialKeeper v0.7.
If you could not run your experiment and want to know if it is caused by version check, you could check your webUI, and there will be an error message about version check.
![](../img/version_check.png)
\ No newline at end of file
...@@ -63,3 +63,6 @@ nnictl create --config ~/nni/examples/trials/mnist-annotation/config_remote.yml ...@@ -63,3 +63,6 @@ nnictl create --config ~/nni/examples/trials/mnist-annotation/config_remote.yml
``` ```
to start the experiment. to start the experiment.
## version check
NNI supports the version check feature since version 0.6, see [reference](PAIMode.md)
\ No newline at end of file
...@@ -18,7 +18,6 @@ ...@@ -18,7 +18,6 @@
"express-joi-validator": "^2.0.0", "express-joi-validator": "^2.0.0",
"js-base64": "^2.4.9", "js-base64": "^2.4.9",
"kubernetes-client": "^6.5.0", "kubernetes-client": "^6.5.0",
"node-nvidia-smi": "^1.0.0",
"rx": "^4.1.0", "rx": "^4.1.0",
"sqlite3": "^4.0.2", "sqlite3": "^4.0.2",
"ssh2": "^0.6.1", "ssh2": "^0.6.1",
......
...@@ -41,6 +41,10 @@ export abstract class ClusterJobRestServer extends RestServer{ ...@@ -41,6 +41,10 @@ export abstract class ClusterJobRestServer extends RestServer{
private readonly expId: string = getExperimentId(); private readonly expId: string = getExperimentId();
private enableVersionCheck: boolean = true; //switch to enable version check
private versionCheckSuccess: boolean | undefined;
private errorMessage?: string;
/** /**
* constructor to provide NNIRestServer's own rest property, e.g. port * constructor to provide NNIRestServer's own rest property, e.g. port
*/ */
...@@ -59,6 +63,14 @@ export abstract class ClusterJobRestServer extends RestServer{ ...@@ -59,6 +63,14 @@ export abstract class ClusterJobRestServer extends RestServer{
return this.port; return this.port;
} }
/**
 * Latest error recorded by the rest server (e.g. a failed version check),
 * or undefined when no error has occurred so far.
 */
public get getErrorMessage(): string | undefined{
    return this.errorMessage;
}
/**
 * Enable or disable the NNIManager/trialKeeper version check
 * performed on reports posted to the /version endpoint.
 */
public set setEnableVersionCheck(versionCheck: boolean) {
    this.enableVersionCheck = versionCheck;
}
/** /**
* NNIRestServer's own router registration * NNIRestServer's own router registration
*/ */
...@@ -77,6 +89,31 @@ export abstract class ClusterJobRestServer extends RestServer{ ...@@ -77,6 +89,31 @@ export abstract class ClusterJobRestServer extends RestServer{
next(); next();
}); });
// Receive version-check reports posted by trialKeeper for each trial.
// A report with tag 'VCSuccess' means the trialKeeper version matched;
// any other tag carries a failure message in req.body.msg.
router.post(`/version/${this.expId}/:trialId`, (req: Request, res: Response) => {
    if (this.enableVersionCheck) {
        try {
            const checkResultSuccess: boolean = req.body.tag === 'VCSuccess';
            if (this.versionCheckSuccess !== undefined && this.versionCheckSuccess !== checkResultSuccess) {
                // different trials reported contradictory results — treat as an error
                this.errorMessage = 'Version check error, version check result is inconsistent!';
                this.log.error(this.errorMessage);
            } else if (checkResultSuccess) {
                this.log.info(`Version check in trialKeeper success!`);
                this.versionCheckSuccess = true;
            } else {
                this.versionCheckSuccess = false;
                this.errorMessage = req.body.msg;
            }
        } catch (err) {
            this.log.error(`version check request processing error: ${err}`);
            res.status(500);
            res.send(err.message);
            // return here so the trailing res.send() below does not
            // attempt a second response on the same request
            return;
        }
    } else {
        this.log.info(`Skipping version check!`);
    }
    res.send();
});
router.post(`/update-metrics/${this.expId}/:trialId`, (req: Request, res: Response) => { router.post(`/update-metrics/${this.expId}/:trialId`, (req: Request, res: Response) => {
try { try {
this.log.info(`Get update-metrics request, trial job id is ${req.params.trialId}`); this.log.info(`Get update-metrics request, trial job id is ${req.params.trialId}`);
...@@ -94,6 +131,10 @@ export abstract class ClusterJobRestServer extends RestServer{ ...@@ -94,6 +131,10 @@ export abstract class ClusterJobRestServer extends RestServer{
}); });
router.post(`/stdout/${this.expId}/:trialId`, (req: Request, res: Response) => { router.post(`/stdout/${this.expId}/:trialId`, (req: Request, res: Response) => {
if(this.enableVersionCheck && !this.versionCheckSuccess && !this.errorMessage) {
this.errorMessage = `Version check failed, didn't get version check response from trialKeeper, please check your NNI version in `
+ `NNIManager and TrialKeeper!`
}
const trialLogPath: string = path.join(getLogDir(), `trial_${req.params.trialId}.log`); const trialLogPath: string = path.join(getLogDir(), `trial_${req.params.trialId}.log`);
try { try {
let skipLogging: boolean = false; let skipLogging: boolean = false;
......
...@@ -58,3 +58,11 @@ export class GPUSummary { ...@@ -58,3 +58,11 @@ export class GPUSummary {
this.gpuInfos = gpuInfos; this.gpuInfos = gpuInfos;
} }
} }
/**
 * Shell script template that launches the NNI GPU metrics collector.
 * Placeholders (filled via String.Format):
 *   {0} - directory the collector writes metrics to (METRIC_OUTPUT_DIR)
 *   {1} - file receiving the script's shell pid, used later to kill the collector
 */
export const GPU_INFO_COLLECTOR_FORMAT: string =
`
#!/bin/bash
export METRIC_OUTPUT_DIR={0}
echo $$ >{1}
python3 -m nni_gpu_tool.gpu_metrics_collector
`;
...@@ -66,11 +66,16 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple ...@@ -66,11 +66,16 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
throw new Error('kubernetesJobRestServer not initialized!'); throw new Error('kubernetesJobRestServer not initialized!');
} }
await this.kubernetesJobRestServer.start(); await this.kubernetesJobRestServer.start();
this.kubernetesJobRestServer.setEnableVersionCheck = this.versionCheck;
this.log.info(`frameworkcontroller Training service rest server listening on: ${this.kubernetesJobRestServer.endPoint}`); this.log.info(`frameworkcontroller Training service rest server listening on: ${this.kubernetesJobRestServer.endPoint}`);
while (!this.stopping) { while (!this.stopping) {
// collect metrics for frameworkcontroller jobs by interacting with Kubernetes API server // collect metrics for frameworkcontroller jobs by interacting with Kubernetes API server
await delay(3000); await delay(3000);
await this.fcJobInfoCollector.retrieveTrialStatus(this.kubernetesCRDClient); await this.fcJobInfoCollector.retrieveTrialStatus(this.kubernetesCRDClient);
if (this.kubernetesJobRestServer.getErrorMessage) {
    // mark the service as stopping BEFORE throwing — the original order
    // left `this.stopping = true` unreachable after the throw
    this.stopping = true;
    throw new Error(this.kubernetesJobRestServer.getErrorMessage);
}
} }
} }
......
...@@ -71,11 +71,16 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber ...@@ -71,11 +71,16 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
throw new Error('kubernetesJobRestServer not initialized!'); throw new Error('kubernetesJobRestServer not initialized!');
} }
await this.kubernetesJobRestServer.start(); await this.kubernetesJobRestServer.start();
this.kubernetesJobRestServer.setEnableVersionCheck = this.versionCheck;
this.log.info(`Kubeflow Training service rest server listening on: ${this.kubernetesJobRestServer.endPoint}`); this.log.info(`Kubeflow Training service rest server listening on: ${this.kubernetesJobRestServer.endPoint}`);
while (!this.stopping) { while (!this.stopping) {
// collect metrics for Kubeflow jobs by interacting with Kubernetes API server // collect metrics for Kubeflow jobs by interacting with Kubernetes API server
await delay(3000); await delay(3000);
await this.kubeflowJobInfoCollector.retrieveTrialStatus(this.kubernetesCRDClient); await this.kubeflowJobInfoCollector.retrieveTrialStatus(this.kubernetesCRDClient);
if (this.kubernetesJobRestServer.getErrorMessage) {
    // mark the service as stopping BEFORE throwing — the original order
    // left `this.stopping = true` unreachable after the throw
    this.stopping = true;
    throw new Error(this.kubernetesJobRestServer.getErrorMessage);
}
} }
this.log.info('Kubeflow training service exit.'); this.log.info('Kubeflow training service exit.');
} }
......
...@@ -71,5 +71,5 @@ mkdir -p $NNI_OUTPUT_DIR ...@@ -71,5 +71,5 @@ mkdir -p $NNI_OUTPUT_DIR
cp -rT $NNI_CODE_DIR $NNI_SYS_DIR cp -rT $NNI_CODE_DIR $NNI_SYS_DIR
cd $NNI_SYS_DIR cd $NNI_SYS_DIR
sh install_nni.sh sh install_nni.sh
python3 -m nni_trial_tool.trial_keeper --trial_command '{8}' --nnimanager_ip {9} --nnimanager_port {10} --version '{11}' --log_collection '{12}'` python3 -m nni_trial_tool.trial_keeper --trial_command '{8}' --nnimanager_ip {9} --nnimanager_port {10} --nni_manager_version '{11}' --log_collection '{12}'`
+ `1>$NNI_OUTPUT_DIR/trialkeeper_stdout 2>$NNI_OUTPUT_DIR/trialkeeper_stderr` + `1>$NNI_OUTPUT_DIR/trialkeeper_stdout 2>$NNI_OUTPUT_DIR/trialkeeper_stderr`
...@@ -61,7 +61,7 @@ abstract class KubernetesTrainingService { ...@@ -61,7 +61,7 @@ abstract class KubernetesTrainingService {
protected kubernetesCRDClient?: KubernetesCRDClient; protected kubernetesCRDClient?: KubernetesCRDClient;
protected kubernetesJobRestServer?: KubernetesJobRestServer; protected kubernetesJobRestServer?: KubernetesJobRestServer;
protected kubernetesClusterConfig?: KubernetesClusterConfig; protected kubernetesClusterConfig?: KubernetesClusterConfig;
protected versionCheck?: boolean = true; protected versionCheck: boolean = true;
protected logCollection: string; protected logCollection: string;
constructor() { constructor() {
......
...@@ -19,268 +19,16 @@ ...@@ -19,268 +19,16 @@
'use strict'; 'use strict';
import * as assert from 'assert';
import * as nodeNvidiaSmi from 'node-nvidia-smi';
import { delay } from '../../common/utils'; import { delay } from '../../common/utils';
import { GPUInfo, GPUSummary } from '../common/gpuData'; import { GPUInfo, GPUSummary } from '../common/gpuData';
import { getLogger, Logger } from '../../common/log'; import { getLogger, Logger } from '../../common/log';
import * as cp from 'child_process';
/* Example of nvidia-smi result import * as cpp from 'child-process-promise';
{ import * as path from 'path';
"nvidia_smi_log": { import * as os from 'os';
"timestamp": "Fri Jul 13 15:17:27 2018", import * as fs from 'fs';
"driver_version": "396.26", import { String } from 'typescript-string-operations';
"attached_gpus": "8", import { GPU_INFO_COLLECTOR_FORMAT } from '../common/gpuData'
"gpu": [
...,
{
...
"minor_number": "5",
"utilization": {
"gpu_util": "100 %",
"memory_util": "27 %",
"encoder_util": "0 %",
"decoder_util": "0 %"
},
...
"processes": {
"process_info": {
"pid": "39943",
"type": "C",
"process_name": "python",
"used_memory": "16229 MiB"
}
},
...
},
{
"$": {
"id": "00000000:8E:00.0"
},
"product_name": "Tesla P100-PCIE-16GB",
"product_brand": "Tesla",
"display_mode": "Enabled",
"display_active": "Disabled",
"persistence_mode": "Disabled",
"accounting_mode": "Disabled",
"accounting_mode_buffer_size": "4000",
"driver_model": {
"current_dm": "N/A",
"pending_dm": "N/A"
},
"serial": "0321017108732",
"uuid": "GPU-df3e8a0a-ce99-350c-b196-c3775eb32309",
"minor_number": "6",
"vbios_version": "86.00.40.00.01",
"multigpu_board": "No",
"board_id": "0x8e00",
"gpu_part_number": "900-2H400-0300-031",
"inforom_version": {
"img_version": "H400.0201.00.08",
"oem_object": "1.1",
"ecc_object": "4.1",
"pwr_object": "N/A"
},
"gpu_operation_mode": {
"current_gom": "N/A",
"pending_gom": "N/A"
},
"gpu_virtualization_mode": {
"virtualization_mode": "None"
},
"ibmnpu": {
"relaxed_ordering_mode": "N/A"
},
"pci": {
"pci_bus": "8E",
"pci_device": "00",
"pci_domain": "0000",
"pci_device_id": "15F810DE",
"pci_bus_id": "00000000:8E:00.0",
"pci_sub_system_id": "118F10DE",
"pci_gpu_link_info": {
"pcie_gen": {
"max_link_gen": "3",
"current_link_gen": "3"
},
"link_widths": {
"max_link_width": "16x",
"current_link_width": "16x"
}
},
"pci_bridge_chip": {
"bridge_chip_type": "N/A",
"bridge_chip_fw": "N/A"
},
"replay_counter": "0",
"tx_util": "0 KB/s",
"rx_util": "0 KB/s"
},
"fan_speed": "N/A",
"performance_state": "P0",
"clocks_throttle_reasons": {
"clocks_throttle_reason_gpu_idle": "Not Active",
"clocks_throttle_reason_applications_clocks_setting": "Not Active",
"clocks_throttle_reason_sw_power_cap": "Not Active",
"clocks_throttle_reason_hw_slowdown": "Not Active",
"clocks_throttle_reason_hw_thermal_slowdown": "Not Active",
"clocks_throttle_reason_hw_power_brake_slowdown": "Not Active",
"clocks_throttle_reason_sync_boost": "Not Active",
"clocks_throttle_reason_sw_thermal_slowdown": "Not Active"
},
"fb_memory_usage": {
"total": "16280 MiB",
"used": "16239 MiB",
"free": "41 MiB"
},
"bar1_memory_usage": {
"total": "16384 MiB",
"used": "2 MiB",
"free": "16382 MiB"
},
"compute_mode": "Default",
"utilization": {
"gpu_util": "0 %",
"memory_util": "0 %",
"encoder_util": "0 %",
"decoder_util": "0 %"
},
"encoder_stats": {
"session_count": "0",
"average_fps": "0",
"average_latency": "0"
},
"ecc_mode": {
"current_ecc": "Enabled",
"pending_ecc": "Enabled"
},
"ecc_errors": {
"volatile": {
"single_bit": {
"device_memory": "0",
"register_file": "0",
"l1_cache": "N/A",
"l2_cache": "0",
"texture_memory": "0",
"texture_shm": "0",
"cbu": "N/A",
"total": "0"
},
"double_bit": {
"device_memory": "0",
"register_file": "0",
"l1_cache": "N/A",
"l2_cache": "0",
"texture_memory": "0",
"texture_shm": "0",
"cbu": "N/A",
"total": "0"
}
},
"aggregate": {
"single_bit": {
"device_memory": "0",
"register_file": "0",
"l1_cache": "N/A",
"l2_cache": "0",
"texture_memory": "0",
"texture_shm": "0",
"cbu": "N/A",
"total": "0"
},
"double_bit": {
"device_memory": "0",
"register_file": "0",
"l1_cache": "N/A",
"l2_cache": "0",
"texture_memory": "0",
"texture_shm": "0",
"cbu": "N/A",
"total": "0"
}
}
},
"retired_pages": {
"multiple_single_bit_retirement": {
"retired_count": "0",
"retired_page_addresses": "\n\t\t\t\t"
},
"double_bit_retirement": {
"retired_count": "0",
"retired_page_addresses": "\n\t\t\t\t"
},
"pending_retirement": "No"
},
"temperature": {
"gpu_temp": "33 C",
"gpu_temp_max_threshold": "85 C",
"gpu_temp_slow_threshold": "82 C",
"gpu_temp_max_gpu_threshold": "N/A",
"memory_temp": "N/A",
"gpu_temp_max_mem_threshold": "N/A"
},
"power_readings": {
"power_state": "P0",
"power_management": "Supported",
"power_draw": "37.29 W",
"power_limit": "250.00 W",
"default_power_limit": "250.00 W",
"enforced_power_limit": "250.00 W",
"min_power_limit": "125.00 W",
"max_power_limit": "250.00 W"
},
"clocks": {
"graphics_clock": "1328 MHz",
"sm_clock": "1328 MHz",
"mem_clock": "715 MHz",
"video_clock": "1189 MHz"
},
"applications_clocks": {
"graphics_clock": "1189 MHz",
"mem_clock": "715 MHz"
},
"default_applications_clocks": {
"graphics_clock": "1189 MHz",
"mem_clock": "715 MHz"
},
"max_clocks": {
"graphics_clock": "1328 MHz",
"sm_clock": "1328 MHz",
"mem_clock": "715 MHz",
"video_clock": "1328 MHz"
},
"max_customer_boost_clocks": {
"graphics_clock": "1328 MHz"
},
"clock_policy": {
"auto_boost": "N/A",
"auto_boost_default": "N/A"
},
"supported_clocks": {
"supported_mem_clock": {
"value": "715 MHz",
"supported_graphics_clock": [
"1328 MHz",
"1316 MHz",
"1303 MHz",
...
]
}
},
"processes": {
"process_info": {
"pid": "40788",
"type": "C",
"process_name": "python",
"used_memory": "16229 MiB"
}
},
"accounted_processes": "\n\t\t"
},
...
]
}
}*/
/** /**
* GPUScheduler * GPUScheduler
...@@ -290,29 +38,43 @@ class GPUScheduler { ...@@ -290,29 +38,43 @@ class GPUScheduler {
private gpuSummary!: GPUSummary; private gpuSummary!: GPUSummary;
private stopping: boolean; private stopping: boolean;
private log: Logger; private log: Logger;
private nvdmNotFoundRegex: RegExp; private gpuMetricCollectorScriptFolder: string;
constructor() { constructor() {
this.stopping = false; this.stopping = false;
this.log = getLogger(); this.log = getLogger();
this.nvdmNotFoundRegex = /nvidia-smi: not found/gi; this.gpuMetricCollectorScriptFolder = `${os.tmpdir()}/nni/script`;
} }
public async run(): Promise<void> { public async run(): Promise<void> {
await this.runGpuMetricsCollectorScript();
while (!this.stopping) { while (!this.stopping) {
try { try {
this.gpuSummary = await this.readGPUSummary(); await this.updateGPUSummary();
} catch (error) { } catch (error) {
this.log.error('Read GPU summary failed with error: ', error); this.log.error('Read GPU summary failed with error: ', error);
// If nvidia-smi command is not found, break the gpu summary reading loop to avoid unnecessary periodically checking
if(this.nvdmNotFoundRegex.test(error)) {
break;
}
} }
await delay(5000); await delay(5000);
} }
} }
/**
* Generate gpu metric collector shell script in local machine,
* used to run in remote machine, and will be deleted after uploaded from local.
*/
/**
 * Generate the GPU metrics collector shell script under the local temp
 * folder and launch it in the background. The script records its own pid
 * so stop() can terminate the collector process later.
 */
private async runGpuMetricsCollectorScript(): Promise<void> {
    await cpp.exec(`mkdir -p ${this.gpuMetricCollectorScriptFolder}`);
    // generate gpu_metrics_collector.sh from the shared template
    const gpuMetricsCollectorScriptPath: string = path.join(this.gpuMetricCollectorScriptFolder, 'gpu_metrics_collector.sh');
    const gpuMetricsCollectorScriptContent: string = String.Format(
        GPU_INFO_COLLECTOR_FORMAT,
        this.gpuMetricCollectorScriptFolder,
        path.join(this.gpuMetricCollectorScriptFolder, 'pid'),
    );
    await fs.promises.writeFile(gpuMetricsCollectorScriptPath, gpuMetricsCollectorScriptContent, { encoding: 'utf8' });
    // fire-and-forget: the collector runs continuously until stop() kills it
    cp.exec(`bash ${gpuMetricsCollectorScriptPath}`);
}
public getAvailableGPUIndices(): number[] { public getAvailableGPUIndices(): number[] {
if (this.gpuSummary !== undefined) { if (this.gpuSummary !== undefined) {
return this.gpuSummary.gpuInfos.filter((info: GPUInfo) => info.activeProcessNum === 0).map((info: GPUInfo) => info.index); return this.gpuSummary.gpuInfos.filter((info: GPUInfo) => info.activeProcessNum === 0).map((info: GPUInfo) => info.index);
...@@ -321,51 +83,24 @@ class GPUScheduler { ...@@ -321,51 +83,24 @@ class GPUScheduler {
return []; return [];
} }
public stop(): void { public async stop() {
this.stopping = true; this.stopping = true;
try {
const pid: string = await fs.promises.readFile(path.join(this.gpuMetricCollectorScriptFolder, 'pid'), 'utf8');
await cpp.exec(`pkill -P ${pid}`);
await cpp.exec(`rm -rf ${this.gpuMetricCollectorScriptFolder}`);
} catch (error){
this.log.error(`GPU scheduler error: ${error}`);
} }
private generateEmbededGPUSummary(data: nodeNvidiaSmi.GPUInfo) : GPUInfo[] {
let gpuInfos : GPUInfo[] = [];
const gpuNumber : number = parseInt(data.nvidia_smi_log.attached_gpus, 10);
assert(gpuNumber > 0);
if(gpuNumber == 1) {
const embededGPUSummary = <nodeNvidiaSmi.EmbededGPUSummary>data.nvidia_smi_log.gpu;
gpuInfos.push(this.convertGPUSummaryToInfo(embededGPUSummary));
} else {
const embededGPUSummaryArray = <nodeNvidiaSmi.EmbededGPUSummary[]>data.nvidia_smi_log.gpu;
gpuInfos = embededGPUSummaryArray.map(embededGPUSummary => this.convertGPUSummaryToInfo(embededGPUSummary));
}
return gpuInfos;
}
private convertGPUSummaryToInfo(embededGPUSummary : nodeNvidiaSmi.EmbededGPUSummary) : GPUInfo {
return new GPUInfo(
typeof embededGPUSummary.process === 'object' ? 1 : 0,
parseFloat(embededGPUSummary.utilization.memory_util),
parseFloat(embededGPUSummary.utilization.gpu_util),
parseInt(embededGPUSummary.minor_number, 10));
} }
private readGPUSummary(): Promise<GPUSummary> { private async updateGPUSummary() {
return new Promise((resolve: Function, reject: Function): void => { const cmdresult = await cpp.exec(`tail -n 1 ${path.join(this.gpuMetricCollectorScriptFolder, 'gpu_metrics')}`);
nodeNvidiaSmi((error: Error, data: nodeNvidiaSmi.GPUInfo) => { if(cmdresult && cmdresult.stdout) {
if (error) { this.gpuSummary = <GPUSummary>JSON.parse(cmdresult.stdout);
reject(error);
} else { } else {
const gpuNumber : number = parseInt(data.nvidia_smi_log.attached_gpus, 10); this.log.error('Could not get gpu metrics information!');
const gpuSummary: GPUSummary = new GPUSummary(
gpuNumber,
Date().toString(),
this.generateEmbededGPUSummary(data)
);
resolve(gpuSummary);
} }
});
});
} }
} }
......
...@@ -69,9 +69,9 @@ class LocalTrainingServiceForGPU extends LocalTrainingService { ...@@ -69,9 +69,9 @@ class LocalTrainingServiceForGPU extends LocalTrainingService {
} }
} }
public cleanUp(): Promise<void> { public async cleanUp(): Promise<void> {
if (this.gpuScheduler !== undefined) { if (this.gpuScheduler !== undefined) {
this.gpuScheduler.stop(); await this.gpuScheduler.stop();
} }
return super.cleanUp(); return super.cleanUp();
......
...@@ -64,7 +64,7 @@ export const PAI_TRIAL_COMMAND_FORMAT: string = ...@@ -64,7 +64,7 @@ export const PAI_TRIAL_COMMAND_FORMAT: string =
`export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} NNI_TRIAL_SEQ_ID={4} `export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} NNI_TRIAL_SEQ_ID={4}
&& cd $NNI_SYS_DIR && sh install_nni.sh && cd $NNI_SYS_DIR && sh install_nni.sh
&& python3 -m nni_trial_tool.trial_keeper --trial_command '{5}' --nnimanager_ip '{6}' --nnimanager_port '{7}' && python3 -m nni_trial_tool.trial_keeper --trial_command '{5}' --nnimanager_ip '{6}' --nnimanager_port '{7}'
--pai_hdfs_output_dir '{8}' --pai_hdfs_host '{9}' --pai_user_name {10} --nni_hdfs_exp_dir '{11}' --webhdfs_path '/webhdfs/api/v1' --version '{12}' --log_collection '{13}'`; --pai_hdfs_output_dir '{8}' --pai_hdfs_host '{9}' --pai_user_name {10} --nni_hdfs_exp_dir '{11}' --webhdfs_path '/webhdfs/api/v1' --nni_manager_version '{12}' --log_collection '{13}'`;
export const PAI_OUTPUT_DIR_FORMAT: string = export const PAI_OUTPUT_DIR_FORMAT: string =
`hdfs://{0}:9000/`; `hdfs://{0}:9000/`;
......
...@@ -75,7 +75,7 @@ class PAITrainingService implements TrainingService { ...@@ -75,7 +75,7 @@ class PAITrainingService implements TrainingService {
private paiRestServerPort?: number; private paiRestServerPort?: number;
private nniManagerIpConfig?: NNIManagerIpConfig; private nniManagerIpConfig?: NNIManagerIpConfig;
private copyExpCodeDirPromise?: Promise<void>; private copyExpCodeDirPromise?: Promise<void>;
private versionCheck?: boolean = true; private versionCheck: boolean = true;
private logCollection: string; private logCollection: string;
constructor() { constructor() {
...@@ -97,11 +97,15 @@ class PAITrainingService implements TrainingService { ...@@ -97,11 +97,15 @@ class PAITrainingService implements TrainingService {
this.log.info('Run PAI training service.'); this.log.info('Run PAI training service.');
const restServer: PAIJobRestServer = component.get(PAIJobRestServer); const restServer: PAIJobRestServer = component.get(PAIJobRestServer);
await restServer.start(); await restServer.start();
restServer.setEnableVersionCheck = this.versionCheck;
this.log.info(`PAI Training service rest server listening on: ${restServer.endPoint}`); this.log.info(`PAI Training service rest server listening on: ${restServer.endPoint}`);
while (!this.stopping) { while (!this.stopping) {
await this.updatePaiToken(); await this.updatePaiToken();
await this.paiJobCollector.retrieveTrialStatus(this.paiToken, this.paiClusterConfig); await this.paiJobCollector.retrieveTrialStatus(this.paiToken, this.paiClusterConfig);
if (restServer.getErrorMessage) {
    // mark the service as stopping BEFORE throwing — the original order
    // left `this.stopping = true` unreachable after the throw
    this.stopping = true;
    throw new Error(restServer.getErrorMessage);
}
await delay(3000); await delay(3000);
} }
this.log.info('PAI training service exit.'); this.log.info('PAI training service exit.');
......
...@@ -250,7 +250,7 @@ export NNI_PLATFORM=remote NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={ ...@@ -250,7 +250,7 @@ export NNI_PLATFORM=remote NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={
cd $NNI_SYS_DIR cd $NNI_SYS_DIR
sh install_nni.sh sh install_nni.sh
echo $$ >{6} echo $$ >{6}
python3 -m nni_trial_tool.trial_keeper --trial_command '{7}' --nnimanager_ip '{8}' --nnimanager_port '{9}' --version '{10}' --log_collection '{11}' 1>$NNI_OUTPUT_DIR/trialkeeper_stdout 2>$NNI_OUTPUT_DIR/trialkeeper_stderr python3 -m nni_trial_tool.trial_keeper --trial_command '{7}' --nnimanager_ip '{8}' --nnimanager_port '{9}' --nni_manager_version '{10}' --log_collection '{11}' 1>$NNI_OUTPUT_DIR/trialkeeper_stdout 2>$NNI_OUTPUT_DIR/trialkeeper_stderr
echo $? \`date +%s%3N\` >{12}`; echo $? \`date +%s%3N\` >{12}`;
export const HOST_JOB_SHELL_FORMAT: string = export const HOST_JOB_SHELL_FORMAT: string =
...@@ -259,11 +259,3 @@ cd {0} ...@@ -259,11 +259,3 @@ cd {0}
echo $$ >{1} echo $$ >{1}
eval {2} >stdout 2>stderr eval {2} >stdout 2>stderr
echo $? \`date +%s%3N\` >{3}`; echo $? \`date +%s%3N\` >{3}`;
export const GPU_COLLECTOR_FORMAT: string =
`
#!/bin/bash
export METRIC_OUTPUT_DIR={0}
echo $$ >{1}
python3 -m nni_gpu_tool.gpu_metrics_collector
`
...@@ -44,9 +44,9 @@ import { GPUScheduler } from './gpuScheduler'; ...@@ -44,9 +44,9 @@ import { GPUScheduler } from './gpuScheduler';
import { import {
HOST_JOB_SHELL_FORMAT, RemoteCommandResult, RemoteMachineMeta, HOST_JOB_SHELL_FORMAT, RemoteCommandResult, RemoteMachineMeta,
RemoteMachineScheduleInfo, RemoteMachineScheduleResult, SSHClient, SSHClientManager, RemoteMachineScheduleInfo, RemoteMachineScheduleResult, SSHClient, SSHClientManager,
RemoteMachineTrialJobDetail, ScheduleResultType, REMOTEMACHINE_TRIAL_COMMAND_FORMAT, RemoteMachineTrialJobDetail, ScheduleResultType, REMOTEMACHINE_TRIAL_COMMAND_FORMAT
GPU_COLLECTOR_FORMAT
} from './remoteMachineData'; } from './remoteMachineData';
import { GPU_INFO_COLLECTOR_FORMAT } from '../common/gpuData';
import { SSHClientUtility } from './sshClientUtility'; import { SSHClientUtility } from './sshClientUtility';
import { validateCodeDir } from '../common/util'; import { validateCodeDir } from '../common/util';
import { RemoteMachineJobRestServer } from './remoteMachineJobRestServer'; import { RemoteMachineJobRestServer } from './remoteMachineJobRestServer';
...@@ -102,6 +102,7 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -102,6 +102,7 @@ class RemoteMachineTrainingService implements TrainingService {
public async run(): Promise<void> { public async run(): Promise<void> {
const restServer: RemoteMachineJobRestServer = component.get(RemoteMachineJobRestServer); const restServer: RemoteMachineJobRestServer = component.get(RemoteMachineJobRestServer);
await restServer.start(); await restServer.start();
restServer.setEnableVersionCheck = this.versionCheck;
this.log.info('Run remote machine training service.'); this.log.info('Run remote machine training service.');
while (!this.stopping) { while (!this.stopping) {
while (this.jobQueue.length > 0) { while (this.jobQueue.length > 0) {
...@@ -117,6 +118,10 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -117,6 +118,10 @@ class RemoteMachineTrainingService implements TrainingService {
break; break;
} }
} }
if (restServer.getErrorMessage) {
    // mark the service as stopping BEFORE throwing — the original order
    // left `this.stopping = true` unreachable after the throw
    this.stopping = true;
    throw new Error(restServer.getErrorMessage);
}
await delay(3000); await delay(3000);
} }
this.log.info('Remote machine training service exit.'); this.log.info('Remote machine training service exit.');
...@@ -447,7 +452,7 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -447,7 +452,7 @@ class RemoteMachineTrainingService implements TrainingService {
let gpuMetricsCollectorScriptPath: string = path.join(gpuMetricCollectorScriptFolder, userName, 'gpu_metrics_collector.sh'); let gpuMetricsCollectorScriptPath: string = path.join(gpuMetricCollectorScriptFolder, userName, 'gpu_metrics_collector.sh');
const remoteGPUScriptsDir: string = this.getRemoteScriptsPath(userName); // This directory is used to store gpu_metrics and pid created by script const remoteGPUScriptsDir: string = this.getRemoteScriptsPath(userName); // This directory is used to store gpu_metrics and pid created by script
const gpuMetricsCollectorScriptContent: string = String.Format( const gpuMetricsCollectorScriptContent: string = String.Format(
GPU_COLLECTOR_FORMAT, GPU_INFO_COLLECTOR_FORMAT,
remoteGPUScriptsDir, remoteGPUScriptsDir,
path.join(remoteGPUScriptsDir, 'pid'), path.join(remoteGPUScriptsDir, 'pid'),
); );
......
// Minimal hand-written typings for the 'node-nvidia-smi' package,
// covering only the fields NNI's GPU scheduler reads from the
// parsed `nvidia-smi -x` XML output.
declare module 'node-nvidia-smi' {
    // Runs nvidia-smi and delivers the parsed result via callback.
    function smi(callback: (error: Error, data: smi.GPUInfo) => void): void;
    namespace smi {
        // Per-GPU entry embedded in the nvidia-smi log.
        interface EmbededGPUSummary {
            minor_number: string;
            utilization: {
                gpu_util: string;
                memory_util: string;
            };
            // a process-info object when processes are running,
            // otherwise a string placeholder
            process: string | object;
        }
        interface GPUInfo {
            nvidia_smi_log: {
                attached_gpus: string;
                // single object when one GPU is attached, array otherwise
                gpu: EmbededGPUSummary[] | EmbededGPUSummary;
            };
        }
    }
    export = smi;
}
\ No newline at end of file
...@@ -143,7 +143,7 @@ class Bracket(): ...@@ -143,7 +143,7 @@ class Bracket():
self.s_max = s_max self.s_max = s_max
self.eta = eta self.eta = eta
self.n = math.ceil((s_max + 1) * (eta**s) / (s + 1) - _epsilon) # pylint: disable=invalid-name self.n = math.ceil((s_max + 1) * (eta**s) / (s + 1) - _epsilon) # pylint: disable=invalid-name
self.r = math.ceil(R / eta**s - _epsilon) # pylint: disable=invalid-name self.r = R / eta**s # pylint: disable=invalid-name
self.i = 0 self.i = 0
self.hyper_configs = [] # [ {id: params}, {}, ... ] self.hyper_configs = [] # [ {id: params}, {}, ... ]
self.configs_perf = [] # [ {id: [seq, acc]}, {}, ... ] self.configs_perf = [] # [ {id: [seq, acc]}, {}, ... ]
...@@ -158,7 +158,7 @@ class Bracket(): ...@@ -158,7 +158,7 @@ class Bracket():
def get_n_r(self): def get_n_r(self):
"""return the values of n and r for the next round""" """return the values of n and r for the next round"""
return math.floor(self.n / self.eta**self.i + _epsilon), self.r * self.eta**self.i return math.floor(self.n / self.eta**self.i + _epsilon), math.floor(self.r * self.eta**self.i + _epsilon)
def increase_i(self): def increase_i(self):
"""i means the ith round. Increase i by 1""" """i means the ith round. Increase i by 1"""
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment