Unverified Commit d95c3513 authored by SparkSnail's avatar SparkSnail Committed by GitHub
Browse files

Merge pull request #155 from Microsoft/master

merge master
parents 77526d37 e7d31abd
......@@ -41,6 +41,10 @@ export abstract class ClusterJobRestServer extends RestServer{
private readonly expId: string = getExperimentId();
private enableVersionCheck: boolean = true; //switch to enable version check
private versionCheckSuccess: boolean | undefined;
private errorMessage?: string;
/**
* constructor to provide NNIRestServer's own rest property, e.g. port
*/
......@@ -58,6 +62,14 @@ export abstract class ClusterJobRestServer extends RestServer{
}
return this.port;
}
// Last recorded version-check error, if any; undefined means no error has occurred yet.
public get getErrorMessage(): string | undefined{
return this.errorMessage;
}
// Enables or disables the NNI version check performed against trial keepers.
public set setEnableVersionCheck(versionCheck: boolean) {
this.enableVersionCheck = versionCheck;
}
/**
* NNIRestServer's own router registration
......@@ -77,6 +89,31 @@ export abstract class ClusterJobRestServer extends RestServer{
next();
});
// Trial keepers POST their version-check result here. A result that disagrees
// with a previously recorded one is flagged as inconsistent; a failed check
// stores the trial keeper's own message in this.errorMessage.
router.post(`/version/${this.expId}/:trialId`, (req: Request, res: Response) => {
    if (this.enableVersionCheck) {
        try {
            // req.body.tag is 'VCSuccess' on success; anything else counts as failure.
            const checkResultSuccess: boolean = req.body.tag === 'VCSuccess';
            if (this.versionCheckSuccess !== undefined && this.versionCheckSuccess !== checkResultSuccess) {
                this.errorMessage = 'Version check error, version check result is inconsistent!';
                this.log.error(this.errorMessage);
            } else if (checkResultSuccess) {
                this.log.info(`Version check in trialKeeper success!`);
                this.versionCheckSuccess = true;
            } else {
                this.versionCheckSuccess = false;
                this.errorMessage = req.body.msg;
            }
        } catch (err) {
            this.log.error(`json parse metrics error: ${err}`);
            res.status(500);
            res.send(err.message);
        }
    } else {
        this.log.info(`Skipping version check!`);
    }
    res.send();
});
router.post(`/update-metrics/${this.expId}/:trialId`, (req: Request, res: Response) => {
try {
this.log.info(`Get update-metrics request, trial job id is ${req.params.trialId}`);
......@@ -94,6 +131,10 @@ export abstract class ClusterJobRestServer extends RestServer{
});
router.post(`/stdout/${this.expId}/:trialId`, (req: Request, res: Response) => {
if(this.enableVersionCheck && !this.versionCheckSuccess && !this.errorMessage) {
this.errorMessage = `Version check failed, didn't get version check response from trialKeeper, please check your NNI version in `
+ `NNIManager and TrialKeeper!`
}
const trialLogPath: string = path.join(getLogDir(), `trial_${req.params.trialId}.log`);
try {
let skipLogging: boolean = false;
......
......@@ -58,3 +58,11 @@ export class GPUSummary {
this.gpuInfos = gpuInfos;
}
}
// Shell script template used to launch the GPU metrics collector.
// Placeholders (filled via String.Format):
//   {0}: directory the collector writes metric output to (METRIC_OUTPUT_DIR)
//   {1}: file that receives the launching shell's PID, used later to kill the collector
export const GPU_INFO_COLLECTOR_FORMAT: string =
`
#!/bin/bash
export METRIC_OUTPUT_DIR={0}
echo $$ >{1}
python3 -m nni_gpu_tool.gpu_metrics_collector
`
......@@ -66,11 +66,16 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
throw new Error('kubernetesJobRestServer not initialized!');
}
await this.kubernetesJobRestServer.start();
this.kubernetesJobRestServer.setEnableVersionCheck = this.versionCheck;
this.log.info(`frameworkcontroller Training service rest server listening on: ${this.kubernetesJobRestServer.endPoint}`);
while (!this.stopping) {
// collect metrics for frameworkcontroller jobs by interacting with Kubernetes API server
await delay(3000);
await this.fcJobInfoCollector.retrieveTrialStatus(this.kubernetesCRDClient);
if(this.kubernetesJobRestServer.getErrorMessage) {
throw new Error(this.kubernetesJobRestServer.getErrorMessage);
this.stopping = true;
}
}
}
......
......@@ -71,11 +71,16 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
throw new Error('kubernetesJobRestServer not initialized!');
}
await this.kubernetesJobRestServer.start();
this.kubernetesJobRestServer.setEnableVersionCheck = this.versionCheck;
this.log.info(`Kubeflow Training service rest server listening on: ${this.kubernetesJobRestServer.endPoint}`);
while (!this.stopping) {
// collect metrics for Kubeflow jobs by interacting with Kubernetes API server
await delay(3000);
await this.kubeflowJobInfoCollector.retrieveTrialStatus(this.kubernetesCRDClient);
if(this.kubernetesJobRestServer.getErrorMessage) {
throw new Error(this.kubernetesJobRestServer.getErrorMessage);
this.stopping = true;
}
}
this.log.info('Kubeflow training service exit.');
}
......
......@@ -71,5 +71,5 @@ mkdir -p $NNI_OUTPUT_DIR
cp -rT $NNI_CODE_DIR $NNI_SYS_DIR
cd $NNI_SYS_DIR
sh install_nni.sh
python3 -m nni_trial_tool.trial_keeper --trial_command '{8}' --nnimanager_ip {9} --nnimanager_port {10} --version '{11}' --log_collection '{12}'`
python3 -m nni_trial_tool.trial_keeper --trial_command '{8}' --nnimanager_ip {9} --nnimanager_port {10} --nni_manager_version '{11}' --log_collection '{12}'`
+ `1>$NNI_OUTPUT_DIR/trialkeeper_stdout 2>$NNI_OUTPUT_DIR/trialkeeper_stderr`
......@@ -61,7 +61,7 @@ abstract class KubernetesTrainingService {
protected kubernetesCRDClient?: KubernetesCRDClient;
protected kubernetesJobRestServer?: KubernetesJobRestServer;
protected kubernetesClusterConfig?: KubernetesClusterConfig;
protected versionCheck?: boolean = true;
protected versionCheck: boolean = true;
protected logCollection: string;
constructor() {
......
......@@ -19,268 +19,16 @@
'use strict';
import * as assert from 'assert';
import * as nodeNvidiaSmi from 'node-nvidia-smi';
import { delay } from '../../common/utils';
import { GPUInfo, GPUSummary } from '../common/gpuData';
import { getLogger, Logger } from '../../common/log';
/* Example of nvidia-smi result
{
"nvidia_smi_log": {
"timestamp": "Fri Jul 13 15:17:27 2018",
"driver_version": "396.26",
"attached_gpus": "8",
"gpu": [
...,
{
...
"minor_number": "5",
"utilization": {
"gpu_util": "100 %",
"memory_util": "27 %",
"encoder_util": "0 %",
"decoder_util": "0 %"
},
...
"processes": {
"process_info": {
"pid": "39943",
"type": "C",
"process_name": "python",
"used_memory": "16229 MiB"
}
},
...
},
{
"$": {
"id": "00000000:8E:00.0"
},
"product_name": "Tesla P100-PCIE-16GB",
"product_brand": "Tesla",
"display_mode": "Enabled",
"display_active": "Disabled",
"persistence_mode": "Disabled",
"accounting_mode": "Disabled",
"accounting_mode_buffer_size": "4000",
"driver_model": {
"current_dm": "N/A",
"pending_dm": "N/A"
},
"serial": "0321017108732",
"uuid": "GPU-df3e8a0a-ce99-350c-b196-c3775eb32309",
"minor_number": "6",
"vbios_version": "86.00.40.00.01",
"multigpu_board": "No",
"board_id": "0x8e00",
"gpu_part_number": "900-2H400-0300-031",
"inforom_version": {
"img_version": "H400.0201.00.08",
"oem_object": "1.1",
"ecc_object": "4.1",
"pwr_object": "N/A"
},
"gpu_operation_mode": {
"current_gom": "N/A",
"pending_gom": "N/A"
},
"gpu_virtualization_mode": {
"virtualization_mode": "None"
},
"ibmnpu": {
"relaxed_ordering_mode": "N/A"
},
"pci": {
"pci_bus": "8E",
"pci_device": "00",
"pci_domain": "0000",
"pci_device_id": "15F810DE",
"pci_bus_id": "00000000:8E:00.0",
"pci_sub_system_id": "118F10DE",
"pci_gpu_link_info": {
"pcie_gen": {
"max_link_gen": "3",
"current_link_gen": "3"
},
"link_widths": {
"max_link_width": "16x",
"current_link_width": "16x"
}
},
"pci_bridge_chip": {
"bridge_chip_type": "N/A",
"bridge_chip_fw": "N/A"
},
"replay_counter": "0",
"tx_util": "0 KB/s",
"rx_util": "0 KB/s"
},
"fan_speed": "N/A",
"performance_state": "P0",
"clocks_throttle_reasons": {
"clocks_throttle_reason_gpu_idle": "Not Active",
"clocks_throttle_reason_applications_clocks_setting": "Not Active",
"clocks_throttle_reason_sw_power_cap": "Not Active",
"clocks_throttle_reason_hw_slowdown": "Not Active",
"clocks_throttle_reason_hw_thermal_slowdown": "Not Active",
"clocks_throttle_reason_hw_power_brake_slowdown": "Not Active",
"clocks_throttle_reason_sync_boost": "Not Active",
"clocks_throttle_reason_sw_thermal_slowdown": "Not Active"
},
"fb_memory_usage": {
"total": "16280 MiB",
"used": "16239 MiB",
"free": "41 MiB"
},
"bar1_memory_usage": {
"total": "16384 MiB",
"used": "2 MiB",
"free": "16382 MiB"
},
"compute_mode": "Default",
"utilization": {
"gpu_util": "0 %",
"memory_util": "0 %",
"encoder_util": "0 %",
"decoder_util": "0 %"
},
"encoder_stats": {
"session_count": "0",
"average_fps": "0",
"average_latency": "0"
},
"ecc_mode": {
"current_ecc": "Enabled",
"pending_ecc": "Enabled"
},
"ecc_errors": {
"volatile": {
"single_bit": {
"device_memory": "0",
"register_file": "0",
"l1_cache": "N/A",
"l2_cache": "0",
"texture_memory": "0",
"texture_shm": "0",
"cbu": "N/A",
"total": "0"
},
"double_bit": {
"device_memory": "0",
"register_file": "0",
"l1_cache": "N/A",
"l2_cache": "0",
"texture_memory": "0",
"texture_shm": "0",
"cbu": "N/A",
"total": "0"
}
},
"aggregate": {
"single_bit": {
"device_memory": "0",
"register_file": "0",
"l1_cache": "N/A",
"l2_cache": "0",
"texture_memory": "0",
"texture_shm": "0",
"cbu": "N/A",
"total": "0"
},
"double_bit": {
"device_memory": "0",
"register_file": "0",
"l1_cache": "N/A",
"l2_cache": "0",
"texture_memory": "0",
"texture_shm": "0",
"cbu": "N/A",
"total": "0"
}
}
},
"retired_pages": {
"multiple_single_bit_retirement": {
"retired_count": "0",
"retired_page_addresses": "\n\t\t\t\t"
},
"double_bit_retirement": {
"retired_count": "0",
"retired_page_addresses": "\n\t\t\t\t"
},
"pending_retirement": "No"
},
"temperature": {
"gpu_temp": "33 C",
"gpu_temp_max_threshold": "85 C",
"gpu_temp_slow_threshold": "82 C",
"gpu_temp_max_gpu_threshold": "N/A",
"memory_temp": "N/A",
"gpu_temp_max_mem_threshold": "N/A"
},
"power_readings": {
"power_state": "P0",
"power_management": "Supported",
"power_draw": "37.29 W",
"power_limit": "250.00 W",
"default_power_limit": "250.00 W",
"enforced_power_limit": "250.00 W",
"min_power_limit": "125.00 W",
"max_power_limit": "250.00 W"
},
"clocks": {
"graphics_clock": "1328 MHz",
"sm_clock": "1328 MHz",
"mem_clock": "715 MHz",
"video_clock": "1189 MHz"
},
"applications_clocks": {
"graphics_clock": "1189 MHz",
"mem_clock": "715 MHz"
},
"default_applications_clocks": {
"graphics_clock": "1189 MHz",
"mem_clock": "715 MHz"
},
"max_clocks": {
"graphics_clock": "1328 MHz",
"sm_clock": "1328 MHz",
"mem_clock": "715 MHz",
"video_clock": "1328 MHz"
},
"max_customer_boost_clocks": {
"graphics_clock": "1328 MHz"
},
"clock_policy": {
"auto_boost": "N/A",
"auto_boost_default": "N/A"
},
"supported_clocks": {
"supported_mem_clock": {
"value": "715 MHz",
"supported_graphics_clock": [
"1328 MHz",
"1316 MHz",
"1303 MHz",
...
]
}
},
"processes": {
"process_info": {
"pid": "40788",
"type": "C",
"process_name": "python",
"used_memory": "16229 MiB"
}
},
"accounted_processes": "\n\t\t"
},
...
]
}
}*/
import * as cp from 'child_process';
import * as cpp from 'child-process-promise';
import * as path from 'path';
import * as os from 'os';
import * as fs from 'fs';
import { String } from 'typescript-string-operations';
import { GPU_INFO_COLLECTOR_FORMAT } from '../common/gpuData'
/**
* GPUScheduler
......@@ -290,29 +38,43 @@ class GPUScheduler {
private gpuSummary!: GPUSummary;
private stopping: boolean;
private log: Logger;
private nvdmNotFoundRegex: RegExp;
private gpuMetricCollectorScriptFolder: string;
constructor() {
this.stopping = false;
this.log = getLogger();
this.nvdmNotFoundRegex = /nvidia-smi: not found/gi;
this.gpuMetricCollectorScriptFolder = `${os.tmpdir()}/nni/script`;
}
public async run(): Promise<void> {
await this.runGpuMetricsCollectorScript();
while (!this.stopping) {
try {
this.gpuSummary = await this.readGPUSummary();
await this.updateGPUSummary();
} catch (error) {
this.log.error('Read GPU summary failed with error: ', error);
// If nvidia-smi command is not found, break the gpu summary reading loop to avoid unnecessary periodically checking
if(this.nvdmNotFoundRegex.test(error)) {
break;
}
}
await delay(5000);
}
}
/**
 * Generate gpu_metrics_collector.sh on the local machine and launch it in the
 * background. The collector records its shell PID in `pid` inside
 * gpuMetricCollectorScriptFolder so stop() can terminate it later.
 * (Note: unlike the remote-machine variant, this script runs locally.)
 */
private async runGpuMetricsCollectorScript(): Promise<void> {
    await cpp.exec(`mkdir -p ${this.gpuMetricCollectorScriptFolder}`);
    // generate gpu_metrics_collector.sh
    const gpuMetricsCollectorScriptPath: string = path.join(this.gpuMetricCollectorScriptFolder, 'gpu_metrics_collector.sh');
    const gpuMetricsCollectorScriptContent: string = String.Format(
        GPU_INFO_COLLECTOR_FORMAT,
        this.gpuMetricCollectorScriptFolder,
        path.join(this.gpuMetricCollectorScriptFolder, 'pid'),
    );
    await fs.promises.writeFile(gpuMetricsCollectorScriptPath, gpuMetricsCollectorScriptContent, { encoding: 'utf8' });
    // fire-and-forget: the collector keeps running until stop() kills it via the recorded PID
    cp.exec(`bash ${gpuMetricsCollectorScriptPath}`);
}
public getAvailableGPUIndices(): number[] {
if (this.gpuSummary !== undefined) {
return this.gpuSummary.gpuInfos.filter((info: GPUInfo) => info.activeProcessNum === 0).map((info: GPUInfo) => info.index);
......@@ -321,51 +83,24 @@ class GPUScheduler {
return [];
}
public stop(): void {
public async stop() {
this.stopping = true;
try {
const pid: string = await fs.promises.readFile(path.join(this.gpuMetricCollectorScriptFolder, 'pid'), 'utf8');
await cpp.exec(`pkill -P ${pid}`);
await cpp.exec(`rm -rf ${this.gpuMetricCollectorScriptFolder}`);
} catch (error){
this.log.error(`GPU scheduler error: ${error}`);
}
}
private generateEmbededGPUSummary(data: nodeNvidiaSmi.GPUInfo) : GPUInfo[] {
let gpuInfos : GPUInfo[] = [];
const gpuNumber : number = parseInt(data.nvidia_smi_log.attached_gpus, 10);
assert(gpuNumber > 0);
if(gpuNumber == 1) {
const embededGPUSummary = <nodeNvidiaSmi.EmbededGPUSummary>data.nvidia_smi_log.gpu;
gpuInfos.push(this.convertGPUSummaryToInfo(embededGPUSummary));
private async updateGPUSummary() {
const cmdresult = await cpp.exec(`tail -n 1 ${path.join(this.gpuMetricCollectorScriptFolder, 'gpu_metrics')}`);
if(cmdresult && cmdresult.stdout) {
this.gpuSummary = <GPUSummary>JSON.parse(cmdresult.stdout);
} else {
const embededGPUSummaryArray = <nodeNvidiaSmi.EmbededGPUSummary[]>data.nvidia_smi_log.gpu;
gpuInfos = embededGPUSummaryArray.map(embededGPUSummary => this.convertGPUSummaryToInfo(embededGPUSummary));
this.log.error('Could not get gpu metrics information!');
}
return gpuInfos;
}
private convertGPUSummaryToInfo(embededGPUSummary : nodeNvidiaSmi.EmbededGPUSummary) : GPUInfo {
return new GPUInfo(
typeof embededGPUSummary.process === 'object' ? 1 : 0,
parseFloat(embededGPUSummary.utilization.memory_util),
parseFloat(embededGPUSummary.utilization.gpu_util),
parseInt(embededGPUSummary.minor_number, 10));
}
private readGPUSummary(): Promise<GPUSummary> {
return new Promise((resolve: Function, reject: Function): void => {
nodeNvidiaSmi((error: Error, data: nodeNvidiaSmi.GPUInfo) => {
if (error) {
reject(error);
} else {
const gpuNumber : number = parseInt(data.nvidia_smi_log.attached_gpus, 10);
const gpuSummary: GPUSummary = new GPUSummary(
gpuNumber,
Date().toString(),
this.generateEmbededGPUSummary(data)
);
resolve(gpuSummary);
}
});
});
}
}
......
......@@ -69,9 +69,9 @@ class LocalTrainingServiceForGPU extends LocalTrainingService {
}
}
public cleanUp(): Promise<void> {
public async cleanUp(): Promise<void> {
if (this.gpuScheduler !== undefined) {
this.gpuScheduler.stop();
await this.gpuScheduler.stop();
}
return super.cleanUp();
......
......@@ -64,7 +64,7 @@ export const PAI_TRIAL_COMMAND_FORMAT: string =
`export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} NNI_TRIAL_SEQ_ID={4}
&& cd $NNI_SYS_DIR && sh install_nni.sh
&& python3 -m nni_trial_tool.trial_keeper --trial_command '{5}' --nnimanager_ip '{6}' --nnimanager_port '{7}'
--pai_hdfs_output_dir '{8}' --pai_hdfs_host '{9}' --pai_user_name {10} --nni_hdfs_exp_dir '{11}' --webhdfs_path '/webhdfs/api/v1' --version '{12}' --log_collection '{13}'`;
--pai_hdfs_output_dir '{8}' --pai_hdfs_host '{9}' --pai_user_name {10} --nni_hdfs_exp_dir '{11}' --webhdfs_path '/webhdfs/api/v1' --nni_manager_version '{12}' --log_collection '{13}'`;
export const PAI_OUTPUT_DIR_FORMAT: string =
`hdfs://{0}:9000/`;
......
......@@ -75,7 +75,7 @@ class PAITrainingService implements TrainingService {
private paiRestServerPort?: number;
private nniManagerIpConfig?: NNIManagerIpConfig;
private copyExpCodeDirPromise?: Promise<void>;
private versionCheck?: boolean = true;
private versionCheck: boolean = true;
private logCollection: string;
constructor() {
......@@ -97,11 +97,15 @@ class PAITrainingService implements TrainingService {
this.log.info('Run PAI training service.');
const restServer: PAIJobRestServer = component.get(PAIJobRestServer);
await restServer.start();
restServer.setEnableVersionCheck = this.versionCheck;
this.log.info(`PAI Training service rest server listening on: ${restServer.endPoint}`);
while (!this.stopping) {
await this.updatePaiToken();
await this.paiJobCollector.retrieveTrialStatus(this.paiToken, this.paiClusterConfig);
if (restServer.getErrorMessage) {
throw new Error(restServer.getErrorMessage)
this.stopping = true;
}
await delay(3000);
}
this.log.info('PAI training service exit.');
......
......@@ -250,7 +250,7 @@ export NNI_PLATFORM=remote NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={
cd $NNI_SYS_DIR
sh install_nni.sh
echo $$ >{6}
python3 -m nni_trial_tool.trial_keeper --trial_command '{7}' --nnimanager_ip '{8}' --nnimanager_port '{9}' --version '{10}' --log_collection '{11}' 1>$NNI_OUTPUT_DIR/trialkeeper_stdout 2>$NNI_OUTPUT_DIR/trialkeeper_stderr
python3 -m nni_trial_tool.trial_keeper --trial_command '{7}' --nnimanager_ip '{8}' --nnimanager_port '{9}' --nni_manager_version '{10}' --log_collection '{11}' 1>$NNI_OUTPUT_DIR/trialkeeper_stdout 2>$NNI_OUTPUT_DIR/trialkeeper_stderr
echo $? \`date +%s%3N\` >{12}`;
export const HOST_JOB_SHELL_FORMAT: string =
......@@ -259,11 +259,3 @@ cd {0}
echo $$ >{1}
eval {2} >stdout 2>stderr
echo $? \`date +%s%3N\` >{3}`;
export const GPU_COLLECTOR_FORMAT: string =
`
#!/bin/bash
export METRIC_OUTPUT_DIR={0}
echo $$ >{1}
python3 -m nni_gpu_tool.gpu_metrics_collector
`
......@@ -44,9 +44,9 @@ import { GPUScheduler } from './gpuScheduler';
import {
HOST_JOB_SHELL_FORMAT, RemoteCommandResult, RemoteMachineMeta,
RemoteMachineScheduleInfo, RemoteMachineScheduleResult, SSHClient, SSHClientManager,
RemoteMachineTrialJobDetail, ScheduleResultType, REMOTEMACHINE_TRIAL_COMMAND_FORMAT,
GPU_COLLECTOR_FORMAT
RemoteMachineTrialJobDetail, ScheduleResultType, REMOTEMACHINE_TRIAL_COMMAND_FORMAT
} from './remoteMachineData';
import { GPU_INFO_COLLECTOR_FORMAT } from '../common/gpuData';
import { SSHClientUtility } from './sshClientUtility';
import { validateCodeDir } from '../common/util';
import { RemoteMachineJobRestServer } from './remoteMachineJobRestServer';
......@@ -102,6 +102,7 @@ class RemoteMachineTrainingService implements TrainingService {
public async run(): Promise<void> {
const restServer: RemoteMachineJobRestServer = component.get(RemoteMachineJobRestServer);
await restServer.start();
restServer.setEnableVersionCheck = this.versionCheck;
this.log.info('Run remote machine training service.');
while (!this.stopping) {
while (this.jobQueue.length > 0) {
......@@ -117,6 +118,10 @@ class RemoteMachineTrainingService implements TrainingService {
break;
}
}
if(restServer.getErrorMessage) {
throw new Error(restServer.getErrorMessage);
this.stopping = true;
}
await delay(3000);
}
this.log.info('Remote machine training service exit.');
......@@ -447,7 +452,7 @@ class RemoteMachineTrainingService implements TrainingService {
let gpuMetricsCollectorScriptPath: string = path.join(gpuMetricCollectorScriptFolder, userName, 'gpu_metrics_collector.sh');
const remoteGPUScriptsDir: string = this.getRemoteScriptsPath(userName); // This directory is used to store gpu_metrics and pid created by script
const gpuMetricsCollectorScriptContent: string = String.Format(
GPU_COLLECTOR_FORMAT,
GPU_INFO_COLLECTOR_FORMAT,
remoteGPUScriptsDir,
path.join(remoteGPUScriptsDir, 'pid'),
);
......
// Minimal ambient typings for the 'node-nvidia-smi' package: a single
// callback-style function returning parsed nvidia-smi output. Only the fields
// this project reads are declared.
declare module 'node-nvidia-smi' {
function smi(callback: (error: Error, data: smi.GPUInfo) => void): void;
namespace smi {
// Per-GPU slice of the nvidia-smi report.
interface EmbededGPUSummary {
minor_number: string;
utilization: {
gpu_util: string;
memory_util: string;
};
// placeholder string when no processes are running, process info object otherwise
process: string | object;
}
interface GPUInfo {
nvidia_smi_log: {
attached_gpus: string;
// a single object when exactly one GPU is attached, an array otherwise
gpu: EmbededGPUSummary[] | EmbededGPUSummary;
};
}
}
export = smi;
}
\ No newline at end of file
......@@ -143,7 +143,7 @@ class Bracket():
self.s_max = s_max
self.eta = eta
self.n = math.ceil((s_max + 1) * (eta**s) / (s + 1) - _epsilon) # pylint: disable=invalid-name
self.r = math.ceil(R / eta**s - _epsilon) # pylint: disable=invalid-name
self.r = R / eta**s # pylint: disable=invalid-name
self.i = 0
self.hyper_configs = [] # [ {id: params}, {}, ... ]
self.configs_perf = [] # [ {id: [seq, acc]}, {}, ... ]
......@@ -158,7 +158,7 @@ class Bracket():
def get_n_r(self):
"""return the values of n and r for the next round"""
return math.floor(self.n / self.eta**self.i + _epsilon), self.r * self.eta**self.i
return math.floor(self.n / self.eta**self.i + _epsilon), math.floor(self.r * self.eta**self.i + _epsilon)
def increase_i(self):
"""i means the ith round. Increase i by 1"""
......
......@@ -40,7 +40,7 @@ def create_model(samples_x, samples_y_aggregation,
regressor = gp.GaussianProcessRegressor(kernel=kernel,
n_restarts_optimizer=n_restarts_optimizer,
normalize_y=True,
alpha=0)
alpha=1e-10)
regressor.fit(numpy.array(samples_x), numpy.array(samples_y_aggregation))
model = {}
......
......@@ -65,7 +65,7 @@ class MetisTuner(Tuner):
"""
def __init__(self, optimize_mode="maximize", no_resampling=True, no_candidates=True,
selection_num_starting_points=600, cold_start_num=10, exploration_probability=0.1):
selection_num_starting_points=600, cold_start_num=10, exploration_probability=0.9):
"""
Parameters
----------
......@@ -126,13 +126,7 @@ class MetisTuner(Tuner):
for key in search_space:
key_type = search_space[key]['_type']
key_range = search_space[key]['_value']
try:
idx = self.key_order.index(key)
except Exception as ex:
logger.exception(ex)
raise RuntimeError("The format search space contains \
some key that didn't define in key_order." )
idx = self.key_order.index(key)
if key_type == 'quniform':
if key_range[2] == 1:
self.x_bounds[idx] = [key_range[0], key_range[1]]
......@@ -271,7 +265,6 @@ class MetisTuner(Tuner):
samples_size_unique = len(samples_y)
# ===== STEP 1: Compute the current optimum =====
#sys.stderr.write("[%s] Predicting the optimal configuration from the current training dataset...\n" % (os.path.basename(__file__)))
gp_model = gp_create_model.create_model(samples_x, samples_y_aggregation)
lm_current = gp_selection.selection(
"lm",
......@@ -291,8 +284,6 @@ class MetisTuner(Tuner):
'reason': "exploitation_gp"})
# ===== STEP 2: Get recommended configurations for exploration =====
#sys.stderr.write("[%s] Getting candidates for exploration...\n"
#% \(os.path.basename(__file__)))
results_exploration = gp_selection.selection(
"lc",
samples_y_aggregation,
......@@ -309,15 +300,11 @@ class MetisTuner(Tuner):
'expected_sigma': results_exploration['expected_sigma'],
'reason': "exploration"})
logger.info("DEBUG: 1 exploration candidate selected\n")
#sys.stderr.write("[%s] DEBUG: 1 exploration candidate selected\n" % (os.path.basename(__file__)))
else:
logger.info("DEBUG: No suitable exploration candidates were")
# sys.stderr.write("[%s] DEBUG: No suitable exploration candidates were \
# found\n" % (os.path.basename(__file__)))
# ===== STEP 3: Get recommended configurations for exploitation =====
if samples_size_all >= threshold_samplessize_exploitation:
#sys.stderr.write("[%s] Getting candidates for exploitation...\n" % (os.path.basename(__file__)))
print("Getting candidates for exploitation...\n")
try:
gmm = gmm_create_model.create_model(samples_x, samples_y_aggregation)
......@@ -385,13 +372,6 @@ class MetisTuner(Tuner):
temp_improvement = threads_result['expected_lowest_mu'] - lm_current['expected_mu']
if next_improvement > temp_improvement:
logger.info("DEBUG: \"next_candidate\" changed: \
lowest mu might reduce from %f (%s) to %f (%s), %s\n" %\
lm_current['expected_mu'], str(lm_current['hyperparameter']),\
threads_result['expected_lowest_mu'],\
str(threads_result['candidate']['hyperparameter']),\
threads_result['candidate']['reason'])
next_improvement = temp_improvement
next_candidate = threads_result['candidate']
else:
......@@ -415,7 +395,7 @@ class MetisTuner(Tuner):
if next_candidate is not None:
outputs = self._pack_output(next_candidate['hyperparameter'])
else:
random_parameter = _rand_init(self.x_bounds, self.x_types, 1)[0]
random_parameter = _rand_init(x_bounds, x_types, 1)[0]
outputs = self._pack_output(random_parameter)
self.history_parameters.append(outputs)
return outputs
......@@ -443,8 +423,6 @@ def _rand_with_constraints(x_bounds, x_types):
def _calculate_lowest_mu_threaded(inputs):
[candidate, samples_x, samples_y, x_bounds, x_types, minimize_constraints_fun, minimize_starting_points] = inputs
sys.stderr.write("[%s] Evaluating information gain of %s (%s)...\n" % \
(os.path.basename(__file__), candidate['hyperparameter'], candidate['reason']))
outputs = {"candidate": candidate, "expected_lowest_mu": None}
for expected_mu in [candidate['expected_mu'] + 1.96 * candidate['expected_sigma'],
......
......@@ -254,7 +254,7 @@ class StubConv(StubWeightBiasLayer):
keras_layer.set_weights((self.weights[0].T, self.weights[1]))
def size(self):
return self.filters * self.kernel_size * self.kernel_size + self.filters
return (self.input_channel * self.kernel_size * self.kernel_size + 1) * self.filters
@abstractmethod
def to_real_layer(self):
......
......@@ -192,17 +192,20 @@ class SMACTuner(Tuner):
Returns
-------
dict
challenger dict
dict which stores copy of challengers
"""
converted_dict = {}
for key, value in challenger_dict.items():
# convert to loguniform
if key in self.loguniform_key:
challenger_dict[key] = np.exp(challenger_dict[key])
converted_dict[key] = np.exp(challenger_dict[key])
# convert categorical back to original value
if key in self.categorical_dict:
elif key in self.categorical_dict:
idx = challenger_dict[key]
challenger_dict[key] = self.categorical_dict[key][idx]
return challenger_dict
converted_dict[key] = self.categorical_dict[key][idx]
else:
converted_dict[key] = value
return converted_dict
def generate_parameters(self, parameter_id):
"""generate one instance of hyperparameters
......@@ -220,13 +223,11 @@ class SMACTuner(Tuner):
if self.first_one:
init_challenger = self.smbo_solver.nni_smac_start()
self.total_data[parameter_id] = init_challenger
json_tricks.dumps(init_challenger.get_dictionary())
return self.convert_loguniform_categorical(init_challenger.get_dictionary())
else:
challengers = self.smbo_solver.nni_smac_request_challengers()
for challenger in challengers:
self.total_data[parameter_id] = challenger
json_tricks.dumps(challenger.get_dictionary())
return self.convert_loguniform_categorical(challenger.get_dictionary())
def generate_multiple_parameters(self, parameter_id_list):
......@@ -247,7 +248,6 @@ class SMACTuner(Tuner):
for one_id in parameter_id_list:
init_challenger = self.smbo_solver.nni_smac_start()
self.total_data[one_id] = init_challenger
json_tricks.dumps(init_challenger.get_dictionary())
params.append(self.convert_loguniform_categorical(init_challenger.get_dictionary()))
else:
challengers = self.smbo_solver.nni_smac_request_challengers()
......@@ -257,7 +257,6 @@ class SMACTuner(Tuner):
if cnt >= len(parameter_id_list):
break
self.total_data[parameter_id_list[cnt]] = challenger
json_tricks.dumps(challenger.get_dictionary())
params.append(self.convert_loguniform_categorical(challenger.get_dictionary()))
cnt += 1
return params
......@@ -20,6 +20,7 @@ require('../static/style/overviewTitle.scss');
interface OverviewState {
tableData: Array<TableObj>;
experimentAPI: object;
searchSpace: object;
status: string;
errorStr: string;
......@@ -47,6 +48,7 @@ class Overview extends React.Component<{}, OverviewState> {
super(props);
this.state = {
searchSpace: {},
experimentAPI: {},
status: '',
errorStr: '',
trialProfile: {
......@@ -143,6 +145,7 @@ class Overview extends React.Component<{}, OverviewState> {
});
if (this._isMounted) {
this.setState({
experimentAPI: res.data,
trialProfile: trialPro[0],
searchSpace: searchSpace,
isLogCollection: expLogCollection
......@@ -390,7 +393,7 @@ class Overview extends React.Component<{}, OverviewState> {
const {
trialProfile, searchSpace, tableData, accuracyData,
accNodata, status, errorStr, trialNumber, bestAccuracy,
titleMaxbgcolor, titleMinbgcolor, isLogCollection
titleMaxbgcolor, titleMinbgcolor, isLogCollection, experimentAPI
} = this.state;
return (
......@@ -425,9 +428,7 @@ class Overview extends React.Component<{}, OverviewState> {
<Row className="experiment">
{/* the scroll bar all the trial profile in the searchSpace div*/}
<div className="experiment searchSpace">
<TrialPro
tiralProInfo={trialProfile}
/>
<TrialPro experiment={experimentAPI} />
</div>
</Row>
</Col>
......
import * as React from 'react';
import { Experiment } from '../../static/interface';
import MonacoEditor from 'react-monaco-editor';
import { MONACO } from '../../static/const';
interface TrialInfoProps {
tiralProInfo: Experiment;
experiment: object;
}
class TrialInfo extends React.Component<TrialInfoProps, {}> {
......@@ -13,19 +12,32 @@ class TrialInfo extends React.Component<TrialInfoProps, {}> {
super(props);
}
render() {
const { tiralProInfo } = this.props;
const showProInfo = [];
showProInfo.push({
revision: tiralProInfo.revision,
authorName: tiralProInfo.author,
trialConcurrency: tiralProInfo.runConcurren,
tuner: tiralProInfo.tuner,
assessor: tiralProInfo.assessor ? tiralProInfo.assessor : undefined,
logCollection: tiralProInfo.logCollection ? tiralProInfo.logCollection : undefined,
advisor: tiralProInfo.advisor ? tiralProInfo.advisor : undefined,
clusterMetaData: tiralProInfo.clusterMetaData ? tiralProInfo.clusterMetaData : undefined
componentWillReceiveProps(nextProps: TrialInfoProps) {
const experiments = nextProps.experiment;
Object.keys(experiments).map(key => {
switch (key) {
case 'id':
case 'logDir':
case 'startTime':
case 'endTime':
experiments[key] = undefined;
break;
case 'params':
const params = experiments[key];
Object.keys(params).map(item => {
if (item === 'experimentName' || item === 'searchSpace'
|| item === 'trainingServicePlatform') {
params[item] = undefined;
}
});
break;
default:
}
});
}
render() {
const { experiment } = this.props;
return (
<div className="profile">
<MonacoEditor
......@@ -33,7 +45,7 @@ class TrialInfo extends React.Component<TrialInfoProps, {}> {
height="380"
language="json"
theme="vs-light"
value={JSON.stringify(showProInfo[0], null, 2)}
value={JSON.stringify(experiment, null, 2)}
options={MONACO}
/>
</div>
......@@ -41,4 +53,4 @@ class TrialInfo extends React.Component<TrialInfoProps, {}> {
}
}
export default TrialInfo;
\ No newline at end of file
export default TrialInfo;
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment