Commit 3be88922 authored by suiguoxin's avatar suiguoxin
Browse files

Merge branch 'master' of git://github.com/microsoft/nni

parents b92c4ab2 5a058baf
......@@ -210,6 +210,16 @@ class MockedDataStore implements DataStore {
return result;
}
async exportTrialHpConfigs(): Promise<string> {
const ret: string = '';
return Promise.resolve(ret);
}
async getImportedData(): Promise<string[]> {
const ret: string[] = [];
return Promise.resolve(ret);
}
public getTrialJob(trialJobId: string): Promise<TrialJobInfo> {
throw new Error("Method not implemented.");
}
......
......@@ -72,6 +72,7 @@ class NNIRestHandler {
this.addTrialJob(router);
this.cancelTrialJob(router);
this.getMetricData(router);
this.exportData(router);
// Express-joi-validator configuration
router.use((err: any, req: Request, res: Response, next: any) => {
......@@ -261,6 +262,16 @@ class NNIRestHandler {
});
}
private exportData(router: Router): void {
router.get('/export-data', (req: Request, res: Response) => {
this.nniManager.exportData().then((exportedData: string) => {
res.send(exportedData);
}).catch((err: Error) => {
this.handle_error(err, res);
});
});
}
private setErrorPathForFailedJob(jobInfo: TrialJobInfo): TrialJobInfo {
if (jobInfo === undefined || jobInfo.status !== 'FAILED' || jobInfo.logPath === undefined) {
return jobInfo;
......
......@@ -31,10 +31,14 @@ export namespace ValidationSchemas {
passwd: joi.string(),
sshKeyPath: joi.string(),
passphrase: joi.string(),
gpuIndices: joi.string()
gpuIndices: joi.string(),
maxTrialNumPerGpu: joi.number(),
useActiveGpu: joi.boolean()
})),
local_config: joi.object({
gpuIndices: joi.string()
gpuIndices: joi.string(),
maxTrialNumPerGpu: joi.number(),
useActiveGpu: joi.boolean()
}),
trial_config: joi.object({
image: joi.string().min(1),
......
......@@ -49,6 +49,10 @@ export class MockedNNIManager extends Manager {
public importData(data: string): Promise<void> {
return Promise.resolve();
}
public async exportData(): Promise<string> {
const ret: string = '';
return Promise.resolve(ret);
}
public getTrialJobStatistics(): Promise<TrialJobStatistics[]> {
const deferred: Deferred<TrialJobStatistics[]> = new Deferred<TrialJobStatistics[]>();
deferred.resolve([{
......
......@@ -71,14 +71,15 @@ class GPUScheduler {
execScript(gpuMetricsCollectorScriptPath)
}
public getAvailableGPUIndices(): number[] {
public getAvailableGPUIndices(useActiveGpu: boolean, occupiedGpuIndexNumMap: Map<number, number>): number[] {
if (this.gpuSummary !== undefined) {
if(process.platform === 'win32') {
if(process.platform === 'win32' || useActiveGpu) {
return this.gpuSummary.gpuInfos.map((info: GPUInfo) => info.index);
}
else{
return this.gpuSummary.gpuInfos.filter((info: GPUInfo) => info.activeProcessNum === 0)
.map((info: GPUInfo) => info.index);
return this.gpuSummary.gpuInfos.filter((info: GPUInfo) =>
occupiedGpuIndexNumMap.get(info.index) === undefined && info.activeProcessNum === 0 ||
occupiedGpuIndexNumMap.get(info.index) !== undefined).map((info: GPUInfo) => info.index);
}
}
......
......@@ -97,11 +97,19 @@ class LocalTrialJobDetail implements TrialJobDetail {
* Local training service config
*/
class LocalConfig {
public maxTrialNumPerGpu?: number;
public gpuIndices?: string;
constructor(gpuIndices?: string) {
public useActiveGpu?: boolean;
constructor(gpuIndices?: string, maxTrialNumPerGpu?: number, useActiveGpu?: boolean) {
if (gpuIndices !== undefined) {
this.gpuIndices = gpuIndices;
}
if (maxTrialNumPerGpu !== undefined) {
this.maxTrialNumPerGpu = maxTrialNumPerGpu;
}
if (useActiveGpu !== undefined) {
this.useActiveGpu = useActiveGpu;
}
}
}
......@@ -117,13 +125,15 @@ class LocalTrainingService implements TrainingService {
private rootDir!: string;
private trialSequenceId: number;
private gpuScheduler!: GPUScheduler;
private occupiedGpuIndices: Set<number>;
private occupiedGpuIndexNumMap: Map<number, number>;
private designatedGpuIndices!: Set<number>;
private log: Logger;
private localTrailConfig?: TrialConfig;
private localConfig?: LocalConfig;
private isMultiPhase: boolean = false;
private isMultiPhase: boolean;
private jobStreamMap: Map<string, ts.Stream>;
private maxTrialNumPerGpu: number;
private useActiveGpu: boolean;
constructor() {
this.eventEmitter = new EventEmitter();
......@@ -135,7 +145,10 @@ class LocalTrainingService implements TrainingService {
this.trialSequenceId = -1;
this.jobStreamMap = new Map<string, ts.Stream>();
this.log.info('Construct local machine training service.');
this.occupiedGpuIndices = new Set<number>();
this.occupiedGpuIndexNumMap = new Map<number, number>();
this.maxTrialNumPerGpu = 1;
this.useActiveGpu = false;
this.isMultiPhase = false;
}
public async run(): Promise<void> {
......@@ -304,6 +317,13 @@ class LocalTrainingService implements TrainingService {
throw new Error('gpuIndices can not be empty if specified.');
}
}
if (this.localConfig.maxTrialNumPerGpu !== undefined) {
this.maxTrialNumPerGpu = this.localConfig.maxTrialNumPerGpu;
}
if (this.localConfig.useActiveGpu !== undefined) {
this.useActiveGpu = this.localConfig.useActiveGpu;
}
break;
case TrialConfigMetadataKey.MULTI_PHASE:
this.isMultiPhase = (value === 'true' || value === 'True');
......@@ -356,7 +376,14 @@ class LocalTrainingService implements TrainingService {
if (trialJob.gpuIndices !== undefined && trialJob.gpuIndices.length > 0 && this.gpuScheduler !== undefined) {
if (oldStatus === 'RUNNING' && trialJob.status !== 'RUNNING') {
for (const index of trialJob.gpuIndices) {
this.occupiedGpuIndices.delete(index);
let num: number | undefined = this.occupiedGpuIndexNumMap.get(index);
if(num === undefined) {
throw new Error(`gpu resource schedule error`);
} else if(num === 1) {
this.occupiedGpuIndexNumMap.delete(index);
} else {
this.occupiedGpuIndexNumMap.set(index, num - 1)
}
}
}
}
......@@ -396,8 +423,14 @@ class LocalTrainingService implements TrainingService {
return [true, resource];
}
let selectedGPUIndices: number[] = this.gpuScheduler.getAvailableGPUIndices()
.filter((index: number) => !this.occupiedGpuIndices.has(index));
let selectedGPUIndices: number[] = [];
let availableGpuIndices: number[] = this.gpuScheduler.getAvailableGPUIndices(this.useActiveGpu, this.occupiedGpuIndexNumMap);
for(let index of availableGpuIndices) {
let num: number | undefined = this.occupiedGpuIndexNumMap.get(index);
if(num === undefined || num < this.maxTrialNumPerGpu) {
selectedGPUIndices.push(index);
}
}
if (this.designatedGpuIndices !== undefined) {
this.checkSpecifiedGpuIndices();
......@@ -428,7 +461,12 @@ class LocalTrainingService implements TrainingService {
private occupyResource(resource: {gpuIndices: number[]}): void {
if (this.gpuScheduler !== undefined) {
for (const index of resource.gpuIndices) {
this.occupiedGpuIndices.add(index);
let num: number | undefined = this.occupiedGpuIndexNumMap.get(index);
if(num === undefined) {
this.occupiedGpuIndexNumMap.set(index, 1)
} else {
this.occupiedGpuIndexNumMap.set(index, num + 1)
}
}
}
}
......
......@@ -23,7 +23,8 @@ import * as assert from 'assert';
import { getLogger, Logger } from '../../common/log';
import { randomSelect } from '../../common/utils';
import { GPUInfo } from '../common/gpuData';
import { parseGpuIndices, RemoteMachineMeta, RemoteMachineScheduleResult, ScheduleResultType, SSHClientManager } from './remoteMachineData';
import { RemoteMachineTrialJobDetail, parseGpuIndices, RemoteMachineMeta, RemoteMachineScheduleResult, ScheduleResultType, SSHClientManager } from './remoteMachineData';
import { TrialJobDetail } from 'common/trainingService';
/**
* A simple GPU scheduler implementation
......@@ -45,7 +46,7 @@ export class GPUScheduler {
* Schedule a machine according to the constraints (requiredGPUNum)
* @param requiredGPUNum required GPU number
*/
public scheduleMachine(requiredGPUNum: number, trialJobId : string) : RemoteMachineScheduleResult {
public scheduleMachine(requiredGPUNum: number, trialJobDetail : RemoteMachineTrialJobDetail) : RemoteMachineScheduleResult {
assert(requiredGPUNum >= 0);
const allRMs: RemoteMachineMeta[] = Array.from(this.machineSSHClientMap.keys());
assert(allRMs.length > 0);
......@@ -66,7 +67,7 @@ export class GPUScheduler {
// Currenty the requireGPUNum parameter for all trial jobs are identical.
if (requiredGPUNum > 0) {
// Trial job requires GPU
const result: RemoteMachineScheduleResult | undefined = this.scheduleGPUHost(requiredGPUNum, trialJobId);
const result: RemoteMachineScheduleResult | undefined = this.scheduleGPUHost(requiredGPUNum, trialJobDetail);
if (result !== undefined) {
return result;
}
......@@ -74,9 +75,9 @@ export class GPUScheduler {
// Trail job does not need GPU
const allocatedRm: RemoteMachineMeta = this.selectMachine(allRMs);
return this.allocateHost(requiredGPUNum, allocatedRm, [], trialJobId);
return this.allocateHost(requiredGPUNum, allocatedRm, [], trialJobDetail);
}
this.log.warning(`Scheduler: trialJob id ${trialJobId}, no machine can be scheduled, return TMP_NO_AVAILABLE_GPU `);
this.log.warning(`Scheduler: trialJob id ${trialJobDetail.id}, no machine can be scheduled, return TMP_NO_AVAILABLE_GPU `);
return {
resultType : ScheduleResultType.TMP_NO_AVAILABLE_GPU,
......@@ -87,21 +88,35 @@ export class GPUScheduler {
/**
* remove the job's gpu reversion
*/
public removeGpuReservation(trialJobId: string, rmMeta?: RemoteMachineMeta): void {
// If remote machine has no GPU, gpuReservcation is not initialized, so check if it's undefined
if (rmMeta !== undefined && rmMeta.gpuReservation !== undefined) {
rmMeta.gpuReservation.forEach((reserveTrialJobId : string, gpuIndex : number) => {
if (reserveTrialJobId === trialJobId) {
rmMeta.gpuReservation.delete(gpuIndex);
public removeGpuReservation(trialJobId: string, trialJobMap: Map<string, RemoteMachineTrialJobDetail>): void {
let trialJobDetail: RemoteMachineTrialJobDetail | undefined = trialJobMap.get(trialJobId);
if(trialJobDetail === undefined) {
throw new Error(`could not get trialJobDetail by id ${trialJobId}`);
}
if (trialJobDetail.rmMeta !== undefined &&
trialJobDetail.rmMeta.occupiedGpuIndexMap !== undefined &&
trialJobDetail.gpuIndices !== undefined &&
trialJobDetail.gpuIndices.length > 0) {
for (const gpuInfo of trialJobDetail.gpuIndices) {
let num: number | undefined = trialJobDetail.rmMeta.occupiedGpuIndexMap.get(gpuInfo.index);
if(num !== undefined) {
if(num === 1) {
trialJobDetail.rmMeta.occupiedGpuIndexMap.delete(gpuInfo.index);
} else {
trialJobDetail.rmMeta.occupiedGpuIndexMap.set(gpuInfo.index, num - 1)
}
}
}
});
}
trialJobDetail.gpuIndices = [];
trialJobMap.set(trialJobId, trialJobDetail);
}
private scheduleGPUHost(requiredGPUNum: number, trialJobId: string): RemoteMachineScheduleResult | undefined {
private scheduleGPUHost(requiredGPUNum: number, trialJobDetail: RemoteMachineTrialJobDetail): RemoteMachineScheduleResult | undefined {
const totalResourceMap: Map<RemoteMachineMeta, GPUInfo[]> = this.gpuResourceDetection();
const qualifiedRMs: RemoteMachineMeta[] = [];
totalResourceMap.forEach((gpuInfos: GPUInfo[], rmMeta: RemoteMachineMeta) => {
if (gpuInfos !== undefined && gpuInfos.length >= requiredGPUNum) {
qualifiedRMs.push(rmMeta);
}
......@@ -110,7 +125,7 @@ export class GPUScheduler {
const allocatedRm: RemoteMachineMeta = this.selectMachine(qualifiedRMs);
const gpuInfos: GPUInfo[] | undefined = totalResourceMap.get(allocatedRm);
if (gpuInfos !== undefined) { // should always true
return this.allocateHost(requiredGPUNum, allocatedRm, gpuInfos, trialJobId);
return this.allocateHost(requiredGPUNum, allocatedRm, gpuInfos, trialJobDetail);
} else {
assert(false, 'gpuInfos is undefined');
}
......@@ -130,9 +145,6 @@ export class GPUScheduler {
// Assgin totoal GPU count as init available GPU number
if (rmMeta.gpuSummary !== undefined) {
const availableGPUs: GPUInfo[] = [];
if (rmMeta.gpuReservation === undefined) {
rmMeta.gpuReservation = new Map<number, string>();
}
const designatedGpuIndices: Set<number> | undefined = parseGpuIndices(rmMeta.gpuIndices);
if (designatedGpuIndices !== undefined) {
for (const gpuIndex of designatedGpuIndices) {
......@@ -145,11 +157,21 @@ export class GPUScheduler {
rmMeta.gpuSummary.gpuInfos.forEach((gpuInfo: GPUInfo) => {
// if the GPU has active process, OR be reserved by a job,
// or index not in gpuIndices configuration in machineList,
// or trial number on a GPU reach max number,
// We should NOT allocate this GPU
if (gpuInfo.activeProcessNum === 0 && !rmMeta.gpuReservation.has(gpuInfo.index)
&& (designatedGpuIndices === undefined || designatedGpuIndices.has(gpuInfo.index))) {
// if users set useActiveGpu, use the gpu whether there is another activeProcess
if (designatedGpuIndices === undefined || designatedGpuIndices.has(gpuInfo.index)) {
if(rmMeta.occupiedGpuIndexMap !== undefined) {
let num = rmMeta.occupiedGpuIndexMap.get(gpuInfo.index);
let maxTrialNumPerGpu: number = rmMeta.maxTrialNumPerGpu? rmMeta.maxTrialNumPerGpu: 1;
if((num === undefined && (!rmMeta.useActiveGpu && gpuInfo.activeProcessNum === 0 || rmMeta.useActiveGpu)) ||
(num !== undefined && num < maxTrialNumPerGpu)) {
availableGPUs.push(gpuInfo);
}
} else {
throw new Error(`occupiedGpuIndexMap initialize error!`);
}
}
});
totalResourceMap.set(rmMeta, availableGPUs);
}
......@@ -170,14 +192,22 @@ export class GPUScheduler {
}
private allocateHost(requiredGPUNum: number, rmMeta: RemoteMachineMeta,
gpuInfos: GPUInfo[], trialJobId: string): RemoteMachineScheduleResult {
gpuInfos: GPUInfo[], trialJobDetail: RemoteMachineTrialJobDetail): RemoteMachineScheduleResult {
assert(gpuInfos.length >= requiredGPUNum);
const allocatedGPUs: GPUInfo[] = this.selectGPUsForTrial(gpuInfos, requiredGPUNum);
allocatedGPUs.forEach((gpuInfo: GPUInfo) => {
rmMeta.gpuReservation.set(gpuInfo.index, trialJobId);
if(rmMeta.occupiedGpuIndexMap !== undefined) {
let num = rmMeta.occupiedGpuIndexMap.get(gpuInfo.index);
if(num === undefined) {
num = 0;
}
rmMeta.occupiedGpuIndexMap.set(gpuInfo.index, num + 1);
}else {
throw new Error(`Machine ${rmMeta.ip} occupiedGpuIndexMap initialize error!`);
}
});
trialJobDetail.gpuIndices = allocatedGPUs;
trialJobDetail.rmMeta = rmMeta;
return {
resultType: ScheduleResultType.SUCCEED,
scheduleInfo: {
......
......@@ -23,7 +23,7 @@ import * as fs from 'fs';
import { Client, ConnectConfig } from 'ssh2';
import { Deferred } from 'ts-deferred';
import { JobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../common/trainingService';
import { GPUSummary } from '../common/gpuData';
import { GPUSummary, GPUInfo } from '../common/gpuData';
/**
* Metadata of remote machine for configuration and statuc query
......@@ -36,20 +36,23 @@ export class RemoteMachineMeta {
public readonly sshKeyPath?: string;
public readonly passphrase?: string;
public gpuSummary : GPUSummary | undefined;
// GPU Reservation info, the key is GPU index, the value is the job id which reserves this GPU
public gpuReservation : Map<number, string>;
public readonly gpuIndices?: string;
public readonly maxTrialNumPerGpu?: number;
public occupiedGpuIndexMap: Map<number, number>;
public readonly useActiveGpu?: boolean = false;
constructor(ip : string, port : number, username : string, passwd : string,
sshKeyPath: string, passphrase : string, gpuIndices?: string) {
sshKeyPath: string, passphrase : string, gpuIndices?: string, maxTrialNumPerGpu?: number, useActiveGpu?: boolean) {
this.ip = ip;
this.port = port;
this.username = username;
this.passwd = passwd;
this.sshKeyPath = sshKeyPath;
this.passphrase = passphrase;
this.gpuReservation = new Map<number, string>();
this.gpuIndices = gpuIndices;
this.maxTrialNumPerGpu = maxTrialNumPerGpu;
this.occupiedGpuIndexMap = new Map<number, number>();
this.useActiveGpu = useActiveGpu;
}
}
......@@ -97,6 +100,7 @@ export class RemoteMachineTrialJobDetail implements TrialJobDetail {
public sequenceId: number;
public rmMeta?: RemoteMachineMeta;
public isEarlyStopped?: boolean;
public gpuIndices: GPUInfo[];
constructor(id: string, status: TrialJobStatus, submitTime: number,
workingDirectory: string, form: JobApplicationForm, sequenceId: number) {
......@@ -107,6 +111,7 @@ export class RemoteMachineTrialJobDetail implements TrialJobDetail {
this.form = form;
this.sequenceId = sequenceId;
this.tags = [];
this.gpuIndices = []
}
}
......
......@@ -282,7 +282,7 @@ class RemoteMachineTrainingService implements TrainingService {
private updateGpuReservation() {
for (const [key, value] of this.trialJobsMap) {
if(!['WAITING', 'RUNNING'].includes(value.status)) {
this.gpuScheduler.removeGpuReservation(value.id, value.rmMeta);
this.gpuScheduler.removeGpuReservation(key, this.trialJobsMap);
}
};
}
......@@ -521,7 +521,7 @@ class RemoteMachineTrainingService implements TrainingService {
return deferred.promise;
}
// get an ssh client from scheduler
const rmScheduleResult: RemoteMachineScheduleResult = this.gpuScheduler.scheduleMachine(this.trialConfig.gpuNum, trialJobId);
const rmScheduleResult: RemoteMachineScheduleResult = this.gpuScheduler.scheduleMachine(this.trialConfig.gpuNum, trialJobDetail);
if (rmScheduleResult.resultType === ScheduleResultType.REQUIRE_EXCEED_TOTAL) {
const errorMessage : string = `Required GPU number ${this.trialConfig.gpuNum} is too large, no machine can meet`;
this.log.error(errorMessage);
......@@ -542,6 +542,7 @@ class RemoteMachineTrainingService implements TrainingService {
trialJobDetail.url = `file://${rmScheduleInfo.rmMeta.ip}:${trialWorkingFolder}`;
trialJobDetail.startTime = Date.now();
this.trialJobsMap.set(trialJobId, trialJobDetail);
deferred.resolve(true);
} else if (rmScheduleResult.resultType === ScheduleResultType.TMP_NO_AVAILABLE_GPU) {
this.log.info(`Right now no available GPU can be allocated for trial ${trialJobId}, will try to schedule later`);
......
......@@ -31,7 +31,7 @@ import ConfigSpace.hyperparameters as CSH
from nni.protocol import CommandType, send
from nni.msg_dispatcher_base import MsgDispatcherBase
from nni.utils import OptimizeMode, extract_scalar_reward
from nni.utils import OptimizeMode, extract_scalar_reward, randint_to_quniform
from .config_generator import CG_BOHB
......@@ -443,6 +443,7 @@ class BOHB(MsgDispatcherBase):
search space of this experiment
"""
search_space = data
randint_to_quniform(search_space)
cs = CS.ConfigurationSpace()
for var in search_space:
_type = str(search_space[var]["_type"])
......
# Naive Evolution Tuner
## Naive Evolution(进化算法)
进化算法来自于 [Large-Scale Evolution of Image Classifiers](https://arxiv.org/pdf/1703.01041.pdf)。 它会基于搜索空间随机生成一个种群。 在每一代中,会选择较好的结果,并对其下一代进行一些变异(例如,改动一个超参,增加或减少一层)。 进化算法需要很多次 Trial 才能有效,但它也非常简单,也很容易扩展新功能。
\ No newline at end of file
......@@ -26,7 +26,7 @@ import random
import numpy as np
from nni.tuner import Tuner
from nni.utils import NodeType, OptimizeMode, extract_scalar_reward, split_index
from nni.utils import NodeType, OptimizeMode, extract_scalar_reward, split_index, randint_to_quniform
import nni.parameter_expressions as parameter_expressions
......@@ -175,6 +175,7 @@ class EvolutionTuner(Tuner):
search_space : dict
"""
self.searchspace_json = search_space
randint_to_quniform(self.searchspace_json)
self.space = json2space(self.searchspace_json)
self.random_state = np.random.RandomState()
......
# Grid Search
## Grid Search(遍历搜索)
Grid Search 会穷举定义在搜索空间文件中的所有超参组合。 注意,搜索空间仅支持 `choice`, `quniform`, `qloguniform``quniform``qloguniform` 中的 **数字 `q` 有不同的含义(与[搜索空间](../../../../../docs/zh_CN/SearchSpaceSpec.md)说明不同)。 这里的意义是在 `low``high` 之间均匀取值的数量。</p>
\ No newline at end of file
# NNI 中使用 Hyperband
## 1. 介绍
[Hyperband](https://arxiv.org/pdf/1603.06560.pdf) 是一种流行的自动机器学习算法。 Hyperband 的基本思想是对配置分组,每组有 `n` 个随机生成的超参配置,每个配置使用 `r` 次资源(如,epoch 数量,批处理数量等)。 当 `n` 个配置完成后,会选择最好的 `n/eta` 个配置,并增加 `r*eta` 次使用的资源。 最后,会选择出的最好配置。
## 2. 实现并行
首先,此样例是基于 MsgDispatcherBase 来实现的自动机器学习算法,而不是基于 Tuner 和Assessor。 这种实现方法下,Hyperband 集成了 Tuner 和 Assessor 两者的功能,因而将它叫做 Advisor。
其次,本实现完全利用了 Hyperband 内部的并行性。 具体来说,下一个分组不会严格的在当前分组结束后再运行,只要有资源,就可以开始运行新的分组。
## 3. 用法
要使用 Hyperband,需要在 Experiment 的 YAML 配置文件进行如下改动。
advisor:
#可选项: Hyperband
builtinAdvisorName: Hyperband
classArgs:
#R: 最大的步骤
R: 100
#eta: 丢弃的 Trial 的比例
eta: 3
#可选项: maximize, minimize
optimize_mode: maximize
注意,一旦使用了 Advisor,就不能在配置文件中添加 Tuner 和 Assessor。 使用 Hyperband 时,Trial 代码收到的超参(如键值对)中,除了用户定义的超参,会多一个 `STEPS`**使用 `STEPS`,Trial 能够控制其运行的时间。</p>
对于 Trial 代码中 `report_intermediate_result(metric)``report_final_result(metric)`**`指标` 应该是数值,或者用一个 dict,并保证其中有键值为 default 的项目,其值也为数值型**。 这是需要进行最大化或者最小化优化的数值,如精度或者损失度。
`R``eta` 是 Hyperband 中可以改动的参数。 `R` 表示可以分配给配置的最大步数(STEPS)。 这里,STEPS 可以代表 epoch 或 批处理数量。 `STEPS` 应该被 Trial 代码用来控制运行的次数。 参考样例 `examples/trials/mnist-hyperband/` ,了解详细信息。
`eta` 表示 `n` 个配置中的 `n/eta` 个配置会留存下来,并用更多的 STEPS 来运行。
下面是 `R=81``eta=3` 时的样例:
| | s=4 | s=3 | s=2 | s=1 | s=0 |
| - | ---- | ---- | ---- | ---- | ---- |
| i | n r | n r | n r | n r | n r |
| 0 | 81 1 | 27 3 | 9 9 | 6 27 | 5 81 |
| 1 | 27 3 | 9 9 | 3 27 | 2 81 | |
| 2 | 9 9 | 3 27 | 1 81 | | |
| 3 | 3 27 | 1 81 | | | |
| 4 | 1 81 | | | | |
`s` 表示分组, `n` 表示生成的配置数量,相应的 `r` 表示配置会运行多少 STEPS。 `i` 表示轮数,如分组 4 有 5 轮,分组 3 有 4 轮。
关于如何实现 Trial 代码,参考 `examples/trials/mnist-hyperband/` 中的说明。
## 4. 待改进
当前实现的 Hyperband 算法可以通过改进支持的提前终止算法来提高,原因是最好的 `n/eta` 个配置并不一定都表现很好。 不好的配置可以更早的终止。
在当前实现中,遵循了[此论文](https://arxiv.org/pdf/1603.06560.pdf)的设计,配置都是随机生成的。 要进一步提升,配置生成过程可以利用更高级的算法。
\ No newline at end of file
......@@ -31,7 +31,7 @@ import json_tricks
from nni.protocol import CommandType, send
from nni.msg_dispatcher_base import MsgDispatcherBase
from nni.common import init_logger
from nni.utils import NodeType, OptimizeMode, extract_scalar_reward
from nni.utils import NodeType, OptimizeMode, extract_scalar_reward, randint_to_quniform
import nni.parameter_expressions as parameter_expressions
_logger = logging.getLogger(__name__)
......@@ -357,6 +357,7 @@ class Hyperband(MsgDispatcherBase):
number of trial jobs
"""
self.searchspace_json = data
randint_to_quniform(self.searchspace_json)
self.random_state = np.random.RandomState()
def handle_trial_end(self, data):
......
# TPE, Random Search, Anneal Tuners
## TPE
Tree-structured Parzen Estimator (TPE) 是一种 sequential model-based optimization(SMBO,即基于序列模型优化)的方法。 SMBO 方法根据历史指标数据来按顺序构造模型,来估算超参的性能,随后基于此模型来选择新的超参。 TPE 方法对 P(x|y) 和 P(y) 建模,其中 x 表示超参,y 表示相关的评估指标。 P(x|y) 通过变换超参的生成过程来建模,用非参数密度(non-parametric densities)代替配置的先验分布。 细节可参考 [Algorithms for Hyper-Parameter Optimization](https://papers.nips.cc/paper/4443-algorithms-for-hyper-parameter-optimization.pdf)。 ​
## Random Search(随机搜索)
[Random Search for Hyper-Parameter Optimization](http://www.jmlr.org/papers/volume13/bergstra12a/bergstra12a.pdf) 中介绍了随机搜索惊人的简单和效果。 建议当不清楚超参的先验分布时,采用随机搜索作为基准。
## Anneal(退火算法)
这种简单的退火算法从先前的采样开始,会越来越靠近发现的最佳点取样。 此算法是随机搜索的简单变体,利用了响应面的平滑性。 退火率不是自适应的。
\ No newline at end of file
......@@ -27,7 +27,7 @@ import logging
import hyperopt as hp
import numpy as np
from nni.tuner import Tuner
from nni.utils import NodeType, OptimizeMode, extract_scalar_reward, split_index
from nni.utils import NodeType, OptimizeMode, extract_scalar_reward, split_index, randint_to_quniform
logger = logging.getLogger('hyperopt_AutoML')
......@@ -153,14 +153,14 @@ def _add_index(in_x, parameter):
Will change to format in hyperopt, like:
{'dropout_rate': 0.8, 'conv_size': {'_index': 1, '_value': 3}, 'hidden_size': {'_index': 1, '_value': 512}}
"""
if TYPE not in in_x: # if at the top level
if NodeType.TYPE not in in_x: # if at the top level
out_y = dict()
for key, value in parameter.items():
out_y[key] = _add_index(in_x[key], value)
return out_y
elif isinstance(in_x, dict):
value_type = in_x[TYPE]
value_format = in_x[VALUE]
value_type = in_x[NodeType.TYPE]
value_format = in_x[NodeType.VALUE]
if value_type == "choice":
choice_name = parameter[0] if isinstance(parameter,
list) else parameter
......@@ -173,15 +173,14 @@ def _add_index(in_x, parameter):
choice_value_format = item[1]
if choice_key == choice_name:
return {
INDEX:
pos,
VALUE: [
NodeType.INDEX: pos,
NodeType.VALUE: [
choice_name,
_add_index(choice_value_format, parameter[1])
]
}
elif choice_name == item:
return {INDEX: pos, VALUE: item}
return {NodeType.INDEX: pos, NodeType.VALUE: item}
else:
return parameter
......@@ -232,6 +231,8 @@ class HyperoptTuner(Tuner):
search_space : dict
"""
self.json = search_space
randint_to_quniform(self.json)
search_space_instance = json2space(self.json)
rstate = np.random.RandomState()
trials = hp.Trials()
......
# Medianstop Assessor
## Median Stop
Medianstop 是一种简单的提前终止 Trial 的策略,可参考[论文](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/46180.pdf)。 如果 Trial X 的在步骤 S 的最好目标值比所有已完成 Trial 的步骤 S 的中位数值明显要低,这个 Trial 就会被提前停止。
\ No newline at end of file
# Metis Tuner
## Metis Tuner
大多数调参工具仅仅预测最优配置,而 [Metis](https://www.microsoft.com/en-us/research/publication/metis-robustly-tuning-tail-latencies-cloud-systems/) 的优势在于有两个输出:(a) 最优配置的当前预测结果, 以及 (b) 下一次 Trial 的建议。 不再需要随机猜测!
大多数工具假设训练集没有噪声数据,但 Metis 会知道是否需要对某个超参重新采样。
大多数工具都有着重于在已有结果上继续发展的问题,而 Metis 的搜索策略可以在探索,发展和重新采样(可选)中进行平衡。
Metis 属于基于序列的贝叶斯优化 (SMBO) 的类别,它也基于贝叶斯优化框架。 为了对超参-性能空间建模,Metis 同时使用了高斯过程(Gaussian Process)和高斯混合模型(GMM)。 由于每次 Trial 都可能有很高的时间成本,Metis 大量使用了已有模型来进行推理计算。 在每次迭代中,Metis 执行两个任务:
在高斯过程空间中找到全局最优点。 这一点表示了最佳配置。
它会标识出下一个超参的候选项。 这是通过对隐含信息的探索、挖掘和重采样来实现的。
注意,搜索空间仅支持 `choice`, `quniform`, `uniform``randint`
更多详情,参考论文:https://www.microsoft.com/en-us/research/publication/metis-robustly-tuning-tail-latencies-cloud-systems/
\ No newline at end of file
......@@ -133,7 +133,7 @@ class MetisTuner(Tuner):
self.x_bounds[idx] = bounds
self.x_types[idx] = 'discrete_int'
elif key_type == 'randint':
self.x_bounds[idx] = [0, key_range[0]]
self.x_bounds[idx] = [key_range[0], key_range[1]]
self.x_types[idx] = 'range_int'
elif key_type == 'uniform':
self.x_bounds[idx] = [key_range[0], key_range[1]]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment