"runner/git@developer.sourcefind.cn:OpenDAS/ollama.git" did not exist on "3892c3a7032c99db250c3266276c4525d243950a"
Commit 3be88922 authored by suiguoxin's avatar suiguoxin
Browse files

Merge branch 'master' of git://github.com/microsoft/nni

parents b92c4ab2 5a058baf
...@@ -210,6 +210,16 @@ class MockedDataStore implements DataStore { ...@@ -210,6 +210,16 @@ class MockedDataStore implements DataStore {
return result; return result;
} }
/**
 * Stubbed export of trial hyper-parameter configs.
 *
 * @returns a promise that resolves to the empty string; nothing is read or stored.
 */
async exportTrialHpConfigs(): Promise<string> {
    // An async method wraps its return value in a promise on its own.
    return '';
}
/**
 * Stubbed lookup of imported data.
 *
 * @returns a promise that resolves to a fresh empty array on every call.
 */
async getImportedData(): Promise<string[]> {
    const noImportedData: string[] = [];

    return noImportedData;
}
public getTrialJob(trialJobId: string): Promise<TrialJobInfo> { public getTrialJob(trialJobId: string): Promise<TrialJobInfo> {
throw new Error("Method not implemented."); throw new Error("Method not implemented.");
} }
......
...@@ -72,6 +72,7 @@ class NNIRestHandler { ...@@ -72,6 +72,7 @@ class NNIRestHandler {
this.addTrialJob(router); this.addTrialJob(router);
this.cancelTrialJob(router); this.cancelTrialJob(router);
this.getMetricData(router); this.getMetricData(router);
this.exportData(router);
// Express-joi-validator configuration // Express-joi-validator configuration
router.use((err: any, req: Request, res: Response, next: any) => { router.use((err: any, req: Request, res: Response, next: any) => {
...@@ -261,6 +262,16 @@ class NNIRestHandler { ...@@ -261,6 +262,16 @@ class NNIRestHandler {
}); });
} }
/**
 * Registers the `GET /export-data` route on the given router.
 *
 * The handler asks `nniManager.exportData()` for the exported string and
 * sends it as the response body. Any rejection, including an error thrown
 * while sending, is passed to `handle_error` together with the response.
 *
 * @param router router the route is attached to
 */
private exportData(router: Router): void {
    router.get('/export-data', (req: Request, res: Response) => {
        const sendExport = (exportedData: string): void => {
            res.send(exportedData);
        };
        const reportFailure = (err: Error): void => {
            this.handle_error(err, res);
        };

        // Keep `.catch` chained after `.then` so failures inside sendExport are reported too.
        this.nniManager.exportData().then(sendExport).catch(reportFailure);
    });
}
private setErrorPathForFailedJob(jobInfo: TrialJobInfo): TrialJobInfo { private setErrorPathForFailedJob(jobInfo: TrialJobInfo): TrialJobInfo {
if (jobInfo === undefined || jobInfo.status !== 'FAILED' || jobInfo.logPath === undefined) { if (jobInfo === undefined || jobInfo.status !== 'FAILED' || jobInfo.logPath === undefined) {
return jobInfo; return jobInfo;
......
...@@ -31,10 +31,14 @@ export namespace ValidationSchemas { ...@@ -31,10 +31,14 @@ export namespace ValidationSchemas {
passwd: joi.string(), passwd: joi.string(),
sshKeyPath: joi.string(), sshKeyPath: joi.string(),
passphrase: joi.string(), passphrase: joi.string(),
gpuIndices: joi.string() gpuIndices: joi.string(),
maxTrialNumPerGpu: joi.number(),
useActiveGpu: joi.boolean()
})), })),
local_config: joi.object({ local_config: joi.object({
gpuIndices: joi.string() gpuIndices: joi.string(),
maxTrialNumPerGpu: joi.number(),
useActiveGpu: joi.boolean()
}), }),
trial_config: joi.object({ trial_config: joi.object({
image: joi.string().min(1), image: joi.string().min(1),
......
...@@ -49,6 +49,10 @@ export class MockedNNIManager extends Manager { ...@@ -49,6 +49,10 @@ export class MockedNNIManager extends Manager {
public importData(data: string): Promise<void> { public importData(data: string): Promise<void> {
return Promise.resolve(); return Promise.resolve();
} }
/**
 * Stubbed data export.
 *
 * @returns a promise that resolves to the empty string.
 */
public async exportData(): Promise<string> {
    // An async method wraps its return value in a promise on its own.
    return '';
}
public getTrialJobStatistics(): Promise<TrialJobStatistics[]> { public getTrialJobStatistics(): Promise<TrialJobStatistics[]> {
const deferred: Deferred<TrialJobStatistics[]> = new Deferred<TrialJobStatistics[]>(); const deferred: Deferred<TrialJobStatistics[]> = new Deferred<TrialJobStatistics[]>();
deferred.resolve([{ deferred.resolve([{
......
...@@ -71,14 +71,15 @@ class GPUScheduler { ...@@ -71,14 +71,15 @@ class GPUScheduler {
execScript(gpuMetricsCollectorScriptPath) execScript(gpuMetricsCollectorScriptPath)
} }
public getAvailableGPUIndices(): number[] { public getAvailableGPUIndices(useActiveGpu: boolean, occupiedGpuIndexNumMap: Map<number, number>): number[] {
if (this.gpuSummary !== undefined) { if (this.gpuSummary !== undefined) {
if(process.platform === 'win32') { if(process.platform === 'win32' || useActiveGpu) {
return this.gpuSummary.gpuInfos.map((info: GPUInfo) => info.index); return this.gpuSummary.gpuInfos.map((info: GPUInfo) => info.index);
} }
else{ else{
return this.gpuSummary.gpuInfos.filter((info: GPUInfo) => info.activeProcessNum === 0) return this.gpuSummary.gpuInfos.filter((info: GPUInfo) =>
.map((info: GPUInfo) => info.index); occupiedGpuIndexNumMap.get(info.index) === undefined && info.activeProcessNum === 0 ||
occupiedGpuIndexNumMap.get(info.index) !== undefined).map((info: GPUInfo) => info.index);
} }
} }
......
...@@ -97,11 +97,19 @@ class LocalTrialJobDetail implements TrialJobDetail { ...@@ -97,11 +97,19 @@ class LocalTrialJobDetail implements TrialJobDetail {
* Local training service config * Local training service config
*/ */
class LocalConfig { class LocalConfig {
public maxTrialNumPerGpu?: number;
public gpuIndices?: string; public gpuIndices?: string;
constructor(gpuIndices?: string) { public useActiveGpu?: boolean;
constructor(gpuIndices?: string, maxTrialNumPerGpu?: number, useActiveGpu?: boolean) {
if (gpuIndices !== undefined) { if (gpuIndices !== undefined) {
this.gpuIndices = gpuIndices; this.gpuIndices = gpuIndices;
} }
if (maxTrialNumPerGpu !== undefined) {
this.maxTrialNumPerGpu = maxTrialNumPerGpu;
}
if (useActiveGpu !== undefined) {
this.useActiveGpu = useActiveGpu;
}
} }
} }
...@@ -117,13 +125,15 @@ class LocalTrainingService implements TrainingService { ...@@ -117,13 +125,15 @@ class LocalTrainingService implements TrainingService {
private rootDir!: string; private rootDir!: string;
private trialSequenceId: number; private trialSequenceId: number;
private gpuScheduler!: GPUScheduler; private gpuScheduler!: GPUScheduler;
private occupiedGpuIndices: Set<number>; private occupiedGpuIndexNumMap: Map<number, number>;
private designatedGpuIndices!: Set<number>; private designatedGpuIndices!: Set<number>;
private log: Logger; private log: Logger;
private localTrailConfig?: TrialConfig; private localTrailConfig?: TrialConfig;
private localConfig?: LocalConfig; private localConfig?: LocalConfig;
private isMultiPhase: boolean = false; private isMultiPhase: boolean;
private jobStreamMap: Map<string, ts.Stream>; private jobStreamMap: Map<string, ts.Stream>;
private maxTrialNumPerGpu: number;
private useActiveGpu: boolean;
constructor() { constructor() {
this.eventEmitter = new EventEmitter(); this.eventEmitter = new EventEmitter();
...@@ -135,7 +145,10 @@ class LocalTrainingService implements TrainingService { ...@@ -135,7 +145,10 @@ class LocalTrainingService implements TrainingService {
this.trialSequenceId = -1; this.trialSequenceId = -1;
this.jobStreamMap = new Map<string, ts.Stream>(); this.jobStreamMap = new Map<string, ts.Stream>();
this.log.info('Construct local machine training service.'); this.log.info('Construct local machine training service.');
this.occupiedGpuIndices = new Set<number>(); this.occupiedGpuIndexNumMap = new Map<number, number>();
this.maxTrialNumPerGpu = 1;
this.useActiveGpu = false;
this.isMultiPhase = false;
} }
public async run(): Promise<void> { public async run(): Promise<void> {
...@@ -304,6 +317,13 @@ class LocalTrainingService implements TrainingService { ...@@ -304,6 +317,13 @@ class LocalTrainingService implements TrainingService {
throw new Error('gpuIndices can not be empty if specified.'); throw new Error('gpuIndices can not be empty if specified.');
} }
} }
if (this.localConfig.maxTrialNumPerGpu !== undefined) {
this.maxTrialNumPerGpu = this.localConfig.maxTrialNumPerGpu;
}
if (this.localConfig.useActiveGpu !== undefined) {
this.useActiveGpu = this.localConfig.useActiveGpu;
}
break; break;
case TrialConfigMetadataKey.MULTI_PHASE: case TrialConfigMetadataKey.MULTI_PHASE:
this.isMultiPhase = (value === 'true' || value === 'True'); this.isMultiPhase = (value === 'true' || value === 'True');
...@@ -356,7 +376,14 @@ class LocalTrainingService implements TrainingService { ...@@ -356,7 +376,14 @@ class LocalTrainingService implements TrainingService {
if (trialJob.gpuIndices !== undefined && trialJob.gpuIndices.length > 0 && this.gpuScheduler !== undefined) { if (trialJob.gpuIndices !== undefined && trialJob.gpuIndices.length > 0 && this.gpuScheduler !== undefined) {
if (oldStatus === 'RUNNING' && trialJob.status !== 'RUNNING') { if (oldStatus === 'RUNNING' && trialJob.status !== 'RUNNING') {
for (const index of trialJob.gpuIndices) { for (const index of trialJob.gpuIndices) {
this.occupiedGpuIndices.delete(index); let num: number | undefined = this.occupiedGpuIndexNumMap.get(index);
if(num === undefined) {
throw new Error(`gpu resource schedule error`);
} else if(num === 1) {
this.occupiedGpuIndexNumMap.delete(index);
} else {
this.occupiedGpuIndexNumMap.set(index, num - 1)
}
} }
} }
} }
...@@ -396,8 +423,14 @@ class LocalTrainingService implements TrainingService { ...@@ -396,8 +423,14 @@ class LocalTrainingService implements TrainingService {
return [true, resource]; return [true, resource];
} }
let selectedGPUIndices: number[] = this.gpuScheduler.getAvailableGPUIndices() let selectedGPUIndices: number[] = [];
.filter((index: number) => !this.occupiedGpuIndices.has(index)); let availableGpuIndices: number[] = this.gpuScheduler.getAvailableGPUIndices(this.useActiveGpu, this.occupiedGpuIndexNumMap);
for(let index of availableGpuIndices) {
let num: number | undefined = this.occupiedGpuIndexNumMap.get(index);
if(num === undefined || num < this.maxTrialNumPerGpu) {
selectedGPUIndices.push(index);
}
}
if (this.designatedGpuIndices !== undefined) { if (this.designatedGpuIndices !== undefined) {
this.checkSpecifiedGpuIndices(); this.checkSpecifiedGpuIndices();
...@@ -428,7 +461,12 @@ class LocalTrainingService implements TrainingService { ...@@ -428,7 +461,12 @@ class LocalTrainingService implements TrainingService {
private occupyResource(resource: {gpuIndices: number[]}): void { private occupyResource(resource: {gpuIndices: number[]}): void {
if (this.gpuScheduler !== undefined) { if (this.gpuScheduler !== undefined) {
for (const index of resource.gpuIndices) { for (const index of resource.gpuIndices) {
this.occupiedGpuIndices.add(index); let num: number | undefined = this.occupiedGpuIndexNumMap.get(index);
if(num === undefined) {
this.occupiedGpuIndexNumMap.set(index, 1)
} else {
this.occupiedGpuIndexNumMap.set(index, num + 1)
}
} }
} }
} }
......
...@@ -23,7 +23,8 @@ import * as assert from 'assert'; ...@@ -23,7 +23,8 @@ import * as assert from 'assert';
import { getLogger, Logger } from '../../common/log'; import { getLogger, Logger } from '../../common/log';
import { randomSelect } from '../../common/utils'; import { randomSelect } from '../../common/utils';
import { GPUInfo } from '../common/gpuData'; import { GPUInfo } from '../common/gpuData';
import { parseGpuIndices, RemoteMachineMeta, RemoteMachineScheduleResult, ScheduleResultType, SSHClientManager } from './remoteMachineData'; import { RemoteMachineTrialJobDetail, parseGpuIndices, RemoteMachineMeta, RemoteMachineScheduleResult, ScheduleResultType, SSHClientManager } from './remoteMachineData';
import { TrialJobDetail } from 'common/trainingService';
/** /**
* A simple GPU scheduler implementation * A simple GPU scheduler implementation
...@@ -45,7 +46,7 @@ export class GPUScheduler { ...@@ -45,7 +46,7 @@ export class GPUScheduler {
* Schedule a machine according to the constraints (requiredGPUNum) * Schedule a machine according to the constraints (requiredGPUNum)
* @param requiredGPUNum required GPU number * @param requiredGPUNum required GPU number
*/ */
public scheduleMachine(requiredGPUNum: number, trialJobId : string) : RemoteMachineScheduleResult { public scheduleMachine(requiredGPUNum: number, trialJobDetail : RemoteMachineTrialJobDetail) : RemoteMachineScheduleResult {
assert(requiredGPUNum >= 0); assert(requiredGPUNum >= 0);
const allRMs: RemoteMachineMeta[] = Array.from(this.machineSSHClientMap.keys()); const allRMs: RemoteMachineMeta[] = Array.from(this.machineSSHClientMap.keys());
assert(allRMs.length > 0); assert(allRMs.length > 0);
...@@ -66,7 +67,7 @@ export class GPUScheduler { ...@@ -66,7 +67,7 @@ export class GPUScheduler {
// Currenty the requireGPUNum parameter for all trial jobs are identical. // Currenty the requireGPUNum parameter for all trial jobs are identical.
if (requiredGPUNum > 0) { if (requiredGPUNum > 0) {
// Trial job requires GPU // Trial job requires GPU
const result: RemoteMachineScheduleResult | undefined = this.scheduleGPUHost(requiredGPUNum, trialJobId); const result: RemoteMachineScheduleResult | undefined = this.scheduleGPUHost(requiredGPUNum, trialJobDetail);
if (result !== undefined) { if (result !== undefined) {
return result; return result;
} }
...@@ -74,9 +75,9 @@ export class GPUScheduler { ...@@ -74,9 +75,9 @@ export class GPUScheduler {
// Trail job does not need GPU // Trail job does not need GPU
const allocatedRm: RemoteMachineMeta = this.selectMachine(allRMs); const allocatedRm: RemoteMachineMeta = this.selectMachine(allRMs);
return this.allocateHost(requiredGPUNum, allocatedRm, [], trialJobId); return this.allocateHost(requiredGPUNum, allocatedRm, [], trialJobDetail);
} }
this.log.warning(`Scheduler: trialJob id ${trialJobId}, no machine can be scheduled, return TMP_NO_AVAILABLE_GPU `); this.log.warning(`Scheduler: trialJob id ${trialJobDetail.id}, no machine can be scheduled, return TMP_NO_AVAILABLE_GPU `);
return { return {
resultType : ScheduleResultType.TMP_NO_AVAILABLE_GPU, resultType : ScheduleResultType.TMP_NO_AVAILABLE_GPU,
...@@ -87,21 +88,35 @@ export class GPUScheduler { ...@@ -87,21 +88,35 @@ export class GPUScheduler {
/** /**
* remove the job's gpu reversion * remove the job's gpu reversion
*/ */
public removeGpuReservation(trialJobId: string, rmMeta?: RemoteMachineMeta): void { public removeGpuReservation(trialJobId: string, trialJobMap: Map<string, RemoteMachineTrialJobDetail>): void {
// If remote machine has no GPU, gpuReservcation is not initialized, so check if it's undefined let trialJobDetail: RemoteMachineTrialJobDetail | undefined = trialJobMap.get(trialJobId);
if (rmMeta !== undefined && rmMeta.gpuReservation !== undefined) { if(trialJobDetail === undefined) {
rmMeta.gpuReservation.forEach((reserveTrialJobId : string, gpuIndex : number) => { throw new Error(`could not get trialJobDetail by id ${trialJobId}`);
if (reserveTrialJobId === trialJobId) { }
rmMeta.gpuReservation.delete(gpuIndex); if (trialJobDetail.rmMeta !== undefined &&
trialJobDetail.rmMeta.occupiedGpuIndexMap !== undefined &&
trialJobDetail.gpuIndices !== undefined &&
trialJobDetail.gpuIndices.length > 0) {
for (const gpuInfo of trialJobDetail.gpuIndices) {
let num: number | undefined = trialJobDetail.rmMeta.occupiedGpuIndexMap.get(gpuInfo.index);
if(num !== undefined) {
if(num === 1) {
trialJobDetail.rmMeta.occupiedGpuIndexMap.delete(gpuInfo.index);
} else {
trialJobDetail.rmMeta.occupiedGpuIndexMap.set(gpuInfo.index, num - 1)
}
} }
}); }
} }
trialJobDetail.gpuIndices = [];
trialJobMap.set(trialJobId, trialJobDetail);
} }
private scheduleGPUHost(requiredGPUNum: number, trialJobId: string): RemoteMachineScheduleResult | undefined { private scheduleGPUHost(requiredGPUNum: number, trialJobDetail: RemoteMachineTrialJobDetail): RemoteMachineScheduleResult | undefined {
const totalResourceMap: Map<RemoteMachineMeta, GPUInfo[]> = this.gpuResourceDetection(); const totalResourceMap: Map<RemoteMachineMeta, GPUInfo[]> = this.gpuResourceDetection();
const qualifiedRMs: RemoteMachineMeta[] = []; const qualifiedRMs: RemoteMachineMeta[] = [];
totalResourceMap.forEach((gpuInfos: GPUInfo[], rmMeta: RemoteMachineMeta) => { totalResourceMap.forEach((gpuInfos: GPUInfo[], rmMeta: RemoteMachineMeta) => {
if (gpuInfos !== undefined && gpuInfos.length >= requiredGPUNum) { if (gpuInfos !== undefined && gpuInfos.length >= requiredGPUNum) {
qualifiedRMs.push(rmMeta); qualifiedRMs.push(rmMeta);
} }
...@@ -110,7 +125,7 @@ export class GPUScheduler { ...@@ -110,7 +125,7 @@ export class GPUScheduler {
const allocatedRm: RemoteMachineMeta = this.selectMachine(qualifiedRMs); const allocatedRm: RemoteMachineMeta = this.selectMachine(qualifiedRMs);
const gpuInfos: GPUInfo[] | undefined = totalResourceMap.get(allocatedRm); const gpuInfos: GPUInfo[] | undefined = totalResourceMap.get(allocatedRm);
if (gpuInfos !== undefined) { // should always true if (gpuInfos !== undefined) { // should always true
return this.allocateHost(requiredGPUNum, allocatedRm, gpuInfos, trialJobId); return this.allocateHost(requiredGPUNum, allocatedRm, gpuInfos, trialJobDetail);
} else { } else {
assert(false, 'gpuInfos is undefined'); assert(false, 'gpuInfos is undefined');
} }
...@@ -130,9 +145,6 @@ export class GPUScheduler { ...@@ -130,9 +145,6 @@ export class GPUScheduler {
// Assgin totoal GPU count as init available GPU number // Assgin totoal GPU count as init available GPU number
if (rmMeta.gpuSummary !== undefined) { if (rmMeta.gpuSummary !== undefined) {
const availableGPUs: GPUInfo[] = []; const availableGPUs: GPUInfo[] = [];
if (rmMeta.gpuReservation === undefined) {
rmMeta.gpuReservation = new Map<number, string>();
}
const designatedGpuIndices: Set<number> | undefined = parseGpuIndices(rmMeta.gpuIndices); const designatedGpuIndices: Set<number> | undefined = parseGpuIndices(rmMeta.gpuIndices);
if (designatedGpuIndices !== undefined) { if (designatedGpuIndices !== undefined) {
for (const gpuIndex of designatedGpuIndices) { for (const gpuIndex of designatedGpuIndices) {
...@@ -145,10 +157,20 @@ export class GPUScheduler { ...@@ -145,10 +157,20 @@ export class GPUScheduler {
rmMeta.gpuSummary.gpuInfos.forEach((gpuInfo: GPUInfo) => { rmMeta.gpuSummary.gpuInfos.forEach((gpuInfo: GPUInfo) => {
// if the GPU has active process, OR be reserved by a job, // if the GPU has active process, OR be reserved by a job,
// or index not in gpuIndices configuration in machineList, // or index not in gpuIndices configuration in machineList,
// or trial number on a GPU reach max number,
// We should NOT allocate this GPU // We should NOT allocate this GPU
if (gpuInfo.activeProcessNum === 0 && !rmMeta.gpuReservation.has(gpuInfo.index) // if users set useActiveGpu, use the gpu whether there is another activeProcess
&& (designatedGpuIndices === undefined || designatedGpuIndices.has(gpuInfo.index))) { if (designatedGpuIndices === undefined || designatedGpuIndices.has(gpuInfo.index)) {
availableGPUs.push(gpuInfo); if(rmMeta.occupiedGpuIndexMap !== undefined) {
let num = rmMeta.occupiedGpuIndexMap.get(gpuInfo.index);
let maxTrialNumPerGpu: number = rmMeta.maxTrialNumPerGpu? rmMeta.maxTrialNumPerGpu: 1;
if((num === undefined && (!rmMeta.useActiveGpu && gpuInfo.activeProcessNum === 0 || rmMeta.useActiveGpu)) ||
(num !== undefined && num < maxTrialNumPerGpu)) {
availableGPUs.push(gpuInfo);
}
} else {
throw new Error(`occupiedGpuIndexMap initialize error!`);
}
} }
}); });
totalResourceMap.set(rmMeta, availableGPUs); totalResourceMap.set(rmMeta, availableGPUs);
...@@ -170,14 +192,22 @@ export class GPUScheduler { ...@@ -170,14 +192,22 @@ export class GPUScheduler {
} }
private allocateHost(requiredGPUNum: number, rmMeta: RemoteMachineMeta, private allocateHost(requiredGPUNum: number, rmMeta: RemoteMachineMeta,
gpuInfos: GPUInfo[], trialJobId: string): RemoteMachineScheduleResult { gpuInfos: GPUInfo[], trialJobDetail: RemoteMachineTrialJobDetail): RemoteMachineScheduleResult {
assert(gpuInfos.length >= requiredGPUNum); assert(gpuInfos.length >= requiredGPUNum);
const allocatedGPUs: GPUInfo[] = this.selectGPUsForTrial(gpuInfos, requiredGPUNum); const allocatedGPUs: GPUInfo[] = this.selectGPUsForTrial(gpuInfos, requiredGPUNum);
allocatedGPUs.forEach((gpuInfo: GPUInfo) => { allocatedGPUs.forEach((gpuInfo: GPUInfo) => {
rmMeta.gpuReservation.set(gpuInfo.index, trialJobId); if(rmMeta.occupiedGpuIndexMap !== undefined) {
let num = rmMeta.occupiedGpuIndexMap.get(gpuInfo.index);
if(num === undefined) {
num = 0;
}
rmMeta.occupiedGpuIndexMap.set(gpuInfo.index, num + 1);
}else {
throw new Error(`Machine ${rmMeta.ip} occupiedGpuIndexMap initialize error!`);
}
}); });
trialJobDetail.gpuIndices = allocatedGPUs;
trialJobDetail.rmMeta = rmMeta;
return { return {
resultType: ScheduleResultType.SUCCEED, resultType: ScheduleResultType.SUCCEED,
scheduleInfo: { scheduleInfo: {
......
...@@ -23,7 +23,7 @@ import * as fs from 'fs'; ...@@ -23,7 +23,7 @@ import * as fs from 'fs';
import { Client, ConnectConfig } from 'ssh2'; import { Client, ConnectConfig } from 'ssh2';
import { Deferred } from 'ts-deferred'; import { Deferred } from 'ts-deferred';
import { JobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../common/trainingService'; import { JobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../common/trainingService';
import { GPUSummary } from '../common/gpuData'; import { GPUSummary, GPUInfo } from '../common/gpuData';
/** /**
* Metadata of remote machine for configuration and statuc query * Metadata of remote machine for configuration and statuc query
...@@ -36,20 +36,23 @@ export class RemoteMachineMeta { ...@@ -36,20 +36,23 @@ export class RemoteMachineMeta {
public readonly sshKeyPath?: string; public readonly sshKeyPath?: string;
public readonly passphrase?: string; public readonly passphrase?: string;
public gpuSummary : GPUSummary | undefined; public gpuSummary : GPUSummary | undefined;
// GPU Reservation info, the key is GPU index, the value is the job id which reserves this GPU
public gpuReservation : Map<number, string>;
public readonly gpuIndices?: string; public readonly gpuIndices?: string;
public readonly maxTrialNumPerGpu?: number;
public occupiedGpuIndexMap: Map<number, number>;
public readonly useActiveGpu?: boolean = false;
constructor(ip : string, port : number, username : string, passwd : string, constructor(ip : string, port : number, username : string, passwd : string,
sshKeyPath: string, passphrase : string, gpuIndices?: string) { sshKeyPath: string, passphrase : string, gpuIndices?: string, maxTrialNumPerGpu?: number, useActiveGpu?: boolean) {
this.ip = ip; this.ip = ip;
this.port = port; this.port = port;
this.username = username; this.username = username;
this.passwd = passwd; this.passwd = passwd;
this.sshKeyPath = sshKeyPath; this.sshKeyPath = sshKeyPath;
this.passphrase = passphrase; this.passphrase = passphrase;
this.gpuReservation = new Map<number, string>();
this.gpuIndices = gpuIndices; this.gpuIndices = gpuIndices;
this.maxTrialNumPerGpu = maxTrialNumPerGpu;
this.occupiedGpuIndexMap = new Map<number, number>();
this.useActiveGpu = useActiveGpu;
} }
} }
...@@ -97,6 +100,7 @@ export class RemoteMachineTrialJobDetail implements TrialJobDetail { ...@@ -97,6 +100,7 @@ export class RemoteMachineTrialJobDetail implements TrialJobDetail {
public sequenceId: number; public sequenceId: number;
public rmMeta?: RemoteMachineMeta; public rmMeta?: RemoteMachineMeta;
public isEarlyStopped?: boolean; public isEarlyStopped?: boolean;
public gpuIndices: GPUInfo[];
constructor(id: string, status: TrialJobStatus, submitTime: number, constructor(id: string, status: TrialJobStatus, submitTime: number,
workingDirectory: string, form: JobApplicationForm, sequenceId: number) { workingDirectory: string, form: JobApplicationForm, sequenceId: number) {
...@@ -107,6 +111,7 @@ export class RemoteMachineTrialJobDetail implements TrialJobDetail { ...@@ -107,6 +111,7 @@ export class RemoteMachineTrialJobDetail implements TrialJobDetail {
this.form = form; this.form = form;
this.sequenceId = sequenceId; this.sequenceId = sequenceId;
this.tags = []; this.tags = [];
this.gpuIndices = []
} }
} }
......
...@@ -282,7 +282,7 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -282,7 +282,7 @@ class RemoteMachineTrainingService implements TrainingService {
private updateGpuReservation() { private updateGpuReservation() {
for (const [key, value] of this.trialJobsMap) { for (const [key, value] of this.trialJobsMap) {
if(!['WAITING', 'RUNNING'].includes(value.status)) { if(!['WAITING', 'RUNNING'].includes(value.status)) {
this.gpuScheduler.removeGpuReservation(value.id, value.rmMeta); this.gpuScheduler.removeGpuReservation(key, this.trialJobsMap);
} }
}; };
} }
...@@ -521,7 +521,7 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -521,7 +521,7 @@ class RemoteMachineTrainingService implements TrainingService {
return deferred.promise; return deferred.promise;
} }
// get an ssh client from scheduler // get an ssh client from scheduler
const rmScheduleResult: RemoteMachineScheduleResult = this.gpuScheduler.scheduleMachine(this.trialConfig.gpuNum, trialJobId); const rmScheduleResult: RemoteMachineScheduleResult = this.gpuScheduler.scheduleMachine(this.trialConfig.gpuNum, trialJobDetail);
if (rmScheduleResult.resultType === ScheduleResultType.REQUIRE_EXCEED_TOTAL) { if (rmScheduleResult.resultType === ScheduleResultType.REQUIRE_EXCEED_TOTAL) {
const errorMessage : string = `Required GPU number ${this.trialConfig.gpuNum} is too large, no machine can meet`; const errorMessage : string = `Required GPU number ${this.trialConfig.gpuNum} is too large, no machine can meet`;
this.log.error(errorMessage); this.log.error(errorMessage);
...@@ -542,6 +542,7 @@ class RemoteMachineTrainingService implements TrainingService { ...@@ -542,6 +542,7 @@ class RemoteMachineTrainingService implements TrainingService {
trialJobDetail.url = `file://${rmScheduleInfo.rmMeta.ip}:${trialWorkingFolder}`; trialJobDetail.url = `file://${rmScheduleInfo.rmMeta.ip}:${trialWorkingFolder}`;
trialJobDetail.startTime = Date.now(); trialJobDetail.startTime = Date.now();
this.trialJobsMap.set(trialJobId, trialJobDetail);
deferred.resolve(true); deferred.resolve(true);
} else if (rmScheduleResult.resultType === ScheduleResultType.TMP_NO_AVAILABLE_GPU) { } else if (rmScheduleResult.resultType === ScheduleResultType.TMP_NO_AVAILABLE_GPU) {
this.log.info(`Right now no available GPU can be allocated for trial ${trialJobId}, will try to schedule later`); this.log.info(`Right now no available GPU can be allocated for trial ${trialJobId}, will try to schedule later`);
......
...@@ -31,7 +31,7 @@ import ConfigSpace.hyperparameters as CSH ...@@ -31,7 +31,7 @@ import ConfigSpace.hyperparameters as CSH
from nni.protocol import CommandType, send from nni.protocol import CommandType, send
from nni.msg_dispatcher_base import MsgDispatcherBase from nni.msg_dispatcher_base import MsgDispatcherBase
from nni.utils import OptimizeMode, extract_scalar_reward from nni.utils import OptimizeMode, extract_scalar_reward, randint_to_quniform
from .config_generator import CG_BOHB from .config_generator import CG_BOHB
...@@ -443,6 +443,7 @@ class BOHB(MsgDispatcherBase): ...@@ -443,6 +443,7 @@ class BOHB(MsgDispatcherBase):
search space of this experiment search space of this experiment
""" """
search_space = data search_space = data
randint_to_quniform(search_space)
cs = CS.ConfigurationSpace() cs = CS.ConfigurationSpace()
for var in search_space: for var in search_space:
_type = str(search_space[var]["_type"]) _type = str(search_space[var]["_type"])
......
# Naive Evolution Tuner
## Naive Evolution(进化算法)
进化算法来自于 [Large-Scale Evolution of Image Classifiers](https://arxiv.org/pdf/1703.01041.pdf)。 它会基于搜索空间随机生成一个种群。 在每一代中,会选择较好的结果,并对其下一代进行一些变异(例如,改动一个超参,增加或减少一层)。 进化算法需要很多次 Trial 才能有效,但它也非常简单,也很容易扩展新功能。
\ No newline at end of file
...@@ -26,7 +26,7 @@ import random ...@@ -26,7 +26,7 @@ import random
import numpy as np import numpy as np
from nni.tuner import Tuner from nni.tuner import Tuner
from nni.utils import NodeType, OptimizeMode, extract_scalar_reward, split_index from nni.utils import NodeType, OptimizeMode, extract_scalar_reward, split_index, randint_to_quniform
import nni.parameter_expressions as parameter_expressions import nni.parameter_expressions as parameter_expressions
...@@ -175,6 +175,7 @@ class EvolutionTuner(Tuner): ...@@ -175,6 +175,7 @@ class EvolutionTuner(Tuner):
search_space : dict search_space : dict
""" """
self.searchspace_json = search_space self.searchspace_json = search_space
randint_to_quniform(self.searchspace_json)
self.space = json2space(self.searchspace_json) self.space = json2space(self.searchspace_json)
self.random_state = np.random.RandomState() self.random_state = np.random.RandomState()
......
# Grid Search
## Grid Search(遍历搜索)
Grid Search 会穷举定义在搜索空间文件中的所有超参组合。 注意,搜索空间仅支持 `choice`, `quniform`, `qloguniform`。 `quniform` 和 `qloguniform` 中的**数字 `q` 有不同的含义(与[搜索空间](../../../../../docs/zh_CN/SearchSpaceSpec.md)说明不同)。 这里的意义是在 `low` 和 `high` 之间均匀取值的数量。**
\ No newline at end of file
# NNI 中使用 Hyperband
## 1. 介绍
[Hyperband](https://arxiv.org/pdf/1603.06560.pdf) 是一种流行的自动机器学习算法。 Hyperband 的基本思想是对配置分组,每组有 `n` 个随机生成的超参配置,每个配置使用 `r` 次资源(如,epoch 数量,批处理数量等)。 当 `n` 个配置完成后,会选择最好的 `n/eta` 个配置,并增加 `r*eta` 次使用的资源。 最后,会选择出到目前为止最好的配置。
## 2. 实现并行
首先,此样例是基于 MsgDispatcherBase 来实现的自动机器学习算法,而不是基于 Tuner 和Assessor。 这种实现方法下,Hyperband 集成了 Tuner 和 Assessor 两者的功能,因而将它叫做 Advisor。
其次,本实现完全利用了 Hyperband 内部的并行性。 具体来说,下一个分组不会严格的在当前分组结束后再运行,只要有资源,就可以开始运行新的分组。
## 3. 用法
要使用 Hyperband,需要在 Experiment 的 YAML 配置文件进行如下改动。
```yaml
advisor:
  #可选项: Hyperband
  builtinAdvisorName: Hyperband
  classArgs:
    #R: 最大的步骤
    R: 100
    #eta: 丢弃的 Trial 的比例
    eta: 3
    #可选项: maximize, minimize
    optimize_mode: maximize
```
注意,一旦使用了 Advisor,就不能在配置文件中添加 Tuner 和 Assessor。 使用 Hyperband 时,Trial 代码收到的超参(如键值对)中,除了用户定义的超参,会多一个 `STEPS`。 **使用 `STEPS`,Trial 能够控制其运行的时间**。
对于 Trial 代码中的 `report_intermediate_result(metric)` 和 `report_final_result(metric)`,**`指标` 应该是数值,或者用一个 dict,并保证其中有键值为 default 的项目,其值也为数值型**。 这是需要进行最大化或者最小化优化的数值,如精度或者损失度。
`R` 和 `eta` 是 Hyperband 中可以改动的参数。 `R` 表示可以分配给配置的最大步数(STEPS)。 这里,STEPS 可以代表 epoch 或 批处理数量。 `STEPS` 应该被 Trial 代码用来控制运行的次数。 参考样例 `examples/trials/mnist-hyperband/` ,了解详细信息。
`eta` 表示 `n` 个配置中的 `n/eta` 个配置会留存下来,并用更多的 STEPS 来运行。
下面是 `R=81` 且 `eta=3` 时的样例:
| | s=4 | s=3 | s=2 | s=1 | s=0 |
| - | ---- | ---- | ---- | ---- | ---- |
| i | n r | n r | n r | n r | n r |
| 0 | 81 1 | 27 3 | 9 9 | 6 27 | 5 81 |
| 1 | 27 3 | 9 9 | 3 27 | 2 81 | |
| 2 | 9 9 | 3 27 | 1 81 | | |
| 3 | 3 27 | 1 81 | | | |
| 4 | 1 81 | | | | |
`s` 表示分组, `n` 表示生成的配置数量,相应的 `r` 表示配置会运行多少 STEPS。 `i` 表示轮数,如分组 4 有 5 轮,分组 3 有 4 轮。
关于如何实现 Trial 代码,参考 `examples/trials/mnist-hyperband/` 中的说明。
## 4. 待改进
当前实现的 Hyperband 算法可以通过改进支持的提前终止算法来提高,原因是最好的 `n/eta` 个配置并不一定都表现很好。 不好的配置可以更早的终止。
在当前实现中,遵循了[此论文](https://arxiv.org/pdf/1603.06560.pdf)的设计,配置都是随机生成的。 要进一步提升,配置生成过程可以利用更高级的算法。
\ No newline at end of file
...@@ -31,7 +31,7 @@ import json_tricks ...@@ -31,7 +31,7 @@ import json_tricks
from nni.protocol import CommandType, send from nni.protocol import CommandType, send
from nni.msg_dispatcher_base import MsgDispatcherBase from nni.msg_dispatcher_base import MsgDispatcherBase
from nni.common import init_logger from nni.common import init_logger
from nni.utils import NodeType, OptimizeMode, extract_scalar_reward from nni.utils import NodeType, OptimizeMode, extract_scalar_reward, randint_to_quniform
import nni.parameter_expressions as parameter_expressions import nni.parameter_expressions as parameter_expressions
_logger = logging.getLogger(__name__) _logger = logging.getLogger(__name__)
...@@ -357,6 +357,7 @@ class Hyperband(MsgDispatcherBase): ...@@ -357,6 +357,7 @@ class Hyperband(MsgDispatcherBase):
number of trial jobs number of trial jobs
""" """
self.searchspace_json = data self.searchspace_json = data
randint_to_quniform(self.searchspace_json)
self.random_state = np.random.RandomState() self.random_state = np.random.RandomState()
def handle_trial_end(self, data): def handle_trial_end(self, data):
......
# TPE, Random Search, Anneal Tuners
## TPE
Tree-structured Parzen Estimator (TPE) 是一种 sequential model-based optimization(SMBO,即基于序列模型优化)的方法。 SMBO 方法根据历史指标数据来按顺序构造模型,来估算超参的性能,随后基于此模型来选择新的超参。 TPE 方法对 P(x|y) 和 P(y) 建模,其中 x 表示超参,y 表示相关的评估指标。 P(x|y) 通过变换超参的生成过程来建模,用非参数密度(non-parametric densities)代替配置的先验分布。 细节可参考 [Algorithms for Hyper-Parameter Optimization](https://papers.nips.cc/paper/4443-algorithms-for-hyper-parameter-optimization.pdf)。 ​
## Random Search(随机搜索)
[Random Search for Hyper-Parameter Optimization](http://www.jmlr.org/papers/volume13/bergstra12a/bergstra12a.pdf) 中介绍了随机搜索惊人的简单和效果。 建议当不清楚超参的先验分布时,采用随机搜索作为基准。
## Anneal(退火算法)
这种简单的退火算法从先前的采样开始,会越来越靠近发现的最佳点取样。 此算法是随机搜索的简单变体,利用了响应面的平滑性。 退火率不是自适应的。
\ No newline at end of file
...@@ -27,7 +27,7 @@ import logging ...@@ -27,7 +27,7 @@ import logging
import hyperopt as hp import hyperopt as hp
import numpy as np import numpy as np
from nni.tuner import Tuner from nni.tuner import Tuner
from nni.utils import NodeType, OptimizeMode, extract_scalar_reward, split_index from nni.utils import NodeType, OptimizeMode, extract_scalar_reward, split_index, randint_to_quniform
logger = logging.getLogger('hyperopt_AutoML') logger = logging.getLogger('hyperopt_AutoML')
...@@ -153,14 +153,14 @@ def _add_index(in_x, parameter): ...@@ -153,14 +153,14 @@ def _add_index(in_x, parameter):
Will change to format in hyperopt, like: Will change to format in hyperopt, like:
{'dropout_rate': 0.8, 'conv_size': {'_index': 1, '_value': 3}, 'hidden_size': {'_index': 1, '_value': 512}} {'dropout_rate': 0.8, 'conv_size': {'_index': 1, '_value': 3}, 'hidden_size': {'_index': 1, '_value': 512}}
""" """
if TYPE not in in_x: # if at the top level if NodeType.TYPE not in in_x: # if at the top level
out_y = dict() out_y = dict()
for key, value in parameter.items(): for key, value in parameter.items():
out_y[key] = _add_index(in_x[key], value) out_y[key] = _add_index(in_x[key], value)
return out_y return out_y
elif isinstance(in_x, dict): elif isinstance(in_x, dict):
value_type = in_x[TYPE] value_type = in_x[NodeType.TYPE]
value_format = in_x[VALUE] value_format = in_x[NodeType.VALUE]
if value_type == "choice": if value_type == "choice":
choice_name = parameter[0] if isinstance(parameter, choice_name = parameter[0] if isinstance(parameter,
list) else parameter list) else parameter
...@@ -173,15 +173,14 @@ def _add_index(in_x, parameter): ...@@ -173,15 +173,14 @@ def _add_index(in_x, parameter):
choice_value_format = item[1] choice_value_format = item[1]
if choice_key == choice_name: if choice_key == choice_name:
return { return {
INDEX: NodeType.INDEX: pos,
pos, NodeType.VALUE: [
VALUE: [
choice_name, choice_name,
_add_index(choice_value_format, parameter[1]) _add_index(choice_value_format, parameter[1])
] ]
} }
elif choice_name == item: elif choice_name == item:
return {INDEX: pos, VALUE: item} return {NodeType.INDEX: pos, NodeType.VALUE: item}
else: else:
return parameter return parameter
...@@ -232,6 +231,8 @@ class HyperoptTuner(Tuner): ...@@ -232,6 +231,8 @@ class HyperoptTuner(Tuner):
search_space : dict search_space : dict
""" """
self.json = search_space self.json = search_space
randint_to_quniform(self.json)
search_space_instance = json2space(self.json) search_space_instance = json2space(self.json)
rstate = np.random.RandomState() rstate = np.random.RandomState()
trials = hp.Trials() trials = hp.Trials()
......
# Medianstop Assessor
## Median Stop
Medianstop 是一种简单的提前终止 Trial 的策略,可参考[论文](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/46180.pdf)。 如果 Trial X 在步骤 S 的最好目标值比所有已完成 Trial 在步骤 S 的中位数值明显要低,这个 Trial 就会被提前停止。
\ No newline at end of file
# Metis Tuner
## Metis Tuner
大多数调参工具仅仅预测最优配置,而 [Metis](https://www.microsoft.com/en-us/research/publication/metis-robustly-tuning-tail-latencies-cloud-systems/) 的优势在于有两个输出:(a) 最优配置的当前预测结果, 以及 (b) 下一次 Trial 的建议。 不再需要随机猜测!
大多数工具假设训练集没有噪声数据,但 Metis 会知道是否需要对某个超参重新采样。
大多数工具都存在过于偏重利用已有结果(exploitation)的问题,而 Metis 的搜索策略可以在探索(exploration)、利用(exploitation)和重新采样(可选)之间进行平衡。
Metis 属于基于序列的贝叶斯优化 (SMBO) 的类别,它也基于贝叶斯优化框架。 为了对超参-性能空间建模,Metis 同时使用了高斯过程(Gaussian Process)和高斯混合模型(GMM)。 由于每次 Trial 都可能有很高的时间成本,Metis 大量使用了已有模型来进行推理计算。 在每次迭代中,Metis 执行两个任务:
在高斯过程空间中找到全局最优点。 这一点表示了最佳配置。
它会标识出下一个超参的候选项。 这是通过对隐含信息的探索、挖掘和重采样来实现的。
注意,搜索空间仅支持 `choice`, `quniform`, `uniform` 和 `randint`。
更多详情,参考论文:https://www.microsoft.com/en-us/research/publication/metis-robustly-tuning-tail-latencies-cloud-systems/
\ No newline at end of file
...@@ -133,7 +133,7 @@ class MetisTuner(Tuner): ...@@ -133,7 +133,7 @@ class MetisTuner(Tuner):
self.x_bounds[idx] = bounds self.x_bounds[idx] = bounds
self.x_types[idx] = 'discrete_int' self.x_types[idx] = 'discrete_int'
elif key_type == 'randint': elif key_type == 'randint':
self.x_bounds[idx] = [0, key_range[0]] self.x_bounds[idx] = [key_range[0], key_range[1]]
self.x_types[idx] = 'range_int' self.x_types[idx] = 'range_int'
elif key_type == 'uniform': elif key_type == 'uniform':
self.x_bounds[idx] = [key_range[0], key_range[1]] self.x_bounds[idx] = [key_range[0], key_range[1]]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment