Project: OpenDAS / nni
Commit 3be88922, authored May 29, 2019 by suiguoxin

    Merge branch 'master' of git://github.com/microsoft/nni

Parents: b92c4ab2, 5a058baf
Changes: 73 files in total; this page shows 20 changed files with 165 additions and 160 deletions (+165 −160).
src/nni_manager/core/test/mockedDatastore.ts                                      +10  −0
src/nni_manager/rest_server/restHandler.ts                                        +11  −0
src/nni_manager/rest_server/restValidationSchemas.ts                              +6   −2
src/nni_manager/rest_server/test/mockedNNIManager.ts                              +4   −0
src/nni_manager/training_service/local/gpuScheduler.ts                            +5   −4
src/nni_manager/training_service/local/localTrainingService.ts                    +46  −8
src/nni_manager/training_service/remote_machine/gpuScheduler.ts                   +54  −24
src/nni_manager/training_service/remote_machine/remoteMachineData.ts              +10  −5
src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts   +3   −2
src/sdk/pynni/nni/bohb_advisor/bohb_advisor.py                                    +2   −1
src/sdk/pynni/nni/evolution_tuner/README_zh_CN.md                                 +0   −5
src/sdk/pynni/nni/evolution_tuner/evolution_tuner.py                              +2   −1
src/sdk/pynni/nni/gridsearch_tuner/README_zh_CN.md                                +0   −5
src/sdk/pynni/nni/hyperband_advisor/README_zh_CN.md                               +0   −56
src/sdk/pynni/nni/hyperband_advisor/hyperband_advisor.py                          +2   −1
src/sdk/pynni/nni/hyperopt_tuner/README_zh_CN.md                                  +0   −13
src/sdk/pynni/nni/hyperopt_tuner/hyperopt_tuner.py                                +9   −8
src/sdk/pynni/nni/medianstop_assessor/README_zh_CN.md                             +0   −5
src/sdk/pynni/nni/metis_tuner/README_zh_CN.md                                     +0   −19
src/sdk/pynni/nni/metis_tuner/metis_tuner.py                                      +1   −1
src/nni_manager/core/test/mockedDatastore.ts

@@ -210,6 +210,16 @@ class MockedDataStore implements DataStore {
         return result;
     }
 
+    async exportTrialHpConfigs(): Promise<string> {
+        const ret: string = '';
+        return Promise.resolve(ret);
+    }
+
+    async getImportedData(): Promise<string[]> {
+        const ret: string[] = [];
+        return Promise.resolve(ret);
+    }
+
     public getTrialJob(trialJobId: string): Promise<TrialJobInfo> {
         throw new Error("Method not implemented.");
     }
src/nni_manager/rest_server/restHandler.ts

@@ -72,6 +72,7 @@ class NNIRestHandler {
         this.addTrialJob(router);
         this.cancelTrialJob(router);
         this.getMetricData(router);
+        this.exportData(router);
 
         // Express-joi-validator configuration
         router.use((err: any, req: Request, res: Response, next: any) => {

@@ -261,6 +262,16 @@ class NNIRestHandler {
         });
     }
 
+    private exportData(router: Router): void {
+        router.get('/export-data', (req: Request, res: Response) => {
+            this.nniManager.exportData().then((exportedData: string) => {
+                res.send(exportedData);
+            }).catch((err: Error) => {
+                this.handle_error(err, res);
+            });
+        });
+    }
+
     private setErrorPathForFailedJob(jobInfo: TrialJobInfo): TrialJobInfo {
         if (jobInfo === undefined || jobInfo.status !== 'FAILED' || jobInfo.logPath === undefined) {
             return jobInfo;
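Note: the new '/export-data' route exposes the trial export added above. A minimal sketch of calling it from a client follows; the '/export-data' path comes from the diff, while the '/api/v1/nni' prefix and port 8080 are NNI defaults assumed here, not part of this commit.

    # Hedged example: fetch the exported trial hyper-parameter configurations.
    # Route '/export-data' is from the diff above; URL prefix and port are assumed defaults.
    import requests

    resp = requests.get('http://localhost:8080/api/v1/nni/export-data')
    resp.raise_for_status()
    print(resp.text)  # string returned by nniManager.exportData()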
src/nni_manager/rest_server/restValidationSchemas.ts

@@ -31,10 +31,14 @@ export namespace ValidationSchemas {
             passwd: joi.string(),
             sshKeyPath: joi.string(),
             passphrase: joi.string(),
-            gpuIndices: joi.string()
+            gpuIndices: joi.string(),
+            maxTrialNumPerGpu: joi.number(),
+            useActiveGpu: joi.boolean()
         })),
         local_config: joi.object({
-            gpuIndices: joi.string()
+            gpuIndices: joi.string(),
+            maxTrialNumPerGpu: joi.number(),
+            useActiveGpu: joi.boolean()
         }),
         trial_config: joi.object({
             image: joi.string().min(1),
src/nni_manager/rest_server/test/mockedNNIManager.ts

@@ -49,6 +49,10 @@ export class MockedNNIManager extends Manager {
     public importData(data: string): Promise<void> {
         return Promise.resolve();
     }
+    public async exportData(): Promise<string> {
+        const ret: string = '';
+        return Promise.resolve(ret);
+    }
     public getTrialJobStatistics(): Promise<TrialJobStatistics[]> {
         const deferred: Deferred<TrialJobStatistics[]> = new Deferred<TrialJobStatistics[]>();
         deferred.resolve([{
src/nni_manager/training_service/local/gpuScheduler.ts

@@ -71,14 +71,15 @@ class GPUScheduler {
         execScript(gpuMetricsCollectorScriptPath)
     }
 
-    public getAvailableGPUIndices(): number[] {
+    public getAvailableGPUIndices(useActiveGpu: boolean, occupiedGpuIndexNumMap: Map<number, number>): number[] {
         if (this.gpuSummary !== undefined) {
-            if (process.platform === 'win32') {
+            if (process.platform === 'win32' || useActiveGpu) {
                 return this.gpuSummary.gpuInfos.map((info: GPUInfo) => info.index);
             } else {
-                return this.gpuSummary.gpuInfos.filter((info: GPUInfo) => info.activeProcessNum === 0)
-                    .map((info: GPUInfo) => info.index);
+                return this.gpuSummary.gpuInfos.filter((info: GPUInfo) =>
+                    occupiedGpuIndexNumMap.get(info.index) === undefined && info.activeProcessNum === 0 ||
+                    occupiedGpuIndexNumMap.get(info.index) !== undefined).map((info: GPUInfo) => info.index);
             }
         }
     }
src/nni_manager/training_service/local/localTrainingService.ts

@@ -97,11 +97,19 @@ class LocalTrialJobDetail implements TrialJobDetail {
 /**
  * Local training service config
  */
 class LocalConfig {
+    public maxTrialNumPerGpu?: number;
     public gpuIndices?: string;
-    constructor(gpuIndices?: string) {
+    public useActiveGpu?: boolean;
+    constructor(gpuIndices?: string, maxTrialNumPerGpu?: number, useActiveGpu?: boolean) {
         if (gpuIndices !== undefined) {
             this.gpuIndices = gpuIndices;
         }
+        if (maxTrialNumPerGpu !== undefined) {
+            this.maxTrialNumPerGpu = maxTrialNumPerGpu;
+        }
+        if (useActiveGpu !== undefined) {
+            this.useActiveGpu = useActiveGpu;
+        }
     }
 }

@@ -117,13 +125,15 @@ class LocalTrainingService implements TrainingService {
     private rootDir!: string;
     private trialSequenceId: number;
     private gpuScheduler!: GPUScheduler;
-    private occupiedGpuIndices: Set<number>;
+    private occupiedGpuIndexNumMap: Map<number, number>;
     private designatedGpuIndices!: Set<number>;
     private log: Logger;
     private localTrailConfig?: TrialConfig;
     private localConfig?: LocalConfig;
-    private isMultiPhase: boolean = false;
+    private isMultiPhase: boolean;
     private jobStreamMap: Map<string, ts.Stream>;
+    private maxTrialNumPerGpu: number;
+    private useActiveGpu: boolean;
 
     constructor() {
         this.eventEmitter = new EventEmitter();

@@ -135,7 +145,10 @@ class LocalTrainingService implements TrainingService {
         this.trialSequenceId = -1;
         this.jobStreamMap = new Map<string, ts.Stream>();
         this.log.info('Construct local machine training service.');
-        this.occupiedGpuIndices = new Set<number>();
+        this.occupiedGpuIndexNumMap = new Map<number, number>();
+        this.maxTrialNumPerGpu = 1;
+        this.useActiveGpu = false;
+        this.isMultiPhase = false;
     }
 
     public async run(): Promise<void> {

@@ -304,6 +317,13 @@ class LocalTrainingService implements TrainingService {
                         throw new Error('gpuIndices can not be empty if specified.');
                     }
                 }
+                if (this.localConfig.maxTrialNumPerGpu !== undefined) {
+                    this.maxTrialNumPerGpu = this.localConfig.maxTrialNumPerGpu;
+                }
+
+                if (this.localConfig.useActiveGpu !== undefined) {
+                    this.useActiveGpu = this.localConfig.useActiveGpu;
+                }
                 break;
             case TrialConfigMetadataKey.MULTI_PHASE:
                 this.isMultiPhase = (value === 'true' || value === 'True');

@@ -356,7 +376,14 @@ class LocalTrainingService implements TrainingService {
         if (trialJob.gpuIndices !== undefined && trialJob.gpuIndices.length > 0 && this.gpuScheduler !== undefined) {
             if (oldStatus === 'RUNNING' && trialJob.status !== 'RUNNING') {
                 for (const index of trialJob.gpuIndices) {
-                    this.occupiedGpuIndices.delete(index);
+                    let num: number | undefined = this.occupiedGpuIndexNumMap.get(index);
+                    if (num === undefined) {
+                        throw new Error(`gpu resource schedule error`);
+                    } else if (num === 1) {
+                        this.occupiedGpuIndexNumMap.delete(index);
+                    } else {
+                        this.occupiedGpuIndexNumMap.set(index, num - 1)
+                    }
                 }
             }
         }

@@ -396,8 +423,14 @@ class LocalTrainingService implements TrainingService {
             return [true, resource];
         }
 
-        let selectedGPUIndices: number[] = this.gpuScheduler.getAvailableGPUIndices()
-            .filter((index: number) => !this.occupiedGpuIndices.has(index));
+        let selectedGPUIndices: number[] = [];
+        let availableGpuIndices: number[] = this.gpuScheduler.getAvailableGPUIndices(this.useActiveGpu, this.occupiedGpuIndexNumMap);
+        for (let index of availableGpuIndices) {
+            let num: number | undefined = this.occupiedGpuIndexNumMap.get(index);
+            if (num === undefined || num < this.maxTrialNumPerGpu) {
+                selectedGPUIndices.push(index);
+            }
+        }
 
         if (this.designatedGpuIndices !== undefined) {
             this.checkSpecifiedGpuIndices();

@@ -428,7 +461,12 @@ class LocalTrainingService implements TrainingService {
     private occupyResource(resource: {gpuIndices: number[]}): void {
         if (this.gpuScheduler !== undefined) {
             for (const index of resource.gpuIndices) {
-                this.occupiedGpuIndices.add(index);
+                let num: number | undefined = this.occupiedGpuIndexNumMap.get(index);
+                if (num === undefined) {
+                    this.occupiedGpuIndexNumMap.set(index, 1)
+                } else {
+                    this.occupiedGpuIndexNumMap.set(index, num + 1)
+                }
             }
         }
     }
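Note: the change above replaces the boolean "occupied" set with a per-GPU trial counter so that up to maxTrialNumPerGpu trials can share one GPU. A minimal Python sketch of that reference-counting scheme (not NNI code; the limit value below is hypothetical):

    # occupied maps GPU index -> number of trials currently placed on it.
    occupied = {}
    MAX_TRIAL_NUM_PER_GPU = 2   # hypothetical value of the new maxTrialNumPerGpu setting

    def occupy(index):
        occupied[index] = occupied.get(index, 0) + 1

    def release(index):
        # drop the entry once the last trial on this GPU finishes
        if occupied.get(index, 0) <= 1:
            occupied.pop(index, None)
        else:
            occupied[index] -= 1

    def selectable(available_indices):
        # a GPU stays selectable while its trial count is below the per-GPU limit
        return [i for i in available_indices
                if occupied.get(i, 0) < MAX_TRIAL_NUM_PER_GPU]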
src/nni_manager/training_service/remote_machine/gpuScheduler.ts

@@ -23,7 +23,8 @@ import * as assert from 'assert';
 import { getLogger, Logger } from '../../common/log';
 import { randomSelect } from '../../common/utils';
 import { GPUInfo } from '../common/gpuData';
-import { parseGpuIndices, RemoteMachineMeta, RemoteMachineScheduleResult, ScheduleResultType, SSHClientManager } from './remoteMachineData';
+import { RemoteMachineTrialJobDetail, parseGpuIndices, RemoteMachineMeta, RemoteMachineScheduleResult, ScheduleResultType, SSHClientManager } from './remoteMachineData';
+import { TrialJobDetail } from 'common/trainingService';
 
 /**
  * A simple GPU scheduler implementation

@@ -45,7 +46,7 @@ export class GPUScheduler {
      * Schedule a machine according to the constraints (requiredGPUNum)
      * @param requiredGPUNum required GPU number
      */
-    public scheduleMachine(requiredGPUNum: number, trialJobId: string) : RemoteMachineScheduleResult {
+    public scheduleMachine(requiredGPUNum: number, trialJobDetail: RemoteMachineTrialJobDetail) : RemoteMachineScheduleResult {
         assert(requiredGPUNum >= 0);
         const allRMs: RemoteMachineMeta[] = Array.from(this.machineSSHClientMap.keys());
         assert(allRMs.length > 0);

@@ -66,7 +67,7 @@ export class GPUScheduler {
         // Currenty the requireGPUNum parameter for all trial jobs are identical.
         if (requiredGPUNum > 0) {
             // Trial job requires GPU
-            const result: RemoteMachineScheduleResult | undefined = this.scheduleGPUHost(requiredGPUNum, trialJobId);
+            const result: RemoteMachineScheduleResult | undefined = this.scheduleGPUHost(requiredGPUNum, trialJobDetail);
             if (result !== undefined) {
                 return result;
             }

@@ -74,9 +75,9 @@ export class GPUScheduler {
             // Trail job does not need GPU
             const allocatedRm: RemoteMachineMeta = this.selectMachine(allRMs);
-            return this.allocateHost(requiredGPUNum, allocatedRm, [], trialJobId);
+            return this.allocateHost(requiredGPUNum, allocatedRm, [], trialJobDetail);
         }
-        this.log.warning(`Scheduler: trialJob id ${trialJobId}, no machine can be scheduled, return TMP_NO_AVAILABLE_GPU `);
+        this.log.warning(`Scheduler: trialJob id ${trialJobDetail.id}, no machine can be scheduled, return TMP_NO_AVAILABLE_GPU `);
 
         return {
             resultType: ScheduleResultType.TMP_NO_AVAILABLE_GPU,

@@ -87,21 +88,35 @@ export class GPUScheduler {
     /**
      * remove the job's gpu reversion
      */
-    public removeGpuReservation(trialJobId: string, rmMeta?: RemoteMachineMeta): void {
-        // If remote machine has no GPU, gpuReservcation is not initialized, so check if it's undefined
-        if (rmMeta !== undefined && rmMeta.gpuReservation !== undefined) {
-            rmMeta.gpuReservation.forEach((reserveTrialJobId: string, gpuIndex: number) => {
-                if (reserveTrialJobId === trialJobId) {
-                    rmMeta.gpuReservation.delete(gpuIndex);
-                }
-            });
-        }
+    public removeGpuReservation(trialJobId: string, trialJobMap: Map<string, RemoteMachineTrialJobDetail>): void {
+        let trialJobDetail: RemoteMachineTrialJobDetail | undefined = trialJobMap.get(trialJobId);
+        if (trialJobDetail === undefined) {
+            throw new Error(`could not get trialJobDetail by id ${trialJobId}`);
+        }
+        if (trialJobDetail.rmMeta !== undefined && trialJobDetail.rmMeta.occupiedGpuIndexMap !== undefined &&
+            trialJobDetail.gpuIndices !== undefined && trialJobDetail.gpuIndices.length > 0) {
+            for (const gpuInfo of trialJobDetail.gpuIndices) {
+                let num: number | undefined = trialJobDetail.rmMeta.occupiedGpuIndexMap.get(gpuInfo.index);
+                if (num !== undefined) {
+                    if (num === 1) {
+                        trialJobDetail.rmMeta.occupiedGpuIndexMap.delete(gpuInfo.index);
+                    } else {
+                        trialJobDetail.rmMeta.occupiedGpuIndexMap.set(gpuInfo.index, num - 1)
+                    }
+                }
+            }
+        }
+        trialJobDetail.gpuIndices = [];
+        trialJobMap.set(trialJobId, trialJobDetail);
     }
 
-    private scheduleGPUHost(requiredGPUNum: number, trialJobId: string): RemoteMachineScheduleResult | undefined {
+    private scheduleGPUHost(requiredGPUNum: number, trialJobDetail: RemoteMachineTrialJobDetail): RemoteMachineScheduleResult | undefined {
         const totalResourceMap: Map<RemoteMachineMeta, GPUInfo[]> = this.gpuResourceDetection();
         const qualifiedRMs: RemoteMachineMeta[] = [];
         totalResourceMap.forEach((gpuInfos: GPUInfo[], rmMeta: RemoteMachineMeta) => {
             if (gpuInfos !== undefined && gpuInfos.length >= requiredGPUNum) {
                 qualifiedRMs.push(rmMeta);
             }

@@ -110,7 +125,7 @@ export class GPUScheduler {
         const allocatedRm: RemoteMachineMeta = this.selectMachine(qualifiedRMs);
         const gpuInfos: GPUInfo[] | undefined = totalResourceMap.get(allocatedRm);
         if (gpuInfos !== undefined) { // should always true
-            return this.allocateHost(requiredGPUNum, allocatedRm, gpuInfos, trialJobId);
+            return this.allocateHost(requiredGPUNum, allocatedRm, gpuInfos, trialJobDetail);
         } else {
             assert(false, 'gpuInfos is undefined');
         }

@@ -130,9 +145,6 @@ export class GPUScheduler {
             // Assgin totoal GPU count as init available GPU number
             if (rmMeta.gpuSummary !== undefined) {
                 const availableGPUs: GPUInfo[] = [];
-                if (rmMeta.gpuReservation === undefined) {
-                    rmMeta.gpuReservation = new Map<number, string>();
-                }
                 const designatedGpuIndices: Set<number> | undefined = parseGpuIndices(rmMeta.gpuIndices);
                 if (designatedGpuIndices !== undefined) {
                     for (const gpuIndex of designatedGpuIndices) {

@@ -145,10 +157,20 @@ export class GPUScheduler {
                 rmMeta.gpuSummary.gpuInfos.forEach((gpuInfo: GPUInfo) => {
                     // if the GPU has active process, OR be reserved by a job,
                     // or index not in gpuIndices configuration in machineList,
+                    // or trial number on a GPU reach max number,
                     // We should NOT allocate this GPU
-                    if (gpuInfo.activeProcessNum === 0 && !rmMeta.gpuReservation.has(gpuInfo.index)
-                        && (designatedGpuIndices === undefined || designatedGpuIndices.has(gpuInfo.index))) {
-                        availableGPUs.push(gpuInfo);
+                    // if users set useActiveGpu, use the gpu whether there is another activeProcess
+                    if (designatedGpuIndices === undefined || designatedGpuIndices.has(gpuInfo.index)) {
+                        if (rmMeta.occupiedGpuIndexMap !== undefined) {
+                            let num = rmMeta.occupiedGpuIndexMap.get(gpuInfo.index);
+                            let maxTrialNumPerGpu: number = rmMeta.maxTrialNumPerGpu ? rmMeta.maxTrialNumPerGpu : 1;
+                            if ((num === undefined && (!rmMeta.useActiveGpu && gpuInfo.activeProcessNum === 0 || rmMeta.useActiveGpu))
+                                || (num !== undefined && num < maxTrialNumPerGpu)) {
+                                availableGPUs.push(gpuInfo);
+                            }
+                        } else {
+                            throw new Error(`occupiedGpuIndexMap initialize error!`);
+                        }
                     }
                 });
                 totalResourceMap.set(rmMeta, availableGPUs);

@@ -170,14 +192,22 @@ export class GPUScheduler {
     }
 
     private allocateHost(requiredGPUNum: number, rmMeta: RemoteMachineMeta,
-                         gpuInfos: GPUInfo[], trialJobId: string): RemoteMachineScheduleResult {
+                         gpuInfos: GPUInfo[], trialJobDetail: RemoteMachineTrialJobDetail): RemoteMachineScheduleResult {
         assert(gpuInfos.length >= requiredGPUNum);
         const allocatedGPUs: GPUInfo[] = this.selectGPUsForTrial(gpuInfos, requiredGPUNum);
 
         allocatedGPUs.forEach((gpuInfo: GPUInfo) => {
-            rmMeta.gpuReservation.set(gpuInfo.index, trialJobId);
+            if (rmMeta.occupiedGpuIndexMap !== undefined) {
+                let num = rmMeta.occupiedGpuIndexMap.get(gpuInfo.index);
+                if (num === undefined) {
+                    num = 0;
+                }
+                rmMeta.occupiedGpuIndexMap.set(gpuInfo.index, num + 1);
+            } else {
+                throw new Error(`Machine ${rmMeta.ip} occupiedGpuIndexMap initialize error!`);
+            }
         });
+        trialJobDetail.gpuIndices = allocatedGPUs;
+        trialJobDetail.rmMeta = rmMeta;
 
         return {
             resultType: ScheduleResultType.SUCCEED,
             scheduleInfo: {
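Note: the rewritten gpuResourceDetection above replaces the old per-GPU reservation with a counted occupancy map that also honours the new useActiveGpu and maxTrialNumPerGpu settings. A small Python sketch of the availability rule it now applies per GPU (not NNI code, for illustration only):

    def gpu_available(num_trials_on_gpu, active_process_num,
                      use_active_gpu=False, max_trial_num_per_gpu=1):
        # GPU with no NNI trials yet: usable if useActiveGpu is set,
        # or if no other process is active on it.
        if num_trials_on_gpu is None:
            return use_active_gpu or active_process_num == 0
        # GPU already hosting trials: usable while below the per-GPU limit.
        return num_trials_on_gpu < max_trial_num_per_gpu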
src/nni_manager/training_service/remote_machine/remoteMachineData.ts

@@ -23,7 +23,7 @@ import * as fs from 'fs';
 import { Client, ConnectConfig } from 'ssh2';
 import { Deferred } from 'ts-deferred';
 import { JobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../common/trainingService';
-import { GPUSummary } from '../common/gpuData';
+import { GPUSummary, GPUInfo } from '../common/gpuData';
 
 /**
  * Metadata of remote machine for configuration and statuc query

@@ -36,20 +36,23 @@ export class RemoteMachineMeta {
     public readonly sshKeyPath?: string;
     public readonly passphrase?: string;
     public gpuSummary: GPUSummary | undefined;
-    // GPU Reservation info, the key is GPU index, the value is the job id which reserves this GPU
-    public gpuReservation: Map<number, string>;
     public readonly gpuIndices?: string;
+    public readonly maxTrialNumPerGpu?: number;
+    public occupiedGpuIndexMap: Map<number, number>;
+    public readonly useActiveGpu?: boolean = false;
 
     constructor(ip: string, port: number, username: string, passwd: string,
-                sshKeyPath: string, passphrase: string, gpuIndices?: string) {
+                sshKeyPath: string, passphrase: string, gpuIndices?: string, maxTrialNumPerGpu?: number, useActiveGpu?: boolean) {
         this.ip = ip;
         this.port = port;
         this.username = username;
         this.passwd = passwd;
         this.sshKeyPath = sshKeyPath;
         this.passphrase = passphrase;
-        this.gpuReservation = new Map<number, string>();
         this.gpuIndices = gpuIndices;
+        this.maxTrialNumPerGpu = maxTrialNumPerGpu;
+        this.occupiedGpuIndexMap = new Map<number, number>();
+        this.useActiveGpu = useActiveGpu;
     }
 }

@@ -97,6 +100,7 @@ export class RemoteMachineTrialJobDetail implements TrialJobDetail {
     public sequenceId: number;
     public rmMeta?: RemoteMachineMeta;
     public isEarlyStopped?: boolean;
+    public gpuIndices: GPUInfo[];
 
     constructor(id: string, status: TrialJobStatus, submitTime: number,
                 workingDirectory: string, form: JobApplicationForm, sequenceId: number) {

@@ -107,6 +111,7 @@ export class RemoteMachineTrialJobDetail implements TrialJobDetail {
         this.form = form;
         this.sequenceId = sequenceId;
         this.tags = [];
+        this.gpuIndices = []
     }
 }
src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts

@@ -282,7 +282,7 @@ class RemoteMachineTrainingService implements TrainingService {
     private updateGpuReservation() {
         for (const [key, value] of this.trialJobsMap) {
             if (!['WAITING', 'RUNNING'].includes(value.status)) {
-                this.gpuScheduler.removeGpuReservation(value.id, value.rmMeta);
+                this.gpuScheduler.removeGpuReservation(key, this.trialJobsMap);
             }
         };
     }

@@ -521,7 +521,7 @@ class RemoteMachineTrainingService implements TrainingService {
             return deferred.promise;
         }
 
         // get an ssh client from scheduler
-        const rmScheduleResult: RemoteMachineScheduleResult = this.gpuScheduler.scheduleMachine(this.trialConfig.gpuNum, trialJobId);
+        const rmScheduleResult: RemoteMachineScheduleResult = this.gpuScheduler.scheduleMachine(this.trialConfig.gpuNum, trialJobDetail);
         if (rmScheduleResult.resultType === ScheduleResultType.REQUIRE_EXCEED_TOTAL) {
             const errorMessage: string = `Required GPU number ${this.trialConfig.gpuNum} is too large, no machine can meet`;
             this.log.error(errorMessage);

@@ -542,6 +542,7 @@ class RemoteMachineTrainingService implements TrainingService {
             trialJobDetail.url = `file://${rmScheduleInfo.rmMeta.ip}:${trialWorkingFolder}`;
             trialJobDetail.startTime = Date.now();
+            this.trialJobsMap.set(trialJobId, trialJobDetail);
 
             deferred.resolve(true);
         } else if (rmScheduleResult.resultType === ScheduleResultType.TMP_NO_AVAILABLE_GPU) {
             this.log.info(`Right now no available GPU can be allocated for trial ${trialJobId}, will try to schedule later`);
src/sdk/pynni/nni/bohb_advisor/bohb_advisor.py

@@ -31,7 +31,7 @@ import ConfigSpace.hyperparameters as CSH
 from nni.protocol import CommandType, send
 from nni.msg_dispatcher_base import MsgDispatcherBase
-from nni.utils import OptimizeMode, extract_scalar_reward
+from nni.utils import OptimizeMode, extract_scalar_reward, randint_to_quniform
 
 from .config_generator import CG_BOHB

@@ -443,6 +443,7 @@ class BOHB(MsgDispatcherBase):
         search space of this experiment
         """
         search_space = data
+        randint_to_quniform(search_space)
         cs = CS.ConfigurationSpace()
 
         for var in search_space:
             _type = str(search_space[var]["_type"])
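Note: this commit calls nni.utils.randint_to_quniform on the incoming search space in several tuners/advisors (BOHB here, and Evolution, Hyperband and Hyperopt below), but the helper's implementation is not part of this diff. A rough, purely illustrative sketch of the assumed intent, rewriting randint entries as integer-valued quniform entries so tuners without native randint support can handle them:

    # Illustrative sketch only; the real randint_to_quniform lives in nni.utils
    # and its exact behaviour is not shown in this commit.
    def randint_to_quniform_sketch(search_space):
        for spec in search_space.values():
            if isinstance(spec, dict) and spec.get('_type') == 'randint':
                low, high = spec['_value'][0], spec['_value'][1]
                spec['_type'] = 'quniform'
                spec['_value'] = [low, high, 1]   # assumed mapping to a q=1 quniform range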
src/sdk/pynni/nni/evolution_tuner/README_zh_CN.md (deleted, 100644 → 0)

# Naive Evolution Tuner

## Naive Evolution

Naive Evolution comes from [Large-Scale Evolution of Image Classifiers](https://arxiv.org/pdf/1703.01041.pdf). It randomly initializes a population based on the search space. For each generation, it chooses the better ones and performs some mutations (e.g., changing a hyperparameter, adding or removing one layer) on them to produce the next generation. Naive Evolution requires many trials to work well, but it is also very simple and easy to extend with new features.
src/sdk/pynni/nni/evolution_tuner/evolution_tuner.py

@@ -26,7 +26,7 @@ import random
 import numpy as np
 
 from nni.tuner import Tuner
-from nni.utils import NodeType, OptimizeMode, extract_scalar_reward, split_index
+from nni.utils import NodeType, OptimizeMode, extract_scalar_reward, split_index, randint_to_quniform
 import nni.parameter_expressions as parameter_expressions

@@ -175,6 +175,7 @@ class EvolutionTuner(Tuner):
         search_space : dict
         """
         self.searchspace_json = search_space
+        randint_to_quniform(self.searchspace_json)
         self.space = json2space(self.searchspace_json)
 
         self.random_state = np.random.RandomState()
src/sdk/pynni/nni/gridsearch_tuner/README_zh_CN.md (deleted, 100644 → 0)

# Grid Search

## Grid Search

Grid Search performs an exhaustive sweep over all hyperparameter combinations defined in the search space file. Note that the only acceptable types in the search space are `choice`, `quniform` and `qloguniform`. **The number `q` in `quniform` and `qloguniform` has a different meaning here** than in the [search space](../../../../../docs/zh_CN/SearchSpaceSpec.md) specification: it is the number of values that will be sampled evenly between `low` and `high`.
src/sdk/pynni/nni/hyperband_advisor/README_zh_CN.md (deleted, 100644 → 0)

# Hyperband on NNI

## 1. Introduction

[Hyperband](https://arxiv.org/pdf/1603.06560.pdf) is a popular AutoML algorithm. The basic idea of Hyperband is to create several buckets of configurations, each bucket holding `n` randomly generated hyperparameter configurations and each configuration running with `r` resources (e.g., number of epochs, number of mini-batches). After the `n` configurations finish, the best `n/eta` configurations are kept and run with `r*eta` resources. At the end, the best configuration found so far is selected.

## 2. Parallelism in the implementation

First, this is an example of writing an AutoML algorithm based on MsgDispatcherBase rather than on Tuner and Assessor. Written this way, Hyperband integrates the functionality of both a tuner and an assessor, which is why it is called an Advisor.

Second, this implementation fully exploits Hyperband's internal parallelism. Specifically, the next bucket does not wait strictly until the current bucket finishes; a new bucket starts running as long as resources are available.

## 3. Usage

To use Hyperband, add the following to the experiment's YAML configuration file:

    advisor:
      #choice: Hyperband
      builtinAdvisorName: Hyperband
      classArgs:
        #R: the maximum number of STEPS
        R: 100
        #eta: the proportion of discarded trials
        eta: 3
        #choice: maximize, minimize
        optimize_mode: maximize

Note that once an Advisor is used, a Tuner or Assessor cannot be added to the configuration file. When using Hyperband, the hyperparameters (key-value pairs) received by the trial code contain one extra key, `STEPS`, besides the user-defined hyperparameters. **With `STEPS`, a trial can control how long it runs.**

The `metric` passed to `report_intermediate_result(metric)` and `report_final_result(metric)` in the trial code **should be a number, or a dict containing a key named default whose value is a number**. This is the value to be maximized or minimized, for example accuracy or loss.

`R` and `eta` are the Hyperband parameters that can be changed. `R` is the maximum number of steps (STEPS) that can be allocated to a configuration; here, STEPS may stand for epochs or mini-batches. The trial code should use `STEPS` to control how long it runs; refer to the example `examples/trials/mnist-hyperband/` for details. `eta` means that `n/eta` of the `n` configurations survive and are run with more STEPS.

Here is an example with `R=81` and `eta=3`:

| | s=4 | s=3 | s=2 | s=1 | s=0 |
| - | ---- | ---- | ---- | ---- | ---- |
| i | n r | n r | n r | n r | n r |
| 0 | 81 1 | 27 3 | 9 9 | 6 27 | 5 81 |
| 1 | 27 3 | 9 9 | 3 27 | 2 81 | |
| 2 | 9 9 | 3 27 | 1 81 | | |
| 3 | 3 27 | 1 81 | | | |
| 4 | 1 81 | | | | |

`s` denotes a bucket, `n` the number of configurations generated, and the corresponding `r` how many STEPS those configurations run. `i` is the round index; for example, bucket 4 has 5 rounds and bucket 3 has 4 rounds.

For how to write trial code, refer to the instructions in `examples/trials/mnist-hyperband/`.

## 4. To be improved

The current Hyperband implementation could be improved by supporting a better early-termination algorithm, because the best `n/eta` configurations are not necessarily all promising; the unpromising ones could be terminated earlier.

In the current implementation, configurations are generated randomly, following the design of [the paper](https://arxiv.org/pdf/1603.06560.pdf). As a further improvement, configuration generation could leverage more advanced algorithms.
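For illustration of the `STEPS` mechanism described in the deleted README above, a minimal trial skeleton might look like the following; `STEPS` and the report APIs come from the README, while the `lr` parameter and the training/evaluation helpers are hypothetical placeholders for user code:

    import nni

    def train_one_step(lr):
        pass                       # placeholder for the user's real training code

    def evaluate():
        return 0.9                 # placeholder metric

    params = nni.get_next_parameter()
    steps = params['STEPS']        # budget injected by the Hyperband advisor
    lr = params.get('lr', 0.01)    # 'lr' is a hypothetical user-defined hyperparameter

    for _ in range(steps):
        train_one_step(lr)
        nni.report_intermediate_result(evaluate())  # a number, or a dict with key 'default'

    nni.report_final_result(evaluate())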
src/sdk/pynni/nni/hyperband_advisor/hyperband_advisor.py

@@ -31,7 +31,7 @@ import json_tricks
 from nni.protocol import CommandType, send
 from nni.msg_dispatcher_base import MsgDispatcherBase
 from nni.common import init_logger
-from nni.utils import NodeType, OptimizeMode, extract_scalar_reward
+from nni.utils import NodeType, OptimizeMode, extract_scalar_reward, randint_to_quniform
 import nni.parameter_expressions as parameter_expressions
 
 _logger = logging.getLogger(__name__)

@@ -357,6 +357,7 @@ class Hyperband(MsgDispatcherBase):
         number of trial jobs
         """
         self.searchspace_json = data
+        randint_to_quniform(self.searchspace_json)
         self.random_state = np.random.RandomState()
 
     def handle_trial_end(self, data):
src/sdk/pynni/nni/hyperopt_tuner/README_zh_CN.md (deleted, 100644 → 0)

# TPE, Random Search, Anneal Tuners

## TPE

The Tree-structured Parzen Estimator (TPE) is a sequential model-based optimization (SMBO) approach. SMBO methods sequentially construct models to approximate the performance of hyperparameters based on historical measurements, and then choose new hyperparameters to test based on this model. The TPE approach models P(x|y) and P(y), where x denotes hyperparameters and y the associated evaluation metric. P(x|y) is modeled by transforming the generative process of hyperparameters, replacing the distributions of the configuration prior with non-parametric densities. Details can be found in [Algorithms for Hyper-Parameter Optimization](https://papers.nips.cc/paper/4443-algorithms-for-hyper-parameter-optimization.pdf).

## Random Search

[Random Search for Hyper-Parameter Optimization](http://www.jmlr.org/papers/volume13/bergstra12a/bergstra12a.pdf) shows that random search is surprisingly simple and effective. We suggest using random search as a baseline when no prior knowledge about the distribution of hyperparameters is available.

## Anneal

This simple annealing algorithm starts by sampling from the prior and over time tends to sample from points closer and closer to the best ones observed. It is a simple variation on random search that leverages smoothness in the response surface. The annealing rate is not adaptive.
src/sdk/pynni/nni/hyperopt_tuner/hyperopt_tuner.py

@@ -27,7 +27,7 @@ import logging
 import hyperopt as hp
 import numpy as np
 from nni.tuner import Tuner
-from nni.utils import NodeType, OptimizeMode, extract_scalar_reward, split_index
+from nni.utils import NodeType, OptimizeMode, extract_scalar_reward, split_index, randint_to_quniform
 
 logger = logging.getLogger('hyperopt_AutoML')

@@ -153,14 +153,14 @@ def _add_index(in_x, parameter):
     Will change to format in hyperopt, like:
     {'dropout_rate': 0.8, 'conv_size': {'_index': 1, '_value': 3}, 'hidden_size': {'_index': 1, '_value': 512}}
     """
-    if TYPE not in in_x:  # if at the top level
+    if NodeType.TYPE not in in_x:  # if at the top level
         out_y = dict()
         for key, value in parameter.items():
             out_y[key] = _add_index(in_x[key], value)
         return out_y
     elif isinstance(in_x, dict):
-        value_type = in_x[TYPE]
-        value_format = in_x[VALUE]
+        value_type = in_x[NodeType.TYPE]
+        value_format = in_x[NodeType.VALUE]
         if value_type == "choice":
             choice_name = parameter[0] if isinstance(parameter,
                                                      list) else parameter

@@ -173,15 +173,14 @@ def _add_index(in_x, parameter):
                 choice_value_format = item[1]
                 if choice_key == choice_name:
                     return {
-                        INDEX: pos,
-                        VALUE: [
+                        NodeType.INDEX: pos,
+                        NodeType.VALUE: [
                             choice_name,
                             _add_index(choice_value_format, parameter[1])
                         ]
                     }
             elif choice_name == item:
-                return {INDEX: pos, VALUE: item}
+                return {NodeType.INDEX: pos, NodeType.VALUE: item}
     else:
         return parameter

@@ -232,6 +231,8 @@ class HyperoptTuner(Tuner):
         search_space : dict
         """
         self.json = search_space
+        randint_to_quniform(self.json)
+
         search_space_instance = json2space(self.json)
         rstate = np.random.RandomState()
         trials = hp.Trials()
src/sdk/pynni/nni/medianstop_assessor/README_zh_CN.md (deleted, 100644 → 0)

# Medianstop Assessor

## Median Stop

Median Stop is a simple early-termination policy for trials, described in this [paper](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/46180.pdf). A trial X is stopped early at step S if its best objective value up to step S is clearly worse than the median of the values of all completed trials at step S.
src/sdk/pynni/nni/metis_tuner/README_zh_CN.md (deleted, 100644 → 0)

# Metis Tuner

## Metis Tuner

Most hyperparameter tuning tools only predict the optimal configuration, whereas [Metis](https://www.microsoft.com/en-us/research/publication/metis-robustly-tuning-tail-latencies-cloud-systems/) has the advantage of offering two outputs: (a) the current prediction of the optimal configuration, and (b) a suggestion for the next trial. No more random guessing!

Most tools assume the training data contain no noise, but Metis knows whether a particular hyperparameter needs to be re-sampled.

Most tools have the problem of over-exploiting existing results, while Metis' search strategy balances exploration, exploitation and (optional) re-sampling.

Metis belongs to the class of sequential model-based optimization (SMBO) and is built on the Bayesian optimization framework. To model the hyperparameter-performance space, Metis uses both a Gaussian Process and a Gaussian Mixture Model (GMM). Because each trial can be very expensive in time, Metis makes heavy use of the existing model for inference. In each iteration, Metis performs two tasks:

It finds the global optimum in the Gaussian Process space. This point represents the best configuration.

It identifies the candidate for the next hyperparameter setting. This is achieved by exploring, exploiting and re-sampling the latent information.

Note that the only acceptable types in the search space are `choice`, `quniform`, `uniform` and `randint`.

For more details, see the paper: https://www.microsoft.com/en-us/research/publication/metis-robustly-tuning-tail-latencies-cloud-systems/
src/sdk/pynni/nni/metis_tuner/metis_tuner.py

@@ -133,7 +133,7 @@ class MetisTuner(Tuner):
                 self.x_bounds[idx] = bounds
                 self.x_types[idx] = 'discrete_int'
             elif key_type == 'randint':
-                self.x_bounds[idx] = [0, key_range[0]]
+                self.x_bounds[idx] = [key_range[0], key_range[1]]
                 self.x_types[idx] = 'range_int'
             elif key_type == 'uniform':
                 self.x_bounds[idx] = [key_range[0], key_range[1]]
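Note: the Metis change above takes the randint bounds from both ends of the search-space `_value` instead of assuming a lower bound of 0. A small illustration with hypothetical values (not from this commit):

    # Hypothetical search-space entry: "batch_size": {"_type": "randint", "_value": [16, 128]}
    key_range = [16, 128]
    old_bounds = [0, key_range[0]]              # before the fix: [0, 16]
    new_bounds = [key_range[0], key_range[1]]   # after the fix:  [16, 128]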