Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
df6145a2
Commit
df6145a2
authored
Dec 16, 2020
by
Yuge Zhang
Browse files
Merge branch 'master' of
https://github.com/microsoft/nni
into dev-retiarii
parents
0f0c6288
f8424a9f
Changes
205
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
259 additions
and
56 deletions
+259
-56
nni/tools/nnictl/nnictl_utils.py
nni/tools/nnictl/nnictl_utils.py
+1
-0
nni/tools/package_utils/__init__.py
nni/tools/package_utils/__init__.py
+2
-2
nni/tools/trial_tool/trial_runner.py
nni/tools/trial_tool/trial_runner.py
+1
-1
test/ut/sdk/test_builtin_tuners.py
test/ut/sdk/test_builtin_tuners.py
+9
-10
test/ut/sdk/test_hyperopt_tuner.py
test/ut/sdk/test_hyperopt_tuner.py
+1
-1
test/ut/sdk/test_networkmorphism_tuner.py
test/ut/sdk/test_networkmorphism_tuner.py
+1
-1
ts/nni_manager/core/nniExperimentsManager.ts
ts/nni_manager/core/nniExperimentsManager.ts
+6
-1
ts/nni_manager/core/nnimanager.ts
ts/nni_manager/core/nnimanager.ts
+1
-0
ts/nni_manager/main.ts
ts/nni_manager/main.ts
+8
-14
ts/nni_manager/rest_server/restValidationSchemas.ts
ts/nni_manager/rest_server/restValidationSchemas.ts
+5
-1
ts/nni_manager/training_service/common/trialConfigMetadataKey.ts
...manager/training_service/common/trialConfigMetadataKey.ts
+5
-1
ts/nni_manager/training_service/local/localTrainingService.ts
...ni_manager/training_service/local/localTrainingService.ts
+15
-2
ts/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
...ng_service/remote_machine/remoteMachineTrainingService.ts
+4
-0
ts/nni_manager/training_service/reusable/channels/webCommandChannel.ts
...r/training_service/reusable/channels/webCommandChannel.ts
+14
-1
ts/nni_manager/training_service/reusable/environment.ts
ts/nni_manager/training_service/reusable/environment.ts
+20
-6
ts/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts
...ng_service/reusable/environments/amlEnvironmentService.ts
+9
-6
ts/nni_manager/training_service/reusable/environments/environmentServiceFactory.ts
...ervice/reusable/environments/environmentServiceFactory.ts
+22
-0
ts/nni_manager/training_service/reusable/environments/localEnvironmentService.ts
..._service/reusable/environments/localEnvironmentService.ts
+118
-0
ts/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts
...ervice/reusable/environments/openPaiEnvironmentService.ts
+4
-0
ts/nni_manager/training_service/reusable/environments/remoteEnvironmentService.ts
...service/reusable/environments/remoteEnvironmentService.ts
+13
-9
No files found.
nni/tools/nnictl/nnictl_utils.py
View file @
df6145a2
...
...
@@ -54,6 +54,7 @@ def update_experiment():
rest_pid
=
nni_config
.
get_config
(
'restServerPid'
)
if
not
detect_process
(
rest_pid
):
experiment_config
.
update_experiment
(
key
,
'status'
,
'STOPPED'
)
experiment_config
.
update_experiment
(
key
,
'port'
,
None
)
continue
def
check_experiment_id
(
args
,
update
=
True
):
...
...
nni/tools/package_utils/__init__.py
View file @
df6145a2
...
...
@@ -43,8 +43,8 @@ def get_registered_algo_meta(builtin_name, algo_type=None):
-------
Returns meta information of the specified builtin algorithm, for example:
{
'classArgsValidator': 'nni.smac_tuner.
smac_tuner.
SMACClassArgsValidator',
'className': 'nni.smac_tuner.
smac_tuner.
SMACTuner',
'classArgsValidator': 'nni.smac_tuner.SMACClassArgsValidator',
'className': 'nni.smac_tuner.SMACTuner',
'builtinName': 'SMAC'
}
"""
...
...
nni/tools/trial_tool/trial_runner.py
View file @
df6145a2
...
...
@@ -25,7 +25,6 @@ def main_loop(args):
'''main loop logic for trial runner'''
idle_last_time
=
datetime
.
now
()
gpu_refresh_last_time
=
datetime
.
now
()
-
timedelta
(
minutes
=
1
)
try
:
if
args
.
job_pid_file
:
with
open
(
args
.
job_pid_file
,
'w'
)
as
job_file
:
...
...
@@ -188,6 +187,7 @@ if __name__ == '__main__':
os
.
environ
[
'NNI_EXP_ID'
]
=
args
.
exp_id
os
.
environ
[
'MULTI_PHASE'
]
=
"true"
os
.
environ
[
'NNI_TRIAL_JOB_ID'
]
=
"runner"
os
.
environ
[
'REUSE_MODE'
]
=
"true"
from
.log_utils
import
LogType
,
RemoteLogger
,
StdOutputType
,
nni_log
from
.trial
import
Trial
...
...
test/ut/sdk/test_builtin_tuners.py
View file @
df6145a2
...
...
@@ -11,22 +11,21 @@ import sys
from
collections
import
deque
from
unittest
import
TestCase
,
main
from
nni.algorithms.hpo.batch_tuner
.batch_tuner
import
BatchTuner
from
nni.algorithms.hpo.evolution_tuner
.evolution_tuner
import
EvolutionTuner
from
nni.algorithms.hpo.gp_tuner
.gp_tuner
import
GPTuner
from
nni.algorithms.hpo.gridsearch_tuner
.gridsearch_tuner
import
GridSearchTuner
from
nni.algorithms.hpo.hyperopt_tuner
.hyperopt_tuner
import
HyperoptTuner
from
nni.algorithms.hpo.metis_tuner
.metis_tuner
import
MetisTuner
from
nni.algorithms.hpo.pbt_tuner
.pbt_tuner
import
PBTTuner
from
nni.algorithms.hpo.regularized_evolution_tuner
.regularized_evolution_tuner
import
RegularizedEvolutionTuner
from
nni.algorithms.hpo.batch_tuner
import
BatchTuner
from
nni.algorithms.hpo.evolution_tuner
import
EvolutionTuner
from
nni.algorithms.hpo.gp_tuner
import
GPTuner
from
nni.algorithms.hpo.gridsearch_tuner
import
GridSearchTuner
from
nni.algorithms.hpo.hyperopt_tuner
import
HyperoptTuner
from
nni.algorithms.hpo.metis_tuner
import
MetisTuner
from
nni.algorithms.hpo.pbt_tuner
import
PBTTuner
from
nni.algorithms.hpo.regularized_evolution_tuner
import
RegularizedEvolutionTuner
from
nni.runtime.msg_dispatcher
import
_pack_parameter
,
MsgDispatcher
if
sys
.
platform
!=
'win32'
:
from
nni.algorithms.hpo.smac_tuner
.smac_tuner
import
SMACTuner
from
nni.algorithms.hpo.smac_tuner
import
SMACTuner
from
nni.tuner
import
Tuner
logging
.
basicConfig
(
level
=
logging
.
INFO
)
logger
=
logging
.
getLogger
(
'test_tuner'
)
...
...
test/ut/sdk/test_hyperopt_tuner.py
View file @
df6145a2
...
...
@@ -9,7 +9,7 @@ from unittest import TestCase, main
import
hyperopt
as
hp
from
nni.algorithms.hpo.hyperopt_tuner
.hyperopt_tuner
import
json2space
,
json2parameter
,
json2vals
,
HyperoptTuner
from
nni.algorithms.hpo.hyperopt_tuner
import
json2space
,
json2parameter
,
json2vals
,
HyperoptTuner
class
HyperoptTunerTestCase
(
TestCase
):
...
...
test/ut/sdk/test_networkmorphism_tuner.py
View file @
df6145a2
...
...
@@ -6,6 +6,7 @@ from unittest import TestCase, main
from
copy
import
deepcopy
import
torch
from
nni.algorithms.hpo.networkmorphism_tuner
import
NetworkMorphismTuner
from
nni.algorithms.hpo.networkmorphism_tuner.graph
import
graph_to_json
,
json_to_graph
from
nni.algorithms.hpo.networkmorphism_tuner.graph_transformer
import
(
to_deeper_graph
,
...
...
@@ -13,7 +14,6 @@ from nni.algorithms.hpo.networkmorphism_tuner.graph_transformer import (
to_wider_graph
,
)
from
nni.algorithms.hpo.networkmorphism_tuner.layers
import
layer_description_extractor
from
nni.algorithms.hpo.networkmorphism_tuner.networkmorphism_tuner
import
NetworkMorphismTuner
from
nni.algorithms.hpo.networkmorphism_tuner.nn
import
CnnGenerator
...
...
ts/nni_manager/core/nniExperimentsManager.ts
View file @
df6145a2
...
...
@@ -77,7 +77,11 @@ class NNIExperimentsManager implements ExperimentManager {
this
.
withLockSync
(()
=>
{
const
experimentsInformation
=
JSON
.
parse
(
fs
.
readFileSync
(
this
.
experimentsPath
).
toString
());
assert
(
experimentId
in
experimentsInformation
,
`Experiment Manager: Experiment Id
${
experimentId
}
not found, this should not happen`
);
experimentsInformation
[
experimentId
][
key
]
=
value
;
if
(
value
!==
undefined
)
{
experimentsInformation
[
experimentId
][
key
]
=
value
;
}
else
{
delete
experimentsInformation
[
experimentId
][
key
];
}
fs
.
writeFileSync
(
this
.
experimentsPath
,
JSON
.
stringify
(
experimentsInformation
,
null
,
4
));
});
}
catch
(
err
)
{
...
...
@@ -128,6 +132,7 @@ class NNIExperimentsManager implements ExperimentManager {
updateList
.
forEach
((
expId
:
string
)
=>
{
if
(
experimentsInformation
[
expId
])
{
experimentsInformation
[
expId
][
'
status
'
]
=
'
STOPPED
'
;
delete
experimentsInformation
[
expId
][
'
port
'
];
}
else
{
this
.
log
.
error
(
`Experiment Manager: Experiment Id
${
expId
}
not found, this should not happen`
);
}
...
...
ts/nni_manager/core/nnimanager.ts
View file @
df6145a2
...
...
@@ -480,6 +480,7 @@ class NNIManager implements Manager {
}
await
this
.
storeExperimentProfile
();
this
.
setStatus
(
'
STOPPED
'
);
this
.
experimentManager
.
setExperimentInfo
(
this
.
experimentProfile
.
id
,
'
port
'
,
undefined
);
}
private
async
periodicallyUpdateExecDuration
():
Promise
<
void
>
{
...
...
ts/nni_manager/main.ts
View file @
df6145a2
...
...
@@ -28,6 +28,7 @@ import { RouterTrainingService } from './training_service/reusable/routerTrainin
import
{
PAIYarnTrainingService
}
from
'
./training_service/pai/paiYarn/paiYarnTrainingService
'
;
import
{
DLTSTrainingService
}
from
'
./training_service/dlts/dltsTrainingService
'
;
function
initStartupInfo
(
startExpMode
:
string
,
experimentId
:
string
,
basePort
:
number
,
platform
:
string
,
logDirectory
:
string
,
experimentLogLevel
:
string
,
readonly
:
boolean
,
dispatcherPipe
:
string
):
void
{
...
...
@@ -36,22 +37,15 @@ function initStartupInfo(
}
async
function
initContainer
(
foreground
:
boolean
,
platformMode
:
string
,
logFileName
?:
string
):
Promise
<
void
>
{
if
(
platformMode
===
'
adl
'
)
{
const
routerPlatformMode
=
[
'
remote
'
,
'
pai
'
,
'
aml
'
,
'
heterogeneous
'
];
if
(
routerPlatformMode
.
includes
(
platformMode
))
{
Container
.
bind
(
TrainingService
)
.
to
(
Adl
TrainingService
)
.
to
(
Router
TrainingService
)
.
scope
(
Scope
.
Singleton
);
}
else
if
(
platformMode
===
'
local
'
)
{
Container
.
bind
(
TrainingService
)
.
to
(
LocalTrainingService
)
.
scope
(
Scope
.
Singleton
);
}
else
if
(
platformMode
===
'
remote
'
)
{
Container
.
bind
(
TrainingService
)
.
to
(
RouterTrainingService
)
.
scope
(
Scope
.
Singleton
);
}
else
if
(
platformMode
===
'
pai
'
)
{
Container
.
bind
(
TrainingService
)
.
to
(
RouterTrainingService
)
.
scope
(
Scope
.
Singleton
);
}
else
if
(
platformMode
===
'
paiYarn
'
)
{
Container
.
bind
(
TrainingService
)
.
to
(
PAIYarnTrainingService
)
...
...
@@ -68,9 +62,9 @@ async function initContainer(foreground: boolean, platformMode: string, logFileN
Container
.
bind
(
TrainingService
)
.
to
(
DLTSTrainingService
)
.
scope
(
Scope
.
Singleton
);
}
else
if
(
platformMode
===
'
a
m
l
'
)
{
}
else
if
(
platformMode
===
'
a
d
l
'
)
{
Container
.
bind
(
TrainingService
)
.
to
(
Router
TrainingService
)
.
to
(
Adl
TrainingService
)
.
scope
(
Scope
.
Singleton
);
}
else
{
throw
new
Error
(
`Error: unsupported mode:
${
platformMode
}
`
);
...
...
@@ -103,7 +97,7 @@ async function initContainer(foreground: boolean, platformMode: string, logFileN
function
usage
():
void
{
console
.
info
(
'
usage: node main.js --port <port> --mode
\
<
adl/
local/remote/pai/kubeflow/frameworkcontroller/paiYarn/aml> --start_mode <new/resume> --experiment_id <id> --foreground <true/false>
'
);
<local/remote/pai/kubeflow/frameworkcontroller/paiYarn/aml
/adl/heterogeneous
> --start_mode <new/resume> --experiment_id <id> --foreground <true/false>
'
);
}
const
strPort
:
string
=
parseArg
([
'
--port
'
,
'
-p
'
]);
...
...
@@ -123,7 +117,7 @@ const foreground: boolean = foregroundArg.toLowerCase() === 'true' ? true : fals
const
port
:
number
=
parseInt
(
strPort
,
10
);
const
mode
:
string
=
parseArg
([
'
--mode
'
,
'
-m
'
]);
if
(
!
[
'
adl
'
,
'
local
'
,
'
remote
'
,
'
pai
'
,
'
kubeflow
'
,
'
frameworkcontroller
'
,
'
paiYarn
'
,
'
dlts
'
,
'
aml
'
].
includes
(
mode
))
{
if
(
!
[
'
local
'
,
'
remote
'
,
'
pai
'
,
'
kubeflow
'
,
'
frameworkcontroller
'
,
'
paiYarn
'
,
'
dlts
'
,
'
aml
'
,
'
adl
'
,
'
heterogeneous
'
].
includes
(
mode
))
{
console
.
log
(
`FATAL: unknown mode:
${
mode
}
`
);
usage
();
process
.
exit
(
1
);
...
...
ts/nni_manager/rest_server/restValidationSchemas.ts
View file @
df6145a2
...
...
@@ -23,7 +23,8 @@ export namespace ValidationSchemas {
local_config
:
joi
.
object
({
// eslint-disable-line @typescript-eslint/camelcase
gpuIndices
:
joi
.
string
(),
maxTrialNumPerGpu
:
joi
.
number
(),
useActiveGpu
:
joi
.
boolean
()
useActiveGpu
:
joi
.
boolean
(),
reuse
:
joi
.
boolean
()
}),
trial_config
:
joi
.
object
({
// eslint-disable-line @typescript-eslint/camelcase
image
:
joi
.
string
().
min
(
1
),
...
...
@@ -182,6 +183,9 @@ export namespace ValidationSchemas {
maxTrialNumPerGpu
:
joi
.
number
(),
useActiveGpu
:
joi
.
boolean
()
}),
heterogeneous_config
:
joi
.
object
({
// eslint-disable-line @typescript-eslint/camelcase
trainingServicePlatforms
:
joi
.
array
(),
}),
nni_manager_ip
:
joi
.
object
({
// eslint-disable-line @typescript-eslint/camelcase
nniManagerIp
:
joi
.
string
().
min
(
1
)
}),
...
...
ts/nni_manager/training_service/common/trialConfigMetadataKey.ts
View file @
df6145a2
...
...
@@ -11,6 +11,7 @@ export enum TrialConfigMetadataKey {
LOCAL_CONFIG
=
'
local_config
'
,
TRIAL_CONFIG
=
'
trial_config
'
,
REMOTE_CONFIG
=
'
remote_config
'
,
HETEROGENEOUS_CONFIG
=
'
heterogeneous_config
'
,
EXPERIMENT_ID
=
'
experimentId
'
,
MULTI_PHASE
=
'
multiPhase
'
,
RANDOM_SCHEDULER
=
'
random_scheduler
'
,
...
...
@@ -22,5 +23,8 @@ export enum TrialConfigMetadataKey {
DLTS_CLUSTER_CONFIG
=
'
dlts_config
'
,
AML_CLUSTER_CONFIG
=
'
aml_config
'
,
VERSION_CHECK
=
'
version_check
'
,
LOG_COLLECTION
=
'
log_collection
'
LOG_COLLECTION
=
'
log_collection
'
,
// Used to set platform for heterogeneous in reuse mode,
// temporary change; the config schema will be refactored in the future
PLATFORM_LIST
=
'
platform_list
'
}
ts/nni_manager/training_service/local/localTrainingService.ts
View file @
df6145a2
...
...
@@ -78,7 +78,7 @@ class LocalTrialJobDetail implements TrialJobDetail {
/**
* Local training service config
*/
class
LocalConfig
{
export
class
LocalConfig
{
public
maxTrialNumPerGpu
?:
number
;
public
gpuIndices
?:
string
;
public
useActiveGpu
?:
boolean
;
...
...
@@ -253,7 +253,20 @@ class LocalTrainingService implements TrainingService {
return
Promise
.
resolve
();
}
tkill
(
trialJob
.
pid
,
'
SIGKILL
'
);
tkill
(
trialJob
.
pid
,
'
SIGTERM
'
);
const
startTime
=
Date
.
now
();
while
(
await
isAlive
(
trialJob
.
pid
))
{
if
(
Date
.
now
()
-
startTime
>
4999
)
{
tkill
(
trialJob
.
pid
,
'
SIGKILL
'
,
(
err
)
=>
{
if
(
err
)
{
this
.
log
.
error
(
`kill trial job error:
${
err
}
`
);
}
});
break
;
}
await
delay
(
500
);
}
this
.
setTrialJobStatus
(
trialJob
,
getJobCancelStatus
(
isEarlyStopped
));
return
Promise
.
resolve
();
...
...
ts/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
View file @
df6145a2
...
...
@@ -358,6 +358,10 @@ class RemoteMachineTrainingService implements TrainingService {
case
TrialConfigMetadataKey
.
LOG_COLLECTION
:
this
.
logCollection
=
value
;
break
;
case
TrialConfigMetadataKey
.
REMOTE_CONFIG
:
// Add remote_config in remoteEnvironmentService to set reuse mode,
// this config needs to be caught here, otherwise an Unknown key exception will be thrown
break
;
default
:
//Reject for unknown keys
throw
new
Error
(
`Uknown key:
${
key
}
`
);
...
...
ts/nni_manager/training_service/reusable/channels/webCommandChannel.ts
View file @
df6145a2
...
...
@@ -8,6 +8,7 @@ import { getBasePort, getExperimentId } from "../../../common/experimentStartupI
import
{
INITIALIZED
}
from
'
../../../core/commands
'
;
import
{
CommandChannel
,
RunnerConnection
}
from
"
../commandChannel
"
;
import
{
Channel
,
EnvironmentInformation
}
from
"
../environment
"
;
import
{
EventEmitter
}
from
"
events
"
;
class
WebRunnerConnection
extends
RunnerConnection
{
public
readonly
clients
:
WebSocket
[]
=
[];
...
...
@@ -29,7 +30,7 @@ class WebRunnerConnection extends RunnerConnection {
export
class
WebCommandChannel
extends
CommandChannel
{
private
readonly
expId
:
string
=
getExperimentId
();
private
static
commandChannel
:
WebCommandChannel
;
private
webSocketServer
:
SocketServer
|
undefined
;
private
clients
:
Map
<
WebSocket
,
WebRunnerConnection
|
undefined
>
=
new
Map
<
WebSocket
,
WebRunnerConnection
|
undefined
>
();
...
...
@@ -40,6 +41,18 @@ export class WebCommandChannel extends CommandChannel {
public
async
config
(
_key
:
string
,
_value
:
any
):
Promise
<
void
>
{
// do nothing
}
// Set WebCommandChannel as singleton mode, one experiment could only start one webCommandChannel instance
private
constructor
(
commandEmitter
:
EventEmitter
)
{
super
(
commandEmitter
);
}
public
static
getInstance
(
commandEmitter
:
EventEmitter
):
CommandChannel
{
if
(
!
this
.
commandChannel
)
{
this
.
commandChannel
=
new
WebCommandChannel
(
commandEmitter
);
}
return
this
.
commandChannel
;
}
public
async
start
():
Promise
<
void
>
{
const
port
=
getBasePort
()
+
1
;
...
...
ts/nni_manager/training_service/reusable/environment.ts
View file @
df6145a2
...
...
@@ -3,12 +3,12 @@
'
use strict
'
;
import
{
EventEmitter
}
from
"
events
"
;
import
{
getLogger
,
Logger
}
from
"
../../common/log
"
;
import
{
TrialJobStatus
}
from
"
../../common/trainingService
"
;
import
{
GPUInfo
}
from
"
../../training_service/common/gpuData
"
;
import
{
WebCommandChannel
}
from
"
./channels/webCommandChannel
"
;
import
{
CommandChannel
}
from
"
./commandChannel
"
;
import
{
WebCommandChannel
}
from
'
./channels/webCommandChannel
'
;
import
{
EventEmitter
}
from
"
events
"
;
export
type
EnvironmentStatus
=
'
UNKNOWN
'
|
'
WAITING
'
|
'
RUNNING
'
|
'
SUCCEEDED
'
|
'
FAILED
'
|
'
USER_CANCELED
'
;
...
...
@@ -75,6 +75,8 @@ export class EnvironmentInformation {
public
maxTrialNumberPerGpu
?:
number
;
public
useActiveGpu
?:
boolean
;
public
environmentService
?:
EnvironmentService
;
constructor
(
id
:
string
,
name
:
string
,
envId
?:
string
)
{
this
.
log
=
getLogger
();
this
.
id
=
id
;
...
...
@@ -127,6 +129,8 @@ export abstract class EnvironmentService {
public
abstract
refreshEnvironmentsStatus
(
environments
:
EnvironmentInformation
[]):
Promise
<
void
>
;
public
abstract
stopEnvironment
(
environment
:
EnvironmentInformation
):
Promise
<
void
>
;
public
abstract
startEnvironment
(
environment
:
EnvironmentInformation
):
Promise
<
void
>
;
// Make public for ut
protected
commandChannel
:
CommandChannel
|
undefined
;
// It is used to set prefetched environment count, default value is 0 for OpenPAI and AML mode,
// in remote mode, this value is set to the length of machine list.
...
...
@@ -134,6 +138,20 @@ export abstract class EnvironmentService {
return
0
;
}
public
abstract
get
getName
():
string
;
// Initialize command channel, use WebCommandChannel as default command channel
public
initCommandChannel
(
eventEmitter
:
EventEmitter
):
void
{
this
.
commandChannel
=
WebCommandChannel
.
getInstance
(
eventEmitter
);
}
public
get
getCommandChannel
():
CommandChannel
{
if
(
this
.
commandChannel
===
undefined
)
{
throw
new
Error
(
"
Command channel not initialized!
"
);
}
return
this
.
commandChannel
;
}
// It depends on environment pressure and settings
// for example, OpenPAI relies on API calls and limits their frequency, so the interval needs to be bigger.
public
get
environmentMaintenceLoopInterval
():
number
{
...
...
@@ -147,10 +165,6 @@ export abstract class EnvironmentService {
return
true
;
}
public
createCommandChannel
(
commandEmitter
:
EventEmitter
):
CommandChannel
{
return
new
WebCommandChannel
(
commandEmitter
);
}
public
createEnvironmentInformation
(
envId
:
string
,
envName
:
string
):
EnvironmentInformation
{
return
new
EnvironmentInformation
(
envId
,
envName
);
}
...
...
ts/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts
View file @
df6145a2
...
...
@@ -3,7 +3,6 @@
'
use strict
'
;
import
{
EventEmitter
}
from
"
events
"
;
import
*
as
fs
from
'
fs
'
;
import
*
as
path
from
'
path
'
;
import
*
as
component
from
'
../../../common/component
'
;
...
...
@@ -14,13 +13,13 @@ import { TrialConfigMetadataKey } from '../../common/trialConfigMetadataKey';
import
{
validateCodeDir
}
from
'
../../common/util
'
;
import
{
AMLClient
}
from
'
../aml/amlClient
'
;
import
{
AMLClusterConfig
,
AMLEnvironmentInformation
,
AMLTrialConfig
}
from
'
../aml/amlConfig
'
;
import
{
AMLCommandChannel
}
from
'
../channels/amlCommandChannel
'
;
import
{
CommandChannel
}
from
"
../commandChannel
"
;
import
{
EnvironmentInformation
,
EnvironmentService
}
from
'
../environment
'
;
import
{
EventEmitter
}
from
"
events
"
;
import
{
AMLCommandChannel
}
from
'
../channels/amlCommandChannel
'
;
/**
* Collector
PAI
jobs info from
PAI
cluster, and update
pai
job status locally
* Collector
AML
jobs info from
AML
cluster, and update
aml
job status locally
*/
@
component
.
Singleton
export
class
AMLEnvironmentService
extends
EnvironmentService
{
...
...
@@ -41,14 +40,18 @@ export class AMLEnvironmentService extends EnvironmentService {
return
false
;
}
public
create
CommandChannel
(
command
Emitter
:
EventEmitter
):
CommandChannel
{
return
new
AMLCommandChannel
(
command
Emitter
);
public
init
CommandChannel
(
event
Emitter
:
EventEmitter
):
void
{
this
.
commandChannel
=
new
AMLCommandChannel
(
event
Emitter
);
}
public
createEnvironmentInformation
(
envId
:
string
,
envName
:
string
):
EnvironmentInformation
{
return
new
AMLEnvironmentInformation
(
envId
,
envName
);
}
public
get
getName
():
string
{
return
'
aml
'
;
}
public
async
config
(
key
:
string
,
value
:
string
):
Promise
<
void
>
{
switch
(
key
)
{
case
TrialConfigMetadataKey
.
AML_CLUSTER_CONFIG
:
...
...
ts/nni_manager/training_service/reusable/environments/environmentServiceFactory.ts
0 → 100644
View file @
df6145a2
import
{
AMLEnvironmentService
}
from
'
./amlEnvironmentService
'
;
import
{
OpenPaiEnvironmentService
}
from
'
./openPaiEnvironmentService
'
;
import
{
LocalEnvironmentService
}
from
'
./localEnvironmentService
'
;
import
{
RemoteEnvironmentService
}
from
'
./remoteEnvironmentService
'
;
import
{
EnvironmentService
}
from
'
../environment
'
;
/**
 * Factory that maps a training platform name to a new environment service instance.
 */
export class EnvironmentServiceFactory {
    /**
     * Create the environment service registered for `name`.
     *
     * @param name platform name; one of 'local', 'remote', 'aml' or 'pai'
     * @returns a freshly constructed environment service for that platform
     * @throws Error when `name` is not one of the supported platform names
     */
    public static createEnvironmentService(name: string): EnvironmentService {
        if (name === 'local') {
            return new LocalEnvironmentService();
        }
        if (name === 'remote') {
            return new RemoteEnvironmentService();
        }
        if (name === 'aml') {
            return new AMLEnvironmentService();
        }
        if (name === 'pai') {
            return new OpenPaiEnvironmentService();
        }
        // Anything else is a platform this factory does not know about.
        throw new Error(`${name} not supported!`);
    }
}
ts/nni_manager/training_service/reusable/environments/localEnvironmentService.ts
0 → 100644
View file @
df6145a2
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
'
use strict
'
;
import
*
as
fs
from
'
fs
'
;
import
*
as
path
from
'
path
'
;
import
*
as
tkill
from
'
tree-kill
'
;
import
*
as
component
from
'
../../../common/component
'
;
import
{
getExperimentId
}
from
'
../../../common/experimentStartupInfo
'
;
import
{
getLogger
,
Logger
}
from
'
../../../common/log
'
;
import
{
TrialConfigMetadataKey
}
from
'
../../common/trialConfigMetadataKey
'
;
import
{
EnvironmentInformation
,
EnvironmentService
}
from
'
../environment
'
;
import
{
TrialConfig
}
from
'
../../common/trialConfig
'
;
import
{
getExperimentRootDir
,
isAlive
}
from
'
../../../common/utils
'
;
import
{
execMkdir
,
runScript
,
execCopydir
}
from
'
../../common/util
'
;
/**
 * Environment service that runs the trial runner as a process on the local machine.
 *
 * Each environment gets a working folder `<experimentRootDir>/envs/<environment.id>`;
 * the runner's pid, stdout, stderr and exit code are tracked through files in that folder.
 */
@component.Singleton
export class LocalEnvironmentService extends EnvironmentService {

    private readonly log: Logger = getLogger();
    // Parsed value of the TRIAL_CONFIG metadata key; undefined until config() receives it.
    private localTrialConfig: TrialConfig | undefined;
    private experimentRootDir: string;
    private experimentId: string;

    constructor() {
        super();
        this.experimentId = getExperimentId();
        this.experimentRootDir = getExperimentRootDir();
    }

    // NOTE(review): presumably milliseconds, like the other environment services — confirm.
    public get environmentMaintenceLoopInterval(): number {
        return 100;
    }

    public get hasStorageService(): boolean {
        return false;
    }

    public get getName(): string {
        return 'local';
    }

    /**
     * Receive one metadata entry. Only TRIAL_CONFIG is consumed (parsed from JSON);
     * every other key is logged at debug level and ignored.
     *
     * @param key metadata key
     * @param value metadata value; a JSON document for TRIAL_CONFIG
     */
    public async config(key: string, value: string): Promise<void> {
        switch (key) {
            case TrialConfigMetadataKey.TRIAL_CONFIG:
                this.localTrialConfig = <TrialConfig>JSON.parse(value);
                break;
            default:
                this.log.debug(`Local mode does not proccess metadata key: '${key}', value: '${value}'`);
        }
    }

    /**
     * Refresh the status of every given environment.
     *
     * The returned promise resolves only after all environments have been refreshed.
     * (Previously `forEach(async ...)` was used, so the promise resolved before any
     * refresh had completed.)
     *
     * @param environments environments whose status should be refreshed
     */
    public async refreshEnvironmentsStatus(environments: EnvironmentInformation[]): Promise<void> {
        await Promise.all(environments.map((environment) => this.refreshEnvironment(environment)));
    }

    /**
     * Refresh a single environment from its `pid` and `code` files.
     * Errors are logged and swallowed so that one environment cannot fail the whole refresh.
     */
    private async refreshEnvironment(environment: EnvironmentInformation): Promise<void> {
        const jobpidPath: string = `${environment.runnerWorkingFolder}/pid`;
        const runnerReturnCodeFilePath: string = `${environment.runnerWorkingFolder}/code`;
        /* eslint-disable require-atomic-updates */
        try {
            // No pid file yet: nothing to refresh for this environment.
            if (!fs.existsSync(jobpidPath)) {
                return;
            }
            const pid: string = await fs.promises.readFile(jobpidPath, 'utf8');
            const alive: boolean = await isAlive(pid);
            environment.status = 'RUNNING';
            if (alive) {
                return;
            }
            // The runner process is gone; its result is only known once the code file exists.
            if (!fs.existsSync(runnerReturnCodeFilePath)) {
                return;
            }
            const runnerReturnCode: string = await fs.promises.readFile(runnerReturnCodeFilePath, 'utf8');
            // The code file holds "<exit code> <timestamp>", as written by the command
            // built in startEnvironment().
            const match: RegExpMatchArray | null = runnerReturnCode.trim()
                .match(/^-?(\d+)\s+(\d+)$/);
            if (match !== null) {
                const { 1: code } = match;
                // Update the environment's status based on the result code.
                if (parseInt(code, 10) === 0) {
                    environment.setStatus('SUCCEEDED');
                } else {
                    environment.setStatus('FAILED');
                }
            }
        } catch (error) {
            this.log.error(`Update job status exception, error is ${error.message}`);
        }
    }

    /**
     * Prepare the environment's working folder, write the launch script `nni_run.sh`
     * and start it on the local machine.
     *
     * @param environment environment to start; its `command`, `runnerWorkingFolder`
     *                    and `trackingUrl` are updated in place
     * @throws Error when the trial config has not been received through config()
     */
    public async startEnvironment(environment: EnvironmentInformation): Promise<void> {
        if (this.localTrialConfig === undefined) {
            throw new Error('Local trial config is not initialized');
        }
        // Need refactor, this temp folder path is not appropriate, there are two expId in this path
        const localTempFolder: string = path.join(this.experimentRootDir, this.experimentId,
            "environment-temp", "envs");
        const localEnvCodeFolder: string = path.join(this.experimentRootDir, "envs");
        environment.runnerWorkingFolder = path.join(localEnvCodeFolder, environment.id);
        await execMkdir(environment.runnerWorkingFolder);
        await execCopydir(localTempFolder, localEnvCodeFolder);
        // NOTE(review): because of the `&&` before `echo $?`, the code file is only written
        // when the runner exits with 0, so a failing runner never reaches the FAILED branch
        // of the status refresh — confirm whether `;` was intended.
        environment.command = `cd ${this.experimentRootDir} && \
${environment.command} --job_pid_file ${environment.runnerWorkingFolder}/pid \
1>${environment.runnerWorkingFolder}/trialrunner_stdout 2>${environment.runnerWorkingFolder}/trialrunner_stderr \
&& echo $? \`date +%s%3N\` >${environment.runnerWorkingFolder}/code`;
        const scriptPath: string = path.join(localEnvCodeFolder, 'nni_run.sh');
        await fs.promises.writeFile(scriptPath, environment.command, { encoding: 'utf8', mode: 0o777 });
        // Execute command in local machine
        runScript(scriptPath);
        environment.trackingUrl = `${environment.runnerWorkingFolder}`;
    }

    /**
     * Stop the environment by sending SIGKILL (via tree-kill) to the pid stored in its pid file.
     *
     * @param environment environment to stop
     */
    public async stopEnvironment(environment: EnvironmentInformation): Promise<void> {
        const jobpidPath: string = `${environment.runnerWorkingFolder}/pid`;
        const pid: string = await fs.promises.readFile(jobpidPath, 'utf8');
        tkill(Number(pid), 'SIGKILL');
    }
}
ts/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts
View file @
df6145a2
...
...
@@ -45,6 +45,10 @@ export class OpenPaiEnvironmentService extends EnvironmentService {
return
true
;
}
public
get
getName
():
string
{
return
'
pai
'
;
}
public
async
config
(
key
:
string
,
value
:
string
):
Promise
<
void
>
{
switch
(
key
)
{
case
TrialConfigMetadataKey
.
PAI_CLUSTER_CONFIG
:
...
...
ts/nni_manager/training_service/reusable/environments/remoteEnvironmentService.ts
View file @
df6145a2
...
...
@@ -63,6 +63,10 @@ export class RemoteEnvironmentService extends EnvironmentService {
return
false
;
}
public
get
getName
():
string
{
return
'
remote
'
;
}
public
async
config
(
key
:
string
,
value
:
string
):
Promise
<
void
>
{
switch
(
key
)
{
case
TrialConfigMetadataKey
.
MACHINE_LIST
:
...
...
@@ -134,7 +138,15 @@ export class RemoteEnvironmentService extends EnvironmentService {
await
executor
.
createFolder
(
remoteGpuScriptCollectorDir
,
true
);
await
executor
.
allowPermission
(
true
,
nniRootDir
);
}
public
async
refreshEnvironmentsStatus
(
environments
:
EnvironmentInformation
[]):
Promise
<
void
>
{
const
tasks
:
Promise
<
void
>
[]
=
[];
environments
.
forEach
(
async
(
environment
)
=>
{
tasks
.
push
(
this
.
refreshEnvironment
(
environment
));
});
await
Promise
.
all
(
tasks
);
}
private
async
refreshEnvironment
(
environment
:
EnvironmentInformation
):
Promise
<
void
>
{
const
executor
=
await
this
.
getExecutor
(
environment
.
id
);
const
jobpidPath
:
string
=
`
${
environment
.
runnerWorkingFolder
}
/pid`
;
...
...
@@ -176,14 +188,6 @@ export class RemoteEnvironmentService extends EnvironmentService {
}
}
public
async
refreshEnvironmentsStatus
(
environments
:
EnvironmentInformation
[]):
Promise
<
void
>
{
const
tasks
:
Promise
<
void
>
[]
=
[];
environments
.
forEach
(
async
(
environment
)
=>
{
tasks
.
push
(
this
.
refreshEnvironment
(
environment
));
});
await
Promise
.
all
(
tasks
);
}
/**
* If an environment is finished, release the connection resource
* @param environment remote machine environment job detail
...
...
Prev
1
…
6
7
8
9
10
11
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment