Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
67e23897
Unverified
Commit
67e23897
authored
Apr 16, 2021
by
liuzhe-lz
Committed by
GitHub
Apr 16, 2021
Browse files
Fix config v2 bugs (#3540)
parent
1418a366
Changes
12
Show whitespace changes
Inline
Side-by-side
Showing
12 changed files
with
67 additions
and
84 deletions
+67
-84
nni/tools/nnictl/config_utils.py
nni/tools/nnictl/config_utils.py
+1
-1
nni/tools/nnictl/launcher.py
nni/tools/nnictl/launcher.py
+30
-15
nni/tools/nnictl/nnictl_utils.py
nni/tools/nnictl/nnictl_utils.py
+0
-35
ts/nni_manager/core/nnimanager.ts
ts/nni_manager/core/nnimanager.ts
+7
-4
ts/nni_manager/training_service/reusable/environment.ts
ts/nni_manager/training_service/reusable/environment.ts
+4
-0
ts/nni_manager/training_service/reusable/environments/remoteEnvironmentService.ts
...service/reusable/environments/remoteEnvironmentService.ts
+12
-14
ts/nni_manager/training_service/reusable/trialDispatcher.ts
ts/nni_manager/training_service/reusable/trialDispatcher.ts
+3
-2
ts/webui/src/components/overview/count/EditExperimentParam.tsx
...bui/src/components/overview/count/EditExperimentParam.tsx
+2
-8
ts/webui/src/components/overview/count/ExpDuration.tsx
ts/webui/src/components/overview/count/ExpDuration.tsx
+1
-1
ts/webui/src/components/overview/count/TrialCount.tsx
ts/webui/src/components/overview/count/TrialCount.tsx
+1
-1
ts/webui/src/components/slideNav/TrialConfigPanel.tsx
ts/webui/src/components/slideNav/TrialConfigPanel.tsx
+2
-2
ts/webui/src/static/experimentConfig.ts
ts/webui/src/static/experimentConfig.ts
+4
-1
No files found.
nni/tools/nnictl/config_utils.py
View file @
67e23897
...
...
@@ -123,7 +123,7 @@ class Experiments:
self
.
experiments
[
expId
][
'tag'
]
=
tag
self
.
experiments
[
expId
][
'pid'
]
=
pid
self
.
experiments
[
expId
][
'webuiUrl'
]
=
webuiUrl
self
.
experiments
[
expId
][
'logDir'
]
=
logDir
self
.
experiments
[
expId
][
'logDir'
]
=
str
(
logDir
)
self
.
write_file
()
def
update_experiment
(
self
,
expId
,
key
,
value
):
...
...
nni/tools/nnictl/launcher.py
View file @
67e23897
...
...
@@ -411,6 +411,21 @@ def launch_experiment(args, experiment_config, mode, experiment_id, config_versi
kill_command
(
rest_process
.
pid
)
print_normal
(
'Stopping experiment...'
)
def
_validate_v1
(
config
,
path
):
try
:
validate_all_content
(
config
,
path
)
except
Exception
as
e
:
print_error
(
f
'Config V1 validation failed:
{
repr
(
e
)
}
'
)
exit
(
1
)
def
_validate_v2
(
config
,
path
):
base_path
=
Path
(
path
).
parent
try
:
conf
=
ExperimentConfig
(
_base_path
=
base_path
,
**
config
)
return
conf
.
json
()
except
Exception
as
e
:
print_error
(
f
'Config V2 validation failed:
{
repr
(
e
)
}
'
)
def
create_experiment
(
args
):
'''start a new experiment'''
experiment_id
=
''
.
join
(
random
.
sample
(
string
.
ascii_letters
+
string
.
digits
,
8
))
...
...
@@ -420,23 +435,23 @@ def create_experiment(args):
exit
(
1
)
config_yml
=
get_yml_content
(
config_path
)
try
:
config
=
ExperimentConfig
(
_base_path
=
Path
(
config_path
).
parent
,
**
config_yml
)
config_v2
=
config
.
json
()
except
Exception
as
error_v2
:
print_warning
(
'Validation with V2 schema failed. Trying to convert from V1 format...'
)
try
:
validate_all_content
(
config_yml
,
config_path
)
except
Exception
as
error_v1
:
print_error
(
f
'Convert from v1 format failed:
{
repr
(
error_v1
)
}
'
)
print_error
(
f
'Config in v2 format validation failed:
{
repr
(
error_v2
)
}
'
)
exit
(
1
)
if
'trainingServicePlatform'
in
config_yml
:
_validate_v1
(
config_yml
,
config_path
)
platform
=
config_yml
[
'trainingServicePlatform'
]
if
platform
in
k8s_training_services
:
schema
=
1
config_v1
=
config_yml
else
:
schema
=
2
from
nni.experiment.config
import
convert
config_v2
=
convert
.
to_v2
(
config_yml
).
json
()
else
:
config_v2
=
_validate_v2
(
config_yml
,
config_path
)
schema
=
2
try
:
if
getattr
(
config_v2
[
'trainingService'
],
'platform'
,
None
)
in
k8s_training_services
:
launch_experiment
(
args
,
config_
yml
,
'new'
,
experiment_id
,
1
)
if
schema
==
1
:
launch_experiment
(
args
,
config_
v1
,
'new'
,
experiment_id
,
1
)
else
:
launch_experiment
(
args
,
config_v2
,
'new'
,
experiment_id
,
2
)
except
Exception
as
exception
:
...
...
nni/tools/nnictl/nnictl_utils.py
View file @
67e23897
...
...
@@ -13,7 +13,6 @@ from functools import cmp_to_key
import
traceback
from
datetime
import
datetime
,
timezone
from
subprocess
import
Popen
from
pyhdfs
import
HdfsClient
from
nni.tools.annotation
import
expand_annotations
import
nni_node
# pylint: disable=import-error
from
.rest_utils
import
rest_get
,
rest_delete
,
check_rest_server_quick
,
check_response
...
...
@@ -501,30 +500,6 @@ def remote_clean(machine_list, experiment_id=None):
print_normal
(
'removing folder {0}'
.
format
(
host
+
':'
+
str
(
port
)
+
remote_dir
))
remove_remote_directory
(
sftp
,
remote_dir
)
def
hdfs_clean
(
host
,
user_name
,
output_dir
,
experiment_id
=
None
):
'''clean up hdfs data'''
hdfs_client
=
HdfsClient
(
hosts
=
'{0}:80'
.
format
(
host
),
user_name
=
user_name
,
webhdfs_path
=
'/webhdfs/api/v1'
,
timeout
=
5
)
if
experiment_id
:
full_path
=
'/'
+
'/'
.
join
([
user_name
,
'nni'
,
'experiments'
,
experiment_id
])
else
:
full_path
=
'/'
+
'/'
.
join
([
user_name
,
'nni'
,
'experiments'
])
print_normal
(
'removing folder {0} in hdfs'
.
format
(
full_path
))
hdfs_client
.
delete
(
full_path
,
recursive
=
True
)
if
output_dir
:
pattern
=
re
.
compile
(
'hdfs://(?P<host>([0-9]{1,3}.){3}[0-9]{1,3})(:[0-9]{2,5})?(?P<baseDir>/.*)?'
)
match_result
=
pattern
.
match
(
output_dir
)
if
match_result
:
output_host
=
match_result
.
group
(
'host'
)
output_dir
=
match_result
.
group
(
'baseDir'
)
#check if the host is valid
if
output_host
!=
host
:
print_warning
(
'The host in {0} is not consistent with {1}'
.
format
(
output_dir
,
host
))
else
:
if
experiment_id
:
output_dir
=
output_dir
+
'/'
+
experiment_id
print_normal
(
'removing folder {0} in hdfs'
.
format
(
output_dir
))
hdfs_client
.
delete
(
output_dir
,
recursive
=
True
)
def
experiment_clean
(
args
):
'''clean up the experiment data'''
experiment_id_list
=
[]
...
...
@@ -556,11 +531,6 @@ def experiment_clean(args):
if
platform
==
'remote'
:
machine_list
=
experiment_config
.
get
(
'machineList'
)
remote_clean
(
machine_list
,
experiment_id
)
elif
platform
==
'pai'
:
host
=
experiment_config
.
get
(
'paiConfig'
).
get
(
'host'
)
user_name
=
experiment_config
.
get
(
'paiConfig'
).
get
(
'userName'
)
output_dir
=
experiment_config
.
get
(
'trial'
).
get
(
'outputDir'
)
hdfs_clean
(
host
,
user_name
,
output_dir
,
experiment_id
)
elif
platform
!=
'local'
:
# TODO: support all platforms
print_warning
(
'platform {0} clean up not supported yet.'
.
format
(
platform
))
...
...
@@ -632,11 +602,6 @@ def platform_clean(args):
if
platform
==
'remote'
:
machine_list
=
config_content
.
get
(
'machineList'
)
remote_clean
(
machine_list
)
elif
platform
==
'pai'
:
host
=
config_content
.
get
(
'paiConfig'
).
get
(
'host'
)
user_name
=
config_content
.
get
(
'paiConfig'
).
get
(
'userName'
)
output_dir
=
config_content
.
get
(
'trial'
).
get
(
'outputDir'
)
hdfs_clean
(
host
,
user_name
,
output_dir
)
print_normal
(
'Done.'
)
def
experiment_list
(
args
):
...
...
ts/nni_manager/core/nnimanager.ts
View file @
67e23897
...
...
@@ -254,12 +254,15 @@ class NNIManager implements Manager {
return
this
.
dataStore
.
getTrialJob
(
trialJobId
);
}
public
async
setClusterMetadata
(
_key
:
string
,
_value
:
string
):
Promise
<
void
>
{
throw
new
Error
(
'
Calling removed API setClusterMetadata
'
);
public
async
setClusterMetadata
(
key
:
string
,
value
:
string
):
Promise
<
void
>
{
while
(
this
.
trainingService
===
undefined
)
{
await
delay
(
1000
);
}
this
.
trainingService
.
setClusterMetadata
(
key
,
value
);
}
public
getClusterMetadata
(
_
key
:
string
):
Promise
<
string
>
{
throw
new
Error
(
'
Calling removed API
getClusterMetadata
'
);
public
getClusterMetadata
(
key
:
string
):
Promise
<
string
>
{
return
this
.
trainingService
.
getClusterMetadata
(
key
);
}
public
async
getTrialJobStatistics
():
Promise
<
TrialJobStatistics
[]
>
{
...
...
ts/nni_manager/training_service/reusable/environment.ts
View file @
67e23897
...
...
@@ -128,6 +128,10 @@ export class EnvironmentInformation {
export
abstract
class
EnvironmentService
{
public
async
init
():
Promise
<
void
>
{
return
;
}
public
abstract
get
hasStorageService
():
boolean
;
public
abstract
refreshEnvironmentsStatus
(
environments
:
EnvironmentInformation
[]):
Promise
<
void
>
;
public
abstract
stopEnvironment
(
environment
:
EnvironmentInformation
):
Promise
<
void
>
;
...
...
ts/nni_manager/training_service/reusable/environments/remoteEnvironmentService.ts
View file @
67e23897
...
...
@@ -27,7 +27,7 @@ export class RemoteEnvironmentService extends EnvironmentService {
private
readonly
environmentExecutorManagerMap
:
Map
<
string
,
ExecutorManager
>
;
private
readonly
remoteMachineMetaOccupiedMap
:
Map
<
RemoteMachineConfig
,
boolean
>
;
private
readonly
log
:
Logger
;
private
sshConnectionPromises
:
any
[];
private
sshConnectionPromises
:
Promise
<
void
[]
>
;
private
experimentRootDir
:
string
;
private
remoteExperimentRootDir
:
string
=
""
;
private
experimentId
:
string
;
...
...
@@ -39,7 +39,6 @@ export class RemoteEnvironmentService extends EnvironmentService {
this
.
environmentExecutorManagerMap
=
new
Map
<
string
,
ExecutorManager
>
();
this
.
machineExecutorManagerMap
=
new
Map
<
RemoteMachineConfig
,
ExecutorManager
>
();
this
.
remoteMachineMetaOccupiedMap
=
new
Map
<
RemoteMachineConfig
,
boolean
>
();
this
.
sshConnectionPromises
=
[];
this
.
experimentRootDir
=
getExperimentRootDir
();
this
.
experimentId
=
getExperimentId
();
this
.
log
=
getLogger
();
...
...
@@ -50,9 +49,18 @@ export class RemoteEnvironmentService extends EnvironmentService {
throw
new
Error
(
`codeDir
${
this
.
config
.
trialCodeDirectory
}
is not a directory`
);
}
this
.
sshConnectionPromises
=
this
.
config
.
machineList
.
map
(
this
.
sshConnectionPromises
=
Promise
.
all
(
this
.
config
.
machineList
.
map
(
machine
=>
this
.
initRemoteMachineOnConnected
(
machine
)
);
));
}
public
async
init
():
Promise
<
void
>
{
await
this
.
sshConnectionPromises
;
this
.
log
.
info
(
'
ssh connection initialized!
'
);
Array
.
from
(
this
.
machineExecutorManagerMap
.
keys
()).
forEach
(
rmMeta
=>
{
// initialize remoteMachineMetaOccupiedMap, false means not occupied
this
.
remoteMachineMetaOccupiedMap
.
set
(
rmMeta
,
false
);
});
}
public
get
prefetchedEnvironmentCount
():
number
{
...
...
@@ -204,16 +212,6 @@ export class RemoteEnvironmentService extends EnvironmentService {
}
public
async
startEnvironment
(
environment
:
EnvironmentInformation
):
Promise
<
void
>
{
if
(
this
.
sshConnectionPromises
.
length
>
0
)
{
await
Promise
.
all
(
this
.
sshConnectionPromises
);
this
.
log
.
info
(
'
ssh connection initialized!
'
);
// set sshConnectionPromises to [] to avoid log information duplicated
this
.
sshConnectionPromises
=
[];
Array
.
from
(
this
.
machineExecutorManagerMap
.
keys
()).
forEach
(
rmMeta
=>
{
// initialize remoteMachineMetaOccupiedMap, false means not occupied
this
.
remoteMachineMetaOccupiedMap
.
set
(
rmMeta
,
false
);
});
}
const
remoteEnvironment
:
RemoteMachineEnvironmentInformation
=
environment
as
RemoteMachineEnvironmentInformation
;
remoteEnvironment
.
status
=
'
WAITING
'
;
// schedule machine for environment, generate command
...
...
ts/nni_manager/training_service/reusable/trialDispatcher.ts
View file @
67e23897
...
...
@@ -122,7 +122,6 @@ class TrialDispatcher implements TrainingService {
this
.
environmentServiceList
.
push
(
env
);
}
// FIXME: max?
this
.
environmentMaintenceLoopInterval
=
Math
.
max
(
...
this
.
environmentServiceList
.
map
((
env
)
=>
env
.
environmentMaintenceLoopInterval
)
);
...
...
@@ -211,6 +210,7 @@ class TrialDispatcher implements TrainingService {
}
public
async
run
():
Promise
<
void
>
{
await
Promise
.
all
(
this
.
environmentServiceList
.
map
(
env
=>
env
.
init
()));
for
(
const
environmentService
of
this
.
environmentServiceList
)
{
const
runnerSettings
:
RunnerSettings
=
new
RunnerSettings
();
...
...
@@ -497,9 +497,10 @@ class TrialDispatcher implements TrainingService {
liveEnvironmentsCount
++
;
if
(
environment
.
status
===
"
RUNNING
"
&&
environment
.
isRunnerReady
)
{
// if environment is not reusable and used, stop and not count as idle;
const
reuseMode
=
Array
.
isArray
(
this
.
config
.
trainingService
)
||
(
this
.
config
.
trainingService
as
any
).
reuseMode
;
if
(
0
===
environment
.
runningTrialCount
&&
!
(
this
.
config
as
any
).
reuseMode
&&
!
reuseMode
&&
environment
.
assignedTrialCount
>
0
)
{
if
(
environment
.
environmentService
===
undefined
)
{
...
...
ts/webui/src/components/overview/count/EditExperimentParam.tsx
View file @
67e23897
...
...
@@ -101,13 +101,7 @@ export const EditExperimentParam = (): any => {
}
if
(
isMaxDuration
)
{
const
maxDura
=
JSON
.
parse
(
editInputVal
);
if
(
unit
===
'
m
'
)
{
newProfile
.
params
[
field
]
=
maxDura
*
60
;
}
else
if
(
unit
===
'
h
'
)
{
newProfile
.
params
[
field
]
=
maxDura
*
3600
;
}
else
{
newProfile
.
params
[
field
]
=
maxDura
*
24
*
60
*
60
;
}
newProfile
.
params
[
field
]
=
`
${
maxDura
}${
unit
}
`
;
}
else
{
newProfile
.
params
[
field
]
=
parseInt
(
editInputVal
,
10
);
}
...
...
@@ -162,7 +156,7 @@ export const EditExperimentParam = (): any => {
<
EditExpeParamContext
.
Consumer
>
{
(
value
):
React
.
ReactNode
=>
{
let
editClassName
=
''
;
if
(
value
.
field
===
'
maxEx
ec
Duration
'
)
{
if
(
value
.
field
===
'
maxEx
periment
Duration
'
)
{
editClassName
=
isShowPencil
?
'
noEditDuration
'
:
'
editDuration
'
;
}
return
(
...
...
ts/webui/src/components/overview/count/ExpDuration.tsx
View file @
67e23897
...
...
@@ -50,7 +50,7 @@ export const ExpDuration = (): any => (
<
EditExpeParamContext
.
Provider
value
=
{
{
editType
:
CONTROLTYPE
[
0
],
field
:
'
maxEx
ec
Duration
'
,
field
:
'
maxEx
periment
Duration
'
,
title
:
'
Max duration
'
,
maxExecDuration
:
maxExecDurationStr
,
maxTrialNum
:
EXPERIMENT
.
maxTrialNumber
,
...
...
ts/webui/src/components/overview/count/TrialCount.tsx
View file @
67e23897
...
...
@@ -89,7 +89,7 @@ export const TrialCount = (): any => {
<
EditExpeParamContext
.
Provider
value
=
{
{
title
:
MAX_TRIAL_NUMBERS
,
field
:
'
maxTrialNum
'
,
field
:
'
maxTrialNum
ber
'
,
editType
:
CONTROLTYPE
[
1
],
maxExecDuration
:
''
,
maxTrialNum
:
EXPERIMENT
.
maxTrialNumber
,
...
...
ts/webui/src/components/slideNav/TrialConfigPanel.tsx
View file @
67e23897
...
...
@@ -73,9 +73,9 @@ class TrialConfigPanel extends React.Component<LogDrawerProps, LogDrawerState> {
<
AppContext
.
Consumer
>
{
(
value
):
React
.
ReactNode
=>
{
const
unit
=
value
.
maxDurationUnit
;
profile
.
params
.
maxEx
ec
Duration
=
`
${
convertTimeAsUnit
(
profile
.
params
.
maxEx
periment
Duration
=
`
${
convertTimeAsUnit
(
unit
,
profile
.
params
.
maxEx
ec
Duration
profile
.
params
.
maxEx
periment
Duration
)}${
unit
}
`
;
const
showProfile
=
JSON
.
stringify
(
profile
,
filter
,
2
);
return
(
...
...
ts/webui/src/static/experimentConfig.ts
View file @
67e23897
...
...
@@ -152,7 +152,10 @@ export interface ExperimentConfig {
const
timeUnits
=
{
d
:
24
*
3600
,
h
:
3600
,
m
:
60
,
s
:
1
};
export
function
toSeconds
(
time
:
string
):
number
{
export
function
toSeconds
(
time
:
string
|
number
):
number
{
if
(
typeof
time
===
'
number
'
)
{
return
time
;
}
for
(
const
[
unit
,
factor
]
of
Object
.
entries
(
timeUnits
))
{
if
(
time
.
endsWith
(
unit
))
{
const
digits
=
time
.
slice
(
0
,
-
1
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment