Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
52b93d0c
Unverified
Commit
52b93d0c
authored
Nov 05, 2019
by
chicm-ms
Committed by
GitHub
Nov 05, 2019
Browse files
Show error log of failed trial jobs for integration tests (#1602)
* show failed job log
parent
86f8c2ab
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
52 additions
and
29 deletions
+52
-29
src/nni_manager/common/utils.ts
src/nni_manager/common/utils.ts
+1
-1
src/nni_manager/training_service/common/clusterJobRestServer.ts
...i_manager/training_service/common/clusterJobRestServer.ts
+4
-2
test/config_test.py
test/config_test.py
+12
-12
test/metrics_test.py
test/metrics_test.py
+2
-2
test/training_service.yml
test/training_service.yml
+3
-0
test/utils.py
test/utils.py
+30
-12
No files found.
src/nni_manager/common/utils.ts
View file @
52b93d0c
...
...
@@ -510,4 +510,4 @@ function unixPathJoin(...paths: any[]): string {
export
{
countFilesRecursively
,
validateFileNameRecursively
,
getRemoteTmpDir
,
generateParamFileName
,
getMsgDispatcherCommand
,
getCheckpointDir
,
getLogDir
,
getExperimentRootDir
,
getJobCancelStatus
,
getDefaultDatabaseDir
,
getIPV4Address
,
unixPathJoin
,
mkDirP
,
delay
,
prepareUnitTest
,
parseArg
,
cleanupUnitTest
,
uniqueString
,
randomSelect
,
getLogLevel
,
getVersion
,
getCmdPy
,
getTunerProc
,
isAlive
,
killPid
,
getNewLine
};
mkDirP
,
mkDirPSync
,
delay
,
prepareUnitTest
,
parseArg
,
cleanupUnitTest
,
uniqueString
,
randomSelect
,
getLogLevel
,
getVersion
,
getCmdPy
,
getTunerProc
,
isAlive
,
killPid
,
getNewLine
};
src/nni_manager/training_service/common/clusterJobRestServer.ts
View file @
52b93d0c
...
...
@@ -30,7 +30,7 @@ import { String } from 'typescript-string-operations';
import
*
as
component
from
'
../../common/component
'
;
import
{
getBasePort
,
getExperimentId
}
from
'
../../common/experimentStartupInfo
'
;
import
{
RestServer
}
from
'
../../common/restServer
'
;
import
{
get
LogDir
}
from
'
../../common/utils
'
;
import
{
get
ExperimentRootDir
,
mkDirPSync
}
from
'
../../common/utils
'
;
/**
* Cluster Job Training service Rest server, provides rest API to support Cluster job metrics update
...
...
@@ -146,7 +146,9 @@ export abstract class ClusterJobRestServer extends RestServer {
this
.
errorMessage
=
`Version check failed, didn't get version check response from trialKeeper,`
+
` please check your NNI version in NNIManager and TrialKeeper!`
;
}
const
trialLogPath
:
string
=
path
.
join
(
getLogDir
(),
`trial_
${
req
.
params
.
trialId
}
.log`
);
const
trialLogDir
:
string
=
path
.
join
(
getExperimentRootDir
(),
'
trials
'
,
req
.
params
.
trialId
);
mkDirPSync
(
trialLogDir
);
const
trialLogPath
:
string
=
path
.
join
(
trialLogDir
,
'
stdout_log_collection.log
'
);
try
{
let
skipLogging
:
boolean
=
false
;
if
(
req
.
body
.
tag
===
'
trial
'
&&
req
.
body
.
msg
!==
undefined
)
{
...
...
test/config_test.py
View file @
52b93d0c
...
...
@@ -24,9 +24,10 @@ import glob
import
subprocess
import
time
import
traceback
import
json
from
utils
import
setup_experiment
,
get_experiment_status
,
get_yml_content
,
dump_yml_content
,
\
parse_max_duration_time
,
get_succeeded_trial_num
,
print_stderr
,
deep_update
parse_max_duration_time
,
get_succeeded_trial_num
,
deep_update
,
print_failed_job_log
,
get_failed_trial_jobs
from
utils
import
GREEN
,
RED
,
CLEAR
,
STATUS_URL
,
TRIAL_JOBS_URL
def
gen_new_config
(
config_file
,
training_service
=
'local'
):
...
...
@@ -37,18 +38,18 @@ def gen_new_config(config_file, training_service='local'):
config
=
get_yml_content
(
config_file
)
new_config_file
=
config_file
+
'.tmp'
ts
=
get_yml_content
(
'training_service.yml'
)[
training_service
]
print
(
ts
)
it_config
=
get_yml_content
(
'training_service.yml'
)
# hack for kubeflow trial config
if
training_service
==
'kubeflow'
:
ts
[
'trial'
][
'worker'
][
'command'
]
=
config
[
'trial'
][
'command'
]
it_config
[
training_service
]
[
'trial'
][
'worker'
][
'command'
]
=
config
[
'trial'
][
'command'
]
config
[
'trial'
].
pop
(
'command'
)
if
'gpuNum'
in
config
[
'trial'
]:
config
[
'trial'
].
pop
(
'gpuNum'
)
deep_update
(
config
,
ts
)
print
(
config
)
deep_update
(
config
,
it_config
[
'all'
])
deep_update
(
config
,
it_config
[
training_service
])
dump_yml_content
(
new_config_file
,
config
)
return
new_config_file
,
config
...
...
@@ -57,6 +58,7 @@ def run_test(config_file, training_service, local_gpu=False):
'''run test per configuration file'''
new_config_file
,
config
=
gen_new_config
(
config_file
,
training_service
)
print
(
json
.
dumps
(
config
,
sort_keys
=
True
,
indent
=
4
))
if
training_service
==
'local'
and
not
local_gpu
and
config
[
'trial'
][
'gpuNum'
]
>
0
:
print
(
'no gpu, skiping: '
,
config_file
)
...
...
@@ -72,14 +74,12 @@ def run_test(config_file, training_service, local_gpu=False):
for
_
in
range
(
0
,
max_duration
+
30
,
sleep_interval
):
time
.
sleep
(
sleep_interval
)
status
=
get_experiment_status
(
STATUS_URL
)
if
status
==
'DONE'
:
num_succeeded
=
get_succeeded_trial_num
(
TRIAL_JOBS_URL
)
if
training_service
==
'local'
:
print_stderr
(
TRIAL_JOBS_URL
)
assert
num_succeeded
==
max_trial_num
,
'only %d succeeded trial jobs, there should be %d'
%
(
num_succeeded
,
max_trial_num
)
if
status
in
[
'DONE'
,
'ERROR'
]
or
get_failed_trial_jobs
(
TRIAL_JOBS_URL
):
break
assert
status
==
'DONE'
,
'Failed to finish in maxExecDuration'
print_failed_job_log
(
config
[
'trainingServicePlatform'
],
TRIAL_JOBS_URL
)
if
status
!=
'DONE'
or
get_succeeded_trial_num
(
TRIAL_JOBS_URL
)
<
max_trial_num
:
raise
AssertionError
(
'Failed to finish in maxExecDuration'
)
finally
:
if
os
.
path
.
exists
(
new_config_file
):
os
.
remove
(
new_config_file
)
...
...
test/metrics_test.py
View file @
52b93d0c
...
...
@@ -26,7 +26,7 @@ import traceback
import
json
import
requests
from
utils
import
get_experiment_status
,
get_yml_content
,
parse_max_duration_time
,
get_succeeded_trial_num
,
print_
stderr
from
utils
import
get_experiment_status
,
get_yml_content
,
parse_max_duration_time
,
get_succeeded_trial_num
,
print_
failed_job_log
from
utils
import
GREEN
,
RED
,
CLEAR
,
STATUS_URL
,
TRIAL_JOBS_URL
,
METRICS_URL
def
run_test
():
...
...
@@ -49,7 +49,7 @@ def run_test():
#print('experiment status:', status)
if
status
==
'DONE'
:
num_succeeded
=
get_succeeded_trial_num
(
TRIAL_JOBS_URL
)
print_
stderr
(
TRIAL_JOBS_URL
)
print_
failed_job_log
(
'local'
,
TRIAL_JOBS_URL
)
if
sys
.
platform
==
"win32"
:
time
.
sleep
(
sleep_interval
)
# Windows seems to have some issues on updating in time
assert
num_succeeded
==
max_trial_num
,
'only %d succeeded trial jobs, there should be %d'
%
(
num_succeeded
,
max_trial_num
)
...
...
test/training_service.yml
View file @
52b93d0c
all
:
logCollection
:
http
kubeflow
:
maxExecDuration
:
15m
nniManagerIp
:
...
...
test/utils.py
View file @
52b93d0c
...
...
@@ -81,13 +81,18 @@ def get_experiment_id(experiment_url):
experiment_id
=
requests
.
get
(
experiment_url
).
json
()[
'id'
]
return
experiment_id
def
get_
nni_log_path
(
experiment_url
):
'''get
nni's log path from nni's experiment url
'''
def
get_
experiment_dir
(
experiment_url
):
'''get
experiment root directory
'''
experiment_id
=
get_experiment_id
(
experiment_url
)
experiment_path
=
os
.
path
.
join
(
os
.
path
.
expanduser
(
'~'
),
'nni'
,
'experiments'
,
experiment_id
)
nnimanager_log_path
=
os
.
path
.
join
(
experiment_path
,
'log'
,
'nnimanager.log'
)
return
os
.
path
.
join
(
os
.
path
.
expanduser
(
'~'
),
'nni'
,
'experiments'
,
experiment_id
)
return
nnimanager_log_path
def
get_nni_log_dir
(
experiment_url
):
'''get nni's log directory from nni's experiment url'''
return
os
.
path
.
join
(
get_experiment_dir
(
experiment_url
),
'log'
)
def
get_nni_log_path
(
experiment_url
):
'''get nni's log path from nni's experiment url'''
return
os
.
path
.
join
(
get_nni_log_dir
(
experiment_url
),
'nnimanager.log'
)
def
is_experiment_done
(
nnimanager_log_path
):
'''check if the experiment is done successfully'''
...
...
@@ -104,7 +109,6 @@ def get_experiment_status(status_url):
def
get_succeeded_trial_num
(
trial_jobs_url
):
trial_jobs
=
requests
.
get
(
trial_jobs_url
).
json
()
print
(
trial_jobs
)
num_succeed
=
0
for
trial_job
in
trial_jobs
:
if
trial_job
[
'status'
]
in
[
'SUCCEEDED'
,
'EARLY_STOPPED'
]:
...
...
@@ -112,17 +116,31 @@ def get_succeeded_trial_num(trial_jobs_url):
print
(
'num_succeed:'
,
num_succeed
)
return
num_succeed
def
print_stderr
(
trial_jobs_url
):
def
get_failed_trial_jobs
(
trial_jobs_url
):
'''Return failed trial jobs'''
trial_jobs
=
requests
.
get
(
trial_jobs_url
).
json
()
failed_jobs
=
[]
for
trial_job
in
trial_jobs
:
if
trial_job
[
'status'
]
in
[
'FAILED'
]:
failed_jobs
.
append
(
trial_job
)
return
failed_jobs
def
print_failed_job_log
(
training_service
,
trial_jobs_url
):
'''Print job log of FAILED trial jobs'''
trial_jobs
=
get_failed_trial_jobs
(
trial_jobs_url
)
for
trial_job
in
trial_jobs
:
if
tr
ial_job
[
'status'
]
==
'FAILED
'
:
if
tr
aining_service
==
'local
'
:
if
sys
.
platform
==
"win32"
:
p
=
trial_job
[
'stderrPath'
].
split
(
':'
)
stderr_path
=
':'
.
join
([
p
[
-
2
],
p
[
-
1
]])
subprocess
.
run
([
'type'
,
stderr_path
],
shell
=
True
)
log_filename
=
':'
.
join
([
p
[
-
2
],
p
[
-
1
]])
else
:
stderr_path
=
trial_job
[
'stderrPath'
].
split
(
':'
)[
-
1
]
subprocess
.
run
([
'cat'
,
stderr_path
])
log_filename
=
trial_job
[
'stderrPath'
].
split
(
':'
)[
-
1
]
else
:
log_filename
=
os
.
path
.
join
(
get_experiment_dir
(
EXPERIMENT_URL
),
'trials'
,
trial_job
[
'id'
],
'stdout_log_collection.log'
)
with
open
(
log_filename
,
'r'
)
as
f
:
log_content
=
f
.
read
()
print
(
log_filename
,
flush
=
True
)
print
(
log_content
,
flush
=
True
)
def
parse_max_duration_time
(
max_exec_duration
):
unit
=
max_exec_duration
[
-
1
]
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment