Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
21b48d29
Unverified
Commit
21b48d29
authored
Mar 27, 2019
by
SparkSnail
Committed by
GitHub
Mar 27, 2019
Browse files
Support showing version check error message in WebUI (#922)
parent
0330333c
Changes
12
Show whitespace changes
Inline
Side-by-side
Showing
12 changed files
with
89 additions
and
14 deletions
+89
-14
src/nni_manager/training_service/common/clusterJobRestServer.ts
...i_manager/training_service/common/clusterJobRestServer.ts
+41
-0
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService.ts
...frameworkcontroller/frameworkcontrollerTrainingService.ts
+5
-0
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts
...ng_service/kubernetes/kubeflow/kubeflowTrainingService.ts
+5
-0
src/nni_manager/training_service/kubernetes/kubernetesData.ts
...nni_manager/training_service/kubernetes/kubernetesData.ts
+1
-1
src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts
.../training_service/kubernetes/kubernetesTrainingService.ts
+1
-1
src/nni_manager/training_service/pai/paiData.ts
src/nni_manager/training_service/pai/paiData.ts
+1
-1
src/nni_manager/training_service/pai/paiTrainingService.ts
src/nni_manager/training_service/pai/paiTrainingService.ts
+6
-2
src/nni_manager/training_service/remote_machine/remoteMachineData.ts
...ager/training_service/remote_machine/remoteMachineData.ts
+1
-1
src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
...ng_service/remote_machine/remoteMachineTrainingService.ts
+5
-0
tools/nni_trial_tool/constants.py
tools/nni_trial_tool/constants.py
+1
-0
tools/nni_trial_tool/trial_keeper.py
tools/nni_trial_tool/trial_keeper.py
+16
-6
tools/nni_trial_tool/url_utils.py
tools/nni_trial_tool/url_utils.py
+6
-2
No files found.
src/nni_manager/training_service/common/clusterJobRestServer.ts
View file @
21b48d29
...
...
@@ -41,6 +41,10 @@ export abstract class ClusterJobRestServer extends RestServer{
private
readonly
expId
:
string
=
getExperimentId
();
private
enableVersionCheck
:
boolean
=
true
;
//switch to enable version check
private
versionCheckSuccess
:
boolean
|
undefined
;
private
errorMessage
?:
string
;
/**
* constructor to provide NNIRestServer's own rest property, e.g. port
*/
...
...
@@ -59,6 +63,14 @@ export abstract class ClusterJobRestServer extends RestServer{
return
this
.
port
;
}
/**
 * Error message recorded by this rest server's route handlers,
 * or undefined when no error has been recorded.
 */
public get getErrorMessage(): string | undefined {
    return this.errorMessage;
}
/**
 * Switch the version check on or off.
 * The `/version` and `/stdout` routes consult this flag before touching version-check state.
 */
public set setEnableVersionCheck(versionCheck: boolean) {
    this.enableVersionCheck = versionCheck;
}
/**
* NNIRestServer's own router registration
*/
...
...
@@ -77,6 +89,31 @@ export abstract class ClusterJobRestServer extends RestServer{
next
();
});
// Route that receives the version-check result posted for a trial.
// The request body carries `tag` ('VCSuccess' on success) and, on failure, `msg`.
router.post(`/version/${this.expId}/:trialId`, (req: Request, res: Response) => {
    if (!this.enableVersionCheck) {
        this.log.info(`Skipping version check!`);
        res.send();

        return;
    }

    try {
        const checkResultSuccess: boolean = req.body.tag === 'VCSuccess';
        if (this.versionCheckSuccess !== undefined && this.versionCheckSuccess !== checkResultSuccess) {
            // A previously recorded result disagrees with this one.
            this.errorMessage = 'Version check error, version check result is inconsistent!';
            this.log.error(this.errorMessage);
        } else if (checkResultSuccess) {
            this.log.info(`Version check in trialKeeper success!`);
            this.versionCheckSuccess = true;
        } else {
            this.versionCheckSuccess = false;
            this.errorMessage = req.body.msg;
        }
    } catch (err) {
        this.log.error(`json parse metrics error: ${err}`);
        res.status(500);
        res.send(err.message);

        // The error response is already sent; returning avoids a second res.send() below.
        return;
    }

    res.send();
});
router
.
post
(
`/update-metrics/
${
this
.
expId
}
/:trialId`
,
(
req
:
Request
,
res
:
Response
)
=>
{
try
{
this
.
log
.
info
(
`Get update-metrics request, trial job id is
${
req
.
params
.
trialId
}
`
);
...
...
@@ -94,6 +131,10 @@ export abstract class ClusterJobRestServer extends RestServer{
});
router
.
post
(
`/stdout/
${
this
.
expId
}
/:trialId`
,
(
req
:
Request
,
res
:
Response
)
=>
{
if
(
this
.
enableVersionCheck
&&
!
this
.
versionCheckSuccess
&&
!
this
.
errorMessage
)
{
this
.
errorMessage
=
`Version check failed, didn't get version check response from trialKeeper, please check your NNI version in `
+
`NNIManager and TrialKeeper!`
}
const
trialLogPath
:
string
=
path
.
join
(
getLogDir
(),
`trial_
${
req
.
params
.
trialId
}
.log`
);
try
{
let
skipLogging
:
boolean
=
false
;
...
...
src/nni_manager/training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService.ts
View file @
21b48d29
...
...
@@ -66,11 +66,16 @@ class FrameworkControllerTrainingService extends KubernetesTrainingService imple
throw
new
Error
(
'
kubernetesJobRestServer not initialized!
'
);
}
await
this
.
kubernetesJobRestServer
.
start
();
this
.
kubernetesJobRestServer
.
setEnableVersionCheck
=
this
.
versionCheck
;
this
.
log
.
info
(
`frameworkcontroller Training service rest server listening on:
${
this
.
kubernetesJobRestServer
.
endPoint
}
`
);
while
(
!
this
.
stopping
)
{
// collect metrics for frameworkcontroller jobs by interacting with Kubernetes API server
await
delay
(
3000
);
await
this
.
fcJobInfoCollector
.
retrieveTrialStatus
(
this
.
kubernetesCRDClient
);
// Surface any error recorded by the rest server; the throw leaves the polling loop,
// so no statement may follow it inside this branch.
if (this.kubernetesJobRestServer.getErrorMessage) {
    throw new Error(this.kubernetesJobRestServer.getErrorMessage);
}
}
}
...
...
src/nni_manager/training_service/kubernetes/kubeflow/kubeflowTrainingService.ts
View file @
21b48d29
...
...
@@ -71,11 +71,16 @@ class KubeflowTrainingService extends KubernetesTrainingService implements Kuber
throw
new
Error
(
'
kubernetesJobRestServer not initialized!
'
);
}
await
this
.
kubernetesJobRestServer
.
start
();
this
.
kubernetesJobRestServer
.
setEnableVersionCheck
=
this
.
versionCheck
;
this
.
log
.
info
(
`Kubeflow Training service rest server listening on:
${
this
.
kubernetesJobRestServer
.
endPoint
}
`
);
while
(
!
this
.
stopping
)
{
// collect metrics for Kubeflow jobs by interacting with Kubernetes API server
await
delay
(
3000
);
await
this
.
kubeflowJobInfoCollector
.
retrieveTrialStatus
(
this
.
kubernetesCRDClient
);
// Surface any error recorded by the rest server; the throw leaves the polling loop,
// so no statement may follow it inside this branch.
if (this.kubernetesJobRestServer.getErrorMessage) {
    throw new Error(this.kubernetesJobRestServer.getErrorMessage);
}
}
this
.
log
.
info
(
'
Kubeflow training service exit.
'
);
}
...
...
src/nni_manager/training_service/kubernetes/kubernetesData.ts
View file @
21b48d29
...
...
@@ -71,5 +71,5 @@ mkdir -p $NNI_OUTPUT_DIR
cp -rT $NNI_CODE_DIR $NNI_SYS_DIR
cd $NNI_SYS_DIR
sh install_nni.sh
python3 -m nni_trial_tool.trial_keeper --trial_command '{8}' --nnimanager_ip {9} --nnimanager_port {10} --version '{11}' --log_collection '{12}'`
python3 -m nni_trial_tool.trial_keeper --trial_command '{8}' --nnimanager_ip {9} --nnimanager_port {10} --
nni_manager_
version '{11}' --log_collection '{12}'`
+
`1>$NNI_OUTPUT_DIR/trialkeeper_stdout 2>$NNI_OUTPUT_DIR/trialkeeper_stderr`
src/nni_manager/training_service/kubernetes/kubernetesTrainingService.ts
View file @
21b48d29
...
...
@@ -61,7 +61,7 @@ abstract class KubernetesTrainingService {
protected
kubernetesCRDClient
?:
KubernetesCRDClient
;
protected
kubernetesJobRestServer
?:
KubernetesJobRestServer
;
protected
kubernetesClusterConfig
?:
KubernetesClusterConfig
;
protected
versionCheck
?
:
boolean
=
true
;
protected
versionCheck
:
boolean
=
true
;
protected
logCollection
:
string
;
constructor
()
{
...
...
src/nni_manager/training_service/pai/paiData.ts
View file @
21b48d29
...
...
@@ -64,7 +64,7 @@ export const PAI_TRIAL_COMMAND_FORMAT: string =
`export NNI_PLATFORM=pai NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={2} NNI_EXP_ID={3} NNI_TRIAL_SEQ_ID={4}
&& cd $NNI_SYS_DIR && sh install_nni.sh
&& python3 -m nni_trial_tool.trial_keeper --trial_command '{5}' --nnimanager_ip '{6}' --nnimanager_port '{7}'
--pai_hdfs_output_dir '{8}' --pai_hdfs_host '{9}' --pai_user_name {10} --nni_hdfs_exp_dir '{11}' --webhdfs_path '/webhdfs/api/v1' --version '{12}' --log_collection '{13}'`
;
--pai_hdfs_output_dir '{8}' --pai_hdfs_host '{9}' --pai_user_name {10} --nni_hdfs_exp_dir '{11}' --webhdfs_path '/webhdfs/api/v1' --
nni_manager_
version '{12}' --log_collection '{13}'`
;
export
const
PAI_OUTPUT_DIR_FORMAT
:
string
=
`hdfs://{0}:9000/`
;
...
...
src/nni_manager/training_service/pai/paiTrainingService.ts
View file @
21b48d29
...
...
@@ -75,7 +75,7 @@ class PAITrainingService implements TrainingService {
private
paiRestServerPort
?:
number
;
private
nniManagerIpConfig
?:
NNIManagerIpConfig
;
private
copyExpCodeDirPromise
?:
Promise
<
void
>
;
private
versionCheck
?
:
boolean
=
true
;
private
versionCheck
:
boolean
=
true
;
private
logCollection
:
string
;
constructor
()
{
...
...
@@ -97,11 +97,15 @@ class PAITrainingService implements TrainingService {
this
.
log
.
info
(
'
Run PAI training service.
'
);
const
restServer
:
PAIJobRestServer
=
component
.
get
(
PAIJobRestServer
);
await
restServer
.
start
();
restServer
.
setEnableVersionCheck
=
this
.
versionCheck
;
this
.
log
.
info
(
`PAI Training service rest server listening on:
${
restServer
.
endPoint
}
`
);
while
(
!
this
.
stopping
)
{
await
this
.
updatePaiToken
();
await
this
.
paiJobCollector
.
retrieveTrialStatus
(
this
.
paiToken
,
this
.
paiClusterConfig
);
// Surface any error recorded by the rest server; the throw leaves the polling loop,
// so no statement may follow it inside this branch.
if (restServer.getErrorMessage) {
    throw new Error(restServer.getErrorMessage);
}
await
delay
(
3000
);
}
this
.
log
.
info
(
'
PAI training service exit.
'
);
...
...
src/nni_manager/training_service/remote_machine/remoteMachineData.ts
View file @
21b48d29
...
...
@@ -250,7 +250,7 @@ export NNI_PLATFORM=remote NNI_SYS_DIR={0} NNI_OUTPUT_DIR={1} NNI_TRIAL_JOB_ID={
cd $NNI_SYS_DIR
sh install_nni.sh
echo $$ >{6}
python3 -m nni_trial_tool.trial_keeper --trial_command '{7}' --nnimanager_ip '{8}' --nnimanager_port '{9}' --version '{10}' --log_collection '{11}' 1>$NNI_OUTPUT_DIR/trialkeeper_stdout 2>$NNI_OUTPUT_DIR/trialkeeper_stderr
python3 -m nni_trial_tool.trial_keeper --trial_command '{7}' --nnimanager_ip '{8}' --nnimanager_port '{9}' --
nni_manager_
version '{10}' --log_collection '{11}' 1>$NNI_OUTPUT_DIR/trialkeeper_stdout 2>$NNI_OUTPUT_DIR/trialkeeper_stderr
echo $?
\`
date +%s%3N
\`
>{12}`
;
export
const
HOST_JOB_SHELL_FORMAT
:
string
=
...
...
src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts
View file @
21b48d29
...
...
@@ -102,6 +102,7 @@ class RemoteMachineTrainingService implements TrainingService {
public
async
run
():
Promise
<
void
>
{
const
restServer
:
RemoteMachineJobRestServer
=
component
.
get
(
RemoteMachineJobRestServer
);
await
restServer
.
start
();
restServer
.
setEnableVersionCheck
=
this
.
versionCheck
;
this
.
log
.
info
(
'
Run remote machine training service.
'
);
while
(
!
this
.
stopping
)
{
while
(
this
.
jobQueue
.
length
>
0
)
{
...
...
@@ -117,6 +118,10 @@ class RemoteMachineTrainingService implements TrainingService {
break
;
}
}
// Surface any error recorded by the rest server; the throw leaves the polling loop,
// so no statement may follow it inside this branch.
if (restServer.getErrorMessage) {
    throw new Error(restServer.getErrorMessage);
}
await
delay
(
3000
);
}
this
.
log
.
info
(
'
Remote machine training service exit.
'
);
...
...
tools/nni_trial_tool/constants.py
View file @
21b48d29
...
...
@@ -35,6 +35,7 @@ STDOUT_FULL_PATH = os.path.join(LOG_DIR, 'stdout')
STDERR_FULL_PATH
=
os
.
path
.
join
(
LOG_DIR
,
'stderr'
)
STDOUT_API
=
'/stdout'
VERSION_API
=
'/version'
NNI_SYS_DIR
=
os
.
environ
[
'NNI_SYS_DIR'
]
NNI_TRIAL_JOB_ID
=
os
.
environ
[
'NNI_TRIAL_JOB_ID'
]
NNI_EXP_ID
=
os
.
environ
[
'NNI_EXP_ID'
]
\ No newline at end of file
tools/nni_trial_tool/trial_keeper.py
View file @
21b48d29
...
...
@@ -27,14 +27,18 @@ import shlex
import
re
import
sys
import
select
import
json
from
pyhdfs
import
HdfsClient
import
pkg_resources
from
.rest_utils
import
rest_post
from
.url_utils
import
gen_send_stdout_url
,
gen_send_version_url
from
.constants
import
HOME_DIR
,
LOG_DIR
,
NNI_PLATFORM
,
STDOUT_FULL_PATH
,
STDERR_FULL_PATH
from
.hdfsClientUtility
import
copyDirectoryToHdfs
,
copyHdfsDirectoryToLocal
from
.log_utils
import
LogType
,
nni_log
,
RemoteLogger
,
PipeLogReader
,
StdOutputType
logger
=
logging
.
getLogger
(
'trial_keeper'
)
regular
=
re
.
compile
(
'v?(?P<version>[0-9](\.[0-9]){0,1}).*'
)
def
main_loop
(
args
):
'''main loop logic for trial keeper'''
...
...
@@ -110,21 +114,27 @@ def check_version(args):
#package nni does not exist, try nni-tool package
nni_log
(
LogType
.
Error
,
'Package nni does not exist!'
)
os
.
_exit
(
1
)
if
not
args
.
version
:
if
not
args
.
nni_manager_
version
:
# skip version check
nni_log
(
LogType
.
Warning
,
'Skipping version check!'
)
else
:
regular
=
re
.
compile
(
'v?(?P<version>[0-9](\.[0-9]){0,2}).*'
)
try
:
trial_keeper_version
=
regular
.
search
(
trial_keeper_version
).
group
(
'version'
)
nni_log
(
LogType
.
Info
,
'trial_keeper_version is {0}'
.
format
(
trial_keeper_version
))
training_service_version
=
regular
.
search
(
args
.
version
).
group
(
'version'
)
nni_log
(
LogType
.
Info
,
'training_service_version is {0}'
.
format
(
training_service_version
))
if
trial_keeper_version
!=
training_service_version
:
nni_manager_version
=
regular
.
search
(
args
.
nni_manager_version
).
group
(
'version'
)
nni_log
(
LogType
.
Info
,
'nni_manager_version is {0}'
.
format
(
nni_manager_version
))
log_entry
=
{}
if
trial_keeper_version
!=
nni_manager_version
:
nni_log
(
LogType
.
Error
,
'Version does not match!'
)
error_message
=
'NNIManager version is {0}, TrialKeeper version is {1}, NNI version does not match!'
.
format
(
nni_manager_version
,
trial_keeper_version
)
log_entry
[
'tag'
]
=
'VCFail'
log_entry
[
'msg'
]
=
error_message
rest_post
(
gen_send_version_url
(
args
.
nnimanager_ip
,
args
.
nnimanager_port
),
json
.
dumps
(
log_entry
),
10
,
False
)
os
.
_exit
(
1
)
else
:
nni_log
(
LogType
.
Info
,
'Version match!'
)
log_entry
[
'tag'
]
=
'VCSuccess'
rest_post
(
gen_send_version_url
(
args
.
nnimanager_ip
,
args
.
nnimanager_port
),
json
.
dumps
(
log_entry
),
10
,
False
)
except
AttributeError
as
err
:
nni_log
(
LogType
.
Error
,
err
)
...
...
@@ -142,7 +152,7 @@ if __name__ == '__main__':
PARSER
.
add_argument
(
'--pai_user_name'
,
type
=
str
,
help
=
'the username of hdfs'
)
PARSER
.
add_argument
(
'--nni_hdfs_exp_dir'
,
type
=
str
,
help
=
'nni experiment directory in hdfs'
)
PARSER
.
add_argument
(
'--webhdfs_path'
,
type
=
str
,
help
=
'the webhdfs path used in webhdfs URL'
)
PARSER
.
add_argument
(
'--version'
,
type
=
str
,
help
=
'the nni version transmitted from
trainingService
'
)
PARSER
.
add_argument
(
'--
nni_manager_
version'
,
type
=
str
,
help
=
'the nni version transmitted from
nniManager
'
)
PARSER
.
add_argument
(
'--log_collection'
,
type
=
str
,
help
=
'set the way to collect log in trialkeeper'
)
args
,
unknown
=
PARSER
.
parse_known_args
()
if
args
.
trial_command
is
None
:
...
...
tools/nni_trial_tool/url_utils.py
View file @
21b48d29
...
...
@@ -18,8 +18,12 @@
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
from
.constants
import
API_ROOT_URL
,
BASE_URL
,
STDOUT_API
,
NNI_TRIAL_JOB_ID
,
NNI_EXP_ID
from
.constants
import
API_ROOT_URL
,
BASE_URL
,
STDOUT_API
,
NNI_TRIAL_JOB_ID
,
NNI_EXP_ID
,
VERSION_API
def gen_send_stdout_url(ip, port):
    '''Build the stdout URL from BASE_URL (formatted with ip), port, API_ROOT_URL,
    STDOUT_API, the experiment id and the trial job id.'''
    host = BASE_URL.format(ip)
    url_parts = (host, port, API_ROOT_URL, STDOUT_API, NNI_EXP_ID, NNI_TRIAL_JOB_ID)
    return '{0}:{1}{2}{3}/{4}/{5}'.format(*url_parts)
def gen_send_version_url(ip, port):
    '''Build the version-check URL from BASE_URL (formatted with ip), port, API_ROOT_URL,
    VERSION_API, the experiment id and the trial job id.'''
    host = BASE_URL.format(ip)
    url_parts = (host, port, API_ROOT_URL, VERSION_API, NNI_EXP_ID, NNI_TRIAL_JOB_ID)
    return '{0}:{1}{2}{3}/{4}/{5}'.format(*url_parts)
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment