Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
nni
Commits
69cae211
Unverified
Commit
69cae211
authored
May 20, 2020
by
Chi Song
Committed by
GitHub
May 20, 2020
Browse files
Support Windows as remote node. (#2431)
parent
1180599a
Changes
33
Hide whitespace changes
Inline
Side-by-side
Showing
13 changed files
with
104 additions
and
41 deletions
+104
-41
test/config/examples/mnist-keras.yml
test/config/examples/mnist-keras.yml
+0
-1
test/config/examples/mnist-nested-search-space.yml
test/config/examples/mnist-nested-search-space.yml
+0
-1
test/config/examples/mnist-pytorch.yml
test/config/examples/mnist-pytorch.yml
+0
-1
test/config/examples/mnist-tfv1.yml
test/config/examples/mnist-tfv1.yml
+0
-1
test/config/integration_tests.yml
test/config/integration_tests.yml
+11
-2
test/nni_test/nnitest/naive_test.py
test/nni_test/nnitest/naive_test.py
+6
-5
test/nni_test/nnitest/run_tests.py
test/nni_test/nnitest/run_tests.py
+2
-12
test/nni_test/nnitest/utils.py
test/nni_test/nnitest/utils.py
+10
-3
test/pipelines/pipelines-it-remote-linux-to-windows.yml
test/pipelines/pipelines-it-remote-linux-to-windows.yml
+48
-0
tools/nni_cmd/launcher.py
tools/nni_cmd/launcher.py
+3
-1
tools/nni_cmd/nnictl_utils.py
tools/nni_cmd/nnictl_utils.py
+1
-1
tools/nni_trial_tool/constants.py
tools/nni_trial_tool/constants.py
+0
-2
tools/nni_trial_tool/trial_keeper.py
tools/nni_trial_tool/trial_keeper.py
+23
-11
No files found.
test/config/examples/mnist-keras.yml
View file @
69cae211
...
...
@@ -14,7 +14,6 @@ assessor:
trial
:
codeDir
:
../../../examples/trials/mnist-keras
command
:
python3 mnist-keras.py --num_train 200 --epochs
1
gpuNum
:
0
useAnnotation
:
false
multiPhase
:
false
...
...
test/config/examples/mnist-nested-search-space.yml
View file @
69cae211
...
...
@@ -15,7 +15,6 @@ assessor:
trial
:
codeDir
:
../../../examples/trials/mnist-nested-search-space
command
:
python3 mnist.py --batch_num
10
gpuNum
:
0
useAnnotation
:
false
multiPhase
:
false
...
...
test/config/examples/mnist-pytorch.yml
View file @
69cae211
...
...
@@ -14,7 +14,6 @@ assessor:
trial
:
codeDir
:
../../../examples/trials/mnist-pytorch
command
:
python3 mnist.py --epochs 1 --batch_num
10
gpuNum
:
0
useAnnotation
:
false
multiPhase
:
false
...
...
test/config/examples/mnist-tfv1.yml
View file @
69cae211
...
...
@@ -14,7 +14,6 @@ assessor:
trial
:
codeDir
:
../../../examples/trials/mnist-tfv1
command
:
python3 mnist.py --batch_num
10
gpuNum
:
0
useAnnotation
:
false
multiPhase
:
false
...
...
test/config/integration_tests.yml
View file @
69cae211
defaultTestCaseConfig
:
launchCommand
:
nnictl create --config $configFile
launchCommand
:
nnictl create --config $configFile
--debug
stopCommand
:
nnictl stop
experimentStatusCheck
:
True
platform
:
linux darwin win32
...
...
@@ -22,7 +22,7 @@ testCases:
validator
:
# launch command, default launch command is 'nnictl create --config $configFile'
launchCommand
:
nnictl create --config $configFile
launchCommand
:
nnictl create --config $configFile
--debug
# stop command, default stop command is 'nnictl stop', empty means no stop command
stopCommand
:
nnictl stop
...
...
@@ -38,15 +38,24 @@ testCases:
-
name
:
mnist-tfv1
configFile
:
test/config/examples/mnist-tfv1.yml
config
:
maxTrialNum
:
1
trialConcurrency
:
1
-
name
:
mnist-keras
configFile
:
test/config/examples/mnist-keras.yml
config
:
maxTrialNum
:
2
trialConcurrency
:
1
-
name
:
mnist-pytorch
configFile
:
test/config/examples/mnist-pytorch.yml
-
name
:
mnist-annotation
configFile
:
test/config/examples/mnist-annotation.yml
config
:
maxTrialNum
:
1
trialConcurrency
:
1
-
name
:
cifar10-pytorch
configFile
:
test/config/examples/cifar10-pytorch.yml
...
...
test/nni_test/nnitest/naive_test.py
View file @
69cae211
...
...
@@ -10,7 +10,7 @@ import sys
import
time
import
traceback
from
utils
import
is_experiment_done
,
get_experiment_id
,
get_nni_log_path
,
read_last_line
,
remove_files
,
setup_experiment
,
detect_port
,
snooz
e
from
utils
import
is_experiment_done
,
get_experiment_id
,
get_nni_log_path
,
read_last_line
,
remove_files
,
setup_experiment
,
detect_port
,
wait_for_port_availabl
e
from
utils
import
GREEN
,
RED
,
CLEAR
,
EXPERIMENT_URL
NNI_SOURCE_DIR
=
'..'
...
...
@@ -71,7 +71,7 @@ def naive_test(args):
assert
assessor_result
==
expected
,
'Bad assessor result'
subprocess
.
run
([
'nnictl'
,
'stop'
])
snooze
(
)
wait_for_port_available
(
8080
,
10
)
def
stop_experiment_test
(
args
):
config_file
=
args
.
config
...
...
@@ -86,19 +86,20 @@ def stop_experiment_test(args):
experiment_id
=
get_experiment_id
(
EXPERIMENT_URL
)
proc
=
subprocess
.
run
([
'nnictl'
,
'stop'
,
experiment_id
])
assert
proc
.
returncode
==
0
,
'`nnictl stop %s` failed with code %d'
%
(
experiment_id
,
proc
.
returncode
)
snooze
(
)
wait_for_port_available
(
8080
,
10
)
assert
not
detect_port
(
8080
),
'`nnictl stop %s` failed to stop experiments'
%
experiment_id
# test cmd `nnictl stop --port`
proc
=
subprocess
.
run
([
'nnictl'
,
'stop'
,
'--port'
,
'8990'
])
assert
proc
.
returncode
==
0
,
'`nnictl stop %s` failed with code %d'
%
(
experiment_id
,
proc
.
returncode
)
snooze
(
)
wait_for_port_available
(
8990
,
10
)
assert
not
detect_port
(
8990
),
'`nnictl stop %s` failed to stop experiments'
%
experiment_id
# test cmd `nnictl stop --all`
proc
=
subprocess
.
run
([
'nnictl'
,
'stop'
,
'--all'
])
assert
proc
.
returncode
==
0
,
'`nnictl stop --all` failed with code %d'
%
proc
.
returncode
snooze
()
wait_for_port_available
(
8888
,
10
)
wait_for_port_available
(
8989
,
10
)
assert
not
detect_port
(
8888
)
and
not
detect_port
(
8989
),
'`nnictl stop --all` failed to stop experiments'
...
...
test/nni_test/nnitest/run_tests.py
View file @
69cae211
...
...
@@ -15,7 +15,7 @@ import ruamel.yaml as yaml
from
utils
import
get_experiment_status
,
get_yml_content
,
dump_yml_content
,
get_experiment_id
,
\
parse_max_duration_time
,
get_trial_stats
,
deep_update
,
print_trial_job_log
,
get_failed_trial_jobs
,
\
get_experiment_dir
,
print_experiment_log
from
utils
import
GREEN
,
RED
,
CLEAR
,
STATUS_URL
,
TRIAL_JOBS_URL
,
EXPERIMENT_URL
,
REST_ENDPOINT
,
detect_port
from
utils
import
GREEN
,
RED
,
CLEAR
,
STATUS_URL
,
TRIAL_JOBS_URL
,
EXPERIMENT_URL
,
REST_ENDPOINT
,
wait_for_port_available
import
validators
it_variables
=
{}
...
...
@@ -157,7 +157,7 @@ def launch_test(config_file, training_service, test_case_config):
if
num_failed
>
0
:
print
(
'failed jobs: '
,
num_failed
)
break
time
.
sleep
(
3
)
time
.
sleep
(
1
)
except
:
print_experiment_log
(
experiment_id
=
experiment_id
)
raise
...
...
@@ -189,16 +189,6 @@ def case_included(name, cases):
return
True
return
False
def
wait_for_port_available
(
port
,
timeout
):
begin_time
=
time
.
time
()
while
True
:
if
not
detect_port
(
port
):
return
if
time
.
time
()
-
begin_time
>
timeout
:
msg
=
'port {} is not available in {} seconds.'
.
format
(
port
,
timeout
)
raise
RuntimeError
(
msg
)
time
.
sleep
(
5
)
def
match_platform
(
test_case_config
):
return
sys
.
platform
in
test_case_config
[
'platform'
].
split
(
' '
)
...
...
test/nni_test/nnitest/utils.py
View file @
69cae211
...
...
@@ -168,6 +168,13 @@ def detect_port(port):
except
:
return
False
def
snooze
():
'''Sleep to make sure previous stopped exp has enough time to exit'''
time
.
sleep
(
6
)
def
wait_for_port_available
(
port
,
timeout
):
begin_time
=
time
.
time
()
while
True
:
if
not
detect_port
(
port
):
return
if
time
.
time
()
-
begin_time
>
timeout
:
msg
=
'port {} is not available in {} seconds.'
.
format
(
port
,
timeout
)
raise
RuntimeError
(
msg
)
time
.
sleep
(
1
)
test/pipelines/pipelines-it-remote-linux-to-windows.yml
0 → 100644
View file @
69cae211
jobs
:
-
job
:
"
integration_test_remote_linux_to_windows"
timeoutInMinutes
:
120
steps
:
-
script
:
make clean
displayName
:
"
clean
nni
source
code"
-
task
:
CopyFilesOverSSH@0
inputs
:
sshEndpoint
:
$(end_point)
contents
:
|
**
!**/dist/**
!**/node_modules/**
targetFolder
:
/tmp/nnitest/$(Build.BuildId)
overwrite
:
true
displayName
:
"
Copy
all
files
to
remote
machine"
timeoutInMinutes
:
10
-
task
:
SSH@0
inputs
:
sshEndpoint
:
$(end_point)
runOptions
:
commands
commands
:
cd "\tmp\nnitest\$(Build.BuildId)" && powershell.exe -command "conda activate l2w | .\uninstall.ps1 | .\install.ps1"
failOnStdErr
:
false
displayName
:
"
install
on
remote
windows"
-
script
:
python3 -m pip install --upgrade pip setuptools --user
displayName
:
"
Install
python
tools"
-
script
:
make easy-install
displayName
:
"
Install
nni
via
source
code"
-
script
:
|
sudo apt-get install swig -y
PATH=$HOME/.local/bin:$PATH nnictl package install --name=SMAC
PATH=$HOME/.local/bin:$PATH nnictl package install --name=BOHB
displayName
:
"
Install
dependencies
for
integration
tests
in
remote
mode"
-
script
:
|
set -e
cd test
python3 nni_test/nnitest/generate_ts_config.py --ts remote --remote_user $(remote_user) --remote_host $(remote_host) \
--remote_port $(remote_port) --remote_pwd $(remote_pwd) --nni_manager_ip $(nni_manager_ip)
cat config/training_service.yml
PATH=$HOME/.local/bin:$PATH python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts remote
displayName
:
"
integration
test"
-
task
:
SSH@0
inputs
:
sshEndpoint
:
$(end_point)
runOptions
:
commands
commands
:
rmdir /s /q "\\?\c:\tmp\nnitest\$(Build.BuildId)"
condition
:
always()
displayName
:
"
clean
up
on
remote
server"
tools/nni_cmd/launcher.py
View file @
69cae211
...
...
@@ -139,7 +139,9 @@ def set_remote_config(experiment_config, port, config_file_name):
for
i
in
range
(
len
(
request_data
[
'machine_list'
])):
if
isinstance
(
request_data
[
'machine_list'
][
i
].
get
(
'gpuIndices'
),
int
):
request_data
[
'machine_list'
][
i
][
'gpuIndices'
]
=
str
(
request_data
[
'machine_list'
][
i
].
get
(
'gpuIndices'
))
response
=
rest_put
(
cluster_metadata_url
(
port
),
json
.
dumps
(
request_data
),
REST_TIME_OUT
)
# It needs to connect to all remote machines; the connection timeout is 30 seconds.
# So the timeout here should be longer.
response
=
rest_put
(
cluster_metadata_url
(
port
),
json
.
dumps
(
request_data
),
60
,
True
)
err_message
=
''
if
not
response
or
not
check_response
(
response
):
if
response
is
not
None
:
...
...
tools/nni_cmd/nnictl_utils.py
View file @
69cae211
...
...
@@ -227,7 +227,7 @@ def stop_experiment(args):
experiment_config
=
Experiments
()
experiment_dict
=
experiment_config
.
get_all_experiments
()
for
experiment_id
in
experiment_id_list
:
print_normal
(
'Stoping experiment %s'
%
experiment_id
)
print_normal
(
'Stop
p
ing experiment %s'
%
experiment_id
)
nni_config
=
Config
(
experiment_dict
[
experiment_id
][
'fileName'
])
rest_pid
=
nni_config
.
get_config
(
'restServerPid'
)
if
rest_pid
:
...
...
tools/nni_trial_tool/constants.py
View file @
69cae211
...
...
@@ -7,8 +7,6 @@ API_ROOT_URL = '/api/v1/nni-pai'
BASE_URL
=
'http://{}'
HOME_DIR
=
os
.
path
.
join
(
os
.
environ
[
'HOME'
],
'nni'
)
LOG_DIR
=
os
.
environ
[
'NNI_OUTPUT_DIR'
]
NNI_PLATFORM
=
os
.
environ
[
'NNI_PLATFORM'
]
...
...
tools/nni_trial_tool/trial_keeper.py
View file @
69cae211
...
...
@@ -2,23 +2,27 @@
# Licensed under the MIT license.
import
argparse
import
os
from
subprocess
import
Popen
import
time
import
ctypes
import
json
import
logging
import
s
hlex
import
o
s
import
re
import
shlex
import
sys
import
json
import
threading
from
pyhdfs
import
HdfsClient
import
time
from
subprocess
import
Popen
import
pkg_resources
from
.rest_utils
import
rest_post
,
rest_get
from
.url_utils
import
gen_send_version_url
,
gen_parameter_meta_url
from
pyhdfs
import
HdfsClient
from
.constants
import
LOG_DIR
,
NNI_PLATFORM
,
MULTI_PHASE
,
NNI_TRIAL_JOB_ID
,
NNI_SYS_DIR
,
NNI_EXP_ID
from
.hdfsClientUtility
import
copyDirectoryToHdfs
,
copyHdfsDirectoryToLocal
,
copyHdfsFileToLocal
from
.log_utils
import
LogType
,
nni_log
,
RemoteLogger
,
StdOutputType
from
.constants
import
(
LOG_DIR
,
MULTI_PHASE
,
NNI_EXP_ID
,
NNI_PLATFORM
,
NNI_SYS_DIR
,
NNI_TRIAL_JOB_ID
)
from
.hdfsClientUtility
import
(
copyDirectoryToHdfs
,
copyHdfsDirectoryToLocal
,
copyHdfsFileToLocal
)
from
.log_utils
import
LogType
,
RemoteLogger
,
StdOutputType
,
nni_log
from
.rest_utils
import
rest_get
,
rest_post
from
.url_utils
import
gen_parameter_meta_url
,
gen_send_version_url
logger
=
logging
.
getLogger
(
'trial_keeper'
)
regular
=
re
.
compile
(
'v?(?P<version>[0-9](\.[0-9]){0,1}).*'
)
...
...
@@ -80,6 +84,10 @@ def main_loop(args):
if
hdfs_client
is
not
None
:
copyHdfsDirectoryToLocal
(
args
.
nni_hdfs_exp_dir
,
os
.
getcwd
(),
hdfs_client
)
if
args
.
job_id_file
:
with
open
(
args
.
job_id_file
,
'w'
)
as
job_file
:
job_file
.
write
(
"%d"
%
os
.
getpid
())
# Notice: We don't specify env, which means the subprocess will inherit the current environment, and that is the expected behavior
log_pipe_stdout
=
trial_syslogger_stdout
.
get_pipelog_reader
()
process
=
Popen
(
args
.
trial_command
,
shell
=
True
,
stdout
=
log_pipe_stdout
,
stderr
=
log_pipe_stdout
)
...
...
@@ -91,6 +99,9 @@ def main_loop(args):
retCode
=
process
.
poll
()
# child worker process exits and all stdout data is read
if
retCode
is
not
None
and
log_pipe_stdout
.
set_process_exit
()
and
log_pipe_stdout
.
is_read_completed
==
True
:
# On Windows, the retCode -1 is 4294967295. It's larger than c_long, and raises OverflowError.
# So convert it to int32.
retCode
=
ctypes
.
c_long
(
retCode
).
value
nni_log
(
LogType
.
Info
,
'subprocess terminated. Exit code is {}. Quit'
.
format
(
retCode
))
if
hdfs_output_dir
is
not
None
:
# Copy local directory to hdfs for OpenPAI
...
...
@@ -218,6 +229,7 @@ if __name__ == '__main__':
PARSER
.
add_argument
(
'--webhdfs_path'
,
type
=
str
,
help
=
'the webhdfs path used in webhdfs URL'
)
PARSER
.
add_argument
(
'--nni_manager_version'
,
type
=
str
,
help
=
'the nni version transmitted from nniManager'
)
PARSER
.
add_argument
(
'--log_collection'
,
type
=
str
,
help
=
'set the way to collect log in trialkeeper'
)
PARSER
.
add_argument
(
'--job_id_file'
,
type
=
str
,
help
=
'set job id file for operating and monitoring job.'
)
args
,
unknown
=
PARSER
.
parse_known_args
()
if
args
.
trial_command
is
None
:
exit
(
1
)
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment