Unverified commit f548d82f authored by SparkSnail, committed by GitHub

Merge pull request #250 from microsoft/master

merge master
parents 0a742aff 69cae211
@@ -13,7 +13,6 @@ assessor:
 trial:
   codeDir: ../../../examples/trials/mnist-annotation
   command: python3 mnist.py --batch_num 10
-  gpuNum: 0
 useAnnotation: true
 multiPhase: false
......
@@ -14,7 +14,6 @@ assessor:
 trial:
   codeDir: ../../../examples/trials/mnist-keras
   command: python3 mnist-keras.py --num_train 200 --epochs 1
-  gpuNum: 0
 useAnnotation: false
 multiPhase: false
......
@@ -15,7 +15,6 @@ assessor:
 trial:
   codeDir: ../../../examples/trials/mnist-nested-search-space
   command: python3 mnist.py --batch_num 10
-  gpuNum: 0
 useAnnotation: false
 multiPhase: false
......
@@ -14,7 +14,6 @@ assessor:
 trial:
   codeDir: ../../../examples/trials/mnist-pytorch
   command: python3 mnist.py --epochs 1 --batch_num 10
-  gpuNum: 0
 useAnnotation: false
 multiPhase: false
......
@@ -14,7 +14,6 @@ assessor:
 trial:
   codeDir: ../../../examples/trials/mnist-tfv1
   command: python3 mnist.py --batch_num 10
-  gpuNum: 0
 useAnnotation: false
 multiPhase: false
......
 defaultTestCaseConfig:
-  launchCommand: nnictl create --config $configFile
+  launchCommand: nnictl create --config $configFile --debug
   stopCommand: nnictl stop
   experimentStatusCheck: True
   platform: linux darwin win32
@@ -22,7 +22,7 @@ testCases:
   validator:
   # launch command, default launch command is 'nnictl create --config $configFile'
-  launchCommand: nnictl create --config $configFile
+  launchCommand: nnictl create --config $configFile --debug
   # stop command, default stop command is 'nnictl stop', empty means no stop command
   stopCommand: nnictl stop
@@ -38,15 +38,24 @@ testCases:
 - name: mnist-tfv1
   configFile: test/config/examples/mnist-tfv1.yml
+  config:
+    maxTrialNum: 1
+    trialConcurrency: 1
 - name: mnist-keras
   configFile: test/config/examples/mnist-keras.yml
+  config:
+    maxTrialNum: 2
+    trialConcurrency: 1
 - name: mnist-pytorch
   configFile: test/config/examples/mnist-pytorch.yml
 - name: mnist-annotation
   configFile: test/config/examples/mnist-annotation.yml
+  config:
+    maxTrialNum: 1
+    trialConcurrency: 1
 - name: cifar10-pytorch
   configFile: test/config/examples/cifar10-pytorch.yml
......
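The new per-case `config` blocks (`maxTrialNum`, `trialConcurrency`) are presumably deep-merged into the example experiment YAML before launch by the test framework's `deep_update` helper. A minimal sketch of that kind of recursive override, with the exact merge semantics assumed for illustration only:

```python
def deep_update(base, overrides):
    """Recursively merge overrides into base: scalars are replaced, nested dicts are descended into."""
    for key, value in overrides.items():
        if isinstance(value, dict) and isinstance(base.get(key), dict):
            deep_update(base[key], value)
        else:
            base[key] = value
    return base

# Example: apply the per-case overrides from integration_tests.yml to a loaded experiment config.
experiment = {'maxTrialNum': 100, 'trialConcurrency': 4, 'trial': {'gpuNum': 0}}
case_config = {'maxTrialNum': 1, 'trialConcurrency': 1}
print(deep_update(experiment, case_config))
# {'maxTrialNum': 1, 'trialConcurrency': 1, 'trial': {'gpuNum': 0}}
```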
@@ -10,7 +10,7 @@ import sys
 import time
 import traceback

-from utils import is_experiment_done, get_experiment_id, get_nni_log_path, read_last_line, remove_files, setup_experiment, detect_port, snooze
+from utils import is_experiment_done, get_experiment_id, get_nni_log_path, read_last_line, remove_files, setup_experiment, detect_port, wait_for_port_available
 from utils import GREEN, RED, CLEAR, EXPERIMENT_URL

 NNI_SOURCE_DIR = '..'
@@ -71,7 +71,7 @@ def naive_test(args):
     assert assessor_result == expected, 'Bad assessor result'

     subprocess.run(['nnictl', 'stop'])
-    snooze()
+    wait_for_port_available(8080, 10)

 def stop_experiment_test(args):
     config_file = args.config
@@ -86,19 +86,20 @@ def stop_experiment_test(args):
     experiment_id = get_experiment_id(EXPERIMENT_URL)
     proc = subprocess.run(['nnictl', 'stop', experiment_id])
     assert proc.returncode == 0, '`nnictl stop %s` failed with code %d' % (experiment_id, proc.returncode)
-    snooze()
+    wait_for_port_available(8080, 10)
     assert not detect_port(8080), '`nnictl stop %s` failed to stop experiments' % experiment_id

     # test cmd `nnictl stop --port`
     proc = subprocess.run(['nnictl', 'stop', '--port', '8990'])
     assert proc.returncode == 0, '`nnictl stop %s` failed with code %d' % (experiment_id, proc.returncode)
-    snooze()
+    wait_for_port_available(8990, 10)
     assert not detect_port(8990), '`nnictl stop %s` failed to stop experiments' % experiment_id

     # test cmd `nnictl stop --all`
     proc = subprocess.run(['nnictl', 'stop', '--all'])
     assert proc.returncode == 0, '`nnictl stop --all` failed with code %d' % proc.returncode
-    snooze()
+    wait_for_port_available(8888, 10)
+    wait_for_port_available(8989, 10)
     assert not detect_port(8888) and not detect_port(8989), '`nnictl stop --all` failed to stop experiments'
......
@@ -15,7 +15,7 @@ import ruamel.yaml as yaml
 from utils import get_experiment_status, get_yml_content, dump_yml_content, get_experiment_id, \
     parse_max_duration_time, get_trial_stats, deep_update, print_trial_job_log, get_failed_trial_jobs, \
     get_experiment_dir, print_experiment_log
-from utils import GREEN, RED, CLEAR, STATUS_URL, TRIAL_JOBS_URL, EXPERIMENT_URL, REST_ENDPOINT, detect_port
+from utils import GREEN, RED, CLEAR, STATUS_URL, TRIAL_JOBS_URL, EXPERIMENT_URL, REST_ENDPOINT, wait_for_port_available
 import validators

 it_variables = {}
@@ -157,7 +157,7 @@ def launch_test(config_file, training_service, test_case_config):
             if num_failed > 0:
                 print('failed jobs: ', num_failed)
                 break
-            time.sleep(3)
+            time.sleep(1)
     except:
         print_experiment_log(experiment_id=experiment_id)
         raise
@@ -189,16 +189,6 @@ def case_included(name, cases):
             return True
     return False

-def wait_for_port_available(port, timeout):
-    begin_time = time.time()
-    while True:
-        if not detect_port(port):
-            return
-        if time.time() - begin_time > timeout:
-            msg = 'port {} is not available in {} seconds.'.format(port, timeout)
-            raise RuntimeError(msg)
-        time.sleep(5)
-
 def match_platform(test_case_config):
     return sys.platform in test_case_config['platform'].split(' ')
......
@@ -168,6 +168,13 @@ def detect_port(port):
     except:
         return False

-def snooze():
-    '''Sleep to make sure previous stopped exp has enough time to exit'''
-    time.sleep(6)
+def wait_for_port_available(port, timeout):
+    begin_time = time.time()
+    while True:
+        if not detect_port(port):
+            return
+        if time.time() - begin_time > timeout:
+            msg = 'port {} is not available in {} seconds.'.format(port, timeout)
+            raise RuntimeError(msg)
+        time.sleep(1)
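For context, the new `wait_for_port_available` polls `detect_port`, which is defined just above this hunk but not shown here. A minimal sketch of what such a port check typically looks like, assumed for illustration only since the real helper may differ:

```python
import socket

def detect_port(port):
    """Return True if something is listening on localhost:port (assumed sketch, not necessarily NNI's implementation)."""
    try:
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            sock.settimeout(1)
            # connect_ex returns 0 when the connection succeeds, i.e. the port is still in use.
            return sock.connect_ex(('127.0.0.1', port)) == 0
    except OSError:
        return False
```

The design change itself is that a fixed `time.sleep(6)` after `nnictl stop` is replaced by polling until the REST port is actually released, failing fast with a clear error if it is not released within the timeout.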
jobs:
- job: "integration_test_remote_linux_to_windows"
  timeoutInMinutes: 120
  steps:
  - script: make clean
    displayName: "clean nni source code"
  - task: CopyFilesOverSSH@0
    inputs:
      sshEndpoint: $(end_point)
      contents: |
        **
        !**/dist/**
        !**/node_modules/**
      targetFolder: /tmp/nnitest/$(Build.BuildId)
      overwrite: true
    displayName: "Copy all files to remote machine"
    timeoutInMinutes: 10
  - task: SSH@0
    inputs:
      sshEndpoint: $(end_point)
      runOptions: commands
      commands: cd "\tmp\nnitest\$(Build.BuildId)" && powershell.exe -command "conda activate l2w | .\uninstall.ps1 | .\install.ps1"
      failOnStdErr: false
    displayName: "install on remote windows"
  - script: python3 -m pip install --upgrade pip setuptools --user
    displayName: "Install python tools"
  - script: make easy-install
    displayName: "Install nni via source code"
  - script: |
      sudo apt-get install swig -y
      PATH=$HOME/.local/bin:$PATH nnictl package install --name=SMAC
      PATH=$HOME/.local/bin:$PATH nnictl package install --name=BOHB
    displayName: "Install dependencies for integration tests in remote mode"
  - script: |
      set -e
      cd test
      python3 nni_test/nnitest/generate_ts_config.py --ts remote --remote_user $(remote_user) --remote_host $(remote_host) \
        --remote_port $(remote_port) --remote_pwd $(remote_pwd) --nni_manager_ip $(nni_manager_ip)
      cat config/training_service.yml
      PATH=$HOME/.local/bin:$PATH python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts remote
    displayName: "integration test"
  - task: SSH@0
    inputs:
      sshEndpoint: $(end_point)
      runOptions: commands
      commands: rmdir /s /q "\\?\c:\tmp\nnitest\$(Build.BuildId)"
    condition: always()
    displayName: "clean up on remote server"
@@ -139,7 +139,9 @@ def set_remote_config(experiment_config, port, config_file_name):
     for i in range(len(request_data['machine_list'])):
         if isinstance(request_data['machine_list'][i].get('gpuIndices'), int):
             request_data['machine_list'][i]['gpuIndices'] = str(request_data['machine_list'][i].get('gpuIndices'))
-    response = rest_put(cluster_metadata_url(port), json.dumps(request_data), REST_TIME_OUT)
+    # This call has to connect to every remote machine, and each connection can take up to 30 seconds to time out,
+    # so use a longer timeout here.
+    response = rest_put(cluster_metadata_url(port), json.dumps(request_data), 60, True)
     err_message = ''
     if not response or not check_response(response):
         if response is not None:
......
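`rest_put` here is NNI's own wrapper, and its definition is not part of this hunk, so the sketch below is only an assumption about its shape: a PUT helper that takes an explicit timeout in seconds plus an error-printing flag, matching how it is called above.

```python
import requests

def rest_put(url, data, timeout, show_error=False):
    """Hypothetical sketch of a PUT helper: send a JSON payload and return the response, or None on failure."""
    try:
        return requests.put(
            url,
            headers={'Content-Type': 'application/json'},
            data=data,        # already a JSON string, e.g. json.dumps(request_data)
            timeout=timeout,  # 60 seconds leaves room for several 30-second SSH connection timeouts
        )
    except requests.RequestException as exc:
        if show_error:
            print('REST PUT failed:', exc)
        return None
```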
@@ -227,7 +227,7 @@ def stop_experiment(args):
     experiment_config = Experiments()
     experiment_dict = experiment_config.get_all_experiments()
     for experiment_id in experiment_id_list:
-        print_normal('Stoping experiment %s' % experiment_id)
+        print_normal('Stopping experiment %s' % experiment_id)
         nni_config = Config(experiment_dict[experiment_id]['fileName'])
         rest_pid = nni_config.get_config('restServerPid')
         if rest_pid:
......
@@ -7,8 +7,6 @@ API_ROOT_URL = '/api/v1/nni-pai'
 BASE_URL = 'http://{}'

-HOME_DIR = os.path.join(os.environ['HOME'], 'nni')
-
 LOG_DIR = os.environ['NNI_OUTPUT_DIR']

 NNI_PLATFORM = os.environ['NNI_PLATFORM']
......
@@ -2,23 +2,27 @@
 # Licensed under the MIT license.

 import argparse
-import os
-from subprocess import Popen
-import time
+import ctypes
+import json
 import logging
-import shlex
+import os
 import re
+import shlex
 import sys
-import json
 import threading
-from pyhdfs import HdfsClient
+import time
+from subprocess import Popen

 import pkg_resources
-from .rest_utils import rest_post, rest_get
-from .url_utils import gen_send_version_url, gen_parameter_meta_url
-from .constants import LOG_DIR, NNI_PLATFORM, MULTI_PHASE, NNI_TRIAL_JOB_ID, NNI_SYS_DIR, NNI_EXP_ID
-from .hdfsClientUtility import copyDirectoryToHdfs, copyHdfsDirectoryToLocal, copyHdfsFileToLocal
-from .log_utils import LogType, nni_log, RemoteLogger, StdOutputType
+from pyhdfs import HdfsClient
+
+from .constants import (LOG_DIR, MULTI_PHASE, NNI_EXP_ID, NNI_PLATFORM,
+                        NNI_SYS_DIR, NNI_TRIAL_JOB_ID)
+from .hdfsClientUtility import (copyDirectoryToHdfs, copyHdfsDirectoryToLocal,
+                                copyHdfsFileToLocal)
+from .log_utils import LogType, RemoteLogger, StdOutputType, nni_log
+from .rest_utils import rest_get, rest_post
+from .url_utils import gen_parameter_meta_url, gen_send_version_url

 logger = logging.getLogger('trial_keeper')

 regular = re.compile('v?(?P<version>[0-9](\.[0-9]){0,1}).*')
@@ -80,6 +84,10 @@ def main_loop(args):
     if hdfs_client is not None:
         copyHdfsDirectoryToLocal(args.nni_hdfs_exp_dir, os.getcwd(), hdfs_client)

+    if args.job_id_file:
+        with open(args.job_id_file, 'w') as job_file:
+            job_file.write("%d" % os.getpid())
+
     # Notice: we don't pass env, so the subprocess will inherit the current environment, which is the expected behavior.
     log_pipe_stdout = trial_syslogger_stdout.get_pipelog_reader()
     process = Popen(args.trial_command, shell=True, stdout=log_pipe_stdout, stderr=log_pipe_stdout)
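The new `--job_id_file` simply records the trial keeper's PID; presumably whatever launches the trial keeper reads the file back to operate on or monitor the job. A rough sketch of such a consumer is below; the use of `psutil` is my illustration, not necessarily how NNI's manager does it.

```python
import psutil

def is_trial_keeper_alive(job_id_file):
    """Read the PID written by trial_keeper and check whether that process still exists."""
    with open(job_id_file) as f:
        pid = int(f.read().strip())
    return psutil.pid_exists(pid)

# Example: poll the file written via `--job_id_file /tmp/job_id` (hypothetical path).
# print(is_trial_keeper_alive('/tmp/job_id'))
```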
@@ -91,6 +99,9 @@ def main_loop(args):
         retCode = process.poll()
         # child worker process exits and all stdout data is read
         if retCode is not None and log_pipe_stdout.set_process_exit() and log_pipe_stdout.is_read_completed == True:
+            # On Windows, an exit code of -1 is reported as 4294967295, which is larger than a c_long and raises OverflowError,
+            # so convert it to a signed 32-bit value.
+            retCode = ctypes.c_long(retCode).value
             nni_log(LogType.Info, 'subprocess terminated. Exit code is {}. Quit'.format(retCode))
             if hdfs_output_dir is not None:
                 # Copy local directory to hdfs for OpenPAI
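A quick, self-contained illustration of the signed/unsigned reinterpretation that the new `ctypes.c_long(retCode).value` line performs. `c_int32` is used below so the example behaves the same on every platform; on Windows, where this code path matters, a C `long` is 32 bits, so `c_long` does the same thing there.

```python
import ctypes

# A process that exits with -1 is reported as the unsigned 32-bit value 4294967295 on Windows.
unsigned_code = 4294967295

# Reinterpret it as a signed 32-bit integer (two's complement); ctypes does no overflow checking, it truncates.
signed_code = ctypes.c_int32(unsigned_code).value
print(signed_code)  # -1

# The same reinterpretation written out arithmetically:
assert signed_code == unsigned_code - 2**32
```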
@@ -218,6 +229,7 @@ if __name__ == '__main__':
     PARSER.add_argument('--webhdfs_path', type=str, help='the webhdfs path used in webhdfs URL')
     PARSER.add_argument('--nni_manager_version', type=str, help='the nni version transmitted from nniManager')
     PARSER.add_argument('--log_collection', type=str, help='set the way to collect log in trialkeeper')
+    PARSER.add_argument('--job_id_file', type=str, help='set job id file for operating and monitoring job.')
     args, unknown = PARSER.parse_known_args()
     if args.trial_command is None:
         exit(1)
......