Unverified Commit f548d82f authored by SparkSnail's avatar SparkSnail Committed by GitHub
Browse files

Merge pull request #250 from microsoft/master

merge master
parents 0a742aff 69cae211
......@@ -13,7 +13,6 @@ assessor:
trial:
codeDir: ../../../examples/trials/mnist-annotation
command: python3 mnist.py --batch_num 10
gpuNum: 0
useAnnotation: true
multiPhase: false
......
......@@ -14,7 +14,6 @@ assessor:
trial:
codeDir: ../../../examples/trials/mnist-keras
command: python3 mnist-keras.py --num_train 200 --epochs 1
gpuNum: 0
useAnnotation: false
multiPhase: false
......
......@@ -15,7 +15,6 @@ assessor:
trial:
codeDir: ../../../examples/trials/mnist-nested-search-space
command: python3 mnist.py --batch_num 10
gpuNum: 0
useAnnotation: false
multiPhase: false
......
......@@ -14,7 +14,6 @@ assessor:
trial:
codeDir: ../../../examples/trials/mnist-pytorch
command: python3 mnist.py --epochs 1 --batch_num 10
gpuNum: 0
useAnnotation: false
multiPhase: false
......
......@@ -14,7 +14,6 @@ assessor:
trial:
codeDir: ../../../examples/trials/mnist-tfv1
command: python3 mnist.py --batch_num 10
gpuNum: 0
useAnnotation: false
multiPhase: false
......
defaultTestCaseConfig:
launchCommand: nnictl create --config $configFile
launchCommand: nnictl create --config $configFile --debug
stopCommand: nnictl stop
experimentStatusCheck: True
platform: linux darwin win32
......@@ -22,7 +22,7 @@ testCases:
validator:
# launch command, default launch command is 'nnictl create --config $configFile'
launchCommand: nnictl create --config $configFile
launchCommand: nnictl create --config $configFile --debug
# stop command, default stop command is 'nnictl stop', empty means no stop command
stopCommand: nnictl stop
......@@ -38,15 +38,24 @@ testCases:
- name: mnist-tfv1
configFile: test/config/examples/mnist-tfv1.yml
config:
maxTrialNum: 1
trialConcurrency: 1
- name: mnist-keras
configFile: test/config/examples/mnist-keras.yml
config:
maxTrialNum: 2
trialConcurrency: 1
- name: mnist-pytorch
configFile: test/config/examples/mnist-pytorch.yml
- name: mnist-annotation
configFile: test/config/examples/mnist-annotation.yml
config:
maxTrialNum: 1
trialConcurrency: 1
- name: cifar10-pytorch
configFile: test/config/examples/cifar10-pytorch.yml
......
......@@ -10,7 +10,7 @@ import sys
import time
import traceback
from utils import is_experiment_done, get_experiment_id, get_nni_log_path, read_last_line, remove_files, setup_experiment, detect_port, snooze
from utils import is_experiment_done, get_experiment_id, get_nni_log_path, read_last_line, remove_files, setup_experiment, detect_port, wait_for_port_available
from utils import GREEN, RED, CLEAR, EXPERIMENT_URL
NNI_SOURCE_DIR = '..'
......@@ -71,7 +71,7 @@ def naive_test(args):
assert assessor_result == expected, 'Bad assessor result'
subprocess.run(['nnictl', 'stop'])
snooze()
wait_for_port_available(8080, 10)
def stop_experiment_test(args):
config_file = args.config
......@@ -86,19 +86,20 @@ def stop_experiment_test(args):
experiment_id = get_experiment_id(EXPERIMENT_URL)
proc = subprocess.run(['nnictl', 'stop', experiment_id])
assert proc.returncode == 0, '`nnictl stop %s` failed with code %d' % (experiment_id, proc.returncode)
snooze()
wait_for_port_available(8080, 10)
assert not detect_port(8080), '`nnictl stop %s` failed to stop experiments' % experiment_id
# test cmd `nnictl stop --port`
proc = subprocess.run(['nnictl', 'stop', '--port', '8990'])
assert proc.returncode == 0, '`nnictl stop %s` failed with code %d' % (experiment_id, proc.returncode)
snooze()
wait_for_port_available(8990, 10)
assert not detect_port(8990), '`nnictl stop %s` failed to stop experiments' % experiment_id
# test cmd `nnictl stop --all`
proc = subprocess.run(['nnictl', 'stop', '--all'])
assert proc.returncode == 0, '`nnictl stop --all` failed with code %d' % proc.returncode
snooze()
wait_for_port_available(8888, 10)
wait_for_port_available(8989, 10)
assert not detect_port(8888) and not detect_port(8989), '`nnictl stop --all` failed to stop experiments'
......
......@@ -15,7 +15,7 @@ import ruamel.yaml as yaml
from utils import get_experiment_status, get_yml_content, dump_yml_content, get_experiment_id, \
parse_max_duration_time, get_trial_stats, deep_update, print_trial_job_log, get_failed_trial_jobs, \
get_experiment_dir, print_experiment_log
from utils import GREEN, RED, CLEAR, STATUS_URL, TRIAL_JOBS_URL, EXPERIMENT_URL, REST_ENDPOINT, detect_port
from utils import GREEN, RED, CLEAR, STATUS_URL, TRIAL_JOBS_URL, EXPERIMENT_URL, REST_ENDPOINT, wait_for_port_available
import validators
it_variables = {}
......@@ -157,7 +157,7 @@ def launch_test(config_file, training_service, test_case_config):
if num_failed > 0:
print('failed jobs: ', num_failed)
break
time.sleep(3)
time.sleep(1)
except:
print_experiment_log(experiment_id=experiment_id)
raise
......@@ -189,16 +189,6 @@ def case_included(name, cases):
return True
return False
def wait_for_port_available(port, timeout):
    """Block until `port` is no longer in use.

    Polls `detect_port` every 5 seconds and returns as soon as the port is free.

    Args:
        port: TCP port number to wait on.
        timeout: maximum number of seconds to wait.

    Raises:
        RuntimeError: if the port is still occupied after `timeout` seconds.
    """
    # Use a monotonic clock so the deadline is immune to wall-clock adjustments
    # (NTP steps, DST changes), which time.time() is not.
    begin_time = time.monotonic()
    while True:
        if not detect_port(port):
            return
        if time.monotonic() - begin_time > timeout:
            msg = 'port {} is not available in {} seconds.'.format(port, timeout)
            raise RuntimeError(msg)
        time.sleep(5)
def match_platform(test_case_config):
    """Return True when the current OS appears in the test case's `platform` field."""
    supported_platforms = test_case_config['platform'].split(' ')
    return sys.platform in supported_platforms
......
......@@ -168,6 +168,13 @@ def detect_port(port):
except:
return False
def snooze():
    '''Sleep to give a previously stopped experiment enough time to fully exit'''
    grace_period = 6  # seconds
    time.sleep(grace_period)
def wait_for_port_available(port, timeout):
    """Block until `port` is no longer in use.

    Polls `detect_port` every second and returns as soon as the port is free.

    Args:
        port: TCP port number to wait on.
        timeout: maximum number of seconds to wait.

    Raises:
        RuntimeError: if the port is still occupied after `timeout` seconds.
    """
    # Use a monotonic clock so the deadline is immune to wall-clock adjustments
    # (NTP steps, DST changes), which time.time() is not.
    begin_time = time.monotonic()
    while True:
        if not detect_port(port):
            return
        if time.monotonic() - begin_time > timeout:
            msg = 'port {} is not available in {} seconds.'.format(port, timeout)
            raise RuntimeError(msg)
        time.sleep(1)
jobs:
- job: "integration_test_remote_linux_to_windows"
timeoutInMinutes: 120
steps:
- script: make clean
displayName: "clean nni source code"
- task: CopyFilesOverSSH@0
inputs:
sshEndpoint: $(end_point)
contents: |
**
!**/dist/**
!**/node_modules/**
targetFolder: /tmp/nnitest/$(Build.BuildId)
overwrite: true
displayName: "Copy all files to remote machine"
timeoutInMinutes: 10
- task: SSH@0
inputs:
sshEndpoint: $(end_point)
runOptions: commands
commands: cd "\tmp\nnitest\$(Build.BuildId)" && powershell.exe -command "conda activate l2w | .\uninstall.ps1 | .\install.ps1"
failOnStdErr: false
displayName: "install on remote windows"
- script: python3 -m pip install --upgrade pip setuptools --user
displayName: "Install python tools"
- script: make easy-install
displayName: "Install nni via source code"
- script: |
sudo apt-get install swig -y
PATH=$HOME/.local/bin:$PATH nnictl package install --name=SMAC
PATH=$HOME/.local/bin:$PATH nnictl package install --name=BOHB
displayName: "Install dependencies for integration tests in remote mode"
- script: |
set -e
cd test
python3 nni_test/nnitest/generate_ts_config.py --ts remote --remote_user $(remote_user) --remote_host $(remote_host) \
--remote_port $(remote_port) --remote_pwd $(remote_pwd) --nni_manager_ip $(nni_manager_ip)
cat config/training_service.yml
PATH=$HOME/.local/bin:$PATH python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts remote
displayName: "integration test"
- task: SSH@0
inputs:
sshEndpoint: $(end_point)
runOptions: commands
commands: rmdir /s /q "\\?\c:\tmp\nnitest\$(Build.BuildId)"
condition: always()
displayName: "clean up on remote server"
......@@ -139,7 +139,9 @@ def set_remote_config(experiment_config, port, config_file_name):
for i in range(len(request_data['machine_list'])):
if isinstance(request_data['machine_list'][i].get('gpuIndices'), int):
request_data['machine_list'][i]['gpuIndices'] = str(request_data['machine_list'][i].get('gpuIndices'))
response = rest_put(cluster_metadata_url(port), json.dumps(request_data), REST_TIME_OUT)
# It needs to connect to all remote machines, and the connection timeout is 30 seconds,
# so the timeout used here should be longer.
response = rest_put(cluster_metadata_url(port), json.dumps(request_data), 60, True)
err_message = ''
if not response or not check_response(response):
if response is not None:
......
......@@ -227,7 +227,7 @@ def stop_experiment(args):
experiment_config = Experiments()
experiment_dict = experiment_config.get_all_experiments()
for experiment_id in experiment_id_list:
print_normal('Stoping experiment %s' % experiment_id)
print_normal('Stopping experiment %s' % experiment_id)
nni_config = Config(experiment_dict[experiment_id]['fileName'])
rest_pid = nni_config.get_config('restServerPid')
if rest_pid:
......
......@@ -7,8 +7,6 @@ API_ROOT_URL = '/api/v1/nni-pai'
BASE_URL = 'http://{}'
HOME_DIR = os.path.join(os.environ['HOME'], 'nni')
LOG_DIR = os.environ['NNI_OUTPUT_DIR']
NNI_PLATFORM = os.environ['NNI_PLATFORM']
......
......@@ -2,23 +2,27 @@
# Licensed under the MIT license.
import argparse
import os
from subprocess import Popen
import time
import ctypes
import json
import logging
import shlex
import os
import re
import shlex
import sys
import json
import threading
from pyhdfs import HdfsClient
import time
from subprocess import Popen
import pkg_resources
from .rest_utils import rest_post, rest_get
from .url_utils import gen_send_version_url, gen_parameter_meta_url
from pyhdfs import HdfsClient
from .constants import LOG_DIR, NNI_PLATFORM, MULTI_PHASE, NNI_TRIAL_JOB_ID, NNI_SYS_DIR, NNI_EXP_ID
from .hdfsClientUtility import copyDirectoryToHdfs, copyHdfsDirectoryToLocal, copyHdfsFileToLocal
from .log_utils import LogType, nni_log, RemoteLogger, StdOutputType
from .constants import (LOG_DIR, MULTI_PHASE, NNI_EXP_ID, NNI_PLATFORM,
NNI_SYS_DIR, NNI_TRIAL_JOB_ID)
from .hdfsClientUtility import (copyDirectoryToHdfs, copyHdfsDirectoryToLocal,
copyHdfsFileToLocal)
from .log_utils import LogType, RemoteLogger, StdOutputType, nni_log
from .rest_utils import rest_get, rest_post
from .url_utils import gen_parameter_meta_url, gen_send_version_url
logger = logging.getLogger('trial_keeper')
regular = re.compile('v?(?P<version>[0-9](\.[0-9]){0,1}).*')
......@@ -80,6 +84,10 @@ def main_loop(args):
if hdfs_client is not None:
copyHdfsDirectoryToLocal(args.nni_hdfs_exp_dir, os.getcwd(), hdfs_client)
if args.job_id_file:
with open(args.job_id_file, 'w') as job_file:
job_file.write("%d" % os.getpid())
# Notice: We don't specify env, which means the subprocess will inherit the current environment, and that is the expected behavior
log_pipe_stdout = trial_syslogger_stdout.get_pipelog_reader()
process = Popen(args.trial_command, shell=True, stdout=log_pipe_stdout, stderr=log_pipe_stdout)
......@@ -91,6 +99,9 @@ def main_loop(args):
retCode = process.poll()
# child worker process exits and all stdout data is read
if retCode is not None and log_pipe_stdout.set_process_exit() and log_pipe_stdout.is_read_completed == True:
# In Windows, the retCode -1 is 4294967295. It's larger than c_long and raises OverflowError.
# So convert it to int32.
retCode = ctypes.c_long(retCode).value
nni_log(LogType.Info, 'subprocess terminated. Exit code is {}. Quit'.format(retCode))
if hdfs_output_dir is not None:
# Copy local directory to hdfs for OpenPAI
......@@ -218,6 +229,7 @@ if __name__ == '__main__':
PARSER.add_argument('--webhdfs_path', type=str, help='the webhdfs path used in webhdfs URL')
PARSER.add_argument('--nni_manager_version', type=str, help='the nni version transmitted from nniManager')
PARSER.add_argument('--log_collection', type=str, help='set the way to collect log in trialkeeper')
PARSER.add_argument('--job_id_file', type=str, help='set job id file for operating and monitoring job.')
args, unknown = PARSER.parse_known_args()
if args.trial_command is None:
exit(1)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment