Unverified Commit 3ec26b40 authored by liuzhe-lz's avatar liuzhe-lz Committed by GitHub
Browse files

Merge master into dev-retiarii (#3178)

parent d165905d
......@@ -6,6 +6,8 @@ import os
import threading
from enum import Enum
_logger = logging.getLogger(__name__)
class CommandType(Enum):
# in
......@@ -32,7 +34,7 @@ try:
_in_file = open(3, 'rb')
_out_file = open(4, 'wb')
except OSError:
pass
_logger.debug('IPC pipeline not exists')
def send(command, data):
......@@ -45,7 +47,7 @@ def send(command, data):
_lock.acquire()
data = data.encode('utf8')
msg = b'%b%014d%b' % (command.value, len(data), data)
logging.getLogger(__name__).debug('Sending command, data: [%s]', msg)
_logger.debug('Sending command, data: [%s]', msg)
_out_file.write(msg)
_out_file.flush()
finally:
......@@ -57,14 +59,14 @@ def receive():
Returns a tuple of command (CommandType) and payload (str)
"""
header = _in_file.read(16)
logging.getLogger(__name__).debug('Received command, header: [%s]', header)
_logger.debug('Received command, header: [%s]', header)
if header is None or len(header) < 16:
# Pipe EOF encountered
logging.getLogger(__name__).debug('Pipe EOF encountered')
_logger.debug('Pipe EOF encountered')
return None, None
length = int(header[2:])
data = _in_file.read(length)
command = CommandType(header[:2])
data = data.decode('utf8')
logging.getLogger(__name__).debug('Received command, data: [%s]', data)
_logger.debug('Received command, data: [%s]', data)
return command, data
......@@ -85,7 +85,6 @@ def start_rest_server(port, platform, mode, experiment_id, foreground=False, log
log_header = LOG_HEADER % str(time_now)
stdout_file.write(log_header)
stderr_file.write(log_header)
print('## [nnictl] cmds:', cmds)
if sys.platform == 'win32':
from subprocess import CREATE_NEW_PROCESS_GROUP
if foreground:
......@@ -388,8 +387,6 @@ def set_experiment(experiment_config, mode, port, config_file_name):
{'key': 'aml_config', 'value': experiment_config['amlConfig']})
request_data['clusterMetaData'].append(
{'key': 'trial_config', 'value': experiment_config['trial']})
print('## experiment config:')
print(request_data)
response = rest_post(experiment_url(port), json.dumps(request_data), REST_TIME_OUT, show_error=True)
if check_response(response):
return response
......
......@@ -63,14 +63,16 @@ def parse_path(experiment_config, config_path):
if experiment_config['trial'].get('paiConfigPath'):
expand_path(experiment_config['trial'], 'paiConfigPath')
#if users use relative path, convert it to absolute path
# If users use relative path, convert it to absolute path.
root_path = os.path.dirname(config_path)
if experiment_config.get('searchSpacePath'):
parse_relative_path(root_path, experiment_config, 'searchSpacePath')
if experiment_config.get('logDir'):
parse_relative_path(root_path, experiment_config, 'logDir')
if experiment_config.get('trial'):
parse_relative_path(root_path, experiment_config['trial'], 'codeDir')
# In AdaptDL mode, 'codeDir' shouldn't be parsed because it points to the path in the container.
if experiment_config.get('trainingServicePlatform') != 'adl':
parse_relative_path(root_path, experiment_config['trial'], 'codeDir')
if experiment_config['trial'].get('authFile'):
parse_relative_path(root_path, experiment_config['trial'], 'authFile')
if experiment_config['trial'].get('ps'):
......
......@@ -50,7 +50,7 @@ def parse_args():
# parse start command
parser_start = subparsers.add_parser('create', help='create a new experiment')
parser_start.add_argument('--config', '-c', required=True, dest='config', help='the path of yaml config file')
parser_start.add_argument('--port', '-p', default=DEFAULT_REST_PORT, dest='port', help='the port of restful server')
parser_start.add_argument('--port', '-p', default=DEFAULT_REST_PORT, dest='port', type=int, help='the port of restful server')
parser_start.add_argument('--debug', '-d', action='store_true', help=' set debug mode')
parser_start.add_argument('--foreground', '-f', action='store_true', help=' set foreground mode, print log content to terminal')
parser_start.set_defaults(func=create_experiment)
......@@ -58,7 +58,7 @@ def parse_args():
# parse resume command
parser_resume = subparsers.add_parser('resume', help='resume a new experiment')
parser_resume.add_argument('id', nargs='?', help='The id of the experiment you want to resume')
parser_resume.add_argument('--port', '-p', default=DEFAULT_REST_PORT, dest='port', help='the port of restful server')
parser_resume.add_argument('--port', '-p', default=DEFAULT_REST_PORT, dest='port', type=int, help='the port of restful server')
parser_resume.add_argument('--debug', '-d', action='store_true', help=' set debug mode')
parser_resume.add_argument('--foreground', '-f', action='store_true', help=' set foreground mode, print log content to terminal')
parser_resume.set_defaults(func=resume_experiment)
......@@ -66,7 +66,7 @@ def parse_args():
# parse view command
parser_view = subparsers.add_parser('view', help='view a stopped experiment')
parser_view.add_argument('id', nargs='?', help='The id of the experiment you want to view')
parser_view.add_argument('--port', '-p', default=DEFAULT_REST_PORT, dest='port', help='the port of restful server')
parser_view.add_argument('--port', '-p', default=DEFAULT_REST_PORT, dest='port', type=int, help='the port of restful server')
parser_view.set_defaults(func=view_experiment)
# parse update command
......@@ -93,7 +93,7 @@ def parse_args():
#parse stop command
parser_stop = subparsers.add_parser('stop', help='stop the experiment')
parser_stop.add_argument('id', nargs='?', help='the id of experiment, use \'all\' to stop all running experiments')
parser_stop.add_argument('--port', '-p', dest='port', help='the port of restful server')
parser_stop.add_argument('--port', '-p', dest='port', type=int, help='the port of restful server')
parser_stop.add_argument('--all', '-a', action='store_true', help='stop all of experiments')
parser_stop.set_defaults(func=stop_experiment)
......@@ -239,7 +239,7 @@ def parse_args():
parser_tensorboard_start = parser_tensorboard_subparsers.add_parser('start', help='start tensorboard')
parser_tensorboard_start.add_argument('id', nargs='?', help='the id of experiment')
parser_tensorboard_start.add_argument('--trial_id', '-T', dest='trial_id', help='the id of trial')
parser_tensorboard_start.add_argument('--port', dest='port', default=6006, help='the port to start tensorboard')
parser_tensorboard_start.add_argument('--port', dest='port', default=6006, type=int, help='the port to start tensorboard')
parser_tensorboard_start.set_defaults(func=start_tensorboard)
parser_tensorboard_stop = parser_tensorboard_subparsers.add_parser('stop', help='stop tensorboard')
parser_tensorboard_stop.add_argument('id', nargs='?', help='the id of experiment')
......
......@@ -129,7 +129,7 @@ def parse_ids(args):
return running_experiment_list
if args.port is not None:
for key in running_experiment_list:
if str(experiment_dict[key]['port']) == args.port:
if experiment_dict[key]['port'] == args.port:
result_list.append(key)
if args.id and result_list and args.id != result_list[0]:
print_error('Experiment id and resful server port not match')
......
......@@ -9,8 +9,6 @@ from .common_utils import print_error
def rest_put(url, data, timeout, show_error=False):
'''Call rest put method'''
print('## [nnictl] PUT', url)
print(data)
try:
response = requests.put(url, headers={'Accept': 'application/json', 'Content-Type': 'application/json'},\
data=data, timeout=timeout)
......@@ -22,8 +20,6 @@ def rest_put(url, data, timeout, show_error=False):
def rest_post(url, data, timeout, show_error=False):
'''Call rest post method'''
print('## [nnictl] POST', url)
print(data)
try:
response = requests.post(url, headers={'Accept': 'application/json', 'Content-Type': 'application/json'},\
data=data, timeout=timeout)
......
......@@ -134,7 +134,6 @@ def start_tensorboard(args):
if experiment_dict[args.id]["status"] == "STOPPED":
print_error("Experiment {} is stopped...".format(args.id))
return
config_file_name = experiment_dict[experiment_id]['fileName']
nni_config = Config(args.id)
if nni_config.get_config('experimentConfig').get('trainingServicePlatform') == 'adl':
adl_tensorboard_helper(args)
......
trigger: none
pr: none
schedules:
- cron: 0 16 * * *
branches:
include: [ master ]
jobs:
- job: linux
pool: NNI CI GPU3
timeoutInMinutes: 120
steps:
- script: |
echo "##vso[task.setvariable variable=PATH]${PATH}:${HOME}/.local/bin"
echo "##vso[task.setvariable variable=NNI_RELEASE]999.$(date -u +%Y%m%d%H%M%S)"
python3 -m pip install -U --upgrade pip setuptools
python3 -m pip install -U pytest
displayName: Prepare
- script: |
set -e
python3 setup.py build_ts
python3 setup.py bdist_wheel -p manylinux1_x86_64
python3 -m pip install dist/nni-${NNI_RELEASE}-py3-none-manylinux1_x86_64.whl
displayName: Install NNI
- script: |
set -e
python3 -m pip install -U scikit-learn==0.23.2
python3 -m pip install -U torchvision==0.4.2
python3 -m pip install -U torch==1.3.1
python3 -m pip install -U keras==2.1.6
python3 -m pip install -U tensorflow==2.3.1 tensorflow-estimator==2.3.0
python3 -m pip install -U thop
sudo apt-get install swig -y
nnictl package install --name=SMAC
nnictl package install --name=BOHB
nnictl package install --name=PPOTuner
displayName: Install extra dependencies
- script: |
set -e
cd examples/tuners/customized_tuner
python3 setup.py develop --user
nnictl package install .
displayName: Install customized tuner
- script: |
set -e
(cd test && python3 -m pytest ut)
export PATH=$PATH:$PWD/toolchain/yarn/bin
export CI=true
(cd ts/nni_manager && yarn test)
(cd ts/nasui && yarn test)
displayName: Unit test
continueOnError: true
- script: |
cd test
python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts local
displayName: Integration test
continueOnError: true
- script: |
cd test
source scripts/nas.sh
displayName: NAS test
continueOnError: true
- script: |
cd test
source scripts/model_compression.sh
displayName: Model compression test
trigger: none
pr: none
schedules:
- cron: 0 16 * * *
branches:
include: [ master ]
jobs:
- job: local_windows
pool: NNI CI WINDOWS2
timeoutInMinutes: 120
steps:
- script: |
python -m pip install -U --upgrade pip setuptools
python -m pip install -U pytest
displayName: Install Python tools
- script: |
python -m pip uninstall nni --yes
set NNI_RELEASE=999.0
python setup.py build_ts
python setup.py bdist_wheel -p win_amd64
python -m pip install dist/nni-999.0-py3-none-win_amd64.whl
displayName: Install NNI
- script: |
python -m pip install -U scikit-learn==0.23.2
python -m pip install -U keras==2.1.6
python -m pip install -U torchvision===0.4.1 torch===1.3.1 -f https://download.pytorch.org/whl/torch_stable.html
python -m pip install -U tensorflow==2.3.1 tensorflow-estimator==2.3.0
nnictl package install --name=PPOTuner
displayName: Install extra dependencies
- script: |
cd examples/tuners/customized_tuner
python setup.py develop --user
nnictl package install .
displayName: Install example customized tuner
- script: |
cd test
python -m pytest ut
echo "TODO: TypeScript UT"
displayName: Unit test
continueOnError: true
- script: |
cd test
python nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts local
displayName: Integration test
trigger: none
pr: none
schedules:
- cron: 0 16 * * *
branches:
include: [ master ]
# variables set on VSO: (mostly for security concern)
# pai_user
# pai_token
# manager_ip
# docker_hub_password
jobs:
- job: pai
pool: NNI CI PAI CLI
timeoutInMinutes: 120
steps:
- script: |
export NNI_RELEASE=999.$(date -u +%Y%m%d%H%M%S)
echo "##vso[task.setvariable variable=PATH]${PATH}:${HOME}/.local/bin"
echo "##vso[task.setvariable variable=NNI_RELEASE]${NNI_RELEASE}"
echo "Working directory: ${PWD}"
echo "NNI version: ${NNI_RELEASE}"
echo "Build docker image: $(build_docker_image)"
python3 -m pip install -U --upgrade pip setuptools
displayName: Prepare
- script: |
set -e
python3 setup.py build_ts
python3 setup.py bdist_wheel -p manylinux1_x86_64
python3 -m pip install -U dist/nni-${NNI_RELEASE}-py3-none-manylinux1_x86_64.whl
displayName: Build and install NNI
- script: |
set -e
sudo apt-get install swig -y
nnictl package install --name=SMAC
nnictl package install --name=BOHB
displayName: Install extra tuners
- script: |
set -e
cd examples/tuners/customized_tuner
python3 setup.py develop --user
nnictl package install .
displayName: Install customized tuner
- script: |
set -e
docker login -u nnidev -p $(docker_hub_password)
echo '## Build docker image ##'
docker build --build-arg NNI_RELEASE=${NNI_RELEASE} -t nnidev/nni-it-pai:latest .
echo '## Upload docker image ##'
docker push nnidev/nni-it-pai:latest
condition: eq(variables['build_docker_image'], 'true')
displayName: Build and upload docker image
- script: |
set -e
cd test
python3 nni_test/nnitest/generate_ts_config.py \
--ts pai \
--pai_reuse false \
--pai_host https://ne.openpai.org \
--pai_user $(pai_user) \
--nni_docker_image nnidev/nni-it-pai:latest \
--pai_storage_config_name confignfs-data \
--pai_token $(pai_token) \
--nni_manager_nfs_mount_path /home/quzha/mnt-pai-ne/shinyang3 \
--container_nfs_mount_path /mnt/confignfs-data/shinyang3 \
--nni_manager_ip $(manager_ip) \
--vc nni
python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts pai
displayName: Integration test
- script: |
set -e
cd test
python3 nni_test/nnitest/generate_ts_config.py \
--ts pai \
--pai_reuse true \
--pai_host https://ne.openpai.org \
--pai_user $(pai_user) \
--nni_docker_image nnidev/nni-it-pai:latest \
--pai_storage_config_name confignfs-data \
--pai_token $(pai_token) \
--nni_manager_nfs_mount_path /home/quzha/mnt-pai-ne/shinyang3 \
--container_nfs_mount_path /mnt/confignfs-data/shinyang3 \
--nni_manager_ip $(manager_ip) \
--vc nni
python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts pai
displayName: Integration test (reuse mode)
trigger: none
pr: none
schedules:
- cron: 0 16 * * *
branches:
include: [ master ]
variables:
worker: remote_nni-ci-gpu-03
# variables set on VSO: (for security concern)
# manager_ip
# worker_ip
# password_in_docker
jobs:
- job: remote_linux2linux
pool: NNI CI REMOTE CLI
timeoutInMinutes: 120
steps:
- script: |
export NNI_RELEASE=999.$(date -u +%Y%m%d%H%M%S)
echo "##vso[task.setvariable variable=PATH]${PATH}:${HOME}/.local/bin"
echo "##vso[task.setvariable variable=NNI_RELEASE]${NNI_RELEASE}"
echo "Working directory: ${PWD}"
echo "NNI version: ${NNI_RELEASE}"
python3 -m pip install -U --upgrade pip setuptools
displayName: Prepare
- script: |
set -e
python3 setup.py build_ts
python3 setup.py bdist_wheel -p manylinux1_x86_64
python3 -m pip install dist/nni-${NNI_RELEASE}-py3-none-manylinux1_x86_64.whl
displayName: Install NNI
- script: |
set -e
sudo apt-get install swig -y
nnictl package install --name=SMAC
nnictl package install --name=BOHB
displayName: Install extra tuners
- script: |
set -e
cd examples/tuners/customized_tuner
python3 setup.py develop --user
nnictl package install .
displayName: Install customized tuner
- task: CopyFilesOverSSH@0
inputs:
sshEndpoint: $(worker)
sourceFolder: dist
targetFolder: /tmp/nnitest/$(Build.BuildId)/dist
overwrite: true
displayName: Copy wheel to remote machine
timeoutInMinutes: 10
- task: CopyFilesOverSSH@0
inputs:
sshEndpoint: $(worker)
sourceFolder: test
targetFolder: /tmp/nnitest/$(Build.BuildId)/test
overwrite: true
displayName: Copy test scripts to remote machine
timeoutInMinutes: 10
- task: SSH@0
inputs:
sshEndpoint: $(worker)
runOptions: commands
commands: |
python3 /tmp/nnitest/$(Build.BuildId)/test/nni_test/nnitest/remote_docker.py --mode start --name $(Build.BuildId) --image nni/nni
echo "##vso[task.setvariable variable=docker_port]$(cat /tmp/nnitest/$(Build.BuildId)/port)"
displayName: Start docker
- script: |
cd test
python3 nni_test/nnitest/generate_ts_config.py \
--ts remote \
--remote_reuse false \
--remote_user nni \
--remote_host $(worker_ip) \
--remote_port $(docker_port) \
--remote_pwd $(password_in_docker) \
--nni_manager_ip $(manager_ip)
python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts remote
displayName: Integration test
- script: |
cd test
python3 nni_test/nnitest/generate_ts_config.py \
--ts remote \
--remote_reuse true \
--remote_user nni \
--remote_host $(worker_ip) \
--remote_port $(docker_port) \
--remote_pwd $(password_in_docker) \
--nni_manager_ip $(manager_ip)
python3 nni_test/nnitest/run_tests.py --config config/integration_tests.yml --ts remote
displayName: Integration test (reuse mode)
- task: SSH@0
inputs:
sshEndpoint: $(worker)
runOptions: commands
commands: python3 /tmp/nnitest/$(Build.BuildId)/test/nni_test/nnitest/remote_docker.py --mode stop --name $(Build.BuildId)
displayName: Stop docker
......@@ -47,4 +47,4 @@ ignore-patterns=test*
# List of members which are set dynamically and missed by pylint inference
generated-members=numpy.*,torch.*,tensorflow.*
ignored-modules=tensorflow,_win32,msvcrt
ignored-modules=tensorflow,_winapi,msvcrt
......@@ -61,7 +61,6 @@ dependencies = [
'hyperopt==0.1.2',
'json_tricks',
'netifaces',
'numpy',
'psutil',
'ruamel.yaml',
'requests',
......@@ -74,10 +73,13 @@ dependencies = [
'pkginfo',
'websockets',
'filelock',
'prettytable'
'prettytable',
'dataclasses ; python_version < "3.7"',
'numpy < 1.19.4 ; sys_platform == "win32"',
'numpy < 1.20 ; sys_platform != "win32" and python_version < "3.7"',
'numpy ; sys.platform != "win32" and python_version >= "3.7"'
]
release = os.environ.get('NNI_RELEASE')
def _setup():
......@@ -102,7 +104,7 @@ def _setup():
packages = _find_python_packages(),
package_data = {
'nni': ['**/requirements.txt'],
'nni': _find_requirements_txt(), # must do this manually due to setuptools issue #1806
'nni_node': _find_node_files() # note: this does not work before building
},
......@@ -128,19 +130,26 @@ def _setup():
def _find_python_packages():
packages = []
for dirpath, dirnames, filenames in os.walk('nni'):
if '/__pycache__' not in dirpath:
if '/__pycache__' not in dirpath and '/.mypy_cache' not in dirpath:
packages.append(dirpath.replace('/', '.'))
return sorted(packages) + ['nni_node']
def _find_requirements_txt():
requirement_files = []
for dirpath, dirnames, filenames in os.walk('nni'):
if 'requirements.txt' in filenames:
requirement_files.append(os.path.join(dirpath[len('nni/'):], 'requirements.txt'))
return requirement_files
def _find_node_files():
if not os.path.exists('nni_node'):
if release and 'built_ts' not in sys.argv:
sys.exit('ERROR: To build a release version, run "python setup.py built_ts" first')
if release and 'build_ts' not in sys.argv:
sys.exit('ERROR: To build a release version, run "python setup.py build_ts" first')
return []
files = []
for dirpath, dirnames, filenames in os.walk('nni_node'):
for filename in filenames:
files.append((dirpath + '/' + filename)[len('nni_node/'):])
files.append(os.path.join(dirpath[len('nni_node/'):], filename))
if '__init__.py' in files:
files.remove('__init__.py')
return sorted(files)
......@@ -165,21 +174,24 @@ class BuildTs(Command):
class Build(build):
def run(self):
assert release, 'Please set environment variable "NNI_RELEASE=<release_version>"'
assert os.path.isfile('nni_node/main.js'), 'Please run "build_ts" before "build"'
assert not os.path.islink('nni_node/main.js'), 'This is a development build'
if not release:
sys.exit('Please set environment variable "NNI_RELEASE=<release_version>"')
if os.path.islink('nni_node/main.js'):
sys.exit('A development build already exists. Please uninstall NNI and run "python3 setup.py clean --all".')
super().run()
class Develop(develop):
user_options = develop.user_options + [
('no-user', None, 'Prevent automatically adding "--user"')
('no-user', None, 'Prevent automatically adding "--user"'),
('skip-ts', None, 'Prevent building TypeScript modules')
]
boolean_options = develop.boolean_options + ['no-user']
boolean_options = develop.boolean_options + ['no-user', 'skip-ts']
def initialize_options(self):
super().initialize_options()
self.no_user = None
self.skip_ts = None
def finalize_options(self):
# if `--user` or `--no-user` is explicitly set, do nothing
......@@ -189,7 +201,8 @@ class Develop(develop):
super().finalize_options()
def run(self):
setup_ts.build(release=None)
if not self.skip_ts:
setup_ts.build(release=None)
super().run()
class Clean(clean):
......@@ -224,4 +237,5 @@ _temp_files = [
]
_setup()
if __name__ == '__main__':
_setup()
authorName: nni
experimentName: default_test
maxExecDuration: 5m
maxTrialNum: 4
trialConcurrency: 2
searchSpacePath: ./mnist_search_space.json
tuner:
builtinTunerName: Random
assessor:
builtinAssessorName: Medianstop
classArgs:
optimize_mode: maximize
trial:
codeDir: ../../../examples/trials/mnist-tfv2
command: python3 mnist.py
useAnnotation: false
multiPhase: false
multiThread: false
trainingServicePlatform: local
......@@ -37,17 +37,12 @@ testCases:
- name: sklearn-regression
configFile: test/config/examples/sklearn-regression.yml
- name: mnist-tfv1
configFile: test/config/examples/mnist-tfv1.yml
- name: mnist-tensorflow
configFile: test/config/examples/mnist-tfv2.yml
config:
maxTrialNum: 1
trialConcurrency: 1
- name: mnist-keras
configFile: test/config/examples/mnist-keras.yml
config:
maxTrialNum: 2
trialConcurrency: 1
trainingService: local remote # FIXME: timeout on pai, looks like tensorflow failed to link CUDA
- name: mnist-pytorch-local
configFile: test/config/examples/mnist-pytorch.yml
......@@ -61,11 +56,12 @@ testCases:
launchCommand: nnictl create --config $configFile --debug
trainingService: remote pai kubeflow frameworkcontroller dlts
- name: mnist-annotation
configFile: test/config/examples/mnist-annotation.yml
config:
maxTrialNum: 1
trialConcurrency: 1
# TODO: move this and following commented test cases to pytorch or tf2
#- name: mnist-annotation
# configFile: test/config/examples/mnist-annotation.yml
# config:
# maxTrialNum: 1
# trialConcurrency: 1
- name: cifar10-pytorch
configFile: test/config/examples/cifar10-pytorch.yml
......@@ -79,8 +75,8 @@ testCases:
command: python3 main.py --epochs 1 --batches 1
gpuNum: 0
- name: nested-ss
configFile: test/config/examples/mnist-nested-search-space.yml
#- name: nested-ss
# configFile: test/config/examples/mnist-nested-search-space.yml
- name: classic-nas-gen-ss
configFile: test/config/examples/classic-nas-pytorch.yml
......@@ -147,8 +143,8 @@ testCases:
config:
maxTrialNum: 4
trialConcurrency: 4
launchCommand: python3 -c 'from nni.experiment import Experiment; exp = Experiment(); exp.start_experiment("$configFile")'
stopCommand: python3 -c 'from nni.experiment import Experiment; exp = Experiment(); exp.connect_experiment("http://localhost:8080/"); exp.stop_experiment()'
launchCommand: python3 -c 'from nni.experiment import ExternalExperiment as Experiment; exp = Experiment(); exp.start_experiment("$configFile")'
stopCommand: python3 -c 'from nni.experiment import ExternalExperiment as Experiment; exp = Experiment(); exp.connect_experiment("http://localhost:8080/"); exp.stop_experiment()'
validator:
class: NnicliValidator
platform: linux darwin
......@@ -215,40 +211,40 @@ testCases:
#########################################################################
# nni tuners test
#########################################################################
- name: tuner-annel
configFile: test/config/tuners/anneal.yml
#- name: tuner-annel
# configFile: test/config/tuners/anneal.yml
- name: tuner-evolution
configFile: test/config/tuners/evolution.yml
#- name: tuner-evolution
# configFile: test/config/tuners/evolution.yml
- name: tuner-random
configFile: test/config/tuners/random.yml
#- name: tuner-random
# configFile: test/config/tuners/random.yml
- name: tuner-smac
configFile: test/config/tuners/smac.yml
platform: linux darwin
#- name: tuner-smac
# configFile: test/config/tuners/smac.yml
# platform: linux darwin
- name: tuner-tpe
configFile: test/config/tuners/tpe.yml
#- name: tuner-tpe
# configFile: test/config/tuners/tpe.yml
- name: tuner-batch
configFile: test/config/tuners/batch.yml
#- name: tuner-batch
# configFile: test/config/tuners/batch.yml
- name: tuner-bohb
configFile: test/config/tuners/bohb.yml
platform: linux darwin
#- name: tuner-bohb
# configFile: test/config/tuners/bohb.yml
# platform: linux darwin
- name: tuner-gp
configFile: test/config/tuners/gp.yml
#- name: tuner-gp
# configFile: test/config/tuners/gp.yml
- name: tuner-grid
configFile: test/config/tuners/gridsearch.yml
#- name: tuner-grid
# configFile: test/config/tuners/gridsearch.yml
- name: tuner-hyperband
configFile: test/config/tuners/hyperband.yml
#- name: tuner-hyperband
# configFile: test/config/tuners/hyperband.yml
- name: tuner-metis
configFile: test/config/tuners/metis.yml
#- name: tuner-metis
# configFile: test/config/tuners/metis.yml
- name: tuner-regularized_evolution
configFile: test/config/tuners/regularized_evolution_tuner.yml
......
......@@ -110,8 +110,8 @@ testCases:
config:
maxTrialNum: 4
trialConcurrency: 4
launchCommand: python3 -c 'from nni.experiment import Experiment; exp = Experiment(); exp.start_experiment("$configFile")'
stopCommand: python3 -c 'from nni.experiment import Experiment; exp = Experiment(); exp.connect_experiment("http://localhost:8080/"); exp.stop_experiment()'
launchCommand: python3 -c 'from nni.experiment import ExternalExperiment as Experiment; exp = Experiment(); exp.start_experiment("$configFile")'
stopCommand: python3 -c 'from nni.experiment import ExternalExperiment as Experiment; exp = Experiment(); exp.connect_experiment("http://localhost:8080/"); exp.stop_experiment()'
validator:
class: NnicliValidator
platform: linux darwin
......
......@@ -47,8 +47,8 @@ testCases:
config:
maxTrialNum: 4
trialConcurrency: 4
launchCommand: python3 -c 'from nni.experiment import Experiment; exp = Experiment(); exp.start_experiment("$configFile")'
stopCommand: python3 -c 'from nni.experiment import Experiment; exp = Experiment(); exp.connect_experiment("http://localhost:8080/"); exp.stop_experiment()'
launchCommand: python3 -c 'from nni.experiment import ExternalExperiment as Experiment; exp = Experiment(); exp.start_experiment("$configFile")'
stopCommand: python3 -c 'from nni.experiment import ExternalExperiment as Experiment; exp = Experiment(); exp.connect_experiment("http://localhost:8080/"); exp.stop_experiment()'
validator:
class: NnicliValidator
platform: linux darwin
......
......@@ -6,7 +6,7 @@ from os import remove
import subprocess
import json
import requests
from nni.experiment import Experiment
from nni.experiment import ExternalExperiment as Experiment
from nni.tools.nnictl.updater import load_search_space
from utils import METRICS_URL, GET_IMPORTED_DATA_URL
......
......@@ -13,8 +13,8 @@ from torchvision.models.resnet import resnet18
import unittest
from unittest import TestCase, main
from nni.compression.pytorch import ModelSpeedup
from nni.algorithms.compression.pytorch.pruning import L1FilterPruner, apply_compression_results
from nni.compression.pytorch import ModelSpeedup, apply_compression_results
from nni.algorithms.compression.pytorch.pruning import L1FilterPruner
from nni.algorithms.compression.pytorch.pruning.weight_masker import WeightMasker
from nni.algorithms.compression.pytorch.pruning.one_shot import _StructuredFilterPruner
......@@ -30,13 +30,17 @@ RELATIVE_THRESHOLD = 0.01
# an absolute threshold to determine whether the final result is correct.
# The error should meet the RELATIVE_THREHOLD or the ABSOLUTE_THRESHOLD.
ABSOLUTE_THRESHOLD = 0.0001
class BackboneModel1(nn.Module):
def __init__(self):
super().__init__()
self.conv1 = nn.Conv2d(1, 1, 1, 1)
def forward(self, x):
return self.conv1(x)
class BackboneModel2(torch.nn.Module):
def __init__(self):
super().__init__()
......@@ -53,32 +57,58 @@ class BackboneModel2(torch.nn.Module):
x = F.relu(self.bn2(self.conv2(x)))
x = F.max_pool2d(x, 2, 2)
x = x.view(x.size(0), -1)
x = F.relu(self.fc1(x))
x = self.fc2(x)
return x
class BigModel(torch.nn.Module):
def __init__(self):
super().__init__()
self.backbone1 = BackboneModel1()
self.backbone2 = BackboneModel2()
self.fc3 = nn.Sequential(
self.fc3 = nn.Sequential(
nn.Linear(10, 10),
nn.BatchNorm1d(10),
nn.ReLU(inplace=True),
nn.Linear(10, 2)
)
def forward(self, x):
x = self.backbone1(x)
x = self.backbone2(x)
x = self.fc3(x)
return x
class TransposeModel(torch.nn.Module):
def __init__(self):
super().__init__()
self.conv1 = nn.Conv2d(3, 20, 5)
self.conv2 = nn.ConvTranspose2d(20, 50, 5, groups=2)
self.bn1 = nn.BatchNorm2d(self.conv1.out_channels)
self.bn2 = nn.BatchNorm2d(self.conv2.out_channels)
self.fc1 = nn.Linear(8 * 8 * 50, 500)
self.fc2 = nn.Linear(500, 10)
def forward(self, x):
x = F.relu(self.bn1(self.conv1(x)))
# x = F.max_pool2d(x, 2, 2)
x = F.relu(self.bn2(self.conv2(x)))
# x = F.max_pool2d(x, 2, 2)
x = x.view(x.size(0), -1)
x = F.relu(self.fc1(x))
x = self.fc2(x)
return x
dummy_input = torch.randn(2, 1, 28, 28)
SPARSITY = 0.5
MODEL_FILE, MASK_FILE = './11_model.pth', './l1_mask.pth'
def prune_model_l1(model):
config_list = [{
'sparsity': SPARSITY,
......@@ -88,6 +118,7 @@ def prune_model_l1(model):
pruner.compress()
pruner.export_model(model_path=MODEL_FILE, mask_path=MASK_FILE)
def generate_random_sparsity(model):
cfg_list = []
for name, module in model.named_modules():
......@@ -97,18 +128,20 @@ def generate_random_sparsity(model):
'sparsity': sparsity})
return cfg_list
def zero_bn_bias(model):
with torch.no_grad():
for name, module in model.named_modules():
if isinstance(module, nn.BatchNorm2d) \
or isinstance(module, nn.BatchNorm3d) \
or isinstance(module, nn.BatchNorm1d):
or isinstance(module, nn.BatchNorm3d) \
or isinstance(module, nn.BatchNorm1d):
shape = module.bias.data.size()
device = module.bias.device
module.bias.data = torch.zeros(shape).to(device)
shape = module.running_mean.data.size()
module.running_mean = torch.zeros(shape).to(device)
class L1ChannelMasker(WeightMasker):
def __init__(self, model, pruner):
self.model = model
......@@ -143,21 +176,27 @@ class L1ChannelMasker(WeightMasker):
w_abs = weight.abs()
if wrapper.type == 'Conv2d':
w_abs_structured = w_abs.sum((0, 2, 3))
threshold = torch.topk(w_abs_structured, num_prune, largest=False)[0].max()
mask_weight = torch.gt(w_abs_structured, threshold)[None, :, None, None].expand_as(weight).type_as(weight)
threshold = torch.topk(
w_abs_structured, num_prune, largest=False)[0].max()
mask_weight = torch.gt(w_abs_structured, threshold)[
None, :, None, None].expand_as(weight).type_as(weight)
return {'weight_mask': mask_weight.detach()}
else:
# Linear
assert wrapper.type == 'Linear'
w_abs_structured = w_abs.sum((0))
threshold = torch.topk(w_abs_structured, num_prune, largest=False)[0].max()
mask_weight = torch.gt(w_abs_structured, threshold)[None, :].expand_as(weight).type_as(weight)
threshold = torch.topk(
w_abs_structured, num_prune, largest=False)[0].max()
mask_weight = torch.gt(w_abs_structured, threshold)[
None, :].expand_as(weight).type_as(weight)
return {'weight_mask': mask_weight.detach(), 'bias_mask': mask_bias}
class L1ChannelPruner(_StructuredFilterPruner):
def __init__(self, model, config_list, optimizer=None, dependency_aware=False, dummy_input=None):
super().__init__(model, config_list, pruning_algorithm='l1', optimizer=optimizer,
dependency_aware=dependency_aware, dummy_input=dummy_input)
def validate_config(self, model, config_list):
pass
......@@ -177,6 +216,7 @@ def channel_prune(model):
pruner.compress()
pruner.export_model(model_path=MODEL_FILE, mask_path=MASK_FILE)
class SpeedupTestCase(TestCase):
def test_speedup_vgg16(self):
prune_model_l1(vgg16())
......@@ -187,8 +227,10 @@ class SpeedupTestCase(TestCase):
orig_model = vgg16()
assert model.training
assert model.features[2].out_channels == int(orig_model.features[2].out_channels * SPARSITY)
assert model.classifier[0].in_features == int(orig_model.classifier[0].in_features * SPARSITY)
assert model.features[2].out_channels == int(
orig_model.features[2].out_channels * SPARSITY)
assert model.classifier[0].in_features == int(
orig_model.classifier[0].in_features * SPARSITY)
def test_speedup_bigmodel(self):
prune_model_l1(BigModel())
......@@ -205,23 +247,55 @@ class SpeedupTestCase(TestCase):
model.eval()
speedup_out = model(dummy_input)
if not torch.allclose(mask_out, speedup_out, atol=1e-07):
print('input:', dummy_input.size(), torch.abs(dummy_input).sum((2,3)))
print('input:', dummy_input.size(),
torch.abs(dummy_input).sum((2, 3)))
print('mask_out:', mask_out)
print('speedup_out:', speedup_out)
raise RuntimeError('model speedup inference result is incorrect!')
orig_model = BigModel()
assert model.backbone2.conv1.out_channels == int(orig_model.backbone2.conv1.out_channels * SPARSITY)
assert model.backbone2.conv2.in_channels == int(orig_model.backbone2.conv2.in_channels * SPARSITY)
assert model.backbone2.conv2.out_channels == int(orig_model.backbone2.conv2.out_channels * SPARSITY)
assert model.backbone2.fc1.in_features == int(orig_model.backbone2.fc1.in_features * SPARSITY)
assert model.backbone2.conv1.out_channels == int(
orig_model.backbone2.conv1.out_channels * SPARSITY)
assert model.backbone2.conv2.in_channels == int(
orig_model.backbone2.conv2.in_channels * SPARSITY)
assert model.backbone2.conv2.out_channels == int(
orig_model.backbone2.conv2.out_channels * SPARSITY)
assert model.backbone2.fc1.in_features == int(
orig_model.backbone2.fc1.in_features * SPARSITY)
def test_convtranspose_model(self):
ori_model = TransposeModel()
dummy_input = torch.rand(1, 3, 8, 8)
config_list = [{'sparsity': 0.5, 'op_types': ['Conv2d']}]
pruner = L1FilterPruner(ori_model, config_list)
pruner.compress()
ori_model(dummy_input)
pruner.export_model(MODEL_FILE, MASK_FILE)
pruner._unwrap_model()
new_model = TransposeModel()
state_dict = torch.load(MODEL_FILE)
new_model.load_state_dict(state_dict)
ms = ModelSpeedup(new_model, dummy_input, MASK_FILE)
ms.speedup_model()
zero_bn_bias(ori_model)
zero_bn_bias(new_model)
ori_out = ori_model(dummy_input)
new_out = new_model(dummy_input)
ori_sum = torch.sum(ori_out)
speeded_sum = torch.sum(new_out)
print('Tanspose Speedup Test: ori_sum={} speedup_sum={}'.format(ori_sum, speeded_sum))
assert (abs(ori_sum - speeded_sum) / abs(ori_sum) < RELATIVE_THRESHOLD) or \
(abs(ori_sum - speeded_sum) < ABSOLUTE_THRESHOLD)
# FIXME: This test case might fail randomly, no idea why
# Example: https://msrasrg.visualstudio.com/NNIOpenSource/_build/results?buildId=16282
def test_speedup_integration(self):
for model_name in ['resnet18', 'squeezenet1_1', 'mobilenet_v2', 'densenet121', 'densenet169', 'inception_v3', 'resnet50']:
for model_name in ['resnet18', 'squeezenet1_1',
'mobilenet_v2', 'densenet121',
# 'inception_v3' inception is too large and may fail the pipeline
'densenet169', 'resnet50']:
kwargs = {
'pretrained': True
}
......@@ -235,7 +309,7 @@ class SpeedupTestCase(TestCase):
Model = getattr(models, model_name)
net = Model(**kwargs).to(device)
speedup_model = Model(**kwargs).to(device)
net.eval() # this line is necessary
net.eval() # this line is necessary
speedup_model.eval()
# random generate the prune config for the pruner
cfgs = generate_random_sparsity(net)
......@@ -258,8 +332,10 @@ class SpeedupTestCase(TestCase):
speeded_out = speedup_model(data)
ori_sum = torch.sum(ori_out).item()
speeded_sum = torch.sum(speeded_out).item()
print('Sum of the output of %s (before speedup):'%model_name, ori_sum)
print('Sum of the output of %s (after speedup):'%model_name, speeded_sum)
print('Sum of the output of %s (before speedup):' %
model_name, ori_sum)
print('Sum of the output of %s (after speedup):' %
model_name, speeded_sum)
assert (abs(ori_sum - speeded_sum) / abs(ori_sum) < RELATIVE_THRESHOLD) or \
(abs(ori_sum - speeded_sum) < ABSOLUTE_THRESHOLD)
......@@ -296,5 +372,6 @@ class SpeedupTestCase(TestCase):
os.remove(MODEL_FILE)
os.remove(MASK_FILE)
# Entry point: run the test suite when this file is executed as a script
# (``main`` is provided by the test framework imported at the top of the file).
if __name__ == '__main__':
    main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment