Unverified Commit b469e1c1 authored by J-shang's avatar J-shang Committed by GitHub
Browse files

Merge pull request #4674 from J-shang/fix-conflict

parents acad1c2d e5c34ea9
......@@ -346,12 +346,21 @@ Then follow the 3 steps:
.. note:: The data loading used in the official repo is `slightly different from usual <https://github.com/megvii-model/SinglePathOneShot/issues/5>`__, as they use BGR tensor and keep the values between 0 and 255 intentionally to align with their own DL framework. The option ``--spos-preprocessing`` will simulate the behavior used originally and enable you to use the checkpoints pretrained.
2. **Evolution Search**: Single Path One-Shot leverages an evolutionary algorithm to search for the best architecture. In the paper, the search module, which is responsible for testing the sampled architecture, recalculates all the batch norm statistics for a subset of training images, and evaluates the architecture on the full validation set. In this example, we have an incomplete implementation of the evolution search. The example only supports training from scratch. Inheriting weights from a pretrained supernet is not supported yet. To search with the regularized evolution strategy, run
2. **Evolution Search**: Single Path One-Shot leverages evolution algorithm to search for the best architecture. In the paper, the search module, which is responsible for testing the sampled architecture, recalculates all the batch norm for a subset of training images, and evaluates the architecture on the full validation set.
In this example, it will inherit the ``state_dict`` of supernet from `./data/checkpoint-150000.pth.tar`, and search the best architecture with the regularized evolution strategy. Search in the supernet with the following command
.. code-block:: bash
python search.py
NNI supports a latency filter to filter out models that do not satisfy the latency constraint during the search phase. Latency is predicted by Microsoft nn-Meter (https://github.com/microsoft/nn-Meter). To apply the latency filter, users can run search.py with the additional argument ``--latency-filter``. Here is an example:
.. code-block:: bash
python search.py --latency-filter cortexA76cpu_tflite21
Note that the latency filter is only supported for base execution engine.
The final architecture exported from every epoch of evolution can be found in ``trials`` under the working directory of your tuner, which, by default, is ``$HOME/nni-experiments/your_experiment_id/trials``.
3. **Train for Evaluation**:
......@@ -366,7 +375,6 @@ Known Limitations
"""""""""""""""""
* Block search only. Channel search is not supported yet.
* In the search phase, training from scratch is required. Inheriting weights from the supernet is not supported yet.
Current Reproduction Results
""""""""""""""""""""""""""""
......
......@@ -8,10 +8,12 @@ import re
import torch
import nni.retiarii.nn.pytorch as nn
from nni.retiarii.nn.pytorch import LayerChoice
from nni.retiarii.serializer import model_wrapper
from blocks import ShuffleNetBlock, ShuffleXceptionBlock
@model_wrapper
class ShuffleNetV2OneShot(nn.Module):
block_keys = [
'shufflenet_3x3',
......
# This file is to demo the usage of multi-trial NAS in the usage of SPOS search space.
import click
import time
import json
import nni.retiarii.evaluator.pytorch as pl
import random
import logging
import argparse
import numpy as np
import torch
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import nni
from nn_meter import load_latency_predictor
import nni.retiarii.nn.pytorch as nn
import nni.retiarii.strategy as strategy
from nni.retiarii import serialize
from nni.retiarii.evaluator.functional import FunctionalEvaluator
from nni.retiarii.utils import original_state_dict_hooks
from nni.retiarii.oneshot.pytorch.utils import AverageMeterGroup
from nni.retiarii.experiment.pytorch import RetiariiExeConfig, RetiariiExperiment
from torchvision import transforms
from torchvision.datasets import CIFAR10
from nn_meter import load_latency_predictor
from network import ShuffleNetV2OneShot
from utils import get_archchoice_by_model
from network import ShuffleNetV2OneShot, load_and_parse_state_dict
from utils import CrossEntropyLabelSmooth, accuracy, ToBGRTensor, get_archchoice_by_model
logger = logging.getLogger("nni.spos.search")
def retrain_bn(model, criterion, max_iters, log_freq, loader):
    """Recalibrate the BatchNorm running statistics of ``model`` (SPOS "BN sanitize").

    Resets ``running_mean`` / ``running_var`` of every ``nn.BatchNorm2d`` and then
    forwards ``max_iters`` training batches so the buffers are re-estimated for the
    sampled sub-architecture. No gradients are needed -- BN buffers are updated
    during the forward pass -- hence the ``torch.no_grad()`` scope.

    Parameters
    ----------
    model : nn.Module
        Sampled architecture; assumed to be on GPU already (inputs are moved to cuda).
    criterion : callable
        Loss function, used for logging only.
    max_iters : int
        Number of forward batches to run.
    log_freq : int
        Emit a log line every ``log_freq`` steps.
    loader : torch.utils.data.DataLoader
        Training-set loader yielding ``(inputs, targets)`` pairs.
    """
    with torch.no_grad():
        logger.info("Clear BN statistics...")
        for m in model.modules():
            if isinstance(m, nn.BatchNorm2d):
                m.running_mean = torch.zeros_like(m.running_mean)
                m.running_var = torch.ones_like(m.running_var)
        logger.info("Train BN with training set (BN sanitize)...")
        model.train()
        meters = AverageMeterGroup()
        # Bug fix: the previous ``next(iter(loader))`` created a brand-new iterator
        # on every step, which re-spawns DataLoader workers each iteration and --
        # because the loader is created without shuffling (see evaluate_acc) --
        # always yields the *same first batch*, so BN statistics were estimated
        # from a single batch. Create the iterator once and restart it only when
        # the dataset is exhausted.
        data_iter = iter(loader)
        for step in range(max_iters):
            try:
                inputs, targets = next(data_iter)
            except StopIteration:
                data_iter = iter(loader)
                inputs, targets = next(data_iter)
            inputs, targets = inputs.to('cuda'), targets.to('cuda')
            logits = model(inputs)
            loss = criterion(logits, targets)
            metrics = accuracy(logits, targets)
            metrics["loss"] = loss.item()
            meters.update(metrics)
            if step % log_freq == 0 or step + 1 == max_iters:
                logger.info("Train Step [%d/%d] %s", step + 1, max_iters, meters)
def test_acc(model, criterion, log_freq, loader):
    """Evaluate ``model`` on ``loader`` and return the average top-1 accuracy.

    Runs a full pass over the validation loader in eval mode, accumulating
    accuracy and loss in an ``AverageMeterGroup``, and logs progress every
    ``log_freq`` batches (and on the last batch).
    """
    logger.info("Start testing...")
    model.eval()
    meter_group = AverageMeterGroup()
    tic = time.time()
    num_batches = len(loader)
    with torch.no_grad():
        for batch_idx, (images, labels) in enumerate(loader):
            images = images.to('cuda')
            labels = labels.to('cuda')
            outputs = model(images)
            batch_metrics = accuracy(outputs, labels)
            batch_metrics["loss"] = criterion(outputs, labels).item()
            meter_group.update(batch_metrics)
            if batch_idx % log_freq == 0 or batch_idx + 1 == num_batches:
                logger.info("Valid Step [%d/%d] time %.3fs acc1 %.4f acc5 %.4f loss %.4f",
                            batch_idx + 1, num_batches, time.time() - tic,
                            meter_group.acc1.avg, meter_group.acc5.avg, meter_group.loss.avg)
    return meter_group.acc1.avg
def evaluate_acc(class_cls, criterion, args, train_dataset, val_dataset):
    """Trial entry point: load supernet weights into the sampled architecture,
    report its accuracy, recalibrate BN statistics, then report final accuracy.

    ``class_cls`` is the (fixed-architecture) model class chosen by the search
    strategy; weights are inherited from the checkpoint at ``args.checkpoint``
    with ``strict=False`` since only a sub-path of the supernet is loaded.
    """
    model = class_cls()
    with original_state_dict_hooks(model):
        checkpoint_state = load_and_parse_state_dict(args.checkpoint)
        model.load_state_dict(checkpoint_state, strict=False)
    model.cuda()

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.train_batch_size, num_workers=args.workers)
    test_loader = torch.utils.data.DataLoader(
        val_dataset, batch_size=args.test_batch_size, num_workers=args.workers)

    # Accuracy with inherited weights, before BN recalibration.
    acc_before = test_acc(model, criterion, args.log_frequency, test_loader)
    nni.report_intermediate_result(acc_before)

    # BN sanitize pass, then the accuracy that is actually reported to the tuner.
    retrain_bn(model, criterion, args.train_iters, args.log_frequency, train_loader)
    acc = test_acc(model, criterion, args.log_frequency, test_loader)
    assert isinstance(acc, float)
    nni.report_intermediate_result(acc)
    nni.report_final_result(acc)
class LatencyFilter:
def __init__(self, threshold, predictor, predictor_version=None, reverse=False):
"""
Filter the models according to predicted latency.
Filter the models according to predicted latency. If the predicted latency of the ir model is larger than
the given threshold, the ir model will be filtered and will not be considered as a searched architecture.
Parameters
----------
......@@ -37,42 +109,78 @@ class LatencyFilter:
return latency < self.threshold
# NOTE(review): this span is a scraped side-by-side merge diff. The pre-merge
# click-based ``_main(port)`` (a CIFAR10 demo) and the post-merge argparse-based
# ``_main()`` (the ImageNet SPOS search) are interleaved without +/- markers,
# and the original indentation was lost during extraction. The annotations
# below separate the two versions; only the argparse version survives the merge.

# --- pre-merge version (removed by the merge) --------------------------------
@click.command()
@click.option('--port', default=8081, help='On which port the experiment is run.')
def _main(port):
base_model = ShuffleNetV2OneShot(32)
base_predictor = 'cortexA76cpu_tflite21'
transf = [
transforms.RandomCrop(32, padding=4),
transforms.RandomHorizontalFlip()
]
normalize = [
transforms.ToTensor(),
transforms.Normalize([0.49139968, 0.48215827, 0.44653124], [0.24703233, 0.24348505, 0.26158768])
]
# FIXME
# CIFAR10 is used here temporarily.
# Actually we should load weight from supernet and evaluate on imagenet.
train_dataset = serialize(CIFAR10, 'data', train=True, download=True, transform=transforms.Compose(transf + normalize))
test_dataset = serialize(CIFAR10, 'data', train=False, transform=transforms.Compose(normalize))
trainer = pl.Classification(train_dataloader=pl.DataLoader(train_dataset, batch_size=64),
val_dataloaders=pl.DataLoader(test_dataset, batch_size=64),
max_epochs=2, gpus=1)
simple_strategy = strategy.RegularizedEvolution(model_filter=LatencyFilter(threshold=100, predictor=base_predictor),
sample_size=1, population_size=2, cycles=2)
exp = RetiariiExperiment(base_model, trainer, strategy=simple_strategy)

# --- post-merge version (added by the merge) ---------------------------------
def _main():
parser = argparse.ArgumentParser("SPOS Evolutional Search")
parser.add_argument("--port", type=int, default=8084)
parser.add_argument("--imagenet-dir", type=str, default="./data/imagenet")
parser.add_argument("--checkpoint", type=str, default="./data/checkpoint-150000.pth.tar")
parser.add_argument("--spos-preprocessing", action="store_true", default=False,
help="When true, image values will range from 0 to 255 and use BGR "
"(as in original repo).")
parser.add_argument("--seed", type=int, default=42)
parser.add_argument("--workers", type=int, default=6)
parser.add_argument("--train-batch-size", type=int, default=128)
parser.add_argument("--train-iters", type=int, default=200)
parser.add_argument("--test-batch-size", type=int, default=512)
parser.add_argument("--log-frequency", type=int, default=10)
parser.add_argument("--label-smoothing", type=float, default=0.1)
parser.add_argument("--evolution-sample-size", type=int, default=10)
parser.add_argument("--evolution-population-size", type=int, default=50)
parser.add_argument("--evolution-cycles", type=int, default=10)
parser.add_argument("--latency-filter", type=str, default=None,
help="Apply latency filter by calling the name of the applied hardware.")
parser.add_argument("--latency-threshold", type=float, default=100)
args = parser.parse_args()
# Fix all RNG seeds so trials see a reproducible set of sampled images
# (the original note claims a fixed image set improves performance).
torch.manual_seed(args.seed)
torch.cuda.manual_seed_all(args.seed)
np.random.seed(args.seed)
random.seed(args.seed)
torch.backends.cudnn.deterministic = True
assert torch.cuda.is_available()
base_model = ShuffleNetV2OneShot()
criterion = CrossEntropyLabelSmooth(1000, args.label_smoothing)
if args.spos_preprocessing:
# ``nni.trace`` is used to make transforms serializable, so that the trials can run other processes or on remote servers.
trans = nni.trace(transforms.Compose)([
nni.trace(transforms.RandomResizedCrop)(224),
nni.trace(transforms.ColorJitter)(brightness=0.4, contrast=0.4, saturation=0.4),
nni.trace(transforms.RandomHorizontalFlip)(0.5),
nni.trace(ToBGRTensor)(),
])
else:
# ``nni.trace`` is used to make transforms serializable, so that the trials can run other processes or on remote servers.
trans = nni.trace(transforms.Compose)([
nni.trace(transforms.RandomResizedCrop)(224),
nni.trace(transforms.ToTensor)()
])
train_dataset = nni.trace(datasets.ImageNet)(args.imagenet_dir, split='train', transform=trans)
val_dataset = nni.trace(datasets.ImageNet)(args.imagenet_dir, split='val', transform=trans)
# The latency filter is optional; it only activates when a hardware name is given.
if args.latency_filter:
latency_filter = LatencyFilter(threshold=args.latency_threshold, predictor=args.latency_filter)
else:
latency_filter = None
evaluator = FunctionalEvaluator(evaluate_acc, criterion=criterion, args=args, train_dataset=train_dataset, val_dataset=val_dataset)
evolution_strategy = strategy.RegularizedEvolution(
model_filter=latency_filter,
sample_size=args.evolution_sample_size, population_size=args.evolution_population_size, cycles=args.evolution_cycles)
exp = RetiariiExperiment(base_model, evaluator, strategy=evolution_strategy)
# --- exp_config: old and new settings interleaved below ----------------------
exp_config = RetiariiExeConfig('local')
exp_config.trial_concurrency = 2
# exp_config.max_trial_number = 2
exp_config.trial_gpu_number = 1
exp_config.max_trial_number = args.evolution_cycles
exp_config.training_service.use_active_gpu = False
exp_config.execution_engine = 'base'
# old (removed) CIFAR10 dummy input:
exp_config.dummy_input = [1, 3, 32, 32]
# new ImageNet-sized dummy input:
exp_config.dummy_input = [1, 3, 224, 224]
# old (removed) call used the click ``port`` parameter:
exp.run(exp_config, port)
# new call uses the argparse namespace:
exp.run(exp_config, args.port)
print('Exported models:')
for i, model in enumerate(exp.export_top_models(formatter='dict')):
# NOTE(review): the next line is a diff hunk header -- at least one
# loop-body line is hidden here and cannot be reconstructed from this view.
......@@ -80,6 +188,5 @@ def _main(port):
with open(f'architecture_final_{i}.json', 'w') as f:
json.dump(get_archchoice_by_model(model), f, indent=4)
# old (removed) guard used single quotes; new one below uses double quotes.
if __name__ == '__main__':
if __name__ == "__main__":
_main()
......@@ -47,10 +47,7 @@ if __name__ == "__main__":
if args.load_checkpoint:
if not args.spos_preprocessing:
logger.warning("You might want to use SPOS preprocessing if you are loading their checkpoints.")
# load state_dict and
model_dict = model.state_dict()
model_dict.update(load_and_parse_state_dict())
model.load_state_dict(model_dict)
model.load_state_dict(load_and_parse_state_dict(), strict=False)
logger.info(f'Model loaded from ./data/checkpoint-150000.pth.tar')
model.cuda()
if torch.cuda.device_count() > 1: # exclude last gpu, saving for data preprocessing on gpu
......
......@@ -57,8 +57,13 @@ class ToBGRTensor(object):
def get_archchoice_by_model(model):
    """
    Convert an exported architecture dict into the form ``fixed_arch`` expects.

    With ``exp_config.execution_engine = 'base'``, ``exp.export_top_models``
    yields entries such as ``"LayerChoice1": "layerchoice_LayerChoice1_0"``.
    ``fixed_arch`` only wants the trailing choice index, so each value is
    reduced to the text after its last underscore, e.g.
    ``"LayerChoice1": "0"``.
    """
    choices = {}
    for layer_name, choice_label in model.items():
        # Sanity check: the exported label embeds the layer's own name.
        assert layer_name in choice_label
        choices[layer_name] = choice_label.split("_")[-1]
    return choices
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment