Unverified Commit b469e1c1 authored by J-shang's avatar J-shang Committed by GitHub
Browse files

Merge pull request #4674 from J-shang/fix-conflict

parents acad1c2d e5c34ea9
......@@ -346,12 +346,21 @@ Then follow the 3 steps:
.. note:: The data loading used in the official repo is `slightly different from usual <https://github.com/megvii-model/SinglePathOneShot/issues/5>`__, as they use BGR tensor and keep the values between 0 and 255 intentionally to align with their own DL framework. The option ``--spos-preprocessing`` will simulate the behavior used originally and enable you to use the checkpoints pretrained.
2. **Evolution Search**: Single Path One-Shot leverages an evolutionary algorithm to search for the best architecture. In the paper, the search module, which is responsible for testing the sampled architecture, recalculates all the batch norm statistics for a subset of training images, and evaluates the architecture on the full validation set. In this example, we have an incomplete implementation of the evolution search. The example only supports training from scratch. Inheriting weights from a pretrained supernet is not supported yet. To search with the regularized evolution strategy, run
2. **Evolution Search**: Single Path One-Shot leverages evolution algorithm to search for the best architecture. In the paper, the search module, which is responsible for testing the sampled architecture, recalculates all the batch norm for a subset of training images, and evaluates the architecture on the full validation set.
In this example, it will inherit the ``state_dict`` of supernet from `./data/checkpoint-150000.pth.tar`, and search the best architecture with the regularized evolution strategy. Search in the supernet with the following command
.. code-block:: bash
python search.py
NNI supports a latency filter to filter out models that do not satisfy the latency constraint during the search phase. Latency is predicted by Microsoft nn-Meter (https://github.com/microsoft/nn-Meter). To apply the latency filter, users can run search.py with the additional argument ``--latency-filter``. Here is an example:
.. code-block:: bash
python search.py --latency-filter cortexA76cpu_tflite21
Note that the latency filter is only supported for base execution engine.
The final architecture exported from every epoch of evolution can be found in ``trials`` under the working directory of your tuner, which, by default, is ``$HOME/nni-experiments/your_experiment_id/trials``.
3. **Train for Evaluation**:
......@@ -366,7 +375,6 @@ Known Limitations
"""""""""""""""""
* Block search only. Channel search is not supported yet.
* In the search phase, training from scratch is required. Inheriting weights from the supernet is not supported yet.
Current Reproduction Results
""""""""""""""""""""""""""""
......
......@@ -8,10 +8,12 @@ import re
import torch
import nni.retiarii.nn.pytorch as nn
from nni.retiarii.nn.pytorch import LayerChoice
from nni.retiarii.serializer import model_wrapper
from blocks import ShuffleNetBlock, ShuffleXceptionBlock
@model_wrapper
class ShuffleNetV2OneShot(nn.Module):
block_keys = [
'shufflenet_3x3',
......
# This file is to demo the usage of multi-trial NAS in the usage of SPOS search space.
import click
import time
import json
import nni.retiarii.evaluator.pytorch as pl
import random
import logging
import argparse
import numpy as np
import torch
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import nni
from nn_meter import load_latency_predictor
import nni.retiarii.nn.pytorch as nn
import nni.retiarii.strategy as strategy
from nni.retiarii import serialize
from nni.retiarii.evaluator.functional import FunctionalEvaluator
from nni.retiarii.utils import original_state_dict_hooks
from nni.retiarii.oneshot.pytorch.utils import AverageMeterGroup
from nni.retiarii.experiment.pytorch import RetiariiExeConfig, RetiariiExperiment
from torchvision import transforms
from torchvision.datasets import CIFAR10
from nn_meter import load_latency_predictor
from network import ShuffleNetV2OneShot
from utils import get_archchoice_by_model
from network import ShuffleNetV2OneShot, load_and_parse_state_dict
from utils import CrossEntropyLabelSmooth, accuracy, ToBGRTensor, get_archchoice_by_model
logger = logging.getLogger("nni.spos.search")
def retrain_bn(model, criterion, max_iters, log_freq, loader):
    """Recalibrate the BatchNorm running statistics of ``model`` (SPOS "BN sanitize").

    Resets ``running_mean`` / ``running_var`` of every ``nn.BatchNorm2d`` and then
    forwards ``max_iters`` training batches so the buffers are re-estimated for the
    sampled sub-architecture. No gradients are needed -- BN buffers are updated
    during the forward pass -- hence the ``torch.no_grad()`` scope.

    Parameters
    ----------
    model : nn.Module
        Sampled architecture; assumed to be on GPU already (inputs are moved to cuda).
    criterion : callable
        Loss function, used for logging only.
    max_iters : int
        Number of forward batches to run.
    log_freq : int
        Emit a log line every ``log_freq`` steps.
    loader : torch.utils.data.DataLoader
        Training-set loader yielding ``(inputs, targets)`` pairs.
    """
    with torch.no_grad():
        logger.info("Clear BN statistics...")
        for m in model.modules():
            if isinstance(m, nn.BatchNorm2d):
                m.running_mean = torch.zeros_like(m.running_mean)
                m.running_var = torch.ones_like(m.running_var)
        logger.info("Train BN with training set (BN sanitize)...")
        model.train()
        meters = AverageMeterGroup()
        # Bug fix: the previous ``next(iter(loader))`` created a brand-new iterator
        # on every step, which re-spawns DataLoader workers each iteration and --
        # because the loader is created without shuffling (see evaluate_acc) --
        # always yields the *same first batch*, so BN statistics were estimated
        # from a single batch. Create the iterator once and restart it only when
        # the dataset is exhausted.
        data_iter = iter(loader)
        for step in range(max_iters):
            try:
                inputs, targets = next(data_iter)
            except StopIteration:
                data_iter = iter(loader)
                inputs, targets = next(data_iter)
            inputs, targets = inputs.to('cuda'), targets.to('cuda')
            logits = model(inputs)
            loss = criterion(logits, targets)
            metrics = accuracy(logits, targets)
            metrics["loss"] = loss.item()
            meters.update(metrics)
            if step % log_freq == 0 or step + 1 == max_iters:
                logger.info("Train Step [%d/%d] %s", step + 1, max_iters, meters)
def test_acc(model, criterion, log_freq, loader):
    """Evaluate ``model`` on ``loader`` and return the average top-1 accuracy.

    Runs a full pass over the validation loader in eval mode, accumulating
    accuracy and loss in an ``AverageMeterGroup``, and logs progress every
    ``log_freq`` batches (and on the last batch).
    """
    logger.info("Start testing...")
    model.eval()
    meter_group = AverageMeterGroup()
    tic = time.time()
    num_batches = len(loader)
    with torch.no_grad():
        for batch_idx, (images, labels) in enumerate(loader):
            images = images.to('cuda')
            labels = labels.to('cuda')
            outputs = model(images)
            batch_metrics = accuracy(outputs, labels)
            batch_metrics["loss"] = criterion(outputs, labels).item()
            meter_group.update(batch_metrics)
            if batch_idx % log_freq == 0 or batch_idx + 1 == num_batches:
                logger.info("Valid Step [%d/%d] time %.3fs acc1 %.4f acc5 %.4f loss %.4f",
                            batch_idx + 1, num_batches, time.time() - tic,
                            meter_group.acc1.avg, meter_group.acc5.avg, meter_group.loss.avg)
    return meter_group.acc1.avg
def evaluate_acc(class_cls, criterion, args, train_dataset, val_dataset):
    """Trial entry point: load supernet weights into the sampled architecture,
    report its accuracy, recalibrate BN statistics, then report final accuracy.

    ``class_cls`` is the (fixed-architecture) model class chosen by the search
    strategy; weights are inherited from the checkpoint at ``args.checkpoint``
    with ``strict=False`` since only a sub-path of the supernet is loaded.
    """
    model = class_cls()
    with original_state_dict_hooks(model):
        checkpoint_state = load_and_parse_state_dict(args.checkpoint)
        model.load_state_dict(checkpoint_state, strict=False)
    model.cuda()

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.train_batch_size, num_workers=args.workers)
    test_loader = torch.utils.data.DataLoader(
        val_dataset, batch_size=args.test_batch_size, num_workers=args.workers)

    # Accuracy with inherited weights, before BN recalibration.
    acc_before = test_acc(model, criterion, args.log_frequency, test_loader)
    nni.report_intermediate_result(acc_before)

    # BN sanitize pass, then the accuracy that is actually reported to the tuner.
    retrain_bn(model, criterion, args.train_iters, args.log_frequency, train_loader)
    acc = test_acc(model, criterion, args.log_frequency, test_loader)
    assert isinstance(acc, float)
    nni.report_intermediate_result(acc)
    nni.report_final_result(acc)
class LatencyFilter:
def __init__(self, threshold, predictor, predictor_version=None, reverse=False):
"""
Filter the models according to predicted latency.
Filter the models according to predicted latency. If the predicted latency of the ir model is larger than
the given threshold, the ir model will be filtered and will not be considered as a searched architecture.
Parameters
----------
......@@ -37,42 +109,78 @@ class LatencyFilter:
return latency < self.threshold
# NOTE(review): this span is a scraped side-by-side merge diff. The pre-merge
# click-based ``_main(port)`` (a CIFAR10 demo) and the post-merge argparse-based
# ``_main()`` (the ImageNet SPOS search) are interleaved without +/- markers,
# and the original indentation was lost during extraction. The annotations
# below separate the two versions; only the argparse version survives the merge.

# --- pre-merge version (removed by the merge) --------------------------------
@click.command()
@click.option('--port', default=8081, help='On which port the experiment is run.')
def _main(port):
base_model = ShuffleNetV2OneShot(32)
base_predictor = 'cortexA76cpu_tflite21'
transf = [
transforms.RandomCrop(32, padding=4),
transforms.RandomHorizontalFlip()
]
normalize = [
transforms.ToTensor(),
transforms.Normalize([0.49139968, 0.48215827, 0.44653124], [0.24703233, 0.24348505, 0.26158768])
]
# FIXME
# CIFAR10 is used here temporarily.
# Actually we should load weight from supernet and evaluate on imagenet.
train_dataset = serialize(CIFAR10, 'data', train=True, download=True, transform=transforms.Compose(transf + normalize))
test_dataset = serialize(CIFAR10, 'data', train=False, transform=transforms.Compose(normalize))
trainer = pl.Classification(train_dataloader=pl.DataLoader(train_dataset, batch_size=64),
val_dataloaders=pl.DataLoader(test_dataset, batch_size=64),
max_epochs=2, gpus=1)
simple_strategy = strategy.RegularizedEvolution(model_filter=LatencyFilter(threshold=100, predictor=base_predictor),
sample_size=1, population_size=2, cycles=2)
exp = RetiariiExperiment(base_model, trainer, strategy=simple_strategy)

# --- post-merge version (added by the merge) ---------------------------------
def _main():
parser = argparse.ArgumentParser("SPOS Evolutional Search")
parser.add_argument("--port", type=int, default=8084)
parser.add_argument("--imagenet-dir", type=str, default="./data/imagenet")
parser.add_argument("--checkpoint", type=str, default="./data/checkpoint-150000.pth.tar")
parser.add_argument("--spos-preprocessing", action="store_true", default=False,
help="When true, image values will range from 0 to 255 and use BGR "
"(as in original repo).")
parser.add_argument("--seed", type=int, default=42)
parser.add_argument("--workers", type=int, default=6)
parser.add_argument("--train-batch-size", type=int, default=128)
parser.add_argument("--train-iters", type=int, default=200)
parser.add_argument("--test-batch-size", type=int, default=512)
parser.add_argument("--log-frequency", type=int, default=10)
parser.add_argument("--label-smoothing", type=float, default=0.1)
parser.add_argument("--evolution-sample-size", type=int, default=10)
parser.add_argument("--evolution-population-size", type=int, default=50)
parser.add_argument("--evolution-cycles", type=int, default=10)
parser.add_argument("--latency-filter", type=str, default=None,
help="Apply latency filter by calling the name of the applied hardware.")
parser.add_argument("--latency-threshold", type=float, default=100)
args = parser.parse_args()
# Fix all RNG seeds so trials see a reproducible set of sampled images
# (the original note claims a fixed image set improves performance).
torch.manual_seed(args.seed)
torch.cuda.manual_seed_all(args.seed)
np.random.seed(args.seed)
random.seed(args.seed)
torch.backends.cudnn.deterministic = True
assert torch.cuda.is_available()
base_model = ShuffleNetV2OneShot()
criterion = CrossEntropyLabelSmooth(1000, args.label_smoothing)
if args.spos_preprocessing:
# ``nni.trace`` is used to make transforms serializable, so that the trials can run other processes or on remote servers.
trans = nni.trace(transforms.Compose)([
nni.trace(transforms.RandomResizedCrop)(224),
nni.trace(transforms.ColorJitter)(brightness=0.4, contrast=0.4, saturation=0.4),
nni.trace(transforms.RandomHorizontalFlip)(0.5),
nni.trace(ToBGRTensor)(),
])
else:
# ``nni.trace`` is used to make transforms serializable, so that the trials can run other processes or on remote servers.
trans = nni.trace(transforms.Compose)([
nni.trace(transforms.RandomResizedCrop)(224),
nni.trace(transforms.ToTensor)()
])
train_dataset = nni.trace(datasets.ImageNet)(args.imagenet_dir, split='train', transform=trans)
val_dataset = nni.trace(datasets.ImageNet)(args.imagenet_dir, split='val', transform=trans)
# The latency filter is optional; it only activates when a hardware name is given.
if args.latency_filter:
latency_filter = LatencyFilter(threshold=args.latency_threshold, predictor=args.latency_filter)
else:
latency_filter = None
evaluator = FunctionalEvaluator(evaluate_acc, criterion=criterion, args=args, train_dataset=train_dataset, val_dataset=val_dataset)
evolution_strategy = strategy.RegularizedEvolution(
model_filter=latency_filter,
sample_size=args.evolution_sample_size, population_size=args.evolution_population_size, cycles=args.evolution_cycles)
exp = RetiariiExperiment(base_model, evaluator, strategy=evolution_strategy)
# --- exp_config: old and new settings interleaved below ----------------------
exp_config = RetiariiExeConfig('local')
exp_config.trial_concurrency = 2
# exp_config.max_trial_number = 2
exp_config.trial_gpu_number = 1
exp_config.max_trial_number = args.evolution_cycles
exp_config.training_service.use_active_gpu = False
exp_config.execution_engine = 'base'
# old (removed) CIFAR10 dummy input:
exp_config.dummy_input = [1, 3, 32, 32]
# new ImageNet-sized dummy input:
exp_config.dummy_input = [1, 3, 224, 224]
# old (removed) call used the click ``port`` parameter:
exp.run(exp_config, port)
# new call uses the argparse namespace:
exp.run(exp_config, args.port)
print('Exported models:')
for i, model in enumerate(exp.export_top_models(formatter='dict')):
# NOTE(review): the next line is a diff hunk header -- at least one
# loop-body line is hidden here and cannot be reconstructed from this view.
......@@ -80,6 +188,5 @@ def _main(port):
with open(f'architecture_final_{i}.json', 'w') as f:
json.dump(get_archchoice_by_model(model), f, indent=4)
# old (removed) guard used single quotes; new one below uses double quotes.
if __name__ == '__main__':
if __name__ == "__main__":
_main()
......@@ -47,10 +47,7 @@ if __name__ == "__main__":
if args.load_checkpoint:
if not args.spos_preprocessing:
logger.warning("You might want to use SPOS preprocessing if you are loading their checkpoints.")
# load state_dict and
model_dict = model.state_dict()
model_dict.update(load_and_parse_state_dict())
model.load_state_dict(model_dict)
model.load_state_dict(load_and_parse_state_dict(), strict=False)
logger.info(f'Model loaded from ./data/checkpoint-150000.pth.tar')
model.cuda()
if torch.cuda.device_count() > 1: # exclude last gpu, saving for data preprocessing on gpu
......
......@@ -57,8 +57,13 @@ class ToBGRTensor(object):
def get_archchoice_by_model(model):
    """
    Convert an exported architecture dict into the form ``fixed_arch`` expects.

    With ``exp_config.execution_engine = 'base'``, ``exp.export_top_models``
    yields entries such as ``"LayerChoice1": "layerchoice_LayerChoice1_0"``.
    ``fixed_arch`` only wants the trailing choice index, so each value is
    reduced to the text after its last underscore, e.g.
    ``"LayerChoice1": "0"``.
    """
    choices = {}
    for layer_name, choice_label in model.items():
        # Sanity check: the exported label embeds the layer's own name.
        assert layer_name in choice_label
        choices[layer_name] = choice_label.split("_")[-1]
    return choices
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment