"vscode:/vscode.git/clone" did not exist on "ef5e7ce04d71ccf63a6267b0df107ebec20cb549"
Commit 1011377c authored by qianyj

The source code of NNI (the Cream NAS pipeline) for DCU: experiment configs, training and validation loops, and model builders.

parent abc22158
AUTO_RESUME: False
DATA_DIR: './data/imagenet'
MODEL: '112m_retrain'
RESUME_PATH: './experiments/workspace/retrain/resume.pth.tar'
SAVE_PATH: './experiments/workspace/retrain'
SEED: 42
LOG_INTERVAL: 50
RECOVERY_INTERVAL: 0
WORKERS: 8
NUM_GPU: 8
SAVE_IMAGES: False
AMP: False
OUTPUT: 'None'
EVAL_METRICS: 'prec1'
TTA: 0
LOCAL_RANK: 0
DATASET:
NUM_CLASSES: 1000
IMAGE_SIZE: 224 # image patch size
INTERPOLATION: 'random' # Image resize interpolation type
BATCH_SIZE: 128 # batch size
NO_PREFECHTER: False
NET:
GP: 'avg'
DROPOUT_RATE: 0.2
SELECTION: 470
EMA:
USE: True
FORCE_CPU: False # force model ema to be tracked on CPU
DECAY: 0.9999
LR: 0.064
EPOCHS: 500
OPT_EPS: 1e-3
SCHED: 'cosine'
OPT: 'rmsproptf'
WARMUP_LR: 1e-6
DECAY_EPOCHS: 2.4
DECAY_RATE: 0.973
WARMUP_EPOCHS: 3
WEIGHT_DECAY: 1e-5
AUGMENTATION:
AA: 'rand-m9-mstd0.5'
RE_PROB: 0.2 # random erase prob
RE_MODE: 'pixel' # random erase mode
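This is the first of eight retrain configs in this commit; the blocks that follow differ only in MODEL and NET.SELECTION. Below is a minimal sketch of how such a file is typically merged over the yacs defaults defined later in the commit; the module path lib.config and the file name 112m_retrain.yaml are assumptions, not taken from the diff.

```python
# Minimal sketch (not part of the commit): merge a retrain YAML over the
# yacs defaults. `lib.config` and the file name are assumed/hypothetical.
from lib.config import cfg

exp_cfg = cfg.clone()                          # keep the module-level defaults intact
exp_cfg.merge_from_file('112m_retrain.yaml')   # experiment values override defaults
exp_cfg.freeze()                               # guard against accidental mutation

print(exp_cfg.MODEL, exp_cfg.NET.SELECTION, exp_cfg.LR)  # '112m_retrain' 470 0.064
```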
AUTO_RESUME: False
DATA_DIR: './data/imagenet'
MODEL: '14m_retrain'
RESUME_PATH: './experiments/workspace/retrain/resume.pth.tar'
SAVE_PATH: './experiments/workspace/retrain'
SEED: 42
LOG_INTERVAL: 50
RECOVERY_INTERVAL: 0
WORKERS: 8
NUM_GPU: 8
SAVE_IMAGES: False
AMP: False
OUTPUT: 'None'
EVAL_METRICS: 'prec1'
TTA: 0
LOCAL_RANK: 0
DATASET:
NUM_CLASSES: 1000
IMAGE_SIZE: 224 # image patch size
INTERPOLATION: 'random' # Image resize interpolation type
BATCH_SIZE: 128 # batch size
NO_PREFECHTER: False
NET:
GP: 'avg'
DROPOUT_RATE: 0.2
SELECTION: 470
EMA:
USE: True
FORCE_CPU: False # force model ema to be tracked on CPU
DECAY: 0.9999
LR: 0.064
EPOCHS: 500
OPT_EPS: 1e-3
SCHED: 'cosine'
OPT: 'rmsproptf'
WARMUP_LR: 1e-6
DECAY_EPOCHS: 2.4
DECAY_RATE: 0.973
WARMUP_EPOCHS: 3
WEIGHT_DECAY: 1e-5
AUGMENTATION:
AA: 'rand-m9-mstd0.5'
RE_PROB: 0.2 # random erase prob
RE_MODE: 'pixel' # random erase mode
AUTO_RESUME: False
DATA_DIR: './data/imagenet'
MODEL: '23m_retrain'
RESUME_PATH: './experiments/workspace/retrain/resume.pth.tar'
SAVE_PATH: './experiments/workspace/retrain'
SEED: 42
LOG_INTERVAL: 50
RECOVERY_INTERVAL: 0
WORKERS: 8
NUM_GPU: 8
SAVE_IMAGES: False
AMP: False
OUTPUT: 'None'
EVAL_METRICS: 'prec1'
TTA: 0
LOCAL_RANK: 0
DATASET:
NUM_CLASSES: 1000
IMAGE_SIZE: 224 # image patch size
INTERPOLATION: 'random' # Image resize interpolation type
BATCH_SIZE: 128 # batch size
NO_PREFECHTER: False
NET:
GP: 'avg'
DROPOUT_RATE: 0.2
SELECTION: 470
EMA:
USE: True
FORCE_CPU: False # force model ema to be tracked on CPU
DECAY: 0.9999
LR: 0.064
EPOCHS: 500
OPT_EPS: 1e-3
SCHED: 'cosine'
OPT: 'rmsproptf'
WARMUP_LR: 1e-6
DECAY_EPOCHS: 2.4
DECAY_RATE: 0.973
WARMUP_EPOCHS: 3
WEIGHT_DECAY: 1e-5
AUGMENTATION:
AA: 'rand-m9-mstd0.5'
RE_PROB: 0.2 # random erase prob
RE_MODE: 'pixel' # random erase mode
AUTO_RESUME: False
DATA_DIR: './data/imagenet'
MODEL: '287m_retrain'
RESUME_PATH: './experiments/workspace/retrain/resume.pth.tar'
SAVE_PATH: './experiments/workspace/retrain'
SEED: 42
LOG_INTERVAL: 50
RECOVERY_INTERVAL: 0
WORKERS: 8
NUM_GPU: 8
SAVE_IMAGES: False
AMP: False
OUTPUT: 'None'
EVAL_METRICS: 'prec1'
TTA: 0
LOCAL_RANK: 0
DATASET:
NUM_CLASSES: 1000
IMAGE_SIZE: 224 # image patch size
INTERPOLATION: 'random' # Image resize interpolation type
BATCH_SIZE: 128 # batch size
NO_PREFECHTER: False
NET:
GP: 'avg'
DROPOUT_RATE: 0.2
SELECTION: 470
EMA:
USE: True
FORCE_CPU: False # force model ema to be tracked on CPU
DECAY: 0.9999
LR: 0.064
EPOCHS: 500
OPT_EPS: 1e-3
SCHED: 'cosine'
OPT: 'rmsproptf'
WARMUP_LR: 1e-6
DECAY_EPOCHS: 2.4
DECAY_RATE: 0.973
WARMUP_EPOCHS: 3
WEIGHT_DECAY: 1e-5
AUGMENTATION:
AA: 'rand-m9-mstd0.5'
RE_PROB: 0.2 # random erase prob
RE_MODE: 'pixel' # random erase mode
AUTO_RESUME: False
DATA_DIR: './data/imagenet'
MODEL: '43m_retrain'
RESUME_PATH: './experiments/workspace/retrain/resume.pth.tar'
SAVE_PATH: './experiments/workspace/retrain'
SEED: 42
LOG_INTERVAL: 50
RECOVERY_INTERVAL: 0
WORKERS: 8
NUM_GPU: 8
SAVE_IMAGES: False
AMP: False
OUTPUT: 'None'
EVAL_METRICS: 'prec1'
TTA: 0
LOCAL_RANK: 0
DATASET:
NUM_CLASSES: 1000
IMAGE_SIZE: 224 # image patch size
INTERPOLATION: 'random' # Image resize interpolation type
BATCH_SIZE: 128 # batch size
NO_PREFECHTER: False
NET:
GP: 'avg'
DROPOUT_RATE: 0.2
SELECTION: 43
EMA:
USE: True
FORCE_CPU: False # force model ema to be tracked on CPU
DECAY: 0.9999
LR: 0.064
EPOCHS: 500
OPT_EPS: 1e-3
SCHED: 'cosine'
OPT: 'rmsproptf'
WARMUP_LR: 1e-6
DECAY_EPOCHS: 2.4
DECAY_RATE: 0.973
WARMUP_EPOCHS: 3
WEIGHT_DECAY: 1e-5
AUGMENTATION:
AA: 'rand-m9-mstd0.5'
RE_PROB: 0.2 # random erase prob
RE_MODE: 'pixel' # random erase mode
AUTO_RESUME: False
DATA_DIR: './data/imagenet'
MODEL: '481m_retrain'
RESUME_PATH: './experiments/workspace/retrain/resume.pth.tar'
SAVE_PATH: './experiments/workspace/retrain'
SEED: 42
LOG_INTERVAL: 50
RECOVERY_INTERVAL: 0
WORKERS: 8
NUM_GPU: 8
SAVE_IMAGES: False
AMP: False
OUTPUT: 'None'
EVAL_METRICS: 'prec1'
TTA: 0
LOCAL_RANK: 0
DATASET:
NUM_CLASSES: 1000
IMAGE_SIZE: 224 # image patch size
INTERPOLATION: 'random' # Image resize interpolation type
BATCH_SIZE: 128 # batch size
NO_PREFECHTER: False
NET:
GP: 'avg'
DROPOUT_RATE: 0.2
SELECTION: 481
EMA:
USE: True
FORCE_CPU: False # force model ema to be tracked on CPU
DECAY: 0.9999
LR: 0.064
EPOCHS: 500
OPT_EPS: 1e-3
SCHED: 'cosine'
OPT: 'rmsproptf'
WARMUP_LR: 1e-6
DECAY_EPOCHS: 2.4
DECAY_RATE: 0.973
WARMUP_EPOCHS: 3
WEIGHT_DECAY: 1e-5
AUGMENTATION:
AA: 'rand-m9-mstd0.5'
RE_PROB: 0.2 # random erase prob
RE_MODE: 'pixel' # random erase mode
AUTO_RESUME: False
DATA_DIR: './data/imagenet'
MODEL: '604m_retrain'
RESUME_PATH: './experiments/workspace/retrain/resume.pth.tar'
SAVE_PATH: './experiments/workspace/retrain'
SEED: 42
LOG_INTERVAL: 50
RECOVERY_INTERVAL: 0
WORKERS: 8
NUM_GPU: 8
SAVE_IMAGES: False
AMP: False
OUTPUT: 'None'
EVAL_METRICS: 'prec1'
TTA: 0
LOCAL_RANK: 0
DATASET:
NUM_CLASSES: 1000
IMAGE_SIZE: 224 # image patch size
INTERPOLATION: 'random' # Image resize interpolation type
BATCH_SIZE: 128 # batch size
NO_PREFECHTER: False
NET:
GP: 'avg'
DROPOUT_RATE: 0.2
SELECTION: 604
EMA:
USE: True
FORCE_CPU: False # force model ema to be tracked on CPU
DECAY: 0.9999
LR: 0.064
EPOCHS: 500
OPT_EPS: 1e-3
SCHED: 'cosine'
OPT: 'rmsproptf'
WARMUP_LR: 1e-6
DECAY_EPOCHS: 2.4
DECAY_RATE: 0.973
WARMUP_EPOCHS: 3
WEIGHT_DECAY: 1e-5
AUGMENTATION:
AA: 'rand-m9-mstd0.5'
RE_PROB: 0.2 # random erase prob
RE_MODE: 'pixel' # random erase mode
AUTO_RESUME: False
DATA_DIR: './data/imagenet'
MODEL: '72m_retrain'
RESUME_PATH: './experiments/workspace/retrain/resume.pth.tar'
SAVE_PATH: './experiments/workspace/retrain'
SEED: 42
LOG_INTERVAL: 50
RECOVERY_INTERVAL: 0
WORKERS: 8
NUM_GPU: 8
SAVE_IMAGES: False
AMP: False
OUTPUT: 'None'
EVAL_METRICS: 'prec1'
TTA: 0
LOCAL_RANK: 0
DATASET:
NUM_CLASSES: 1000
IMAGE_SIZE: 224 # image patch size
INTERPOLATION: 'random' # Image resize interpolation type
BATCH_SIZE: 128 # batch size
NO_PREFECHTER: False
NET:
GP: 'avg'
DROPOUT_RATE: 0.2
SELECTION: 470
EMA:
USE: True
FORCE_CPU: False # force model ema to be tracked on CPU
DECAY: 0.9999
LR: 0.064
EPOCHS: 500
OPT_EPS: 1e-3
SCHED: 'cosine'
OPT: 'rmsproptf'
WARMUP_LR: 1e-6
DECAY_EPOCHS: 2.4
DECAY_RATE: 0.973
WARMUP_EPOCHS: 3
WEIGHT_DECAY: 1e-5
AUGMENTATION:
AA: 'rand-m9-mstd0.5'
RE_PROB: 0.2 # random erase prob
RE_MODE: 'pixel' # random erase mode
AUTO_RESUME: True
DATA_DIR: './data/imagenet'
MODEL: 'Childnet_Testing'
RESUME_PATH: './experiments/workspace/ckps/42.pth.tar'
SAVE_PATH: './'
SEED: 42
LOG_INTERVAL: 50
RECOVERY_INTERVAL: 0
WORKERS: 4
NUM_GPU: 2
SAVE_IMAGES: False
AMP: False
OUTPUT: 'None'
EVAL_METRICS: 'prec1'
TTA: 0
LOCAL_RANK: 0
DATASET:
NUM_CLASSES: 1000
IMAGE_SIZE: 224 # image patch size
INTERPOLATION: 'bilinear' # Image resize interpolation type
BATCH_SIZE: 32 # batch size
NO_PREFECHTER: False
NET:
GP: 'avg'
DROPOUT_RATE: 0.0
SELECTION: 42
EMA:
USE: True
FORCE_CPU: False # force model ema to be tracked on CPU
DECAY: 0.9998
OPTIMIZER:
MOMENTUM: 0.9
WEIGHT_DECAY: 1e-3
AUTO_RESUME: False
DATA_DIR: './data/imagenet'
MODEL: 'Supernet_Training'
RESUME_PATH: './experiments/workspace/train/resume.pth.tar'
SAVE_PATH: './'
SEED: 42
LOG_INTERVAL: 50
RECOVERY_INTERVAL: 0
WORKERS: 8
NUM_GPU: 8
SAVE_IMAGES: False
AMP: False
OUTPUT: 'None'
EVAL_METRICS: 'prec1'
TTA: 0
LOCAL_RANK: 0
DATASET:
NUM_CLASSES: 1000
IMAGE_SIZE: 224 # image patch size
INTERPOLATION: 'bilinear' # Image resize interpolation type
BATCH_SIZE: 128 # batch size
NET:
GP: 'avg'
DROPOUT_RATE: 0.0
EMA:
USE: True
FORCE_CPU: False # force model ema to be tracked on CPU
DECAY: 0.9998
OPT: 'sgd'
LR: 1.0
EPOCHS: 120
META_LR: 1e-4
BATCHNORM:
SYNC_BN: False
SUPERNET:
UPDATE_ITER: 200
SLICE: 4
POOL_SIZE: 10
RESUNIT: False
DIL_CONV: False
UPDATE_2ND: True
FLOPS_MINIMUM: 0
FLOPS_MAXIMUM: 600
PICK_METHOD: 'meta'
META_STA_EPOCH: 20
HOW_TO_PROB: 'pre_prob'
PRE_PROB: (0.05,0.2,0.05,0.5,0.05,0.15)
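HOW_TO_PROB: 'pre_prob' together with PRE_PROB suggests a fixed prior over the candidate operations: three kernel sizes times two expansion ratios gives six choices, and the six weights sum to 1. Here is a minimal sketch under that reading; the mapping from index to (kernel, ratio) mirrors the order used in SuperNetBuilder later in the commit, but is still an assumption.

```python
# Minimal sketch: sample a (kernel_size, exp_ratio) pair with the PRE_PROB
# weights. Treating PRE_PROB as a prior over the six candidate blocks is
# inferred from the config, not stated in it.
import random

PRE_PROB = (0.05, 0.2, 0.05, 0.5, 0.05, 0.15)
choices = [(k, e) for k in (3, 5, 7) for e in (4, 6)]   # six candidate blocks

kernel_size, exp_ratio = random.choices(choices, weights=PRE_PROB, k=1)[0]
print(kernel_size, exp_ratio)
```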
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
# Written by Hao Du and Houwen Peng
# email: haodu8-c@my.cityu.edu.hk and houwen.peng@microsoft.com
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from yacs.config import CfgNode as CN
DEFAULT_CROP_PCT = 0.875
IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406)
IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225)
__C = CN()
cfg = __C
__C.AUTO_RESUME = True
__C.DATA_DIR = './data/imagenet'
__C.MODEL = 'cream'
__C.RESUME_PATH = './experiments/ckps/resume.pth.tar'
__C.SAVE_PATH = './experiments/ckps/'
__C.SEED = 42
__C.LOG_INTERVAL = 50
__C.RECOVERY_INTERVAL = 0
__C.WORKERS = 4
__C.NUM_GPU = 1
__C.SAVE_IMAGES = False
__C.AMP = False
__C.ACC_GAP = 5
__C.OUTPUT = 'output/path/'
__C.EVAL_METRICS = 'prec1'
__C.TTA = 0 # Test or inference time augmentation
__C.LOCAL_RANK = 0
__C.VERBOSE = False
# dataset configs
__C.DATASET = CN()
__C.DATASET.NUM_CLASSES = 1000
__C.DATASET.IMAGE_SIZE = 224 # image patch size
__C.DATASET.INTERPOLATION = 'bilinear' # Image resize interpolation type
__C.DATASET.BATCH_SIZE = 32 # batch size
__C.DATASET.NO_PREFECHTER = False
__C.DATASET.PIN_MEM = True
__C.DATASET.VAL_BATCH_MUL = 4
# model configs
__C.NET = CN()
__C.NET.SELECTION = 14
__C.NET.GP = 'avg' # type of global pool ["avg", "max", "avgmax", "avgmaxc"]
__C.NET.DROPOUT_RATE = 0.0 # dropout rate
__C.NET.INPUT_ARCH = [[0], [3], [3, 3], [3, 1, 3], [3, 3, 3, 3], [3, 3, 3], [0]]
# model ema parameters
__C.NET.EMA = CN()
__C.NET.EMA.USE = True
__C.NET.EMA.FORCE_CPU = False # force model ema to be tracked on CPU
__C.NET.EMA.DECAY = 0.9998
# optimizer configs
__C.OPT = 'sgd'
__C.OPT_EPS = 1e-2
__C.MOMENTUM = 0.9
__C.WEIGHT_DECAY = 1e-4
__C.OPTIMIZER = CN()
__C.OPTIMIZER.NAME = 'sgd'
__C.OPTIMIZER.MOMENTUM = 0.9
__C.OPTIMIZER.WEIGHT_DECAY = 1e-3
# scheduler configs
__C.SCHED = 'sgd'
__C.LR_NOISE = None
__C.LR_NOISE_PCT = 0.67
__C.LR_NOISE_STD = 1.0
__C.WARMUP_LR = 1e-4
__C.MIN_LR = 1e-5
__C.EPOCHS = 200
__C.START_EPOCH = None
__C.DECAY_EPOCHS = 30.0
__C.WARMUP_EPOCHS = 3
__C.COOLDOWN_EPOCHS = 10
__C.PATIENCE_EPOCHS = 10
__C.DECAY_RATE = 0.1
__C.LR = 1e-2
__C.META_LR = 1e-4
# data augmentation parameters
__C.AUGMENTATION = CN()
__C.AUGMENTATION.AA = 'rand-m9-mstd0.5'
__C.AUGMENTATION.COLOR_JITTER = 0.4
__C.AUGMENTATION.RE_PROB = 0.2 # random erase prob
__C.AUGMENTATION.RE_MODE = 'pixel' # random erase mode
__C.AUGMENTATION.MIXUP = 0.0 # mixup alpha
__C.AUGMENTATION.MIXUP_OFF_EPOCH = 0 # turn off mixup after this epoch
__C.AUGMENTATION.SMOOTHING = 0.1 # label smoothing parameters
# batch norm parameters (only works with gen_efficientnet based models
# currently)
__C.BATCHNORM = CN()
__C.BATCHNORM.SYNC_BN = False
__C.BATCHNORM.BN_TF = False
__C.BATCHNORM.BN_MOMENTUM = 0.1 # batchnorm momentum override
__C.BATCHNORM.BN_EPS = 1e-5 # batchnorm eps override
# supernet training hyperparameters
__C.SUPERNET = CN()
__C.SUPERNET.UPDATE_ITER = 1300
__C.SUPERNET.SLICE = 4
__C.SUPERNET.POOL_SIZE = 10
__C.SUPERNET.RESUNIT = False
__C.SUPERNET.DIL_CONV = False
__C.SUPERNET.UPDATE_2ND = True
__C.SUPERNET.FLOPS_MAXIMUM = 600
__C.SUPERNET.FLOPS_MINIMUM = 0
__C.SUPERNET.PICK_METHOD = 'meta' # pick teacher method
__C.SUPERNET.META_STA_EPOCH = 20 # start using meta picking method
__C.SUPERNET.HOW_TO_PROB = 'pre_prob' # sample method
__C.SUPERNET.PRE_PROB = (0.05, 0.2, 0.05, 0.5, 0.05,
0.15) # sample prob in 'pre_prob'
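Besides merging a YAML file, a yacs CfgNode also accepts pairwise overrides and can serialize itself back to YAML, which is how experiment files like the ones above are usually produced or patched from the command line. A minimal sketch, with the lib.config import path assumed:

```python
# Minimal sketch: override individual defaults without a YAML file and dump
# the result back to YAML text (merge_from_list and dump are standard yacs
# CfgNode methods; the lib.config module path is an assumption).
from lib.config import cfg

run_cfg = cfg.clone()
run_cfg.merge_from_list(['NUM_GPU', 1, 'DATASET.BATCH_SIZE', 64, 'LR', 0.032])
print(run_cfg.DATASET.BATCH_SIZE)   # 64
print(run_cfg.dump())               # YAML text mirroring the defaults above
```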
import os
import time
import torch
import torchvision
from collections import OrderedDict
from lib.utils.util import AverageMeter, accuracy, reduce_tensor
def train_epoch(
epoch, model, loader, optimizer, loss_fn, cfg,
lr_scheduler=None, saver=None, output_dir='', use_amp=False,
model_ema=None, logger=None, writer=None, local_rank=0):
batch_time_m = AverageMeter()
data_time_m = AverageMeter()
losses_m = AverageMeter()
prec1_m = AverageMeter()
prec5_m = AverageMeter()
model.train()
end = time.time()
last_idx = len(loader) - 1
num_updates = epoch * len(loader)
optimizer.zero_grad()
for batch_idx, (input, target) in enumerate(loader):
last_batch = batch_idx == last_idx
data_time_m.update(time.time() - end)
input = input.cuda()
target = target.cuda()
output = model(input)
loss = loss_fn(output, target)
prec1, prec5 = accuracy(output, target, topk=(1, 5))
if cfg.NUM_GPU > 1:
reduced_loss = reduce_tensor(loss.data, cfg.NUM_GPU)
prec1 = reduce_tensor(prec1, cfg.NUM_GPU)
prec5 = reduce_tensor(prec5, cfg.NUM_GPU)
else:
reduced_loss = loss.data
optimizer.zero_grad()
loss.backward()
optimizer.step()
torch.cuda.synchronize()
losses_m.update(reduced_loss.item(), input.size(0))
prec1_m.update(prec1.item(), output.size(0))
prec5_m.update(prec5.item(), output.size(0))
if model_ema is not None:
model_ema.update(model)
num_updates += 1
batch_time_m.update(time.time() - end)
if last_batch or batch_idx % cfg.LOG_INTERVAL == 0:
lrl = [param_group['lr'] for param_group in optimizer.param_groups]
lr = sum(lrl) / len(lrl)
if local_rank == 0:
logger.info(
'Train: {} [{:>4d}/{}] '
'Loss: {loss.val:>9.6f} ({loss.avg:>6.4f}) '
'Prec@1: {top1.val:>7.4f} ({top1.avg:>7.4f}) '
'Prec@5: {top5.val:>7.4f} ({top5.avg:>7.4f}) '
'Time: {batch_time.val:.3f}s, {rate:>7.2f}/s '
'({batch_time.avg:.3f}s, {rate_avg:>7.2f}/s) '
'LR: {lr:.3e} '
'Data: {data_time.val:.3f} ({data_time.avg:.3f})'.format(
epoch,
batch_idx,
len(loader),
loss=losses_m,
top1=prec1_m,
top5=prec5_m,
batch_time=batch_time_m,
rate=input.size(0) *
cfg.NUM_GPU /
batch_time_m.val,
rate_avg=input.size(0) *
cfg.NUM_GPU /
batch_time_m.avg,
lr=lr,
data_time=data_time_m))
writer.add_scalar(
'Loss/train',
losses_m.avg,
epoch *
len(loader) +
batch_idx)
writer.add_scalar(
'Accuracy/train',
prec1_m.avg,
epoch *
len(loader) +
batch_idx)
writer.add_scalar(
'Learning_Rate',
optimizer.param_groups[0]['lr'],
epoch * len(loader) + batch_idx)
if cfg.SAVE_IMAGES and output_dir:
torchvision.utils.save_image(
input, os.path.join(
output_dir, 'train-batch-%d.jpg' %
batch_idx), padding=0, normalize=True)
if saver is not None and cfg.RECOVERY_INTERVAL and (
last_batch or (batch_idx + 1) % cfg.RECOVERY_INTERVAL == 0):
saver.save_recovery(
model,
optimizer,
cfg,
epoch,
model_ema=model_ema,
use_amp=use_amp,
batch_idx=batch_idx)
if lr_scheduler is not None:
lr_scheduler.step_update(
num_updates=num_updates,
metric=losses_m.avg)
end = time.time()
# end for
if hasattr(optimizer, 'sync_lookahead'):
optimizer.sync_lookahead()
return OrderedDict([('loss', losses_m.avg)])
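train_epoch above only needs a CUDA model, a loader, an optimizer, a loss and the cfg node; everything else is optional. Below is a minimal single-GPU sketch of driving it; the random dataset, the tiny model and the module paths (lib.config, lib.core.train) are stand-ins, not taken from the commit.

```python
# Minimal single-GPU sketch of calling train_epoch (assumes CUDA plus the
# tensorboard package; the dataset/model below are throwaway placeholders).
import logging
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.tensorboard import SummaryWriter

from lib.config import cfg                # assumed module path
from lib.core.train import train_epoch    # assumed module path

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('train')
writer = SummaryWriter('./runs/debug')

data = TensorDataset(torch.randn(256, 3, 224, 224),
                     torch.randint(0, 1000, (256,)))
loader = DataLoader(data, batch_size=32, shuffle=True)

model = nn.Sequential(nn.Conv2d(3, 8, 3, stride=2, padding=1),
                      nn.AdaptiveAvgPool2d(1), nn.Flatten(),
                      nn.Linear(8, 1000)).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=cfg.LR, momentum=cfg.MOMENTUM)
loss_fn = nn.CrossEntropyLoss().cuda()

metrics = train_epoch(0, model, loader, optimizer, loss_fn, cfg,
                      logger=logger, writer=writer)
print(metrics['loss'])
```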
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
# Written by Hao Du and Houwen Peng
# email: haodu8-c@my.cityu.edu.hk and houwen.peng@microsoft.com
import time
import torch
from collections import OrderedDict
from lib.utils.util import AverageMeter, accuracy, reduce_tensor
def validate(epoch, model, loader, loss_fn, cfg, log_suffix='', logger=None, writer=None, local_rank=0):
batch_time_m = AverageMeter()
losses_m = AverageMeter()
prec1_m = AverageMeter()
prec5_m = AverageMeter()
model.eval()
end = time.time()
last_idx = len(loader) - 1
with torch.no_grad():
for batch_idx, (input, target) in enumerate(loader):
last_batch = batch_idx == last_idx
# move the batch to GPU to mirror train_epoch (no-op if the prefetcher already returns CUDA tensors)
input = input.cuda()
target = target.cuda()
output = model(input)
if isinstance(output, (tuple, list)):
output = output[0]
# augmentation reduction
reduce_factor = cfg.TTA
if reduce_factor > 1:
output = output.unfold(
0,
reduce_factor,
reduce_factor).mean(
dim=2)
target = target[0:target.size(0):reduce_factor]
loss = loss_fn(output, target)
prec1, prec5 = accuracy(output, target, topk=(1, 5))
if cfg.NUM_GPU > 1:
reduced_loss = reduce_tensor(loss.data, cfg.NUM_GPU)
prec1 = reduce_tensor(prec1, cfg.NUM_GPU)
prec5 = reduce_tensor(prec5, cfg.NUM_GPU)
else:
reduced_loss = loss.data
torch.cuda.synchronize()
losses_m.update(reduced_loss.item(), input.size(0))
prec1_m.update(prec1.item(), output.size(0))
prec5_m.update(prec5.item(), output.size(0))
batch_time_m.update(time.time() - end)
end = time.time()
if local_rank == 0 and (last_batch or batch_idx % cfg.LOG_INTERVAL == 0):
log_name = 'Test' + log_suffix
logger.info(
'{0}: [{1:>4d}/{2}] '
'Time: {batch_time.val:.3f} ({batch_time.avg:.3f}) '
'Loss: {loss.val:>7.4f} ({loss.avg:>6.4f}) '
'Prec@1: {top1.val:>7.4f} ({top1.avg:>7.4f}) '
'Prec@5: {top5.val:>7.4f} ({top5.avg:>7.4f})'.format(
log_name, batch_idx, last_idx,
batch_time=batch_time_m, loss=losses_m,
top1=prec1_m, top5=prec5_m))
writer.add_scalar(
'Loss' + log_suffix + '/valid',
losses_m.avg,
epoch * len(loader) + batch_idx)
writer.add_scalar(
'Accuracy' +
log_suffix +
'/valid',
prec1_m.avg,
epoch *
len(loader) +
batch_idx)
metrics = OrderedDict(
[('loss', losses_m.avg), ('prec1', prec1_m.avg), ('prec5', prec5_m.avg)])
return metrics
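One detail of validate worth spelling out is the TTA branch: with cfg.TTA = r it assumes the loader yields r consecutive augmented views per image, averages their logits with unfold/mean, and keeps every r-th label. A small sketch of just that tensor manipulation, under that layout assumption:

```python
# Minimal sketch of the TTA reduction in validate() above: each group of
# r consecutive rows of `output` is treated as r views of one image.
import torch

r = 4                                           # cfg.TTA
output = torch.randn(8 * r, 1000)               # 8 images x 4 augmented views
target = torch.arange(8).repeat_interleave(r)   # labels repeated per view

output = output.unfold(0, r, r).mean(dim=2)     # -> shape (8, 1000)
target = target[0:target.size(0):r]             # -> shape (8,)
print(output.shape, target.shape)
```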
from lib.models.blocks.residual_block import get_Bottleneck, get_BasicBlock
from lib.models.blocks.inverted_residual_block import InvertedResidual
# This file is downloaded from
# https://github.com/rwightman/pytorch-image-models
import torch.nn as nn
from timm.models.layers import create_conv2d
from timm.models.efficientnet_blocks import make_divisible, resolve_se_args, \
SqueezeExcite, drop_path
class InvertedResidual(nn.Module):
""" Inverted residual block w/ optional SE and CondConv routing"""
def __init__(
self,
in_chs,
out_chs,
dw_kernel_size=3,
stride=1,
dilation=1,
pad_type='',
act_layer=nn.ReLU,
noskip=False,
exp_ratio=1.0,
exp_kernel_size=1,
pw_kernel_size=1,
se_ratio=0.,
se_kwargs=None,
norm_layer=nn.BatchNorm2d,
norm_kwargs=None,
conv_kwargs=None,
drop_path_rate=0.):
super(InvertedResidual, self).__init__()
norm_kwargs = norm_kwargs or {}
conv_kwargs = conv_kwargs or {}
mid_chs = make_divisible(in_chs * exp_ratio)
has_se = se_ratio is not None and se_ratio > 0.
self.has_residual = (in_chs == out_chs and stride == 1) and not noskip
self.drop_path_rate = drop_path_rate
# Point-wise expansion
self.conv_pw = create_conv2d(
in_chs,
mid_chs,
exp_kernel_size,
padding=pad_type,
**conv_kwargs)
self.bn1 = norm_layer(mid_chs, **norm_kwargs)
self.act1 = act_layer(inplace=True)
# Depth-wise convolution
self.conv_dw = create_conv2d(
mid_chs, mid_chs, dw_kernel_size, stride=stride, dilation=dilation,
padding=pad_type, depthwise=True, **conv_kwargs)
self.bn2 = norm_layer(mid_chs, **norm_kwargs)
self.act2 = act_layer(inplace=True)
# Squeeze-and-excitation
if has_se:
se_kwargs = resolve_se_args(se_kwargs, in_chs, act_layer)
self.se = SqueezeExcite(mid_chs, se_ratio=se_ratio, **se_kwargs)
else:
self.se = None
# Point-wise linear projection
self.conv_pwl = create_conv2d(
mid_chs,
out_chs,
pw_kernel_size,
padding=pad_type,
**conv_kwargs)
self.bn3 = norm_layer(out_chs, **norm_kwargs)
def feature_info(self, location):
if location == 'expansion': # after SE, input to PWL
info = dict(
module='conv_pwl',
hook_type='forward_pre',
num_chs=self.conv_pwl.in_channels)
else: # location == 'bottleneck', block output
info = dict(
module='',
hook_type='',
num_chs=self.conv_pwl.out_channels)
return info
def forward(self, x):
residual = x
# Point-wise expansion
x = self.conv_pw(x)
x = self.bn1(x)
x = self.act1(x)
# Depth-wise convolution
x = self.conv_dw(x)
x = self.bn2(x)
x = self.act2(x)
# Squeeze-and-excitation
if self.se is not None:
x = self.se(x)
# Point-wise linear projection
x = self.conv_pwl(x)
x = self.bn3(x)
if self.has_residual:
if self.drop_path_rate > 0.:
x = drop_path(x, self.drop_path_rate, self.training)
x += residual
return x
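A quick shape check for the InvertedResidual block above; with in_chs == out_chs and stride 1 the skip connection is active. This assumes the same (older) timm version the file imports from, i.e. one that still provides resolve_se_args.

```python
# Minimal sketch: run a dummy tensor through the InvertedResidual defined
# above and confirm the residual path preserves channels and resolution.
import torch
import torch.nn as nn

block = InvertedResidual(in_chs=16, out_chs=16, dw_kernel_size=3, stride=1,
                         exp_ratio=4.0, se_ratio=0.25, act_layer=nn.ReLU)
x = torch.randn(2, 16, 56, 56)
print(block(x).shape)        # torch.Size([2, 16, 56, 56])
print(block.has_residual)    # True: in_chs == out_chs and stride == 1
```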
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
# Written by Hao Du and Houwen Peng
# email: haodu8-c@my.cityu.edu.hk and houwen.peng@microsoft.com
import torch
import torch.nn as nn
import torch.nn.functional as F
def conv3x3(in_planes, out_planes, stride=1):
"3x3 convolution with padding"
return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
padding=1, bias=True)
class BasicBlock(nn.Module):
expansion = 1
def __init__(self, inplanes, planes, stride=1, downsample=None):
super(BasicBlock, self).__init__()
self.conv1 = conv3x3(inplanes, planes, stride)
self.bn1 = nn.BatchNorm2d(planes)
self.relu = nn.ReLU(inplace=True)
self.conv2 = conv3x3(planes, planes)
self.bn2 = nn.BatchNorm2d(planes)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
class Bottleneck(nn.Module):
def __init__(self, inplanes, planes, stride=1, expansion=4):
super(Bottleneck, self).__init__()
planes = int(planes / expansion)
self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=True)
self.bn1 = nn.BatchNorm2d(planes)
self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
padding=1, bias=True)
self.bn2 = nn.BatchNorm2d(planes)
self.conv3 = nn.Conv2d(
planes,
planes * expansion,
kernel_size=1,
bias=True)
self.bn3 = nn.BatchNorm2d(planes * expansion)
self.relu = nn.ReLU(inplace=True)
self.stride = stride
self.expansion = expansion
if inplanes != planes * self.expansion:
self.downsample = nn.Sequential(
nn.Conv2d(inplanes, planes * self.expansion,
kernel_size=1, stride=stride, bias=True),
nn.BatchNorm2d(planes * self.expansion),
)
else:
self.downsample = None
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
def get_Bottleneck(in_c, out_c, stride):
return Bottleneck(in_c, out_c, stride=stride)
def get_BasicBlock(in_c, out_c, stride):
return BasicBlock(in_c, out_c, stride=stride)
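The two helpers above return ready-to-use ResNet-style blocks. Note that get_BasicBlock never builds a downsample branch, so it is only shape-safe when in_c == out_c and stride == 1; get_Bottleneck adds a 1x1 projection automatically. A small sketch:

```python
# Minimal sketch: Bottleneck uses expansion=4 internally, so
# get_Bottleneck(in_c, out_c, stride) maps in_c -> out_c channels and adds a
# 1x1 downsample when shapes differ. BasicBlock has no downsample here, so it
# is used with matching channels and stride 1.
import torch

bottleneck = get_Bottleneck(64, 128, stride=2)
basic = get_BasicBlock(32, 32, stride=1)

print(bottleneck(torch.randn(1, 64, 56, 56)).shape)   # torch.Size([1, 128, 28, 28])
print(basic(torch.randn(1, 32, 56, 56)).shape)        # torch.Size([1, 32, 56, 56])
```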
from lib.utils.util import *
from timm.models.efficientnet_blocks import *
class ChildNetBuilder:
def __init__(
self,
channel_multiplier=1.0,
channel_divisor=8,
channel_min=None,
output_stride=32,
pad_type='',
act_layer=None,
se_kwargs=None,
norm_layer=nn.BatchNorm2d,
norm_kwargs=None,
drop_path_rate=0.,
feature_location='',
verbose=False,
logger=None):
self.channel_multiplier = channel_multiplier
self.channel_divisor = channel_divisor
self.channel_min = channel_min
self.output_stride = output_stride
self.pad_type = pad_type
self.act_layer = act_layer
self.se_kwargs = se_kwargs
self.norm_layer = norm_layer
self.norm_kwargs = norm_kwargs
self.drop_path_rate = drop_path_rate
self.feature_location = feature_location
assert feature_location in ('pre_pwl', 'post_exp', '')
self.verbose = verbose
self.in_chs = None
self.features = OrderedDict()
self.logger = logger
def _round_channels(self, chs):
return round_channels(
chs,
self.channel_multiplier,
self.channel_divisor,
self.channel_min)
def _make_block(self, ba, block_idx, block_count):
drop_path_rate = self.drop_path_rate * block_idx / block_count
bt = ba.pop('block_type')
ba['in_chs'] = self.in_chs
ba['out_chs'] = self._round_channels(ba['out_chs'])
if 'fake_in_chs' in ba and ba['fake_in_chs']:
ba['fake_in_chs'] = self._round_channels(ba['fake_in_chs'])
ba['norm_layer'] = self.norm_layer
ba['norm_kwargs'] = self.norm_kwargs
ba['pad_type'] = self.pad_type
# block act fn overrides the model default
ba['act_layer'] = ba['act_layer'] if ba['act_layer'] is not None else self.act_layer
assert ba['act_layer'] is not None
if bt == 'ir':
ba['drop_path_rate'] = drop_path_rate
ba['se_kwargs'] = self.se_kwargs
if self.verbose:
self.logger.info(
' InvertedResidual {}, Args: {}'.format(
block_idx, str(ba)))
block = InvertedResidual(**ba)
elif bt == 'ds' or bt == 'dsa':
ba['drop_path_rate'] = drop_path_rate
ba['se_kwargs'] = self.se_kwargs
if self.verbose:
self.logger.info(
' DepthwiseSeparable {}, Args: {}'.format(
block_idx, str(ba)))
block = DepthwiseSeparableConv(**ba)
elif bt == 'cn':
if self.verbose:
self.logger.info(
' ConvBnAct {}, Args: {}'.format(
block_idx, str(ba)))
block = ConvBnAct(**ba)
else:
assert False, 'Unknown block type (%s) while building model.' % bt
self.in_chs = ba['out_chs'] # update in_chs for arg of next block
return block
def __call__(self, in_chs, model_block_args):
""" Build the blocks
Args:
in_chs: Number of input-channels passed to first block
model_block_args: A list of lists, outer list defines stages, inner
list contains strings defining block configuration(s)
Return:
List of block stacks (each stack wrapped in nn.Sequential)
"""
if self.verbose:
self.logger.info(
'Building model trunk with %d stages...' %
len(model_block_args))
self.in_chs = in_chs
total_block_count = sum([len(x) for x in model_block_args])
total_block_idx = 0
current_stride = 2
current_dilation = 1
feature_idx = 0
stages = []
# outer list of block_args defines the stacks ('stages' by some
# conventions)
for stage_idx, stage_block_args in enumerate(model_block_args):
last_stack = stage_idx == (len(model_block_args) - 1)
if self.verbose:
self.logger.info('Stack: {}'.format(stage_idx))
assert isinstance(stage_block_args, list)
blocks = []
# each stack (stage) contains a list of block arguments
for block_idx, block_args in enumerate(stage_block_args):
last_block = block_idx == (len(stage_block_args) - 1)
extract_features = '' # No features extracted
if self.verbose:
self.logger.info(' Block: {}'.format(block_idx))
# Sort out stride, dilation, and feature extraction details
assert block_args['stride'] in (1, 2)
if block_idx >= 1:
# only the first block in any stack can have a stride > 1
block_args['stride'] = 1
do_extract = False
if self.feature_location == 'pre_pwl':
if last_block:
next_stage_idx = stage_idx + 1
if next_stage_idx >= len(model_block_args):
do_extract = True
else:
do_extract = model_block_args[next_stage_idx][0]['stride'] > 1
elif self.feature_location == 'post_exp':
if block_args['stride'] > 1 or (last_stack and last_block):
do_extract = True
if do_extract:
extract_features = self.feature_location
next_dilation = current_dilation
if block_args['stride'] > 1:
next_output_stride = current_stride * block_args['stride']
if next_output_stride > self.output_stride:
next_dilation = current_dilation * block_args['stride']
block_args['stride'] = 1
if self.verbose:
self.logger.info(
' Converting stride to dilation to maintain output_stride=={}'.format(
self.output_stride))
else:
current_stride = next_output_stride
block_args['dilation'] = current_dilation
if next_dilation != current_dilation:
current_dilation = next_dilation
# create the block
block = self._make_block(
block_args, total_block_idx, total_block_count)
blocks.append(block)
# stash feature module name and channel info for model feature
# extraction
if extract_features:
feature_module = block.feature_module(extract_features)
if feature_module:
feature_module = 'blocks.{}.{}.'.format(
stage_idx, block_idx) + feature_module
feature_channels = block.feature_channels(extract_features)
self.features[feature_idx] = dict(
name=feature_module,
num_chs=feature_channels
)
feature_idx += 1
# incr global block idx (across all stacks)
total_block_idx += 1
stages.append(nn.Sequential(*blocks))
return stages
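ChildNetBuilder consumes per-block argument dicts, normally produced by timm's decode_arch_def from strings such as 'ir_r1_k3_s2_e4_c24_se0.25' (the same notation gen_childnet rewrites later in this commit). A minimal sketch, assuming an older timm that exposes decode_arch_def in timm.models.efficientnet_builder (matching the other timm imports in the commit) and a hypothetical three-stage arch_def:

```python
# Minimal sketch: decode a small, made-up arch_def and let ChildNetBuilder
# (defined above) turn it into a list of nn.Sequential stages.
import torch
import torch.nn as nn
from timm.models.efficientnet_builder import decode_arch_def  # older timm API

arch_def = [
    ['ds_r1_k3_s1_e1_c16_se0.25'],   # stage 0: one depthwise-separable block
    ['ir_r2_k3_s2_e4_c24_se0.25'],   # stage 1: two inverted residual blocks
    ['ir_r2_k5_s2_e6_c40_se0.25'],   # stage 2
]
builder = ChildNetBuilder(act_layer=nn.ReLU)
stages = builder(16, decode_arch_def(arch_def))   # 16 = stem output channels

trunk = nn.Sequential(*stages)
x = torch.randn(1, 16, 112, 112)   # as if produced by a stride-2 stem
print(trunk(x).shape)              # torch.Size([1, 40, 28, 28])
```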
from copy import deepcopy
from lib.utils.builder_util import modify_block_args
from lib.models.blocks import get_Bottleneck, InvertedResidual
from timm.models.efficientnet_blocks import *
from nni.nas.pytorch import mutables
class SuperNetBuilder:
""" Build Trunk Blocks
"""
def __init__(
self,
choices,
channel_multiplier=1.0,
channel_divisor=8,
channel_min=None,
output_stride=32,
pad_type='',
act_layer=None,
se_kwargs=None,
norm_layer=nn.BatchNorm2d,
norm_kwargs=None,
drop_path_rate=0.,
feature_location='',
verbose=False,
resunit=False,
dil_conv=False,
logger=None):
# dict
# choices = {'kernel_size': [3, 5, 7], 'exp_ratio': [4, 6]}
self.choices = [[x, y] for x in choices['kernel_size']
for y in choices['exp_ratio']]
self.choices_num = len(self.choices) - 1
self.channel_multiplier = channel_multiplier
self.channel_divisor = channel_divisor
self.channel_min = channel_min
self.output_stride = output_stride
self.pad_type = pad_type
self.act_layer = act_layer
self.se_kwargs = se_kwargs
self.norm_layer = norm_layer
self.norm_kwargs = norm_kwargs
self.drop_path_rate = drop_path_rate
self.feature_location = feature_location
assert feature_location in ('pre_pwl', 'post_exp', '')
self.verbose = verbose
self.resunit = resunit
self.dil_conv = dil_conv
self.logger = logger
# state updated during build, consumed by model
self.in_chs = None
def _round_channels(self, chs):
return round_channels(
chs,
self.channel_multiplier,
self.channel_divisor,
self.channel_min)
def _make_block(
self,
ba,
choice_idx,
block_idx,
block_count,
resunit=False,
dil_conv=False):
drop_path_rate = self.drop_path_rate * block_idx / block_count
bt = ba.pop('block_type')
ba['in_chs'] = self.in_chs
ba['out_chs'] = self._round_channels(ba['out_chs'])
if 'fake_in_chs' in ba and ba['fake_in_chs']:
# FIXME this is a hack to work around mismatch in origin impl input
# filters
ba['fake_in_chs'] = self._round_channels(ba['fake_in_chs'])
ba['norm_layer'] = self.norm_layer
ba['norm_kwargs'] = self.norm_kwargs
ba['pad_type'] = self.pad_type
# block act fn overrides the model default
ba['act_layer'] = ba['act_layer'] if ba['act_layer'] is not None else self.act_layer
assert ba['act_layer'] is not None
if bt == 'ir':
ba['drop_path_rate'] = drop_path_rate
ba['se_kwargs'] = self.se_kwargs
if self.verbose:
self.logger.info(
' InvertedResidual {}, Args: {}'.format(
block_idx, str(ba)))
block = InvertedResidual(**ba)
elif bt == 'ds' or bt == 'dsa':
ba['drop_path_rate'] = drop_path_rate
ba['se_kwargs'] = self.se_kwargs
if self.verbose:
self.logger.info(
' DepthwiseSeparable {}, Args: {}'.format(
block_idx, str(ba)))
block = DepthwiseSeparableConv(**ba)
elif bt == 'cn':
if self.verbose:
self.logger.info(
' ConvBnAct {}, Args: {}'.format(
block_idx, str(ba)))
block = ConvBnAct(**ba)
else:
assert False, 'Unknown block type (%s) while building model.' % bt
if choice_idx == self.choice_num - 1:
self.in_chs = ba['out_chs'] # update in_chs for arg of next block
return block
def __call__(self, in_chs, model_block_args):
""" Build the blocks
Args:
in_chs: Number of input-channels passed to first block
model_block_args: A list of lists, outer list defines stages, inner
list contains strings defining block configuration(s)
Return:
List of block stacks (each stack wrapped in nn.Sequential)
"""
if self.verbose:
self.logger.info('Building model trunk with %d stages...' % len(model_block_args))
self.in_chs = in_chs
total_block_count = sum([len(x) for x in model_block_args])
total_block_idx = 0
current_stride = 2
current_dilation = 1
feature_idx = 0
stages = []
# outer list of block_args defines the stacks ('stages' by some conventions)
for stage_idx, stage_block_args in enumerate(model_block_args):
last_stack = stage_idx == (len(model_block_args) - 1)
if self.verbose:
self.logger.info('Stack: {}'.format(stage_idx))
assert isinstance(stage_block_args, list)
# blocks = []
# each stack (stage) contains a list of block arguments
for block_idx, block_args in enumerate(stage_block_args):
last_block = block_idx == (len(stage_block_args) - 1)
if self.verbose:
self.logger.info(' Block: {}'.format(block_idx))
# Sort out stride, dilation, and feature extraction details
assert block_args['stride'] in (1, 2)
if block_idx >= 1:
# only the first block in any stack can have a stride > 1
block_args['stride'] = 1
next_dilation = current_dilation
if block_args['stride'] > 1:
next_output_stride = current_stride * block_args['stride']
if next_output_stride > self.output_stride:
next_dilation = current_dilation * block_args['stride']
block_args['stride'] = 1
else:
current_stride = next_output_stride
block_args['dilation'] = current_dilation
if next_dilation != current_dilation:
current_dilation = next_dilation
if stage_idx==0 or stage_idx==6:
self.choice_num = 1
else:
self.choice_num = len(self.choices)
if self.dil_conv:
self.choice_num += 2
choice_blocks = []
block_args_copy = deepcopy(block_args)
if self.choice_num == 1:
# create the block
block = self._make_block(block_args, 0, total_block_idx, total_block_count)
choice_blocks.append(block)
else:
for choice_idx, choice in enumerate(self.choices):
# create the block
block_args = deepcopy(block_args_copy)
block_args = modify_block_args(block_args, choice[0], choice[1])
block = self._make_block(block_args, choice_idx, total_block_idx, total_block_count)
choice_blocks.append(block)
if self.dil_conv:
block_args = deepcopy(block_args_copy)
block_args = modify_block_args(block_args, 3, 0)
block = self._make_block(block_args, self.choice_num - 2, total_block_idx, total_block_count,
resunit=self.resunit, dil_conv=self.dil_conv)
choice_blocks.append(block)
block_args = deepcopy(block_args_copy)
block_args = modify_block_args(block_args, 5, 0)
block = self._make_block(block_args, self.choice_num - 1, total_block_idx, total_block_count,
resunit=self.resunit, dil_conv=self.dil_conv)
choice_blocks.append(block)
if self.resunit:
block = get_Bottleneck(block.conv_pw.in_channels,
block.conv_pwl.out_channels,
block.conv_dw.stride[0])
choice_blocks.append(block)
choice_block = mutables.LayerChoice(choice_blocks)
stages.append(choice_block)
# create the block
# block = self._make_block(block_args, total_block_idx, total_block_count)
total_block_idx += 1 # incr global block idx (across all stacks)
# stages.append(blocks)
return stages
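The net effect of the loop above is that every searchable position in the trunk becomes an NNI LayerChoice over one block per (kernel_size, exp_ratio) pair, plus two dilated variants when DIL_CONV is set and an optional bottleneck when RESUNIT is set; the first and last stages keep a single fixed block. The candidate enumeration itself is tiny:

```python
# Minimal sketch: the candidate list built in SuperNetBuilder.__init__ above.
# Each searchable layer wraps one block per pair in mutables.LayerChoice so an
# NNI mutator can pick which one is executed.
choices = {'kernel_size': [3, 5, 7], 'exp_ratio': [4, 6]}
pairs = [[k, e] for k in choices['kernel_size'] for e in choices['exp_ratio']]
print(pairs)       # [[3, 4], [3, 6], [5, 4], [5, 6], [7, 4], [7, 6]]
print(len(pairs))  # 6; +2 dilated variants when DIL_CONV is enabled
```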
from lib.utils.builder_util import *
from lib.models.builders.build_childnet import *
from timm.models.layers import SelectAdaptivePool2d
from timm.models.layers.activations import hard_sigmoid
class ChildNet(nn.Module):
def __init__(
self,
block_args,
num_classes=1000,
in_chans=3,
stem_size=16,
num_features=1280,
head_bias=True,
channel_multiplier=1.0,
pad_type='',
act_layer=nn.ReLU,
drop_rate=0.,
drop_path_rate=0.,
se_kwargs=None,
norm_layer=nn.BatchNorm2d,
norm_kwargs=None,
global_pool='avg',
logger=None,
verbose=False):
super(ChildNet, self).__init__()
self.num_classes = num_classes
self.num_features = num_features
self.drop_rate = drop_rate
self._in_chs = in_chans
self.logger = logger
# Stem
stem_size = round_channels(stem_size, channel_multiplier)
self.conv_stem = create_conv2d(
self._in_chs, stem_size, 3, stride=2, padding=pad_type)
self.bn1 = norm_layer(stem_size, **norm_kwargs)
self.act1 = act_layer(inplace=True)
self._in_chs = stem_size
# Middle stages (IR/ER/DS Blocks)
builder = ChildNetBuilder(
channel_multiplier, 8, None, 32, pad_type, act_layer, se_kwargs,
norm_layer, norm_kwargs, drop_path_rate, verbose=verbose)
self.blocks = nn.Sequential(*builder(self._in_chs, block_args))
# self.blocks = builder(self._in_chs, block_args)
self._in_chs = builder.in_chs
# Head + Pooling
self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
self.conv_head = create_conv2d(
self._in_chs,
self.num_features,
1,
padding=pad_type,
bias=head_bias)
self.act2 = act_layer(inplace=True)
# Classifier
self.classifier = nn.Linear(
self.num_features *
self.global_pool.feat_mult(),
self.num_classes)
efficientnet_init_weights(self)
def get_classifier(self):
return self.classifier
def reset_classifier(self, num_classes, global_pool='avg'):
self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
self.num_classes = num_classes
self.classifier = nn.Linear(
self.num_features * self.global_pool.feat_mult(),
num_classes) if self.num_classes else None
def forward_features(self, x):
# architecture = [[0], [], [], [], [], [0]]
x = self.conv_stem(x)
x = self.bn1(x)
x = self.act1(x)
x = self.blocks(x)
x = self.global_pool(x)
x = self.conv_head(x)
x = self.act2(x)
return x
def forward(self, x):
x = self.forward_features(x)
x = x.flatten(1)
if self.drop_rate > 0.:
x = F.dropout(x, p=self.drop_rate, training=self.training)
x = self.classifier(x)
return x
def gen_childnet(arch_list, arch_def, **kwargs):
# arch_list = [[0], [], [], [], [], [0]]
choices = {'kernel_size': [3, 5, 7], 'exp_ratio': [4, 6]}
choices_list = [[x, y] for x in choices['kernel_size']
for y in choices['exp_ratio']]
num_features = 1280
# act_layer = HardSwish
act_layer = Swish
new_arch = []
# change to child arch_def
for i, (layer_choice, layer_arch) in enumerate(zip(arch_list, arch_def)):
if len(layer_arch) == 1:
new_arch.append(layer_arch)
continue
else:
new_layer = []
for j, (block_choice, block_arch) in enumerate(
zip(layer_choice, layer_arch)):
kernel_size, exp_ratio = choices_list[block_choice]
elements = block_arch.split('_')
block_arch = block_arch.replace(
elements[2], 'k{}'.format(str(kernel_size)))
block_arch = block_arch.replace(
elements[4], 'e{}'.format(str(exp_ratio)))
new_layer.append(block_arch)
new_arch.append(new_layer)
model_kwargs = dict(
block_args=decode_arch_def(new_arch),
num_features=num_features,
stem_size=16,
norm_kwargs=resolve_bn_args(kwargs),
act_layer=act_layer,
se_kwargs=dict(
act_layer=nn.ReLU,
gate_fn=hard_sigmoid,
reduce_mid=True,
divisor=8),
**kwargs,
)
model = ChildNet(**model_kwargs)
return model
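The per-block rewrite inside gen_childnet is easiest to see on one string: the third and fifth '_'-separated fields carry the kernel size and expansion ratio, and the chosen (kernel, ratio) pair is patched in. A worked example with a hypothetical block string:

```python
# Minimal worked example of the string rewrite in gen_childnet above:
# choice index -> (kernel_size, exp_ratio), then patch the 'k...' and 'e...'
# fields (indices 2 and 4 after splitting on '_').
choices_list = [[k, e] for k in (3, 5, 7) for e in (4, 6)]

block_arch = 'ir_r1_k3_s1_e4_c24_se0.25'   # hypothetical block definition
block_choice = 3                           # -> kernel_size=5, exp_ratio=6
kernel_size, exp_ratio = choices_list[block_choice]

elements = block_arch.split('_')
block_arch = block_arch.replace(elements[2], 'k{}'.format(kernel_size))
block_arch = block_arch.replace(elements[4], 'e{}'.format(exp_ratio))
print(block_arch)                          # 'ir_r1_k5_s1_e6_c24_se0.25'
```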