OpenDAS / nni

Commit e773dfcc, authored Mar 21, 2023 by qianyj

create branch for v2.9

Changes: 633 files. Showing 20 changed files with 1714 additions and 0 deletions (+1714, -0).
examples/feature_engineering/gradient_feature_selector/benchmark_test.py    +148  -0
examples/feature_engineering/gradient_feature_selector/sklearn_test.py       +57  -0
examples/feature_engineering/gradient_feature_selector/test_memory.py        +26  -0
examples/feature_engineering/gradient_feature_selector/test_time.py          +26  -0
examples/model_compress/.gitignore                                            +9  -0
examples/model_compress/auto_compress/torch/auto_compress_module.py         +129  -0
examples/model_compress/auto_compress/torch/auto_compress_torch.py           +50  -0
examples/model_compress/auto_compress/torch/mnist_pretrain_lenet.pth          +0  -0
examples/model_compress/end2end_compression.py                              +300  -0
examples/model_compress/experimental/compression_experiment/demo.py          +43  -0
examples/model_compress/experimental/compression_experiment/vessel.py        +99  -0
examples/model_compress/models/cifar10/resnet.py                            +115  -0
examples/model_compress/models/cifar10/vgg.py                                +63  -0
examples/model_compress/models/mnist/lenet.py                                +29  -0
examples/model_compress/models/mnist/naive.py                                +28  -0
examples/model_compress/models/mobilenet.py                                  +83  -0
examples/model_compress/models/mobilenet_v2.py                              +131  -0
examples/model_compress/pruning/activation_pruning_torch.py                 +142  -0
examples/model_compress/pruning/admm_pruning_torch.py                       +138  -0
examples/model_compress/pruning/amc_pruning_torch.py                         +98  -0
examples/feature_engineering/gradient_feature_selector/benchmark_test.py (new file, mode 100644)

# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

import bz2
import urllib.request
import numpy as np
import datetime
import line_profiler
profile = line_profiler.LineProfiler()

import os

from sklearn.datasets import load_svmlight_file
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

from nni.algorithms.feature_engineering.gradient_selector import FeatureGradientSelector


class Benchmark():

    def __init__(self, files=None, test_size=0.2):
        self.files = files
        self.test_size = test_size

    def run_all_test(self, pipeline):
        for file_name in self.files:
            file_path = self.files[file_name]
            self.run_test(pipeline, file_name, file_path)

    def run_test(self, pipeline, name, path):
        print("download " + name)
        update_name = self.download(name, path)
        X, y = load_svmlight_file(update_name)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=self.test_size, random_state=42)
        pipeline.fit(X_train, y_train)
        print("[Benchmark " + name + " Score]: ", pipeline.score(X_test, y_test))

    def download(self, name, path):
        old_name = name + '_train.bz2'
        update_name = name + '_train.svm'
        if os.path.exists(old_name) and os.path.exists(update_name):
            return update_name
        urllib.request.urlretrieve(path, filename=old_name)
        f_svm = open(update_name, 'wt')
        with bz2.open(old_name, 'rb') as f_zip:
            data = f_zip.read()
            f_svm.write(data.decode('utf-8'))
        f_svm.close()
        return update_name


@profile
def test_memory(pipeline_name, name, path):
    if pipeline_name == "LR":
        pipeline = make_pipeline(LogisticRegression())
    if pipeline_name == "FGS":
        pipeline = make_pipeline(FeatureGradientSelector(), LogisticRegression())
    if pipeline_name == "Tree":
        pipeline = make_pipeline(SelectFromModel(ExtraTreesClassifier(n_estimators=50)), LogisticRegression())

    test_benchmark = Benchmark()
    print("Dataset:\t", name)
    print("Pipeline:\t", pipeline_name)
    test_benchmark.run_test(pipeline, name, path)
    print("")


def test_time(pipeline_name, name, path):
    if pipeline_name == "LR":
        pipeline = make_pipeline(LogisticRegression())
    if pipeline_name == "FGS":
        pipeline = make_pipeline(FeatureGradientSelector(), LogisticRegression())
    if pipeline_name == "Tree":
        pipeline = make_pipeline(SelectFromModel(ExtraTreesClassifier(n_estimators=50)), LogisticRegression())

    test_benchmark = Benchmark()
    print("Dataset:\t", name)
    print("Pipeline:\t", pipeline_name)
    starttime = datetime.datetime.now()
    test_benchmark.run_test(pipeline, name, path)
    endtime = datetime.datetime.now()
    print("Used time: ", (endtime - starttime).microseconds / 1000)
    print("")


if __name__ == "__main__":
    LIBSVM_DATA = {
        "rcv1": "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/rcv1_train.binary.bz2",
        "colon-cancer": "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/covtype.libsvm.binary.bz2",
        "gisette": "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/gisette_scale.bz2",
        "news20.binary": "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/news20.binary.bz2",
        "real-sim": "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/real-sim.bz2",
        "webspam": "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/webspam_wc_normalized_trigram.svm.bz2",
        "avazu": "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/avazu-app.bz2"
    }

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--pipeline_name', type=str, help='display pipeline_name.')
    parser.add_argument('--name', type=str, help='display name.')
    parser.add_argument('--object', type=str, help='display test object: time or memory.')
    args = parser.parse_args()

    pipeline_name = args.pipeline_name
    name = args.name
    test_object = args.object
    path = LIBSVM_DATA[name]

    if test_object == 'time':
        test_time(pipeline_name, name, path)
    elif test_object == 'memory':
        test_memory(pipeline_name, name, path)
    else:
        print("Not support test object.\t", test_object)

    print("Done.")
examples/feature_engineering/gradient_feature_selector/sklearn_test.py (new file, mode 100644)

# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

import bz2
import urllib.request
import numpy as np

from sklearn.datasets import load_svmlight_file
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

from nni.algorithms.feature_engineering.gradient_selector import FeatureGradientSelector


def test():
    url_zip_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/rcv1_train.binary.bz2'
    urllib.request.urlretrieve(url_zip_train, filename='train.bz2')

    f_svm = open('train.svm', 'wt')
    with bz2.open('train.bz2', 'rb') as f_zip:
        data = f_zip.read()
        f_svm.write(data.decode('utf-8'))
    f_svm.close()

    X, y = load_svmlight_file('train.svm')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

    pipeline = make_pipeline(FeatureGradientSelector(n_epochs=1, n_features=10), LogisticRegression())
    # pipeline = make_pipeline(SelectFromModel(ExtraTreesClassifier(n_estimators=50)), LogisticRegression())

    pipeline.fit(X_train, y_train)

    print("Pipeline Score: ", pipeline.score(X_train, y_train))


if __name__ == "__main__":
    test()
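Note: the pipeline above treats FeatureGradientSelector as an ordinary scikit-learn transformer. Below is a minimal standalone sketch (not part of the committed file) that inspects the selected feature indices directly; it reuses the X_train/y_train split produced in test() and assumes the selector exposes get_selected_features() as described in the NNI feature-engineering documentation.

from nni.algorithms.feature_engineering.gradient_selector import FeatureGradientSelector

# fit the selector on its own, then read back the chosen column indices
fgs = FeatureGradientSelector(n_epochs=1, n_features=10)
fgs.fit(X_train, y_train)            # X_train / y_train as produced in test() above
print(fgs.get_selected_features())   # assumed API: list of selected feature indices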
examples/feature_engineering/gradient_feature_selector/test_memory.py (new file, mode 100644)

import os

LIBSVM_DATA = {
    "rcv1": "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/rcv1_train.binary.bz2",
    "colon-cancer": "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/covtype.libsvm.binary.bz2",
    "gisette": "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/gisette_scale.bz2",
    "news20.binary": "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/news20.binary.bz2",
    "real-sim": "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/real-sim.bz2",
    "avazu": "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/avazu-app.bz2",
}

pipeline_name = "Tree"
device = "CUDA_VISIBLE_DEVICES=0 "
script = "setsid python -m memory_profiler benchmark_test.py "
test_object = "memory"

for name in LIBSVM_DATA:
    log_name = "_".join([pipeline_name, name, test_object])
    command = device + script + "--pipeline_name " + pipeline_name + " --name " + name + " --object " + test_object + " >" + log_name + " 2>&1 &"
    print("command is\t", command)
    os.system(command)
    print("log is here\t", log_name)

print("Done.")
examples/feature_engineering/gradient_feature_selector/test_time.py (new file, mode 100644)

import os

LIBSVM_DATA = {
    "rcv1": "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/rcv1_train.binary.bz2",
    "colon-cancer": "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/covtype.libsvm.binary.bz2",
    "gisette": "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/gisette_scale.bz2",
    "news20.binary": "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/news20.binary.bz2",
    "real-sim": "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/real-sim.bz2",
    "avazu": "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/avazu-app.bz2",
}

pipeline_name = "LR"
device = "CUDA_VISIBLE_DEVICES=0 "
script = "setsid python benchmark_test.py "
test_object = "time"

for name in LIBSVM_DATA:
    log_name = "_".join([pipeline_name, name, test_object])
    command = device + script + "--pipeline_name " + pipeline_name + " --name " + name + " --object " + test_object + " >" + log_name + " 2>&1 &"
    print("command is\t", command)
    os.system(command)
    print("log is here\t", log_name)

print("Done.")
examples/model_compress/.gitignore (new file, mode 100644)

.pth
.tar.gz
data/
MNIST/
cifar-10-batches-py/
experiment_data/
pruning/models
pruning/pruning_log
\ No newline at end of file
examples/model_compress/auto_compress/torch/auto_compress_module.py (new file, mode 100644)

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from typing import Callable, Optional, Iterable

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchvision import datasets, transforms

from nni.algorithms.compression.pytorch.auto_compress import AbstractAutoCompressionModule

torch.manual_seed(1)


class LeNet(nn.Module):
    def __init__(self):
        super(LeNet, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout2d(0.25)
        self.dropout2 = nn.Dropout2d(0.5)
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output


_use_cuda = torch.cuda.is_available()

_train_kwargs = {'batch_size': 64}
_test_kwargs = {'batch_size': 1000}
if _use_cuda:
    _cuda_kwargs = {'num_workers': 1, 'pin_memory': True, 'shuffle': True}
    _train_kwargs.update(_cuda_kwargs)
    _test_kwargs.update(_cuda_kwargs)

_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
])

_device = torch.device("cuda" if _use_cuda else "cpu")

_train_loader = None
_test_loader = None


def _train(model, optimizer, criterion, epoch):
    global _train_loader
    if _train_loader is None:
        dataset = datasets.MNIST('./data', train=True, download=True, transform=_transform)
        _train_loader = torch.utils.data.DataLoader(dataset, **_train_kwargs)
    model.train()
    for data, target in _train_loader:
        data, target = data.to(_device), target.to(_device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()


def _test(model):
    global _test_loader
    if _test_loader is None:
        dataset = datasets.MNIST('./data', train=False, transform=_transform)
        _test_loader = torch.utils.data.DataLoader(dataset, **_test_kwargs)
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in _test_loader:
            data, target = data.to(_device), target.to(_device)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item()
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()
    test_loss /= len(_test_loader.dataset)
    acc = 100 * correct / len(_test_loader.dataset)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(_test_loader.dataset), acc))
    return acc


_model = LeNet().to(_device)
_model.load_state_dict(torch.load('mnist_pretrain_lenet.pth'))


class AutoCompressionModule(AbstractAutoCompressionModule):
    @classmethod
    def model(cls) -> nn.Module:
        return _model

    @classmethod
    def evaluator(cls) -> Callable[[nn.Module], float]:
        return _test

    @classmethod
    def optimizer_factory(cls) -> Optional[Callable[[Iterable], optim.Optimizer]]:
        def _optimizer_factory(params: Iterable):
            return torch.optim.SGD(params, lr=0.01)
        return _optimizer_factory

    @classmethod
    def criterion(cls) -> Optional[Callable]:
        return F.nll_loss

    @classmethod
    def sparsifying_trainer(cls, compress_algorithm_name: str) -> Optional[Callable[[nn.Module, optim.Optimizer, Callable, int], None]]:
        return _train

    @classmethod
    def post_compress_finetuning_trainer(cls, compress_algorithm_name: str) -> Optional[Callable[[nn.Module, optim.Optimizer, Callable, int], None]]:
        return _train

    @classmethod
    def post_compress_finetuning_epochs(cls, compress_algorithm_name: str) -> int:
        return 2
examples/model_compress/auto_compress/torch/auto_compress_torch.py (new file, mode 100644)

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from pathlib import Path

from nni.algorithms.compression.pytorch.auto_compress import AutoCompressionExperiment, AutoCompressionSearchSpaceGenerator

from auto_compress_module import AutoCompressionModule

generator = AutoCompressionSearchSpaceGenerator()
generator.add_config('level', [
    {
        "sparsity": {
            "_type": "uniform",
            "_value": [0.01, 0.99]
        },
        'op_types': ['default']
    }
])
generator.add_config('l1', [
    {
        "sparsity": {
            "_type": "uniform",
            "_value": [0.01, 0.99]
        },
        'op_types': ['Conv2d']
    }
])
generator.add_config('qat', [
    {
        'quant_types': ['weight', 'output'],
        'quant_bits': {
            'weight': 8,
            'output': 8
        },
        'op_types': ['Conv2d', 'Linear']
    }])
search_space = generator.dumps()

experiment = AutoCompressionExperiment(AutoCompressionModule, 'local')
experiment.config.experiment_name = 'auto compression torch example'
experiment.config.trial_concurrency = 1
experiment.config.max_trial_number = 10
experiment.config.search_space = search_space
experiment.config.trial_code_directory = Path(__file__).parent
experiment.config.tuner.name = 'TPE'
experiment.config.tuner.class_args['optimize_mode'] = 'maximize'
experiment.config.training_service.use_active_gpu = True

experiment.run(8088)
examples/model_compress/auto_compress/torch/mnist_pretrain_lenet.pth (new file, mode 100644)

File added (binary).
examples/model_compress/end2end_compression.py (new file, mode 100644)

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

"""
NNI example for combined pruning and quantization to compress a model.
In this example, we show the compression process to first prune a model, then quantize the pruned model.
"""

import argparse
import os
import time

import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
from torchvision import datasets, transforms

from nni.compression.pytorch.utils import count_flops_params
from nni.compression.pytorch import ModelSpeedup
from nni.algorithms.compression.pytorch.pruning import L1FilterPruner
from nni.algorithms.compression.pytorch.quantization import QAT_Quantizer

from models.mnist.naive import NaiveModel
from nni.compression.pytorch.quantization_speedup import ModelSpeedupTensorRT


def get_model_time_cost(model, dummy_input):
    model.eval()
    n_times = 100
    time_list = []
    for _ in range(n_times):
        torch.cuda.synchronize()
        tic = time.time()
        _ = model(dummy_input)
        torch.cuda.synchronize()
        time_list.append(time.time() - tic)
    time_list = time_list[10:]
    return sum(time_list) / len(time_list)


def train(args, model, device, train_loader, criterion, optimizer, epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % args.log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
            if args.dry_run:
                break


def test(args, model, device, criterion, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += criterion(output, target).item()
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)
    acc = 100 * correct / len(test_loader.dataset)

    print('Test Loss: {:.6f}  Accuracy: {}%\n'.format(test_loss, acc))

    return acc


def test_trt(engine, test_loader):
    test_loss = 0
    correct = 0
    time_elasped = 0
    for data, target in test_loader:
        output, time = engine.inference(data)
        test_loss += F.nll_loss(output, target, reduction='sum').item()
        pred = output.argmax(dim=1, keepdim=True)
        correct += pred.eq(target.view_as(pred)).sum().item()
        time_elasped += time
    test_loss /= len(test_loader.dataset)

    print('Loss: {}  Accuracy: {}%'.format(
        test_loss, 100 * correct / len(test_loader.dataset)))
    print("Inference elapsed_time (whole dataset): {}s".format(time_elasped))


def main(args):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    os.makedirs(args.experiment_data_dir, exist_ok=True)

    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])

    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('data', train=True, download=True, transform=transform),
        batch_size=64,)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('data', train=False, transform=transform),
        batch_size=1000)

    # Step1. Model Pretraining
    model = NaiveModel().to(device)
    criterion = torch.nn.NLLLoss()
    optimizer = optim.Adadelta(model.parameters(), lr=args.pretrain_lr)
    scheduler = StepLR(optimizer, step_size=1, gamma=0.7)
    flops, params, _ = count_flops_params(model, (1, 1, 28, 28), verbose=False)

    if args.pretrained_model_dir is None:
        args.pretrained_model_dir = os.path.join(args.experiment_data_dir, f'pretrained.pth')

        best_acc = 0
        for epoch in range(args.pretrain_epochs):
            train(args, model, device, train_loader, criterion, optimizer, epoch)
            scheduler.step()
            acc = test(args, model, device, criterion, test_loader)
            if acc > best_acc:
                best_acc = acc
                state_dict = model.state_dict()

        model.load_state_dict(state_dict)
        torch.save(state_dict, args.pretrained_model_dir)
        print(f'Model saved to {args.pretrained_model_dir}')
    else:
        state_dict = torch.load(args.pretrained_model_dir)
        model.load_state_dict(state_dict)
        best_acc = test(args, model, device, criterion, test_loader)

    dummy_input = torch.randn([1000, 1, 28, 28]).to(device)
    time_cost = get_model_time_cost(model, dummy_input)

    # 125.49 M, 0.85M, 93.29, 1.1012
    print(f'Pretrained model FLOPs {flops/1e6:.2f} M, #Params: {params/1e6:.2f}M, Accuracy: {best_acc: .2f}, Time Cost: {time_cost}')

    # Step2. Model Pruning
    config_list = [{
        'sparsity': args.sparsity,
        'op_types': ['Conv2d']
    }]

    kw_args = {}
    if args.dependency_aware:
        dummy_input = torch.randn([1000, 1, 28, 28]).to(device)
        print('Enable the dependency_aware mode')
        # note that not all pruners support the dependency_aware mode
        kw_args['dependency_aware'] = True
        kw_args['dummy_input'] = dummy_input

    pruner = L1FilterPruner(model, config_list, **kw_args)
    model = pruner.compress()
    pruner.get_pruned_weights()

    mask_path = os.path.join(args.experiment_data_dir, 'mask.pth')
    model_path = os.path.join(args.experiment_data_dir, 'pruned.pth')
    pruner.export_model(model_path=model_path, mask_path=mask_path)
    pruner._unwrap_model()  # unwrap all modules to normal state

    # Step3. Model Speedup
    m_speedup = ModelSpeedup(model, dummy_input, mask_path, device)
    m_speedup.speedup_model()
    print('model after speedup', model)

    flops, params, _ = count_flops_params(model, dummy_input, verbose=False)
    acc = test(args, model, device, criterion, test_loader)
    time_cost = get_model_time_cost(model, dummy_input)
    print(f'Pruned model FLOPs {flops/1e6:.2f} M, #Params: {params/1e6:.2f}M, Accuracy: {acc: .2f}, Time Cost: {time_cost}')

    # Step4. Model Finetuning
    optimizer = optim.Adadelta(model.parameters(), lr=args.pretrain_lr)
    scheduler = StepLR(optimizer, step_size=1, gamma=0.7)

    best_acc = 0
    for epoch in range(args.finetune_epochs):
        train(args, model, device, train_loader, criterion, optimizer, epoch)
        scheduler.step()
        acc = test(args, model, device, criterion, test_loader)
        if acc > best_acc:
            best_acc = acc
            state_dict = model.state_dict()

    model.load_state_dict(state_dict)
    save_path = os.path.join(args.experiment_data_dir, f'finetuned.pth')
    torch.save(state_dict, save_path)

    flops, params, _ = count_flops_params(model, dummy_input, verbose=True)
    time_cost = get_model_time_cost(model, dummy_input)

    # FLOPs 28.48 M, #Params: 0.18M, Accuracy: 89.03, Time Cost: 1.03
    print(f'Finetuned model FLOPs {flops/1e6:.2f} M, #Params: {params/1e6:.2f}M, Accuracy: {best_acc: .2f}, Time Cost: {time_cost}')
    print(f'Model saved to {save_path}')

    # Step5. Model Quantization via QAT
    config_list = [{
        'quant_types': ['weight', 'output'],
        'quant_bits': {'weight': 8, 'output': 8},
        'op_names': ['conv1']
    }, {
        'quant_types': ['output'],
        'quant_bits': {'output': 8},
        'op_names': ['relu1']
    }, {
        'quant_types': ['weight', 'output'],
        'quant_bits': {'weight': 8, 'output': 8},
        'op_names': ['conv2']
    }, {
        'quant_types': ['output'],
        'quant_bits': {'output': 8},
        'op_names': ['relu2']
    }]

    optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
    quantizer = QAT_Quantizer(model, config_list, optimizer)
    quantizer.compress()

    # Step6. Quantization Aware Training
    best_acc = 0
    for epoch in range(1):
        train(args, model, device, train_loader, criterion, optimizer, epoch)
        scheduler.step()
        acc = test(args, model, device, criterion, test_loader)
        if acc > best_acc:
            best_acc = acc
            state_dict = model.state_dict()

    calibration_path = os.path.join(args.experiment_data_dir, 'calibration.pth')
    calibration_config = quantizer.export_model(model_path, calibration_path)
    print("calibration_config: ", calibration_config)

    # Step7. Model Speedup
    batch_size = 32
    input_shape = (batch_size, 1, 28, 28)
    engine = ModelSpeedupTensorRT(model, input_shape, config=calibration_config, batchsize=32)
    engine.compress()

    test_trt(engine, test_loader)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='PyTorch Example for model compression')

    # dataset and model
    # parser.add_argument('--dataset', type=str, default='mnist',
    #                     help='dataset to use, mnist, cifar10 or imagenet')
    # parser.add_argument('--data-dir', type=str, default='./data/',
    #                     help='dataset directory')
    parser.add_argument('--pretrained-model-dir', type=str, default=None,
                        help='path to pretrained model')
    parser.add_argument('--pretrain-epochs', type=int, default=10,
                        help='number of epochs to pretrain the model')
    parser.add_argument('--pretrain-lr', type=float, default=1.0,
                        help='learning rate to pretrain the model')
    parser.add_argument('--experiment-data-dir', type=str, default='./experiment_data',
                        help='For saving output checkpoints')
    parser.add_argument('--log-interval', type=int, default=100, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--dry-run', action='store_true', default=False,
                        help='quickly check a single pass')
    # parser.add_argument('--multi-gpu', action='store_true', default=False,
    #                     help='run on multiple gpus')
    # parser.add_argument('--test-only', action='store_true', default=False,
    #                     help='run test only')

    # pruner
    # parser.add_argument('--pruner', type=str, default='l1filter',
    #                     choices=['level', 'l1filter', 'l2filter', 'slim', 'agp',
    #                              'fpgm', 'mean_activation', 'apoz', 'admm'],
    #                     help='pruner to use')
    parser.add_argument('--sparsity', type=float, default=0.5,
                        help='target overall sparsity')
    parser.add_argument('--dependency-aware', action='store_true', default=False,
                        help='toggle dependency-aware mode')

    # finetuning
    parser.add_argument('--finetune-epochs', type=int, default=5,
                        help='epochs to fine tune')
    # parser.add_argument('--kd', action='store_true', default=False,
    #                     help='quickly check a single pass')
    # parser.add_argument('--kd_T', type=float, default=4,
    #                     help='temperature for KD distillation')
    # parser.add_argument('--finetune-lr', type=float, default=0.5,
    #                     help='learning rate to finetune the model')

    # speedup
    # parser.add_argument('--speedup', action='store_true', default=False,
    #                     help='whether to speedup the pruned model')
    # parser.add_argument('--nni', action='store_true', default=False,
    #                     help="whether to tune the pruners using NNI tuners")

    args = parser.parse_args()
    main(args)
examples/model_compress/experimental/compression_experiment/demo.py (new file, mode 100644)

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from pathlib import Path

import torch
from torch.optim import Adam

import nni
from nni.compression.experiment.experiment import CompressionExperiment
from nni.compression.experiment.config import CompressionExperimentConfig, TaylorFOWeightPrunerConfig
from vessel import LeNet, finetuner, evaluator, trainer, criterion, device

model = LeNet().to(device)

# pre-training model
finetuner(model)

optimizer = nni.trace(Adam)(model.parameters())
dummy_input = torch.rand(16, 1, 28, 28).to(device)

# normal experiment setting, no need to set search_space and trial_command
config = CompressionExperimentConfig('local')
config.experiment_name = 'auto compression torch example'
config.trial_concurrency = 1
config.max_trial_number = 10
config.trial_code_directory = Path(__file__).parent
config.tuner.name = 'TPE'
config.tuner.class_args['optimize_mode'] = 'maximize'

# compression experiment specific setting
# single float value means the expected remaining ratio upper limit for flops & params, lower limit for metric
config.compression_setting.flops = 0.2
config.compression_setting.params = 0.5
config.compression_setting.module_types = ['Conv2d', 'Linear']
config.compression_setting.exclude_module_names = ['fc2']
config.compression_setting.pruners = [TaylorFOWeightPrunerConfig()]

experiment = CompressionExperiment(config, model, finetuner, evaluator, dummy_input, trainer, optimizer, criterion, device)

experiment.run(8080)
examples/model_compress/experimental/compression_experiment/vessel.py (new file, mode 100644)

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from torchvision import datasets, transforms

import nni


@nni.trace
class LeNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout2d(0.25)
        self.dropout2 = nn.Dropout2d(0.5)
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output


_use_cuda = True
device = torch.device("cuda" if _use_cuda else "cpu")

_train_kwargs = {'batch_size': 64}
_test_kwargs = {'batch_size': 1000}
if _use_cuda:
    _cuda_kwargs = {'num_workers': 1, 'pin_memory': True, 'shuffle': True}
    _train_kwargs.update(_cuda_kwargs)
    _test_kwargs.update(_cuda_kwargs)

_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
])

_train_loader = None
_test_loader = None


def trainer(model, optimizer, criterion):
    global _train_loader
    if _train_loader is None:
        dataset = datasets.MNIST('./data', train=True, download=True, transform=_transform)
        _train_loader = torch.utils.data.DataLoader(dataset, **_train_kwargs)
    model.train()
    for data, target in _train_loader:
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()


def evaluator(model):
    global _test_loader
    if _test_loader is None:
        dataset = datasets.MNIST('./data', train=False, transform=_transform, download=True)
        _test_loader = torch.utils.data.DataLoader(dataset, **_test_kwargs)
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in _test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item()
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()
    test_loss /= len(_test_loader.dataset)
    acc = 100 * correct / len(_test_loader.dataset)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(_test_loader.dataset), acc))
    return acc


criterion = F.nll_loss


def finetuner(model: nn.Module):
    optimizer = Adam(model.parameters())
    for i in range(3):
        trainer(model, optimizer, criterion)
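Note: a minimal usage sketch of the callables this module exports (essentially what demo.py does before building the experiment). It is not part of the committed file, only uses names defined above, and assumes MNIST can be downloaded to ./data.

from vessel import LeNet, finetuner, evaluator, device

model = LeNet().to(device)
finetuner(model)        # three epochs of Adam training, as defined above
acc = evaluator(model)  # prints and returns test-set accuracy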
examples/model_compress/models/cifar10/resnet.py (new file, mode 100644)

import torch
import torch.nn as nn
import torch.nn.functional as F


class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        super(BasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion * planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(self.expansion * planes)
            )

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out += self.shortcut(x)
        out = F.relu(out)
        return out


class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, in_planes, planes, stride=1):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, self.expansion * planes, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(self.expansion * planes)

        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != self.expansion * planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(self.expansion * planes)
            )

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = F.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))
        out += self.shortcut(x)
        out = F.relu(out)
        return out


class ResNet(nn.Module):
    def __init__(self, block, num_blocks, num_classes=10):
        super(ResNet, self).__init__()
        self.in_planes = 64

        # this layer differs from torchvision.resnet18() since this model is adapted for CIFAR-10
        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
        self.linear = nn.Linear(512 * block.expansion, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        strides = [stride] + [1] * (num_blocks - 1)
        layers = []
        for stride in strides:
            layers.append(block(self.in_planes, planes, stride))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*layers)

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = F.avg_pool2d(out, 4)
        out = out.view(out.size(0), -1)
        out = self.linear(out)
        return out


def ResNet18():
    return ResNet(BasicBlock, [2, 2, 2, 2])


def ResNet34():
    return ResNet(BasicBlock, [3, 4, 6, 3])


def ResNet50():
    return ResNet(Bottleneck, [3, 4, 6, 3])


def ResNet101():
    return ResNet(Bottleneck, [3, 4, 23, 3])


def ResNet152():
    return ResNet(Bottleneck, [3, 8, 36, 3])
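Note: a quick shape-check sketch for this CIFAR-10 variant (not part of the commit). The stride-1 stem plus F.avg_pool2d(out, 4) expects 32x32 inputs; the import path assumes the models directory is on sys.path, as the pruning examples arrange.

import torch
from cifar10.resnet import ResNet18  # assumes examples/model_compress/models is on sys.path

model = ResNet18()
logits = model(torch.randn(2, 3, 32, 32))  # CIFAR-10 sized input
print(logits.shape)                        # torch.Size([2, 10])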
examples/model_compress/models/cifar10/vgg.py (new file, mode 100644)

import math
import torch
import torch.nn as nn
import torch.nn.functional as F

defaultcfg = {
    11: [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512],
    13: [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512],
    16: [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512],
    19: [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512],
}


class VGG(nn.Module):
    def __init__(self, depth=16):
        super(VGG, self).__init__()
        cfg = defaultcfg[depth]
        self.cfg = cfg
        self.feature = self.make_layers(cfg, True)
        num_classes = 10
        self.classifier = nn.Sequential(
            nn.Linear(cfg[-1], 512),
            nn.BatchNorm1d(512),
            nn.ReLU(inplace=True),
            nn.Linear(512, num_classes)
        )
        self._initialize_weights()

    def make_layers(self, cfg, batch_norm=False):
        layers = []
        in_channels = 3
        for v in cfg:
            if v == 'M':
                layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
            else:
                conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1, bias=False)
                if batch_norm:
                    layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)]
                else:
                    layers += [conv2d, nn.ReLU(inplace=True)]
                in_channels = v
        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.feature(x)
        x = nn.AvgPool2d(2)(x)
        x = x.view(x.size(0), -1)
        y = self.classifier(x)
        return y

    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
                if m.bias is not None:
                    m.bias.data.zero_()
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(0.5)
                m.bias.data.zero_()
            elif isinstance(m, nn.Linear):
                m.weight.data.normal_(0, 0.01)
                m.bias.data.zero_()
examples/model_compress/models/mnist/lenet.py (new file, mode 100644)

import torch
import torch.nn as nn
import torch.nn.functional as F


class LeNet(nn.Module):
    def __init__(self):
        super(LeNet, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout2d(0.25)
        self.dropout2 = nn.Dropout2d(0.5)
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output
examples/model_compress/models/mnist/naive.py (new file, mode 100644)

import torch
import torch.nn as nn
import torch.nn.functional as F
from functools import reduce


class NaiveModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = torch.nn.Conv2d(1, 20, 5, 1)
        self.conv2 = torch.nn.Conv2d(20, 50, 5, 1)
        self.fc1 = torch.nn.Linear(4 * 4 * 50, 500)
        self.fc2 = torch.nn.Linear(500, 10)
        self.relu1 = torch.nn.ReLU6()
        self.relu2 = torch.nn.ReLU6()
        self.relu3 = torch.nn.ReLU6()
        self.max_pool1 = torch.nn.MaxPool2d(2, 2)
        self.max_pool2 = torch.nn.MaxPool2d(2, 2)

    def forward(self, x):
        x = self.relu1(self.conv1(x))
        x = self.max_pool1(x)
        x = self.relu2(self.conv2(x))
        x = self.max_pool2(x)
        x = x.view(-1, x.size()[1:].numel())
        x = self.relu3(self.fc1(x))
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)
\ No newline at end of file
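Note: a short shape-trace sketch explaining the 4 * 4 * 50 input size of fc1 (not part of the commit; the import path is the one end2end_compression.py uses, assuming it runs from examples/model_compress).

import torch
from models.mnist.naive import NaiveModel

m = NaiveModel()
x = torch.randn(1, 1, 28, 28)
# conv1 (5x5, no padding): 28 -> 24, max_pool1: 24 -> 12
# conv2 (5x5, no padding): 12 -> 8,  max_pool2: 8 -> 4   => 4 * 4 * 50 features into fc1
print(m(x).shape)  # torch.Size([1, 10])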
examples/model_compress/models/mobilenet.py (new file, mode 100644)

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import torch.nn as nn
import math


def conv_bn(inp, oup, stride):
    return nn.Sequential(
        nn.Conv2d(inp, oup, 3, stride, 1, bias=False),
        nn.BatchNorm2d(oup),
        nn.ReLU(inplace=True)
    )


def conv_dw(inp, oup, stride):
    return nn.Sequential(
        nn.Conv2d(inp, inp, 3, stride, 1, groups=inp, bias=False),
        nn.BatchNorm2d(inp),
        nn.ReLU(inplace=True),

        nn.Conv2d(inp, oup, 1, 1, 0, bias=False),
        nn.BatchNorm2d(oup),
        nn.ReLU(inplace=True),
    )


class MobileNet(nn.Module):
    def __init__(self, n_class, profile='normal'):
        super(MobileNet, self).__init__()

        # original
        if profile == 'normal':
            in_planes = 32
            cfg = [64, (128, 2), 128, (256, 2), 256, (512, 2), 512, 512, 512, 512, 512, (1024, 2), 1024]
        # 0.5 AMC
        elif profile == '0.5flops':
            in_planes = 24
            cfg = [48, (96, 2), 80, (192, 2), 200, (328, 2), 352, 368, 360, 328, 400, (736, 2), 752]
        else:
            raise NotImplementedError

        self.conv1 = conv_bn(3, in_planes, stride=2)
        self.features = self._make_layers(in_planes, cfg, conv_dw)

        self.classifier = nn.Sequential(
            nn.Linear(cfg[-1], n_class),
        )

        self._initialize_weights()

    def forward(self, x):
        x = self.conv1(x)
        x = self.features(x)
        x = x.mean([2, 3])  # global average pooling
        x = self.classifier(x)
        return x

    def _make_layers(self, in_planes, cfg, layer):
        layers = []
        for x in cfg:
            out_planes = x if isinstance(x, int) else x[0]
            stride = 1 if isinstance(x, int) else x[1]
            layers.append(layer(in_planes, out_planes, stride))
            in_planes = out_planes
        return nn.Sequential(*layers)

    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
                if m.bias is not None:
                    m.bias.data.zero_()
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
            elif isinstance(m, nn.Linear):
                n = m.weight.size(1)
                m.weight.data.normal_(0, 0.01)
                m.bias.data.zero_()
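Note: a small sanity-check sketch of the cfg convention used above (plain ints keep stride 1, (channels, 2) tuples downsample). It is not part of the commit, and the import path is assumed relative to examples/model_compress.

import torch
from models.mobilenet import MobileNet  # assumed import path

net = MobileNet(n_class=10, profile='normal')
out = net(torch.randn(1, 3, 224, 224))  # any input size divisible by 32 works thanks to the global average pooling
print(out.shape)                        # torch.Size([1, 10])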
examples/model_compress/models/mobilenet_v2.py (new file, mode 100644)

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import torch.nn as nn
import math


def conv_bn(inp, oup, stride):
    return nn.Sequential(
        nn.Conv2d(inp, oup, 3, stride, 1, bias=False),
        nn.BatchNorm2d(oup),
        nn.ReLU6(inplace=True)
    )


def conv_1x1_bn(inp, oup):
    return nn.Sequential(
        nn.Conv2d(inp, oup, 1, 1, 0, bias=False),
        nn.BatchNorm2d(oup),
        nn.ReLU6(inplace=True)
    )


class InvertedResidual(nn.Module):
    def __init__(self, inp, oup, stride, expand_ratio):
        super(InvertedResidual, self).__init__()
        self.stride = stride
        assert stride in [1, 2]

        hidden_dim = round(inp * expand_ratio)
        self.use_res_connect = self.stride == 1 and inp == oup

        if expand_ratio == 1:
            self.conv = nn.Sequential(
                # dw
                nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False),
                nn.BatchNorm2d(hidden_dim),
                nn.ReLU6(inplace=True),
                # pw-linear
                nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
                nn.BatchNorm2d(oup),
            )
        else:
            self.conv = nn.Sequential(
                # pw
                nn.Conv2d(inp, hidden_dim, 1, 1, 0, bias=False),
                nn.BatchNorm2d(hidden_dim),
                nn.ReLU6(inplace=True),
                # dw
                nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False),
                nn.BatchNorm2d(hidden_dim),
                nn.ReLU6(inplace=True),
                # pw-linear
                nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
                nn.BatchNorm2d(oup),
            )

    def forward(self, x):
        if self.use_res_connect:
            return x + self.conv(x)
        else:
            return self.conv(x)


class MobileNetV2(nn.Module):
    def __init__(self, n_class=1000, input_size=224, width_mult=1.):
        super(MobileNetV2, self).__init__()
        block = InvertedResidual
        input_channel = 32
        last_channel = 1280
        interverted_residual_setting = [
            # t, c, n, s
            [1, 16, 1, 1],
            [6, 24, 2, 2],
            [6, 32, 3, 2],
            [6, 64, 4, 2],
            [6, 96, 3, 1],
            [6, 160, 3, 2],
            [6, 320, 1, 1],
        ]

        # building first layer
        assert input_size % 32 == 0
        input_channel = int(input_channel * width_mult)
        self.last_channel = int(last_channel * width_mult) if width_mult > 1.0 else last_channel
        self.features = [conv_bn(3, input_channel, 2)]
        # building inverted residual blocks
        for t, c, n, s in interverted_residual_setting:
            output_channel = int(c * width_mult)
            for i in range(n):
                if i == 0:
                    self.features.append(block(input_channel, output_channel, s, expand_ratio=t))
                else:
                    self.features.append(block(input_channel, output_channel, 1, expand_ratio=t))
                input_channel = output_channel
        # building last several layers
        self.features.append(conv_1x1_bn(input_channel, self.last_channel))
        # make it nn.Sequential
        self.features = nn.Sequential(*self.features)

        # building classifier
        self.classifier = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(self.last_channel, n_class),
        )

        self._initialize_weights()

    def forward(self, x):
        x = self.features(x)
        # equivalent to .mean(3).mean(2), but model speedup only supports
        # the mean option whose output has two dimensions
        x = x.mean([2, 3])
        x = self.classifier(x)
        return x

    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
                if m.bias is not None:
                    m.bias.data.zero_()
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
            elif isinstance(m, nn.Linear):
                n = m.weight.size(1)
                m.weight.data.normal_(0, 0.01)
                m.bias.data.zero_()
examples/model_compress/pruning/activation_pruning_torch.py (new file, mode 100644)

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

'''
NNI example for supported ActivationAPoZRank and ActivationMeanRank pruning algorithms.
In this example, we show the end-to-end pruning process: pre-training -> pruning -> fine-tuning.
Note that pruners use masks to simulate the real pruning. In order to obtain a real compressed model, model speedup is required.
'''
import argparse
import sys

import torch
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import MultiStepLR

import nni
from nni.compression.pytorch import ModelSpeedup
from nni.compression.pytorch.utils import count_flops_params
from nni.compression.pytorch.pruning import ActivationAPoZRankPruner, ActivationMeanRankPruner

from pathlib import Path
sys.path.append(str(Path(__file__).absolute().parents[1] / 'models'))
from cifar10.vgg import VGG

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
normalize = transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
g_epoch = 0

train_loader = torch.utils.data.DataLoader(
    datasets.CIFAR10('./data', train=True, transform=transforms.Compose([
        transforms.RandomHorizontalFlip(),
        transforms.RandomCrop(32, 4),
        transforms.ToTensor(),
        normalize,
    ]), download=True),
    batch_size=128, shuffle=True)

test_loader = torch.utils.data.DataLoader(
    datasets.CIFAR10('./data', train=False, transform=transforms.Compose([
        transforms.ToTensor(),
        normalize,
    ])),
    batch_size=128, shuffle=False)


def trainer(model, optimizer, criterion):
    global g_epoch
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx and batch_idx % 100 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                g_epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
    g_epoch += 1


def evaluator(model):
    model.eval()
    correct = 0.0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()
    acc = 100 * correct / len(test_loader.dataset)
    print('Accuracy: {}%\n'.format(acc))
    return acc


def optimizer_scheduler_generator(model, _lr=0.1, _momentum=0.9, _weight_decay=5e-4, total_epoch=160):
    optimizer = torch.optim.SGD(model.parameters(), lr=_lr, momentum=_momentum, weight_decay=_weight_decay)
    scheduler = MultiStepLR(optimizer, milestones=[int(total_epoch * 0.5), int(total_epoch * 0.75)], gamma=0.1)
    return optimizer, scheduler


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='PyTorch Example for model compression')
    parser.add_argument('--pruner', type=str, default='apoz',
                        choices=['apoz', 'mean'],
                        help='pruner to use')
    parser.add_argument('--pretrain-epochs', type=int, default=20,
                        help='number of epochs to pretrain the model')
    parser.add_argument('--fine-tune-epochs', type=int, default=20,
                        help='number of epochs to fine tune the model')
    args = parser.parse_args()

    print('\n' + '=' * 50 + ' START TO TRAIN THE MODEL ' + '=' * 50)
    model = VGG().to(device)
    optimizer, scheduler = optimizer_scheduler_generator(model, total_epoch=args.pretrain_epochs)
    criterion = torch.nn.CrossEntropyLoss()
    pre_best_acc = 0.0
    best_state_dict = None

    for i in range(args.pretrain_epochs):
        trainer(model, optimizer, criterion)
        scheduler.step()
        acc = evaluator(model)
        if acc > pre_best_acc:
            pre_best_acc = acc
            best_state_dict = model.state_dict()
    print("Best accuracy: {}".format(pre_best_acc))
    model.load_state_dict(best_state_dict)
    pre_flops, pre_params, _ = count_flops_params(model, torch.randn([128, 3, 32, 32]).to(device))
    g_epoch = 0

    # Start to prune and speedup
    print('\n' + '=' * 50 + ' START TO PRUNE THE BEST ACCURACY PRETRAINED MODEL ' + '=' * 50)
    config_list = [{
        'total_sparsity': 0.5,
        'op_types': ['Conv2d'],
    }]

    # make sure you have used nni.trace to wrap the optimizer class before initialization
    traced_optimizer = nni.trace(torch.optim.SGD)(model.parameters(), lr=0.01, momentum=0.9, weight_decay=5e-4)
    if 'apoz' in args.pruner:
        pruner = ActivationAPoZRankPruner(model, config_list, trainer, traced_optimizer, criterion, training_batches=20)
    else:
        pruner = ActivationMeanRankPruner(model, config_list, trainer, traced_optimizer, criterion, training_batches=20)
    _, masks = pruner.compress()
    pruner.show_pruned_weights()
    pruner._unwrap_model()
    ModelSpeedup(model, dummy_input=torch.rand([10, 3, 32, 32]).to(device), masks_file=masks).speedup_model()
    print('\n' + '=' * 50 + ' EVALUATE THE MODEL AFTER SPEEDUP ' + '=' * 50)
    evaluator(model)

    # The optimizer used in the pruner might be patched, so it is recommended to create a new optimizer for the fine-tuning stage.
    print('\n' + '=' * 50 + ' START TO FINE TUNE THE MODEL ' + '=' * 50)
    optimizer, scheduler = optimizer_scheduler_generator(model, _lr=0.01, total_epoch=args.fine_tune_epochs)

    best_acc = 0.0
    g_epoch = 0
    for i in range(args.fine_tune_epochs):
        trainer(model, optimizer, criterion)
        scheduler.step()
        best_acc = max(evaluator(model), best_acc)
    flops, params, results = count_flops_params(model, torch.randn([128, 3, 32, 32]).to(device))
    print(f'Pretrained model FLOPs {pre_flops/1e6:.2f} M, #Params: {pre_params/1e6:.2f}M, Accuracy: {pre_best_acc: .2f}%')
    print(f'Finetuned model FLOPs {flops/1e6:.2f} M, #Params: {params/1e6:.2f}M, Accuracy: {best_acc: .2f}%')
examples/model_compress/pruning/admm_pruning_torch.py
0 → 100644
View file @
e773dfcc
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

'''
NNI example for supported ADMM pruning algorithms.
In this example, we show the end-to-end pruning process: pre-training -> pruning -> fine-tuning.
Note that pruners use masks to simulate the real pruning. In order to obtain a real compressed model, model speedup is required.
'''
import argparse
import sys

import torch
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import MultiStepLR

import nni
from nni.compression.pytorch.speedup import ModelSpeedup
from nni.compression.pytorch.utils import count_flops_params
from nni.compression.pytorch.pruning import ADMMPruner

from pathlib import Path
sys.path.append(str(Path(__file__).absolute().parents[1] / 'models'))
from cifar10.vgg import VGG

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
normalize = transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))

g_epoch = 0

train_loader = torch.utils.data.DataLoader(
    datasets.CIFAR10('./data', train=True, transform=transforms.Compose([
        transforms.RandomHorizontalFlip(),
        transforms.RandomCrop(32, 4),
        transforms.ToTensor(),
        normalize,
    ]), download=True),
    batch_size=128, shuffle=True)

test_loader = torch.utils.data.DataLoader(
    datasets.CIFAR10('./data', train=False, transform=transforms.Compose([
        transforms.ToTensor(),
        normalize,
    ])),
    batch_size=128, shuffle=False)

def trainer(model, optimizer, criterion):
    global g_epoch
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx and batch_idx % 100 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                g_epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
    g_epoch += 1

def evaluator(model):
    model.eval()
    correct = 0.0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()
    acc = 100 * correct / len(test_loader.dataset)
    print('Accuracy: {}%\n'.format(acc))
    return acc

def optimizer_scheduler_generator(model, _lr=0.1, _momentum=0.9, _weight_decay=5e-4, total_epoch=160):
    optimizer = torch.optim.SGD(model.parameters(), lr=_lr, momentum=_momentum, weight_decay=_weight_decay)
    scheduler = MultiStepLR(optimizer, milestones=[int(total_epoch * 0.5), int(total_epoch * 0.75)], gamma=0.1)
    return optimizer, scheduler

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='PyTorch Example for model compression')
    parser.add_argument('--pretrain-epochs', type=int, default=20,
                        help='number of epochs to pretrain the model')
    parser.add_argument('--fine-tune-epochs', type=int, default=20,
                        help='number of epochs to fine tune the model')
    args = parser.parse_args()

    print('\n' + '=' * 50 + ' START TO TRAIN THE MODEL ' + '=' * 50)
    model = VGG().to(device)
    optimizer, scheduler = optimizer_scheduler_generator(model, total_epoch=args.pretrain_epochs)
    criterion = torch.nn.CrossEntropyLoss()
    pre_best_acc = 0.0
    best_state_dict = None

    for i in range(args.pretrain_epochs):
        trainer(model, optimizer, criterion)
        scheduler.step()
        acc = evaluator(model)
        if acc > pre_best_acc:
            pre_best_acc = acc
            best_state_dict = model.state_dict()
    print("Best accuracy: {}".format(pre_best_acc))
    model.load_state_dict(best_state_dict)
    pre_flops, pre_params, _ = count_flops_params(model, torch.randn([128, 3, 32, 32]).to(device))
    g_epoch = 0

    # Start to prune and speed up
    print('\n' + '=' * 50 + ' START TO PRUNE THE BEST ACCURACY PRETRAINED MODEL ' + '=' * 50)
    config_list = [{
        'sparsity': 0.8,
        'op_types': ['Conv2d'],
    }]

    # make sure you have used nni.trace to wrap the optimizer class before initializing it
    traced_optimizer = nni.trace(torch.optim.SGD)(model.parameters(), lr=0.01, momentum=0.9, weight_decay=5e-4)
    pruner = ADMMPruner(model, config_list, trainer, traced_optimizer, criterion, iterations=10, training_epochs=1, granularity='coarse-grained')
    _, masks = pruner.compress()
    pruner.show_pruned_weights()
    pruner._unwrap_model()
    ModelSpeedup(model, torch.randn([128, 3, 32, 32]).to(device), masks).speedup_model()
    print('\n' + '=' * 50 + ' EVALUATE THE MODEL AFTER PRUNING ' + '=' * 50)
    evaluator(model)

    # The optimizer used in the pruner might be patched, so it is recommended to create a new optimizer for the fine-tuning stage.
    print('\n' + '=' * 50 + ' START TO FINE TUNE THE MODEL ' + '=' * 50)
    optimizer, scheduler = optimizer_scheduler_generator(model, _lr=0.01, total_epoch=args.fine_tune_epochs)

    best_acc = 0.0
    g_epoch = 0
    for i in range(args.fine_tune_epochs):
        trainer(model, optimizer, criterion)
        scheduler.step()
        best_acc = max(evaluator(model), best_acc)
    flops, params, results = count_flops_params(model, torch.randn([128, 3, 32, 32]).to(device))
    print(f'Pretrained model FLOPs {pre_flops / 1e6:.2f} M, #Params: {pre_params / 1e6:.2f} M, Accuracy: {pre_best_acc: .2f}%')
    print(f'Finetuned model FLOPs {flops / 1e6:.2f} M, #Params: {params / 1e6:.2f} M, Accuracy: {best_acc: .2f}%')
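One detail that differs between the files in this commit is the sparsity key used in config_list: admm_pruning_torch.py above sets a per-layer 'sparsity', while the activation and AMC examples use 'total_sparsity' (AMC additionally caps it with 'max_sparsity_per_layer'). The snippet below only restates those dictionaries side by side; the comments are my reading of the NNI config schema, not text taken from the files.

# Config shapes as used in the examples in this commit (values copied from the code above).
admm_config = [{
    'sparsity': 0.8,                  # target sparsity applied to every matched layer
    'op_types': ['Conv2d'],
}]

activation_config = [{
    'total_sparsity': 0.5,            # overall sparsity budget shared across matched layers
    'op_types': ['Conv2d'],
}]

amc_config = [{
    'op_types': ['Conv2d'],
    'total_sparsity': 0.5,
    'max_sparsity_per_layer': 0.8,    # cap so no single layer is pruned away almost entirely
}]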
examples/model_compress/pruning/amc_pruning_torch.py
0 → 100644
View file @
e773dfcc
import sys
from tqdm import tqdm

import torch
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import MultiStepLR

from nni.compression.pytorch.pruning import AMCPruner
from nni.compression.pytorch.utils import count_flops_params

from pathlib import Path
sys.path.append(str(Path(__file__).absolute().parents[1] / 'models'))
from cifar10.vgg import VGG

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
normalize = transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))

train_loader = torch.utils.data.DataLoader(
    datasets.CIFAR10('./data', train=True, transform=transforms.Compose([
        transforms.RandomHorizontalFlip(),
        transforms.RandomCrop(32, 4),
        transforms.ToTensor(),
        normalize,
    ]), download=True),
    batch_size=128, shuffle=True)

test_loader = torch.utils.data.DataLoader(
    datasets.CIFAR10('./data', train=False, transform=transforms.Compose([
        transforms.ToTensor(),
        normalize,
    ])),
    batch_size=128, shuffle=False)

criterion = torch.nn.CrossEntropyLoss()

def trainer(model, optimizer, criterion, epoch):
    model.train()
    for data, target in tqdm(iterable=train_loader, desc='Epoch {}'.format(epoch)):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

def finetuner(model):
    model.train()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
    criterion = torch.nn.CrossEntropyLoss()
    for data, target in tqdm(iterable=train_loader, desc='Epoch PFs'):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

def evaluator(model):
    model.eval()
    correct = 0
    with torch.no_grad():
        for data, target in tqdm(iterable=test_loader, desc='Test'):
            data, target = data.to(device), target.to(device)
            output = model(data)
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()
    acc = 100 * correct / len(test_loader.dataset)
    print('Accuracy: {}%\n'.format(acc))
    return acc

if __name__ == '__main__':
    # model = MobileNetV2(n_class=10).to(device)
    model = VGG().to(device)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
    scheduler = MultiStepLR(optimizer, milestones=[50, 75], gamma=0.1)
    criterion = torch.nn.CrossEntropyLoss()
    for i in range(100):
        trainer(model, optimizer, criterion, i)
    pre_best_acc = evaluator(model)

    dummy_input = torch.rand(10, 3, 32, 32).to(device)
    pre_flops, pre_params, _ = count_flops_params(model, dummy_input)

    config_list = [{'op_types': ['Conv2d'], 'total_sparsity': 0.5, 'max_sparsity_per_layer': 0.8}]

    # If you just want to keep the final result as the best result, you can pass None as the evaluator.
    # Otherwise, the result with the highest evaluator score is kept as the best result.
    ddpg_params = {'hidden1': 300, 'hidden2': 300, 'lr_c': 1e-3, 'lr_a': 1e-4, 'warmup': 100, 'discount': 1., 'bsize': 64,
                   'rmsize': 100, 'window_length': 1, 'tau': 0.01, 'init_delta': 0.5, 'delta_decay': 0.99,
                   'max_episode_length': 1e9, 'epsilon': 50000}
    pruner = AMCPruner(400, model, config_list, dummy_input, evaluator, finetuner=finetuner, ddpg_params=ddpg_params, target='flops')
    pruner.compress()
    _, model, masks, best_acc, _ = pruner.get_best_result()
    flops, params, _ = count_flops_params(model, dummy_input)
    print(f'Pretrained model FLOPs {pre_flops / 1e6:.2f} M, #Params: {pre_params / 1e6:.2f} M, Accuracy: {pre_best_acc: .2f}%')
    print(f'Finetuned model FLOPs {flops / 1e6:.2f} M, #Params: {params / 1e6:.2f} M, Accuracy: {best_acc: .2f}%')
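The AMC script relies on two user-supplied callbacks with a simple contract: evaluator(model) returns a single scalar score (here, test accuracy) that guides the search and selects the result returned by get_best_result(), while finetuner(model) briefly retrains the pruned model in place and returns nothing. Below is a bare-bones sketch of that contract with hypothetical placeholder bodies standing in for the CIFAR10 loops defined above; it is not part of the example file.

import torch

def my_evaluator(model: torch.nn.Module) -> float:
    # Must return one scalar score; in this example it is test accuracy,
    # and the run with the highest score is kept by get_best_result().
    return 0.0  # replace with a real validation pass

def my_finetuner(model: torch.nn.Module) -> None:
    # Called to briefly recover accuracy after pruning; it should update
    # the model's weights in place and return nothing.
    pass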