OpenDAS / nni · Commits · f2f58dbb

Commit f2f58dbb (Unverified) — authored Jul 30, 2021 by Zhenhua Han, committed by GitHub, Jul 30, 2021

[Retiarii] cross-graph optimization: device placement and input deduplication (#3202)

Parent: 6645bd33

Changes: 29 · Showing 9 changed files with 782 additions and 471 deletions (+782 −471)
File                                              Additions  Deletions
test/retiarii_test/cgo/test.py                    +54        −0
test/retiarii_test/cgo_mnasnet/base_mnasnet.py    +298       −0
test/retiarii_test/cgo_mnasnet/mutator.py         +64        −0
test/retiarii_test/cgo_mnasnet/test.py            +80        −0
test/retiarii_test/darts/test.py                  +2         −1
test/ut/retiarii/converted_mnist_pytorch.json     +0         −363
test/ut/retiarii/test_cgo_engine.py               +280       −20
test/ut/retiarii/test_dedup_input.py              +0         −86
test/ut/retiarii/test_engine.py                   +4         −1
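The two optimizations named in the commit title are easiest to see in the generated test models further down this page (`_model_cpu` and `_model_gpu` in test_cgo_engine.py). As a self-contained illustration of what input deduplication produces, here is a minimal PyTorch sketch; the class names `NetA`, `NetB`, and `FusedModel` are hypothetical and not part of this commit. Two trial models that consume the same dataset are merged into one module, so each batch is read (and, with device placement, transferred) once and fanned out to both graphs:

import torch
import torch.nn as nn

class NetA(nn.Module):
    # hypothetical stand-in for one candidate model produced by the strategy
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(8, 4)

    def forward(self, x):
        return self.fc(x)

class NetB(nn.Module):
    # hypothetical stand-in for a second candidate that reads the same input
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(8, 2)

    def forward(self, x):
        return self.fc(x)

class FusedModel(nn.Module):
    """What a cross-graph merge conceptually yields: one module, two heads."""
    def __init__(self):
        super().__init__()
        self.a = NetA()   # with device placement, this could be moved to cuda:0
        self.b = NetB()   # ... and this to cuda:1

    def forward(self, *_inputs):
        shared = _inputs[0]          # deduplicated input: loaded once, used twice
        return self.a(shared), self.b(shared)

fused = FusedModel()
out_a, out_b = fused(torch.randn(5, 8))   # one forward pass evaluates both trials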
test/retiarii_test/cgo/test.py — new file (0 → 100644)

import json
import os
import sys
import torch
from pathlib import Path

import nni.retiarii.evaluator.pytorch.cgo.evaluator as cgo
import nni.retiarii.evaluator.pytorch.lightning as pl
import nni.retiarii.strategy as strategy
from nni.retiarii import serialize
from nni.retiarii.experiment.pytorch import RetiariiExperiment, RetiariiExeConfig
from torchvision import transforms
from torchvision.datasets import CIFAR10

from darts_model import CNN

if __name__ == '__main__':
    base_model = CNN(32, 3, 16, 10, 8)
    train_transform = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ])
    valid_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ])
    train_dataset = serialize(CIFAR10, root='data/cifar10', train=True, download=True, transform=train_transform)
    test_dataset = serialize(CIFAR10, root='data/cifar10', train=False, download=True, transform=valid_transform)

    trainer = cgo.Classification(train_dataloader=pl.DataLoader(train_dataset, batch_size=100),
                                 val_dataloaders=pl.DataLoader(test_dataset, batch_size=100),
                                 max_epochs=1, limit_train_batches=0.2)

    simple_strategy = strategy.Random()

    exp = RetiariiExperiment(base_model, trainer, [], simple_strategy)

    exp_config = RetiariiExeConfig('local')
    exp_config.experiment_name = 'darts_search'
    exp_config.execution_engine = 'cgo'
    exp_config.trial_concurrency = 3
    # Since CGO may merge multiple trials into one, RetiariiExperiment may run more trials than max_trial_number.
    # When max_trial_number = 3, it actually runs 9 models, since each merged trial contains 3 trials from the strategy.
    exp_config.max_trial_number = 100
    exp_config.devices = ['cuda:0', 'cuda:1', 'cuda:2']
    exp_config.trial_gpu_number = 1
    exp_config.batch_waiting_time = 100
    exp_config.training_service.use_active_gpu = True
    exp_config.training_service.gpu_indices = [0, 1, 2]

    exp.run(exp_config, 8081)
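Two knobs in this script are CGO-specific: `exp_config.devices` lists the GPUs the engine may place merged models on, and `exp_config.batch_waiting_time`, by its name, presumably controls how long the engine waits to collect trials into a batch before merging (the script does not state the unit). As the in-line comment explains, a merged trial can carry several strategy trials, so the number of models actually trained can exceed `max_trial_number`.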
test/retiarii_test/cgo_mnasnet/base_mnasnet.py — new file (0 → 100644)

from nni.retiarii import basic_unit
import nni.retiarii.nn.pytorch as nn
import warnings
import torch
import torch.nn as torch_nn
from torchvision.models.utils import load_state_dict_from_url
import torch.nn.functional as F

import sys
from pathlib import Path
sys.path.append(str(Path(__file__).resolve().parents[2]))

# Paper suggests 0.9997 momentum, for TensorFlow. Equivalent PyTorch momentum is
# 1.0 - tensorflow.
_BN_MOMENTUM = 1 - 0.9997

_FIRST_DEPTH = 32
_MOBILENET_V2_FILTERS = [16, 24, 32, 64, 96, 160, 320]
_MOBILENET_V2_NUM_LAYERS = [1, 2, 3, 4, 3, 3, 1]


class _ResidualBlock(nn.Module):
    def __init__(self, net):
        super().__init__()
        self.net = net

    def forward(self, x):
        return self.net(x) + x


class _InvertedResidual(nn.Module):
    def __init__(self, in_ch, out_ch, kernel_size, stride, expansion_factor, skip, bn_momentum=0.1):
        super(_InvertedResidual, self).__init__()
        assert stride in [1, 2]
        assert kernel_size in [3, 5]
        mid_ch = in_ch * expansion_factor
        self.apply_residual = skip and in_ch == out_ch and stride == 1
        self.layers = nn.Sequential(
            # Pointwise
            nn.Conv2d(in_ch, mid_ch, 1, bias=False),
            nn.BatchNorm2d(mid_ch, momentum=bn_momentum),
            nn.ReLU(inplace=True),
            # Depthwise
            nn.Conv2d(mid_ch, mid_ch, kernel_size, padding=kernel_size // 2,
                      stride=stride, groups=mid_ch, bias=False),
            nn.BatchNorm2d(mid_ch, momentum=bn_momentum),
            nn.ReLU(inplace=True),
            # Linear pointwise. Note that there's no activation.
            nn.Conv2d(mid_ch, out_ch, 1, bias=False),
            nn.BatchNorm2d(out_ch, momentum=bn_momentum))

    def forward(self, input):
        if self.apply_residual:
            ret = self.layers(input) + input
        else:
            ret = self.layers(input)
        return ret


def _stack_inverted_residual(in_ch, out_ch, kernel_size, skip, stride, exp_factor, repeats, bn_momentum):
    """ Creates a stack of inverted residuals. """
    assert repeats >= 1
    # First one has no skip, because feature map size changes.
    first = _InvertedResidual(in_ch, out_ch, kernel_size, stride, exp_factor, skip, bn_momentum=bn_momentum)
    remaining = []
    for _ in range(1, repeats):
        remaining.append(_InvertedResidual(out_ch, out_ch, kernel_size, 1, exp_factor, skip, bn_momentum=bn_momentum))
    return nn.Sequential(first, *remaining)


def _stack_normal_conv(in_ch, out_ch, kernel_size, skip, dconv, stride, repeats, bn_momentum):
    assert repeats >= 1
    stack = []
    for i in range(repeats):
        s = stride if i == 0 else 1
        if dconv:
            modules = [
                nn.Conv2d(in_ch, in_ch, kernel_size, padding=kernel_size // 2, stride=s,
                          groups=in_ch, bias=False),
                nn.BatchNorm2d(in_ch, momentum=bn_momentum),
                nn.ReLU(inplace=True),
                nn.Conv2d(in_ch, out_ch, 1, padding=0, stride=1, bias=False),
                nn.BatchNorm2d(out_ch, momentum=bn_momentum)
            ]
        else:
            modules = [
                nn.Conv2d(in_ch, out_ch, kernel_size, padding=kernel_size // 2, stride=s, bias=False),
                nn.ReLU(inplace=True),
                nn.BatchNorm2d(out_ch, momentum=bn_momentum)
            ]
        if skip and in_ch == out_ch and s == 1:
            # use different implementation for skip and noskip to align with pytorch
            stack.append(_ResidualBlock(nn.Sequential(*modules)))
        else:
            stack += modules
        in_ch = out_ch
    return stack


def _round_to_multiple_of(val, divisor, round_up_bias=0.9):
    """ Asymmetric rounding to make `val` divisible by `divisor`. With default
    bias, will round up, unless the number is no more than 10% greater than the
    smaller divisible value, i.e. (83, 8) -> 80, but (84, 8) -> 88. """
    assert 0.0 < round_up_bias < 1.0
    new_val = max(divisor, int(val + divisor / 2) // divisor * divisor)
    return new_val if new_val >= round_up_bias * val else new_val + divisor


def _get_depths(depths, alpha):
    """ Scales tensor depths as in reference MobileNet code, prefers rounding up
    rather than down. """
    return [_round_to_multiple_of(depth * alpha, 8) for depth in depths]


class MNASNet(nn.Module):
    """ MNASNet, as described in https://arxiv.org/pdf/1807.11626.pdf. This
    implements the B1 variant of the model.
    >>> model = MNASNet(1000, 1.0)
    >>> x = torch.rand(1, 3, 224, 224)
    >>> y = model(x)
    >>> y.dim()
    1
    >>> y.nelement()
    1000
    """
    # Version 2 adds depth scaling in the initial stages of the network.
    _version = 2

    def __init__(self, alpha, depths, convops, kernel_sizes, num_layers, skips,
                 num_classes=1000, dropout=0.2):
        super().__init__()
        assert alpha > 0.0
        assert len(depths) == len(convops) == len(kernel_sizes) == len(num_layers) == len(skips) == 7
        self.alpha = alpha
        self.num_classes = num_classes
        depths = _get_depths([_FIRST_DEPTH] + depths, alpha)
        base_filter_sizes = [16, 24, 40, 80, 96, 192, 320]
        exp_ratios = [3, 3, 3, 6, 6, 6, 6]
        strides = [1, 2, 2, 2, 1, 2, 1]
        layers = [
            # First layer: regular conv.
            nn.Conv2d(3, depths[0], 3, padding=1, stride=2, bias=False),
            nn.BatchNorm2d(depths[0], momentum=_BN_MOMENTUM),
            nn.ReLU(inplace=True),
        ]
        count = 0
        # for conv, prev_depth, depth, ks, skip, stride, repeat, exp_ratio in \
        #         zip(convops, depths[:-1], depths[1:], kernel_sizes, skips, strides, num_layers, exp_ratios):
        for filter_size, exp_ratio, stride in zip(base_filter_sizes, exp_ratios, strides):
            # TODO: restrict that "choose" can only be used within mutator
            ph = nn.Placeholder(label=f'mutable_{count}', **{
                'kernel_size_options': [1, 3, 5],
                'n_layer_options': [1, 2, 3, 4],
                'op_type_options': ['__mutated__.base_mnasnet.RegularConv',
                                    '__mutated__.base_mnasnet.DepthwiseConv',
                                    '__mutated__.base_mnasnet.MobileConv'],
                # 'se_ratio_options': [0, 0.25],
                'skip_options': ['identity', 'no'],
                'n_filter_options': [int(filter_size * x) for x in [0.75, 1.0, 1.25]],
                'exp_ratio': exp_ratio,
                'stride': stride,
                'in_ch': depths[0] if count == 0 else None
            })
            layers.append(ph)
            '''if conv == "mconv":
                # MNASNet blocks: stacks of inverted residuals.
                layers.append(_stack_inverted_residual(prev_depth, depth, ks, skip,
                                                       stride, exp_ratio, repeat, _BN_MOMENTUM))
            else:
                # Normal conv and depth-separated conv
                layers += _stack_normal_conv(prev_depth, depth, ks, skip, conv == "dconv",
                                             stride, repeat, _BN_MOMENTUM)'''
            count += 1
            if count >= 2:
                break
        layers += [
            # Final mapping to classifier input.
            nn.Conv2d(depths[7], 1280, 1, padding=0, stride=1, bias=False),
            nn.BatchNorm2d(1280, momentum=_BN_MOMENTUM),
            nn.ReLU(inplace=True),
        ]
        self.layers = nn.Sequential(*layers)
        self.classifier = nn.Sequential(nn.Dropout(p=dropout, inplace=True),
                                        nn.Linear(1280, num_classes))
        self._initialize_weights()
        # self.for_test = 10

    def forward(self, x):
        # if self.for_test == 10:
        x = self.layers(x)
        # Equivalent to global avgpool and removing H and W dimensions.
        x = x.mean([2, 3])
        x = F.relu(x)
        return self.classifier(x)

    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                torch_nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
                if m.bias is not None:
                    torch_nn.init.zeros_(m.bias)
            elif isinstance(m, nn.BatchNorm2d):
                torch_nn.init.ones_(m.weight)
                torch_nn.init.zeros_(m.bias)
            elif isinstance(m, nn.Linear):
                torch_nn.init.kaiming_uniform_(m.weight, mode="fan_out", nonlinearity="sigmoid")
                torch_nn.init.zeros_(m.bias)


def test_model(model):
    model(torch.randn(2, 3, 224, 224))


# ==================== definition of candidate op classes
BN_MOMENTUM = 1 - 0.9997


class RegularConv(nn.Module):
    def __init__(self, kernel_size, in_ch, out_ch, skip, exp_ratio, stride):
        super().__init__()
        self.kernel_size = kernel_size
        self.in_ch = in_ch
        self.out_ch = out_ch
        self.skip = skip
        self.exp_ratio = exp_ratio
        self.stride = stride

        self.conv = nn.Conv2d(in_ch, out_ch, kernel_size, padding=kernel_size // 2, stride=stride, bias=False)
        self.relu = nn.ReLU(inplace=True)
        self.bn = nn.BatchNorm2d(out_ch, momentum=BN_MOMENTUM)

    def forward(self, x):
        out = self.bn(self.relu(self.conv(x)))
        if self.skip == 'identity':
            out = out + x
        return out


class DepthwiseConv(nn.Module):
    def __init__(self, kernel_size, in_ch, out_ch, skip, exp_ratio, stride):
        super().__init__()
        self.kernel_size = kernel_size
        self.in_ch = in_ch
        self.out_ch = out_ch
        self.skip = skip
        self.exp_ratio = exp_ratio
        self.stride = stride

        self.conv1 = nn.Conv2d(in_ch, in_ch, kernel_size, padding=kernel_size // 2, stride=stride,
                               groups=in_ch, bias=False)
        self.bn1 = nn.BatchNorm2d(in_ch, momentum=BN_MOMENTUM)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(in_ch, out_ch, 1, padding=0, stride=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_ch, momentum=BN_MOMENTUM)

    def forward(self, x):
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        if self.skip == 'identity':
            out = out + x
        return out


class MobileConv(nn.Module):
    def __init__(self, kernel_size, in_ch, out_ch, skip, exp_ratio, stride):
        super().__init__()
        self.kernel_size = kernel_size
        self.in_ch = in_ch
        self.out_ch = out_ch
        self.skip = skip
        self.exp_ratio = exp_ratio
        self.stride = stride

        mid_ch = in_ch * exp_ratio
        self.layers = nn.Sequential(
            # Pointwise
            nn.Conv2d(in_ch, mid_ch, 1, bias=False),
            nn.BatchNorm2d(mid_ch, momentum=BN_MOMENTUM),
            nn.ReLU(inplace=True),
            # Depthwise
            nn.Conv2d(mid_ch, mid_ch, kernel_size, padding=(kernel_size - 1) // 2,
                      stride=stride, groups=mid_ch, bias=False),
            nn.BatchNorm2d(mid_ch, momentum=BN_MOMENTUM),
            nn.ReLU(inplace=True),
            # Linear pointwise. Note that there's no activation.
            nn.Conv2d(mid_ch, out_ch, 1, bias=False),
            nn.BatchNorm2d(out_ch, momentum=BN_MOMENTUM))

    def forward(self, x):
        out = self.layers(x)
        if self.skip == 'identity':
            out = out + x
        return out


# mnasnet0_5
ir_module = _InvertedResidual(16, 16, 3, 1, 1, True)
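A quick way to sanity-check the asymmetric rounding described in `_round_to_multiple_of`'s docstring is to restate it standalone and assert the documented examples; this snippet is illustrative and not part of the commit:

def _round_to_multiple_of(val, divisor, round_up_bias=0.9):
    # same logic as above: round to the nearest multiple, then bump up unless
    # the rounded value is at least round_up_bias of the original
    assert 0.0 < round_up_bias < 1.0
    new_val = max(divisor, int(val + divisor / 2) // divisor * divisor)
    return new_val if new_val >= round_up_bias * val else new_val + divisor

assert _round_to_multiple_of(83, 8) == 80   # 80 >= 0.9 * 83, so the lower multiple stands
assert _round_to_multiple_of(84, 8) == 88   # int(84 + 4) // 8 * 8 == 88 already exceeds 0.9 * 84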
test/retiarii_test/cgo_mnasnet/mutator.py — new file (0 → 100644)

import logging
import sys
from pathlib import Path

sys.path.append(str(Path(__file__).resolve().parents[2]))

from nni.retiarii import Mutator

from base_mnasnet import RegularConv, DepthwiseConv, MobileConv

_logger = logging.getLogger(__name__)


class BlockMutator(Mutator):
    def __init__(self, target: str):
        super(BlockMutator, self).__init__()
        self.target = target

    def mutate(self, model):
        nodes = model.get_nodes_by_label(self.target)
        assert len(nodes) == 1
        node = nodes[0]
        graph = node.graph
        related_info = node.operation.parameters

        kernel_size = self.choice(related_info['kernel_size_options'])
        op_type = self.choice(related_info['op_type_options'])
        # self.choice(related_info['se_ratio_options'])
        skip = self.choice(related_info['skip_options'])
        n_filter = self.choice(related_info['n_filter_options'])

        if related_info['in_ch'] is not None:
            in_ch = related_info['in_ch']
        else:
            assert len(node.predecessors) == 1
            the_node = node.predecessors[0]
            _logger.debug(repr(the_node.operation.parameters))
            _logger.debug(the_node.__repr__())
            in_ch = the_node.operation.parameters['out_ch']

        # update the placeholder to be a new operation
        node.update_operation(op_type, {
            'kernel_size': kernel_size,
            'in_ch': in_ch,
            'out_ch': n_filter,
            'skip': 'no',
            'exp_ratio': related_info['exp_ratio'],
            'stride': related_info['stride']
        })

        # insert new nodes after the placeholder
        n_layer = self.choice(related_info['n_layer_options'])
        for i in range(1, n_layer):
            node = graph.insert_node_on_edge(node.outgoing_edges[0],
                                             '{}_{}'.format(self.target, i),
                                             op_type,
                                             {'kernel_size': kernel_size,
                                              'in_ch': n_filter,
                                              'out_ch': n_filter,
                                              'skip': skip,
                                              'exp_ratio': related_info['exp_ratio'],
                                              'stride': 1})

        # fix possible shape mismatch
        # TODO: use formal method function to update parameters
        if len(node.successors) == 1 and 'in_channels' in node.successors[0].operation.parameters:
            node.successors[0].operation.parameters['in_channels'] = n_filter
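`BlockMutator` shows the general Retiarii mutation pattern: locate a labeled node, sample options with `self.choice(...)`, then rewrite the graph. A stripped-down variant, hypothetical and not part of this commit, makes the skeleton clearer:

from nni.retiarii import Mutator

class KernelSizeMutator(Mutator):
    """Hypothetical minimal mutator: only resamples a kernel size."""

    def __init__(self, target: str):
        super().__init__()
        self.target = target

    def mutate(self, model):
        node = model.get_nodes_by_label(self.target)[0]
        kernel_size = self.choice([3, 5, 7])   # one sample per generated model
        # keep the operation type, override a single parameter
        node.update_operation(node.operation.type,
                              {**node.operation.parameters, 'kernel_size': kernel_size})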
test/retiarii_test/cgo_mnasnet/test.py — new file (0 → 100644)

import os
import sys
import torch
from pathlib import Path

import nni.retiarii.evaluator.pytorch.lightning as pl
import nni.retiarii.evaluator.pytorch.cgo.evaluator as cgo
from nni.retiarii import serialize
from base_mnasnet import MNASNet
from nni.experiment import RemoteMachineConfig
from nni.retiarii.experiment.pytorch import RetiariiExperiment, RetiariiExeConfig
from nni.retiarii.strategy import TPEStrategy
from torchvision import transforms
from torchvision.datasets import CIFAR10

from mutator import BlockMutator

if __name__ == '__main__':
    _DEFAULT_DEPTHS = [16, 24, 40, 80, 96, 192, 320]
    _DEFAULT_CONVOPS = ["dconv", "mconv", "mconv", "mconv", "mconv", "mconv", "mconv"]
    _DEFAULT_SKIPS = [False, True, True, True, True, True, True]
    _DEFAULT_KERNEL_SIZES = [3, 3, 5, 5, 3, 5, 3]
    _DEFAULT_NUM_LAYERS = [1, 3, 3, 3, 2, 4, 1]
    base_model = MNASNet(0.5, _DEFAULT_DEPTHS, _DEFAULT_CONVOPS, _DEFAULT_KERNEL_SIZES,
                         _DEFAULT_NUM_LAYERS, _DEFAULT_SKIPS)

    train_transform = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ])
    valid_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ])
    train_dataset = serialize(CIFAR10, root='data/cifar10', train=True, download=True, transform=train_transform)
    test_dataset = serialize(CIFAR10, root='data/cifar10', train=False, download=True, transform=valid_transform)

    # trainer = pl.Classification(train_dataloader=pl.DataLoader(train_dataset, batch_size=100),
    #                             val_dataloaders=pl.DataLoader(test_dataset, batch_size=100),
    #                             max_epochs=1, limit_train_batches=0.2)
    trainer = cgo.Classification(train_dataloader=pl.DataLoader(train_dataset, batch_size=100),
                                 val_dataloaders=pl.DataLoader(test_dataset, batch_size=100),
                                 max_epochs=1, limit_train_batches=0.2)

    applied_mutators = [BlockMutator('mutable_0'), BlockMutator('mutable_1')]

    simple_strategy = TPEStrategy()

    exp = RetiariiExperiment(base_model, trainer, applied_mutators, simple_strategy)

    exp_config = RetiariiExeConfig('remote')
    exp_config.experiment_name = 'darts_search'
    exp_config.trial_concurrency = 3
    exp_config.max_trial_number = 10
    exp_config.trial_gpu_number = 1
    exp_config.training_service.use_active_gpu = True
    exp_config.training_service.reuse_mode = True
    exp_config.training_service.gpu_indices = [0, 1, 2]
    exp_config.max_concurrency_cgo = 1
    exp_config.batch_waiting_time = 0

    rm_conf = RemoteMachineConfig()
    rm_conf.host = '127.0.0.1'
    rm_conf.user = 'xxx'
    rm_conf.password = 'xxx'
    rm_conf.port = 22
    rm_conf.python_path = '/home/xxx/py38/bin'
    rm_conf.gpu_indices = [0, 1, 2]
    rm_conf.use_active_gpu = True
    rm_conf.max_trial_number_per_gpu = 3

    exp_config.training_service.machine_list = [rm_conf]
    exp_config.execution_engine = 'cgo'

    exp.run(exp_config, 8099)
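Compared with the local cgo/test.py above, this script drives CGO through the remote training service: `reuse_mode = True`, a `RemoteMachineConfig` whose `host`, `user`, `password`, and `python_path` are placeholders (`'xxx'`, to be filled in before running), `max_trial_number_per_gpu = 3`, which by its name allows several trials per GPU, and `max_concurrency_cgo = 1`.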
test/retiarii_test/darts/test.py — modified (+2 −1)

...
@@ -31,7 +31,8 @@ if __name__ == '__main__':
     test_dataset = serialize(CIFAR10, root='data/cifar10', train=False, download=True, transform=valid_transform)
     trainer = pl.Classification(train_dataloader=pl.DataLoader(train_dataset, batch_size=100),
                                 val_dataloaders=pl.DataLoader(test_dataset, batch_size=100),
-                                max_epochs=1, limit_train_batches=0.2)
+                                max_epochs=1, limit_train_batches=0.2,
+                                progress_bar_refresh_rate=0)
     simple_strategy = strategy.Random()
...
test/ut/retiarii/converted_mnist_pytorch.json — deleted (100644 → 0)

{
  "_model__stem": {
    "inputs": ["_inputs__1"],
    "outputs": ["pool2__1"],
    "nodes": {
      "_model__stem__conv1": {"operation": {"type": "__torch__.torch.nn.modules.conv.Conv2d", "parameters": {"out_channels": 32, "in_channels": 1, "kernel_size": 5}}},
      "_model__stem__pool1": {"operation": {"type": "__torch__.torch.nn.modules.pooling.MaxPool2d", "parameters": {"kernel_size": 2}}},
      "_model__stem__conv2": {"operation": {"type": "__torch__.torch.nn.modules.conv.Conv2d", "parameters": {"out_channels": 64, "in_channels": 32, "kernel_size": 5}}},
      "_model__stem__pool2": {"operation": {"type": "__torch__.torch.nn.modules.pooling.MaxPool2d", "parameters": {"kernel_size": 2}}}
    },
    "edges": [
      {"head": ["_inputs", 0], "tail": ["_model__stem__conv1", 0]},
      {"head": ["_model__stem__conv1", null], "tail": ["_model__stem__pool1", 0]},
      {"head": ["_model__stem__pool1", null], "tail": ["_model__stem__conv2", 0]},
      {"head": ["_model__stem__conv2", null], "tail": ["_model__stem__pool2", 0]},
      {"head": ["_model__stem__pool2", null], "tail": ["_outputs", null]}
    ]
  },
  "_model": {
    "inputs": ["image__1"],
    "outputs": ["softmax__1"],
    "nodes": {
      "_model__Constant2": {"operation": {"type": "prim::Constant", "parameters": {}}},
      "_model__Constant3": {"operation": {"type": "prim::Constant", "parameters": {"value": 3}}},
      "_model__Constant4": {"operation": {"type": "prim::Constant", "parameters": {"value": -1}}},
      "_model__Constant5": {"operation": {"type": "prim::Constant", "parameters": {"value": 0}}},
      "_model__stem": {"operation": {"type": "_cell", "parameters": {}, "cell_name": "_model__stem"}},
      "_model__Size6": {"operation": {"type": "aten::size", "parameters": {}}},
      "_model__ListConstruct7": {"operation": {"type": "prim::ListConstruct", "parameters": {}}},
      "_model__View8": {"operation": {"type": "aten::view", "parameters": {}}},
      "_model__fc1": {"operation": {"type": "__torch__.torch.nn.modules.linear.Linear", "parameters": {"in_features": 1024, "out_features": 256}}},
      "_model__fc2": {"operation": {"type": "__torch__.torch.nn.modules.linear.Linear", "parameters": {"in_features": 256, "out_features": 10}}},
      "_model__softmax9": {"operation": {"type": "Function.softmax", "parameters": {}}}
    },
    "edges": [
      {"head": ["_inputs", 0], "tail": ["_model__stem", 0]},
      {"head": ["_model__stem", null], "tail": ["_model__Size6", 0]},
      {"head": ["_model__Constant5", null], "tail": ["_model__Size6", 1]},
      {"head": ["_model__Size6", null], "tail": ["_model__ListConstruct7", 0]},
      {"head": ["_model__Constant4", null], "tail": ["_model__ListConstruct7", 1]},
      {"head": ["_model__stem", null], "tail": ["_model__View8", 0]},
      {"head": ["_model__ListConstruct7", null], "tail": ["_model__View8", 1]},
      {"head": ["_model__View8", null], "tail": ["_model__fc1", 0]},
      {"head": ["_model__fc1", null], "tail": ["_model__fc2", 0]},
      {"head": ["_model__fc2", null], "tail": ["_model__softmax9", 0]},
      {"head": ["_model__Constant4", null], "tail": ["_model__softmax9", 1]},
      {"head": ["_model__Constant3", null], "tail": ["_model__softmax9", 2]},
      {"head": ["_model__Constant2", null], "tail": ["_model__softmax9", 3]},
      {"head": ["_model__softmax9", null], "tail": ["_outputs", null]}
    ]
  },
  "_evaluator": {
    "module": "nni.retiarii.trainer.PyTorchImageClassificationTrainer",
    "kwargs": {
      "dataset_cls": "MNIST",
      "dataset_kwargs": {"root": "data/mnist", "download": true},
      "dataloader_kwargs": {"batch_size": 32},
      "optimizer_cls": "SGD",
      "optimizer_kwargs": {"lr": 1e-3},
      "trainer_kwargs": {"max_epochs": 1}
    }
  }
}
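This fixture goes away because the tests no longer need it: as the `_load_mnist` change in test_cgo_engine.py below shows, models are now loaded from mnist_pytorch.json and get a Lightning-based evaluator attached at load time, replacing the baked-in `_evaluator` block (the old `PyTorchImageClassificationTrainer` configuration) above.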
test/ut/retiarii/test_cgo_engine.py — modified (+280 −20); the updated content of the displayed hunks:

import json
import os
import sys
import threading
import unittest
import logging
import time

import torch
import torch.nn as nn
from pathlib import Path

import nni

try:
    from nni.retiarii.codegen import model_to_pytorch_script
    from nni.common.device import GPUDevice
    from nni.retiarii.integration import RetiariiAdvisor
    from nni.retiarii.execution.cgo_engine import CGOExecutionEngine
    from nni.retiarii.evaluator.pytorch import PyTorchImageClassificationTrainer, PyTorchMultiModelTrainer
    from nni.retiarii import Model, submit_models, serialize
    from nni.retiarii.graph import Node
    from nni.retiarii.execution import set_execution_engine
    from nni.retiarii.execution.logical_optimizer.opt_dedup_input import DedupInputOptimizer
    from nni.retiarii.execution.logical_optimizer.logical_plan import LogicalPlan
    from nni.retiarii.utils import import_
    import nni.retiarii.evaluator.pytorch.lightning as pl
    from nni.retiarii.evaluator.pytorch.cgo.evaluator import MultiModelSupervisedLearningModule, \
        _MultiModelSupervisedLearningModule
    import nni.retiarii.evaluator.pytorch.cgo.trainer as cgo_trainer

    module_import_failed = False
except ImportError:
    module_import_failed = True

import pytest
from torchvision.datasets import MNIST
from torchvision import transforms
from torch.utils.data import Dataset
from sklearn.datasets import load_diabetes


class _model_cpu(nn.Module):
    # Hand-written reference of what CGO produces when two MNIST models are
    # merged on CPU: one module containing both graphs, sharing one input.
    def __init__(self):
        super().__init__()
        self.M_1_stem = M_1_stem()
        self.M_2_stem = M_2_stem()
        self.M_1_flatten = torch.nn.Flatten()
        self.M_2_flatten = torch.nn.Flatten()
        self.M_1_fc1 = torch.nn.Linear(out_features=256, in_features=1024)
        self.M_2_fc1 = torch.nn.Linear(out_features=256, in_features=1024)
        self.M_1_fc2 = torch.nn.Linear(out_features=10, in_features=256)
        self.M_2_fc2 = torch.nn.Linear(out_features=10, in_features=256)
        self.M_1_softmax = torch.nn.Softmax()
        self.M_2_softmax = torch.nn.Softmax()

    def forward(self, *_inputs):
        # deduplicated input: both stems read the same tensor
        M_1__inputs_to_M_2_stem = _inputs[0]
        M_1_stem = self.M_1_stem(_inputs[0])
        M_2_stem = self.M_2_stem(M_1__inputs_to_M_2_stem)
        M_1_flatten = self.M_1_flatten(M_1_stem)
        M_2_flatten = self.M_2_flatten(M_2_stem)
        M_1_fc1 = self.M_1_fc1(M_1_flatten)
        M_2_fc1 = self.M_2_fc1(M_2_flatten)
        M_1_fc2 = self.M_1_fc2(M_1_fc1)
        M_2_fc2 = self.M_2_fc2(M_2_fc1)
        M_1_softmax = self.M_1_softmax(M_1_fc2)
        M_2_softmax = self.M_2_softmax(M_2_fc2)
        return M_1_softmax, M_2_softmax


class _model_gpu(nn.Module):
    # Same merged pair, but with device placement: model 1 on cuda:0, model 2 on cuda:1.
    def __init__(self):
        super().__init__()
        self.M_1_stem = M_1_stem().to('cuda:0')
        self.M_2_stem = M_2_stem().to('cuda:1')
        self.M_1_flatten = torch.nn.Flatten().to('cuda:0')
        self.M_2_flatten = torch.nn.Flatten().to('cuda:1')
        self.M_1_fc1 = torch.nn.Linear(out_features=256, in_features=1024).to('cuda:0')
        self.M_2_fc1 = torch.nn.Linear(out_features=256, in_features=1024).to('cuda:1')
        self.M_1_fc2 = torch.nn.Linear(out_features=10, in_features=256).to('cuda:0')
        self.M_2_fc2 = torch.nn.Linear(out_features=10, in_features=256).to('cuda:1')
        self.M_1_softmax = torch.nn.Softmax().to('cuda:0')
        self.M_2_softmax = torch.nn.Softmax().to('cuda:1')

    def forward(self, *_inputs):
        # the shared input is copied once per device
        M_1__inputs_to_M_1_stem = _inputs[0].to("cuda:0")
        M_1__inputs_to_M_2_stem = _inputs[0].to("cuda:1")
        M_1_stem = self.M_1_stem(M_1__inputs_to_M_1_stem)
        M_2_stem = self.M_2_stem(M_1__inputs_to_M_2_stem)
        M_1_flatten = self.M_1_flatten(M_1_stem)
        M_2_flatten = self.M_2_flatten(M_2_stem)
        M_1_fc1 = self.M_1_fc1(M_1_flatten)
        M_2_fc1 = self.M_2_fc1(M_2_flatten)
        M_1_fc2 = self.M_1_fc2(M_1_fc1)
        M_2_fc2 = self.M_2_fc2(M_2_fc1)
        M_1_softmax = self.M_1_softmax(M_1_fc2)
        M_2_softmax = self.M_2_softmax(M_2_fc2)
        return M_1_softmax, M_2_softmax


class M_1_stem(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = torch.nn.Conv2d(out_channels=32, in_channels=1, kernel_size=5)
        self.pool1 = torch.nn.MaxPool2d(kernel_size=2)
        self.conv2 = torch.nn.Conv2d(out_channels=64, in_channels=32, kernel_size=5)
        self.pool2 = torch.nn.MaxPool2d(kernel_size=2)

    def forward(self, *_inputs):
        conv1 = self.conv1(_inputs[0])
        pool1 = self.pool1(conv1)
        conv2 = self.conv2(pool1)
        pool2 = self.pool2(conv2)
        return pool2


class M_2_stem(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = torch.nn.Conv2d(out_channels=32, in_channels=1, kernel_size=5)
        self.pool1 = torch.nn.MaxPool2d(kernel_size=2)
        self.conv2 = torch.nn.Conv2d(out_channels=64, in_channels=32, kernel_size=5)
        self.pool2 = torch.nn.MaxPool2d(kernel_size=2)

    def forward(self, *_inputs):
        conv1 = self.conv1(_inputs[0])
        pool1 = self.pool1(conv1)
        conv2 = self.conv2(pool1)
        pool2 = self.pool2(conv2)
        return pool2


def _reset():
    # this is to not affect other tests in sdk
    nni.trial._intermediate_seq = 0
    nni.trial._params = {'foo': 'bar', 'parameter_id': 0}
    nni.runtime.platform.test._last_metric = None
    nni.retiarii.integration_api._advisor = None
    nni.retiarii.execution.api._execution_engine = None


def _new_trainer():
    transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])
    train_dataset = serialize(MNIST, root='data/mnist', train=True, download=True, transform=transform)
    test_dataset = serialize(MNIST, root='data/mnist', train=False, download=True, transform=transform)

    multi_module = MultiModelSupervisedLearningModule(nn.CrossEntropyLoss, {'acc': pl._AccuracyWithLogits})

    lightning = pl.Lightning(multi_module,
                             cgo_trainer.Trainer(use_cgo=True,
                                                 max_epochs=1,
                                                 limit_train_batches=0.25,
                                                 progress_bar_refresh_rate=0),
                             train_dataloader=pl.DataLoader(train_dataset, batch_size=100),
                             val_dataloaders=pl.DataLoader(test_dataset, batch_size=100))
    return lightning


def _load_mnist(n_models: int = 1):
    path = Path(__file__).parent / 'mnist_pytorch.json'
    with open(path) as f:
        mnist_model = Model._load(json.load(f))
        mnist_model.evaluator = _new_trainer()
    if n_models == 1:
        return mnist_model
    else:
        models = [mnist_model]
        for i in range(n_models - 1):
            forked_model = mnist_model.fork()
            forked_model.evaluator = _new_trainer()
            models.append(forked_model)
        return models


def _get_final_result():
    result = json.loads(nni.runtime.platform.test._last_metric)['value']
    if isinstance(result, list):
        return [float(_) for _ in result]
    else:
        if isinstance(result, str) and '[' in result:
            return json.loads(result)
        return [float(result)]


class CGOEngineTest(unittest.TestCase):

    def setUp(self):
        if module_import_failed:
            self.skipTest('test skip due to failed import of nni.retiarii.evaluator.pytorch.lightning')

    def test_multi_model_trainer_cpu(self):
        _reset()
        transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])
        train_dataset = serialize(MNIST, root='data/mnist', train=True, download=True, transform=transform)
        test_dataset = serialize(MNIST, root='data/mnist', train=False, download=True, transform=transform)

        multi_module = _MultiModelSupervisedLearningModule(nn.CrossEntropyLoss, {'acc': pl._AccuracyWithLogits}, n_models=2)

        lightning = pl.Lightning(multi_module,
                                 cgo_trainer.Trainer(use_cgo=True, max_epochs=1, limit_train_batches=0.25),
                                 train_dataloader=pl.DataLoader(train_dataset, batch_size=100),
                                 val_dataloaders=pl.DataLoader(test_dataset, batch_size=100))
        lightning._execute(_model_cpu)

        result = _get_final_result()
        assert len(result) == 2
        for _ in result:
            assert _ > 0.8

    def test_multi_model_trainer_gpu(self):
        _reset()
        if not (torch.cuda.is_available() and torch.cuda.device_count() >= 2):
            pytest.skip('test requires GPU and torch+cuda')
        transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])
        train_dataset = serialize(MNIST, root='data/mnist', train=True, download=True, transform=transform)
        test_dataset = serialize(MNIST, root='data/mnist', train=False, download=True, transform=transform)

        multi_module = _MultiModelSupervisedLearningModule(nn.CrossEntropyLoss, {'acc': pl._AccuracyWithLogits}, n_models=2)

        lightning = pl.Lightning(multi_module,
                                 cgo_trainer.Trainer(use_cgo=True, max_epochs=1, limit_train_batches=0.25),
                                 train_dataloader=pl.DataLoader(train_dataset, batch_size=100),
                                 val_dataloaders=pl.DataLoader(test_dataset, batch_size=100))
        lightning._execute(_model_gpu)

        result = _get_final_result()
        assert len(result) == 2
        for _ in result:
            assert _ > 0.8

    def _build_logical_with_mnist(self, n_models: int):
        lp = LogicalPlan()
        models = _load_mnist(n_models=n_models)
        for m in models:
            lp.add_model(m)
        return lp, models

    def test_add_model(self):
        _reset()
        lp, models = self._build_logical_with_mnist(3)
        for node in lp.logical_graph.hidden_nodes:
            old_nodes = [m.root_graph.get_node_by_id(node.id) for m in models]
            self.assertTrue(any([old_nodes[0].__repr__() == Node.__repr__(x) for x in old_nodes]))

    def test_dedup_input_four_devices(self):
        _reset()
        lp, models = self._build_logical_with_mnist(3)
        opt = DedupInputOptimizer()
        opt.convert(lp)
        advisor = RetiariiAdvisor()
        available_devices = [GPUDevice("test", 0), GPUDevice("test", 1),
                             GPUDevice("test", 2), GPUDevice("test", 3)]
        cgo = CGOExecutionEngine(devices=available_devices, batch_waiting_time=0)
        phy_models = cgo._assemble(lp)
        self.assertTrue(len(phy_models) == 1)
        advisor.stopping = True
        advisor.default_worker.join()
        advisor.assessor_worker.join()
        cgo.join()

    def test_dedup_input_two_devices(self):
        _reset()
        lp, models = self._build_logical_with_mnist(3)
        opt = DedupInputOptimizer()
        opt.convert(lp)
        advisor = RetiariiAdvisor()
        available_devices = [GPUDevice("test", 0), GPUDevice("test", 1)]
        cgo = CGOExecutionEngine(devices=available_devices, batch_waiting_time=0)
        phy_models = cgo._assemble(lp)
        self.assertTrue(len(phy_models) == 2)
        advisor.stopping = True
        advisor.default_worker.join()
        advisor.assessor_worker.join()
        cgo.join()

    def test_submit_models(self):
        os.environ['CGO'] = 'true'
        _reset()
        nni.retiarii.debug_configs.framework = 'pytorch'
        os.makedirs('generated', exist_ok=True)
        from nni.runtime import protocol, platform
        import nni.runtime.platform.test as tt
        protocol._out_file = open('generated/debug_protocol_out_file.py', 'wb')
        protocol._in_file = open('generated/debug_protocol_out_file.py', 'rb')

        models = _load_mnist(2)

        advisor = RetiariiAdvisor()
        cgo_engine = CGOExecutionEngine(devices=[GPUDevice("test", 0), GPUDevice("test", 1),
                                                 GPUDevice("test", 2), GPUDevice("test", 3)],
                                        batch_waiting_time=0)
        set_execution_engine(cgo_engine)
        submit_models(*models)
        time.sleep(3)

        if torch.cuda.is_available() and torch.cuda.device_count() >= 2:
            cmd, data = protocol.receive()
            params = json.loads(data)

            tt.init_params(params)

            trial_thread = threading.Thread(target=CGOExecutionEngine.trial_execute_graph)
            trial_thread.start()
            last_metric = None
            while True:
                # ... (lines elided by the diff view; next hunk @@ -66,15 +321,20 @@ class CGOEngineTest(unittest.TestCase)) ...
                metric = tt.get_last_metric()
                if metric == last_metric:
                    continue
                if 'value' in metric:
                    metric['value'] = json.dumps(metric['value'])
                advisor.handle_report_metric_data(metric)
                last_metric = metric
                if not trial_thread.is_alive():
                    trial_thread.join()
                    break

            trial_thread.join()

        advisor.stopping = True
        advisor.default_worker.join()
        advisor.assessor_worker.join()
        cgo_engine.join()


if __name__ == '__main__':
    ...
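The two `_assemble` assertions above encode a simple capacity argument: three single-GPU models fit into one merged physical model when four devices are available, but need two merged models when only two devices are available. A back-of-the-envelope restatement, as a simplification assuming one device per logical model rather than the engine's actual placement code:

import math

def n_physical_models(n_logical: int, n_devices: int) -> int:
    # each logical model occupies one device; a merged physical model can span all devices
    return math.ceil(n_logical / n_devices)

assert n_physical_models(3, 4) == 1   # matches test_dedup_input_four_devices
assert n_physical_models(3, 2) == 2   # matches test_dedup_input_two_devices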
test/ut/retiarii/test_dedup_input.py — deleted (100644 → 0)

import json
import os
import sys
import threading
import unittest
import logging
import time
from pathlib import Path

from nni.retiarii.execution.cgo_engine import CGOExecutionEngine
from nni.retiarii.execution.logical_optimizer.logical_plan import LogicalPlan
from nni.retiarii.execution.logical_optimizer.opt_dedup_input import DedupInputOptimizer
from nni.retiarii.codegen import model_to_pytorch_script
from nni.retiarii import Model, Node, submit_models
from nni.retiarii.integration import RetiariiAdvisor
from nni.retiarii.utils import import_


def _load_mnist(n_models: int = 1):
    path = Path(__file__).parent / 'converted_mnist_pytorch.json'
    with open(path) as f:
        mnist_model = Model._load(json.load(f))
    if n_models == 1:
        return mnist_model
    else:
        models = [mnist_model]
        for i in range(n_models - 1):
            models.append(mnist_model.fork())
        return models


@unittest.skip('Skipped in this version')
class DedupInputTest(unittest.TestCase):
    def _build_logical_with_mnist(self, n_models: int):
        lp = LogicalPlan()
        models = _load_mnist(n_models=n_models)
        for m in models:
            lp.add_model(m)
        return lp, models

    def _test_add_model(self):
        lp, models = self._build_logical_with_mnist(3)
        for node in lp.logical_graph.hidden_nodes:
            old_nodes = [m.root_graph.get_node_by_id(node.id) for m in models]
            self.assertTrue(any([old_nodes[0].__repr__() == Node.__repr__(x) for x in old_nodes]))

    def test_dedup_input(self):
        os.environ['CGO'] = 'true'
        lp, models = self._build_logical_with_mnist(3)
        opt = DedupInputOptimizer()
        opt.convert(lp)
        with open('dedup_logical_graph.json', 'r') as fp:
            correct_dump = fp.readlines()
        lp_dump = lp.logical_graph._dump()
        self.assertTrue(correct_dump[0] == json.dumps(lp_dump))
        advisor = RetiariiAdvisor()
        cgo = CGOExecutionEngine()
        phy_models = cgo._assemble(lp)
        self.assertTrue(len(phy_models) == 1)
        # logging.info(phy_models[0][0]._dump())
        # script = model_to_pytorch_script(phy_models[0][0], placement=phy_models[0][1])
        # logging.info(script)
        # with open('generated/debug_dedup_input.py', 'w') as fp:
        #     fp.write(script)
        # sys.path.insert(0, 'generated')
        # multi_model = import_('debug_dedup_input.logical_0')
        # trainer = PyTorchMultiModelTrainer(
        #     multi_model(), phy_models[0][0].evaluator.kwargs
        # )
        # trainer.fit()
        advisor.stopping = True
        advisor.default_worker.join()
        advisor.assessor_worker.join()


if __name__ == '__main__':
    unittest.main()
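Note that `DedupInputTest` was already decorated with `@unittest.skip`; its helpers and assertions survive in the rewritten test_cgo_engine.py above, where `_build_logical_with_mnist`, `test_add_model`, and the two `test_dedup_input_*` cases now run against an explicit device list instead of comparing against a dumped reference graph.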
test/ut/retiarii/test_engine.py — modified (+4 −1)

...
@@ -22,6 +22,8 @@ class EngineTest(unittest.TestCase):
         self.assertEqual(script.strip(), reference_script.strip())

     def test_base_execution_engine(self):
+        nni.retiarii.integration_api._advisor = None
+        nni.retiarii.execution.api._execution_engine = None
         advisor = RetiariiAdvisor()
         set_execution_engine(BaseExecutionEngine())
         with open(self.enclosing_dir / 'mnist_pytorch.json') as f:
...
@@ -33,7 +35,8 @@ class EngineTest(unittest.TestCase):
         advisor.assessor_worker.join()

     def test_py_execution_engine(self):
+        nni.retiarii.integration_api._advisor = None
+        nni.retiarii.execution.api._execution_engine = None
         advisor = RetiariiAdvisor()
         set_execution_engine(PurePythonExecutionEngine())
         model = Model._load({
...