Unverified Commit 619177b9 authored by Yuge Zhang, committed by GitHub

[Retiarii] Remove unused code and enrich integration tests (#4097)

parent 0918ea0c
from .base import PyTorchImageClassificationTrainer, PyTorchMultiModelTrainer
from .lightning import *
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

# This file is deprecated.

import abc
from typing import Any, List, Dict, Tuple

import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

import nni


class BaseTrainer(abc.ABC):

    @abc.abstractmethod
    def fit(self) -> None:
        pass
def get_default_transform(dataset: str) -> Any:
    """
    Get the default image transform for a specific dataset.
    This is needed because transform objects cannot be directly passed as arguments.

    Parameters
    ----------
    dataset : str
        Dataset class name.

    Returns
    -------
    transform object
    """
    if dataset == 'MNIST':
        return transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307,), (0.3081,))
        ])
    if dataset == 'CIFAR10':
        return transforms.Compose([
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465),
                                 (0.2023, 0.1994, 0.2010)),
        ])
    # unsupported dataset, return None
    return None
class PyTorchImageClassificationTrainer(BaseTrainer):
    """
    Image classification trainer for PyTorch.

    A model, along with the corresponding dataset and optimizer config, is used to initialize the trainer.
    The trainer runs for a fixed number of epochs (by default 10) and reports the final result.

    TODO
    Support scheduler, validating every n epochs, and separate train/valid datasets.

    Limitation induced by NNI: kwargs must be serializable, because they are packed into a JSON as parameters.
    """

    def __init__(self, model,
                 dataset_cls='MNIST', dataset_kwargs=None, dataloader_kwargs=None,
                 optimizer_cls='SGD', optimizer_kwargs=None, trainer_kwargs=None):
        """Initialization of image classification trainer.

        Parameters
        ----------
        model : nn.Module
            Model to train.
        dataset_cls : str, optional
            Dataset class name that is available in ``torchvision.datasets``, by default 'MNIST'
        dataset_kwargs : dict, optional
            Keyword arguments passed to initialization of dataset class, by default None
        dataloader_kwargs : dict, optional
            Keyword arguments passed to ``torch.utils.data.DataLoader``, by default None
        optimizer_cls : str, optional
            Optimizer class name that is available in ``torch.optim``, by default 'SGD'
        optimizer_kwargs : dict, optional
            Keyword arguments passed to initialization of optimizer class, by default None
        trainer_kwargs : dict, optional
            Keyword arguments passed to trainer. Will be passed to a Trainer class in future. Currently,
            only the key ``max_epochs`` is useful.
        """
        super().__init__()
        self._use_cuda = torch.cuda.is_available()
        self.model = model
        if self._use_cuda:
            self.model.cuda()
        self._loss_fn = nn.CrossEntropyLoss()
        self._train_dataset = getattr(datasets, dataset_cls)(train=True, transform=get_default_transform(dataset_cls),
                                                             **(dataset_kwargs or {}))
        self._val_dataset = getattr(datasets, dataset_cls)(train=False, transform=get_default_transform(dataset_cls),
                                                           **(dataset_kwargs or {}))
        self._optimizer = getattr(torch.optim, optimizer_cls)(model.parameters(), **(optimizer_kwargs or {}))
        self._trainer_kwargs = trainer_kwargs or {'max_epochs': 10}
        self._train_dataloader = DataLoader(self._train_dataset, **(dataloader_kwargs or {}))
        self._val_dataloader = DataLoader(self._val_dataset, **(dataloader_kwargs or {}))
    def _accuracy(self, input, target):  # pylint: disable=redefined-builtin
        _, predict = torch.max(input.data, 1)
        correct = predict.eq(target.data).cpu().sum().item()
        return correct / input.size(0)

    def training_step(self, batch: Tuple[torch.Tensor, torch.Tensor], batch_idx: int) -> torch.Tensor:
        x, y = self.training_step_before_model(batch, batch_idx)
        y_hat = self.model(x)
        return self.training_step_after_model(x, y, y_hat)

    def training_step_before_model(self, batch: Tuple[torch.Tensor, torch.Tensor], batch_idx: int):
        x, y = batch
        if self._use_cuda:
            x, y = x.cuda(torch.device('cuda:0')), y.cuda(torch.device('cuda:0'))
        return x, y

    def training_step_after_model(self, x, y, y_hat):
        loss = self._loss_fn(y_hat, y)
        return loss

    def validation_step(self, batch: Tuple[torch.Tensor, torch.Tensor], batch_idx: int) -> Dict[str, Any]:
        x, y = self.validation_step_before_model(batch, batch_idx)
        y_hat = self.model(x)
        return self.validation_step_after_model(x, y, y_hat)

    def validation_step_before_model(self, batch: Tuple[torch.Tensor, torch.Tensor], batch_idx: int):
        x, y = batch
        if self._use_cuda:
            x, y = x.cuda(), y.cuda()
        return x, y

    def validation_step_after_model(self, x, y, y_hat):
        acc = self._accuracy(y_hat, y)
        return {'val_acc': acc}

    def validation_epoch_end(self, outputs: List[Dict[str, Any]]) -> Dict[str, Any]:
        # We might need dict metrics in future?
        avg_acc = np.mean([x['val_acc'] for x in outputs]).item()
        nni.report_intermediate_result(avg_acc)
        return {'val_acc': avg_acc}

    def _validate(self):
        validation_outputs = []
        for i, batch in enumerate(self._val_dataloader):
            validation_outputs.append(self.validation_step(batch, i))
        return self.validation_epoch_end(validation_outputs)

    def _train(self):
        for i, batch in enumerate(self._train_dataloader):
            self._optimizer.zero_grad()
            loss = self.training_step(batch, i)
            loss.backward()
            self._optimizer.step()

    def fit(self) -> None:
        for _ in range(self._trainer_kwargs['max_epochs']):
            self._train()
            self._validate()
        # assuming val_acc here
        nni.report_final_result(self._validate()['val_acc'])
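For reference, a minimal usage sketch of this (deprecated) trainer. The two-layer network and all hyperparameter values below are illustrative stand-ins, not taken from the file:

# Minimal usage sketch (hypothetical model; this trainer is deprecated).
model = nn.Sequential(nn.Flatten(), nn.Linear(28 * 28, 128), nn.ReLU(), nn.Linear(128, 10))
trainer = PyTorchImageClassificationTrainer(
    model,
    dataset_cls='MNIST',
    dataset_kwargs={'root': 'data/mnist', 'download': True},
    dataloader_kwargs={'batch_size': 32},
    optimizer_kwargs={'lr': 1e-3},
    trainer_kwargs={'max_epochs': 2})
trainer.fit()  # trains, then reports final validation accuracy through NNI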
class PyTorchMultiModelTrainer(BaseTrainer):
    def __init__(self, multi_model, kwargs=None):
        super().__init__()
        self.multi_model = multi_model
        self.kwargs = kwargs or {}
        self._train_dataloaders = []
        self._train_datasets = []
        self._val_dataloaders = []
        self._val_datasets = []
        self._optimizers = []
        self._trainers = []
        self._loss_fn = nn.CrossEntropyLoss()
        self.max_steps = self.kwargs.get('max_steps')
        self.n_model = len(self.kwargs['model_kwargs'])
        for m in self.kwargs['model_kwargs']:
            if m['use_input']:
                dataset_cls = m['dataset_cls']
                dataset_kwargs = m['dataset_kwargs']
                dataloader_kwargs = m['dataloader_kwargs']
                train_dataset = getattr(datasets, dataset_cls)(train=True, transform=get_default_transform(dataset_cls),
                                                               **(dataset_kwargs or {}))
                val_dataset = getattr(datasets, dataset_cls)(train=False, transform=get_default_transform(dataset_cls),
                                                             **(dataset_kwargs or {}))
                train_dataloader = DataLoader(train_dataset, **(dataloader_kwargs or {}))
                val_dataloader = DataLoader(val_dataset, **(dataloader_kwargs or {}))
                self._train_datasets.append(train_dataset)
                self._train_dataloaders.append(train_dataloader)
                self._val_datasets.append(val_dataset)
                self._val_dataloaders.append(val_dataloader)
            if m['use_output']:
                optimizer_cls = m['optimizer_cls']
                optimizer_kwargs = m['optimizer_kwargs']
                # parameters of each sub-model are prefixed with "M_<model_id>_" in the fused multi-model
                m_header = f"M_{m['model_id']}"
                one_model_params = []
                for name, param in multi_model.named_parameters():
                    name_prefix = '_'.join(name.split('_')[:2])
                    if m_header == name_prefix:
                        one_model_params.append(param)
                optimizer = getattr(torch.optim, optimizer_cls)(one_model_params, **(optimizer_kwargs or {}))
                self._optimizers.append(optimizer)
    def fit(self) -> None:
        torch.autograd.set_detect_anomaly(True)
        max_epochs = max(x['trainer_kwargs']['max_epochs'] for x in self.kwargs['model_kwargs'])
        for _ in range(max_epochs):
            self._train()
            self._validate()
        nni.report_final_result(self._validate())

    def _train(self):
        for batch_idx, multi_model_batch in enumerate(zip(*self._train_dataloaders)):
            for opt in self._optimizers:
                opt.zero_grad()
            xs = []
            ys = []
            # each sub-model's batch is moved to its own device
            for idx, batch in enumerate(multi_model_batch):
                x, y = self.training_step_before_model(batch, batch_idx, f'cuda:{idx}')
                xs.append(x)
                ys.append(y)
            y_hats = self.multi_model(*xs)
            if len(ys) != len(xs):
                raise ValueError('len(ys) should be equal to len(xs)')
            losses = []
            report_loss = {}
            for output_idx, yhat in enumerate(y_hats):
                if len(ys) == len(y_hats):
                    loss = self.training_step_after_model(xs[output_idx], ys[output_idx], yhat)
                elif len(ys) == 1:
                    loss = self.training_step_after_model(xs[0], ys[0].to(yhat.get_device()), yhat)
                else:
                    raise ValueError('len(ys) should be either 1 or len(y_hats)')
                losses.append(loss.to("cuda:0"))
                report_loss[self.kwargs['model_kwargs'][output_idx]['model_id']] = loss.item()
            # gather all per-model losses on one device and backpropagate through the fused model at once
            summed_loss = sum(losses)
            summed_loss.backward()
            for opt in self._optimizers:
                opt.step()
            if self.max_steps and batch_idx >= self.max_steps:
                return

    def training_step_before_model(self, batch: Tuple[torch.Tensor, torch.Tensor], batch_idx: int, device=None):
        x, y = batch
        if device:
            x, y = x.cuda(torch.device(device)), y.cuda(torch.device(device))
        return x, y

    def training_step_after_model(self, x, y, y_hat):
        loss = self._loss_fn(y_hat, y)
        return loss
    def _validate(self):
        all_val_outputs = {idx: [] for idx in range(self.n_model)}
        for batch_idx, multi_model_batch in enumerate(zip(*self._val_dataloaders)):
            xs = []
            ys = []
            for idx, batch in enumerate(multi_model_batch):
                x, y = self.validation_step_before_model(batch, batch_idx, f'cuda:{idx}')
                xs.append(x)
                ys.append(y)
            if len(ys) != len(xs):
                raise ValueError('len(ys) should be equal to len(xs)')
            y_hats = self.multi_model(*xs)
            for output_idx, yhat in enumerate(y_hats):
                if len(ys) == len(y_hats):
                    acc = self.validation_step_after_model(xs[output_idx], ys[output_idx], yhat)
                elif len(ys) == 1:
                    acc = self.validation_step_after_model(xs[0], ys[0].to(yhat.get_device()), yhat)
                else:
                    raise ValueError('len(ys) should be either 1 or len(y_hats)')
                all_val_outputs[output_idx].append(acc)
        report_acc = {}
        for idx in all_val_outputs:
            avg_acc = np.mean([x['val_acc'] for x in all_val_outputs[idx]]).item()
            report_acc[self.kwargs['model_kwargs'][idx]['model_id']] = avg_acc
        nni.report_intermediate_result(report_acc)
        return report_acc

    def validation_step_before_model(self, batch: Tuple[torch.Tensor, torch.Tensor], batch_idx: int, device=None):
        x, y = batch
        if device:
            x, y = x.cuda(torch.device(device)), y.cuda(torch.device(device))
        return x, y

    def validation_step_after_model(self, x, y, y_hat):
        acc = self._accuracy(y_hat, y)
        return {'val_acc': acc}

    def _accuracy(self, input, target):  # pylint: disable=redefined-builtin
        _, predict = torch.max(input.data, 1)
        correct = predict.eq(target.data).cpu().sum().item()
        return correct / input.size(0)
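The expected shape of the `kwargs` dict is only implicit in the constructor above. The following sketch reconstructs it; the fused model, model ids, dataset names, and hyperparameter values are assumptions for illustration, not taken from the file:

# Hypothetical kwargs for a fused multi-model (fused_model is a stand-in module
# whose parameters are named "M_<model_id>_...", as the constructor expects).
multi_trainer = PyTorchMultiModelTrainer(
    fused_model,
    kwargs={
        'max_steps': 100,  # optional cap on training batches per epoch
        'model_kwargs': [
            {
                'model_id': 0,
                'use_input': True,    # this sub-model contributes a dataloader
                'use_output': True,   # this sub-model contributes an optimizer
                'dataset_cls': 'MNIST',
                'dataset_kwargs': {'root': 'data/mnist', 'download': True},
                'dataloader_kwargs': {'batch_size': 32},
                'optimizer_cls': 'SGD',
                'optimizer_kwargs': {'lr': 0.01},
                'trainer_kwargs': {'max_epochs': 2},
            },
            # ... one entry per sub-model ...
        ],
    })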
@@ -3,41 +3,38 @@
 import atexit
 import logging
+import os
+import socket
 import time
 from dataclasses import dataclass
-import os
 from pathlib import Path
-import socket
 from subprocess import Popen
 from threading import Thread
-import time
 from typing import Any, List, Optional, Union

 import colorama
 import psutil
 import torch
 import torch.nn as nn

 import nni.runtime.log
-from nni.experiment import Experiment, TrainingServiceConfig
-from nni.experiment import management, launcher, rest
+from nni.common.device import GPUDevice
+from nni.experiment import Experiment, TrainingServiceConfig, launcher, management, rest
 from nni.experiment.config import util
 from nni.experiment.config.base import ConfigBase, PathLike
 from nni.experiment.pipe import Pipe
 from nni.tools.nnictl.command_utils import kill_command
-from nni.common.device import GPUDevice

 from ..codegen import model_to_pytorch_script
 from ..converter import convert_to_graph
 from ..converter.graph_gen import GraphConverterWithShape
 from ..execution import list_models, set_execution_engine
 from ..execution.python import get_mutation_dict
-from ..graph import Model, Evaluator
+from ..graph import Evaluator
 from ..integration import RetiariiAdvisor
 from ..mutator import Mutator
-from ..nn.pytorch.mutator import process_inline_mutation, extract_mutation_from_pt_module
-from ..strategy import BaseStrategy
+from ..nn.pytorch.mutator import extract_mutation_from_pt_module, process_inline_mutation
 from ..oneshot.interface import BaseOneShotTrainer
+from ..strategy import BaseStrategy

 _logger = logging.getLogger(__name__)
@@ -73,7 +70,7 @@ class RetiariiExeConfig(ConfigBase):
         super().__init__(**kwargs)
         if training_service_platform is not None:
             assert 'training_service' not in kwargs
-            self.training_service = util.training_service_config_factory(platform = training_service_platform)
+            self.training_service = util.training_service_config_factory(platform=training_service_platform)
         self.__dict__['trial_command'] = 'python3 -m nni.retiarii.trial_entry py'

     def __setattr__(self, key, value):
@@ -117,6 +114,7 @@ _validation_rules = {
     'training_service': lambda value: (type(value) is not TrainingServiceConfig, 'cannot be abstract base class')
 }

+
 def preprocess_model(base_model, trainer, applied_mutators, full_ir=True, dummy_input=None):
     # TODO: this logic might need to be refactored into execution engine
     if full_ir:
@@ -220,6 +218,7 @@ class RetiariiExperiment(Experiment):
             engine = BaseExecutionEngine()
         elif self.config.execution_engine == 'cgo':
             from ..execution.cgo_engine import CGOExecutionEngine
             # assert self.config.trial_gpu_number==1, "trial_gpu_number must be 1 to use CGOExecutionEngine"
             assert self.config.batch_waiting_time is not None
             devices = self._construct_devices()
@@ -273,14 +272,14 @@ class RetiariiExperiment(Experiment):
     def _construct_devices(self):
         devices = []
         if hasattr(self.config.training_service, 'machine_list'):
-            for machine_idx, machine in enumerate(self.config.training_service.machine_list):
+            for machine in self.config.training_service.machine_list:
                 for gpu_idx in machine.gpu_indices:
                     devices.append(GPUDevice(machine.host, gpu_idx))
         else:
             for gpu_idx in self.config.training_service.gpu_indices:
                 devices.append(GPUDevice('local', gpu_idx))
         return devices

     def _create_dispatcher(self):
         return self._dispatcher
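As context for the hunk above: _construct_devices builds the device list handed to CGOExecutionEngine, and the change simply drops the unused machine_idx from the loop. A small self-contained illustration of the cleaned-up loop; the _Machine class, host names, and GPU indices are invented for the example:

from nni.common.device import GPUDevice

class _Machine:  # stand-in for a remote-machine config entry
    def __init__(self, host, gpu_indices):
        self.host = host
        self.gpu_indices = gpu_indices

machine_list = [_Machine('worker1', [0, 1]), _Machine('worker2', [0])]

devices = []
for machine in machine_list:  # no enumerate(): the index was never used
    for gpu_idx in machine.gpu_indices:
        devices.append(GPUDevice(machine.host, gpu_idx))
# devices now holds one GPUDevice per (host, gpu index) pair, e.g.
# worker1:0, worker1:1, worker2:0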
@@ -6,17 +6,9 @@
 echo ""
 echo "===========================Testing: NAS==========================="
 EXAMPLE_DIR=${CWD}/../examples/nas

-echo "testing nnictl ss_gen (classic nas)..."
-cd $EXAMPLE_DIR/legacy/classic_nas
-SEARCH_SPACE_JSON=nni_auto_gen_search_space.json
-if [ -f $SEARCH_SPACE_JSON ]; then
-    rm $SEARCH_SPACE_JSON
-fi
-nnictl ss_gen -t "python3 mnist.py"
-if [ ! -f $SEARCH_SPACE_JSON ]; then
-    echo "Search space file not found!"
-    exit 1
-fi
+echo "testing mnist..."
+cd $EXAMPLE_DIR/multi-trial/mnist
+python3 search.py

 echo "testing darts..."
 cd $EXAMPLE_DIR/oneshot/darts