OpenDAS / nni / Commits / cd3a912a

Unverified commit cd3a912a, authored Nov 27, 2019 by SparkSnail; committed by GitHub, Nov 27, 2019

Merge pull request #218 from microsoft/master

merge master

Parents: a0846f2a, e9cba778
Changes (375)

Showing 20 changed files with 2414 additions and 106 deletions (+2414 / -106)
src/sdk/pynni/nni/curvefitting_assessor/curvefunctions.py                      +4   -16
src/sdk/pynni/nni/curvefitting_assessor/model_factory.py                       +2   -16
src/sdk/pynni/nni/curvefitting_assessor/test.py                                +2   -17
src/sdk/pynni/nni/env_vars.py                                                  +2   -19
src/sdk/pynni/nni/evolution_tuner/evolution_tuner.py                           +3   -19
src/sdk/pynni/nni/evolution_tuner/test_evolution_tuner.py                      +3   -19
src/sdk/pynni/nni/feature_engineering/__init__.py                              +0   -0
src/sdk/pynni/nni/feature_engineering/feature_selector.py                      +59  -0
src/sdk/pynni/nni/feature_engineering/gbdt_selector/__init__.py                +1   -0
src/sdk/pynni/nni/feature_engineering/gbdt_selector/gbdt_selector.py           +114 -0
src/sdk/pynni/nni/feature_engineering/gbdt_selector/requirements.txt           +1   -0
src/sdk/pynni/nni/feature_engineering/gradient_selector/__init__.py            +1   -0
src/sdk/pynni/nni/feature_engineering/gradient_selector/constants.py           +100 -0
src/sdk/pynni/nni/feature_engineering/gradient_selector/fginitialize.py        +623 -0
src/sdk/pynni/nni/feature_engineering/gradient_selector/fgtrain.py             +228 -0
src/sdk/pynni/nni/feature_engineering/gradient_selector/gradient_selector.py   +631 -0
src/sdk/pynni/nni/feature_engineering/gradient_selector/learnability.py        +529 -0
src/sdk/pynni/nni/feature_engineering/gradient_selector/requirements.txt       +4   -0
src/sdk/pynni/nni/feature_engineering/gradient_selector/syssettings.py         +29  -0
src/sdk/pynni/nni/feature_engineering/gradient_selector/utils.py               +78  -0
src/sdk/pynni/nni/curvefitting_assessor/curvefunctions.py (view file @ cd3a912a)
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
"""
A family of functions used by CurvefittingAssessor
"""
import
numpy
as
np
all_models
=
{}
...
...
src/sdk/pynni/nni/curvefitting_assessor/model_factory.py (view file @ cd3a912a)
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import logging

import numpy as np

...
...
src/sdk/pynni/nni/curvefitting_assessor/test.py (view file @ cd3a912a)
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import numpy as np
import unittest

...
...
src/sdk/pynni/nni/env_vars.py (view file @ cd3a912a)
# Copyright (c) Microsoft Corporation. All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
# associated documentation files (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge, publish, distribute,
# sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or
# substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
# NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT
# OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# ==================================================================================================
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import os
from collections import namedtuple

...
...
src/sdk/pynni/nni/evolution_tuner/evolution_tuner.py (view file @ cd3a912a)
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge,
# to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
"""
evolution_tuner.py
"""
...
...
src/sdk/pynni/nni/evolution_tuner/test_evolution_tuner.py (view file @ cd3a912a)
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge,
# to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
"""
test_evolution_tuner.py
"""
...
...
src/sdk/pynni/nni/feature_engineering/__init__.py (new file, 0 → 100644; view file @ cd3a912a; empty file)
src/sdk/pynni/nni/feature_engineering/feature_selector.py (new file, 0 → 100644; view file @ cd3a912a)
# Copyright (c) Microsoft Corporation. All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
# associated documentation files (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge, publish, distribute,
# sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or
# substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
# NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT
# OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# ==================================================================================================
import logging

_logger = logging.getLogger(__name__)


class FeatureSelector():

    def __init__(self, **kwargs):
        self.selected_features_ = None
        self.X = None
        self.y = None

    def fit(self, X, y, **kwargs):
        """
        Fit the training data to FeatureSelector

        Parameters
        ----------
        X : array-like numpy matrix
            The training input samples, whose shape is [n_samples, n_features].
        y : array-like numpy matrix
            The target values (class labels in classification, real numbers in
            regression), whose shape is [n_samples].
        """
        self.X = X
        self.y = y

    def get_selected_features(self):
        """
        Get the indices of the selected features.

        Returns
        -------
        list :
            Returns the indices of the important features.
        """
        return self.selected_features_
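FeatureSelector above is only an interface: concrete selectors override fit() to populate selected_features_. A minimal hypothetical subclass to illustrate the contract (the variance-threshold logic and the class name are illustrative, not part of this diff):

import numpy as np

from nni.feature_engineering.feature_selector import FeatureSelector


class VarianceSelector(FeatureSelector):
    """Toy selector: keep features whose sample variance exceeds a threshold."""

    def __init__(self, threshold=0.0, **kwargs):
        super().__init__(**kwargs)
        self.threshold = threshold  # illustrative hyperparameter

    def fit(self, X, y, **kwargs):
        self.X = X
        self.y = y
        # column indices whose variance is above the threshold
        self.selected_features_ = np.where(np.var(X, axis=0) > self.threshold)[0]

get_selected_features() is inherited unchanged and simply returns selected_features_.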
src/sdk/pynni/nni/feature_engineering/gbdt_selector/__init__.py (new file, 0 → 100644; view file @ cd3a912a)

from .gbdt_selector import GBDTSelector
\ No newline at end of file
src/sdk/pynni/nni/feature_engineering/gbdt_selector/gbdt_selector.py (new file, 0 → 100644; view file @ cd3a912a)
# Copyright (c) Microsoft Corporation. All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
# associated documentation files (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge, publish, distribute,
# sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or
# substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
# NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT
# OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# ==================================================================================================
"""
gbdt_selector.py including:
class GBDTSelector
"""
import
random
from
sklearn.model_selection
import
train_test_split
from
nni.feature_engineering.feature_selector
import
FeatureSelector
# pylint: disable=E0401
import
lightgbm
as
lgb
class
GBDTSelector
(
FeatureSelector
):
def
__init__
(
self
,
**
kwargs
):
self
.
selected_features_
=
None
self
.
X
=
None
self
.
y
=
None
self
.
feature_importance
=
None
self
.
lgb_params
=
None
self
.
eval_ratio
=
None
self
.
early_stopping_rounds
=
None
self
.
importance_type
=
None
self
.
num_boost_round
=
None
self
.
model
=
None
def
fit
(
self
,
X
,
y
,
**
kwargs
):
"""
Fit the training data to FeatureSelector
Paramters
---------
X : array-like numpy matrix
The training input samples, which shape is [n_samples, n_features].
y : array-like numpy matrix
The target values (class labels in classification, real numbers in
regression). Which shape is [n_samples].
lgb_params : dict
Parameters of lightgbm
eval_ratio : float
The ratio of data size. It's used for split the eval data and train data from self.X.
early_stopping_rounds : int
The early stopping setting in lightgbm.
importance_type : str
Supporting type is 'gain' or 'split'.
num_boost_round : int
num_boost_round in lightgbm.
"""
assert
kwargs
[
'lgb_params'
]
assert
kwargs
[
'eval_ratio'
]
assert
kwargs
[
'early_stopping_rounds'
]
assert
kwargs
[
'importance_type'
]
assert
kwargs
[
'num_boost_round'
]
self
.
X
=
X
self
.
y
=
y
self
.
lgb_params
=
kwargs
[
'lgb_params'
]
self
.
eval_ratio
=
kwargs
[
'eval_ratio'
]
self
.
early_stopping_rounds
=
kwargs
[
'early_stopping_rounds'
]
self
.
importance_type
=
kwargs
[
'importance_type'
]
self
.
num_boost_round
=
kwargs
[
'num_boost_round'
]
X_train
,
X_test
,
y_train
,
y_test
=
train_test_split
(
self
.
X
,
self
.
y
,
test_size
=
self
.
eval_ratio
,
random_state
=
random
.
seed
(
41
))
lgb_train
=
lgb
.
Dataset
(
X_train
,
y_train
)
lgb_eval
=
lgb
.
Dataset
(
X_test
,
y_test
,
reference
=
lgb_train
)
self
.
model
=
lgb
.
train
(
self
.
lgb_params
,
lgb_train
,
num_boost_round
=
self
.
num_boost_round
,
valid_sets
=
lgb_eval
,
early_stopping_rounds
=
self
.
early_stopping_rounds
)
self
.
feature_importance
=
self
.
model
.
feature_importance
(
self
.
importance_type
)
def
get_selected_features
(
self
,
topk
):
"""
Fit the training data to FeatureSelector
Returns
-------
list :
Return the index of imprtant feature.
"""
assert
topk
>
0
self
.
selected_features_
=
self
.
feature_importance
.
argsort
()[
-
topk
:][::
-
1
]
return
self
.
selected_features_
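A minimal usage sketch of GBDTSelector as defined above (the data and parameter values are illustrative; assumes a lightgbm release contemporary with this commit is installed):

import numpy as np

from nni.feature_engineering.gbdt_selector import GBDTSelector

# toy binary-classification data
X = np.random.rand(1000, 20)
y = (X[:, 0] + X[:, 3] > 1.0).astype(int)

selector = GBDTSelector()
selector.fit(X, y,
             lgb_params={'objective': 'binary', 'verbose': -1},
             eval_ratio=0.2,
             early_stopping_rounds=10,
             importance_type='gain',
             num_boost_round=100)

# indices of the five most important features, ranked by gain
print(selector.get_selected_features(topk=5))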
src/sdk/pynni/nni/feature_engineering/gbdt_selector/requirements.txt (new file, 0 → 100644; view file @ cd3a912a)

lightgbm
\ No newline at end of file
src/sdk/pynni/nni/feature_engineering/gradient_selector/__init__.py (new file, 0 → 100644; view file @ cd3a912a)

from .gradient_selector import FeatureGradientSelector
\ No newline at end of file
src/sdk/pynni/nni/feature_engineering/gradient_selector/constants.py (new file, 0 → 100644; view file @ cd3a912a)
# Copyright (c) Microsoft Corporation. All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
# associated documentation files (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge, publish, distribute,
# sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or
# substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
# NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT
# OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# ==================================================================================================
import numpy as np


class StorageLevel:
    DISK = 'disk'
    SPARSE = 'sparse'
    DENSE = 'dense'


class DataFormat:
    SVM = 'svm'
    NUMPY = 'numpy'
    ALL_FORMATS = [SVM, NUMPY]


class Preprocess:
    """
    ZSCORE: center the data to mean 0 and scale to unit variance.
    CENTER: center the data to mean 0.
    """
    ZSCORE = 'zscore'
    CENTER = 'center'


class Device:
    CUDA = 'cuda'
    CPU = 'cpu'


class Checkpoint:
    MODEL = 'model_state_dict'
    OPT = 'optimizer_state_dict'
    RNG = 'torch_rng_state'


class NanError(ValueError):
    pass


class Initialization:
    ZERO = 'zero'
    ON = 'on'
    OFF = 'off'
    ON_HIGH = 'onhigh'
    OFF_HIGH = 'offhigh'
    SKLEARN = 'sklearn'
    RANDOM = 'random'
    VALUE_DICT = {ZERO: 0,
                  ON: 1,
                  OFF: -1,
                  ON_HIGH: 5,
                  OFF_HIGH: -1,
                  SKLEARN: None,
                  RANDOM: None}


class Coefficients:
    """
    Coefficients for the sublinear estimator were computed by running the
    sublinear paper's authors' code.
    """
    SLE = {1: np.array([0.60355337]),
           2: np.array([1.52705001, -0.34841729]),
           3: np.array([2.90254224, -1.87216745, 0.]),
           4: np.array([4.63445685, -5.19936195, 0., 1.50391676]),
           5: np.array([6.92948049, -14.12216211, 9.4475009, 0., -1.21093546]),
           6: np.array([9.54431082, -28.09414643, 31.84703652, -11.18763791, -1.14175281, 0.]),
           7: np.array([12.54505041, -49.64891525, 79.78828031, -46.72250909, 0., 0., 5.02973646]),
           8: np.array([16.03550163, -84.286182, 196.86078756, -215.36747071, 92.63961263, 0., 0., -4.86280869]),
           9: np.array([19.86409184, -130.76801006, 390.95349861, -570.09210416, 354.77764899, 0., -73.84234865, 0., 10.09148767]),
           10: np.array([2.41117752e+01, -1.94946061e+02, 7.34214614e+02, -1.42851995e+03, 1.41567410e+03,
                         -5.81738134e+02, 0., 0., 3.11664751e+01, 1.05018365e+00]),
           11: np.array([28.75280839, -279.22576729, 1280.46325445, -3104.47148101, 3990.6092248, -2300.29413333,
                         0., 427.35289033, 0., 0., -42.17587475]),
           12: np.array([33.85141912, -391.4229382, 2184.97827882, -6716.28280208, 11879.75233977, -11739.97267239,
                         5384.94542245, 0., -674.23291712, 0., 0., 39.37456439])}


EPSILON = 1e-8
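For orientation, Initialization.VALUE_DICT maps an init-type name to the initial per-feature score used by get_init() in fgtrain.py further down this diff; 'sklearn' and 'random' map to None because they are handled specially there. A short illustrative snippet (the feature count D is made up):

import numpy as np

import nni.feature_engineering.gradient_selector.constants as constants

D = 10  # illustrative number of features
x0_on = constants.Initialization.VALUE_DICT['on'] * np.ones(D)    # every score starts at 1
x0_off = constants.Initialization.VALUE_DICT['off'] * np.ones(D)  # every score starts at -1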
src/sdk/pynni/nni/feature_engineering/gradient_selector/fginitialize.py (new file, 0 → 100644; view file @ cd3a912a)
# Copyright (c) Microsoft Corporation. All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
# associated documentation files (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge, publish, distribute,
# sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or
# substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
# NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT
# OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# ==================================================================================================
import os
import pickle
import sys
import time

import numpy as np
import scipy.sparse
from sklearn.datasets import load_svmlight_file
import torch
from torch.utils.data import DataLoader, Dataset
# pylint: disable=E0611
from torch.utils.data.dataloader import _DataLoaderIter, _utils

import nni.feature_engineering.gradient_selector.constants as constants
import nni.feature_engineering.gradient_selector.syssettings as syssettings

torch.set_default_tensor_type(syssettings.torch.tensortype)
sparsetensor = syssettings.torch.sparse.tensortype

BYTESPERREAL = 8.
BYTESPERGB = 1024. ** 3


class PrepareData(Dataset):

    def __init__(self,
                 path_data=None,
                 data_format=constants.DataFormat.NUMPY,
                 D=None, N=None,
                 classification=True,
                 ordinal=False,
                 balanced=True,
                 preprocess=None,
                 n_to_estimate=None,
                 MAXMEMGB=syssettings.MAXMEMGB,
                 set_params=True,
                 path_mappings=None,
                 X=None,
                 y=None,
                 verbose=0,
                 n_classes=None,
                 device=constants.Device.CPU):
        """
        Dataset class with helpful features and functions for being included in a dataloader
        and managing memory usage.

        Can read the following formats:
            svm: svm light format (sklearn.datasets.load_svmlight_file)
            numpy: pass X and y as numpy or sparse arrays

        Assumes:
            1. if classification, y is in {-1, 1} or continuous and 0-indexed
            2. y can fit into memory
            3. consecutive calls to __getitem__() have consecutive idx values

        Notes:
            1. this implementation is not careful w.r.t. precise memory requirements. For
               example, being able to store one dense row in memory is necessary,
               but not sufficient.
            2. for y with 4.2 billion elements, 31.3 GB of memory is necessary
               @ 8 bytes/scalar. Use partial fit to avoid loading the entire dataset
               at once.
            3. disk_size always refers to the size of the complete data file, even after
               a split().

        Parameters
        ----------
        path_data : str
            Path to load data from
        data_format : str
            File ending for path data.
            "numpy" is the default when passing in X and y
        D : int
            Number of features.
        N : int
            Number of rows.
        classification : bool
            If True, problem is classification, else regression.
        ordinal : bool
            If True, problem is ordinal classification. Requires classification to be True.
        balanced : bool
            If True, each class is weighted equally in optimization, otherwise
            weighting is done via the support of each class. Requires classification to be True.
        preprocess : str
            'zscore', which centers and normalizes the data to unit variance, or
            'center', which only centers the data to 0 mean.
        n_to_estimate : int
            Number of rows of data to estimate.
        MAXMEMGB : float
            Maximum allowable size for a minibatch.
        set_params : bool
            Whether or not to determine the statistics of the dataset.
        path_mappings : str
            Used when streaming from disk.
        X : array-like
            Shape = [n_samples, n_features]
            The training input samples.
        y : array-like
            Shape = [n_samples]
            The target values (class labels in classification, real numbers in
            regression).
        verbose : int
            Controls the verbosity when fitting. Set to 0 for no printing,
            1 or higher for printing every `verbose` number of gradient steps.
        device : str
            'cpu' to run on CPU and 'cuda' to run on GPU. Runs much faster on GPU.
        n_classes : int
            Number of classes.
        """

        self.path_data = path_data
        if self.path_data:
            self.disk_size = os.path.getsize(path_data)
        else:
            assert X is not None, 'X must be specified if no path data'
            self.disk_size = X.nbytes if not scipy.sparse.issparse(X) else X.data.nbytes
        assert data_format in constants.DataFormat.ALL_FORMATS, \
            'Format must be in {0}.'.format(", ".join(constants.DataFormat.ALL_FORMATS))
        self.format = data_format
        self.classification = classification
        self.ordinal = ordinal
        self.balanced = balanced
        self.MAXMEMGB = MAXMEMGB
        self.preprocess = preprocess
        self.set_params = set_params
        self.verbose = verbose
        self.n_classes = n_classes
        self.device = device
        self.path_data_stats = None

        if D is None:
            assert self.disk_size / BYTESPERGB <= self.MAXMEMGB, \
                'Cannot load data into memory. Supply D.'
            if self.format == constants.DataFormat.SVM:
                self.X, self.y = load_svmlight_file(path_data)
            elif self.format == constants.DataFormat.NUMPY:
                assert X is not None, 'X must be specified in numpy mode'
                assert y is not None, 'y must be specified in numpy mode'
                self.X = X
                self.y = y
                if self.n_classes is None:
                    self.n_classes = np.unique(y).shape[0]
                elif self.classification:
                    assert self.n_classes >= np.unique(y).shape[0], \
                        'n_classes given must be greater than or equal to the number of classes in y'
            else:
                raise NotImplementedError
            self.y = torch.as_tensor(self.y, dtype=torch.get_default_dtype())
            self.N, self.D = self.X.shape
            # assumes X was returned as a sparse array
            self.storage_level = (constants.StorageLevel.SPARSE
                                  if scipy.sparse.issparse(self.X)
                                  else constants.StorageLevel.DENSE)
        else:
            assert N is not None, 'Supply N.'
            self.N, self.D = N, D
            # assume sparse matrix cannot fit into memory
            self.storage_level = constants.StorageLevel.DISK

        self.dense_size_gb = self.get_dense_size()
        # check dense size
        self.set_dense_X()

        self.max_rows = int(self.MAXMEMGB * BYTESPERGB / BYTESPERREAL / self.D)
        assert self.max_rows, \
            'Cannot fit one dense row into %d GB memory.' % self.MAXMEMGB
        self.max_rows = self.max_batch_size()
        sys.stdout.flush()

        if n_to_estimate is None:
            self.n_to_estimate = self.max_batch_size()
        else:
            assert n_to_estimate <= self.N, 'n_to_estimate must be <= N.'
            self.n_to_estimate = n_to_estimate

        # initialize disk loader
        if self.storage_level == constants.StorageLevel.DISK and self.set_params:
            if self.format == constants.DataFormat.SVM:
                raise NotImplementedError(
                    'Please use partial fit to train on datasets that do not fit in memory')
            else:
                raise NotImplementedError

        # TODO: use a passed-in RNG here
        self.ix_statistics = np.random.permutation(self.N)[:self.n_to_estimate]
        self.n_features = self.D
        if self.set_params:
            if self.verbose:
                print('Finding data statistics...', end='')
                sys.stdout.flush()
            Xmn, sv1, Xsd, ymn, ysd = self.compute_data_stats()
            self.set_data_stats(Xmn, sv1, Xsd, ymn, ysd)
            if self.verbose:
                print()
            self.set_return_raw(False)
        else:
            self.set_return_raw(True)

        self.set_return_np(False)

        # this needs to occur after setting preprocessing params
        if (self.storage_level == constants.StorageLevel.DISK and
                self.format == constants.DataFormat.SVM and self.set_params):
            self.loader.batchsize = 1

    def get_dense_size(self):
        return self.N * self.D * BYTESPERREAL / BYTESPERGB

    def set_dense_X(self):
        if self.storage_level != constants.StorageLevel.DISK:
            if self.dense_size_gb <= self.MAXMEMGB:
                if self.storage_level == constants.StorageLevel.SPARSE:
                    self.X = self.X.toarray()
                self.X = torch.as_tensor(self.X, dtype=torch.get_default_dtype())
                self.storage_level = constants.StorageLevel.DENSE

    def set_return_np(self, boolean):
        self.return_np = boolean

    def set_return_raw(self, boolean):
        self.return_raw = boolean

    def save_data_stats(self, path_data_stats):
        """
        Dumps dataset statistics to a pickle file.
        """
        data_stats = {
            'Xmn': self.Xmn,
            'sv1': self.sv1,
            'Xsd': self.Xsd,
            'ymn': self.ymn,
            'ysd': self.ysd,
            'ix_statistics': self.ix_statistics,
        }
        pickle.dump(data_stats, open(path_data_stats, 'wb'))

    def load_data_stats(self, path_data_stats):
        stats = pickle.load(open(path_data_stats, 'rb'))
        self.path_data_stats = path_data_stats

        self.set_data_stats(np.asarray(stats['Xmn']), stats['sv1'],
                            stats['Xsd'], stats['ymn'], stats['ysd'])
        if self.storage_level == constants.StorageLevel.DISK and hasattr(self, 'path_mappings'):
            if 'ix_statistics' in stats:
                self.ix_statistics = stats['ix_statistics']
            else:
                self.ix_statistics = range(self.N)
        self.set_return_raw(False)

    def reset(self):
        """
        Resets the dataloader. Only implemented for disk StorageLevel.
        """
        if self.storage_level == constants.StorageLevel.DENSE:
            pass
        elif self.storage_level == constants.StorageLevel.SPARSE:
            pass
        elif self.storage_level == constants.StorageLevel.DISK:
            if self.format == constants.DataFormat.SVM:
                self.loader.reset()
            else:
                raise NotImplementedError

    def todense(self):
        assert hasattr(self, 'Xmn'), 'Set preprocess params first.'
        assert len(self) <= self.max_batch_size(), 'N must be <= max_batch_size().'

        with torch.no_grad():
            dense, _ = self.split(range(len(self)))
            Braw = self.return_raw
            Bnp = self.return_np
            self.set_return_raw(True)
            self.set_return_np(True)
            dense.X, dense.y = [], []

            def f_Xy(X, y):
                dense.X.append(X)
                dense.y.append(y)
            self.apply(f_Xy=f_Xy)
            dense.X = dense.X[-1]
            dense.y = dense.y[-1]
            self.set_return_raw(Braw)
            self.set_return_np(Bnp)
            dense.storage_level = constants.StorageLevel.DENSE

        return dense

    def split(self, ix):
        assert hasattr(self, 'Xmn'), 'Run set_preprocess_params() first.'

        first = type(self)(self.path_data, self.format, self.D,
                           N=len(ix),
                           classification=self.classification,
                           preprocess=self.preprocess,
                           n_to_estimate=None,
                           MAXMEMGB=self.MAXMEMGB,
                           set_params=False)
        second = type(self)(self.path_data, self.format, self.D,
                            N=self.N - len(ix),
                            classification=self.classification,
                            preprocess=self.preprocess,
                            n_to_estimate=None,
                            MAXMEMGB=self.MAXMEMGB,
                            set_params=False)

        first.storage_level = self.storage_level
        second.storage_level = self.storage_level

        # copy preprocess params
        if not self.classification:
            first.ymn = self.ymn
            second.ymn = self.ymn
            first.ysd = self.ysd
            second.ysd = self.ysd
        first.Xmn = self.Xmn
        second.Xmn = self.Xmn
        first.sv1 = self.sv1
        second.sv1 = self.sv1

        if self.storage_level == constants.StorageLevel.DISK:
            if self.format == constants.DataFormat.SVM:
                first.Xsd = self.Xsd
                second.Xsd = self.Xsd
            else:
                raise NotImplementedError

        # initialize data structures
        if self.storage_level == constants.StorageLevel.DISK:
            if self.format == constants.DataFormat.SVM:
                raise NotImplementedError
            raise NotImplementedError
        elif self.storage_level in [constants.StorageLevel.SPARSE,
                                    constants.StorageLevel.DENSE]:
            first.X, first.y = self.X[ix], self.y[ix]
            ixsec = list(set(range(self.N)).difference(set(ix)))
            second.X, second.y = self.X[ixsec], self.y[ixsec]

        return first, second

    @staticmethod
    def sparse_std(X, X_mean):
        """
        Calculate the column-wise standard deviations of a sparse matrix.
        """
        X_copy = X.copy()
        X_copy.data **= 2  # square non-zero elements
        E_x_squared = np.array(X_copy.mean(axis=0)).ravel()
        Xsd = np.sqrt(E_x_squared - X_mean ** 2)
        return Xsd

    def compute_data_stats(self):
        """
        1. computes/estimates feature means
        2. if preprocess == 'zscore', computes/estimates feature standard devs
        3. if not classification, computes/estimates target mean/standard dev
        4. estimates largest singular value of data matrix
        """
        t = time.time()
        X, y = self.X[self.ix_statistics], self.y[self.ix_statistics]
        preprocess = self.preprocess
        classification = self.classification

        Xmn = (X.mean(dim=0)
               if not scipy.sparse.issparse(X)
               else np.array(X.mean(axis=0)).ravel())
        if preprocess == constants.Preprocess.ZSCORE:
            Xsd = (X.std(dim=0)
                   if not scipy.sparse.issparse(X)
                   else PrepareData.sparse_std(X, Xmn))
            Xsd[Xsd == 0] = 1.
        else:
            Xsd = 1.

        if preprocess is not None and preprocess:
            if preprocess == constants.Preprocess.ZSCORE:
                Xc = (X - Xmn) / Xsd
            else:
                Xc = X - Xmn
        else:
            Xc = X - Xmn
        sv1 = scipy.sparse.linalg.svds(Xc / (
            torch.sqrt(torch.prod(torch.as_tensor(y.size(), dtype=torch.get_default_dtype())))
            if not scipy.sparse.issparse(X)
            else y.numpy().size),
                                       k=1,
                                       which='LM',
                                       return_singular_vectors=False)
        # avoid runaway sv1
        sv1 = np.array([min(np.finfo(np.float32).max, sv1[0])])

        if not classification:
            ymn = y.mean()
            ysd = y.std()
        else:
            # TODO: set these, for each class?
            ymn = 0.
            ysd = 1.
        if self.verbose:
            print(" computing data statistics took: ", time.time() - t)

        return Xmn, sv1, Xsd, ymn, ysd

    def set_data_stats(self, Xmn, sv1, Xsd=1., ymn=0., ysd=1.):
        """
        Saves dataset stats to self to be used for preprocessing.
        """
        self.Xmn = torch.as_tensor(Xmn, dtype=torch.get_default_dtype()).to(self.device)
        self.sv1 = torch.as_tensor(sv1, dtype=torch.get_default_dtype()).to(self.device)
        self.Xsd = torch.as_tensor(Xsd, dtype=torch.get_default_dtype()).to(self.device)
        self.ymn = torch.as_tensor(ymn, dtype=torch.get_default_dtype()).to(self.device)
        self.ysd = torch.as_tensor(ysd, dtype=torch.get_default_dtype()).to(self.device)

    def apply_preprocess(self, X, y):
        """
        Faster on gpu device, while dataloading takes up a large portion of the time.
        """
        with torch.no_grad():
            if not self.classification:
                y = (y.reshape((-1, 1)) - self.ymn) / self.ysd
            else:
                y = y.reshape((-1, 1))
            X = (X - self.Xmn) / self.sv1

            if self.preprocess == constants.Preprocess.ZSCORE:
                X /= self.Xsd

            return X, y

    def max_batch_size(self):
        """
        Return the maximum batchsize for the dataset.
        """
        return int(np.min([self.max_rows, self.N]))

    def apply(self, ix_rows=None, ix_cols=None, f_Xy=None):
        if f_Xy is None:
            return
        if ix_rows is None:
            ix_rows = range(self.N)
        if ix_cols is None:
            ix_cols = range(self.n_features)

        f_Xy((self.X[ix_rows, ix_cols]
              if not self.storage_level == constants.StorageLevel.SPARSE
              else self.X[ix_rows, ix_cols].toarray()), self.y[ix_rows])

    def get_dense_data(self, ix_cols=None, ix_rows=None):
        if ix_cols is None:
            ix_cols = range(self.n_features)
        X = [np.zeros((0, len(ix_cols)))]
        y = [np.zeros((0, 1))]
        Bnp = self.return_np

        def f_Xy(Xb, yb, n):
            X[-1] = np.concatenate((X[-1], Xb), axis=0)
            y[-1] = np.concatenate((y[-1], yb), axis=0)

        self.apply(f_Xy=f_Xy, ix_rows=ix_rows, ix_cols=ix_cols)
        self.set_return_np(Bnp)

        return X[-1], y[-1]

    def __len__(self):
        return self.N

    def getXy(self, idx):
        if self.storage_level == constants.StorageLevel.DENSE:
            X, y = self.X[idx], self.y[idx]
        elif self.storage_level == constants.StorageLevel.SPARSE:
            # assume subset can fit into memory even if the whole matrix can't
            X, y = self.X[idx].toarray(), self.y[idx]
        else:
            raise NotImplementedError
        return X, y

    def __getitem__(self, idx):
        with torch.no_grad():
            X, y = self.getXy(idx)
            X = X.toarray() if scipy.sparse.issparse(X) else X

            X = torch.as_tensor(X, dtype=torch.get_default_dtype()).to(self.device)
            y = torch.as_tensor(y, dtype=torch.get_default_dtype()).to(self.device)

            if not self.return_raw:
                X, y = self.apply_preprocess(X, y)

            if self.classification and (self.n_classes is None or self.n_classes == 2):
                y[y == 0] = -1

            if self.return_np:
                if constants.Device.CPU not in self.device:
                    X = X.cpu()
                    y = y.cpu()
                X = X.numpy()
                y = y.numpy()
                return X, y

            return X, y


class ChunkDataLoader(DataLoader):
    """
    DataLoader class used to more quickly load a batch of indices at once.
    """

    def __iter__(self):
        return _ChunkDataLoaderIter(self)


class _ChunkDataLoaderIter(_DataLoaderIter):
    """
    DataLoaderIter class used to more quickly load a batch of indices at once.
    """

    def __next__(self):
        # only chunk that is edited from base
        if self.num_workers == 0:
            # same-process loading
            indices = next(self.sample_iter)  # may raise StopIteration
            if len(indices) > 1:
                batch = self.dataset[np.array(indices)]
            else:
                batch = self.collate_fn([self.dataset[i] for i in indices])
            if self.pin_memory:
                batch = _utils.pin_memory.pin_memory_batch(batch)
            return batch

        # check if the next sample has already been generated
        if self.rcvd_idx in self.reorder_dict:
            batch = self.reorder_dict.pop(self.rcvd_idx)
            return self._process_next_batch(batch)

        if self.batches_outstanding == 0:
            self._shutdown_workers()
            raise StopIteration

        while True:
            assert (not self.shutdown and self.batches_outstanding > 0)
            idx, batch = self._get_batch()
            self.batches_outstanding -= 1
            if idx != self.rcvd_idx:
                # store out-of-order samples
                self.reorder_dict[idx] = batch
                continue
            return self._process_next_batch(batch)
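A minimal sketch of how PrepareData and ChunkDataLoader above fit together, mirroring how gradient_selector.py later in this diff builds its dataset (values are illustrative; note the private _DataLoaderIter import ties this file to the PyTorch release current at the time of the commit):

import numpy as np

import nni.feature_engineering.gradient_selector.constants as constants
from nni.feature_engineering.gradient_selector.fginitialize import (
    ChunkDataLoader, PrepareData)

X = np.random.rand(500, 20)
y = np.random.randint(0, 2, size=500)

# computes feature means/stds and the largest singular value up front,
# then serves preprocessed torch tensors from __getitem__
data = PrepareData(X=X, y=y,
                   data_format=constants.DataFormat.NUMPY,
                   classification=True,
                   preprocess=constants.Preprocess.ZSCORE,
                   n_classes=2)

# fetches a whole batch of indices in one dataset[np.array(indices)] call
loader = ChunkDataLoader(data, batch_size=100, shuffle=True)
Xb, yb = next(iter(loader))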
src/sdk/pynni/nni/feature_engineering/gradient_selector/fgtrain.py (new file, 0 → 100644; view file @ cd3a912a)
# Copyright (c) Microsoft Corporation. All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
# associated documentation files (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge, publish, distribute,
# sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or
# substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
# NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT
# OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# ==================================================================================================
import time

import numpy as np
import torch
from sklearn.feature_selection import SelectKBest, \
    f_classif, mutual_info_classif, f_regression, mutual_info_regression

import nni.feature_engineering.gradient_selector.constants as constants
import nni.feature_engineering.gradient_selector.syssettings as syssettings
from nni.feature_engineering.gradient_selector.learnability import Solver
from nni.feature_engineering.gradient_selector.utils import EMA

torch.set_default_tensor_type(syssettings.torch.tensortype)


def get_optim_f_stop(maxiter, maxtime, dftol_stop, freltol_stop,
                     minibatch=True):
    """
    Check stopping conditions.
    """

    discount_factor = 1. / 3

    total_t = [0.]
    df_store = [np.nan]
    it_store = [0]
    relchange_store = [np.nan]
    f_ma = EMA(discount_factor=discount_factor)
    df_ma = EMA(discount_factor=discount_factor)

    def f_stop(f0, v0, it, t):
        flag_stop = False

        total_t[-1] += t
        g = f0.x.grad.clone().cpu().detach()
        df = g.abs().max().numpy().squeeze()
        v = v0.clone().cpu().detach()
        f = v.numpy().squeeze()

        if it >= maxiter:
            flag_stop = True
        elif total_t[-1] >= maxtime:
            flag_stop = True

        f_ma.update(f)
        df_ma.update(df)
        rel_change = f_ma.relchange()

        if ((not minibatch) and (df < dftol_stop)) \
                or (minibatch and (df_ma() < dftol_stop)):
            flag_stop = True

        if rel_change < freltol_stop:
            flag_stop = True

        if not minibatch:
            df_store[-1] = df
        else:
            df_store[-1] = df_ma()
        relchange_store[-1] = rel_change
        it_store[-1] = it

        return flag_stop

    return f_stop, {'t': total_t, 'it': it_store, 'df': df_store,
                    'relchange': relchange_store}


def get_init(data_train, init_type='on', rng=np.random.RandomState(0), prev_score=None):
    """
    Initialize the 'x' variable with different settings.
    """

    D = data_train.n_features
    value_off = constants.Initialization.VALUE_DICT[
        constants.Initialization.OFF]
    value_on = constants.Initialization.VALUE_DICT[
        constants.Initialization.ON]

    if prev_score is not None:
        x0 = prev_score
    elif not isinstance(init_type, str):
        x0 = value_off * np.ones(D)
        x0[init_type] = value_on
    elif init_type.startswith(constants.Initialization.RANDOM):
        d = int(init_type.replace(constants.Initialization.RANDOM, ''))
        x0 = value_off * np.ones(D)
        x0[rng.permutation(D)[:d]] = value_on
    elif init_type == constants.Initialization.SKLEARN:
        B = data_train.return_raw
        X, y = data_train.get_dense_data()
        data_train.set_return_raw(B)
        ix = train_sk_dense(init_type, X, y, data_train.classification)
        x0 = value_off * np.ones(D)
        x0[ix] = value_on
    elif init_type in constants.Initialization.VALUE_DICT:
        x0 = constants.Initialization.VALUE_DICT[init_type] * np.ones(D)
    else:
        raise NotImplementedError(
            'init_type {0} not supported yet'.format(init_type))
    # pylint: disable=E1102
    return torch.tensor(x0.reshape((-1, 1)),
                        dtype=torch.get_default_dtype())


def get_checkpoint(S, stop_conds, rng=None, get_state=True):
    """
    Save the necessary information into a dictionary.
    """

    m = {}
    m['ninitfeats'] = S.ninitfeats
    m['x0'] = S.x0
    x = S.x.clone().cpu().detach()
    m['feats'] = np.where(x.numpy() >= 0)[0]
    m.update({k: v[0] for k, v in stop_conds.items()})
    if get_state:
        m.update({constants.Checkpoint.MODEL: S.state_dict(),
                  constants.Checkpoint.OPT: S.opt_train.state_dict(),
                  constants.Checkpoint.RNG: torch.get_rng_state(),
                  })
        if rng:
            m.update({'rng_state': rng.get_state()})

    return m


def _train(data_train, Nminibatch, order, C, rng, lr_train, debug, maxiter,
           maxtime, init, dftol_stop, freltol_stop, dn_log, accum_steps,
           path_save, shuffle, device=constants.Device.CPU,
           verbose=1,
           prev_checkpoint=None,
           groups=None,
           soft_groups=None):
    """
    Main training loop.
    """

    t_init = time.time()

    x0 = get_init(data_train, init, rng)
    if isinstance(init, str) and init == constants.Initialization.ZERO:
        ninitfeats = -1
    else:
        ninitfeats = np.where(x0.detach().numpy() > 0)[0].size

    S = Solver(data_train, order,
               Nminibatch=Nminibatch, x0=x0, C=C,
               ftransform=lambda x: torch.sigmoid(2 * x),
               get_train_opt=lambda p: torch.optim.Adam(p, lr_train),
               rng=rng,
               accum_steps=accum_steps,
               shuffle=shuffle,
               groups=groups,
               soft_groups=soft_groups,
               device=device,
               verbose=verbose)
    S = S.to(device)

    S.ninitfeats = ninitfeats
    S.x0 = x0

    if prev_checkpoint:
        S.load_state_dict(prev_checkpoint[constants.Checkpoint.MODEL])
        S.opt_train.load_state_dict(prev_checkpoint[constants.Checkpoint.OPT])
        torch.set_rng_state(prev_checkpoint[constants.Checkpoint.RNG])

    minibatch = S.Ntrain != S.Nminibatch
    f_stop, stop_conds = get_optim_f_stop(maxiter, maxtime, dftol_stop,
                                          freltol_stop, minibatch=minibatch)
    if debug:
        pass
    else:
        f_callback = None
    stop_conds['t'][-1] = time.time() - t_init

    S.train(f_stop=f_stop, f_callback=f_callback)

    return get_checkpoint(S, stop_conds, rng), S


def train_sk_dense(ty, X, y, classification):
    if classification:
        if ty.startswith('skf'):
            d = int(ty.replace('skf', ''))
            f_sk = f_classif
        elif ty.startswith('skmi'):
            d = int(ty.replace('skmi', ''))
            f_sk = mutual_info_classif
    else:
        if ty.startswith('skf'):
            d = int(ty.replace('skf', ''))
            f_sk = f_regression
        elif ty.startswith('skmi'):
            d = int(ty.replace('skmi', ''))
            f_sk = mutual_info_regression
    t = time.time()
    clf = SelectKBest(f_sk, k=d)
    clf.fit_transform(X, y.squeeze())
    ix = np.argsort(-clf.scores_)
    ix = ix[np.where(np.invert(np.isnan(clf.scores_[ix])))[0]][:d]
    t = time.time() - t
    return {'feats': ix, 't': t}
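A note on the init_type strings parsed by get_init() above: a key of Initialization.VALUE_DICT broadcasts that score to every feature, 'randomK' turns K randomly chosen features on, and train_sk_dense() picks its SelectKBest scoring function from the 'skf'/'skmi' prefix of its ty argument (F-score or mutual information). A small sketch (assumes `data` is a PrepareData instance like the one in the earlier sketch):

import numpy as np

from nni.feature_engineering.gradient_selector.fgtrain import get_init

# five randomly chosen feature scores start at VALUE_DICT['on'] (=1),
# the rest at VALUE_DICT['off'] (=-1); returned as a (D, 1) torch tensor
x0 = get_init(data, init_type='random5', rng=np.random.RandomState(0))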
src/sdk/pynni/nni/feature_engineering/gradient_selector/gradient_selector.py (new file, 0 → 100644; view file @ cd3a912a)
# Copyright (c) Microsoft Corporation. All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
# associated documentation files (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge, publish, distribute,
# sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or
# substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
# NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT
# OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# ==================================================================================================
import
time
import
numpy
as
np
import
pandas
as
pd
from
sklearn.base
import
BaseEstimator
from
sklearn.feature_selection.base
import
SelectorMixin
from
sklearn.utils.validation
import
check_is_fitted
import
torch
from
nni.feature_engineering.feature_selector
import
FeatureSelector
import
nni.feature_engineering.gradient_selector.constants
as
constants
from
nni.feature_engineering.gradient_selector.fginitialize
import
PrepareData
from
nni.feature_engineering.gradient_selector.fgtrain
import
_train
class
FeatureGradientSelector
(
FeatureSelector
,
BaseEstimator
,
SelectorMixin
):
def
__init__
(
self
,
order
=
4
,
penalty
=
1
,
n_features
=
None
,
max_features
=
None
,
learning_rate
=
1e-1
,
init
=
'zero'
,
n_epochs
=
1
,
shuffle
=
True
,
batch_size
=
1000
,
target_batch_size
=
1000
,
max_time
=
np
.
inf
,
classification
=
True
,
ordinal
=
False
,
balanced
=
True
,
preprocess
=
'zscore'
,
soft_grouping
=
False
,
verbose
=
0
,
device
=
'cpu'
):
"""
FeatureGradientSelector is a class that selects features for a machine
learning model using a gradient based search.
Parameters
----------
order : int
What order of interactions to include. Higher orders
may be more accurate but increase the run time. 12 is the maximum allowed order.
penatly : int
Constant that multiplies the regularization term.
n_features: int
If None, will automatically choose number of features based on search.
Otherwise, number of top features to select.
max_features : int
If not None, will use the 'elbow method' to determine the number of features
with max_features as the upper limit.
learning_rate : float
init : str
How to initialize the vector of scores. 'zero' is the default.
Options: {'zero', 'on', 'off', 'onhigh', 'offhigh', 'sklearn'}
n_epochs : int
number of epochs to run
shuffle : bool
Shuffle "rows" prior to an epoch.
batch_size : int
Nnumber of "rows" to process at a time
target_batch_size : int
Number of "rows" to accumulate gradients over.
Useful when many rows will not fit into memory but are needed for accurate estimation.
classification : bool
If True, problem is classification, else regression.
ordinal : bool
If True, problem is ordinal classification. Requires classification to be True.
balanced : bool
If true, each class is weighted equally in optimization, otherwise
weighted is done via support of each class. Requires classification to be True.
prerocess : str
'zscore' which refers to centering and normalizing data to unit variance or
'center' which only centers the data to 0 mean
soft_grouping : bool
if True, groups represent features that come from the same source.
Used to encourage sparsity of groups and features within groups.
verbose : int
Controls the verbosity when fitting. Set to 0 for no printing
1 or higher for printing every verbose number of gradient steps.
device : str
'cpu' to run on CPU and 'cuda' to run on GPU. Runs much faster on GPU
"""
assert
order
<=
12
and
order
>=
1
,
'order must be an integer between 1 and 12, inclusive'
assert
n_features
is
None
or
max_features
is
None
,
\
'only specify one of n_features and max_features at a time'
self
.
order
=
order
self
.
penalty
=
penalty
self
.
n_features
=
n_features
self
.
max_features
=
max_features
self
.
learning_rate
=
learning_rate
self
.
init
=
init
self
.
n_epochs
=
n_epochs
self
.
shuffle
=
shuffle
self
.
batch_size
=
batch_size
self
.
target_batch_size
=
target_batch_size
self
.
max_time
=
max_time
self
.
dftol_stop
=
-
1
self
.
freltol_stop
=
-
1
self
.
classification
=
classification
self
.
ordinal
=
ordinal
self
.
balanced
=
balanced
self
.
preprocess
=
preprocess
self
.
soft_grouping
=
soft_grouping
self
.
verbose
=
verbose
self
.
device
=
device
self
.
model_
=
None
self
.
scores_
=
None
self
.
_prev_checkpoint
=
None
self
.
_data_train
=
None
def
partial_fit
(
self
,
X
,
y
,
n_classes
=
None
,
groups
=
None
):
"""
Select Features via a gradient based search on (X, y) on the given samples.
Can be called repeatedly with different X and y to handle streaming datasets.
Parameters
----------
X : array-like
Shape = [n_samples, n_features]
The training input samples.
y : array-like
Shape = [n_samples]
The target values (class labels in classification, real numbers in
regression).
n_classes : int
Number of classes
Classes across all calls to partial_fit.
Can be obtained by via `np.unique(y_all).shape[0]`, where y_all is the
target vector of the entire dataset.
This argument is expected for the first call to partial_fit,
otherwise will assume all classes are present in the batch of y given.
It will be ignored in the subsequent calls.
Note that y doesn't need to contain all labels in `classes`.
groups : array-like
Optional, shape = [n_features]
Groups of columns that must be selected as a unit
e.g. [0, 0, 1, 2] specifies the first two columns are part of a group.
This argument is expected for the first call to partial_fit,
otherwise will assume all classes are present in the batch of y given.
It will be ignored in the subsequent calls.
"""
try
:
self
.
_partial_fit
(
X
,
y
,
n_classes
=
n_classes
,
groups
=
groups
)
except
constants
.
NanError
:
if
hasattr
(
self
,
'_prev_checkpoint'
):
# if it's already done some batches successfully just ignore it
print
(
'failed fitting this batch, loss was nan'
)
else
:
# if this is the first batch, reset and try with doubles
if
self
.
verbose
:
print
(
'Loss was nan, trying with Doubles'
)
self
.
_reset
()
torch
.
set_default_tensor_type
(
torch
.
DoubleTensor
)
self
.
_partial_fit
(
X
,
y
,
n_classes
=
n_classes
,
groups
=
groups
)
return
self
def
_partial_fit
(
self
,
X
,
y
,
n_classes
=
None
,
groups
=
None
):
"""
Private function for partial_fit to enable trying floats before doubles.
"""
# pass in X and y in chunks
if
hasattr
(
self
,
'_data_train'
):
# just overwrite the X and y from the new chunk but make them tensors
# keep dataset stats from previous
self
.
_data_train
.
X
=
X
.
values
if
isinstance
(
X
,
pd
.
DataFrame
)
else
X
self
.
_data_train
.
N
,
self
.
_data_train
.
D
=
self
.
_data_train
.
X
.
shape
self
.
_data_train
.
dense_size_gb
=
self
.
_data_train
.
get_dense_size
()
self
.
_data_train
.
set_dense_X
()
self
.
_data_train
.
y
=
y
.
values
if
isinstance
(
y
,
pd
.
Series
)
else
y
self
.
_data_train
.
y
=
torch
.
as_tensor
(
y
,
dtype
=
torch
.
get_default_dtype
())
else
:
data_train
=
self
.
_prepare_data
(
X
,
y
,
n_classes
=
n_classes
)
self
.
_data_train
=
data_train
batch_size
,
_
,
accum_steps
,
max_iter
=
self
.
_set_batch_size
(
self
.
_data_train
)
rng
=
None
# not used
debug
=
0
# {0,1} print messages and do other stuff?
dn_logs
=
None
# tensorboard logs; only specify if debug=1
path_save
=
None
# intermediate models saves; only specify if debug=1
m
,
solver
=
_train
(
self
.
_data_train
,
batch_size
,
self
.
order
,
self
.
penalty
,
rng
,
self
.
learning_rate
,
debug
,
max_iter
,
self
.
max_time
,
self
.
init
,
self
.
dftol_stop
,
self
.
freltol_stop
,
dn_logs
,
accum_steps
,
path_save
,
self
.
shuffle
,
device
=
self
.
device
,
verbose
=
self
.
verbose
,
prev_checkpoint
=
self
.
_prev_checkpoint
if
hasattr
(
self
,
'_prev_checkpoint'
)
else
None
,
groups
=
groups
if
not
self
.
soft_grouping
else
None
,
soft_groups
=
groups
if
self
.
soft_grouping
else
None
)
self
.
_prev_checkpoint
=
m
self
.
_process_results
(
m
,
solver
,
X
,
groups
=
groups
)
return
self
def
fit
(
self
,
X
,
y
,
groups
=
None
):
"""
Select Features via a gradient based search on (X, y).
Parameters
----------
X : array-like
Shape = [n_samples, n_features]
The training input samples.
y : array-like
Shape = [n_samples]
The target values (class labels in classification, real numbers in
regression).
groups : array-like
Optional, shape = [n_features]
Groups of columns that must be selected as a unit
e.g. [0, 0, 1, 2] specifies the first two columns are part of a group.
"""
try
:
self
.
_fit
(
X
,
y
,
groups
=
groups
)
except
constants
.
NanError
:
if
self
.
verbose
:
print
(
'Loss was nan, trying with Doubles'
)
torch
.
set_default_tensor_type
(
torch
.
DoubleTensor
)
self
.
_fit
(
X
,
y
,
groups
=
groups
)
return
self
def
get_selected_features
(
self
):
return
self
.
selected_features_
def
_prepare_data
(
self
,
X
,
y
,
n_classes
=
None
):
"""
Returns a PrepareData object.
"""
return
PrepareData
(
X
=
X
.
values
if
isinstance
(
X
,
pd
.
DataFrame
)
else
X
,
y
=
y
.
values
if
isinstance
(
y
,
pd
.
Series
)
else
y
,
data_format
=
constants
.
DataFormat
.
NUMPY
,
classification
=
int
(
self
.
classification
),
ordinal
=
self
.
ordinal
,
balanced
=
self
.
balanced
,
preprocess
=
self
.
preprocess
,
verbose
=
self
.
verbose
,
device
=
self
.
device
,
n_classes
=
n_classes
)
    def _fit(self, X, y, groups=None):
        """
        Private function for fit to enable trying floats before doubles.
        """
        data_train = self._prepare_data(X, y)

        batch_size, _, accum_steps, max_iter = self._set_batch_size(data_train)

        rng = None  # not used
        debug = 0  # {0,1} print messages and log to tensorboard
        dn_logs = None  # tensorboard logs; only specify if debug=1
        path_save = None  # intermediate models saves; only specify if debug=1
        m, solver = _train(data_train,
                           batch_size,
                           self.order,
                           self.penalty,
                           rng,
                           self.learning_rate,
                           debug,
                           max_iter,
                           self.max_time,
                           self.init,
                           self.dftol_stop,
                           self.freltol_stop,
                           dn_logs,
                           accum_steps,
                           path_save,
                           self.shuffle,
                           device=self.device,
                           verbose=self.verbose,
                           groups=groups if not self.soft_grouping else None,
                           soft_groups=groups if self.soft_grouping else None)
        self._process_results(m, solver, X, groups=groups)
        return self
    def _process_torch_scores(self, scores):
        """
        Convert scores into flat numpy arrays.
        """
        if constants.Device.CUDA in scores.device.type:
            scores = scores.cpu()
        return scores.numpy().ravel()
    def _set_batch_size(self, data_train):
        """
        Ensures that batch_size is less than the number of rows.
        """
        batch_size = min(self.batch_size, data_train.N)
        target_batch_size = min(max(self.batch_size, self.target_batch_size),
                                data_train.N)
        accum_steps = max(int(np.ceil(target_batch_size / self.batch_size)), 1)
        max_iter = self.n_epochs * (data_train.N // batch_size)
        return batch_size, target_batch_size, accum_steps, max_iter
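
To make the batch-size arithmetic above concrete, here is a quick trace with made-up settings (batch_size=128, target_batch_size=1000, n_epochs=2, N=10000; all values are illustrative, not from the commit):

import numpy as np

batch_size = min(128, 10000)                        # 128
target_batch_size = min(max(128, 1000), 10000)      # 1000
accum_steps = max(int(np.ceil(1000 / 128)), 1)      # 8 gradient-accumulation steps
max_iter = 2 * (10000 // 128)                       # 156 minibatch iterations
print(batch_size, target_batch_size, accum_steps, max_iter)  # 128 1000 8 156

So gradients from 8 minibatches of 128 rows are accumulated to approximate a 1000-row batch.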
    def _process_results(self, m, solver, X, groups=None):
        """
        Process the results of a run into something suitable for transform().
        """
        self.scores_ = self._process_torch_scores(
            torch.sigmoid(m[constants.Checkpoint.MODEL]['x'] * 2))
        if self.max_features:
            self.max_features = min([self.max_features, self.scores_.shape[0]])
            n_features = self._recommend_number_features(solver)
            self.set_n_features(n_features, groups=groups)
        elif self.n_features:
            self.set_n_features(self.n_features, groups=groups)
        else:
            self.selected_features_ = m['feats']
        # subtract elapsed time from max_time
        self.max_time -= m['t']
        self.model_ = m
        return self
    def transform(self, X):
        """
        Returns selected features from X.

        Parameters
        ----------
        X : array-like
            Shape = [n_samples, n_features]
            The training input samples.
        """
        self._get_support_mask()
        if self.selected_features_.shape[0] == 0:
            raise ValueError(
                'No Features selected, consider lowering the penalty or specifying n_features')
        return (X.iloc[:, self.selected_features_]
                if isinstance(X, pd.DataFrame)
                else X[:, self.selected_features_])
    def get_support(self, indices=False):
        """
        Get a mask, or integer index, of the features selected.

        Parameters
        ----------
        indices : bool
            Default False.
            If True, the return value will be an array of integers, rather than a boolean mask.

        Returns
        -------
        list :
            returns support: An index that selects the retained features from a feature vector.
            If indices is False, this is a boolean array of shape [# input features],
            in which an element is True iff its corresponding feature is selected for retention.
            If indices is True, this is an integer array of shape [# output features] whose values
            are indices into the input feature vector.
        """
        self._get_support_mask()
        if indices:
            return self.selected_features_

        mask = np.zeros_like(self.scores_, dtype=bool)
        # pylint: disable=E1137
        mask[self.selected_features_] = True
        return mask
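
For illustration, the two return forms of get_support(), assuming a hypothetical fitted selector with 5 scored columns of which columns 1 and 3 were kept:

import numpy as np

scores_ = np.zeros(5)                  # placeholder scores for 5 columns
selected_features_ = np.array([1, 3])  # pretend columns 1 and 3 were kept

mask = np.zeros_like(scores_, dtype=bool)
mask[selected_features_] = True
print(mask)                # [False  True False  True False]  (indices=False)
print(selected_features_)  # [1 3]                             (indices=True)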
    def inverse_transform(self, X):
        """
        Returns transformed X to the original number of columns.
        This operation is lossy and all columns not in the transformed data
        will be returned as columns of 0s.
        """
        self._get_support_mask()
        X_new = np.zeros((X.shape[0], self.scores_.shape[0]))
        # assign along columns, not rows: the selected features are column
        # indices into the original feature space
        X_new[:, self.selected_features_] = X
        return X_new
    def get_params(self, deep=True):
        """
        Get parameters for this estimator.
        """
        params = self.__dict__
        params = {key: val for (key, val) in params.items()
                  if not key.endswith('_')}
        return params
    def set_params(self, **params):
        """
        Set the parameters of this estimator.
        """
        for param in params:
            if hasattr(self, param):
                setattr(self, param, params[param])
        return self
    def fit_transform(self, X, y):
        """
        Select features and then return X with the selected features.

        Parameters
        ----------
        X : array-like
            Shape = [n_samples, n_features]
            The training input samples.
        y : array-like
            Shape = [n_samples]
            The target values (class labels in classification, real numbers in
            regression).
        """
        self.fit(X, y)
        return self.transform(X)
    def _get_support_mask(self):
        """
        Check if it is fitted.
        """
        check_is_fitted(self, 'scores_')
    def _generate_scores(self, solver, xsub, ysub, step_size, feature_order):
        """
        Generate forward passes to determine the number of features when max_features is set.
        """
        scores = []
        for i in np.arange(1, self.max_features + 1, step_size):
            # optimization possible since xsub is growing?
            i = int(np.ceil(i))
            # pylint: disable=E1102
            score = solver.f_train(
                torch.tensor(np.ones(i),
                             dtype=torch.get_default_dtype()).unsqueeze(1).to(self.device),
                xsub[:, feature_order[:i]],
                ysub)
            if constants.Device.CUDA in score.device.type:
                score = score.cpu()
            # score.numpy()[0][0]
            scores.append(score)
        return scores
    def set_n_features(self, n, groups=None):
        """
        Set the number of features to return after fitting.
        """
        self._get_support_mask()
        self.n_features = n
        return self._set_top_features(groups=groups)
    def _set_top_features(self, groups=None):
        """
        Set the selected features after a run.

        With groups, ensures that if any member of a group is selected, all members are selected.
        """
        self._get_support_mask()
        assert self.n_features <= self.scores_.shape[0], \
            'n_features must be less than or equal to the number of columns in X'
        # pylint: disable=E1130
        self.selected_features_ = np.argpartition(
            self.scores_, -self.n_features)[-self.n_features:]
        if groups is not None and not self.soft_grouping:
            selected_feature_set = set(self.selected_features_.tolist())
            # compare against each group label in turn, so that a partially
            # selected group pulls in all of its members
            for group in np.unique(groups):
                group_members = np.where(groups == group)[0].tolist()
                if selected_feature_set.intersection(group_members):
                    selected_feature_set.update(group_members)
            self.selected_features_ = np.array(list(selected_feature_set))
        self.selected_features_ = np.sort(self.selected_features_)
        return self
    def set_top_percentile(self, percentile, groups=None):
        """
        Set the percentile of features to return after fitting.
        """
        self._get_support_mask()
        assert percentile <= 1 and percentile >= 0, \
            'percentile must be between 0 and 1 inclusive'
        self.n_features = int(self.scores_.shape[0] * percentile)
        return self._set_top_features(groups=groups)
    def _recommend_number_features(self, solver, max_time=None):
        """
        Get the recommended number of features by doing forward passes when max_features is set.
        """
        max_time = max_time if max_time else self.max_time
        if max_time < 0:
            max_time = 60  # allow 1 minute extra if we already spent max_time
        MAX_FORWARD_PASS = 200
        # the forward passes can take longer than the fitting if we allow a
        # full epoch of data to be included. By only doing 3 full batches at
        # most we get enough accuracy without increasing the time too much.
        # This constant may not be optimal.
        MAX_FULL_BATCHES = 3
        accum_steps = solver.accum_steps
        step_size = max(self.max_features / MAX_FORWARD_PASS, 1)
        # pylint: disable=E1130
        feature_order = np.argsort(-self.scores_)  # note the negative

        t = time.time()
        dataloader_iterator = iter(solver.ds_train)
        full_scores = []
        # keep_going = True
        with torch.no_grad():
            # might want to only consider a batch valid if there are at least
            # two classes
            for _ in range(accum_steps * MAX_FULL_BATCHES):
                scores = []
                try:
                    xsub, ysub = next(dataloader_iterator)
                except StopIteration:
                    # done with epoch, don't do more than one epoch
                    break
                except Exception as e:
                    print(e)
                    break
                if max_time and time.time() - t > max_time:
                    if self.verbose:
                        print("Stopping forward passes because they reached max_time: ",
                              max_time)
                    if not full_scores:
                        # no forward passes worked, return half of max_features
                        return self.max_features // 2
                    break
                if solver.multiclass:
                    for target_class in range(solver.n_classes):
                        ysub_binary = solver.transform_y_into_binary(ysub, target_class)
                        scaling_value = solver._get_scaling_value(ysub, target_class)
                        if not solver._skip_y_forward(ysub_binary):
                            scores = self._generate_scores(
                                solver, xsub, ysub_binary, step_size, feature_order)
                            # one row will represent one class that is present in the data
                            # all classes are weighted equally
                            full_scores.append([score * scaling_value for score in scores])
                else:
                    if not solver._skip_y_forward(ysub):
                        scores = self._generate_scores(
                            solver, xsub, ysub, step_size, feature_order)
                        full_scores.append(scores)
        best_index = FeatureGradientSelector._find_best_index_elbow(full_scores)
        if self.verbose:
            print("Forward passes took: ", time.time() - t)
        # account for step size and off by one (n_features is 1 indexed, not 0)
        return int(np.ceil(np.arange(1, self.max_features + 1, step_size))[best_index])
    @staticmethod
    def _find_best_index_elbow(full_scores):
        """
        Finds the point on the curve that maximizes distance from the line determined by the endpoints.
        """
        scores = pd.DataFrame(full_scores).mean(0).values.tolist()
        first_point = np.array([0, scores[0]])
        last_point = np.array([len(scores) - 1, scores[-1]])
        elbow_metric = []
        for i in range(len(scores)):
            elbow_metric.append(
                FeatureGradientSelector._distance_to_line(
                    first_point, last_point, np.array([i, scores[i]])))
        return np.argmax(elbow_metric)
    @staticmethod
    def _distance_to_line(start_point, end_point, new_point):
        """
        Calculates the shortest distance from new_point to the line determined by start_point and end_point.
        """
        # for calculating elbow method
        return np.cross(new_point - start_point,
                        end_point - start_point) / np.linalg.norm(end_point - start_point)
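
To see the elbow heuristic in action, here is a small self-contained sketch on synthetic scores (the score values are made up; the helper mirrors _distance_to_line above):

import numpy as np

def distance_to_line(start_point, end_point, new_point):
    # 2-D cross product gives the signed parallelogram area; dividing by the
    # base length leaves the perpendicular height, i.e. the distance
    return np.cross(new_point - start_point, end_point - start_point) \
        / np.linalg.norm(end_point - start_point)

scores = [1.0, 0.5, 0.3, 0.25, 0.24, 0.23]  # loss vs. number of features
first = np.array([0, scores[0]])
last = np.array([len(scores) - 1, scores[-1]])
elbow = [distance_to_line(first, last, np.array([i, s]))
         for i, s in enumerate(scores)]
print(np.argmax(elbow))  # 2 -> the knee of the curve sits at ~3 features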
    def _reset(self):
        """
        Reset the estimator by deleting all private and fit parameters.
        """
        params = self.__dict__
        # iterate over a copy of the keys, since delattr mutates __dict__
        for key in list(params.keys()):
            if key.endswith('_') or key.startswith('_'):
                delattr(self, key)
        return self
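
For orientation, a minimal end-to-end sketch of the selector defined in this file. This is not part of the commit: the synthetic data, the n_features value, and the assumption that the package __init__ re-exports FeatureGradientSelector are all illustrative.

import numpy as np
from nni.feature_engineering.gradient_selector import FeatureGradientSelector

X = np.random.randn(1000, 20)             # 1000 samples, 20 features
y = (X[:, 0] + X[:, 3] > 0).astype(int)   # labels driven by columns 0 and 3

fgs = FeatureGradientSelector(n_features=5)  # ask for exactly 5 features
fgs.fit(X, y)
print(fgs.get_selected_features())           # indices of the 5 kept columns
X_reduced = fgs.transform(X)                 # shape (1000, 5)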
src/sdk/pynni/nni/feature_engineering/gradient_selector/learnability.py
0 → 100644
View file @ cd3a912a
# Copyright (c) Microsoft Corporation. All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
# associated documentation files (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge, publish, distribute,
# sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or
# substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
# NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT
# OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# ==================================================================================================
import time

import numpy as np
import scipy.special
import torch
import torch.nn as nn

import nni.feature_engineering.gradient_selector.constants as constants
import nni.feature_engineering.gradient_selector.syssettings as syssettings
from nni.feature_engineering.gradient_selector.fginitialize import ChunkDataLoader

torch.set_default_tensor_type(syssettings.torch.tensortype)
sparsetensor = syssettings.torch.sparse.tensortype
def def_train_opt(p):
    """
    Return the default optimizer.
    """
    return torch.optim.Adam(p, 1e-1, amsgrad=False)
def revcumsum(U):
    """
    Reverse cumulative sum for faster performance.
    """
    return U.flip(dims=[0]).cumsum(dim=0).flip(dims=[0])
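
A quick check of what revcumsum computes: each output element is the suffix sum of the input from that position onward (the tensor values here are made up):

import torch

U = torch.tensor([1., 2., 3., 4.])
out = U.flip(dims=[0]).cumsum(dim=0).flip(dims=[0])
print(out)  # tensor([10., 9., 7., 4.]) -- out[i] == U[i:].sum()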
def triudr(X, r):
    Zr = torch.zeros_like(X, requires_grad=False)
    U = X * r
    Zr[:-1] = X[:-1] * revcumsum(U)[1:]
    return Zr


def triudl(X, l):
    Zl = torch.zeros_like(X, requires_grad=False)
    U = X * l
    Zl[1:] = X[1:] * (U.cumsum(dim=0)[:-1])
    return Zl
class ramp(torch.autograd.Function):
    """
    Ensures input is between 0 and 1
    """

    @staticmethod
    def forward(ctx, input_data):
        ctx.save_for_backward(input_data)
        return input_data.clamp(min=0, max=1)

    @staticmethod
    def backward(ctx, grad_output):
        input_data, = ctx.saved_tensors
        grad_input = grad_output.clone()
        grad_input[input_data < 0] = 1e-2
        grad_input[input_data > 1] = -1e-2
        return grad_input
class safesqrt(torch.autograd.Function):
    """
    Square root without dividing by 0.
    """

    @staticmethod
    def forward(ctx, input_data):
        o = input_data.sqrt()
        ctx.save_for_backward(input_data, o)
        return o

    @staticmethod
    def backward(ctx, grad_output):
        _, o = ctx.saved_tensors
        grad_input = grad_output.clone()
        grad_input *= 0.5 / (o + constants.EPSILON)
        return grad_input
class LearnabilityMB(nn.Module):
    """
    Calculates the learnability of a set of features.
    mini-batch version w/ "left" and "right" multiplies
    """

    def __init__(self, Nminibatch, D, coeff, groups=None, binary=False,
                 device=constants.Device.CPU):
        super(LearnabilityMB, self).__init__()

        a = coeff / scipy.special.binom(Nminibatch, np.arange(coeff.size) + 2)
        self.order = a.size
        # pylint: disable=E1102
        self.a = torch.tensor(a, dtype=torch.get_default_dtype(), requires_grad=False)
        self.binary = binary
        self.a = self.a.to(device)

    def ret_val(self, z):
        """
        Get the return value based on z.
        """
        if not self.binary:
            return 1 - z
        else:
            return 0.5 * (1 - safesqrt.apply(ramp.apply(z)))

    def forward(self, s, X, y):
        l = y.clone()
        r = y.clone()
        z = 0

        for i in range(self.order):
            if i % 2 == 0:
                Z = triudr(X, r)
                r = torch.mm(Z, s)
            else:
                Z = triudl(X, l)
                l = torch.mm(Z, s)
            if self.a[i] != 0:
                # save the computation if a[i] is 0
                p = torch.mm(l.t(), r)
                z += self.a[i] * p
        return self.ret_val(z)
class Solver(nn.Module):
    """
    Class that performs the main optimization.
    Keeps track of the current x and iterates through data to learn x given the penalty and order.
    """

    def __init__(self,
                 PreparedData,
                 order,
                 Nminibatch=None,
                 groups=None,
                 soft_groups=None,
                 x0=None,
                 C=1,
                 ftransform=torch.sigmoid,
                 get_train_opt=def_train_opt,
                 accum_steps=1,
                 rng=np.random.RandomState(0),
                 max_norm_clip=1.,
                 shuffle=True,
                 device=constants.Device.CPU,
                 verbose=1):
        """
        Parameters
        ----------
        PreparedData : Dataset of PrepareData class
        order : int
            What order of interactions to include. Higher orders
            may be more accurate but increase the run time. 12 is the maximum allowed order.
        Nminibatch : int
            Number of rows in a mini batch
        groups : array-like
            Optional, shape = [n_features]
            Groups of columns that must be selected as a unit
            e.g. [0, 0, 1, 2] specifies the first two columns are part of a group.
        soft_groups : array-like
            Optional, shape = [n_features]
            Groups of columns that come from the same source.
            Used to encourage sparsity in the number of sources selected
            e.g. [0, 0, 1, 2] specifies the first two columns are part of a group.
        x0 : torch.tensor
            Optional, initialization of x.
        C : float
            Penalty parameter.
        ftransform : function
            Function to transform the x. sigmoid is the default.
        get_train_opt : function
            Function that returns a pytorch optimizer, Adam is the default
        accum_steps : int
            Number of gradient-accumulation steps
        rng : random state
        max_norm_clip : float
            Maximum allowable size of the gradient
        shuffle : bool
            Whether or not to shuffle data within the dataloader
        device : str
            'cpu' to run on CPU and 'cuda' to run on GPU. Runs much faster on GPU
        verbose : int
            Controls the verbosity when fitting. Set to 0 for no printing
            1 or higher for printing every verbose number of gradient steps.
        """
        super(Solver, self).__init__()

        self.Ntrain, self.D = PreparedData.N, PreparedData.n_features
        if groups is not None:
            # pylint: disable=E1102
            groups = torch.tensor(groups, dtype=torch.long)
            self.groups = groups
        else:
            self.groups = None
        if soft_groups is not None:
            # pylint: disable=E1102
            soft_groups = torch.tensor(soft_groups, dtype=torch.long)
            self.soft_D = torch.unique(soft_groups).size()[0]
        else:
            self.soft_D = None
        self.soft_groups = soft_groups

        if Nminibatch is None:
            Nminibatch = self.Ntrain
        else:
            if Nminibatch > self.Ntrain:
                print('Minibatch larger than sample size.'
                      + (' Reducing from %d to %d.' % (Nminibatch, self.Ntrain)))
                Nminibatch = self.Ntrain
        if Nminibatch > PreparedData.max_rows:
            print('Minibatch larger than mem-allowed.'
                  + (' Reducing from %d to %d.' % (Nminibatch, PreparedData.max_rows)))
            Nminibatch = int(np.min([Nminibatch, PreparedData.max_rows]))
        self.Nminibatch = Nminibatch
        self.accum_steps = accum_steps

        if x0 is None:
            x0 = torch.zeros(self.D, 1, dtype=torch.get_default_dtype())
        self.ftransform = ftransform
        self.x = nn.Parameter(x0)
        self.max_norm = max_norm_clip

        self.device = device
        self.verbose = verbose

        self.multiclass = (PreparedData.classification
                           and PreparedData.n_classes
                           and PreparedData.n_classes > 2)
        if self.multiclass:
            self.n_classes = PreparedData.n_classes
        else:
            self.n_classes = None
        # whether to treat all classes equally
        self.balanced = PreparedData.balanced
        self.ordinal = PreparedData.ordinal

        if (hasattr(PreparedData, 'mappings')
                or PreparedData.storage_level == 'disk'):
            num_workers = PreparedData.num_workers
        elif PreparedData.storage_level == constants.StorageLevel.DENSE:
            num_workers = 0
        else:
            num_workers = 0

        if constants.Device.CUDA in device:
            pin_memory = False
        else:
            pin_memory = False

        self.ds_train = ChunkDataLoader(
            PreparedData,
            batch_size=self.Nminibatch,
            shuffle=shuffle,
            drop_last=True,
            num_workers=num_workers,
            pin_memory=pin_memory,
            timeout=60)
        self.f_train = LearnabilityMB(self.Nminibatch, self.D,
                                      constants.Coefficients.SLE[order],
                                      self.groups,
                                      binary=PreparedData.classification,
                                      device=self.device)
        self.opt_train = get_train_opt(torch.nn.ParameterList([self.x]))
        self.it = 0
        self.iters_per_epoch = int(np.ceil(
            len(self.ds_train.dataset) / self.ds_train.batch_size))
        self.f_train = self.f_train.to(device)
        # pylint: disable=E1102
        self.w = torch.tensor(C / (C + 1),
                              dtype=torch.get_default_dtype(),
                              requires_grad=False)
        self.w = self.w.to(device)
    def penalty(self, s):
        """
        Calculate L1 Penalty.
        """
        to_return = torch.sum(s) / self.D
        if self.soft_groups is not None:
            # if soft_groups, there is an additional penalty for using more
            # groups
            s_grouped = torch.zeros(self.soft_D, 1,
                                    dtype=torch.get_default_dtype(),
                                    device=self.device)
            for group in torch.unique(self.soft_groups):
                # groups should be indexed 0 to n_group - 1
                # TODO: consider other functions here
                s_grouped[group] = s[self.soft_groups == group].max()
            # each component of the penalty contributes .5
            # TODO: could make this a user given parameter
            to_return = (to_return + torch.sum(s_grouped) / self.soft_D) * .5
        return to_return
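
Tracing the soft-group penalty on made-up tensors (4 features drawn from 2 sources; the per-group max rewards turning a whole source off, not just some of its columns):

import torch

s = torch.tensor([[0.9], [0.1], [0.8], [0.2]])
soft_groups = torch.tensor([0, 0, 1, 1])
D, soft_D = 4, 2

base = torch.sum(s) / D                     # plain L1 term: 2.0 / 4 = 0.5
s_grouped = torch.stack([s[soft_groups == g].max()
                         for g in torch.unique(soft_groups)])
group_term = torch.sum(s_grouped) / soft_D  # (0.9 + 0.8) / 2 = 0.85
print((base + group_term) * 0.5)            # tensor(0.6750)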
    def forward_and_backward(self, s, xsub, ysub, retain_graph=False):
        """
        Completes the forward operation and computes gradients for learnability and penalty.
        """
        f_train = self.f_train(s, xsub, ysub)
        pen = self.penalty(s)
        # pylint: disable=E1102
        grad_outputs = torch.tensor([[1]], dtype=torch.get_default_dtype(),
                                    device=self.device)
        g1, = torch.autograd.grad([f_train], [self.x], grad_outputs,
                                  retain_graph=True)
        # pylint: disable=E1102
        grad_outputs = torch.tensor([[1]], dtype=torch.get_default_dtype(),
                                    device=self.device)
        g2, = torch.autograd.grad([pen], [self.x], grad_outputs,
                                  retain_graph=retain_graph)
        return f_train, pen, g1, g2
    def combine_gradient(self, g1, g2):
        """
        Combine gradients from learnability and penalty

        Parameters
        ----------
        g1 : array-like
            gradient from learnability
        g2 : array-like
            gradient from penalty
        """
        to_return = ((1 - self.w) * g1 + self.w * g2) / self.accum_steps
        if self.groups is not None:
            # each column will get a gradient, but we can only move whole
            # groups up or down, so the gradient for the group
            # should be the average of the gradients of the columns
            to_return_grouped = torch.zeros_like(self.x)
            for group in torch.unique(self.groups):
                to_return_grouped[self.groups == group] = \
                    to_return[self.groups == group].mean()
            to_return = to_return_grouped
        return to_return
    def combine_loss(self, f_train, pen):
        """
        Combine the learnability and L1 penalty.
        """
        return ((1 - self.w) * f_train.detach() + self.w * pen.detach()) \
            / self.accum_steps
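
Since w = C/(C+1), the convex combination above is algebraically the familiar regularized objective (f_train + C * pen) / (C + 1), so C plays the usual role of a regularization constant. A quick numeric check with made-up values:

C, f, p = 2.0, 0.6, 0.3
w = C / (C + 1)
print((1 - w) * f + w * p)    # 0.4
print((f + C * p) / (C + 1))  # 0.4, same thing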
    def transform_y_into_binary(self, ysub, target_class):
        """
        Transforms multiclass classification problems into a binary classification problem.
        """
        with torch.no_grad():
            ysub_binary = torch.zeros_like(ysub)
            if self.ordinal:
                # turn ordinal problems into n-1 classifications of is this
                # example less than rank k
                if target_class == 0:
                    return None
                ysub_binary[ysub >= target_class] = 1
                ysub_binary[ysub < target_class] = -1
            else:
                # turn multiclass problems into n binary classifications
                ysub_binary[ysub == target_class] = 1
                ysub_binary[ysub != target_class] = -1
        return ysub_binary
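
An illustration of the two encodings on a made-up label vector (mirroring the assignments above):

import torch

ysub = torch.tensor([0, 1, 2, 1])
target_class = 1

b = torch.zeros_like(ysub)            # multiclass: one-vs-rest
b[ysub == target_class] = 1
b[ysub != target_class] = -1
print(b)  # tensor([-1,  1, -1,  1])

o = torch.zeros_like(ysub)            # ordinal: "is the rank >= target_class?"
o[ysub >= target_class] = 1
o[ysub < target_class] = -1
print(o)  # tensor([-1,  1,  1,  1])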
    def _get_scaling_value(self, ysub, target_class):
        """
        Returns the weight given to a class for multiclass classification.
        """
        if self.balanced:
            if self.ordinal:
                return 1 / (torch.unique(ysub).size()[0] - 1)

            return 1 / torch.unique(ysub).size()[0]
        else:
            if self.ordinal:
                this_class_proportion = torch.mean(ysub >= target_class)
                normalizing_constant = 0
                for i in range(1, self.n_classes):
                    normalizing_constant += torch.mean(ysub >= i)
                return this_class_proportion / normalizing_constant
            else:
                return torch.mean(ysub == target_class)
    def _skip_y_forward(self, y):
        """
        Returns boolean of whether to skip the current y if there is nothing to be learned from it.
        """
        if y is None:
            return True
        elif torch.unique(y).size()[0] < 2:
            return True
        else:
            return False
    def train(self, f_callback=None, f_stop=None):
        """
        Trains the estimator to determine which features to include.

        Parameters
        ----------
        f_callback : function
            Function that performs a callback
        f_stop : function
            Function that tells you when to stop
        """
        t = time.time()
        h = torch.zeros([1, 1], dtype=torch.get_default_dtype())
        h = h.to(self.device)
        # h_complete is so when we divide by the number of classes
        # we only do that for that minibatch if accumulating
        h_complete = h.clone()
        flag_stop = False
        dataloader_iterator = iter(self.ds_train)
        self.x.grad = torch.zeros_like(self.x)
        while not flag_stop:
            try:
                xsub, ysub = next(dataloader_iterator)
            except StopIteration:
                dataloader_iterator = iter(self.ds_train)
                xsub, ysub = next(dataloader_iterator)
            try:
                s = self.ftransform(self.x)
                s = s.to(self.device)
                if self.multiclass:
                    # accumulate gradients over each class, classes range from
                    # 0 to n_classes - 1
                    #num_classes_batch = torch.unique(ysub).size()[0]
                    for target_class in range(self.n_classes):
                        ysub_binary = self.transform_y_into_binary(ysub, target_class)
                        if self._skip_y_forward(ysub_binary):
                            continue
                        # should skip if target class is not included
                        # but that changes what we divide by
                        scaling_value = self._get_scaling_value(ysub, target_class)
                        f_train, pen, g1, g2 = self.forward_and_backward(
                            s, xsub, ysub_binary, retain_graph=True)
                        self.x.grad += self.combine_gradient(g1, g2) * scaling_value
                        h += self.combine_loss(f_train, pen) * scaling_value
                else:
                    if not self._skip_y_forward(ysub):
                        f_train, pen, g1, g2 = self.forward_and_backward(s, xsub, ysub)
                        self.x.grad += self.combine_gradient(g1, g2)
                        h += self.combine_loss(f_train, pen)
                    else:
                        continue
                h_complete += h
                self.it += 1
                if torch.isnan(h):
                    raise constants.NanError(
                        'Loss is nan, something may be misconfigured')
                if self.it % self.accum_steps == 0:
                    torch.nn.utils.clip_grad_norm_(
                        torch.nn.ParameterList([self.x]), max_norm=self.max_norm)
                    self.opt_train.step()

                    t = time.time() - t
                    if f_stop is not None:
                        flag_stop = f_stop(self, h, self.it, t)

                    if f_callback is not None:
                        f_callback(self, h, self.it, t)
                    elif self.verbose and (self.it // self.accum_steps) % self.verbose == 0:
                        epoch = int(self.it / self.iters_per_epoch)
                        print('[Minibatch: %6d/ Epoch: %3d/ t: %3.3f s] Loss: %0.3f' %
                              (self.it, epoch, t, h_complete / self.accum_steps))

                    if flag_stop:
                        break

                    self.opt_train.zero_grad()
                    h = 0
                    h_complete = 0
                    t = time.time()
            except KeyboardInterrupt:
                flag_stop = True
                break
src/sdk/pynni/nni/feature_engineering/gradient_selector/requirements.txt
0 → 100644
View file @ cd3a912a
numpy==1.14.3
scikit-learn==0.20.0
scipy==1.1.0
torch==1.1.0
src/sdk/pynni/nni/feature_engineering/gradient_selector/syssettings.py
0 → 100644
View file @ cd3a912a
# Copyright (c) Microsoft Corporation. All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
# associated documentation files (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge, publish, distribute,
# sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or
# substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
# NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT
# OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# ==================================================================================================
import torch

# pytorch
torch.tensortype = torch.FloatTensor
torch.sparse.tensortype = torch.sparse.FloatTensor

# mem
MAXMEMGB = 10
src/sdk/pynni/nni/feature_engineering/gradient_selector/utils.py
0 → 100644
View file @ cd3a912a
# Copyright (c) Microsoft Corporation. All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
# associated documentation files (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge, publish, distribute,
# sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or
# substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
# NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT
# OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# ==================================================================================================
import numpy as np


class EMA():
    """
    maintains an exponential moving average
    """

    def __init__(self, f=np.nan, discount_factor=0.1, valid_after=None,
                 n_iters_relchange=3):
        self.f_ma = [f]
        self.fs = [f]
        self.gamma = discount_factor
        self.rel_change = [np.nan]
        if valid_after is None:
            self.valid_after = int(1 / discount_factor)
        else:
            self.valid_after = valid_after
        self.n_iters_relchange = n_iters_relchange
        self.initialized = False

    def reset(self, f):
        self.f_ma = [f]
        self.fs = [f]
        self.rel_change = [np.nan]
        self.initialized = True

    def relchange(self):
        if self.num_updates() > np.max([self.valid_after,
                                        self.n_iters_relchange]):
            return np.max(self.rel_change[-self.n_iters_relchange:])
        else:
            return np.nan

    def update(self, f_new):
        if not self.initialized:
            self.reset(f_new)
        else:
            self.fs.append(f_new)
            self.f_ma.append(self.f_ma[-1] * (1 - self.gamma)
                             + self.gamma * f_new)
            if self.num_updates() > self.valid_after:
                self.rel_change.append(np.abs((self.f_ma[-1] - self.f_ma[-2])
                                              / self.f_ma[-2]))

    def num_updates(self):
        return len(self.f_ma)

    def __call__(self):
        if self.num_updates() > self.valid_after:
            return self.f_ma[-1]
        else:
            return np.nan
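
A hypothetical usage sketch of this helper: smooth a noisy loss curve and query the recent relative change once enough updates have accrued (the loss values are made up):

import numpy as np
from nni.feature_engineering.gradient_selector.utils import EMA

ema = EMA(discount_factor=0.1)  # valid_after defaults to int(1/0.1) = 10
for loss in 0.5 + 0.1 * np.random.randn(50):  # synthetic noisy losses
    ema.update(loss)
print(ema())            # smoothed loss, valid once > 10 updates have accrued
print(ema.relchange())  # max relative change over the last 3 iterations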