Unverified commit cd3a912a authored by SparkSnail, committed by GitHub

Merge pull request #218 from microsoft/master

merge master
parents a0846f2a e9cba778
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
""" """
A family of functions used by CurvefittingAssessor A family of functions used by CurvefittingAssessor
""" """
import numpy as np import numpy as np
all_models = {} all_models = {}
......
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import logging
import numpy as np
...
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import numpy as np
import unittest
...
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import os
from collections import namedtuple
...
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
""" """
evolution_tuner.py evolution_tuner.py
""" """
......
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
""" """
test_evolution_tuner.py test_evolution_tuner.py
""" """
......
# Copyright (c) Microsoft Corporation. All rights reserved.
# MIT License
# ==================================================================================================
import logging
_logger = logging.getLogger(__name__)
class FeatureSelector():
def __init__(self, **kwargs):
self.selected_features_ = None
self.X = None
self.y = None
def fit(self, X, y, **kwargs):
"""
Fit the training data to FeatureSelector
Parameters
----------
X : array-like numpy matrix
The training input samples, whose shape is [n_samples, n_features].
y : array-like numpy matrix
The target values (class labels in classification, real numbers in
regression), whose shape is [n_samples].
"""
self.X = X
self.y = y
def get_selected_features(self):
"""
Get the indices of the selected features.
Returns
-------
list :
Returns the indices of the important features.
"""
return self.selected_features_
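# Minimal usage sketch (hypothetical toy subclass, not part of this file's
# API): a concrete selector only needs to populate self.selected_features_
# in fit(); get_selected_features() then returns it.
if __name__ == '__main__':
    import numpy as np

    class VarianceTopKSelector(FeatureSelector):
        """Toy selector: keep the topk highest-variance columns."""
        def __init__(self, topk=2, **kwargs):
            super().__init__(**kwargs)
            self.topk = topk

        def fit(self, X, y, **kwargs):
            self.X, self.y = X, y
            self.selected_features_ = np.argsort(X.var(axis=0))[-self.topk:]

    demo = VarianceTopKSelector(topk=2)
    demo.fit(np.random.RandomState(0).rand(100, 5), y=None)
    print(demo.get_selected_features())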
from .gbdt_selector import GBDTSelector
# Copyright (c) Microsoft Corporation. All rights reserved.
# MIT License
# ==================================================================================================
"""
gbdt_selector.py including:
class GBDTSelector
"""
from sklearn.model_selection import train_test_split
from nni.feature_engineering.feature_selector import FeatureSelector
# pylint: disable=E0401
import lightgbm as lgb
class GBDTSelector(FeatureSelector):
def __init__(self, **kwargs):
self.selected_features_ = None
self.X = None
self.y = None
self.feature_importance = None
self.lgb_params = None
self.eval_ratio = None
self.early_stopping_rounds = None
self.importance_type = None
self.num_boost_round = None
self.model = None
def fit(self, X, y, **kwargs):
"""
Fit the training data to FeatureSelector
Parameters
----------
X : array-like numpy matrix
The training input samples, whose shape is [n_samples, n_features].
y : array-like numpy matrix
The target values (class labels in classification, real numbers in
regression), whose shape is [n_samples].
lgb_params : dict
Parameters of lightgbm
eval_ratio : float
The fraction of the data held out for evaluation when splitting self.X into train and eval sets.
early_stopping_rounds : int
The early stopping setting in lightgbm.
importance_type : str
Supported types are 'gain' and 'split'.
num_boost_round : int
num_boost_round in lightgbm.
"""
assert kwargs['lgb_params']
assert kwargs['eval_ratio']
assert kwargs['early_stopping_rounds']
assert kwargs['importance_type']
assert kwargs['num_boost_round']
self.X = X
self.y = y
self.lgb_params = kwargs['lgb_params']
self.eval_ratio = kwargs['eval_ratio']
self.early_stopping_rounds = kwargs['early_stopping_rounds']
self.importance_type = kwargs['importance_type']
self.num_boost_round = kwargs['num_boost_round']
X_train, X_test, y_train, y_test = train_test_split(self.X,
self.y,
test_size=self.eval_ratio,
random_state=41)
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
self.model = lgb.train(self.lgb_params,
lgb_train,
num_boost_round=self.num_boost_round,
valid_sets=lgb_eval,
early_stopping_rounds=self.early_stopping_rounds)
self.feature_importance = self.model.feature_importance(self.importance_type)
def get_selected_features(self, topk):
"""
Get the topk most important feature indices after fitting.
Returns
-------
list :
Returns the indices of the important features.
"""
assert topk > 0
self.selected_features_ = self.feature_importance.argsort()[-topk:][::-1]
return self.selected_features_
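# Usage sketch (synthetic data; the lgb_params values are illustrative
# assumptions, not tuned recommendations):
if __name__ == '__main__':
    import numpy as np
    rng = np.random.RandomState(0)
    X_demo = rng.rand(500, 20)
    y_demo = (X_demo[:, 3] + X_demo[:, 7] > 1).astype(int)
    demo = GBDTSelector()
    demo.fit(X_demo, y_demo,
             lgb_params={'objective': 'binary', 'verbose': -1},
             eval_ratio=0.2,
             early_stopping_rounds=10,
             importance_type='gain',
             num_boost_round=50)
    print(demo.get_selected_features(topk=2))  # e.g. indices 3 and 7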
from .gradient_selector import FeatureGradientSelector
# Copyright (c) Microsoft Corporation. All rights reserved.
# MIT License
# ==================================================================================================
import numpy as np
class StorageLevel:
DISK = 'disk'
SPARSE = 'sparse'
DENSE = 'dense'
class DataFormat:
SVM = 'svm'
NUMPY = 'numpy'
ALL_FORMATS = [SVM, NUMPY]
class Preprocess:
"""
ZSCORE: center the data to mean 0 and scale to unit variance.
CENTER: center the data to mean 0.
"""
ZSCORE = 'zscore'
CENTER = 'center'
class Device:
CUDA = 'cuda'
CPU = 'cpu'
class Checkpoint:
MODEL = 'model_state_dict'
OPT = 'optimizer_state_dict'
RNG = 'torch_rng_state'
class NanError(ValueError):
pass
class Initialization:
ZERO = 'zero'
ON = 'on'
OFF = 'off'
ON_HIGH = 'onhigh'
OFF_HIGH = 'offhigh'
SKLEARN = 'sklearn'
RANDOM = 'random'
VALUE_DICT = {ZERO: 0,
ON: 1,
OFF: -1,
ON_HIGH: 5,
OFF_HIGH: -1,
SKLEARN: None,
RANDOM: None}
class Coefficients:
""""
coefficients for sublinear estimator were computed running the sublinear
paper's authors' code
"""
SLE = {1: np.array([0.60355337]),
2: np.array([1.52705001, -0.34841729]),
3: np.array([2.90254224, -1.87216745, 0.]),
4: np.array([4.63445685, -5.19936195, 0., 1.50391676]),
5: np.array([6.92948049, -14.12216211, 9.4475009, 0., -1.21093546]),
6: np.array([9.54431082, -28.09414643, 31.84703652, -11.18763791, -1.14175281, 0.]),
7: np.array([12.54505041, -49.64891525, 79.78828031, -46.72250909, 0., 0., 5.02973646]),
8: np.array([16.03550163, -84.286182, 196.86078756, -215.36747071, 92.63961263, 0., 0., -4.86280869]),
9: np.array([19.86409184, -130.76801006, 390.95349861, -570.09210416, 354.77764899, 0., -73.84234865, 0., 10.09148767]),
10: np.array([2.41117752e+01, -1.94946061e+02, 7.34214614e+02, -1.42851995e+03, 1.41567410e+03, \
-5.81738134e+02, 0., 0., 3.11664751e+01, 1.05018365e+00]),
11: np.array([28.75280839, -279.22576729, 1280.46325445, -3104.47148101, 3990.6092248, -2300.29413333, \
0., 427.35289033, 0., 0., -42.17587475]),
12: np.array([33.85141912, -391.4229382, 2184.97827882, -6716.28280208, 11879.75233977, -11739.97267239, \
5384.94542245, 0., -674.23291712, 0., 0., 39.37456439])}
EPSILON = 1e-8
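# Quick standalone illustration of the two Preprocess modes on a toy column
# (a sketch; the data pipeline applies analogous transforms internally):
if __name__ == '__main__':
    x = np.array([1., 2., 3., 4.])
    centered = x - x.mean()           # Preprocess.CENTER: zero mean
    zscored = centered / x.std()      # Preprocess.ZSCORE: zero mean, unit variance
    print(centered, zscored)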
# Copyright (c) Microsoft Corporation. All rights reserved.
# MIT License
# ==================================================================================================
import os
import pickle
import sys
import time
import numpy as np
import scipy.sparse
from sklearn.datasets import load_svmlight_file
import torch
from torch.utils.data import DataLoader, Dataset
# pylint: disable=E0611
from torch.utils.data.dataloader import _DataLoaderIter, _utils
import nni.feature_engineering.gradient_selector.constants as constants
import nni.feature_engineering.gradient_selector.syssettings as syssettings
torch.set_default_tensor_type(syssettings.torch.tensortype)
sparsetensor = syssettings.torch.sparse.tensortype
BYTESPERREAL = 8.
BYTESPERGB = 1024. ** 3
class PrepareData(Dataset):
def __init__(self,
path_data=None,
data_format=constants.DataFormat.NUMPY,
D=None, N=None,
classification=True,
ordinal=False,
balanced=True,
preprocess=None,
n_to_estimate=None,
MAXMEMGB=syssettings.MAXMEMGB,
set_params=True,
path_mappings=None,
X=None,
y=None,
verbose=0,
n_classes=None,
device=constants.Device.CPU):
"""
Dataset class with helpful features and functions for being included in a dataloader
and managing memory usage.
Can read the following formats:
svm: svmlight format (sklearn.datasets.load_svmlight_file)
numpy: pass X and y as numpy or sparse arrays
Assumes:
1. if classification, y is in {-1, 1} or continuous and 0-indexed
2. y can fit into memory
3. consecutive calls to __getitem__() have consecutive idx values
Notes:
1. this implementation is not careful w.r.t. precise memory requirements;
for example, being able to store one dense row in memory is necessary,
but not sufficient.
2. for y with 4.2 billion elements, 31.3 GB of memory is necessary
at 8 bytes/scalar. Use partial fit to avoid loading the entire dataset
at once.
3. disk_size always refers to the size of the complete data file, even
after a split().
Parameters
----------
path_data : str
Path to load data from
data_format : str
File ending for path data.
"numpy" is the default when passing in X and y
D : int
Number of features.
N : int
Number of rows.
classification : bool
If True, problem is classification, else regression.
ordinal: bool
If True, problem is ordinal classification. Requires classification to be True.
balanced : bool
If true, each class is weighted equally in optimization, otherwise
weighting is done via the support of each class. Requires classification to be True.
preprocess : str
'zscore' which refers to centering and normalizing data to unit variance or
'center' which only centers the data to 0 mean
n_to_estimate : int
Number of rows of data to estimate
MAXMEMGB : float
Maximum allowable size for a minibatch
set_params : bool
Whether or not to determine the statistics of the dataset
path_mappings : str
Used when streaming from disk
X : array-like
Shape = [n_samples, n_features]
The training input samples.
y : array-like
Shape = [n_samples]
The target values (class labels in classification, real numbers in
regression).
verbose : int
Controls the verbosity when fitting. Set to 0 for no printing;
1 or higher prints every `verbose` number of gradient steps.
device : str
'cpu' to run on CPU and 'cuda' to run on GPU. Runs much faster on GPU
n_classes : int
number of classes
"""
self.path_data = path_data
if self.path_data:
self.disk_size = os.path.getsize(path_data)
else:
assert X is not None, 'X must be specified if no path data'
self.disk_size = X.nbytes if not scipy.sparse.issparse(
X) else X.data.nbytes
assert data_format in constants.DataFormat.ALL_FORMATS, 'Format must be in {0}.'.format(
", ".join(constants.DataFormat.ALL_FORMATS))
self.format = data_format
self.classification = classification
self.ordinal = ordinal
self.balanced = balanced
self.MAXMEMGB = MAXMEMGB
self.preprocess = preprocess
self.set_params = set_params
self.verbose = verbose
self.n_classes = n_classes
self.device = device
self.path_data_stats = None
if D is None:
assert self.disk_size / BYTESPERGB <= self.MAXMEMGB, \
'Cannot load data into memory. Supply D.'
if self.format == constants.DataFormat.SVM:
self.X, self.y = load_svmlight_file(path_data)
elif self.format == constants.DataFormat.NUMPY:
assert X is not None, 'X must be specified in numpy mode'
assert y is not None, 'y must be specified in numpy mode'
self.X = X
self.y = y
if self.n_classes is None:
self.n_classes = np.unique(y).shape[0]
elif self.classification:
assert self.n_classes >= np.unique(y).shape[0], \
'n_classes given must be greater than or equal to the number of classes in y'
else:
raise NotImplementedError
self.y = torch.as_tensor(self.y, dtype=torch.get_default_dtype())
self.N, self.D = self.X.shape
# assumes X was returned as a sparse array
self.storage_level = (constants.StorageLevel.SPARSE
if scipy.sparse.issparse(self.X)
else constants.StorageLevel.DENSE)
else:
assert N is not None, 'Supply N.'
self.N, self.D = N, D
# assume sparse matrix cannot fit into memory
self.storage_level = constants.StorageLevel.DISK
self.dense_size_gb = self.get_dense_size()
# check dense size
self.set_dense_X()
self.max_rows = int(self.MAXMEMGB * BYTESPERGB / BYTESPERREAL / self.D)
assert self.max_rows, \
'Cannot fit one dense row into %d GB memory.' % self.MAXMEMGB
self.max_rows = self.max_batch_size()
sys.stdout.flush()
if n_to_estimate is None:
self.n_to_estimate = self.max_batch_size()
else:
assert n_to_estimate <= self.N, 'n_to_estimate must be <= N.'
self.n_to_estimate = n_to_estimate
# initialize disk loader
if self.storage_level == constants.StorageLevel.DISK and self.set_params:
if self.format == constants.DataFormat.SVM:
raise NotImplementedError(
'Please use partial fit to train on datasets that do not fit in memory')
else:
raise NotImplementedError
# TODO: use a passed-in RNG here
self.ix_statistics = np.random.permutation(self.N)[:self.n_to_estimate]
self.n_features = self.D
if self.set_params:
if self.verbose:
print('Finding data statistics...', end='')
sys.stdout.flush()
Xmn, sv1, Xsd, ymn, ysd = self.compute_data_stats()
self.set_data_stats(Xmn, sv1, Xsd, ymn, ysd)
if self.verbose:
print()
self.set_return_raw(False)
else:
self.set_return_raw(True)
self.set_return_np(False)
# this needs to occur after setting preprocessing params
if (self.storage_level == constants.StorageLevel.DISK and
self.format == constants.DataFormat.SVM and self.set_params):
self.loader.batchsize = 1
def get_dense_size(self):
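# Worked example: N = 1e6 rows and D = 1e3 features give
# 1e6 * 1e3 * 8 / 1024**3 ~= 7.45 GB; set_dense_X() only densifies
# when this value is at most MAXMEMGB.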
return self.N * self.D * BYTESPERREAL / BYTESPERGB
def set_dense_X(self):
if self.storage_level != constants.StorageLevel.DISK:
if self.dense_size_gb <= self.MAXMEMGB:
if self.storage_level == constants.StorageLevel.SPARSE:
self.X = self.X.toarray()
self.X = torch.as_tensor(
self.X, dtype=torch.get_default_dtype())
self.storage_level = constants.StorageLevel.DENSE
def set_return_np(self, boolean):
self.return_np = boolean
def set_return_raw(self, boolean):
self.return_raw = boolean
def save_data_stats(self, path_data_stats):
"""
Dumps dataset statistics to pickle file.
"""
data_stats = {
'Xmn': self.Xmn,
'sv1': self.sv1,
'Xsd': self.Xsd,
'ymn': self.ymn,
'ysd': self.ysd,
'ix_statistics': self.ix_statistics,
}
pickle.dump(data_stats, open(path_data_stats, 'wb'))
def load_data_stats(self, path_data_stats):
stats = pickle.load(open(path_data_stats, 'rb'))
self.path_data_stats = path_data_stats
self.set_data_stats(np.asarray(stats['Xmn']), stats['sv1'],
stats['Xsd'], stats['ymn'], stats['ysd'])
if self.storage_level == constants.StorageLevel.DISK and hasattr(
self, 'path_mappings'):
if 'ix_statistics' in stats:
self.ix_statistics = stats['ix_statistics']
else:
self.ix_statistics = range(self.N)
self.set_return_raw(False)
def reset(self):
"""
Resets the dataloader. Only implemented for disk StorageLevel.
"""
if self.storage_level == constants.StorageLevel.DENSE:
pass
elif self.storage_level == constants.StorageLevel.SPARSE:
pass
elif self.storage_level == constants.StorageLevel.DISK:
if self.format == constants.DataFormat.SVM:
self.loader.reset()
else:
raise NotImplementedError
def todense(self):
assert hasattr(self, 'Xmn'), 'Set preprocess params first.'
assert len(self) <= self.max_batch_size(), \
'N must be <= max_batch_size().'
with torch.no_grad():
dense, _ = self.split(range(len(self)))
Braw = self.return_raw
Bnp = self.return_np
self.set_return_raw(True)
self.set_return_np(True)
dense.X, dense.y = [], []
def f_Xy(X, y):
dense.X.append(X)
dense.y.append(y)
self.apply(f_Xy=f_Xy)
dense.X = dense.X[-1]
dense.y = dense.y[-1]
self.set_return_raw(Braw)
self.set_return_np(Bnp)
dense.storage_level = constants.StorageLevel.DENSE
return dense
def split(self, ix):
assert hasattr(self, 'Xmn'), 'Run set_preprocess_params() first.'
first = type(self)(
self.path_data,
self.format,
self.D,
N=len(ix),
classification=self.classification,
preprocess=self.preprocess,
n_to_estimate=None,
MAXMEMGB=self.MAXMEMGB,
set_params=False)
second = type(self)(
self.path_data,
self.format,
self.D,
N=self.N - len(ix),
classification=self.classification,
preprocess=self.preprocess,
n_to_estimate=None,
MAXMEMGB=self.MAXMEMGB,
set_params=False)
first.storage_level = self.storage_level
second.storage_level = self.storage_level
# copy preprocess params
if not self.classification:
first.ymn = self.ymn
second.ymn = self.ymn
first.ysd = self.ysd
second.ysd = self.ysd
first.Xmn = self.Xmn
second.Xmn = self.Xmn
first.sv1 = self.sv1
second.sv1 = self.sv1
if self.storage_level == constants.StorageLevel.DISK:
if self.format == constants.DataFormat.SVM:
first.Xsd = self.Xsd
second.Xsd = self.Xsd
else:
raise NotImplementedError
# initialize data structures
if self.storage_level == constants.StorageLevel.DISK:
if self.format == constants.DataFormat.SVM:
raise NotImplementedError
raise NotImplementedError
elif self.storage_level in [constants.StorageLevel.SPARSE,
constants.StorageLevel.DENSE]:
first.X, first.y = self.X[ix], self.y[ix]
ixsec = list(set(range(self.N)).difference(set(ix)))
second.X, second.y = self.X[ixsec], self.y[ixsec]
return first, second
@staticmethod
def sparse_std(X, X_mean):
"""
Calculate the column wise standard deviations of a sparse matrix.
"""
X_copy = X.copy()
X_copy.data **= 2 # square non zero elements
E_x_squared = np.array(X_copy.mean(axis=0)).ravel()
Xsd = np.sqrt(E_x_squared - X_mean**2)
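# Uses the identity Var[X] = E[X^2] - (E[X])^2, so the column-wise
# standard deviations are obtained without densifying the sparse matrix.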
return Xsd
def compute_data_stats(self):
"""
1. computes/estimates feature means
2. if preprocess == 'zscore', computes/estimates feature standard devs
3. if not classification, computes/estimates target mean/standard dev
4. estimates largest singular value of data matrix
"""
t = time.time()
X, y = self.X[self.ix_statistics], self.y[self.ix_statistics]
preprocess = self.preprocess
classification = self.classification
Xmn = (X.mean(dim=0)
if not scipy.sparse.issparse(X)
else np.array(X.mean(axis=0)).ravel())
if preprocess == constants.Preprocess.ZSCORE:
Xsd = (X.std(dim=0)
if not scipy.sparse.issparse(X)
else PrepareData.sparse_std(X, Xmn))
Xsd[Xsd == 0] = 1.
else:
Xsd = 1.
if preprocess is not None and preprocess:
if preprocess == constants.Preprocess.ZSCORE:
Xc = (X - Xmn) / Xsd
else:
Xc = X - Xmn
else:
Xc = X - Xmn
sv1 = scipy.sparse.linalg.svds(Xc / (
torch.sqrt(torch.prod(torch.as_tensor(y.size(), dtype=torch.get_default_dtype())))
if not scipy.sparse.issparse(X) else y.numpy().size),
k=1,
which='LM',
return_singular_vectors=False)
# avoid runaway sv1
sv1 = np.array([min(np.finfo(np.float32).max,
sv1[0])])
if not classification:
ymn = y.mean()
ysd = y.std()
else:
# TODO: set these, for each class?
ymn = 0.
ysd = 1.
if self.verbose:
print(" computing data statistics took: ", time.time() - t)
return Xmn, sv1, Xsd, ymn, ysd
def set_data_stats(self, Xmn, sv1, Xsd=1., ymn=0., ysd=1.):
"""
Saves dataset stats to self to be used for preprocessing.
"""
self.Xmn = torch.as_tensor(
Xmn, dtype=torch.get_default_dtype()).to(self.device)
self.sv1 = torch.as_tensor(
sv1, dtype=torch.get_default_dtype()).to(self.device)
self.Xsd = torch.as_tensor(
Xsd, dtype=torch.get_default_dtype()).to(self.device)
self.ymn = torch.as_tensor(
ymn, dtype=torch.get_default_dtype()).to(self.device)
self.ysd = torch.as_tensor(
ysd, dtype=torch.get_default_dtype()).to(self.device)
def apply_preprocess(self, X, y):
"""
Runs faster on a GPU device, since dataloading takes up a large portion of the time.
"""
with torch.no_grad():
if not self.classification:
y = (y.reshape((-1, 1)) - self.ymn) / self.ysd
else:
y = y.reshape((-1, 1))
X = (X - self.Xmn) / self.sv1
if self.preprocess == constants.Preprocess.ZSCORE:
X /= self.Xsd
return X, y
def max_batch_size(self):
"""
Return the maximum batchsize for the dataset.
"""
return int(np.min([self.max_rows, self.N]))
def apply(self, ix_rows=None, ix_cols=None, f_Xy=None):
if f_Xy is None:
return
if ix_rows is None:
ix_rows = range(self.N)
if ix_cols is None:
ix_cols = range(self.n_features)
f_Xy((self.X[ix_rows, ix_cols]
if not self.storage_level == constants.StorageLevel.SPARSE
else self.X[ix_rows, ix_cols].toarray()), self.y[ix_rows])
def get_dense_data(self, ix_cols=None, ix_rows=None):
if ix_cols is None:
ix_cols = range(self.n_features)
X = [np.zeros((0, len(ix_cols)))]
y = [np.zeros((0, 1))]
Bnp = self.return_np
def f_Xy(Xb, yb):
X[-1] = np.concatenate((X[-1], np.asarray(Xb)), axis=0)
y[-1] = np.concatenate((y[-1], np.asarray(yb).reshape(-1, 1)), axis=0)
self.apply(f_Xy=f_Xy, ix_rows=ix_rows, ix_cols=ix_cols)
self.set_return_np(Bnp)
return X[-1], y[-1]
def __len__(self):
return self.N
def getXy(self, idx):
if self.storage_level == constants.StorageLevel.DENSE:
X, y = self.X[idx], self.y[idx]
elif self.storage_level == constants.StorageLevel.SPARSE:
# assume the subset can fit into memory even if the whole matrix can't
X, y = self.X[idx].toarray(), self.y[idx]
else:
raise NotImplementedError
return X, y
def __getitem__(self, idx):
with torch.no_grad():
X, y = self.getXy(idx)
X = X.toarray() if scipy.sparse.issparse(X) else X
X = torch.as_tensor(
X, dtype=torch.get_default_dtype()).to(self.device)
y = torch.as_tensor(
y, dtype=torch.get_default_dtype()).to(self.device)
if not self.return_raw:
X, y = self.apply_preprocess(X, y)
if self.classification and (
self.n_classes is None or self.n_classes == 2):
y[y == 0] = -1
if self.return_np:
if constants.Device.CPU not in self.device:
X = X.cpu()
y = y.cpu()
X = X.numpy()
y = y.numpy()
return X, y
return X, y
class ChunkDataLoader(DataLoader):
"""
DataLoader class used to more quickly load a batch of indices at once.
"""
def __iter__(self):
return _ChunkDataLoaderIter(self)
class _ChunkDataLoaderIter(_DataLoaderIter):
"""
DataLoaderIter class used to more quickly load a batch of indices at once.
"""
def __next__(self):
# the only part changed from the base class implementation
if self.num_workers == 0: # same-process loading
indices = next(self.sample_iter) # may raise StopIteration
if len(indices) > 1:
batch = self.dataset[np.array(indices)]
else:
batch = self.collate_fn([self.dataset[i] for i in indices])
if self.pin_memory:
batch = _utils.pin_memory.pin_memory_batch(batch)
return batch
# check if the next sample has already been generated
if self.rcvd_idx in self.reorder_dict:
batch = self.reorder_dict.pop(self.rcvd_idx)
return self._process_next_batch(batch)
if self.batches_outstanding == 0:
self._shutdown_workers()
raise StopIteration
while True:
assert (not self.shutdown and self.batches_outstanding > 0)
idx, batch = self._get_batch()
self.batches_outstanding -= 1
if idx != self.rcvd_idx:
# store out-of-order samples
self.reorder_dict[idx] = batch
continue
return self._process_next_batch(batch)
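# Standalone sketch of the chunk-indexing idea behind ChunkDataLoader
# (toy dataset, not the classes above): a dataset whose __getitem__ accepts
# an index array can return a whole batch in a single call, which avoids
# the per-row collation that a standard DataLoader would do.
if __name__ == '__main__':
    class _ArrayDataset:
        def __init__(self, X, y):
            self.X, self.y = X, y
        def __len__(self):
            return len(self.X)
        def __getitem__(self, idx):  # idx may be an int or an index array
            return self.X[idx], self.y[idx]

    ds = _ArrayDataset(np.arange(20).reshape(10, 2), np.arange(10))
    Xb, yb = ds[np.array([3, 1, 4])]  # one call fetches the whole chunk
    print(Xb.shape, yb.shape)         # (3, 2) (3,)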
# Copyright (c) Microsoft Corporation. All rights reserved.
# MIT License
# ==================================================================================================
import time
import numpy as np
import torch
from sklearn.feature_selection import SelectKBest, \
f_classif, mutual_info_classif, f_regression, mutual_info_regression
import nni.feature_engineering.gradient_selector.constants as constants
import nni.feature_engineering.gradient_selector.syssettings as syssettings
from nni.feature_engineering.gradient_selector.learnability import Solver
from nni.feature_engineering.gradient_selector.utils import EMA
torch.set_default_tensor_type(syssettings.torch.tensortype)
def get_optim_f_stop(maxiter, maxtime, dftol_stop, freltol_stop,
minibatch=True):
"""
Build an f_stop closure that checks the stopping conditions.
"""
discount_factor = 1. / 3
total_t = [0.]
df_store = [np.nan]
it_store = [0]
relchange_store = [np.nan]
f_ma = EMA(discount_factor=discount_factor)
df_ma = EMA(discount_factor=discount_factor)
def f_stop(f0, v0, it, t):
flag_stop = False
total_t[-1] += t
g = f0.x.grad.clone().cpu().detach()
df = g.abs().max().numpy().squeeze()
v = v0.clone().cpu().detach()
f = v.numpy().squeeze()
if it >= maxiter:
flag_stop = True
elif total_t[-1] >= maxtime:
flag_stop = True
f_ma.update(f)
df_ma.update(df)
rel_change = f_ma.relchange()
if ((not minibatch) and (df < dftol_stop)) \
or (minibatch and (df_ma() < dftol_stop)):
flag_stop = True
if rel_change < freltol_stop:
flag_stop = True
if not minibatch:
df_store[-1] = df
else:
df_store[-1] = df_ma()
relchange_store[-1] = rel_change
it_store[-1] = it
return flag_stop
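# Note: f_stop closes over the mutable lists defined above, so the caller
# can read the latest iteration, gradient, and relative-change values
# through the dict returned alongside it while training is in progress.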
return f_stop, {'t': total_t, 'it': it_store, 'df': df_store,
'relchange': relchange_store}
def get_init(data_train, init_type='on', rng=np.random.RandomState(0), prev_score=None):
"""
Initialize the 'x' variable with different settings
"""
D = data_train.n_features
value_off = constants.Initialization.VALUE_DICT[
constants.Initialization.OFF]
value_on = constants.Initialization.VALUE_DICT[
constants.Initialization.ON]
if prev_score is not None:
x0 = prev_score
elif not isinstance(init_type, str):
x0 = value_off * np.ones(D)
x0[init_type] = value_on
elif init_type.startswith(constants.Initialization.RANDOM):
d = int(init_type.replace(constants.Initialization.RANDOM, ''))
x0 = value_off * np.ones(D)
x0[rng.permutation(D)[:d]] = value_on
elif init_type == constants.Initialization.SKLEARN:
B = data_train.return_raw
X, y = data_train.get_dense_data()
data_train.set_return_raw(B)
res = train_sk_dense(init_type, X, y, data_train.classification)
x0 = value_off * np.ones(D)
x0[res['feats']] = value_on
elif init_type in constants.Initialization.VALUE_DICT:
x0 = constants.Initialization.VALUE_DICT[init_type] * np.ones(D)
else:
raise NotImplementedError(
'init_type {0} not supported yet'.format(init_type))
# pylint: disable=E1102
return torch.tensor(x0.reshape((-1, 1)),
dtype=torch.get_default_dtype())
def get_checkpoint(S, stop_conds, rng=None, get_state=True):
"""
Save the necessary information into a dictionary
"""
m = {}
m['ninitfeats'] = S.ninitfeats
m['x0'] = S.x0
x = S.x.clone().cpu().detach()
m['feats'] = np.where(x.numpy() >= 0)[0]
m.update({k: v[0] for k, v in stop_conds.items()})
if get_state:
m.update({constants.Checkpoint.MODEL: S.state_dict(),
constants.Checkpoint.OPT: S.opt_train.state_dict(),
constants.Checkpoint.RNG: torch.get_rng_state(),
})
if rng:
m.update({'rng_state': rng.get_state()})
return m
def _train(data_train, Nminibatch, order, C, rng, lr_train, debug, maxiter,
maxtime, init, dftol_stop, freltol_stop, dn_log, accum_steps,
path_save, shuffle, device=constants.Device.CPU,
verbose=1,
prev_checkpoint=None,
groups=None,
soft_groups=None):
"""
Main training loop.
"""
t_init = time.time()
x0 = get_init(data_train, init, rng)
if isinstance(init, str) and init == constants.Initialization.ZERO:
ninitfeats = -1
else:
ninitfeats = np.where(x0.detach().numpy() > 0)[0].size
S = Solver(data_train, order,
Nminibatch=Nminibatch, x0=x0, C=C,
ftransform=lambda x: torch.sigmoid(2 * x),
get_train_opt=lambda p: torch.optim.Adam(p, lr_train),
rng=rng,
accum_steps=accum_steps,
shuffle=shuffle,
groups=groups,
soft_groups=soft_groups,
device=device,
verbose=verbose)
S = S.to(device)
S.ninitfeats = ninitfeats
S.x0 = x0
if prev_checkpoint:
S.load_state_dict(prev_checkpoint[constants.Checkpoint.MODEL])
S.opt_train.load_state_dict(prev_checkpoint[constants.Checkpoint.OPT])
torch.set_rng_state(prev_checkpoint[constants.Checkpoint.RNG])
minibatch = S.Ntrain != S.Nminibatch
f_stop, stop_conds = get_optim_f_stop(maxiter, maxtime, dftol_stop,
freltol_stop, minibatch=minibatch)
# a debug callback (e.g. tensorboard logging) could be attached here
f_callback = None
stop_conds['t'][-1] = time.time() - t_init
S.train(f_stop=f_stop, f_callback=f_callback)
return get_checkpoint(S, stop_conds, rng), S
def train_sk_dense(ty, X, y, classification):
if classification:
if ty.startswith('skf'):
d = int(ty.replace('skf', ''))
f_sk = f_classif
elif ty.startswith('skmi'):
d = int(ty.replace('skmi', ''))
f_sk = mutual_info_classif
else:
raise NotImplementedError('Unsupported sklearn init type {0}'.format(ty))
else:
if ty.startswith('skf'):
d = int(ty.replace('skf', ''))
f_sk = f_regression
elif ty.startswith('skmi'):
d = int(ty.replace('skmi', ''))
f_sk = mutual_info_regression
else:
raise NotImplementedError('Unsupported sklearn init type {0}'.format(ty))
t = time.time()
clf = SelectKBest(f_sk, k=d)
clf.fit_transform(X, y.squeeze())
ix = np.argsort(-clf.scores_)
ix = ix[np.where(np.invert(np.isnan(clf.scores_[ix])))[0]][:d]
t = time.time() - t
return {'feats': ix, 't': t}
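# Sketch of get_init's initialization modes using a hypothetical stub
# (only n_features is consulted on the paths shown):
if __name__ == '__main__':
    class _Stub:
        n_features = 5
    x_zero = get_init(_Stub(), init_type=constants.Initialization.ZERO)
    x_rand2 = get_init(_Stub(), init_type='random2')  # two features set "on"
    print(x_zero.squeeze().numpy(), x_rand2.squeeze().numpy())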
# Copyright (c) Microsoft Corporation. All rights reserved.
# MIT License
# ==================================================================================================
import time
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.feature_selection.base import SelectorMixin
from sklearn.utils.validation import check_is_fitted
import torch
from nni.feature_engineering.feature_selector import FeatureSelector
import nni.feature_engineering.gradient_selector.constants as constants
from nni.feature_engineering.gradient_selector.fginitialize import PrepareData
from nni.feature_engineering.gradient_selector.fgtrain import _train
class FeatureGradientSelector(FeatureSelector, BaseEstimator, SelectorMixin):
def __init__(self,
order=4,
penalty=1,
n_features=None,
max_features=None,
learning_rate=1e-1,
init='zero',
n_epochs=1,
shuffle=True,
batch_size=1000,
target_batch_size=1000,
max_time=np.inf,
classification=True,
ordinal=False,
balanced=True,
preprocess='zscore',
soft_grouping=False,
verbose=0,
device='cpu'):
"""
FeatureGradientSelector is a class that selects features for a machine
learning model using a gradient-based search.
Parameters
----------
order : int
What order of interactions to include. Higher orders
may be more accurate but increase the run time. 12 is the maximum allowed order.
penalty : int
Constant that multiplies the regularization term.
n_features: int
If None, will automatically choose number of features based on search.
Otherwise, number of top features to select.
max_features : int
If not None, will use the 'elbow method' to determine the number of features
with max_features as the upper limit.
learning_rate : float
Learning rate for the gradient-based optimizer.
init : str
How to initialize the vector of scores. 'zero' is the default.
Options: {'zero', 'on', 'off', 'onhigh', 'offhigh', 'sklearn'}
n_epochs : int
number of epochs to run
shuffle : bool
Shuffle "rows" prior to an epoch.
batch_size : int
Nnumber of "rows" to process at a time
target_batch_size : int
Number of "rows" to accumulate gradients over.
Useful when many rows will not fit into memory but are needed for accurate estimation.
classification : bool
If True, problem is classification, else regression.
ordinal : bool
If True, problem is ordinal classification. Requires classification to be True.
balanced : bool
If true, each class is weighted equally in optimization, otherwise
weighting is done via the support of each class. Requires classification to be True.
preprocess : str
'zscore' which refers to centering and normalizing data to unit variance or
'center' which only centers the data to 0 mean
soft_grouping : bool
if True, groups represent features that come from the same source.
Used to encourage sparsity of groups and features within groups.
verbose : int
Controls the verbosity when fitting. Set to 0 for no printing;
1 or higher prints every `verbose` number of gradient steps.
device : str
'cpu' to run on CPU and 'cuda' to run on GPU. Runs much faster on GPU
"""
assert order <= 12 and order >= 1, 'order must be an integer between 1 and 12, inclusive'
assert n_features is None or max_features is None, \
'only specify one of n_features and max_features at a time'
self.order = order
self.penalty = penalty
self.n_features = n_features
self.max_features = max_features
self.learning_rate = learning_rate
self.init = init
self.n_epochs = n_epochs
self.shuffle = shuffle
self.batch_size = batch_size
self.target_batch_size = target_batch_size
self.max_time = max_time
self.dftol_stop = -1
self.freltol_stop = -1
self.classification = classification
self.ordinal = ordinal
self.balanced = balanced
self.preprocess = preprocess
self.soft_grouping = soft_grouping
self.verbose = verbose
self.device = device
self.model_ = None
self.scores_ = None
self._prev_checkpoint = None
self._data_train = None
def partial_fit(self, X, y,
n_classes=None,
groups=None):
"""
Select features via a gradient-based search on the given samples (X, y).
Can be called repeatedly with different X and y to handle streaming datasets.
Parameters
----------
X : array-like
Shape = [n_samples, n_features]
The training input samples.
y : array-like
Shape = [n_samples]
The target values (class labels in classification, real numbers in
regression).
n_classes : int
Number of classes
Classes across all calls to partial_fit.
Can be obtained via `np.unique(y_all).shape[0]`, where y_all is the
target vector of the entire dataset.
This argument is expected on the first call to partial_fit;
otherwise all classes are assumed to be present in the given batch of y.
It will be ignored in the subsequent calls.
Note that y doesn't need to contain all labels in `classes`.
groups : array-like
Optional, shape = [n_features]
Groups of columns that must be selected as a unit
e.g. [0, 0, 1, 2] specifies the first two columns are part of a group.
This argument is expected on the first call to partial_fit
and will be ignored in subsequent calls.
"""
try:
self._partial_fit(X, y, n_classes=n_classes, groups=groups)
except constants.NanError:
if hasattr(self, '_prev_checkpoint'):
# if it's already done some batches successfully just ignore it
print('failed fitting this batch, loss was nan')
else:
# if this is the first batch, reset and try with doubles
if self.verbose:
print('Loss was nan, trying with Doubles')
self._reset()
torch.set_default_tensor_type(torch.DoubleTensor)
self._partial_fit(X, y, n_classes=n_classes, groups=groups)
return self
def _partial_fit(self, X, y, n_classes=None, groups=None):
"""
Private function for partial_fit to enable trying floats before doubles.
"""
# pass in X and y in chunks
if hasattr(self, '_data_train'):
# just overwrite the X and y from the new chunk but make them tensors
# keep dataset stats from previous
self._data_train.X = X.values if isinstance(X, pd.DataFrame) else X
self._data_train.N, self._data_train.D = self._data_train.X.shape
self._data_train.dense_size_gb = self._data_train.get_dense_size()
self._data_train.set_dense_X()
self._data_train.y = y.values if isinstance(y, pd.Series) else y
self._data_train.y = torch.as_tensor(
y, dtype=torch.get_default_dtype())
else:
data_train = self._prepare_data(X, y, n_classes=n_classes)
self._data_train = data_train
batch_size, _, accum_steps, max_iter = self._set_batch_size(
self._data_train)
rng = None # not used
debug = 0 # {0,1} print messages and do other stuff?
dn_logs = None # tensorboard logs; only specify if debug=1
path_save = None # intermediate models saves; only specify if debug=1
m, solver = _train(self._data_train,
batch_size,
self.order,
self.penalty,
rng,
self.learning_rate,
debug,
max_iter,
self.max_time,
self.init,
self.dftol_stop,
self.freltol_stop,
dn_logs,
accum_steps,
path_save,
self.shuffle,
device=self.device,
verbose=self.verbose,
prev_checkpoint=self._prev_checkpoint if hasattr(
self, '_prev_checkpoint') else None,
groups=groups if not self.soft_grouping else None,
soft_groups=groups if self.soft_grouping else None)
self._prev_checkpoint = m
self._process_results(m, solver, X, groups=groups)
return self
def fit(self, X, y,
groups=None):
"""
Select features via a gradient-based search on (X, y).
Parameters
----------
X : array-like
Shape = [n_samples, n_features]
The training input samples.
y : array-like
Shape = [n_samples]
The target values (class labels in classification, real numbers in
regression).
groups : array-like
Optional, shape = [n_features]
Groups of columns that must be selected as a unit
e.g. [0, 0, 1, 2] specifies the first two columns are part of a group.
"""
try:
self._fit(X, y, groups=groups)
except constants.NanError:
if self.verbose:
print('Loss was nan, trying with Doubles')
torch.set_default_tensor_type(torch.DoubleTensor)
self._fit(X, y, groups=groups)
return self
def get_selected_features(self):
return self.selected_features_
def _prepare_data(self, X, y, n_classes=None):
"""
Returns a PrepareData object.
"""
return PrepareData(X=X.values if isinstance(X, pd.DataFrame) else X,
y=y.values if isinstance(y, pd.Series) else y,
data_format=constants.DataFormat.NUMPY,
classification=int(self.classification),
ordinal=self.ordinal,
balanced=self.balanced,
preprocess=self.preprocess,
verbose=self.verbose,
device=self.device,
n_classes=n_classes)
def _fit(self, X, y, groups=None):
"""
Private function for fit to enable trying floats before doubles.
"""
data_train = self._prepare_data(X, y)
batch_size, _, accum_steps, max_iter = self._set_batch_size(
data_train)
rng = None # not used
debug = 0 # {0,1} print messages and log to tensorboard
dn_logs = None # tensorboard logs; only specify if debug=1
path_save = None # intermediate models saves; only specify if debug=1
m, solver = _train(data_train,
batch_size,
self.order,
self.penalty,
rng,
self.learning_rate,
debug,
max_iter,
self.max_time,
self.init,
self.dftol_stop,
self.freltol_stop,
dn_logs,
accum_steps,
path_save,
self.shuffle,
device=self.device,
verbose=self.verbose,
groups=groups if not self.soft_grouping else None,
soft_groups=groups if self.soft_grouping else None)
self._process_results(m, solver, X, groups=groups)
return self
def _process_torch_scores(self, scores):
"""
Convert scores into flat numpy arrays.
"""
if constants.Device.CUDA in scores.device.type:
scores = scores.cpu()
return scores.numpy().ravel()
def _set_batch_size(self, data_train):
"""
Ensures that batch_size is less than the number of rows.
"""
batch_size = min(self.batch_size, data_train.N)
target_batch_size = min(max(
self.batch_size, self.target_batch_size), data_train.N)
accum_steps = max(int(np.ceil(target_batch_size / self.batch_size)), 1)
max_iter = self.n_epochs * (data_train.N // batch_size)
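# Worked example (hypothetical numbers): batch_size=1000, target_batch_size=4000
# and N=100000 give accum_steps = ceil(4000 / 1000) = 4 gradient-accumulation
# steps and max_iter = n_epochs * (100000 // 1000) = 100 * n_epochs.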
return batch_size, target_batch_size, accum_steps, max_iter
def _process_results(self, m, solver, X, groups=None):
"""
Process the results of a run into something suitable for transform().
"""
self.scores_ = self._process_torch_scores(
torch.sigmoid(m[constants.Checkpoint.MODEL]['x'] * 2))
if self.max_features:
self.max_features = min([self.max_features, self.scores_.shape[0]])
n_features = self._recommend_number_features(solver)
self.set_n_features(n_features, groups=groups)
elif self.n_features:
self.set_n_features(self.n_features, groups=groups)
else:
self.selected_features_ = m['feats']
# subtract elapsed time from max_time
self.max_time -= m['t']
self.model_ = m
return self
def transform(self, X):
"""
Returns selected features from X.
Parameters
----------
X: array-like
Shape = [n_samples, n_features]
The training input samples.
"""
self._get_support_mask()
if self.selected_features_.shape[0] == 0:
raise ValueError(
'No Features selected, consider lowering the penalty or specifying n_features')
return (X.iloc[:, self.selected_features_]
if isinstance(X, pd.DataFrame)
else X[:, self.selected_features_])
def get_support(self, indices=False):
"""
Get a mask, or integer index, of the features selected.
Parameters
----------
indices : bool
Default False
If True, the return value will be an array of integers, rather than a boolean mask.
Returns
-------
list :
returns support: An index that selects the retained features from a feature vector.
If indices is False, this is a boolean array of shape [# input features],
in which an element is True iff its corresponding feature is selected for retention.
If indices is True, this is an integer array of shape [# output features] whose values
are indices into the input feature vector.
"""
self._get_support_mask()
if indices:
return self.selected_features_
mask = np.zeros_like(self.scores_, dtype=bool)
# pylint: disable=E1137
mask[self.selected_features_] = True
return mask
def inverse_transform(self, X):
"""
Returns X expanded back to the original number of columns.
This operation is lossy: all columns not in the transformed data
are returned as columns of 0s.
"""
self._get_support_mask()
X_new = np.zeros((X.shape[0], self.scores_.shape[0]))
X_new[:, self.selected_features_] = X
return X_new
def get_params(self, deep=True):
"""
Get parameters for this estimator.
"""
params = self.__dict__
params = {key: val for (key, val) in params.items()
if not key.endswith('_')}
return params
def set_params(self, **params):
"""
Set the parameters of this estimator.
"""
for param in params:
if hasattr(self, param):
setattr(self, param, params[param])
return self
def fit_transform(self, X, y):
"""
Select features and then return X with the selected features.
Parameters
----------
X : array-like
Shape = [n_samples, n_features]
The training input samples.
y : array-like
Shape = [n_samples]
The target values (class labels in classification, real numbers in
regression).
"""
self.fit(X, y)
return self.transform(X)
def _get_support_mask(self):
"""
Check if it is fitted.
"""
check_is_fitted(self, 'scores_')
def _generate_scores(self, solver, xsub, ysub, step_size, feature_order):
"""
Generate forward passes to determine the number of features when max_features is set.
"""
scores = []
for i in np.arange(1, self.max_features + 1, step_size):
# optimization possible since xsub is growing?
i = int(np.ceil(i))
# pylint: disable=E1102
score = solver.f_train(torch.tensor(np.ones(i),
dtype=torch.get_default_dtype()
).unsqueeze(1).to(self.device),
xsub[:, feature_order[:i]],
ysub)
if constants.Device.CUDA in score.device.type:
score = score.cpu()
# score.numpy()[0][0]
scores.append(score)
return scores
def set_n_features(self, n, groups=None):
"""
Set the number of features to return after fitting.
"""
self._get_support_mask()
self.n_features = n
return self._set_top_features(groups=groups)
def _set_top_features(self, groups=None):
"""
Set the selected features after a run.
With groups, ensures that if any member of a group is selected, all members are selected
"""
self._get_support_mask()
assert self.n_features <= self.scores_.shape[0], \
'n_features must be less than or equal to the number of columns in X'
# pylint: disable=E1130
self.selected_features_ = np.argpartition(
self.scores_, -self.n_features)[-self.n_features:]
if groups is not None and not self.soft_grouping:
selected_feature_set = set(self.selected_features_.tolist())
for group in np.unique(groups):
group_members = np.where(groups == group)[0].tolist()
if selected_feature_set.intersection(group_members):
selected_feature_set.update(group_members)
self.selected_features_ = np.array(list(selected_feature_set))
self.selected_features_ = np.sort(self.selected_features_)
return self
def set_top_percentile(self, percentile, groups=None):
"""
Set the percentile of features to return after fitting.
"""
self._get_support_mask()
assert percentile <= 1 and percentile >= 0, \
'percentile must be between 0 and 1 inclusive'
self.n_features = int(self.scores_.shape[0] * percentile)
return self._set_top_features(groups=groups)
def _recommend_number_features(self, solver, max_time=None):
"""
Get the recommended number of features by doing forward passes when max_features is set.
"""
max_time = max_time if max_time else self.max_time
if max_time < 0:
max_time = 60 # allow 1 minute extra if we already spent max_time
MAX_FORWARD_PASS = 200
# The forward passes can take longer than the fitting if a full epoch of
# data is included; capping at 3 full batches gives enough accuracy
# without increasing the time too much. This constant may not be optimal.
MAX_FULL_BATCHES = 3
accum_steps = solver.accum_steps
step_size = max(self.max_features / MAX_FORWARD_PASS, 1)
# pylint: disable=E1130
feature_order = np.argsort(-self.scores_) # note the negative
t = time.time()
dataloader_iterator = iter(solver.ds_train)
full_scores = []
with torch.no_grad():
# might want to only consider a batch valid if there are at least
# two classes
for _ in range(accum_steps * MAX_FULL_BATCHES):
scores = []
try:
xsub, ysub = next(dataloader_iterator)
except StopIteration:
# done with epoch, don't do more than one epoch
break
except Exception as e:
print(e)
break
if max_time and time.time() - t > max_time:
if self.verbose:
print(
"Stoppinn forward passes because they reached max_time: ",
max_time)
if not full_scores:
# no forward passes worked, return half of max_features
return self.max_features // 2
break
if solver.multiclass:
for target_class in range(solver.n_classes):
ysub_binary = solver.transform_y_into_binary(
ysub, target_class)
scaling_value = solver._get_scaling_value(
ysub, target_class)
if not solver._skip_y_forward(ysub_binary):
scores = self._generate_scores(
solver, xsub, ysub_binary, step_size, feature_order)
# one row will represent one class that is present in the data
# all classes are weighted equally
full_scores.append(
[score * scaling_value for score in scores])
else:
if not solver._skip_y_forward(ysub):
scores = self._generate_scores(
solver, xsub, ysub, step_size, feature_order)
full_scores.append(scores)
best_index = FeatureGradientSelector._find_best_index_elbow(
full_scores)
if self.verbose:
print("Forward passes took: ", time.time() - t)
        # account for step size and the off-by-one (n_features is 1-indexed, not 0-indexed)
return int(
np.ceil(
np.arange(
1,
self.max_features +
1,
step_size))[best_index])
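    # A small sketch of the index mapping above: with max_features=10 capped
    # at 4 forward passes, step_size = max(10 / 4, 1) = 2.5 and the candidate
    # feature counts are the ceilings of the arange grid.
    #
    #     import numpy as np
    #     candidates = np.ceil(np.arange(1, 11, 2.5))  # [1., 4., 6., 9.]
    #     n_features = int(candidates[2])              # best_index=2 -> 6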
@staticmethod
def _find_best_index_elbow(full_scores):
"""
Finds the point on the curve that maximizes distance from the line determined by the endpoints.
"""
scores = pd.DataFrame(full_scores).mean(0).values.tolist()
first_point = np.array([0, scores[0]])
last_point = np.array([len(scores) - 1, scores[-1]])
elbow_metric = []
for i in range(len(scores)):
elbow_metric.append(
FeatureGradientSelector._distance_to_line(
first_point, last_point, np.array([i, scores[i]])))
return np.argmax(elbow_metric)
@staticmethod
def _distance_to_line(start_point, end_point, new_point):
"""
Calculates the shortest distance from new_point to the line determined by start_point and end_point.
"""
# for calculating elbow method
return np.cross(new_point - start_point,
end_point - start_point) / np.linalg.norm(
end_point - start_point)
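    # A quick numeric check of _distance_to_line: with endpoints (0, 0) and
    # (4, 0), the point (2, 3) lies 3 units from the line. Note np.cross on
    # 2-D vectors returns the scalar z-component, so the value is signed
    # (-3.0 here); the elbow search argmaxes over these values as-is.
    #
    #     import numpy as np
    #     d = np.cross(np.array([2, 3]), np.array([4, 0])) / np.linalg.norm([4, 0])
    #     assert d == -3.0 and abs(d) == 3.0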
def _reset(self):
"""
Reset the estimator by deleting all private and fit parameters.
"""
        # iterate over a copy of the keys: delattr mutates self.__dict__
        for key in list(self.__dict__.keys()):
            if key.endswith('_') or key.startswith('_'):
                delattr(self, key)
return self
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
# ==================================================================================================
import time
import numpy as np
import scipy.special
import torch
import torch.nn as nn
import nni.feature_engineering.gradient_selector.constants as constants
import nni.feature_engineering.gradient_selector.syssettings as syssettings
from nni.feature_engineering.gradient_selector.fginitialize import ChunkDataLoader
torch.set_default_tensor_type(syssettings.torch.tensortype)
sparsetensor = syssettings.torch.sparse.tensortype
def def_train_opt(p):
"""
Return the default optimizer.
"""
return torch.optim.Adam(p, 1e-1, amsgrad=False)
def revcumsum(U):
"""
Reverse cumulative sum for faster performance.
"""
return U.flip(dims=[0]).cumsum(dim=0).flip(dims=[0])
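# A minimal check of revcumsum: entry i holds the sum of U[i:], i.e. the
# reverse of the cumulative sum of the reversed tensor.
#
#     import torch
#     U = torch.tensor([1., 2., 3.])
#     assert torch.equal(revcumsum(U), torch.tensor([6., 5., 3.]))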
def triudr(X, r):
    """
    For each row i, compute X[i] * sum_{j > i} (X[j] * r[j]):
    the strictly upper-triangular pairwise terms, multiplied on the right.
    """
    Zr = torch.zeros_like(X, requires_grad=False)
    U = X * r
    Zr[:-1] = X[:-1] * revcumsum(U)[1:]
    return Zr
def triudl(X, l):
    """
    For each row i, compute X[i] * sum_{j < i} (X[j] * l[j]):
    the strictly lower-triangular pairwise terms, multiplied on the left.
    """
    Zl = torch.zeros_like(X, requires_grad=False)
    U = X * l
    Zl[1:] = X[1:] * (U.cumsum(dim=0)[:-1])
    return Zl
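# A small equivalence check for triudr (a sketch with toy shapes): each output
# row pairs row i of X with the weighted sum of the rows strictly below it.
#
#     import torch
#     X = torch.arange(6.).reshape(3, 2)
#     Zr = triudr(X, torch.ones(3, 1))
#     assert torch.equal(Zr[0], X[0] * (X[1] + X[2]))   # row 0 pairs with rows 1, 2
#     assert torch.equal(Zr[-1], torch.zeros(2))        # last row has no rows below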
class ramp(torch.autograd.Function):
"""
Ensures input is between 0 and 1
"""
@staticmethod
def forward(ctx, input_data):
ctx.save_for_backward(input_data)
return input_data.clamp(min=0, max=1)
@staticmethod
def backward(ctx, grad_output):
input_data, = ctx.saved_tensors
grad_input = grad_output.clone()
grad_input[input_data < 0] = 1e-2
grad_input[input_data > 1] = -1e-2
return grad_input
class safesqrt(torch.autograd.Function):
"""
    Square root whose backward pass avoids dividing by zero at the origin.
"""
@staticmethod
def forward(ctx, input_data):
o = input_data.sqrt()
ctx.save_for_backward(input_data, o)
return o
@staticmethod
def backward(ctx, grad_output):
_, o = ctx.saved_tensors
grad_input = grad_output.clone()
grad_input *= 0.5 / (o + constants.EPSILON)
return grad_input
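# A short sketch of the custom gradients above: ramp clamps the forward pass
# and replaces the zero gradient outside [0, 1] with a small push back toward
# the feasible region; safesqrt keeps the backward pass finite at zero.
#
#     import torch
#     x = torch.tensor([-0.5, 0.5, 1.5], requires_grad=True)
#     ramp.apply(x).sum().backward()
#     assert torch.allclose(x.grad, torch.tensor([1e-2, 1., -1e-2]))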
class LearnabilityMB(nn.Module):
"""
    Calculates the learnability of a set of features.
    Mini-batch version using "left" and "right" multiplies.
"""
def __init__(self, Nminibatch, D, coeff, groups=None, binary=False,
device=constants.Device.CPU):
super(LearnabilityMB, self).__init__()
a = coeff / scipy.special.binom(Nminibatch, np.arange(coeff.size) + 2)
self.order = a.size
# pylint: disable=E1102
self.a = torch.tensor(a, dtype=torch.get_default_dtype(), requires_grad=False)
self.binary = binary
self.a = self.a.to(device)
def ret_val(self, z):
"""
Get the return value based on z.
"""
if not self.binary:
return 1 - z
else:
return 0.5 * (1 - safesqrt.apply(ramp.apply(z)))
def forward(self, s, X, y):
l = y.clone()
r = y.clone()
z = 0
for i in range(self.order):
if i % 2 == 0:
Z = triudr(X, r)
r = torch.mm(Z, s)
else:
Z = triudl(X, l)
l = torch.mm(Z, s)
if self.a[i] != 0:
                # skip the computation when a[i] is 0
p = torch.mm(l.t(), r)
z += self.a[i] * p
return self.ret_val(z)
class Solver(nn.Module):
"""
Class that performs the main optimization.
Keeps track of the current x and iterates through data to learn x given the penalty and order.
"""
def __init__(self,
PreparedData,
order,
Nminibatch=None,
groups=None,
soft_groups=None,
x0=None,
C=1,
ftransform=torch.sigmoid,
get_train_opt=def_train_opt,
accum_steps=1,
rng=np.random.RandomState(0),
max_norm_clip=1.,
shuffle=True,
device=constants.Device.CPU,
verbose=1):
"""
Parameters
----------
PreparedData : Dataset of PrepareData class
order : int
What order of interactions to include. Higher orders
may be more accurate but increase the run time. 12 is the maximum allowed order.
Nminibatch : int
Number of rows in a mini batch
groups : array-like
Optional, shape = [n_features]
Groups of columns that must be selected as a unit
e.g. [0, 0, 1, 2] specifies the first two columns are part of a group.
soft_groups : array-like
optional, shape = [n_features]
Groups of columns come from the same source
Used to encourage sparsity of number of sources selected
e.g. [0, 0, 1, 2] specifies the first two columns are part of a group.
x0 : torch.tensor
Optional, initialization of x.
        C : float
            Penalty parameter; the constant that multiplies the regularization term.
get_train_opt : function
Function that returns a pytorch optimizer, Adam is the default
accum_steps : int
            Number of gradient accumulation steps (minibatches per optimizer update)
rng : random state
max_norm_clip : float
Maximum allowable size of the gradient
shuffle : bool
Whether or not to shuffle data within the dataloader
ftransform : function
Function to transform the x. sigmoid is the default.
device : str
'cpu' to run on CPU and 'cuda' to run on GPU. Runs much faster on GPU
        verbose : int
            Controls the verbosity when fitting. Set to 0 for no printing,
            or to n >= 1 to print the loss every n optimizer steps.
"""
super(Solver, self).__init__()
self.Ntrain, self.D = PreparedData.N, PreparedData.n_features
if groups is not None:
# pylint: disable=E1102
groups = torch.tensor(groups, dtype=torch.long)
self.groups = groups
else:
self.groups = None
if soft_groups is not None:
# pylint: disable=E1102
soft_groups = torch.tensor(soft_groups, dtype=torch.long)
self.soft_D = torch.unique(soft_groups).size()[0]
else:
self.soft_D = None
self.soft_groups = soft_groups
if Nminibatch is None:
Nminibatch = self.Ntrain
else:
if Nminibatch > self.Ntrain:
print('Minibatch larger than sample size.'
+ (' Reducing from %d to %d.'
% (Nminibatch, self.Ntrain)))
Nminibatch = self.Ntrain
if Nminibatch > PreparedData.max_rows:
print('Minibatch larger than mem-allowed.'
+ (' Reducing from %d to %d.' % (Nminibatch,
PreparedData.max_rows)))
Nminibatch = int(np.min([Nminibatch, PreparedData.max_rows]))
self.Nminibatch = Nminibatch
self.accum_steps = accum_steps
if x0 is None:
x0 = torch.zeros(self.D, 1, dtype=torch.get_default_dtype())
self.ftransform = ftransform
self.x = nn.Parameter(x0)
self.max_norm = max_norm_clip
self.device = device
self.verbose = verbose
self.multiclass = PreparedData.classification and PreparedData.n_classes and PreparedData.n_classes > 2
if self.multiclass:
self.n_classes = PreparedData.n_classes
else:
self.n_classes = None
# whether to treat all classes equally
self.balanced = PreparedData.balanced
self.ordinal = PreparedData.ordinal
        if (hasattr(PreparedData, 'mappings')
                or PreparedData.storage_level == 'disk'):
            num_workers = PreparedData.num_workers
        else:
            # dense and sparse in-memory storage load in the main process
            num_workers = 0
        # pinned memory is disabled for both CPU and CUDA here
        pin_memory = False
self.ds_train = ChunkDataLoader(
PreparedData,
batch_size=self.Nminibatch,
shuffle=shuffle,
drop_last=True,
num_workers=num_workers,
pin_memory=pin_memory,
timeout=60)
self.f_train = LearnabilityMB(self.Nminibatch, self.D,
constants.Coefficients.SLE[order],
self.groups,
binary=PreparedData.classification,
device=self.device)
self.opt_train = get_train_opt(torch.nn.ParameterList([self.x]))
self.it = 0
self.iters_per_epoch = int(np.ceil(len(self.ds_train.dataset)
/ self.ds_train.batch_size))
self.f_train = self.f_train.to(device)
# pylint: disable=E1102
self.w = torch.tensor(
C / (C + 1),
dtype=torch.get_default_dtype(), requires_grad=False)
self.w = self.w.to(device)
def penalty(self, s):
"""
Calculate L1 Penalty.
"""
to_return = torch.sum(s) / self.D
if self.soft_groups is not None:
# if soft_groups, there is an additional penalty for using more
# groups
s_grouped = torch.zeros(self.soft_D, 1,
dtype=torch.get_default_dtype(),
device=self.device)
for group in torch.unique(self.soft_groups):
# groups should be indexed 0 to n_group - 1
# TODO: consider other functions here
s_grouped[group] = s[self.soft_groups == group].max()
# each component of the penalty contributes .5
# TODO: could make this a user given parameter
to_return = (to_return + torch.sum(s_grouped) / self.soft_D) * .5
return to_return
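    # A worked sketch of the penalty: with D = 4, soft_groups = [0, 0, 1, 1]
    # and s = [1, 0, 0, 0], the column term is 1/4 = 0.25 while the group term
    # is (max(1, 0) + max(0, 0)) / 2 = 0.5, giving (0.25 + 0.5) * 0.5 = 0.375.
    # Selecting one column of a group costs nearly as much as the whole group,
    # which encourages selecting few sources.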
def forward_and_backward(self, s, xsub, ysub, retain_graph=False):
"""
Completes the forward operation and computes gradients for learnability and penalty.
"""
f_train = self.f_train(s, xsub, ysub)
pen = self.penalty(s)
# pylint: disable=E1102
grad_outputs = torch.tensor([[1]], dtype=torch.get_default_dtype(),
device=self.device)
g1, = torch.autograd.grad([f_train], [self.x], grad_outputs,
retain_graph=True)
# pylint: disable=E1102
grad_outputs = torch.tensor([[1]], dtype=torch.get_default_dtype(),
device=self.device)
g2, = torch.autograd.grad([pen], [self.x], grad_outputs,
retain_graph=retain_graph)
return f_train, pen, g1, g2
def combine_gradient(self, g1, g2):
"""
Combine gradients from learnability and penalty
Parameters
----------
g1 : array-like
gradient from learnability
g2 : array-like
gradient from penalty
"""
to_return = ((1 - self.w) * g1 + self.w * g2) / self.accum_steps
if self.groups is not None:
            # each column gets its own gradient, but groups can only move up
            # or down as a unit, so the group gradient is the average of the
            # gradients of its member columns
to_return_grouped = torch.zeros_like(self.x)
for group in torch.unique(self.groups):
to_return_grouped[self.groups ==
group] = to_return[self.groups == group].mean()
to_return = to_return_grouped
return to_return
def combine_loss(self, f_train, pen):
"""
Combine the learnability and L1 penalty.
"""
return ((1 - self.w) * f_train.detach() + self.w * pen.detach()) \
/ self.accum_steps
def transform_y_into_binary(self, ysub, target_class):
"""
Transforms multiclass classification problems into a binary classification problem.
"""
with torch.no_grad():
ysub_binary = torch.zeros_like(ysub)
if self.ordinal:
# turn ordinal problems into n-1 classifications of is this
# example less than rank k
if target_class == 0:
return None
ysub_binary[ysub >= target_class] = 1
ysub_binary[ysub < target_class] = -1
else:
# turn multiclass problems into n binary classifications
ysub_binary[ysub == target_class] = 1
ysub_binary[ysub != target_class] = -1
return ysub_binary
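    # A small sketch of the binarization with synthetic labels: for multiclass
    # targets, class k becomes +1 and everything else -1; for ordinal targets,
    # class k splits the labels at rank k.
    #
    #     import torch
    #     y = torch.tensor([0., 1., 2.])
    #     # multiclass, target_class=1 -> [-1,  1, -1]
    #     # ordinal,    target_class=1 -> [-1,  1,  1]   (ysub >= 1)
    #     # ordinal,    target_class=0 -> None           (nothing to split on)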
def _get_scaling_value(self, ysub, target_class):
"""
Returns the weight given to a class for multiclass classification.
"""
if self.balanced:
if self.ordinal:
return 1 / (torch.unique(ysub).size()[0] - 1)
return 1 / torch.unique(ysub).size()[0]
else:
if self.ordinal:
                # cast boolean masks before averaging: torch.mean is not
                # implemented for byte/bool tensors
                this_class_proportion = torch.mean(
                    (ysub >= target_class).to(torch.get_default_dtype()))
                normalizing_constant = 0
                for i in range(1, self.n_classes):
                    normalizing_constant += torch.mean(
                        (ysub >= i).to(torch.get_default_dtype()))
                return this_class_proportion / normalizing_constant
            else:
                return torch.mean((ysub == target_class).to(torch.get_default_dtype()))
def _skip_y_forward(self, y):
"""
        Returns True when the current y should be skipped because there is nothing to learn from it.
"""
if y is None:
return True
elif torch.unique(y).size()[0] < 2:
return True
else:
return False
def train(self, f_callback=None, f_stop=None):
"""
Trains the estimator to determine which features to include.
Parameters
----------
f_callback : function
Function that performs a callback
f_stop: function
Function that tells you when to stop
"""
t = time.time()
h = torch.zeros([1, 1], dtype=torch.get_default_dtype())
h = h.to(self.device)
        # h_complete accumulates the loss so that, when accumulating
        # gradients, the per-class division applies only to the current
        # minibatch
h_complete = h.clone()
flag_stop = False
dataloader_iterator = iter(self.ds_train)
self.x.grad = torch.zeros_like(self.x)
while not flag_stop:
try:
xsub, ysub = next(dataloader_iterator)
except StopIteration:
dataloader_iterator = iter(self.ds_train)
xsub, ysub = next(dataloader_iterator)
try:
s = self.ftransform(self.x)
s = s.to(self.device)
if self.multiclass:
# accumulate gradients over each class, classes range from
# 0 to n_classes - 1
#num_classes_batch = torch.unique(ysub).size()[0]
for target_class in range(self.n_classes):
ysub_binary = self.transform_y_into_binary(
ysub, target_class)
if self._skip_y_forward(ysub_binary):
continue
                    # we could skip when the target class is absent from the
                    # batch, but that would change what we divide by
scaling_value = self._get_scaling_value(
ysub, target_class)
f_train, pen, g1, g2 = self.forward_and_backward(
s, xsub, ysub_binary, retain_graph=True)
self.x.grad += self.combine_gradient(
g1, g2) * scaling_value
h += self.combine_loss(f_train,
pen) * scaling_value
else:
if not self._skip_y_forward(ysub):
f_train, pen, g1, g2 = self.forward_and_backward(
s, xsub, ysub)
self.x.grad += self.combine_gradient(g1, g2)
h += self.combine_loss(f_train, pen)
else:
continue
h_complete += h
self.it += 1
if torch.isnan(h):
raise constants.NanError(
'Loss is nan, something may be misconfigured')
if self.it % self.accum_steps == 0:
torch.nn.utils.clip_grad_norm_(
torch.nn.ParameterList([self.x]),
max_norm=self.max_norm)
self.opt_train.step()
t = time.time() - t
if f_stop is not None:
flag_stop = f_stop(self, h, self.it, t)
if f_callback is not None:
f_callback(self, h, self.it, t)
elif self.verbose and (self.it // self.accum_steps) % self.verbose == 0:
epoch = int(self.it / self.iters_per_epoch)
print(
'[Minibatch: %6d/ Epoch: %3d/ t: %3.3f s] Loss: %0.3f' %
(self.it, epoch, t, h_complete / self.accum_steps))
if flag_stop:
break
self.opt_train.zero_grad()
h = 0
h_complete = 0
t = time.time()
except KeyboardInterrupt:
flag_stop = True
break
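# Illustrative end-to-end usage (a sketch: the PreparedData instance and the
# stopping rule below are assumptions, based only on the attributes this class
# reads).
#
#     solver = Solver(prepared_data, order=2, Nminibatch=64, C=1)
#     def stop_after(slvr, loss, it, t, budget=500):
#         return it >= budget                    # stop after a fixed iteration budget
#     solver.train(f_stop=stop_after)
#     scores = torch.sigmoid(solver.x.data).squeeze()   # per-feature scores in (0, 1)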
numpy==1.14.3
scikit-learn==0.20.0
scipy==1.1.0
torch==1.1.0
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
# ==================================================================================================
import torch
# pytorch
torch.tensortype = torch.FloatTensor
torch.sparse.tensortype = torch.sparse.FloatTensor
# mem
MAXMEMGB = 10
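# A minimal sketch of overriding these settings; import order matters, since
# the other modules read them at import time.
#
#     import torch
#     import nni.feature_engineering.gradient_selector.syssettings as syssettings
#     syssettings.MAXMEMGB = 4                      # tighten the memory budget
#     syssettings.torch.tensortype = torch.DoubleTensor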
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
# ==================================================================================================
import numpy as np
class EMA():
"""
    Maintains an exponential moving average and tracks its recent relative change.
"""
def __init__(self, f=np.nan, discount_factor=0.1, valid_after=None,
n_iters_relchange=3):
self.f_ma = [f]
self.fs = [f]
self.gamma = discount_factor
self.rel_change = [np.nan]
if valid_after is None:
self.valid_after = int(1/discount_factor)
else:
self.valid_after = valid_after
self.n_iters_relchange = n_iters_relchange
self.initialized = False
def reset(self, f):
self.f_ma = [f]
self.fs = [f]
self.rel_change = [np.nan]
self.initialized = True
def relchange(self):
if self.num_updates() > np.max([self.valid_after,
self.n_iters_relchange]):
return np.max(self.rel_change[-self.n_iters_relchange:])
else:
return np.nan
def update(self, f_new):
if not self.initialized:
self.reset(f_new)
else:
self.fs.append(f_new)
self.f_ma.append(self.f_ma[-1]*(1-self.gamma) + self.gamma*f_new)
if self.num_updates() > self.valid_after:
self.rel_change.append(np.abs((self.f_ma[-1]-self.f_ma[-2])
/ self.f_ma[-2]))
def num_updates(self):
return len(self.f_ma)
def __call__(self):
if self.num_updates() > self.valid_after:
return self.f_ma[-1]
else:
return np.nan
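# A short usage sketch of EMA with made-up values: updates are smoothed by
# discount_factor, and relchange() reports the largest recent relative move
# once enough updates have accumulated (useful as a convergence signal).
#
#     ema = EMA(discount_factor=0.5, valid_after=2)
#     for f in [1.0, 0.8, 0.7, 0.69, 0.685]:
#         ema.update(f)
#     smoothed = ema()                 # latest moving average
#     plateaued = ema.relchange() < 1e-2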