"src/git@developer.sourcefind.cn:renzhc/diffusers_dcu.git" did not exist on "a80480f0f2c6b7981297e62a4878a4515148f1ad"
Unverified commit c9c6171b authored by xiang song (charlie.song), committed by GitHub

Revert "[Feature] Basic utils to handle raw data features (#2102)" (#2147)

This reverts commit 33a8bb93.
Co-authored-by: Minjie Wang <wmjlyjemaine@gmail.com>
parent 3659fb05
@@ -8,18 +8,10 @@ import warnings
 import requests
 import pickle
 import errno
-from multiprocessing import Manager, Process
 import numpy as np
-import scipy.sparse as sp
-try:
-    import spacy
-    from sklearn.preprocessing import LabelBinarizer
-    from sklearn.preprocessing import MultiLabelBinarizer
-except ImportError:
-    pass
-import pickle
-import errno
 from .graph_serialize import save_graphs, load_graphs, load_labels
 from .tensor_serialize import save_tensors, load_tensors
@@ -28,10 +20,7 @@ from .. import backend as F
-__all__ = ['loadtxt', 'download', 'check_sha1', 'extract_archive',
-           'get_download_dir', 'Subset', 'split_dataset',
-           'save_graphs', "load_graphs", "load_labels", "save_tensors", "load_tensors",
-           'parse_word2vec_feature', 'parse_category_single_feat',
-           'parse_category_multi_feat', 'parse_numerical_feat',
-           'parse_numerical_multihot_feat']
+__all__ = ['loadtxt', 'download', 'check_sha1', 'extract_archive',
+           'get_download_dir', 'Subset', 'split_dataset',
+           'save_graphs', "load_graphs", "load_labels", "save_tensors", "load_tensors"]
 def loadtxt(path, delimiter, dtype=None):
     try:
@@ -361,412 +350,3 @@ class Subset(object):
        Number of datapoints in the subset
        """
        return len(self.indices)
################### Feature Processing #######################

def row_normalize(features):
    """Row-normalize a sparse matrix so that each row sums to 1."""
    mx = sp.csr_matrix(features, dtype=np.float32)
    rowsum = np.array(mx.sum(1))
    r_inv = np.power(rowsum, -1).flatten()
    r_inv[np.isinf(r_inv)] = 0.
    r_mat_inv = sp.diags(r_inv)
    mx = r_mat_inv.dot(mx)
    return np.array(mx.todense())

def col_normalize(features):
    """Column-normalize a sparse matrix so that each column sums to 1."""
    mx = sp.csr_matrix(features, dtype=np.float32)
    colsum = np.array(mx.sum(0))
    c_inv = np.power(colsum, -1).flatten()
    c_inv[np.isinf(c_inv)] = 0.
    c_mat_inv = sp.diags(c_inv).transpose()
    mx = mx.dot(c_mat_inv)
    return np.array(mx.todense())

def float_row_l1_normalize(features):
    """L1-normalize each row of a dense float matrix."""
    rowsum = np.sum(np.abs(features), axis=1)
    r_inv = np.power(rowsum, -1).reshape(-1, 1)
    r_inv[np.isinf(r_inv)] = 0.
    return features * r_inv

def float_col_l1_normalize(features):
    """L1-normalize each column of a dense float matrix."""
    colsum = np.sum(np.abs(features), axis=0)
    c_inv = np.power(colsum, -1)
    c_inv[np.isinf(c_inv)] = 0.
    return features * c_inv

def float_col_maxmin_normalize(features):
    """Min-max normalize each column of a dense float matrix."""
    feats = np.transpose(features)
    min_val = np.reshape(np.amin(feats, axis=1), (-1, 1))
    max_val = np.reshape(np.amax(feats, axis=1), (-1, 1))
    norm = (feats - min_val) / (max_val - min_val)
    norm[np.isnan(norm)] = 0.
    return np.transpose(norm)
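
# A minimal usage sketch of the normalization helpers above (the
# expected values come from the unit tests further down): row_normalize
# scales each row to sum to 1, col_normalize does the same per column,
# and float_col_maxmin_normalize maps each column onto [0, 1].
#
#   >>> feats = np.array([[1., 0., 0.], [2., 1., 1.]])
#   >>> row_normalize(feats)
#   array([[1., 0., 0.], [0.5, 0.25, 0.25]])
#   >>> float_col_maxmin_normalize(np.array([[1., 0.], [2., 1.], [4., -2.]]))
#   array([[0., 2./3.], [1./3., 1.], [1., 0.]])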

def embed_word2vec(str_val, nlps):
    """ Use NLP encoders to encode a string into a vector.

    There can be multiple NLP encoders in nlps. Each encoder
    is invoked to generate an embedding for the input string and
    the resulting embeddings are concatenated.

    Parameters
    ----------
    str_val : str
        words to encode
    nlps : list of func
        a list of NLP encoder functions
    """
    vector = None
    for nlp in nlps:
        doc = nlp(str_val)
        if vector is None:
            vector = doc.vector
        else:
            vector = np.concatenate((vector, doc.vector))
    return vector
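
# Usage sketch, assuming the relevant spacy models have been installed
# (e.g. via `python -m spacy download en_core_web_lg`); with a single
# encoder the result is simply that encoder's document vector, as the
# unit test below also checks:
#
#   >>> import spacy
#   >>> nlps = [spacy.load('en_core_web_lg')]
#   >>> vec = embed_word2vec('hello', nlps)  # == nlps[0]('hello').vector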

def parse_lang_feat(str_feats, nlp_encoders, verbose=False):
    """ Encode a list of strings with word2vec using the given NLP encoders.

    Parameters
    ----------
    str_feats : list of str
        list of strings to encode
    nlp_encoders : list of func
        a list of NLP encoder functions
    verbose : bool, optional
        print out debug info
        Default: False

    Return
    ------
    numpy.array
        the encoded features
    """
    features = []
    num_feats = len(str_feats)
    num_process = num_feats if num_feats < 8 else 8  # TODO(xiangsx) get system nproc
    batch_size = (num_feats + num_process - 1) // num_process

    def embed_lang(d, proc_idx, feats):
        res_feats = []
        for s_feat in feats:
            res_feats.append(embed_word2vec(s_feat, nlp_encoders))
        d[proc_idx] = res_feats

    # use multiple processes to encode the features in parallel
    manager = Manager()
    d = manager.dict()
    jobs = []
    for i in range(num_process):
        sub_info = str_feats[i * batch_size : (i + 1) * batch_size
                             if (i + 1) * batch_size < num_feats else num_feats]
        jobs.append(Process(target=embed_lang, args=(d, i, sub_info)))
    for p in jobs:
        p.start()
    for p in jobs:
        p.join()
    for i in range(num_process):
        if len(d[i]) > 0:
            features.append(d[i])
    features = np.concatenate(features)
    if verbose:
        print(features.shape)
    return features
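
# Worked example of the batching above: with 12 input strings and 8
# worker processes, batch_size = (12 + 8 - 1) // 8 = 2, so workers 0-5
# each encode two strings while workers 6 and 7 receive empty slices;
# the `len(d[i]) > 0` check drops those empty results before the final
# concatenation.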

def parse_word2vec_feature(str_feats, languages, verbose=False):
    """ Encode a list of strings with word2vec, one NLP encoder per language.

    Parameters
    ----------
    str_feats : list of str
        list of strings to encode
    languages : list of str
        list of languages used to encode the feature strings.
    verbose : bool, optional
        print out debug info
        Default: False

    Return
    ------
    numpy.array
        the encoded features

    Examples
    --------
    >>> inputs = ['hello', 'world']
    >>> languages = ['en_core_web_lg', 'fr_core_news_lg']
    >>> feats = parse_word2vec_feature(inputs, languages)
    """
    import spacy

    nlp_encoders = []
    for lang in languages:
        encoder = spacy.load(lang)
        nlp_encoders.append(encoder)
    return parse_lang_feat(str_feats, nlp_encoders, verbose)
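
# Design note: spacy is imported lazily inside the function (mirroring
# the try/except guard at the top of the module) so that dgl.data.utils
# remains importable without spacy; the models named in `languages` are
# assumed to be installed, e.g.:
#
#   >>> feats = parse_word2vec_feature(['hello', 'world'], ['en_core_web_lg'])
#   >>> feats.shape  # (2, 300) for the 300-dim en_core_web_lg vectors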

def parse_category_single_feat(category_inputs, norm=None):
    r""" Parse categorical features and convert them into a one-hot encoding.

    Each entry of category_inputs should contain exactly one category.

    Parameters
    ----------
    category_inputs : list of str
        input categorical features
    norm : str, optional
        Which kind of normalization is applied to the features.
        Supported normalization ops include:

        (1) None, do nothing.
        (2) `col`, column-based normalization. Normalize the data
        for each column:

        .. math::
            x_{ij} = \frac{x_{ij}}{\sum_{i=0}^N{x_{ij}}}

        (3) `row`, same as None.

    Note
    ----
    sklearn.preprocessing.LabelBinarizer is used to convert
    categorical features into a one-hot encoding format.

    Return
    ------
    numpy.array
        The features in numpy array

    Examples
    --------
    >>> inputs = ['A', 'B', 'C', 'A']
    >>> feats = parse_category_single_feat(inputs)
    >>> feats
    array([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.], [1., 0., 0.]])
    """
    from sklearn.preprocessing import LabelBinarizer
    lb = LabelBinarizer()
    feat = lb.fit_transform(category_inputs)
    # if there are only two categories, fit_transform returns a single
    # 0/1 column, so expand it into an explicit two-column one-hot encoding
    if feat.shape[1] == 1:
        f = np.zeros((feat.shape[0], 2))
        f[range(f.shape[0]), feat.squeeze()] = 1.
        feat = f

    if norm == 'col':
        return col_normalize(feat)
    else:
        return feat
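
# Sketch of the two-class special case handled above, matching the unit
# test below: LabelBinarizer collapses two classes into one 0/1 column,
# which is then expanded back into a two-column one-hot matrix.
#
#   >>> parse_category_single_feat(['A', 'B'])
#   array([[1., 0.], [0., 1.]])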

def parse_category_multi_feat(category_inputs, norm=None):
    r""" Parse categorical features and convert them into a multi-hot encoding.

    Each entry of category_inputs may contain multiple categorical labels.
    Multi-hot encoding is used to encode these labels.

    Parameters
    ----------
    category_inputs : list of list of str
        input categorical features
    norm : str, optional
        Which kind of normalization is applied to the features.
        Supported normalization ops include:

        (1) None, do nothing.
        (2) `col`, column-based normalization. Normalize the data
        for each column:

        .. math::
            x_{ij} = \frac{x_{ij}}{\sum_{i=0}^N{x_{ij}}}

        (3) `row`, row-based normalization. Normalize the data for
        each row:

        .. math::
            x_{ij} = \frac{x_{ij}}{\sum_{j=0}^N{x_{ij}}}

        Default: None

    Note
    ----
    sklearn.preprocessing.MultiLabelBinarizer is used to convert
    categorical features into a multilabel format.

    Return
    ------
    numpy.array
        The features in numpy array

    Examples
    --------
    >>> inputs = [['A', 'B', 'C'], ['A', 'B'], ['C'], ['A']]
    >>> feats = parse_category_multi_feat(inputs)
    >>> feats
    array([[1., 1., 1.], [1., 1., 0.], [0., 0., 1.], [1., 0., 0.]])
    """
    from sklearn.preprocessing import MultiLabelBinarizer
    mlb = MultiLabelBinarizer()
    feat = mlb.fit_transform(category_inputs)
    if norm == 'col':
        return col_normalize(feat)
    if norm == 'row':
        return row_normalize(feat)
    else:
        return feat
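
# With row normalization each multi-hot row becomes a distribution over
# its own labels, matching the unit test below:
#
#   >>> parse_category_multi_feat([['A', 'B', 'C'], ['A', 'B'], ['C'], ['A']], norm='row')
#   array([[1./3., 1./3., 1./3.], [0.5, 0.5, 0.], [0., 0., 1.], [1., 0., 0.]])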

def parse_numerical_feat(numerical_inputs, norm=None):
    r""" Parse numerical features.

    Parameters
    ----------
    numerical_inputs : list of float or list of list of float
        input numerical features
    norm : str, optional
        Which kind of normalization is applied to the features.
        Supported normalization ops include:

        (1) None, do nothing.
        (2) `standard`, column-based L1 normalization. Normalize the data
        for each column:

        .. math::
            x_{ij} = \frac{x_{ij}}{\sum_{i=0}^N{|x_{ij}|}}

        (3) `min-max`, column-based min-max normalization. Normalize the data
        for each column:

        .. math::
            norm_i = \frac{x_i - min(x[:])}{max(x[:]) - min(x[:])}

    Return
    ------
    numpy.array
        The features in numpy array

    Examples
    --------
    >>> inputs = [[1., 0., 0.], [2., 1., 1.], [1., 2., -3.]]
    >>> feat = parse_numerical_feat(inputs, norm='standard')
    >>> feat
    array([[0.25, 0., 0.], [0.5, 0.33333333, 0.25], [0.25, 0.66666667, -0.75]])
    """
    feat = np.array(numerical_inputs, dtype='float')
    if norm == 'standard':
        return float_col_l1_normalize(feat)
    elif norm == 'min-max':
        return float_col_maxmin_normalize(feat)
    else:
        return feat
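
# For norm='min-max' each column is scaled independently onto [0, 1];
# with the docstring inputs the column minima are (1, 0, -3) and the
# maxima are (2, 2, 1), giving (values from the unit test below):
#
#   >>> parse_numerical_feat(inputs, norm='min-max')
#   array([[0., 0., 0.75], [1., 0.5, 1.], [0., 1., 0.]])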

def parse_numerical_multihot_feat(input_feats, low, high, bucket_cnt, window_size, norm=None):
    r""" Parse numerical features by matching them into different buckets.

    A bucket-range based algorithm is used to convert a numerical value
    into a multi-hot encoding feature.

    A numerical value range [low, high) is defined, and it is divided
    into bucket_cnt buckets. For an input value v, its effective range is
    [v - window_size/2, v + window_size/2], and we check which buckets of
    [low, high) that range covers.

    Parameters
    ----------
    input_feats : list of float
        Input numerical features
    low : float
        Lower bound of the range of the numerical values.
        All v_i < low will be set to v_i = low.
    high : float
        Upper bound of the range of the numerical values.
        All v_j > high will be set to v_j = high.
    bucket_cnt : int
        Number of buckets to use.
    window_size : float
        The sliding window used to convert a numerical value into bucket numbers.
    norm : str, optional
        Which kind of normalization is applied to the features.
        Supported normalization ops include:

        (1) None, do nothing.
        (2) `col`, column-based normalization. Normalize the data
        for each column:

        .. math::
            x_{ij} = \frac{x_{ij}}{\sum_{i=0}^N{x_{ij}}}

        (3) `row`, row-based normalization. Normalize the data for
        each row:

        .. math::
            x_{ij} = \frac{x_{ij}}{\sum_{j=0}^N{x_{ij}}}

    Examples
    --------
    >>> inputs = [0., 15., 26., 40.]
    >>> low = 10.
    >>> high = 30.
    >>> bucket_cnt = 4
    >>> window_size = 10.  # buckets are 10~15, 15~20, 20~25, 25~30
    >>> feat = parse_numerical_multihot_feat(inputs, low, high, bucket_cnt, window_size)
    >>> feat
    array([[1., 0., 0., 0.],
           [1., 1., 1., 0.],
           [0., 0., 1., 1.],
           [0., 0., 0., 1.]])
    """
    raw_feats = np.array(input_feats, dtype=np.float32)
    num_nodes = raw_feats.shape[0]
    feat = np.zeros((num_nodes, bucket_cnt), dtype=np.float32)
    bucket_size = (high - low) / bucket_cnt
    epsilon = bucket_size / 10
    # clip each value's window [v - window_size/2, v + window_size/2]
    # into [low, high) before mapping it to bucket indices
    low_val = raw_feats - window_size / 2
    high_val = raw_feats + window_size / 2
    low_val[low_val < low] = low
    high_val[high_val < low] = low
    high_val[high_val >= high] = high - epsilon
    low_val[low_val >= high] = high - epsilon
    low_val -= low
    high_val -= low
    low_idx = (low_val / bucket_size).astype('int')
    high_idx = (high_val / bucket_size).astype('int') + 1
    for i in range(raw_feats.shape[0]):
        idx = np.arange(start=low_idx[i], stop=high_idx[i])
        feat[i][idx] = 1.

    if norm == 'col':
        return col_normalize(feat)
    if norm == 'row':
        return row_normalize(feat)
    else:
        return feat
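
# Worked example of the bucketing arithmetic above, using the docstring
# values (low=10, high=30, bucket_cnt=4, window_size=10): bucket_size is
# (30 - 10) / 4 = 5 and epsilon is 0.5. For v = 15. the window is
# [10., 20.], which shifts to [0., 10.] after subtracting low, giving
# low_idx = 0 and high_idx = 10/5 + 1 = 3, i.e. buckets {0, 1, 2} and
# the row [1., 1., 1., 0.].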

import dgl.data as data
import unittest, pytest
import numpy as np

def test_minigc():
    ds = data.MiniGCDataset(16, 10, 20)
@@ -20,331 +18,6 @@ def test_data_hash():
    assert a.hash == b.hash
    assert a.hash != c.hash

def test_row_normalize():
    features = np.array([[1., 1., 1.]])
    row_norm_feat = data.utils.row_normalize(features)
    assert np.allclose(np.array([1./3., 1./3., 1./3.]), row_norm_feat)

    features = np.array([[1.], [1.], [1.]])
    row_norm_feat = data.utils.row_normalize(features)
    assert np.allclose(np.array([[1.], [1.], [1.]]), row_norm_feat)

    features = np.array([[1., 0., 0.], [0., 1., 1.], [0., 0., 0.]])
    row_norm_feat = data.utils.row_normalize(features)
    assert np.allclose(np.array([[1., 0., 0.], [0., 0.5, 0.5], [0., 0., 0.]]),
                       row_norm_feat)

    # input (2, 3)
    features = np.array([[1., 0., 0.], [2., 1., 1.]])
    row_norm_feat = data.utils.row_normalize(features)
    assert np.allclose(np.array([[1., 0., 0.], [0.5, 0.25, 0.25]]),
                       row_norm_feat)

    # input (3, 2)
    features = np.array([[1., 0.], [1., 1.], [0., 0.]])
    row_norm_feat = data.utils.row_normalize(features)
    assert np.allclose(np.array([[1., 0.], [0.5, 0.5], [0., 0.]]),
                       row_norm_feat)

def test_col_normalize():
    features = np.array([[1., 1., 1.]])
    col_norm_feat = data.utils.col_normalize(features)
    assert np.allclose(np.array([[1., 1., 1.]]), col_norm_feat)

    features = np.array([[1.], [1.], [1.]])
    col_norm_feat = data.utils.col_normalize(features)
    assert np.allclose(np.array([[1./3.], [1./3.], [1./3.]]), col_norm_feat)

    features = np.array([[1., 0., 0.], [1., 1., 0.], [0., 0., 0.]])
    col_norm_feat = data.utils.col_normalize(features)
    assert np.allclose(np.array([[0.5, 0., 0.], [0.5, 1.0, 0.], [0., 0., 0.]]),
                       col_norm_feat)

    # input (2, 3)
    features = np.array([[1., 0., 0.], [1., 1., 0.]])
    col_norm_feat = data.utils.col_normalize(features)
    assert np.allclose(np.array([[0.5, 0., 0.], [0.5, 1.0, 0.]]),
                       col_norm_feat)

    # input (3, 2)
    features = np.array([[1., 0.], [1., 1.], [2., 0.]])
    col_norm_feat = data.utils.col_normalize(features)
    assert np.allclose(np.array([[0.25, 0.], [0.25, 1.0], [0.5, 0.]]),
                       col_norm_feat)

def test_float_row_normalize():
    features = np.array([[1.], [2.], [-3.]])
    row_norm_feat = data.utils.float_row_l1_normalize(features)
    assert np.allclose(np.array([[1.], [1.], [-1.]]), row_norm_feat)

    features = np.array([[1., 2., -3.]])
    row_norm_feat = data.utils.float_row_l1_normalize(features)
    assert np.allclose(np.array([[1./6., 2./6., -3./6.]]), row_norm_feat)

    features = np.array([[1., 0., 0.], [2., 1., 1.], [1., 2., -3.]])
    row_norm_feat = data.utils.float_row_l1_normalize(features)
    assert np.allclose(np.array([[1., 0., 0.], [0.5, 0.25, 0.25], [1./6., 2./6., -3./6.]]),
                       row_norm_feat)

    # input (2, 3)
    features = np.array([[1., 0., 0.], [-2., 1., 1.]])
    row_norm_feat = data.utils.float_row_l1_normalize(features)
    assert np.allclose(np.array([[1., 0., 0.], [-0.5, 0.25, 0.25]]),
                       row_norm_feat)

    # input (3, 2)
    features = np.array([[1., 0.], [-2., 1.], [1., 2.]])
    row_norm_feat = data.utils.float_row_l1_normalize(features)
    assert np.allclose(np.array([[1., 0.], [-2./3., 1./3.], [1./3., 2./3.]]),
                       row_norm_feat)

def test_float_col_normalize():
    features = np.array([[1., 2., -3.]])
    col_norm_feat = data.utils.float_col_l1_normalize(features)
    assert np.allclose(np.array([[1., 1., -1.]]), col_norm_feat)

    features = np.array([[1.], [2.], [-3.]])
    col_norm_feat = data.utils.float_col_l1_normalize(features)
    assert np.allclose(np.array([[1./6.], [2./6.], [-3./6.]]), col_norm_feat)

    features = np.array([[1., 0., 0.], [2., 1., 1.], [1., 2., -3.]])
    col_norm_feat = data.utils.float_col_l1_normalize(features)
    assert np.allclose(np.array([[0.25, 0., 0.], [0.5, 1./3., 0.25], [0.25, 2./3., -0.75]]),
                       col_norm_feat)

    # input (2, 3)
    features = np.array([[1., 0., 0.], [2., 1., -1.]])
    col_norm_feat = data.utils.float_col_l1_normalize(features)
    assert np.allclose(np.array([[1./3., 0., 0.], [2./3., 1.0, -1.]]),
                       col_norm_feat)

    # input (3, 2)
    features = np.array([[1., 0.], [2., 1.], [1., -2.]])
    col_norm_feat = data.utils.float_col_l1_normalize(features)
    assert np.allclose(np.array([[0.25, 0.], [0.5, 1./3.], [0.25, -2./3.]]),
                       col_norm_feat)

def test_float_col_maxmin_normalize():
    features = np.array([[1., 2., -3.]])
    col_norm_feat = data.utils.float_col_maxmin_normalize(features)
    assert np.allclose(np.array([[0., 0., 0.]]), col_norm_feat)

    features = np.array([[1.], [2.], [-3.]])
    col_norm_feat = data.utils.float_col_maxmin_normalize(features)
    assert np.allclose(np.array([[4./5.], [5./5.], [0.]]), col_norm_feat)

    features = np.array([[1., 0., 0.], [2., 1., 1.], [1., 2., -3.]])
    col_norm_feat = data.utils.float_col_maxmin_normalize(features)
    assert np.allclose(np.array([[0., 0., 3./4.], [1., 0.5, 1.], [0., 1., 0.]]),
                       col_norm_feat)

    # input (2, 3)
    features = np.array([[1., 0., 0.], [2., 1., -1.]])
    col_norm_feat = data.utils.float_col_maxmin_normalize(features)
    assert np.allclose(np.array([[0., 0., 1.], [1., 1., 0.]]),
                       col_norm_feat)

    # input (3, 2)
    features = np.array([[1., 0.], [2., 1.], [4., -2.]])
    col_norm_feat = data.utils.float_col_maxmin_normalize(features)
    assert np.allclose(np.array([[0., 2./3.], [1./3., 1.], [1., 0.]]),
                       col_norm_feat)
@unittest.skip("spacy language test is too heavy")
def test_embed_word2vec():
import spacy
inputs = ['hello', 'world']
languages = ['en_core_web_lg', 'fr_core_news_lg']
nlps = [spacy.load(languages[0])]
feats = data.utils.embed_word2vec(inputs[0], nlps)
doc = nlps[0](inputs[0])
assert np.allclose(doc.vector, feats)
nlps.append(spacy.load(languages[1]))
for input in inputs:
feats = data.utils.embed_word2vec(input, nlps)
doc0 = nlps[0](input)
doc1 = nlps[1](input)
assert np.allclose(np.concatenate((doc0.vector, doc1.vector)),
feats)
@unittest.skip("spacy language test is too heavy")
def test_parse_lang_feat():
import spacy
inputs = ['hello', 'world']
languages = ['en_core_web_lg', 'fr_core_news_lg']
nlps = [spacy.load(languages[0]), spacy.load(languages[1])]
feats = data.utils.parse_lang_feat(inputs, nlps)
res_feats = []
for input in inputs:
doc0 = nlps[0](input)
doc1 = nlps[1](input)
res_feats.append(np.concatenate((doc0.vector, doc1.vector)))
res_feats = np.stack(res_feats)
assert np.allclose(feats, res_feats)
inputs = ["1", "2", "3", "4", "1", "2", "3", "4", "5", "6", "7", "8"]
feats = data.utils.parse_lang_feat(inputs, nlps)
res_feats = []
for input in inputs:
doc0 = nlps[0](input)
doc1 = nlps[1](input)
res_feats.append(np.concatenate((doc0.vector, doc1.vector)))
res_feats = np.stack(res_feats)
assert np.allclose(feats, res_feats)
inputs = ["1", "2", "3", "4", "1", "2", "3", "4", "5", "6", "7", "8"]
feats = data.utils.parse_word2vec_feature(inputs, languages)
res_feats = []
for input in inputs:
doc0 = nlps[0](input)
doc1 = nlps[1](input)
res_feats.append(np.concatenate((doc0.vector, doc1.vector)))
res_feats = np.stack(res_feats)
assert np.allclose(feats, res_feats)
@unittest.skip("LabelBinarizer and MultiLabelBinarizer is not included in CI env")
def test_parse_category_feat():
# single-hot
inputs = ['A', 'B']
feats = data.utils.parse_category_single_feat(inputs)
assert np.allclose(np.array([[1.,0.],[0.,1.]]), feats)
inputs = ['A', 'B', 'C', 'A']
feats = data.utils.parse_category_single_feat(inputs)
assert np.allclose(np.array([[1.,0.,0.],[0.,1.,0.],[0.,0.,1.],[1.,0.,0.]]), feats)
# col norm
feats = data.utils.parse_category_single_feat(inputs, norm='col')
assert np.allclose(np.array([[.5,0.,0.],[0.,1.,0.],[0.,0.,1.],[.5,0.,0.]]), feats)
# multi-hot
inputs = [['A'], ['B']]
feats = data.utils.parse_category_multi_feat(inputs)
assert np.allclose(np.array([[1.,0.],[0.,1.]]), feats)
inputs = [['A', 'B', 'C',], ['A', 'B'], ['C'], ['A']]
feats = data.utils.parse_category_multi_feat(inputs)
assert np.allclose(np.array([[1.,1.,1.],[1.,1.,0.],[0.,0.,1.],[1.,0.,0.]]), feats)
# row norm
feats = data.utils.parse_category_multi_feat(inputs, norm='row')
assert np.allclose(np.array([[1./3.,1./3.,1./3.],[.5,.5,0.],[0.,0.,1.],[1.,0.,0.]]), feats)
# col norm
feats = data.utils.parse_category_multi_feat(inputs, norm='col')
assert np.allclose(np.array([[1./3.,0.5,0.5],[1./3.,0.5,0.],[0.,0.,0.5],[1./3.,0.,0.]]), feats)

def test_parse_numerical_feat():
    inputs = [[1., 2., -3.]]
    feat = data.utils.parse_numerical_feat(inputs)
    assert np.allclose(inputs, feat)
    col_norm_feat = data.utils.parse_numerical_feat(inputs, norm='standard')
    assert np.allclose(np.array([[1., 1., -1.]]), col_norm_feat)
    col_norm_feat = data.utils.parse_numerical_feat(inputs, norm='min-max')
    assert np.allclose(np.array([[0., 0., 0.]]), col_norm_feat)

    inputs = [[1.], [2.], [-3.]]
    feat = data.utils.parse_numerical_feat(inputs)
    assert np.allclose(inputs, feat)
    col_norm_feat = data.utils.parse_numerical_feat(inputs, norm='standard')
    assert np.allclose(np.array([[1./6.], [2./6.], [-3./6.]]), col_norm_feat)
    col_norm_feat = data.utils.parse_numerical_feat(inputs, norm='min-max')
    assert np.allclose(np.array([[4./5.], [5./5.], [0.]]), col_norm_feat)

    inputs = [[1., 0., 0.], [2., 1., 1.], [1., 2., -3.]]
    feat = data.utils.parse_numerical_feat(inputs)
    assert np.allclose(inputs, feat)
    col_norm_feat = data.utils.parse_numerical_feat(inputs, norm='standard')
    assert np.allclose(np.array([[0.25, 0., 0.], [0.5, 1./3., 0.25], [0.25, 2./3., -0.75]]),
                       col_norm_feat)
    col_norm_feat = data.utils.parse_numerical_feat(inputs, norm='min-max')
    assert np.allclose(np.array([[0., 0., 3./4.], [1., 0.5, 1.], [0., 1., 0.]]),
                       col_norm_feat)

    # input (2, 3)
    inputs = [[1., 0., 0.], [2., 1., -1.]]
    feat = data.utils.parse_numerical_feat(inputs)
    assert np.allclose(inputs, feat)
    col_norm_feat = data.utils.parse_numerical_feat(inputs, norm='standard')
    assert np.allclose(np.array([[1./3., 0., 0.], [2./3., 1.0, -1.]]),
                       col_norm_feat)
    col_norm_feat = data.utils.parse_numerical_feat(inputs, norm='min-max')
    assert np.allclose(np.array([[0., 0., 1.], [1., 1., 0.]]),
                       col_norm_feat)

    # input (3, 2)
    inputs = [[1., 0.], [2., 1.], [1., -2.]]
    feat = data.utils.parse_numerical_feat(inputs)
    assert np.allclose(inputs, feat)
    col_norm_feat = data.utils.parse_numerical_feat(inputs, norm='standard')
    assert np.allclose(np.array([[0.25, 0.], [0.5, 1./3.], [0.25, -2./3.]]),
                       col_norm_feat)
    col_norm_feat = data.utils.parse_numerical_feat(inputs, norm='min-max')
    assert np.allclose(np.array([[0., 2./3.], [1., 1.], [0., 0.]]),
                       col_norm_feat)

def test_parse_numerical_multihot_feat():
    inputs = [0., 15., 20., 10.1, 25., 40.]
    low = 10.
    high = 30.
    bucket_cnt = 2  # buckets: 10~20, 20~30
    window_size = 0.
    feat = data.utils.parse_numerical_multihot_feat(inputs, low, high, bucket_cnt, window_size)
    assert np.allclose(np.array([[1., 0.], [1., 0.], [0., 1.], [1., 0.], [0., 1.], [0., 1.]]), feat)

    inputs = [0., 5., 15., 20., 10.1, 25., 30.1, 40.]
    low = 10.
    high = 30.
    bucket_cnt = 4  # buckets: 10~15, 15~20, 20~25, 25~30
    window_size = 10.
    feat = data.utils.parse_numerical_multihot_feat(inputs, low, high, bucket_cnt, window_size)
    assert np.allclose(np.array([[1., 0., 0., 0.],
                                 [1., 0., 0., 0.],
                                 [1., 1., 1., 0.],
                                 [0., 1., 1., 1.],
                                 [1., 1., 0., 0.],
                                 [0., 0., 1., 1.],
                                 [0., 0., 0., 1.],
                                 [0., 0., 0., 1.]]), feat)

    # col norm
    feat = data.utils.parse_numerical_multihot_feat(inputs, low, high, bucket_cnt, window_size, norm='col')
    assert np.allclose(np.array([[1./4., 0., 0., 0.],
                                 [1./4., 0., 0., 0.],
                                 [1./4., 1./3., 1./3., 0.],
                                 [0., 1./3., 1./3., 1./4.],
                                 [1./4., 1./3., 0., 0.],
                                 [0., 0., 1./3., 1./4.],
                                 [0., 0., 0., 1./4.],
                                 [0., 0., 0., 1./4.]]), feat)

    # row norm
    feat = data.utils.parse_numerical_multihot_feat(inputs, low, high, bucket_cnt, window_size, norm='row')
    assert np.allclose(np.array([[1., 0., 0., 0.],
                                 [1., 0., 0., 0.],
                                 [1./3., 1./3., 1./3., 0.],
                                 [0., 1./3., 1./3., 1./3.],
                                 [1./2., 1./2., 0., 0.],
                                 [0., 0., 1./2., 1./2.],
                                 [0., 0., 0., 1.],
                                 [0., 0., 0., 1.]]), feat)

 if __name__ == '__main__':
-    #test_minigc()
-    #test_data_hash()
-    test_row_normalize()
-    test_col_normalize()
-    test_float_row_normalize()
-    test_float_col_normalize()
-    test_float_col_maxmin_normalize()
-    #test_embed_word2vec()
-    #test_parse_lang_feat()
-    #test_parse_category_feat()
-    test_parse_numerical_feat()
-    test_parse_numerical_multihot_feat()
+    test_minigc()
+    test_data_hash()
\ No newline at end of file