"docs/git@developer.sourcefind.cn:renzhc/diffusers_dcu.git" did not exist on "9dc84448aca9718f9e1175cf83a6a9c10467882a"
Unverified Commit 33a8bb93 authored by xiang song (charlie.song), committed by GitHub

[Feature] Basic utils to handle raw data features (#2102)



* add feature utils and add test for feature norm

* Add docstring and test

* upd

* Disable some tests

* Update

* update doc string

* update
Co-authored-by: Ubuntu <ubuntu@ip-172-31-68-185.ec2.internal>
parent b2ac89f2
@@ -8,10 +8,18 @@ import warnings
import requests
import pickle
import errno
from multiprocessing import Manager, Process
import numpy as np
import scipy.sparse as sp
try:
import spacy
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import MultiLabelBinarizer
except ImportError:
pass
from .graph_serialize import save_graphs, load_graphs, load_labels
from .tensor_serialize import save_tensors, load_tensors
@@ -20,7 +28,10 @@ from .. import backend as F
__all__ = ['loadtxt', 'download', 'check_sha1', 'extract_archive',
           'get_download_dir', 'Subset', 'split_dataset',
           'save_graphs', 'load_graphs', 'load_labels', 'save_tensors', 'load_tensors',
           'parse_word2vec_feature', 'parse_category_single_feat',
           'parse_category_multi_feat', 'parse_numerical_feat',
           'parse_numerical_multihot_feat']
def loadtxt(path, delimiter, dtype=None):
try:
@@ -350,3 +361,412 @@ class Subset(object):
Number of datapoints in the subset
"""
return len(self.indices)
################### Feature Processing #######################
def row_normalize(features):
    """Row-normalize sparse matrix"""
    mx = sp.csr_matrix(features, dtype=np.float32)
rowsum = np.array(mx.sum(1))
r_inv = np.power(rowsum, -1).flatten()
r_inv[np.isinf(r_inv)] = 0.
r_mat_inv = sp.diags(r_inv)
mx = r_mat_inv.dot(mx)
return np.array(mx.todense())
def col_normalize(features):
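    """Column-normalize a feature matrix (each column sums to 1); returns a dense array."""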
mx = sp.csr_matrix(features, dtype=np.float32)
colsum = np.array(mx.sum(0))
c_inv = np.power(colsum, -1).flatten()
c_inv[np.isinf(c_inv)] = 0.
c_mat_inv = sp.diags(c_inv).transpose()
mx = mx.dot(c_mat_inv)
return np.array(mx.todense())
def float_row_l1_normalize(features):
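    """Row-wise L1-normalize a dense float feature matrix."""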
rowsum = np.sum(np.abs(features), axis=1)
r_inv = np.power(rowsum, -1).reshape(-1,1)
r_inv[np.isinf(r_inv)] = 0.
return features * r_inv
def float_col_l1_normalize(features):
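    """Column-wise L1-normalize a dense float feature matrix."""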
colsum = np.sum(np.abs(features), axis=0)
c_inv = np.power(colsum, -1)
c_inv[np.isinf(c_inv)] = 0.
return features * c_inv
def float_col_maxmin_normalize(features):
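    """Column-wise min-max normalize a dense float feature matrix into [0, 1]."""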
feats = np.transpose(features)
min_val = np.reshape(np.amin(feats, axis=1), (-1, 1))
max_val = np.reshape(np.amax(feats, axis=1), (-1, 1))
norm = (feats - min_val) / (max_val - min_val)
norm[np.isnan(norm)] = 0.
return np.transpose(norm)
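# Illustrative sanity check (not part of the original change): row_normalize
# scales each row to sum to 1, col_normalize scales each column to sum to 1.
#
# >>> x = np.array([[1., 0.], [2., 2.]])
# >>> row_normalize(x)
# array([[1. , 0. ], [0.5, 0.5]], dtype=float32)
# >>> col_normalize(x)
# array([[0.33333334, 0.], [0.6666667, 1.]], dtype=float32)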
def embed_word2vec(str_val, nlps):
""" Use NLP encoder to encode the string into vector
    There can be multiple NLP encoders in nlps. Each encoder
    is invoked to generate an embedding for the input string and
    the resulting embeddings are concatenated.
Parameters
----------
str_val : str
words to encode
nlps : list of func
a list of nlp encoder functions
"""
vector = None
for nlp in nlps:
doc = nlp(str_val)
if vector is None:
vector = doc.vector
else:
vector = np.concatenate((vector, doc.vector))
return vector
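# Illustrative sketch only: embed_word2vec accepts any callables whose result
# exposes a ``.vector`` attribute, so a toy encoder (an assumption here, not a
# spacy model) is enough to see the concatenation behaviour.
#
# >>> class _ToyDoc:
# ...     def __init__(self, vec):
# ...         self.vector = vec
# >>> toy_nlp = lambda text: _ToyDoc(np.array([float(len(text))]))
# >>> embed_word2vec('hello', [toy_nlp, toy_nlp])
# array([5., 5.])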
def parse_lang_feat(str_feats, nlp_encoders, verbose=False):
""" Parse a list of strings using word2vec encoding using NLP encoders in nlps
Parameters
----------
str_feats : list of str
list of strings to encode
nlp_encoders : list of func
a list of nlp encoder functions
verbose : bool, optional
print out debug info
Default: False
Return
------
numpy.array
the encoded features
"""
features = []
num_feats = len(str_feats)
num_process = num_feats if num_feats < 8 else 8 # TODO(xiangsx) get system nproc
batch_size = (num_feats + num_process - 1) // num_process
def embed_lang(d, proc_idx, feats):
res_feats = []
for s_feat in feats:
res_feats.append(embed_word2vec(s_feat, nlp_encoders))
d[proc_idx] = res_feats
# use multi process to process the feature
manager = Manager()
d = manager.dict()
job=[]
for i in range(num_process):
sub_info = str_feats[i * batch_size : (i+1) * batch_size \
if (i+1) * batch_size < num_feats else num_feats]
job.append(Process(target=embed_lang, args=(d, i, sub_info)))
for p in job:
p.start()
for p in job:
p.join()
for i in range(num_process):
if len(d[i]) > 0:
features.append(d[i])
features = np.concatenate(features)
if verbose:
print(features.shape)
return features
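# Worked example of the batching above: with num_feats = 12 strings,
# num_process = 8 and batch_size = (12 + 8 - 1) // 8 = 2, so workers 0..5 each
# receive two strings while workers 6 and 7 get empty slices, which the
# len(d[i]) > 0 check skips when the results are gathered.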
def parse_word2vec_feature(str_feats, languages, verbose=False):
""" Parse a list of strings using word2vec encoding using NLP encoders in nlps
Parameters
----------
str_feats : list of str
list of strings to encode
languages : list of string
list of languages used to encode the feature string.
verbose : bool, optional
print out debug info
Default: False
Return
------
numpy.array
the encoded features
Examples
--------
>>> inputs = ['hello', 'world']
>>> languages = ['en_core_web_lg', 'fr_core_news_lg']
    >>> feats = parse_word2vec_feature(inputs, languages)
"""
import spacy
nlp_encoders = []
for lang in languages:
encoder = spacy.load(lang)
nlp_encoders.append(encoder)
return parse_lang_feat(str_feats, nlp_encoders, verbose)
def parse_category_single_feat(category_inputs, norm=None):
""" Parse categorical features and convert it into onehot encoding.
Each entity of category_inputs should only contain only one category.
Parameters
----------
category_inputs : list of str
input categorical features
norm: str, optional
Which kind of normalization is applied to the features.
Supported normalization ops include:
(1) None, do nothing.
(2) `col`, column-based normalization. Normalize the data
for each column:
.. math::
x_{ij} = \frac{x_{ij}}{\sum_{i=0}^N{x_{ij}}}
        (3) `row`, same as None.
Note
----
sklearn.preprocessing.LabelBinarizer is used to convert
categorical features into a onehot encoding format.
Return
------
numpy.array
The features in numpy array
Examples
--------
>>> inputs = ['A', 'B', 'C', 'A']
>>> feats = parse_category_single_feat(inputs)
>>> feats
array([[1.,0.,0.],[0.,1.,0.],[0.,0.,1.],[1.,0.,0.]])
"""
from sklearn.preprocessing import LabelBinarizer
lb = LabelBinarizer()
feat = lb.fit_transform(category_inputs)
    # if there are only 2 categories,
    # fit_transform only creates an array of [0, 1, ...]
if feat.shape[1] == 1:
f = np.zeros((feat.shape[0], 2))
f[range(f.shape[0]),feat.squeeze()] = 1.
feat = f
if norm == 'col':
return col_normalize(feat)
else:
return feat
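# Why the two-column expansion above is needed (illustrative, assumes sklearn
# is available): with exactly two classes LabelBinarizer emits a single 0/1
# column, which parse_category_single_feat widens back to one-hot.
#
# >>> LabelBinarizer().fit_transform(['A', 'B', 'A'])
# array([[0], [1], [0]])
# >>> parse_category_single_feat(['A', 'B', 'A'])
# array([[1., 0.], [0., 1.], [1., 0.]])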
def parse_category_multi_feat(category_inputs, norm=None):
""" Parse categorical features and convert it into multi-hot encoding.
Each entity of category_inputs may contain multiple categorical labels.
It uses multi-hot encoding to encode these labels.
Parameters
----------
category_inputs : list of list of str
input categorical features
norm: str, optional
Which kind of normalization is applied to the features.
Supported normalization ops include:
(1) None, do nothing.
(2) `col`, column-based normalization. Normalize the data
for each column:
.. math::
x_{ij} = \frac{x_{ij}}{\sum_{i=0}^N{x_{ij}}}
(3) `row`, row-based normalization. Normalize the data for
each row:
.. math::
x_{ij} = \frac{x_{ij}}{\sum_{j=0}^N{x_{ij}}}
Default: None
Note
----
sklearn.preprocessing.MultiLabelBinarizer is used to convert
categorical features into a multilabel format.
Return
------
numpy.array
The features in numpy array
Example
-------
>>> inputs = [['A', 'B', 'C',], ['A', 'B'], ['C'], ['A']]
>>> feats = parse_category_multi_feat(inputs)
>>> feats
array([[1.,1.,1.],[1.,1.,0.],[0.,0.,1.],[1.,0.,0.]])
"""
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
feat = mlb.fit_transform(category_inputs)
if norm == 'col':
return col_normalize(feat)
if norm == 'row':
return row_normalize(feat)
else:
return feat
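# Normalization sketch: with norm='row' each multi-hot row is divided by the
# number of active categories, so rows sum to 1.
#
# >>> parse_category_multi_feat([['A', 'B'], ['C']], norm='row')
# array([[0.5, 0.5, 0. ], [0. , 0. , 1. ]])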
def parse_numerical_feat(numerical_inputs, norm=None):
""" Parse numerical features.
Parameters
----------
numerical_inputs : list of float or list of list of float
input numerical features
norm: str, optional
Which kind of normalization is applied to the features.
Supported normalization ops include:
(1) None, do nothing.
        (2) `standard`, column-based normalization. Normalize the data
for each column:
.. math::
x_{ij} = \frac{x_{ij}}{\sum_{i=0}^N{|x_{ij}|}}
(3) `min-max`: column-based min-max normalization. Normalize the data
for each column:
.. math::
            x_{ij} = \frac{x_{ij} - \min_i(x_{ij})}{\max_i(x_{ij}) - \min_i(x_{ij})}
Return
------
numpy.array
The features in numpy array
    Example
    -------
>>> inputs = [[1., 0., 0.],[2., 1., 1.],[1., 2., -3.]]
>>> feat = parse_numerical_feat(inputs, norm='col')
>>> feat
array([[0.25, 0., 0.],[0.5, 0.33333333, 0.25],[0.25, 0.66666667, -0.75]])
"""
feat = np.array(numerical_inputs, dtype='float')
if norm == 'standard':
return float_col_l1_normalize(feat)
elif norm == 'min-max':
return float_col_maxmin_normalize(feat)
else:
return feat
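# Min-max sketch: norm='min-max' rescales every column to [0, 1] using that
# column's own min and max; a constant column divides 0 by 0 and is mapped to
# all zeros by the NaN handling in float_col_maxmin_normalize.
#
# >>> parse_numerical_feat([[1., 10.], [3., 30.]], norm='min-max')
# array([[0., 0.], [1., 1.]])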
def parse_numerical_multihot_feat(input_feats, low, high, bucket_cnt, window_size, norm=None):
r""" Parse numerical features by matching them into
different buckets.
    A bucket-range-based algorithm is used to convert a numerical value into a multi-hot
    encoding feature.
    A numerical value range [low, high) is defined, and it is
    divided into bucket_cnt buckets. For an input value v, its effective range is
    [v - window_size/2, v + window_size/2], and we check how many buckets it covers in
    [low, high).
Parameters
----------
input_feats : list of float
Input numerical features
low : float
Lower bound of the range of the numerical values.
All v_i < low will be set to v_i = low.
high : float
Upper bound of the range of the numerical values.
All v_j > high will be set to v_j = high.
bucket_cnt: int
Number of bucket to use.
    window_size: float
        The size of the sliding window used to convert a numerical value into bucket numbers.
norm: str, optional
Which kind of normalization is applied to the features.
Supported normalization ops include:
(1) None, do nothing.
(2) `col`, column-based normalization. Normalize the data
for each column:
.. math::
x_{ij} = \frac{x_{ij}}{\sum_{i=0}^N{x_{ij}}}
(3) `row`, row-based normalization. Normalize the data for
each row:
.. math::
x_{ij} = \frac{x_{ij}}{\sum_{j=0}^N{x_{ij}}}
Example
-------
>>> inputs = [0., 15., 26., 40.]
>>> low = 10.
>>> high = 30.
>>> bucket_cnt = 4
    >>> window_size = 10. # buckets: 10~15, 15~20, 20~25, 25~30
>>> feat = parse_numerical_multihot_feat(inputs, low, high, bucket_cnt, window_size)
>>> feat
array([[1., 0., 0., 0],
[1., 1., 1., 0.],
[0., 0., 1., 1.],
[0., 0., 0., 1.]])
"""
raw_feats = np.array(input_feats, dtype=np.float32)
num_nodes = raw_feats.shape[0]
feat = np.zeros((num_nodes, bucket_cnt), dtype=np.float32)
bucket_size = (high - low) / bucket_cnt
    epsilon = bucket_size / 10
low_val = raw_feats - window_size/2
high_val = raw_feats + window_size/2
low_val[low_val < low] = low
high_val[high_val < low] = low
    high_val[high_val >= high] = high - epsilon
    low_val[low_val >= high] = high - epsilon
low_val -= low
high_val -= low
low_idx = (low_val / bucket_size).astype('int')
high_idx = (high_val / bucket_size).astype('int') + 1
for i in range(raw_feats.shape[0]):
idx = np.arange(start=low_idx[i], stop=high_idx[i])
feat[i][idx] = 1.
if norm == 'col':
return col_normalize(feat)
if norm == 'row':
return row_normalize(feat)
else:
return feat
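# Index arithmetic for the docstring example (low=10, high=30, bucket_cnt=4,
# bucket_size=5, window_size=10): the input 15. yields the window [10, 20],
# shifted by low to [0, 10], so low_idx = int(0 / 5) = 0 and
# high_idx = int(10 / 5) + 1 = 3; buckets 0..2 are set -> [1., 1., 1., 0.].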
import dgl.data as data
import unittest, pytest
import numpy as np
def test_minigc():
ds = data.MiniGCDataset(16, 10, 20)
@@ -18,6 +20,331 @@ def test_data_hash():
assert a.hash == b.hash
assert a.hash != c.hash
def test_row_normalize():
features = np.array([[1., 1., 1.]])
row_norm_feat = data.utils.row_normalize(features)
assert np.allclose(np.array([1./3., 1./3., 1./3.]), row_norm_feat)
features = np.array([[1.], [1.], [1.]])
row_norm_feat = data.utils.row_normalize(features)
assert np.allclose(np.array([[1.], [1.], [1.]]), row_norm_feat)
features = np.array([[1., 0., 0.],[0., 1., 1.],[0., 0., 0.]])
row_norm_feat = data.utils.row_normalize(features)
assert np.allclose(np.array([[1., 0., 0.],[0., 0.5, 0.5],[0., 0., 0.]]),
row_norm_feat)
# input (2, 3)
features = np.array([[1., 0., 0.],[2., 1., 1.]])
row_norm_feat = data.utils.row_normalize(features)
assert np.allclose(np.array([[1., 0., 0.],[0.5, 0.25, 0.25]]),
row_norm_feat)
# input (3, 2)
features = np.array([[1., 0.],[1., 1.],[0., 0.]])
row_norm_feat = data.utils.row_normalize(features)
assert np.allclose(np.array([[1., 0.],[0.5, 0.5],[0., 0.]]),
row_norm_feat)
def test_col_normalize():
features = np.array([[1., 1., 1.]])
col_norm_feat = data.utils.col_normalize(features)
assert np.allclose(np.array([[1., 1., 1.]]), col_norm_feat)
features = np.array([[1.], [1.], [1.]])
    col_norm_feat = data.utils.col_normalize(features)
    assert np.allclose(np.array([[1./3.],[1./3.], [1./3.]]), col_norm_feat)
features = np.array([[1., 0., 0.],[1., 1., 0.],[0., 0., 0.]])
col_norm_feat = data.utils.col_normalize(features)
assert np.allclose(np.array([[0.5, 0., 0.],[0.5, 1.0, 0.],[0., 0., 0.]]),
col_norm_feat)
    # input (2, 3)
features = np.array([[1., 0., 0.],[1., 1., 0.]])
col_norm_feat = data.utils.col_normalize(features)
assert np.allclose(np.array([[0.5, 0., 0.],[0.5, 1.0, 0.]]),
col_norm_feat)
    # input (3, 2)
features = np.array([[1., 0.],[1., 1.],[2., 0.]])
col_norm_feat = data.utils.col_normalize(features)
assert np.allclose(np.array([[0.25, 0.],[0.25, 1.0],[0.5, 0.]]),
col_norm_feat)
def test_float_row_normalize():
features = np.array([[1.],[2.],[-3.]])
row_norm_feat = data.utils.float_row_l1_normalize(features)
assert np.allclose(np.array([[1.],[1.],[-1.]]), row_norm_feat)
features = np.array([[1., 2., -3.]])
row_norm_feat = data.utils.float_row_l1_normalize(features)
assert np.allclose(np.array([[1./6., 2./6., -3./6.]]), row_norm_feat)
features = np.array([[1., 0., 0.],[2., 1., 1.],[1., 2., -3.]])
row_norm_feat = data.utils.float_row_l1_normalize(features)
assert np.allclose(np.array([[1., 0., 0.],[0.5, 0.25, 0.25],[1./6., 2./6., -3./6.]]),
row_norm_feat)
    # input (2, 3)
features = np.array([[1., 0., 0.],[-2., 1., 1.]])
row_norm_feat = data.utils.float_row_l1_normalize(features)
assert np.allclose(np.array([[1., 0., 0.],[-0.5, 0.25, 0.25]]),
row_norm_feat)
# input (3, 2)
features = np.array([[1., 0.],[-2., 1.],[1., 2.]])
row_norm_feat = data.utils.float_row_l1_normalize(features)
assert np.allclose(np.array([[1., 0.],[-2./3., 1./3.],[1./3., 2./3.]]),
row_norm_feat)
def test_float_col_normalize():
features = np.array([[1., 2., -3.]])
col_norm_feat = data.utils.float_col_l1_normalize(features)
assert np.allclose(np.array([[1., 1., -1.]]), col_norm_feat)
features = np.array([[1.], [2.], [-3.]])
col_norm_feat = data.utils.float_col_l1_normalize(features)
assert np.allclose(np.array([[1./6.],[2./6.], [-3./6.]]), col_norm_feat)
features = np.array([[1., 0., 0.],[2., 1., 1.],[1., 2., -3.]])
col_norm_feat = data.utils.float_col_l1_normalize(features)
assert np.allclose(np.array([[0.25, 0., 0.],[0.5, 1./3., 0.25],[0.25, 2./3., -0.75]]),
col_norm_feat)
    # input (2, 3)
features = np.array([[1., 0., 0.],[2., 1., -1.]])
col_norm_feat = data.utils.float_col_l1_normalize(features)
assert np.allclose(np.array([[1./3., 0., 0.],[2./3., 1.0, -1.]]),
col_norm_feat)
    # input (3, 2)
features = np.array([[1., 0.],[2., 1.],[1., -2.]])
col_norm_feat = data.utils.float_col_l1_normalize(features)
assert np.allclose(np.array([[0.25, 0.],[0.5, 1./3.],[0.25, -2./3.]]),
col_norm_feat)
def test_float_col_maxmin_normalize():
features = np.array([[1., 2., -3.]])
col_norm_feat = data.utils.float_col_maxmin_normalize(features)
assert np.allclose(np.array([[0., 0., 0.]]), col_norm_feat)
features = np.array([[1.], [2.], [-3.]])
col_norm_feat = data.utils.float_col_maxmin_normalize(features)
assert np.allclose(np.array([[4./5.],[5./5.], [0.]]), col_norm_feat)
features = np.array([[1., 0., 0.],[2., 1., 1.],[1., 2., -3.]])
col_norm_feat = data.utils.float_col_maxmin_normalize(features)
assert np.allclose(np.array([[0., 0., 3./4.],[1., 0.5, 1.],[0., 1., 0.]]),
col_norm_feat)
    # input (2, 3)
features = np.array([[1., 0., 0.],[2., 1., -1.]])
col_norm_feat = data.utils.float_col_maxmin_normalize(features)
assert np.allclose(np.array([[0., 0., 1.],[1., 1., 0.]]),
col_norm_feat)
    # input (3, 2)
features = np.array([[1., 0.],[2., 1.],[4., -2.]])
col_norm_feat = data.utils.float_col_maxmin_normalize(features)
assert np.allclose(np.array([[0., 2./3.],[1./3., 1.],[1., 0.]]),
col_norm_feat)
@unittest.skip("spacy language test is too heavy")
def test_embed_word2vec():
import spacy
inputs = ['hello', 'world']
languages = ['en_core_web_lg', 'fr_core_news_lg']
nlps = [spacy.load(languages[0])]
feats = data.utils.embed_word2vec(inputs[0], nlps)
doc = nlps[0](inputs[0])
assert np.allclose(doc.vector, feats)
nlps.append(spacy.load(languages[1]))
for input in inputs:
feats = data.utils.embed_word2vec(input, nlps)
doc0 = nlps[0](input)
doc1 = nlps[1](input)
assert np.allclose(np.concatenate((doc0.vector, doc1.vector)),
feats)
@unittest.skip("spacy language test is too heavy")
def test_parse_lang_feat():
import spacy
inputs = ['hello', 'world']
languages = ['en_core_web_lg', 'fr_core_news_lg']
nlps = [spacy.load(languages[0]), spacy.load(languages[1])]
feats = data.utils.parse_lang_feat(inputs, nlps)
res_feats = []
for input in inputs:
doc0 = nlps[0](input)
doc1 = nlps[1](input)
res_feats.append(np.concatenate((doc0.vector, doc1.vector)))
res_feats = np.stack(res_feats)
assert np.allclose(feats, res_feats)
inputs = ["1", "2", "3", "4", "1", "2", "3", "4", "5", "6", "7", "8"]
feats = data.utils.parse_lang_feat(inputs, nlps)
res_feats = []
for input in inputs:
doc0 = nlps[0](input)
doc1 = nlps[1](input)
res_feats.append(np.concatenate((doc0.vector, doc1.vector)))
res_feats = np.stack(res_feats)
assert np.allclose(feats, res_feats)
inputs = ["1", "2", "3", "4", "1", "2", "3", "4", "5", "6", "7", "8"]
feats = data.utils.parse_word2vec_feature(inputs, languages)
res_feats = []
for input in inputs:
doc0 = nlps[0](input)
doc1 = nlps[1](input)
res_feats.append(np.concatenate((doc0.vector, doc1.vector)))
res_feats = np.stack(res_feats)
assert np.allclose(feats, res_feats)
@unittest.skip("LabelBinarizer and MultiLabelBinarizer is not included in CI env")
def test_parse_category_feat():
# single-hot
inputs = ['A', 'B']
feats = data.utils.parse_category_single_feat(inputs)
assert np.allclose(np.array([[1.,0.],[0.,1.]]), feats)
inputs = ['A', 'B', 'C', 'A']
feats = data.utils.parse_category_single_feat(inputs)
assert np.allclose(np.array([[1.,0.,0.],[0.,1.,0.],[0.,0.,1.],[1.,0.,0.]]), feats)
# col norm
feats = data.utils.parse_category_single_feat(inputs, norm='col')
assert np.allclose(np.array([[.5,0.,0.],[0.,1.,0.],[0.,0.,1.],[.5,0.,0.]]), feats)
# multi-hot
inputs = [['A'], ['B']]
feats = data.utils.parse_category_multi_feat(inputs)
assert np.allclose(np.array([[1.,0.],[0.,1.]]), feats)
inputs = [['A', 'B', 'C',], ['A', 'B'], ['C'], ['A']]
feats = data.utils.parse_category_multi_feat(inputs)
assert np.allclose(np.array([[1.,1.,1.],[1.,1.,0.],[0.,0.,1.],[1.,0.,0.]]), feats)
# row norm
feats = data.utils.parse_category_multi_feat(inputs, norm='row')
assert np.allclose(np.array([[1./3.,1./3.,1./3.],[.5,.5,0.],[0.,0.,1.],[1.,0.,0.]]), feats)
# col norm
feats = data.utils.parse_category_multi_feat(inputs, norm='col')
assert np.allclose(np.array([[1./3.,0.5,0.5],[1./3.,0.5,0.],[0.,0.,0.5],[1./3.,0.,0.]]), feats)
def test_parse_numerical_feat():
inputs = [[1., 2., -3.]]
feat = data.utils.parse_numerical_feat(inputs)
assert np.allclose(inputs, feat)
col_norm_feat = data.utils.parse_numerical_feat(inputs, norm='standard')
assert np.allclose(np.array([[1., 1., -1.]]), col_norm_feat)
col_norm_feat = data.utils.parse_numerical_feat(inputs, norm='min-max')
assert np.allclose(np.array([[0., 0., 0.]]), col_norm_feat)
inputs = [[1.], [2.], [-3.]]
feat = data.utils.parse_numerical_feat(inputs)
assert np.allclose(inputs, feat)
col_norm_feat = data.utils.parse_numerical_feat(inputs, norm='standard')
assert np.allclose(np.array([[1./6.],[2./6.], [-3./6.]]), col_norm_feat)
col_norm_feat = data.utils.parse_numerical_feat(inputs, norm='min-max')
assert np.allclose(np.array([[4./5.],[5./5.], [0.]]), col_norm_feat)
inputs = [[1., 0., 0.],[2., 1., 1.],[1., 2., -3.]]
feat = data.utils.parse_numerical_feat(inputs)
assert np.allclose(inputs, feat)
col_norm_feat = data.utils.parse_numerical_feat(inputs, norm='standard')
assert np.allclose(np.array([[0.25, 0., 0.],[0.5, 1./3., 0.25],[0.25, 2./3., -0.75]]),
col_norm_feat)
col_norm_feat = data.utils.parse_numerical_feat(inputs, norm='min-max')
assert np.allclose(np.array([[0., 0., 3./4.],[1., 0.5, 1.],[0., 1., 0.]]),
col_norm_feat)
    # input (2, 3)
inputs = [[1., 0., 0.],[2., 1., -1.]]
feat = data.utils.parse_numerical_feat(inputs)
assert np.allclose(inputs, feat)
col_norm_feat = data.utils.parse_numerical_feat(inputs, norm='standard')
assert np.allclose(np.array([[1./3., 0., 0.],[2./3., 1.0, -1.]]),
col_norm_feat)
col_norm_feat = data.utils.parse_numerical_feat(inputs, norm='min-max')
assert np.allclose(np.array([[0., 0., 1.],[1., 1., 0.]]),
col_norm_feat)
    # input (3, 2)
inputs = [[1., 0.],[2., 1.],[1., -2.]]
feat = data.utils.parse_numerical_feat(inputs)
assert np.allclose(inputs, feat)
col_norm_feat = data.utils.parse_numerical_feat(inputs, norm='standard')
assert np.allclose(np.array([[0.25, 0.],[0.5, 1./3.],[0.25, -2./3.]]),
col_norm_feat)
col_norm_feat = data.utils.parse_numerical_feat(inputs, norm='min-max')
assert np.allclose(np.array([[0., 2./3.],[1., 1.],[0., 0.]]),
col_norm_feat)
def test_parse_numerical_multihot_feat():
inputs = [0., 15., 20., 10.1, 25., 40.]
low = 10.
high = 30.
bucket_cnt = 2 #10~20, 20~30
window_size = 0.
feat = data.utils.parse_numerical_multihot_feat(inputs, low, high, bucket_cnt, window_size)
assert np.allclose(np.array([[1., 0.], [1., 0.], [0., 1.], [1., 0.], [0., 1.], [0., 1.]]), feat)
inputs = [0., 5., 15., 20., 10.1, 25., 30.1, 40.]
low = 10.
high = 30.
bucket_cnt = 4 #10~15,15~20,20~25,25~30
window_size = 10.
feat = data.utils.parse_numerical_multihot_feat(inputs, low, high, bucket_cnt, window_size)
assert np.allclose(np.array([[1., 0., 0., 0],
[1., 0., 0., 0],
[1., 1., 1., 0.],
[0., 1., 1., 1.],
[1., 1., 0., 0.],
[0., 0., 1., 1.],
[0., 0., 0., 1.],
[0., 0., 0., 1.]]), feat)
# col norm
feat = data.utils.parse_numerical_multihot_feat(inputs, low, high, bucket_cnt, window_size, norm='col')
assert np.allclose(np.array([[1./4., 0., 0., 0],
[1./4., 0., 0., 0],
[1./4., 1./3., 1./3., 0.],
[0., 1./3., 1./3., 1./4.],
[1./4., 1./3., 0., 0.],
[0., 0., 1./3., 1./4.],
[0., 0., 0., 1./4.],
[0., 0., 0., 1./4.]]), feat)
# row norm
feat = data.utils.parse_numerical_multihot_feat(inputs, low, high, bucket_cnt, window_size, norm='row')
assert np.allclose(np.array([[1., 0., 0., 0],
[1., 0., 0., 0],
[1./3., 1./3., 1./3., 0.],
[0., 1./3., 1./3., 1./3.],
[1./2., 1./2., 0., 0.],
[0., 0., 1./2., 1./2.],
[0., 0., 0., 1.],
[0., 0., 0., 1.]]), feat)
if __name__ == '__main__':
test_minigc()
test_data_hash()
test_row_normalize()
test_col_normalize()
test_float_row_normalize()
test_float_col_normalize()
test_float_col_maxmin_normalize()
#test_embed_word2vec()
#test_parse_lang_feat()
#test_parse_category_feat()
test_parse_numerical_feat()
test_parse_numerical_multihot_feat()
\ No newline at end of file